From 15192861a17bb07ce71f320f5f08bfe8dea461f0 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Tue, 5 Nov 2024 21:04:10 -0500
Subject: [PATCH 01/64] Implement qir-qsim app for dynamic measurement handling

---
 CMakeLists.txt                        |   38 +-
 app/CMakeLists.txt                    |   50 +-
 app/qir-qsim.cc                       |   94 ++
 cmake/FindLLVM.cmake                  |    8 +
 examples/teleport.ll                  |    2 +
 qsim/bits.h                           |  106 ++
 qsim/bitstring.h                      |   97 ++
 qsim/channel.h                        |  149 +++
 qsim/channels_cirq.h                  |  471 +++++++
 qsim/channels_qsim.h                  |  117 ++
 qsim/circuit.h                        |   36 +
 qsim/circuit_noisy.h                  |  108 ++
 qsim/circuit_qsim_parser.h            |  442 +++++++
 qsim/cuda2hip.h                       |   61 +
 qsim/expect.h                         |  148 +++
 qsim/formux.h                         |   30 +
 qsim/fuser.h                          |  225 ++++
 qsim/fuser_basic.h                    |  411 +++++++
 qsim/fuser_mqubit.h                   | 1095 +++++++++++++++++
 qsim/gate.h                           |  216 ++++
 qsim/gate_appl.h                      |  231 ++++
 qsim/gates_cirq.h                     | 1640 +++++++++++++++++++++++++
 qsim/gates_qsim.h                     |  661 ++++++++++
 qsim/hybrid.h                         |  612 +++++++++
 qsim/io.h                             |   44 +
 qsim/io_file.h                        |   71 ++
 qsim/matrix.h                         |  296 +++++
 qsim/mps_simulator.h                  |  246 ++++
 qsim/mps_statespace.h                 |  597 +++++++++
 qsim/parfor.h                         |  123 ++
 qsim/qtrajectory.h                    |  435 +++++++
 qsim/run_qsim.h                       |  262 ++++
 qsim/run_qsimh.h                      |  120 ++
 qsim/seqfor.h                         |   68 +
 qsim/simmux.h                         |   44 +
 qsim/simmux_gpu.h                     |   30 +
 qsim/simulator.h                      |  516 ++++++++
 qsim/simulator_avx.h                  | 1363 ++++++++++++++++++++
 qsim/simulator_avx512.h               |  846 +++++++++++++
 qsim/simulator_basic.h                |  349 ++++++
 qsim/simulator_cuda.h                 |  923 ++++++++++++++
 qsim/simulator_cuda_kernels.h         |  683 ++++++++++
 qsim/simulator_custatevec.h           |  209 ++++
 qsim/simulator_sse.h                  |  864 +++++++++++++
 qsim/statespace.h                     |  145 +++
 qsim/statespace_avx.h                 |  497 ++++++++
 qsim/statespace_avx512.h              |  448 +++++++
 qsim/statespace_basic.h               |  300 +++++
 qsim/statespace_cuda.h                |  470 +++++++
 qsim/statespace_cuda_kernels.h        |  355 ++++++
 qsim/statespace_custatevec.h          |  376 ++++++
 qsim/statespace_sse.h                 |  462 +++++++
 qsim/umux.h                           |   52 +
 qsim/unitary_calculator_avx.h         | 1028 ++++++++++++++++
 qsim/unitary_calculator_avx512.h      |  644 ++++++++++
 qsim/unitary_calculator_basic.h       |  259 ++++
 qsim/unitary_calculator_sse.h         |  639 ++++++++++
 qsim/unitaryspace.h                   |   65 +
 qsim/unitaryspace_avx.h               |  112 ++
 qsim/unitaryspace_avx512.h            |  112 ++
 qsim/unitaryspace_basic.h             |  103 ++
 qsim/unitaryspace_sse.h               |  112 ++
 qsim/util.h                           |   89 ++
 qsim/util_cpu.h                       |   43 +
 qsim/util_cuda.h                      |  128 ++
 qsim/util_custatevec.h                |   44 +
 qsim/vectorspace.h                    |  185 +++
 qsim/vectorspace_cuda.h               |  172 +++
 src/CMakeLists.txt                    |    5 +
 src/qirqsim/BufferManager.cc          |   33 +
 src/qirqsim/BufferManager.hh          |   45 +
 src/qirqsim/CMakeLists.txt            |   29 +
 src/qirqsim/qsimDefaultRuntime.cc     |   71 ++
 src/qirqsim/qsimDefaultRuntime.hh     |   61 +
 src/qirqsim/qsimQuantum.cc            |  218 ++++
 src/qirqsim/qsimQuantum.hh            |  175 +++
 src/qirqsim/qsimTupleRuntime.cc       |  123 ++
 src/qirqsim/qsimTupleRuntime.hh       |   93 ++
 tpls/qsim/bits.h                      |  106 ++
 tpls/qsim/bitstring.h                 |   97 ++
 tpls/qsim/channel.h                   |  149 +++
 tpls/qsim/channels_cirq.h             |  471 +++++++
 tpls/qsim/channels_qsim.h             |  117 ++
 tpls/qsim/circuit.h                   |   36 +
 tpls/qsim/circuit_noisy.h             |  108 ++
 tpls/qsim/circuit_qsim_parser.h       |  442 +++++++
 tpls/qsim/cuda2hip.h                  |   61 +
 tpls/qsim/expect.h                    |  148 +++
 tpls/qsim/formux.h                    |   30 +
 tpls/qsim/fuser.h                     |  225 ++++
 tpls/qsim/fuser_basic.h               |  411 +++++++
 tpls/qsim/fuser_mqubit.h              | 1095 +++++++++++++++++
 tpls/qsim/gate.h                      |  216 ++++
 tpls/qsim/gate_appl.h                 |  231 ++++
 tpls/qsim/gates_cirq.h                | 1640 +++++++++++++++++++++++++
 tpls/qsim/gates_qsim.h                |  661 ++++++++++
 tpls/qsim/hybrid.h                    |  612 +++++++++
 tpls/qsim/io.h                        |   44 +
 tpls/qsim/io_file.h                   |   71 ++
 tpls/qsim/matrix.h                    |  296 +++++
 tpls/qsim/mps_simulator.h             |  246 ++++
 tpls/qsim/mps_statespace.h            |  597 +++++++++
 tpls/qsim/parfor.h                    |  123 ++
 tpls/qsim/qtrajectory.h               |  435 +++++++
 tpls/qsim/run_qsim.h                  |  262 ++++
 tpls/qsim/run_qsimh.h                 |  120 ++
 tpls/qsim/seqfor.h                    |   68 +
 tpls/qsim/simmux.h                    |   44 +
 tpls/qsim/simmux_gpu.h                |   30 +
 tpls/qsim/simulator.h                 |  516 ++++++++
 tpls/qsim/simulator_avx.h             | 1363 ++++++++++++++++++++
 tpls/qsim/simulator_avx512.h          |  846 +++++++++++++
 tpls/qsim/simulator_basic.h           |  349 ++++++
 tpls/qsim/simulator_cuda.h            |  923 ++++++++++++++
 tpls/qsim/simulator_cuda_kernels.h    |  683 ++++++++++
 tpls/qsim/simulator_custatevec.h      |  209 ++++
 tpls/qsim/simulator_sse.h             |  864 +++++++++++++
 tpls/qsim/statespace.h                |  145 +++
 tpls/qsim/statespace_avx.h            |  497 ++++++++
 tpls/qsim/statespace_avx512.h         |  448 +++++++
 tpls/qsim/statespace_basic.h          |  300 +++++
 tpls/qsim/statespace_cuda.h           |  470 +++++++
 tpls/qsim/statespace_cuda_kernels.h   |  355 ++++++
 tpls/qsim/statespace_custatevec.h     |  376 ++++++
 tpls/qsim/statespace_sse.h            |  462 +++++++
 tpls/qsim/umux.h                      |   52 +
 tpls/qsim/unitary_calculator_avx.h    | 1028 ++++++++++++++++
 tpls/qsim/unitary_calculator_avx512.h |  644 ++++++++++
 tpls/qsim/unitary_calculator_basic.h  |  259 ++++
 tpls/qsim/unitary_calculator_sse.h    |  639 ++++++++++
 tpls/qsim/unitaryspace.h              |   65 +
 tpls/qsim/unitaryspace_avx.h          |  112 ++
 tpls/qsim/unitaryspace_avx512.h       |  112 ++
 tpls/qsim/unitaryspace_basic.h        |  103 ++
 tpls/qsim/unitaryspace_sse.h          |  112 ++
 tpls/qsim/util.h                      |   89 ++
 tpls/qsim/util_cpu.h                  |   43 +
 tpls/qsim/util_cuda.h                 |  128 ++
 tpls/qsim/util_custatevec.h           |   44 +
 tpls/qsim/vectorspace.h               |  185 +++
 tpls/qsim/vectorspace_cuda.h          |  172 +++
 141 files changed, 44613 insertions(+), 2 deletions(-)
 create mode 100644 app/qir-qsim.cc
 create mode 100644 qsim/bits.h
 create mode 100644 qsim/bitstring.h
 create mode 100644 qsim/channel.h
 create mode 100644 qsim/channels_cirq.h
 create mode 100644 qsim/channels_qsim.h
 create mode 100644 qsim/circuit.h
 create mode 100644 qsim/circuit_noisy.h
 create mode 100644 qsim/circuit_qsim_parser.h
 create mode 100644 qsim/cuda2hip.h
 create mode 100644 qsim/expect.h
 create mode 100644 qsim/formux.h
 create mode 100644 qsim/fuser.h
 create mode 100644 qsim/fuser_basic.h
 create mode 100644 qsim/fuser_mqubit.h
 create mode 100644 qsim/gate.h
 create mode 100644 qsim/gate_appl.h
 create mode 100644 qsim/gates_cirq.h
 create mode 100644 qsim/gates_qsim.h
 create mode 100644 qsim/hybrid.h
 create mode 100644 qsim/io.h
 create mode 100644 qsim/io_file.h
 create mode 100644 qsim/matrix.h
 create mode 100644 qsim/mps_simulator.h
 create mode 100644 qsim/mps_statespace.h
 create mode 100644 qsim/parfor.h
 create mode 100644 qsim/qtrajectory.h
 create mode 100644 qsim/run_qsim.h
 create mode 100644 qsim/run_qsimh.h
 create mode 100644 qsim/seqfor.h
 create mode 100644 qsim/simmux.h
 create mode 100644 qsim/simmux_gpu.h
 create mode 100644 qsim/simulator.h
 create mode 100644 qsim/simulator_avx.h
 create mode 100644 qsim/simulator_avx512.h
 create mode 100644 qsim/simulator_basic.h
 create mode 100644 qsim/simulator_cuda.h
 create mode 100644 qsim/simulator_cuda_kernels.h
 create mode 100644 qsim/simulator_custatevec.h
 create mode 100644 qsim/simulator_sse.h
 create mode 100644 qsim/statespace.h
 create mode 100644 qsim/statespace_avx.h
 create mode 100644 qsim/statespace_avx512.h
 create mode 100644 qsim/statespace_basic.h
 create mode 100644 qsim/statespace_cuda.h
 create mode 100644 qsim/statespace_cuda_kernels.h
 create mode 100644 qsim/statespace_custatevec.h
 create mode 100644 qsim/statespace_sse.h
 create mode 100644 qsim/umux.h
 create mode 100644 qsim/unitary_calculator_avx.h
 create mode 100644 qsim/unitary_calculator_avx512.h
 create mode 100644 qsim/unitary_calculator_basic.h
 create mode 100644 qsim/unitary_calculator_sse.h
 create mode 100644 qsim/unitaryspace.h
 create mode 100644 qsim/unitaryspace_avx.h
 create mode 100644 qsim/unitaryspace_avx512.h
 create mode 100644 qsim/unitaryspace_basic.h
 create mode 100644 qsim/unitaryspace_sse.h
 create mode 100644 qsim/util.h
 create mode 100644 qsim/util_cpu.h
 create mode 100644 qsim/util_cuda.h
 create mode 100644 qsim/util_custatevec.h
 create mode 100644 qsim/vectorspace.h
 create mode 100644 qsim/vectorspace_cuda.h
 create mode 100644 src/qirqsim/BufferManager.cc
 create mode 100644 src/qirqsim/BufferManager.hh
 create mode 100644 src/qirqsim/CMakeLists.txt
 create mode 100644 src/qirqsim/qsimDefaultRuntime.cc
 create mode 100644 src/qirqsim/qsimDefaultRuntime.hh
 create mode 100644 src/qirqsim/qsimQuantum.cc
 create mode 100644 src/qirqsim/qsimQuantum.hh
 create mode 100644 src/qirqsim/qsimTupleRuntime.cc
 create mode 100644 src/qirqsim/qsimTupleRuntime.hh
 create mode 100644 tpls/qsim/bits.h
 create mode 100644 tpls/qsim/bitstring.h
 create mode 100644 tpls/qsim/channel.h
 create mode 100644 tpls/qsim/channels_cirq.h
 create mode 100644 tpls/qsim/channels_qsim.h
 create mode 100644 tpls/qsim/circuit.h
 create mode 100644 tpls/qsim/circuit_noisy.h
 create mode 100644 tpls/qsim/circuit_qsim_parser.h
 create mode 100644 tpls/qsim/cuda2hip.h
 create mode 100644 tpls/qsim/expect.h
 create mode 100644 tpls/qsim/formux.h
 create mode 100644 tpls/qsim/fuser.h
 create mode 100644 tpls/qsim/fuser_basic.h
 create mode 100644 tpls/qsim/fuser_mqubit.h
 create mode 100644 tpls/qsim/gate.h
 create mode 100644 tpls/qsim/gate_appl.h
 create mode 100644 tpls/qsim/gates_cirq.h
 create mode 100644 tpls/qsim/gates_qsim.h
 create mode 100644 tpls/qsim/hybrid.h
 create mode 100644 tpls/qsim/io.h
 create mode 100644 tpls/qsim/io_file.h
 create mode 100644 tpls/qsim/matrix.h
 create mode 100644 tpls/qsim/mps_simulator.h
 create mode 100644 tpls/qsim/mps_statespace.h
 create mode 100644 tpls/qsim/parfor.h
 create mode 100644 tpls/qsim/qtrajectory.h
 create mode 100644 tpls/qsim/run_qsim.h
 create mode 100644 tpls/qsim/run_qsimh.h
 create mode 100644 tpls/qsim/seqfor.h
 create mode 100644 tpls/qsim/simmux.h
 create mode 100644 tpls/qsim/simmux_gpu.h
 create mode 100644 tpls/qsim/simulator.h
 create mode 100644 tpls/qsim/simulator_avx.h
 create mode 100644 tpls/qsim/simulator_avx512.h
 create mode 100644 tpls/qsim/simulator_basic.h
 create mode 100644 tpls/qsim/simulator_cuda.h
 create mode 100644 tpls/qsim/simulator_cuda_kernels.h
 create mode 100644 tpls/qsim/simulator_custatevec.h
 create mode 100644 tpls/qsim/simulator_sse.h
 create mode 100644 tpls/qsim/statespace.h
 create mode 100644 tpls/qsim/statespace_avx.h
 create mode 100644 tpls/qsim/statespace_avx512.h
 create mode 100644 tpls/qsim/statespace_basic.h
 create mode 100644 tpls/qsim/statespace_cuda.h
 create mode 100644 tpls/qsim/statespace_cuda_kernels.h
 create mode 100644 tpls/qsim/statespace_custatevec.h
 create mode 100644 tpls/qsim/statespace_sse.h
 create mode 100644 tpls/qsim/umux.h
 create mode 100644 tpls/qsim/unitary_calculator_avx.h
 create mode 100644 tpls/qsim/unitary_calculator_avx512.h
 create mode 100644 tpls/qsim/unitary_calculator_basic.h
 create mode 100644 tpls/qsim/unitary_calculator_sse.h
 create mode 100644 tpls/qsim/unitaryspace.h
 create mode 100644 tpls/qsim/unitaryspace_avx.h
 create mode 100644 tpls/qsim/unitaryspace_avx512.h
 create mode 100644 tpls/qsim/unitaryspace_basic.h
 create mode 100644 tpls/qsim/unitaryspace_sse.h
 create mode 100644 tpls/qsim/util.h
 create mode 100644 tpls/qsim/util_cpu.h
 create mode 100644 tpls/qsim/util_cuda.h
 create mode 100644 tpls/qsim/util_custatevec.h
 create mode 100644 tpls/qsim/vectorspace.h
 create mode 100644 tpls/qsim/vectorspace_cuda.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 05f3e7d..a536e86 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,12 +36,23 @@ endmacro()
 option(QIREE_BUILD_DOCS "Build QIR-EE documentation" OFF)
 option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" OFF)
 option(QIREE_BUILD_EXAMPLES "Build QIR-EE examples" OFF)
-option(QIREE_USE_XACC "Build XACC interface" ON)
+option(QIREE_USE_XACC "Build XACC interface" OFF)
+option(QIREE_USE_QSIM "Build qsim interface" OFF)
 qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS})
 
 # Assertion handling
 option(QIREE_DEBUG "Enable runtime assertions" ON)
 
+# The XACC and qsim interfaces are mutually exclusive; XACC takes precedence
+# when both are requested. A normal variable shadows the cache entry so the
+# user's cached choice is never overwritten with FORCE.
+if(QIREE_USE_XACC AND QIREE_USE_QSIM)
+  message(WARNING "QIREE_USE_XACC and QIREE_USE_QSIM are mutually exclusive: "
+    "disabling QIREE_USE_QSIM for this build.")
+  set(QIREE_USE_QSIM OFF)
+endif()
+
+
 #----------------------------------------------------------------------------#
 # CMAKE INTRINSIC OPTIONS
 #
@@ -174,6 +185,31 @@ if(QIREE_BUILD_TESTS)
   add_subdirectory(test)
 endif()
 
+#----------------------------------------------------------------------------#
+# OPENMP
+#----------------------------------------------------------------------------#
+
+# OpenMP is only linked into the qsim interface, so it is not required for
+# other configurations. Imported OpenMP:: targets carry the flags: no globals.
+if(QIREE_USE_QSIM)
+  if(APPLE AND NOT DEFINED OpenMP_ROOT)
+    # Apple Clang ships without libomp: ask Homebrew for its keg-only prefix
+    # rather than hard-coding /opt/homebrew, which is wrong on Intel Macs
+    find_program(QIREE_BREW_EXECUTABLE brew)
+    if(QIREE_BREW_EXECUTABLE)
+      execute_process(
+        COMMAND "${QIREE_BREW_EXECUTABLE}" --prefix libomp
+        OUTPUT_VARIABLE OpenMP_ROOT
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+      )
+    endif()
+  endif()
+  # REQUIRED stops the configure step with a diagnostic if OpenMP is missing
+  find_package(OpenMP REQUIRED)
+  message(STATUS "OpenMP found")
+endif()
+
 #----------------------------------------------------------------------------#
 # APPLICATIONS AND BINARIES
 #----------------------------------------------------------------------------#
diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
index 5b1939b..fb78caa 100644
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -8,12 +8,14 @@ include(FetchContent)
 FetchContent_Declare(
   cli11_proj
   QUIET
-  GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git
+  GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git # Command Line Parser for C++ programs
   GIT_TAG f4d0731cebb123ff0ace712c099dffbcd2c58e5a # v2.4.1
 )
 
 FetchContent_MakeAvailable(cli11_proj)
 
+# Conditionally add XACC-based executable
+
 if(QIREE_USE_XACC)
   qiree_add_executable(qir-xacc
     qir-xacc.cc
@@ -24,4 +26,50 @@ if(QIREE_USE_XACC)
   )
 endif()
 
+# Conditionally configure the qsim interface and its executable.
+#
+# Only qsim's headers are used; they are expected in tpls/qsim and are fetched
+# from upstream solely as a fallback when that directory is not populated.
+
+if(QIREE_USE_QSIM)
+  set(_qiree_qsim_dir "${CMAKE_SOURCE_DIR}/tpls/qsim")
+
+  # Fetching on every configure would overwrite the tracked headers with
+  # whatever the upstream branch currently contains, so skip it when a
+  # representative header is already present.
+  if(NOT EXISTS "${_qiree_qsim_dir}/simulator.h")
+    # NOTE(review): `master` is a moving target; pin a release tag or commit
+    # hash so that fallback builds are reproducible.
+    FetchContent_Declare(
+      qsim_lib
+      GIT_REPOSITORY https://github.com/quantumlib/qsim.git
+      GIT_TAG master
+    )
+    FetchContent_MakeAvailable(qsim_lib)
+
+    # Copy header files to tpls/qsim
+    file(MAKE_DIRECTORY "${_qiree_qsim_dir}")
+    message(STATUS "Copying qsim headers to ${_qiree_qsim_dir}")
+    file(GLOB _qiree_qsim_headers "${qsim_lib_SOURCE_DIR}/lib/*.h")
+    file(COPY ${_qiree_qsim_headers} DESTINATION "${_qiree_qsim_dir}")
+    unset(_qiree_qsim_headers)
+  endif()
+  unset(_qiree_qsim_dir)
+
+  # Link OpenMP into the qirqsim library. REQUIRED makes find_package abort
+  # the configure step when OpenMP is missing, so OpenMP::OpenMP_CXX always
+  # exists past this point and no separate found-check is needed.
+  # NOTE(review): this usage requirement would be better declared next to the
+  # qirqsim target definition; it stays here to keep the existing behavior.
+  find_package(OpenMP REQUIRED)
+  target_link_libraries(qirqsim PUBLIC OpenMP::OpenMP_CXX)
+
+  # Add the qir-qsim executable and link it with the qsim interface
+  qiree_add_executable(qir-qsim qir-qsim.cc)
+  target_link_libraries(qir-qsim
+    PUBLIC QIREE::qiree QIREE::qirqsim
+    PRIVATE CLI11::CLI11
+  )
+endif()
+
 #-----------------------------------------------------------------------------#
diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc
new file mode 100644
index 0000000..809b686
--- /dev/null
+++ b/app/qir-qsim.cc
@@ -0,0 +1,94 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file app/qir-qsim.cc
+//---------------------------------------------------------------------------//
+// Standard library and command-line parsing
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <CLI/CLI.hpp>
+
+// QIR-EE core
+#include "qiree_version.h"
+#include "qiree/Executor.hh"
+#include "qiree/Module.hh"
+#include "qiree/QuantumNotImpl.hh"
+
+// qsim backend
+#include "qirqsim/qsimDefaultRuntime.hh"
+#include "qirqsim/qsimQuantum.hh"
+#include "qirqsim/qsimTupleRuntime.hh"
+
+using namespace std::string_view_literals;
+
+namespace qiree
+{
+namespace app
+{
+/*!
+ * Run the QIR module stored in \c filename on the qsim backend.
+ *
+ * The module is executed \c num_shots times using a single simulator and a
+ * single runtime instance, both constructed with std::cout as their stream.
+ */
+void run(std::string const& filename, int num_shots)
+{
+    // Load the input module and wrap it in an executor
+    Executor execute{Module{filename}};
+
+    // Quantum interface backed by qsim
+    qsimQuantum sim(std::cout, num_shots);
+
+    // Runtime interface (default, non-tuple variant).
+    // TODO: select qsimTupleRuntime here once per-tuple grouping is enabled.
+    std::unique_ptr<RuntimeInterface> rt
+        = std::make_unique<qsimDefaultRuntime>(std::cout, sim);
+
+    // Execute the module once per shot
+    for (int shot = num_shots; shot > 0; --shot)
+    {
+        execute(sim, *rt);
+    }
+}
+
+//---------------------------------------------------------------------------//
+}  // namespace app
+}  // namespace qiree
+
+//---------------------------------------------------------------------------//
+/*!
+ * Execute and run.
+ */
+int main(int argc, char* argv[])
+{
+    int num_shots{1};
+    std::string filename;
+
+    CLI::App app;
+
+    // The input file may be given positionally or via -i/--input
+    auto* filename_opt
+        = app.add_option("--input,-i,input", filename, "QIR input file");
+    filename_opt->required();
+
+    // Reject zero or negative shot counts at parse time: they would otherwise
+    // run nothing and still report success
+    auto* nshot_opt
+        = app.add_option("-s,--shots", num_shots, "Number of shots");
+    nshot_opt->capture_default_str();
+    nshot_opt->check(CLI::PositiveNumber);
+
+    // TODO: add a --group-tuples/--no-group-tuples flag to print per-tuple
+    // measurement statistics rather than per-qubit (uses qsimTupleRuntime)
+
+    CLI11_PARSE(app, argc, argv);
+
+    qiree::app::run(filename, num_shots);
+
+    return EXIT_SUCCESS;
+}
diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake
index f363f67..1994269 100644
--- a/cmake/FindLLVM.cmake
+++ b/cmake/FindLLVM.cmake
@@ -28,6 +28,12 @@ This module will set the following variables if found:
 
 include(FindPackageHandleStandardArgs)
 
+# On macOS, seed LLVM_DIR from the first installed Homebrew LLVM (arm64, then x86_64)
+foreach(_qiree_llvm_prefix IN ITEMS /opt/homebrew/opt/llvm /usr/local/opt/llvm)
+  if(APPLE AND NOT LLVM_DIR AND IS_DIRECTORY "${_qiree_llvm_prefix}/lib/cmake/llvm")
+    set(LLVM_DIR "${_qiree_llvm_prefix}/lib/cmake/llvm" CACHE PATH "Path to LLVM on macOS")
+  endif()
+endforeach()
 find_package(LLVM QUIET CONFIG)
 find_package_handle_standard_args(LLVM CONFIG_MODE)
 
@@ -36,6 +42,8 @@ if(LLVM_FOUND)
   target_include_directories(LLVM::headers SYSTEM INTERFACE
     "${LLVM_INCLUDE_DIRS}"
   )
+else()
+  message(WARNING "Could not find LLVM. Make sure LLVM is installed and LLVM_DIR is set.")
 endif()
 
 #-----------------------------------------------------------------------------#
diff --git a/examples/teleport.ll b/examples/teleport.ll
index 3bf36e9..184359f 100644
--- a/examples/teleport.ll
+++ b/examples/teleport.ll
@@ -6,6 +6,7 @@ source_filename = "teleport"
 
 define void @main() #0 {
 entry:
+  call void @__quantum__qis__x__body(%Qubit* null)
   call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*))
   call void @__quantum__qis__cnot__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Qubit* inttoptr (i64 2 to %Qubit*))
   call void @__quantum__qis__cnot__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*))
@@ -37,6 +38,7 @@ else2:                                            ; preds = %continue
 
 continue3:                                        ; preds = %else2, %then1
   call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Result* inttoptr (i64 2 to %Result*))
+  %2 = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 2 to %Result*))
   call void @__quantum__rt__array_record_output(i64 3, i8* null)
   call void @__quantum__rt__result_record_output(%Result* null, i8* null)
   call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null)
diff --git a/qsim/bits.h b/qsim/bits.h
new file mode 100644
index 0000000..080c866
--- /dev/null
+++ b/qsim/bits.h
@@ -0,0 +1,106 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BITS_H_
+#define BITS_H_
+
+#include <vector>
+
+#ifdef __BMI2__
+
+#include <immintrin.h>
+
+#include <cstdint>
+
+namespace qsim {
+namespace bits {
+// BMI2 builds: the helpers forward to the PDEP/PEXT intrinsics; n is unused.
+inline uint32_t ExpandBits(uint32_t bits, unsigned n, uint32_t mask) {
+  return _pdep_u32(bits, mask);  // deposit low bits of `bits` at the set bits of mask
+}
+
+inline uint64_t ExpandBits(uint64_t bits, unsigned n, uint64_t mask) {
+  return _pdep_u64(bits, mask);  // 64-bit variant of the above
+}
+
+inline uint32_t CompressBits(uint32_t bits, unsigned n, uint32_t mask) {
+  return _pext_u32(bits, mask);  // gather the bits selected by mask into the low bits
+}
+
+inline uint64_t CompressBits(uint64_t bits, unsigned n, uint64_t mask) {
+  return _pext_u64(bits, mask);  // 64-bit variant of the above
+}
+
+}  // namespace bits
+}  // namespace qsim
+
+#else  // __BMI2__
+
+namespace qsim {
+namespace bits {
+// Portable implementations, compiled when __BMI2__ is not defined.
+template <typename Integer>
+inline Integer ExpandBits(Integer bits, unsigned n, Integer mask) {
+  Integer ebits = 0;
+  unsigned k = 0;  // index of the next bit of `bits` to consume
+  // Scatter the low bits of `bits` to the set positions among the low n bits of mask.
+  for (unsigned i = 0; i < n; ++i) {
+    if ((mask >> i) & 1) {
+      ebits |= ((bits >> k) & 1) << i;  // source bit k goes to position i
+      ++k;
+    }
+  }
+
+  return ebits;
+}
+
+template <typename Integer>
+inline Integer CompressBits(Integer bits, unsigned n, Integer mask) {
+  Integer sbits = 0;
+  unsigned k = 0;  // index of the next output bit to fill
+  // Gather the bits of `bits` selected by the low n bits of mask into the low bits.
+  for (unsigned i = 0; i < n; ++i) {
+    if ((mask >> i) & 1) {
+      sbits |= ((bits >> i) & 1) << k;  // selected bit i goes to position k
+      ++k;
+    }
+  }
+
+  return sbits;
+}
+
+}  // namespace bits
+}  // namespace qsim
+
+#endif  // __BMI2__
+
+namespace qsim {
+namespace bits {
+// Moves bit i of `bits` to position perm[i] for each i in [0, n); perm needs at least n entries.
+template <typename Integer>
+inline Integer PermuteBits(
+    Integer bits, unsigned n, const std::vector<unsigned>& perm) {
+  Integer pbits = 0;
+
+  for (unsigned i = 0; i < n; ++i) {
+    pbits |= ((bits >> i) & 1) << perm[i];  // bits mapped to the same position are OR-ed
+  }
+
+  return pbits;
+}
+
+}  // namespace bits
+}  // namespace qsim
+
+#endif  // BITS_H_
diff --git a/qsim/bitstring.h b/qsim/bitstring.h
new file mode 100644
index 0000000..b95584b
--- /dev/null
+++ b/qsim/bitstring.h
@@ -0,0 +1,97 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BITSTRING_H_
+#define BITSTRING_H_
+
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace qsim {
+
+using Bitstring = uint64_t;
+
+/**
+ * Reads bitstrings (representing initialized or measured states of qubits)
+ * from a provided stream object and stores them in a vector.
+ * @param num_qubits Number of qubits represented in each bitstring.
+ * @param provider Source of bitstrings; only used for error reporting.
+ * @param fs The stream to read bitstrings from.
+ * @param bitstrings Output vector of bitstrings. On success, this will contain
+ *   all bitstrings read in from 'fs'.
+ * @return True if reading succeeded; false otherwise.
+ */
+template <typename IO, typename Stream>
+bool BitstringsFromStream(unsigned num_qubits, const std::string& provider,
+                          Stream& fs, std::vector<Bitstring>& bitstrings) {
+  bitstrings.resize(0);  // discard any previous contents
+  bitstrings.reserve(100000);
+
+  // Bitstrings are in text format. One bitstring per line.
+
+  do {
+    char buf[128];
+    fs.getline(buf, 128);
+
+    if (fs) {
+      Bitstring b{0};
+      // NOTE(review): Bitstring is 64 bits wide; shifting by p >= 64 below looks undefined - confirm.
+      unsigned p = 0;  // number of leading '0'/'1' characters consumed
+      while (p < 128 && (buf[p] == '0' || buf[p] == '1')) {
+        b |= uint64_t(buf[p] - '0') << p;  // character p becomes bit p (first character is bit 0)
+        ++p;
+      }
+
+      if (p != num_qubits) {
+        IO::errorf("wrong bitstring length in %s: "
+                   "got %u; should be %u.\n", provider.c_str(), p, num_qubits);
+        bitstrings.resize(0);  // do not return partial results
+        return false;
+      }
+
+      bitstrings.push_back(b);
+    }
+  } while (fs);  // stop once the stream is no longer in a good state
+
+  return true;
+}
+
+/**
+ * Reads bitstrings (representing initialized or measured states of qubits)
+ * from the given file and stores them in a vector.
+ * @param num_qubits Number of qubits represented in each bitstring.
+ * @param file The name of the file to read bitstrings from.
+ * @param bitstrings Output vector of bitstrings. On success, this will contain
+ *   all bitstrings read in from 'file'.
+ * @return True if reading succeeded; false otherwise.
+ */
+template <typename IO>
+inline bool BitstringsFromFile(unsigned num_qubits, const std::string& file,
+                               std::vector<Bitstring>& bitstrings) {
+  auto fs = IO::StreamFromFile(file);
+  // The file name doubles as the `provider` label used in error messages.
+  if (!fs) {
+    return false;  // bitstrings is left untouched when the stream is unusable
+  } else {
+    bool rc = BitstringsFromStream<IO>(num_qubits, file, fs, bitstrings);
+    IO::CloseStream(fs);  // closed on both success and parse failure
+    return rc;
+  }
+}
+
+}  // namespace qsim
+
+#endif  // BITSTRING_H_
diff --git a/qsim/channel.h b/qsim/channel.h
new file mode 100644
index 0000000..372a174
--- /dev/null
+++ b/qsim/channel.h
@@ -0,0 +1,149 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CHANNEL_H_
+#define CHANNEL_H_
+
+#include <set>
+#include <vector>
+
+#include "gate.h"
+#include "matrix.h"
+
+namespace qsim {
+
+/**
+ * Kraus operator.
+ */
+template <typename Gate>
+struct KrausOperator {
+  using fp_type = typename Gate::fp_type;
+
+  enum Kind {
+    kNormal = 0,
+    kMeasurement = gate::kMeasurement,  // shares its value with the gate-kind constant
+  };
+
+  /**
+   * Kraus operator type.
+   */
+  Kind kind;
+
+  /**
+   * If true, the Kraus operator is a unitary operator times a constant.
+   */
+  bool unitary;
+
+  /**
+   * Lower bound on Kraus operator probability.
+   */
+  double prob;
+
+  /**
+   * Sequence of operations that represent the Kraus operator. This can be just
+   * one operation.
+   */
+  std::vector<Gate> ops;
+
+  /**
+   * Product of K^\dagger and K. This can be empty if unitary = true.
+   */
+  Matrix<fp_type> kd_k;
+
+  /**
+   * Qubits kd_k acts on. This can be empty if unitary = true.
+   */
+  std::vector<unsigned> qubits;
+
+  /**
+   * Calculates the product of "K^\dagger K". Sets qubits "K^\dagger K" acts on.
+   */
+  void CalculateKdKMatrix() {
+    if (ops.size() == 1) {  // single operation: use its matrix and qubits directly
+      kd_k = ops[0].matrix;
+      MatrixDaggerMultiply(ops[0].qubits.size(), ops[0].matrix, kd_k);
+      qubits = ops[0].qubits;
+    } else if (ops.size() > 1) {  // several operations: combine them on the union of their qubits
+      std::set<unsigned> qubit_map;  // sorted, de-duplicated qubit indices
+
+      for (const auto& op : ops) {
+        for (unsigned q : op.qubits) {
+          qubit_map.insert(q);
+        }
+      }
+
+      unsigned num_qubits = qubit_map.size();
+
+      qubits.resize(0);
+      qubits.reserve(num_qubits);
+
+      for (auto it = qubit_map.begin(); it != qubit_map.end(); ++it) {
+        qubits.push_back(*it);  // ascending order, as iterated from the set
+      }
+
+      MatrixIdentity(unsigned{1} << num_qubits, kd_k);  // presumably resets kd_k to a 2^num_qubits identity
+
+      for (const auto& op : ops) {
+        if (op.qubits.size() == num_qubits) {
+          MatrixMultiply(num_qubits, op.matrix, kd_k);
+        } else {
+          unsigned mask = 0;  // bit i is set when op acts on qubits[i]
+
+          for (auto q : op.qubits) {
+            for (unsigned i = 0; i < num_qubits; ++i) {
+              if (q == qubits[i]) {
+                mask |= unsigned{1} << i;
+                break;
+              }
+            }
+          }
+
+          MatrixMultiply(mask, op.qubits.size(), op.matrix, num_qubits, kd_k);
+        }
+      }
+
+      auto m = kd_k;  // copy of the accumulated product, passed alongside kd_k below
+      MatrixDaggerMultiply(num_qubits, m, kd_k);
+    }  // when ops is empty, kd_k and qubits are left unchanged
+  }
+};
+
+/**
+ * Quantum channel.
+ */
+template <typename Gate>
+using Channel = std::vector<KrausOperator<Gate>>;
+
+/**
+ * Makes a channel from the gate.
+ * @param time The time to place the channel at.
+ * @param gate The input gate.
+ * @return The output channel.
+ */
+template <typename Gate>
+Channel<Gate> MakeChannelFromGate(unsigned time, const Gate& gate) {
+  auto normal = KrausOperator<Gate>::kNormal;
+  auto measurement = KrausOperator<Gate>::kMeasurement;
+
+  auto kind = gate.kind == gate::kMeasurement ? measurement : normal;  // only measurement gates get the measurement kind
+
+  Channel<Gate> channel = {{kind, true, 1, {gate}}};  // one operator: unitary = true, prob = 1, ops = {gate}
+  channel[0].ops[0].time = time;  // the stored copy of the gate is re-timed; the argument is not modified
+
+  return channel;
+}
+
+}  // namespace qsim
+
+#endif  // CHANNEL_H_
diff --git a/qsim/channels_cirq.h b/qsim/channels_cirq.h
new file mode 100644
index 0000000..69f1df9
--- /dev/null
+++ b/qsim/channels_cirq.h
@@ -0,0 +1,471 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CHANNELS_CIRQ_H_
+#define CHANNELS_CIRQ_H_
+
+#include <cmath>
+#include <cstdint>
+#include <vector>
+
+#include "channel.h"
+#include "gates_cirq.h"
+
+namespace qsim {
+
+namespace Cirq {
+
+template <typename fp_type>  // Channel whose Kraus operators hold Cirq gates.
+using Channel = qsim::Channel<GateCirq<fp_type>>;
+
+/**
+ * Asymmetric depolarizing channel factory.
+ */
+template <typename fp_type>
+struct AsymmetricDepolarizingChannel {
+  static constexpr char name[] = "asymmetric_depolarize";
+
+  AsymmetricDepolarizingChannel(double p_x, double p_y, double p_z)
+      : p_x(p_x), p_y(p_y), p_z(p_z) {}
+
+  // One qubit: no gate, X, Y or Z with probability p1, p_x, p_y or p_z.
+  static Channel<fp_type> Create(unsigned time, unsigned q,
+                                 double p_x, double p_y, double p_z) {
+    double p1 = 1 - p_x - p_y - p_z;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+
+    return {{normal, 1, p1, {}},
+            {normal, 1, p_x, {X<fp_type>::Create(time, q)}},
+            {normal, 1, p_y, {Y<fp_type>::Create(time, q)}},
+            {normal, 1, p_z, {Z<fp_type>::Create(time, q)}}};
+  }
+
+  // Several qubits: one Kraus operator per Pauli string over `qubits`.
+  static Channel<fp_type> Create(unsigned time,
+                                 const std::vector<unsigned>& qubits,
+                                 double p_x, double p_y, double p_z) {
+    double p1 = 1 - p_x - p_y - p_z;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+
+    uint64_t size = uint64_t{1} << (2 * qubits.size());
+
+    Channel<fp_type> channel;
+    channel.reserve(size);
+
+    for (uint64_t i = 0; i < size; ++i) {
+      channel.push_back({normal, 1, 0, {}});
+      auto& kop = channel.back();
+      kop.ops.reserve(qubits.size());
+
+      double prob = 1;
+
+      for (unsigned q = 0; q < qubits.size(); ++q) {
+        // Bits 2q and 2q+1 of i select the Pauli acting on qubits[q].
+        unsigned pauli_index = (i >> (2 * q)) & 3;
+        // q is only a position in `qubits`; gates must target qubits[q].
+        switch (pauli_index) {
+        case 0:
+          prob *= p1;
+          break;
+        case 1:
+          prob *= p_x;
+          kop.ops.push_back(X<fp_type>::Create(time, qubits[q]));
+          break;
+        case 2:
+          prob *= p_y;
+          kop.ops.push_back(Y<fp_type>::Create(time, qubits[q]));
+          break;
+        case 3:
+          prob *= p_z;
+          kop.ops.push_back(Z<fp_type>::Create(time, qubits[q]));
+          break;
+        }
+      }
+
+      kop.prob = prob;
+    }
+
+    return channel;
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, p_x, p_y, p_z);
+  }
+
+  Channel<fp_type> Create(
+      unsigned time, const std::vector<unsigned>& qubits) const {
+    return Create(time, qubits, p_x, p_y, p_z);
+  }
+
+  double p_x = 0;
+  double p_y = 0;
+  double p_z = 0;
+};
+
+/**
+ * Builds an asymmetric depolarizing channel factory from p_x, p_y and p_z.
+ */
+template <typename fp_type>
+inline AsymmetricDepolarizingChannel<fp_type> asymmetric_depolarize(
+    double p_x, double p_y, double p_z) {
+  return {p_x, p_y, p_z};
+}
+
+/**
+ * Depolarizing channel factory.
+ */
+template <typename fp_type>
+struct DepolarizingChannel {
+  static constexpr char name[] = "depolarize";
+
+  DepolarizingChannel(double p) : p(p) {}
+
+  // One qubit: no gate with probability 1 - p; X, Y or Z with p / 3 each.
+  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
+    double p1 = 1 - p;
+    double p2 = p / 3;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+
+    return {{normal, 1, p1, {}},
+            {normal, 1, p2, {X<fp_type>::Create(time, q)}},
+            {normal, 1, p2, {Y<fp_type>::Create(time, q)}},
+            {normal, 1, p2, {Z<fp_type>::Create(time, q)}}};
+  }
+
+  // Several qubits: one Kraus operator per Pauli string over `qubits`.
+  static Channel<fp_type> Create(
+      unsigned time, const std::vector<unsigned>& qubits, double p) {
+    double p1 = 1 - p;
+    double p2 = p / 3;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+
+    uint64_t size = uint64_t{1} << (2 * qubits.size());
+
+    Channel<fp_type> channel;
+    channel.reserve(size);
+
+    for (uint64_t i = 0; i < size; ++i) {
+      channel.push_back({normal, 1, 0, {}});
+      auto& kop = channel.back();
+      kop.ops.reserve(qubits.size());
+
+      double prob = 1;
+
+      for (unsigned q = 0; q < qubits.size(); ++q) {
+        // Bits 2q and 2q+1 of i select the Pauli acting on qubits[q].
+        unsigned pauli_index = (i >> (2 * q)) & 3;
+        // q is only a position in `qubits`; gates must target qubits[q].
+        switch (pauli_index) {
+        case 0:
+          prob *= p1;
+          break;
+        case 1:
+          prob *= p2;
+          kop.ops.push_back(X<fp_type>::Create(time, qubits[q]));
+          break;
+        case 2:
+          prob *= p2;
+          kop.ops.push_back(Y<fp_type>::Create(time, qubits[q]));
+          break;
+        case 3:
+          prob *= p2;
+          kop.ops.push_back(Z<fp_type>::Create(time, qubits[q]));
+          break;
+        }
+      }
+
+      kop.prob = prob;
+    }
+
+    return channel;
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, p);
+  }
+
+  Channel<fp_type> Create(
+      unsigned time, const std::vector<unsigned>& qubits) const {
+    return Create(time, qubits, p);
+  }
+
+  double p = 0;
+};
+
+/**
+ * Builds a depolarizing channel factory object that stores p.
+ */
+template <typename fp_type>
+inline DepolarizingChannel<fp_type> depolarize(double p) {
+  return {p};
+}
+
+/**
+ * Generalized amplitude damping channel factory.
+ */
+template <typename fp_type>
+struct GeneralizedAmplitudeDampingChannel {
+  static constexpr char name[] = "generalized_amplitude_damp";
+
+  GeneralizedAmplitudeDampingChannel(double p, double gamma)
+      : p(p), gamma(gamma) {}
+
+  static Channel<fp_type> Create(
+      unsigned time, unsigned q, double p, double gamma) {
+    using M = Cirq::MatrixGate1<fp_type>;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+    // Third field of each operator; zero for the last two operators.
+    double prob_a = p * (1 - gamma);
+    double prob_b = (1 - p) * (1 - gamma);
+    double prob_rest = 0;
+    // Square roots that appear as the non-zero matrix entries below.
+    fp_type rp = std::sqrt(p);
+    fp_type rp_keep = std::sqrt(p * (1 - gamma));
+    fp_type rp_damp = std::sqrt(p * gamma);
+    fp_type rn = std::sqrt(1 - p);
+    fp_type rn_keep = std::sqrt((1 - p) * (1 - gamma));
+    fp_type rn_damp = std::sqrt((1 - p) * gamma);
+
+    return {{normal, 0, prob_a,
+             {M::Create(time, q, {rp, 0, 0, 0, 0, 0, rp_keep, 0})},
+             {rp * rp, 0, 0, 0, 0, 0, rp_keep * rp_keep, 0}, {q},
+            },
+            {normal, 0, prob_b,
+             {M::Create(time, q, {rn_keep, 0, 0, 0, 0, 0, rn, 0})},
+             {rn_keep * rn_keep, 0, 0, 0, 0, 0, rn * rn, 0}, {q},
+            },
+            {normal, 0, prob_rest,
+             {M::Create(time, q, {0, 0, rp_damp, 0, 0, 0, 0, 0})},
+             {0, 0, 0, 0, 0, 0, rp_damp * rp_damp, 0}, {q},
+            },
+            {normal, 0, prob_rest,
+             {M::Create(time, q, {0, 0, 0, 0, rn_damp, 0, 0, 0})},
+             {rn_damp * rn_damp, 0, 0, 0, 0, 0, 0, 0}, {q},
+            },
+           };
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, p, gamma);
+  }
+
+  double p = 1;
+  double gamma = 0;
+};
+
+/**
+ * Builds a generalized amplitude damping channel factory from p and gamma.
+ */
+template <typename fp_type>
+inline GeneralizedAmplitudeDampingChannel<fp_type> generalized_amplitude_damp(
+    double p, double gamma) {
+  return {p, gamma};
+}
+
+/**
+ * Amplitude damping channel factory.
+ */
+template <typename fp_type>
+struct AmplitudeDampingChannel {
+  static constexpr char name[] = "amplitude_damp";
+
+  AmplitudeDampingChannel(double gamma) : gamma(gamma) {}
+
+  static Channel<fp_type> Create(unsigned time, unsigned q, double gamma) {
+    using M = Cirq::MatrixGate1<fp_type>;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+    // Third field of each operator; zero for the second one.
+    double prob_keep = 1 - gamma;
+    double prob_damp = 0;
+    // Square roots used as matrix entries below.
+    fp_type amp_keep = std::sqrt(prob_keep);
+    fp_type amp_damp = std::sqrt(gamma);
+
+    return {{normal, 0, prob_keep,
+             {M::Create(time, q, {1, 0, 0, 0, 0, 0, amp_keep, 0})},
+             {1, 0, 0, 0, 0, 0, amp_keep * amp_keep, 0}, {q},
+            },
+            {normal, 0, prob_damp,
+             {M::Create(time, q, {0, 0, amp_damp, 0, 0, 0, 0, 0})},
+             {0, 0, 0, 0, 0, 0, amp_damp * amp_damp, 0}, {q},
+            },
+           };
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, gamma);
+  }
+
+  double gamma = 0;
+};
+
+/**
+ * Builds an amplitude damping channel factory object that stores gamma.
+ */
+template <typename fp_type>
+inline AmplitudeDampingChannel<fp_type> amplitude_damp(double gamma) {
+  return {gamma};
+}
+
+/**
+ * Phase damping channel factory.
+ */
+template <typename fp_type>
+struct PhaseDampingChannel {
+  static constexpr char name[] = "phase_damp";
+
+  PhaseDampingChannel(double gamma) : gamma(gamma) {}
+
+  static Channel<fp_type> Create(unsigned time, unsigned q, double gamma) {
+    using M = Cirq::MatrixGate1<fp_type>;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+    // Third field of each operator; zero for the second one.
+    double p1 = 1 - gamma;
+    double p2 = 0;
+    // Square roots used as matrix entries below.
+    fp_type r = std::sqrt(p1);
+    fp_type s = std::sqrt(gamma);
+
+    return {{normal, 0, p1,
+             {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})},
+             {1, 0, 0, 0, 0, 0, r * r, 0}, {q},
+            },
+            {normal, 0, p2,
+             {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})},
+             {0, 0, 0, 0, 0, 0, s * s, 0}, {q},
+            },
+           };
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, gamma);
+  }
+
+  double gamma = 0;
+};
+
+/**
+ * Builds a phase damping channel factory object that stores gamma.
+ */
+template <typename fp_type>
+inline PhaseDampingChannel<fp_type> phase_damp(double gamma) {
+  return {gamma};
+}
+
+/**
+ * Reset channel factory.
+ */
+template <typename fp_type>
+struct ResetChannel {
+  static constexpr char name[] = "reset";
+
+  static Channel<fp_type> Create(unsigned time, unsigned q) {
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+    using Gate1 = Cirq::MatrixGate1<fp_type>;
+    // Both operators carry 0 in their second and third fields.
+    return {{normal, 0, 0,
+             {Gate1::Create(time, q, {1, 0, 0, 0, 0, 0, 0, 0})},
+             {1, 0, 0, 0, 0, 0, 0, 0}, {q},
+            },
+            {normal, 0, 0,
+             {Gate1::Create(time, q, {0, 0, 1, 0, 0, 0, 0, 0})},
+             {0, 0, 0, 0, 0, 0, 1, 0}, {q},
+            },
+           };
+  }
+};
+
+/**
+ * Builds a reset channel factory object; the factory holds no parameters.
+ */
+template <typename fp_type>
+inline ResetChannel<fp_type> reset() {
+  return {};
+}
+
+/**
+ * Phase flip channel factory.
+ */
+template <typename fp_type>
+struct PhaseFlipChannel {
+  static constexpr char name[] = "phase_flip";
+
+  PhaseFlipChannel(double p) : p(p) {}
+
+  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+    // The first operator has no gates; the second one applies Z.
+    double prob_same = 1 - p;
+    double prob_flip = p;
+
+    return {{normal, 1, prob_same, {}},
+            {normal, 1, prob_flip, {Z<fp_type>::Create(time, q)}}
+           };
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, p);
+  }
+
+  double p = 0;
+};
+
+/**
+ * Builds a phase flip channel factory object that stores p.
+ */
+template <typename fp_type>
+inline PhaseFlipChannel<fp_type> phase_flip(double p) {
+  return {p};
+}
+
+/**
+ * Bit flip channel factory.
+ */
+template <typename fp_type>
+struct BitFlipChannel {
+  static constexpr char name[] = "bit_flip";
+
+  BitFlipChannel(double p) : p(p) {}
+
+  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+    // The first operator has no gates; the second one applies X.
+    double prob_same = 1 - p;
+    double prob_flip = p;
+
+    return {{normal, 1, prob_same, {}},
+            {normal, 1, prob_flip, {X<fp_type>::Create(time, q)}}
+           };
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, p);
+  }
+
+  double p = 0;
+};
+
+/**
+ * Builds a bit flip channel factory object that stores p.
+ */
+template <typename fp_type>
+inline BitFlipChannel<fp_type> bit_flip(double p) {
+  return {p};
+}
+
+}  // namespace Cirq
+
+}  // namespace qsim
+
+#endif  // CHANNELS_CIRQ_H_
diff --git a/qsim/channels_qsim.h b/qsim/channels_qsim.h
new file mode 100644
index 0000000..5c07bcc
--- /dev/null
+++ b/qsim/channels_qsim.h
@@ -0,0 +1,117 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CHANNELS_QSIM_H_
+#define CHANNELS_QSIM_H_
+
+#include <cmath>
+#include <cstdint>
+#include <vector>
+
+#include "channel.h"
+#include "gates_qsim.h"
+
+namespace qsim {
+
+/**
+ * Amplitude damping channel factory.
+ */
+template <typename fp_type>
+struct AmplitudeDampingChannel {
+  AmplitudeDampingChannel(double gamma) : gamma(gamma) {}
+
+  static Channel<GateQSim<fp_type>> Create(
+      unsigned time, unsigned q, double gamma) {
+    using M = GateMatrix1<fp_type>;
+    auto normal = KrausOperator<GateQSim<fp_type>>::kNormal;
+    // Third field of each operator; zero for the second one.
+    double prob_keep = 1 - gamma;
+    double prob_damp = 0;
+    // Square roots used as matrix entries below.
+    fp_type amp_keep = std::sqrt(prob_keep);
+    fp_type amp_damp = std::sqrt(gamma);
+
+    return {{normal, 0, prob_keep,
+             {M::Create(time, q, {1, 0, 0, 0, 0, 0, amp_keep, 0})},
+             {1, 0, 0, 0, 0, 0, amp_keep * amp_keep, 0}, {q},
+            },
+            {normal, 0, prob_damp,
+             {M::Create(time, q, {0, 0, amp_damp, 0, 0, 0, 0, 0})},
+             {0, 0, 0, 0, 0, 0, amp_damp * amp_damp, 0}, {q},
+            },
+           };
+  }
+
+  Channel<GateQSim<fp_type>> Create(unsigned time, unsigned q) const {
+    return Create(time, q, gamma);
+  }
+
+  double gamma = 0;
+};
+
+/**
+ * Builds an amplitude damping channel factory object that stores gamma.
+ */
+template <typename fp_type>
+inline AmplitudeDampingChannel<fp_type> amplitude_damp(double gamma) {
+  return {gamma};
+}
+
+/**
+ * Phase damping channel factory.
+ */
+template <typename fp_type>
+struct PhaseDampingChannel {
+  PhaseDampingChannel(double gamma) : gamma(gamma) {}
+
+  static Channel<GateQSim<fp_type>> Create(
+      unsigned time, unsigned q, double gamma) {
+    using M = GateMatrix1<fp_type>;
+    auto normal = KrausOperator<GateQSim<fp_type>>::kNormal;
+    // Third field of each operator; zero for the second one.
+    double prob_keep = 1 - gamma;
+    double prob_damp = 0;
+    // Square roots used as matrix entries below.
+    fp_type amp_keep = std::sqrt(prob_keep);
+    fp_type amp_damp = std::sqrt(gamma);
+
+    return {{normal, 0, prob_keep,
+             {M::Create(time, q, {1, 0, 0, 0, 0, 0, amp_keep, 0})},
+             {1, 0, 0, 0, 0, 0, amp_keep * amp_keep, 0}, {q},
+            },
+            {normal, 0, prob_damp,
+             {M::Create(time, q, {0, 0, 0, 0, 0, 0, amp_damp, 0})},
+             {0, 0, 0, 0, 0, 0, amp_damp * amp_damp, 0}, {q},
+            },
+           };
+  }
+
+  Channel<GateQSim<fp_type>> Create(unsigned time, unsigned q) const {
+    return Create(time, q, gamma);
+  }
+
+  double gamma = 0;
+};
+
+/**
+ * Builds a phase damping channel factory object that stores gamma.
+ */
+template <typename fp_type>
+inline PhaseDampingChannel<fp_type> phase_damp(double gamma) {
+  return {gamma};
+}
+
+}  // namespace qsim
+
+#endif  // CHANNELS_QSIM_H_
diff --git a/qsim/circuit.h b/qsim/circuit.h
new file mode 100644
index 0000000..59018ee
--- /dev/null
+++ b/qsim/circuit.h
@@ -0,0 +1,36 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CIRCUIT_H_
+#define CIRCUIT_H_
+
+#include <vector>
+
+namespace qsim {
+
+/**
+ * A collection of gates. This object is consumed by `QSim[h]Runner.Run()`.
+ */
+template <typename Gate>
+struct Circuit {
+  unsigned num_qubits;  // Number of qubits; not initialized by default.
+  /**
+   * The set of gates to be run. Gate times should be ordered.
+   */
+  std::vector<Gate> gates;
+};
+
+}  // namespace qsim
+
+#endif  // CIRCUIT_H_
diff --git a/qsim/circuit_noisy.h b/qsim/circuit_noisy.h
new file mode 100644
index 0000000..40a228d
--- /dev/null
+++ b/qsim/circuit_noisy.h
@@ -0,0 +1,108 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CIRCUIT_NOISY_H_
+#define CIRCUIT_NOISY_H_
+
+#include <vector>
+
+#include "circuit.h"
+#include "channel.h"
+
+namespace qsim {
+
+/**
+ * Noisy circuit: a qubit count together with a sequence of channels.
+ */
+template <typename Gate>
+struct NoisyCircuit {
+  unsigned num_qubits;  // Number of qubits; not initialized by default.
+  std::vector<Channel<Gate>> channels;
+};
+
+template <typename Gate>  // Const iterator over a vector of channels.
+using ncircuit_iterator = typename std::vector<Channel<Gate>>::const_iterator;
+
+/**
+ * Makes a noisy circuit from the clean circuit.
+ * Channels are added after each qubit of each gate of the clean circuit.
+ * Roughly equivalent to cirq.Circuit.with_noise.
+ * @param num_qubits The number of circuit qubits.
+ * @param gbeg, gend The iterator range [gbeg, gend) of circuit gates.
+ * @param channel_factory A channel factory to construct channels.
+ * @return The output noisy circuit.
+ */
+template <typename Gate, typename ChannelFactory>
+inline NoisyCircuit<Gate> MakeNoisy(
+    unsigned num_qubits,
+    typename std::vector<Gate>::const_iterator gbeg,
+    typename std::vector<Gate>::const_iterator gend,
+    const ChannelFactory& channel_factory) {
+  NoisyCircuit<Gate> ncircuit;
+  ncircuit.num_qubits = num_qubits;
+
+  auto& channels = ncircuit.channels;
+  channels.reserve(4 * std::size_t(gend - gbeg));
+
+  for (auto it = gbeg; it != gend; ++it) {
+    // Gates go to even times, their noise channels to the next odd time.
+    const auto gate_time = 2 * it->time;
+    const auto noise_time = gate_time + 1;
+    channels.push_back(MakeChannelFromGate(gate_time, *it));
+    for (auto q : it->qubits) {
+      channels.push_back(channel_factory.Create(noise_time, q));
+    }
+    for (auto q : it->controlled_by) {
+      channels.push_back(channel_factory.Create(noise_time, q));
+    }
+  }
+
+  return ncircuit;
+}
+
+/**
+ * Makes a noisy circuit from the clean circuit.
+ * Channels are added after each qubit of each gate of the clean circuit.
+ * Roughly equivalent to cirq.Circuit.with_noise.
+ * @param num_qubits The number of circuit qubits.
+ * @param gates The circuit gates.
+ * @param channel_factory A channel factory to construct channels.
+ * @return The output noisy circuit.
+ */
+template <typename Gate, typename ChannelFactory>
+inline NoisyCircuit<Gate> MakeNoisy(unsigned num_qubits,
+                                    const std::vector<Gate>& gates,
+                                    const ChannelFactory& channel_factory) {
+  return MakeNoisy<Gate>(
+      num_qubits, gates.cbegin(), gates.cend(), channel_factory);
+}
+
+/**
+ * Makes a noisy circuit from the clean circuit.
+ * Channels are added after each qubit of each gate of the clean circuit.
+ * Roughly equivalent to cirq.Circuit.with_noise.
+ * @param circuit The input circuit.
+ * @param channel_factory A channel factory to construct channels.
+ * @return The output noisy circuit.
+ */
+template <typename Gate, typename ChannelFactory>
+inline NoisyCircuit<Gate> MakeNoisy(const Circuit<Gate>& circuit,
+                                    const ChannelFactory& channel_factory) {
+  return MakeNoisy<Gate>(circuit.num_qubits, circuit.gates.cbegin(),
+                         circuit.gates.cend(), channel_factory);
+}
+
+}  // namespace qsim
+
+#endif  // CIRCUIT_NOISY_H_
diff --git a/qsim/circuit_qsim_parser.h b/qsim/circuit_qsim_parser.h
new file mode 100644
index 0000000..de7bd89
--- /dev/null
+++ b/qsim/circuit_qsim_parser.h
@@ -0,0 +1,442 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CIRCUIT_QSIM_PARSER_H_
+#define CIRCUIT_QSIM_PARSER_H_
+
+#include <algorithm>
+#include <cctype>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#include "circuit.h"
+#include "gates_qsim.h"
+
+namespace qsim {
+
+/**
+ * Parser for the (deprecated) qsim <a href="https://github.com/quantumlib/qsim/blob/master/docs/input_format.md">file input format</a>.
+ * The primary supported interface for designing circuits to simulate with qsim
+ * is <a href="https://github.com/quantumlib/Cirq">Cirq</a>, which relies on
+ * the Python-based qsimcirq interface. For C++ applications, Cirq gates can be
+ * explicitly constructed in code.
+ */
+template <typename IO>
+class CircuitQsimParser final {
+ public:
+  /**
+   * Parses the given input stream into a Circuit object, following the rules
+   * defined in "docs/input_format.md".
+   * @param maxtime Maximum gate "time" (inclusive); reading stops past it.
+   * @param provider Circuit source; only used for error reporting.
+   * @param fs The stream to read the circuit from.
+   * @param circuit Output circuit object. If parsing is successful, this will
+   *   contain the circuit defined in 'fs'.
+   * @return True if parsing succeeds; false otherwise.
+   */
+  template <typename Stream, typename fp_type>
+  static bool FromStream(unsigned maxtime, const std::string& provider,
+                         Stream& fs, Circuit<GateQSim<fp_type>>& circuit) {
+    circuit.num_qubits = 0;
+
+    circuit.gates.resize(0);
+    circuit.gates.reserve(1024);
+
+    unsigned k = 0;
+
+    std::string line;
+    line.reserve(128);
+
+    unsigned time;
+    std::string gate_name;
+    gate_name.reserve(16);
+
+    unsigned max_time = 0;
+    unsigned prev_mea_time = 0;
+    // Time of the latest gate seen on each qubit; unsigned(-1) means none.
+    std::vector<unsigned> last_times;
+
+    while (std::getline(fs, line)) {
+      ++k;
+
+      if (line.size() == 0 || line[0] == '#') continue;
+
+      std::stringstream ss(line);
+      // Until it is set, the next data line must give the number of qubits.
+      if (circuit.num_qubits == 0) {
+        ss >> circuit.num_qubits;
+        if (circuit.num_qubits == 0) {
+          IO::errorf("invalid number of qubits in %s in line %u.\n",
+                     provider.c_str(), k);
+          return false;
+        }
+
+        last_times.resize(circuit.num_qubits, unsigned(-1));
+
+        continue;
+      }
+
+      ss >> time >> gate_name;
+
+      if (!ss) {
+        InvalidGateError(provider, k);
+        return false;
+      }
+      // Stop reading (successfully) at the first gate beyond maxtime.
+      if (time > maxtime) {
+        break;
+      }
+
+      if (gate_name == "c") {
+        if (!ParseControlledGate<fp_type>(ss, time,
+                                          circuit.num_qubits, circuit.gates)) {
+          InvalidGateError(provider, k);
+          return false;
+        }
+      } else if (!ParseGate<fp_type>(ss, time, circuit.num_qubits,
+                                     gate_name, circuit.gates)) {
+        InvalidGateError(provider, k);
+        return false;
+      }
+
+      const auto& gate = circuit.gates.back();
+      // Measurement times act as barriers that other gates may not cross.
+      if (time < prev_mea_time
+          || (gate.kind == gate::kMeasurement && time < max_time)) {
+        IO::errorf("gate crosses the time boundary set by measurement "
+                   "gates in line %u in %s.\n", k, provider.c_str());
+        return false;
+      }
+
+      if (gate.kind == gate::kMeasurement) {
+        prev_mea_time = time;
+      }
+
+      if (GateIsOutOfOrder(time, gate.qubits, last_times)
+          || GateIsOutOfOrder(time, gate.controlled_by, last_times)) {
+        IO::errorf("gate is out of time order in line %u in %s.\n",
+                   k, provider.c_str());
+        return false;
+      }
+
+      if (time > max_time) {
+        max_time = time;
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Parses the given file into a Circuit object, following the rules defined
+   * in "docs/input_format.md".
+   * @param maxtime Maximum gate "time" to read operations for (inclusive).
+   * @param file The name of the file to read the circuit from.
+   * @param circuit Output circuit object. If parsing is successful, this will
+   *   contain the circuit defined in 'file'.
+   * @return True if the stream is usable and parsing succeeds; else false.
+   */
+  template <typename fp_type>
+  static bool FromFile(unsigned maxtime, const std::string& file,
+                       Circuit<GateQSim<fp_type>>& circuit) {
+    auto stream = IO::StreamFromFile(file);
+
+    if (!stream) return false;
+
+    // The file name is passed on as the provider used in error messages.
+    const bool parsed = FromStream(maxtime, file, stream, circuit);
+    IO::CloseStream(stream);
+
+    return parsed;
+  }
+
+ private:
+  static void InvalidGateError(const std::string& provider, unsigned line) {
+    IO::errorf("invalid gate in %s in line %u.\n", provider.c_str(), line);
+  }
+
+  /**
+   * Validates a zero-qubit gate: reads from 'ss' succeeded, nothing is left.
+   * @param ss Input stream containing the gate specification.
+   */
+  static bool ValidateGate(std::stringstream& ss) {
+    return !ss.fail() && ss.peek() == std::char_traits<char>::eof();
+  }
+
+  /**
+   * Validates a single-qubit gate parsed from 'ss' (no trailing input left).
+   * @param ss Input stream containing the gate specification.
+   * @param num_qubits Number of qubits, as defined at the start of the file.
+   * @param q0 Index of the affected qubit; must be below num_qubits.
+   */
+  static bool ValidateGate(std::stringstream& ss,
+                           unsigned num_qubits, unsigned q0) {
+    if (ss.fail() || ss.peek() != std::char_traits<char>::eof()) return false;
+    return q0 < num_qubits;
+  }
+
+  /**
+   * Validates a two-qubit gate parsed from 'ss' (no trailing input left).
+   * @param ss Input stream containing the gate specification.
+   * @param num_qubits Number of qubits, as defined at the start of the file.
+   * @param q0 Index of the first affected qubit.
+   * @param q1 Index of the second affected qubit; must differ from q0.
+   */
+  static bool ValidateGate(std::stringstream& ss,
+                           unsigned num_qubits, unsigned q0, unsigned q1) {
+    if (ss.fail() || ss.peek() != std::char_traits<char>::eof()) return false;
+    return q0 < num_qubits && q1 < num_qubits && q0 != q1;
+  }
+
+  /**
+   * Validates a multiqubit gate: no failed read on 'ss' and valid qubits.
+   * @param ss Input stream containing the gate specification.
+   * @param num_qubits Number of qubits, as defined at the start of the file.
+   * @param qubits Indices of affected qubits.
+   */
+  static bool ValidateGate(std::stringstream& ss, unsigned num_qubits,
+                           const std::vector<unsigned>& qubits) {
+    return !ss.fail() && ValidateQubits(num_qubits, qubits);
+  }
+
+  static bool ValidateControlledGate(
+      unsigned num_qubits, const std::vector<unsigned>& qubits,
+      const std::vector<unsigned>& controlled_by) {
+    if (!ValidateQubits(num_qubits, controlled_by)) return false;
+
+    // Walk both lists in step (assumes sorted input -- TODO confirm).
+    auto target = qubits.begin();
+    auto control = controlled_by.begin();
+    while (target != qubits.end() && control != controlled_by.end()) {
+      if (*target == *control) return false;
+      if (*target < *control) {
+        ++target;
+      } else {
+        ++control;
+      }
+    }
+
+    return true;
+  }
+
+  static bool ValidateQubits(unsigned num_qubits,
+                             const std::vector<unsigned>& qubits) {
+    // An empty qubit list is never valid.
+    if (qubits.empty()) return false;
+
+    // Neighbouring entries must differ; qubits should be sorted.
+    if (std::adjacent_find(qubits.begin(), qubits.end()) != qubits.end()) {
+      return false;
+    }
+
+    // Every entry has to be a valid qubit index.
+    auto in_range = [num_qubits](unsigned q) { return q < num_qubits; };
+    return std::all_of(qubits.begin(), qubits.end(), in_range);
+  }
+
+  static bool GateIsOutOfOrder(unsigned time,
+                               const std::vector<unsigned>& qubits,
+                               std::vector<unsigned>& last_times) {
+    // unsigned(-1) in last_times marks a qubit with no gate so far.
+    for (unsigned q : qubits) {
+      unsigned& last = last_times[q];
+      if (last != unsigned(-1) && time <= last) return true;
+      // Record this gate; entries set before an early return stay updated.
+      last = time;
+    }
+
+    return false;
+  }
+
+  template <typename fp_type, typename Stream, typename Gate>
+  static bool ParseGate(Stream& ss, unsigned time, unsigned num_qubits,
+                        const std::string& gate_name,
+                        std::vector<Gate>& gates) {
+    unsigned q0, q1;
+    fp_type phi, theta;
+
+    if (gate_name == "p") {
+      ss >> phi;
+      if (!ValidateGate(ss)) return false;
+      gates.push_back(GateGPh<fp_type>::Create(time, phi));
+    } else if (gate_name == "id1") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateId1<fp_type>::Create(time, q0));
+    } else if (gate_name == "h") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateHd<fp_type>::Create(time, q0));
+    } else if (gate_name == "t") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateT<fp_type>::Create(time, q0));
+    } else if (gate_name == "x") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateX<fp_type>::Create(time, q0));
+    } else if (gate_name == "y") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateY<fp_type>::Create(time, q0));
+    } else if (gate_name == "z") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateZ<fp_type>::Create(time, q0));
+    } else if (gate_name == "x_1_2") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateX2<fp_type>::Create(time, q0));
+    } else if (gate_name == "y_1_2") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateY2<fp_type>::Create(time, q0));
+    } else if (gate_name == "rx") {
+      ss >> q0 >> phi;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateRX<fp_type>::Create(time, q0, phi));
+    } else if (gate_name == "ry") {
+      ss >> q0 >> phi;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateRY<fp_type>::Create(time, q0, phi));
+    } else if (gate_name == "rz") {
+      ss >> q0 >> phi;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateRZ<fp_type>::Create(time, q0, phi));
+    } else if (gate_name == "rxy") {
+      ss >> q0 >> theta >> phi;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateRXY<fp_type>::Create(time, q0, theta, phi));
+    } else if (gate_name == "hz_1_2") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateHZ2<fp_type>::Create(time, q0));
+    } else if (gate_name == "s") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateS<fp_type>::Create(time, q0));
+    } else if (gate_name == "id2") {
+      ss >> q0 >> q1;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateId2<fp_type>::Create(time, q0, q1));
+    } else if (gate_name == "cz") {
+      ss >> q0 >> q1;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateCZ<fp_type>::Create(time, q0, q1));
+    } else if (gate_name == "cnot" || gate_name == "cx") {
+      ss >> q0 >> q1;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateCNot<fp_type>::Create(time, q0, q1));
+    } else if (gate_name == "sw") {
+      ss >> q0 >> q1;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateSwap<fp_type>::Create(time, q0, q1));
+    } else if (gate_name == "is") {
+      ss >> q0 >> q1;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateIS<fp_type>::Create(time, q0, q1));
+    } else if (gate_name == "fs") {
+      ss >> q0 >> q1 >> theta >> phi;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateFS<fp_type>::Create(time, q0, q1, theta, phi));
+    } else if (gate_name == "cp") {
+      ss >> q0 >> q1 >> phi;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateCP<fp_type>::Create(time, q0, q1, phi));
+    } else if (gate_name == "m") {
+      std::vector<unsigned> qubits;
+      qubits.reserve(num_qubits);
+
+      while (ss.good()) {
+        ss >> q0;
+        if (ss) {
+          qubits.push_back(q0);
+        } else {
+          return false;
+        }
+      }
+
+      gates.push_back(gate::Measurement<GateQSim<fp_type>>::Create(
+          time, std::move(qubits)));
+
+      if (!ValidateQubits(num_qubits, gates.back().qubits)) return false;
+    } else {
+      return false;
+    }
+
+    return true;
+  }
+
+  template <typename fp_type, typename Stream, typename Gate>
+  static bool ParseControlledGate(Stream& ss, unsigned time,
+                                  unsigned num_qubits,
+                                  std::vector<Gate>& gates) {
+    // Parses "<control qubits...> <gate name> <gate arguments...>"; the
+    // controlled gate is appended to `gates`. Returns false on any error.
+    std::vector<unsigned> controls;
+    controls.reserve(64);
+    std::string target_name;
+    target_name.reserve(16);
+
+    for (;;) {
+      // Consume blanks up to the next token.
+      while (ss.good()) {
+        if (std::isblank(ss.get())) continue;
+        ss.unget();
+        break;
+      }
+
+      // Running out of input before the gate name is a parse error.
+      if (!ss.good()) return false;
+
+      // The first token that does not start with a digit is the gate name.
+      if (!std::isdigit(ss.peek())) break;
+
+      unsigned control;
+      ss >> control;
+
+      // Every control index must be followed by a blank.
+      const bool terminated = ss.good() && std::isblank(ss.get());
+      if (!terminated) return false;
+
+      controls.push_back(control);
+    }
+
+    // At least one control qubit is required.
+    if (controls.empty()) return false;
+
+    ss >> target_name;
+    if (!ss.good()) return false;
+
+    // The rest of the line is an ordinary gate description.
+    if (!ParseGate<fp_type>(ss, time, num_qubits, target_name, gates)) {
+      return false;
+    }
+
+    auto& parsed = gates.back();
+    parsed.ControlledBy(std::move(controls));
+
+    if (!ValidateControlledGate(num_qubits, parsed.qubits,
+                                parsed.controlled_by)) {
+      return false;
+    }
+
+    return true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // CIRCUIT_QSIM_PARSER_H_
diff --git a/qsim/cuda2hip.h b/qsim/cuda2hip.h
new file mode 100644
index 0000000..da2d074
--- /dev/null
+++ b/qsim/cuda2hip.h
@@ -0,0 +1,61 @@
+// Copyright 2023 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUDA2HIP_H_
+#define SIMULATOR_CUDA2HIP_H_
+
+#define cublasCaxpy              hipblasCaxpy
+#define cublasCdotc              hipblasCdotc
+#define cublasCreate             hipblasCreate
+#define cublasCscal              hipblasCscal
+#define cublasCsscal             hipblasCsscal
+#define cublasDestroy            hipblasDestroy
+#define cublasDznrm2             hipblasDznrm2
+#define cublasHandle_t           hipblasHandle_t
+#define cublasScnrm2             hipblasScnrm2
+#define CUBLAS_STATUS_SUCCESS    HIPBLAS_STATUS_SUCCESS
+#define cublasStatus_t           hipblasStatus_t
+#define cublasZaxpy              hipblasZaxpy
+#define cublasZdotc              hipblasZdotc
+#define cublasZdscal             hipblasZdscal
+#define cublasZscal              hipblasZscal
+#define cuCimagf                 hipCimagf
+#define cuCimag                  hipCimag
+#define cuComplex                hipComplex
+#define cuCrealf                 hipCrealf
+#define cuCreal                  hipCreal
+#define CUDA_C_32F               HIPBLAS_C_32F
+#define CUDA_C_64F               HIPBLAS_C_64F
+#define cudaDeviceSynchronize    hipDeviceSynchronize
+#define cudaError_t              hipError_t
+#define cudaFree                 hipFree
+#define cudaGetErrorString       hipGetErrorString
+#define cudaMalloc               hipMalloc
+#define cudaMemcpyAsync          hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost   hipMemcpyDeviceToHost
+#define cudaMemcpy               hipMemcpy
+#define cudaMemcpyHostToDevice   hipMemcpyHostToDevice
+#define cudaMemset               hipMemset
+#define cudaPeekAtLastError      hipPeekAtLastError
+#define cudaSuccess              hipSuccess
+#define cuDoubleComplex          hipDoubleComplex
+
+template <typename T>
+__device__ __forceinline__ T __shfl_down_sync(
+    unsigned mask, T var, unsigned int delta, int width = warpSize) {
+  return __shfl_down(var, delta, width);
+}
+
+#endif  // SIMULATOR_CUDA2HIP_H_
diff --git a/qsim/expect.h b/qsim/expect.h
new file mode 100644
index 0000000..518d516
--- /dev/null
+++ b/qsim/expect.h
@@ -0,0 +1,148 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef EXPECT_H_
+#define EXPECT_H_
+
+#include <complex>
+
+#include "fuser.h"
+#include "gate_appl.h"
+
+namespace qsim {
+
+template <typename Gate>
+struct OpString {
+  std::complex<double> weight;
+  std::vector<Gate> ops;
+};
+
+/**
+ * Computes the expectation value of the sum of operator strings (operator
+ * sequences). Operators can act on any qubits and they can be any supported
+ * gates. This function uses a temporary state vector.
+ * @param param Options for gate fusion.
+ * @param strings Operator strings.
+ * @param state_space StateSpace object required to copy the state vector and
+ *   compute inner products.
+ * @param simulator Simulator object. Provides specific implementations for
+ *   applying gates.
+ * @param state The state vector of the system.
+ * @param ket Temporary state vector.
+ * @return The computed expectation value.
+ */
+template <typename IO, typename Fuser, typename Gate, typename Simulator>
+std::complex<double> ExpectationValue(
+    const typename Fuser::Parameter& param,
+    const std::vector<OpString<Gate>>& strings,
+    const typename Simulator::StateSpace& state_space,
+    const Simulator& simulator, const typename Simulator::State& state,
+    typename Simulator::State& ket) {
+  std::complex<double> eval = 0;
+
+  if (state_space.IsNull(ket) || ket.num_qubits() < state.num_qubits()) {
+    ket = state_space.Create(state.num_qubits());
+    if (state_space.IsNull(ket)) {
+      IO::errorf("not enough memory: is the number of qubits too large?\n");
+      return eval;
+    }
+  }
+
+  for (const auto& str : strings) {
+    if (str.ops.size() == 0) {
+      eval += str.weight;
+      continue;
+    }
+
+    state_space.Copy(state, ket);
+
+    if (str.ops.size() == 1) {
+      const auto& op = str.ops[0];
+      simulator.ApplyGate(op.qubits, op.matrix.data(), ket);
+    } else {
+      auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops);
+      if (fused_gates.size() == 0) {
+        eval = 0;
+        break;
+      }
+
+      for (const auto& fgate : fused_gates) {
+        ApplyFusedGate(simulator, fgate, ket);
+      }
+    }
+
+    eval += str.weight * state_space.InnerProduct(state, ket);
+  }
+
+  return eval;
+}
+
+/**
+ * Computes the expectation value of the sum of operator strings (operator
+ * sequences). Operators can act on any qubits and they can be any supported
+ * gates except for user-defined controlled gates. Computation is performed
+ * in place. No additional memory is allocated. The operator strings should
+ * act on no more than six qubits and they should be fusible into one gate.
+ * @param strings Operator strings.
+ * @param simulator Simulator object. Provides specific implementations for
+ *   computing expectation values.
+ * @param state The state of the system.
+ * @return The computed expectation value.
+ */
+template <typename IO, typename Fuser, typename Gate, typename Simulator>
+std::complex<double> ExpectationValue(
+    const std::vector<OpString<Gate>>& strings,
+    const Simulator& simulator, const typename Simulator::State& state) {
+  // Widest operator (in qubits) accepted by the in-place computation.
+  constexpr unsigned kMaxOpQubits = 6;
+  typename Fuser::Parameter param;
+  param.max_fused_size = kMaxOpQubits;
+  std::complex<double> total = 0;
+
+  for (const auto& str : strings) {
+    if (str.ops.empty()) {
+      total += str.weight;  // Empty string: only the weight contributes.
+      continue;
+    }
+
+    if (str.ops.size() == 1) {
+      total += str.weight * simulator.ExpectationValue(
+          str.ops[0].qubits, str.ops[0].matrix.data(), state);
+      continue;
+    }
+
+    auto fused = Fuser::FuseGates(param, state.num_qubits(), str.ops);
+
+    if (fused.size() != 1) {
+      IO::errorf("too many fused gates; "
+                 "cannot compute the expectation value.\n");
+      return 0;
+    }
+
+    if (fused[0].qubits.size() > kMaxOpQubits) {
+      IO::errorf("operator string acts on too many qubits; "
+                 "cannot compute the expectation value.\n");
+      return 0;
+    }
+
+    total += str.weight * simulator.ExpectationValue(
+        fused[0].qubits, fused[0].matrix.data(), state);
+  }
+
+  return total;
+}
+
+}  // namespace qsim
+
+#endif  // EXPECT_H_
diff --git a/qsim/formux.h b/qsim/formux.h
new file mode 100644
index 0000000..4401e9b
--- /dev/null
+++ b/qsim/formux.h
@@ -0,0 +1,30 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FORMUX_H_
+#define FORMUX_H_
+
+#ifdef _OPENMP
+# include "parfor.h"
+  namespace qsim {
+    using For = ParallelFor;
+  }
+#else
+# include "seqfor.h"
+  namespace qsim {
+    using For = SequentialFor;
+  }
+#endif
+
+#endif  // FORMUX_H_
diff --git a/qsim/fuser.h b/qsim/fuser.h
new file mode 100644
index 0000000..e4f3c3b
--- /dev/null
+++ b/qsim/fuser.h
@@ -0,0 +1,225 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FUSER_H_
+#define FUSER_H_
+
+#include <cstdint>
+#include <vector>
+
+#include "gate.h"
+#include "matrix.h"
+
+namespace qsim {
+
+/**
+ * A collection of "fused" gates which can be multiplied together before being
+ * applied to the state vector.
+ */
+template <typename Gate>
+struct GateFused {
+  /**
+   * Kind of the first ("parent") gate.
+   */
+  typename Gate::GateKind kind;
+  /**
+   * The time index of the first ("parent") gate.
+   */
+  unsigned time;
+  /**
+   * A list of qubits these gates act upon. Control qubits for
+   * explicitly-controlled gates are excluded from this list.
+   */
+  std::vector<unsigned> qubits;
+  /**
+   * Pointer to the first ("parent") gate.
+   */
+  const Gate* parent;
+  /**
+   * Ordered list of component gates.
+   */
+  std::vector<const Gate*> gates;
+  /**
+   * Fused gate matrix.
+   */
+  Matrix<typename Gate::fp_type> matrix;
+};
+
+/**
+ * A base class for fuser classes with some common functions.
+ */
+template <typename IO, typename Gate>
+class Fuser {
+ protected:
+  using RGate = typename std::remove_pointer<Gate>::type;
+
+  static const RGate& GateToConstRef(const RGate& gate) {
+    return gate;
+  }
+
+  static const RGate& GateToConstRef(const RGate* gate) {
+    return *gate;
+  }
+
+  static std::vector<unsigned> MergeWithMeasurementTimes(
+      typename std::vector<Gate>::const_iterator gfirst,
+      typename std::vector<Gate>::const_iterator glast,
+      const std::vector<unsigned>& times) {
+    std::vector<unsigned> epochs;
+    epochs.reserve(glast - gfirst + times.size());
+
+    std::size_t last = 0;
+    unsigned max_time = 0;
+
+    for (auto gate_it = gfirst; gate_it < glast; ++gate_it) {
+      const auto& gate = GateToConstRef(*gate_it);
+
+      if (gate.time > max_time) {
+        max_time = gate.time;
+      }
+
+      if (epochs.size() > 0 && gate.time < epochs.back()) {
+        IO::errorf("gate crosses the time boundary.\n");
+        epochs.resize(0);
+        return epochs;
+      }
+
+      if (gate.kind == gate::kMeasurement) {
+        if (epochs.size() == 0 || epochs.back() < gate.time) {
+          if (!AddBoundary(gate.time, max_time, epochs)) {
+            epochs.resize(0);
+            return epochs;
+          }
+        }
+      }
+
+      while (last < times.size() && times[last] <= gate.time) {
+        unsigned prev = times[last++];
+        epochs.push_back(prev);
+        if (!AddBoundary(prev, max_time, epochs)) {
+          epochs.resize(0);
+          return epochs;
+        }
+        while (last < times.size() && times[last] <= prev) ++last;
+      }
+    }
+
+    if (epochs.size() == 0 || epochs.back() < max_time) {
+      epochs.push_back(max_time);
+    }
+
+    return epochs;
+  }
+
+  template <typename GateSeq0, typename Parent, typename GateFused>
+  static void FuseZeroQubitGates(const GateSeq0& gate_seq0,
+                                 Parent parent, std::size_t first,
+                                 std::vector<GateFused>& fused_gates) {
+    // Look for the first fused gate, from index `first` on, that is neither
+    // kMeasurement nor kDecomp and whose parent is uncontrolled and fusible.
+    std::size_t target = first;
+    for (; target < fused_gates.size(); ++target) {
+      const auto& candidate = fused_gates[target];
+      if (candidate.kind == gate::kMeasurement) continue;
+      if (candidate.kind == gate::kDecomp) continue;
+      if (candidate.parent->controlled_by.size() != 0) continue;
+      if (!candidate.parent->unfusible) break;
+    }
+
+    std::size_t next = 0;
+
+    if (target >= fused_gates.size()) {
+      // Nothing suitable: open a new fused gate led by the first zero-qubit
+      // gate.
+      auto g0 = parent(gate_seq0[0]);
+      fused_gates.push_back({g0->kind, g0->time, {}, g0, {g0}, {}});
+      target = fused_gates.size() - 1;
+      next = 1;
+    }
+
+    auto& components = fused_gates[target].gates;
+    for (; next < gate_seq0.size(); ++next) {
+      components.push_back(parent(gate_seq0[next]));
+    }
+  }
+
+ private:
+  static bool AddBoundary(unsigned time, unsigned max_time,
+                          std::vector<unsigned>& boundaries) {
+    if (max_time > time) {
+      IO::errorf("gate crosses the time boundary.\n");
+      return false;
+    }
+
+    boundaries.push_back(time);
+    return true;
+  }
+};
+
+/**
+ * Multiplies component gate matrices of a fused gate.
+ * @param gate Fused gate.
+ */
+template <typename FusedGate>
+inline void CalculateFusedMatrix(FusedGate& gate) {
+  const auto& fused_qubits = gate.qubits;
+  MatrixIdentity(unsigned{1} << fused_qubits.size(), gate.matrix);
+
+  for (auto component : gate.gates) {
+    const auto& comp_qubits = component->qubits;
+    if (comp_qubits.size() == 0) {
+      // Zero-qubit gate: only a scalar factor is applied.
+      MatrixScalarMultiply(
+          component->matrix[0], component->matrix[1], gate.matrix);
+    } else if (comp_qubits.size() == fused_qubits.size()) {
+      MatrixMultiply(fused_qubits.size(), component->matrix, gate.matrix);
+    } else {
+      // Bit i of the mask is set when fused qubit i is used by the component.
+      unsigned mask = 0;
+      for (auto q : comp_qubits) {
+        std::size_t i = 0;
+        while (i < fused_qubits.size() && fused_qubits[i] != q) ++i;
+        if (i < fused_qubits.size()) mask |= unsigned{1} << i;
+      }
+      MatrixMultiply(mask, comp_qubits.size(), component->matrix,
+                     fused_qubits.size(), gate.matrix);
+    }
+  }
+}
+
+/**
+ * Multiplies component gate matrices for a range of fused gates.
+ * @param gbeg, gend The iterator range [gbeg, gend) of fused gates.
+ */
+template <typename Iterator>
+inline void CalculateFusedMatrices(Iterator gbeg, Iterator gend) {
+  for (; gbeg != gend; ++gbeg) {
+    // Measurement gates are skipped.
+    if (gbeg->kind == gate::kMeasurement) continue;
+    CalculateFusedMatrix(*gbeg);
+  }
+}
+
+/**
+ * Multiplies component gate matrices for a vector of fused gates.
+ * @param gates The vector of fused gates.
+ */
+template <typename FusedGate>
+inline void CalculateFusedMatrices(std::vector<FusedGate>& gates) {
+  CalculateFusedMatrices(gates.begin(), gates.end());
+}
+
+}  // namespace qsim
+
+#endif  // FUSER_H_
diff --git a/qsim/fuser_basic.h b/qsim/fuser_basic.h
new file mode 100644
index 0000000..3191bd2
--- /dev/null
+++ b/qsim/fuser_basic.h
@@ -0,0 +1,411 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FUSER_BASIC_H_
+#define FUSER_BASIC_H_
+
+#include <map>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "gate.h"
+#include "fuser.h"
+
+namespace qsim {
+
+/**
+ * Stateless object with methods for aggregating `Gate`s into `GateFused`.
+ * Measurement gates with equal times are fused together.
+ * User-defined controlled gates (controlled_by.size() > 0) and gates acting on
+ * more than two qubits are not fused.
+ * The template parameter Gate can be Gate type or a pointer to Gate type.
+ * This class is deprecated. It is recommended to use MultiQubitGateFuser
+ * from fuser_mqubit.h.
+ */
+template <typename IO, typename Gate>
+class BasicGateFuser final : public Fuser<IO, Gate> {
+ private:
+  using Base = Fuser<IO, Gate>;
+  using RGate = typename Base::RGate;
+
+ public:
+  using GateFused = qsim::GateFused<RGate>;
+
+  /**
+   * User-specified parameters for gate fusion.
+   * BasicGateFuser does not use any parameters.
+   */
+  struct Parameter {
+    unsigned verbosity = 0;
+  };
+
+  /**
+   * Stores sets of gates that can be applied together. Only one- and
+   * two-qubit gates will get fused. To respect specific time boundaries while
+   * fusing gates, use the other version of this method below.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gates The gates (or pointers to the gates) to be fused.
+   *   Gate times of the gates that act on the same qubits should be ordered.
+   *   Gates that are out of time order should not cross the time boundaries
+   *   set by measurement gates.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific pair of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(const Parameter& param,
+                                          unsigned max_qubit1,
+                                          const std::vector<Gate>& gates,
+                                          bool fuse_matrix = true) {
+    return FuseGates(
+        param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix);
+  }
+
+  /**
+   * Stores sets of gates that can be applied together. Only one- and
+   * two-qubit gates will get fused.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gates The gates (or pointers to the gates) to be fused.
+   *   Gate times of the gates that act on the same qubits should be ordered.
+   *   Gates that are out of time order should not cross the time boundaries
+   *   set by `times_to_split_at` or by measurement gates.
+   * @param times_to_split_at Ordered list of time steps (boundaries) at which
+   *   to separate fused gates. Each element of the output will contain gates
+   *   from a single 'window' in this list.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific pair of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param,
+      unsigned max_qubit1, const std::vector<Gate>& gates,
+      const std::vector<unsigned>& times_to_split_at,
+      bool fuse_matrix = true) {
+    return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(),
+                     times_to_split_at, fuse_matrix);
+  }
+
+  /**
+   * Stores sets of gates that can be applied together. Only one- and
+   * two-qubit gates will get fused. To respect specific time boundaries while
+   * fusing gates, use the other version of this method below.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
+   *   (or pointers to gates) in. Gate times of the gates that act on the same
+   *   qubits should be ordered. Gates that are out of time order should not
+   *   cross the time boundaries set by measurement gates.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific pair of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param, unsigned max_qubit1,
+      typename std::vector<Gate>::const_iterator gfirst,
+      typename std::vector<Gate>::const_iterator glast,
+      bool fuse_matrix = true) {
+    return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix);
+  }
+
+  /**
+   * Stores sets of gates that can be applied together. Only one- and
+   * two-qubit gates will get fused.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
+   *   (or pointers to gates) in. Gate times of the gates that act on the same
+   *   qubits should be ordered. Gates that are out of time order should not
+   *   cross the time boundaries set by `times_to_split_at` or by measurement
+   *   gates.
+   * @param times_to_split_at Ordered list of time steps (boundaries) at which
+   *   to separate fused gates. Each element of the output will contain gates
+   *   from a single 'window' in this list.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific pair of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param, unsigned max_qubit1,
+      typename std::vector<Gate>::const_iterator gfirst,
+      typename std::vector<Gate>::const_iterator glast,
+      const std::vector<unsigned>& times_to_split_at,
+      bool fuse_matrix = true) {
+    std::vector<GateFused> gates_fused;
+
+    if (gfirst >= glast) return gates_fused;
+
+    std::size_t num_gates = glast - gfirst;
+
+    gates_fused.reserve(num_gates);
+
+    // Merge with measurement gate times to separate fused gates at.
+    auto times =
+        Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at);
+
+    // Map to keep track of measurement gates with equal times.
+    std::map<unsigned, std::vector<const RGate*>> measurement_gates;
+
+    // Sequence of top level gates the other gates get fused to.
+    std::vector<const RGate*> gates_seq;
+
+    // Sequence of zero-qubit gates.
+    std::vector<const RGate*> gates_seq0;
+
+    // Lattice of gates: qubits "hyperplane" and time direction.
+    std::vector<std::vector<const RGate*>> gates_lat(max_qubit1);
+
+    // Current unfused gate.
+    auto gate_it = gfirst;
+
+    std::size_t last_fused_gate_index = 0;
+
+    for (std::size_t l = 0; l < times.size(); ++l) {
+      gates_seq.resize(0);
+      gates_seq.reserve(num_gates);
+
+      gates_seq0.resize(0);
+      gates_seq0.reserve(num_gates);
+
+      for (unsigned k = 0; k < max_qubit1; ++k) {
+        gates_lat[k].resize(0);
+        gates_lat[k].reserve(128);
+      }
+
+      // Fill gates_seq and gates_lat in.
+      for (; gate_it < glast; ++gate_it) {
+        const auto& gate = Base::GateToConstRef(*gate_it);
+
+        if (gate.time > times[l]) break;
+
+        if (!ValidateGate(gate, max_qubit1, gates_lat)) {
+          gates_fused.resize(0);
+          return gates_fused;
+        }
+
+        if (gate.kind == gate::kMeasurement) {
+          auto& mea_gates_at_time = measurement_gates[gate.time];
+          if (mea_gates_at_time.size() == 0) {
+            gates_seq.push_back(&gate);
+            mea_gates_at_time.reserve(max_qubit1);
+          }
+
+          mea_gates_at_time.push_back(&gate);
+        } else if (gate.controlled_by.size() > 0 || gate.qubits.size() > 2) {
+          for (auto q : gate.qubits) {
+            gates_lat[q].push_back(&gate);
+          }
+          for (auto q : gate.controlled_by) {
+            gates_lat[q].push_back(&gate);
+          }
+          gates_seq.push_back(&gate);
+        } else if (gate.qubits.size() == 1) {
+          gates_lat[gate.qubits[0]].push_back(&gate);
+          if (gate.unfusible) {
+            gates_seq.push_back(&gate);
+          }
+        } else if (gate.qubits.size() == 2) {
+          gates_lat[gate.qubits[0]].push_back(&gate);
+          gates_lat[gate.qubits[1]].push_back(&gate);
+          gates_seq.push_back(&gate);
+        } else {
+          gates_seq0.push_back(&gate);
+        }
+      }
+
+      std::vector<unsigned> last(max_qubit1, 0);
+
+      const RGate* delayed_measurement_gate = nullptr;
+
+      // Fuse gates.
+      for (auto pgate : gates_seq) {
+        if (pgate->kind == gate::kMeasurement) {
+          delayed_measurement_gate = pgate;
+        } else if (pgate->qubits.size() > 2
+                   || pgate->controlled_by.size() > 0) {
+          // Multi-qubit or controlled gate.
+
+          for (auto q : pgate->qubits) {
+            unsigned l = last[q];
+            if (gates_lat[q][l] != pgate) {
+              last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused);
+            }
+            ++last[q];
+          }
+
+          for (auto q : pgate->controlled_by) {
+            unsigned l = last[q];
+            if (gates_lat[q][l] != pgate) {
+              last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused);
+            }
+            ++last[q];
+          }
+
+          gates_fused.push_back({pgate->kind, pgate->time, pgate->qubits,
+                                 pgate, {pgate}, {}});
+        } else if (pgate->qubits.size() == 1) {
+          unsigned q0 = pgate->qubits[0];
+
+          GateFused gate_f = {pgate->kind, pgate->time, {q0}, pgate, {}, {}};
+
+          last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates);
+          gate_f.gates.push_back(gates_lat[q0][last[q0]]);
+          last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates);
+
+          gates_fused.push_back(std::move(gate_f));
+        } else if (pgate->qubits.size() == 2) {
+          unsigned q0 = pgate->qubits[0];
+          unsigned q1 = pgate->qubits[1];
+
+          if (Done(last[q0], pgate->time, gates_lat[q0])) continue;
+
+          GateFused gate_f =
+              {pgate->kind, pgate->time, {q0, q1}, pgate, {}, {}};
+
+          do {
+            last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates);
+            last[q1] = Advance(last[q1], gates_lat[q1], gate_f.gates);
+            // Here gates_lat[q0][last[q0]] == gates_lat[q1][last[q1]].
+
+            gate_f.gates.push_back(gates_lat[q0][last[q0]]);
+
+            last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates);
+            last[q1] = Advance(last[q1] + 1, gates_lat[q1], gate_f.gates);
+          } while (NextGate(last[q0], gates_lat[q0], last[q1], gates_lat[q1]));
+
+          gates_fused.push_back(std::move(gate_f));
+        }
+      }
+
+      for (unsigned q = 0; q < max_qubit1; ++q) {
+        auto l = last[q];
+        if (l == gates_lat[q].size()) continue;
+
+        // Orphaned qubit.
+        AddOrphanedQubit(q, l, gates_lat, gates_fused);
+      }
+
+      if (delayed_measurement_gate != nullptr) {
+        auto pgate = delayed_measurement_gate;
+
+        const auto& mea_gates_at_time = measurement_gates[pgate->time];
+
+        GateFused gate_f = {pgate->kind, pgate->time, {}, pgate, {}, {}};
+        gate_f.gates.reserve(mea_gates_at_time.size());
+
+        // Fuse measurement gates with equal times.
+
+        for (const auto* pgate : mea_gates_at_time) {
+          gate_f.qubits.insert(gate_f.qubits.end(),
+                               pgate->qubits.begin(), pgate->qubits.end());
+          gate_f.gates.push_back(pgate);
+        }
+
+        gates_fused.push_back(std::move(gate_f));
+      }
+
+      if (gates_seq0.size() != 0) {
+        Base::FuseZeroQubitGates(gates_seq0, [](const RGate* g) { return g; },
+                                 last_fused_gate_index, gates_fused);
+      }
+
+      if (gate_it == glast) break;
+
+      last_fused_gate_index = gates_fused.size();
+    }
+
+    if (fuse_matrix) {
+      for (auto& gate_f : gates_fused) {
+        if (gate_f.kind != gate::kMeasurement && gate_f.kind != gate::kDecomp) {
+          CalculateFusedMatrix(gate_f);
+        }
+      }
+    }
+
+    return gates_fused;
+  }
+
+ private:
+  static unsigned Advance(unsigned k, const std::vector<const RGate*>& wl,
+                          std::vector<const RGate*>& gates) {
+    for (; k < wl.size(); ++k) {
+      const RGate& g = *wl[k];
+      const bool plain = g.controlled_by.size() == 0 && !g.unfusible;
+      if (g.qubits.size() != 1 || !plain) break;
+      gates.push_back(wl[k]);
+    }
+    return k;  // Index of the first gate that was not collected.
+  }
+
+  static bool Done(
+      unsigned k, unsigned t, const std::vector<const RGate*>& wl) {
+    return k >= wl.size() || wl[k]->time > t;
+  }
+
+  static bool NextGate(unsigned k1, const std::vector<const RGate*>& wl1,
+                       unsigned k2, const std::vector<const RGate*>& wl2) {
+    return k1 < wl1.size() && k2 < wl2.size() && wl1[k1] == wl2[k2]
+        && wl1[k1]->qubits.size() < 3 && wl1[k1]->controlled_by.size() == 0;
+  }
+
+  template <typename GatesLat>
+  static unsigned AddOrphanedQubit(unsigned q, unsigned k,
+                                   const GatesLat& gates_lat,
+                                   std::vector<GateFused>& gates_fused) {
+    const auto& wire = gates_lat[q];
+    auto first = wire[k];
+
+    // The gate at position k becomes the parent and the first component.
+    GateFused orphan = {first->kind, first->time, {q}, first, {first}, {}};
+
+    // Absorb the fusible single-qubit gates that follow on this qubit.
+    const unsigned next = Advance(k + 1, wire, orphan.gates);
+    gates_fused.push_back(std::move(orphan));
+    return next;
+  }
+
+  template <typename Gate2, typename GatesLat>
+  static bool ValidateGate(const Gate2& gate, unsigned max_qubit1,
+                           const GatesLat& gates_lat) {
+    // Checks one qubit: index range first, then time order on that qubit.
+    auto qubit_ok = [&](unsigned q) {
+      if (q >= max_qubit1) {
+        IO::errorf("fuser: gate qubit %u is out of range "
+                   "(should be smaller than %u).\n", q, max_qubit1);
+        return false;
+      }
+      const auto& wire = gates_lat[q];
+      if (!wire.empty() && gate.time <= wire.back()->time) {
+        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
+        return false;
+      }
+      return true;
+    };
+
+    // Target qubits are checked before control qubits.
+    for (unsigned q : gate.qubits) {
+      if (!qubit_ok(q)) return false;
+    }
+
+    for (unsigned q : gate.controlled_by) {
+      if (!qubit_ok(q)) return false;
+    }
+
+    return true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // FUSER_BASIC_H_
diff --git a/qsim/fuser_mqubit.h b/qsim/fuser_mqubit.h
new file mode 100644
index 0000000..c75b1a0
--- /dev/null
+++ b/qsim/fuser_mqubit.h
@@ -0,0 +1,1095 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FUSER_MQUBIT_H_
+#define FUSER_MQUBIT_H_
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "gate.h"
+#include "fuser.h"
+
+namespace qsim {
+
+/**
+ * Multi-qubit gate fuser.
+ * Measurement gates with equal times are fused together.
+ * User-defined controlled gates (controlled_by.size() > 0) are not fused.
+ * The template parameter Gate can be Gate type or a pointer to Gate type.
+ */
+template <typename IO, typename Gate>
+class MultiQubitGateFuser final : public Fuser<IO, Gate> {
+ private:
+  using Base = Fuser<IO, Gate>;
+  using RGate = typename Base::RGate;
+
+  // Auxiliary classes and structs.
+
+  // Manages the nodes of doubly-linked lists. All nodes are stored in one
+  // vector whose capacity is reserved at construction; the raw pointers
+  // handed out remain valid only while that capacity is not exceeded.
+  template <typename T>
+  class LinkManagerT {
+   public:
+    struct Link {
+      T val;
+      Link* next;
+      Link* prev;
+    };
+
+    // Reserves storage for size links.
+    explicit LinkManagerT(uint64_t size) {
+      links_.reserve(size);
+    }
+
+    // Creates a node holding t. If link is nullptr the node has no
+    // neighbors; otherwise it is hooked in directly after link. The prev
+    // pointer of link's former successor (if any) is not updated.
+    // Returns the new node.
+    Link* AddBack(const T& t, Link* link) {
+      Link* const successor = link != nullptr ? link->next : nullptr;
+
+      links_.push_back({t, successor, link});
+      Link* const added = &links_.back();
+
+      if (link != nullptr) {
+        link->next = added;
+      }
+
+      return added;
+    }
+
+    // Unhooks link from its neighbors. The node itself is left unchanged
+    // and its storage is not released.
+    static void Delete(const Link* link) {
+      Link* const before = link->prev;
+      Link* const after = link->next;
+
+      if (before != nullptr) {
+        before->next = after;
+      }
+      if (after != nullptr) {
+        after->prev = before;
+      }
+    }
+
+   private:
+    std::vector<Link> links_;
+  };
+
+  struct GateF;
+
+  using LinkManager = LinkManagerT<GateF*>;
+  using Link = typename LinkManager::Link;
+
+  // Intermediate representation of a fused gate.
+  struct GateF {
+    const RGate* parent;              // Input gate this entry was created for;
+                                      // supplies kind and time of the output.
+    std::vector<unsigned> qubits;     // Target qubits of the fused gate.
+    std::vector<const RGate*> gates;  // Gates that get fused to this gate.
+    std::vector<Link*> links;         // Gate "lattice" links.
+    uint64_t mask;                    // Qubit mask (bit q is set for every
+                                      // target and control qubit q).
+    unsigned visited;                 // Processing state, see Visited below.
+  };
+
+  // Possible values for visited in GateF.
+  // Note that MakeGateSequence assigns values from kSecond to the number of
+  // gates in the sequence plus one, see below.
+  enum Visited {
+    kZero = 0,             // Start value for "normal" gates.
+    kFirst = 1,            // Value after the first pass for partially fused
+                           // "normal" gates.
+    kSecond = 2,           // Start value to assign values in MakeGateSequence.
+    kCompress = 99999997,  // Used to compress links.
+    kMeaCnt = 99999998,    // Start value for controlled or measurement gates.
+    kFinal = 99999999,     // Value after the second pass for fused "normal"
+                           // gates or for controlled and measurement gates.
+  };
+
+  // Counters collected during fusion and reported by PrintStat.
+  struct Stat {
+    // Number of input measurement gates.
+    unsigned num_mea_gates = 0;
+    // Number of measurement entries after merging measurement gates that
+    // share the same time.
+    unsigned num_fused_mea_gates = 0;
+    // Number of emitted fused gates that are neither measurement nor
+    // controlled gates.
+    unsigned num_fused_gates = 0;
+    // Number of input gates with a non-empty controlled_by list.
+    unsigned num_controlled_gates = 0;
+    // num_gates[n] is the number of non-measurement, non-controlled input
+    // gates acting on n qubits.
+    std::vector<unsigned> num_gates;
+  };
+
+  // Gate that is added to a sequence of gates to fuse together.
+  // qubits and links are filled by GetAddedQubits and hold only the entries
+  // of gate that are not already present in the gates it was compared with.
+  struct GateA {
+    GateF* gate;
+    std::vector<unsigned> qubits;  // Added qubits.
+    std::vector<Link*> links;      // Added lattice links.
+  };
+
+  // Reusable work buffers for the gate sequence search (MakeGateSequence and
+  // FindLongestGateSequence).
+  struct Scratch {
+    // Storage for all candidate gates; the vectors of GateA* below point
+    // into it.
+    std::vector<GateA> data;
+    // Candidate lists for the nested search levels.
+    std::vector<GateA*> prev1;
+    std::vector<GateA*> prev2;
+    std::vector<GateA*> next1;
+    std::vector<GateA*> next2;
+    // Best sequence found so far by FindLongestGateSequence.
+    std::vector<GateA*> longest_seq;
+    // Sequence currently being explored.
+    std::vector<GateA*> stack;
+    // Fused gates produced by MakeGateSequence, ready to be emitted.
+    std::vector<GateF*> gates;
+    // Number of sequences built since the buffers were last reset; used to
+    // derive the visited level in MakeGateSequence.
+    unsigned count = 0;
+  };
+
+ public:
+  using GateFused = qsim::GateFused<RGate>;
+
+  /**
+   * User-specified parameters for gate fusion.
+   */
+  struct Parameter {
+    /**
+     * Maximum number of qubits in a fused gate. It can take values from 2 to
+     * 6 (0 and 1 are equivalent to 2). It is not recommended to use 5 or 6 as
+     * that might degrade performance for not very fast machines.
+     * Values above 6 are clamped to 6 by FuseGates, and the value is further
+     * limited to the number of qubits (max_qubit1).
+     */
+    unsigned max_fused_size = 2;
+    /**
+     * Verbosity level. Fusion statistics are printed for values of 3 and
+     * above; the qubits of every fused gate are additionally printed for
+     * values of 5 and above.
+     */
+    unsigned verbosity = 0;
+  };
+
+  /**
+   * Fuses all gates of a vector without additional time boundaries. To
+   * respect specific time boundaries while fusing gates, use the overload
+   * below that takes `times_to_split_at`.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gates The gates (or pointers to the gates) to be fused.
+   *   Gate times of the gates that act on the same qubits should be ordered.
+   *   Gates that are out of time order should not cross the time boundaries
+   *   set by measurement gates.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific set of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(const Parameter& param,
+                                          unsigned max_qubit1,
+                                          const std::vector<Gate>& gates,
+                                          bool fuse_matrix = true) {
+    // No user-defined boundaries: only measurement gates split the circuit.
+    const std::vector<unsigned> no_split_times;
+
+    return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(),
+                     no_split_times, fuse_matrix);
+  }
+
+  /**
+   * Fuses all gates of a vector, splitting at the given time boundaries.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gates The gates (or pointers to the gates) to be fused.
+   *   Gate times of the gates that act on the same qubits should be ordered.
+   *   Gates that are out of time order should not cross the time boundaries
+   *   set by `times_to_split_at` or by measurement gates.
+   * @param times_to_split_at Ordered list of time steps (boundaries) at which
+   *   to separate fused gates. Each element of the output will contain gates
+   *   from a single 'window' in this list.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific set of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param,
+      unsigned max_qubit1, const std::vector<Gate>& gates,
+      const std::vector<unsigned>& times_to_split_at,
+      bool fuse_matrix = true) {
+    const auto first = gates.cbegin();
+    const auto last = gates.cend();
+
+    return FuseGates(
+        param, max_qubit1, first, last, times_to_split_at, fuse_matrix);
+  }
+
+  /**
+   * Fuses the gates of an iterator range without additional time
+   * boundaries. To respect specific time boundaries while fusing gates, use
+   * the overload below that takes `times_to_split_at`.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
+   *   (or pointers to gates) in. Gate times of the gates that act on the same
+   *   qubits should be ordered. Gates that are out of time order should not
+   *   cross the time boundaries set by measurement gates.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific set of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param, unsigned max_qubit1,
+      typename std::vector<Gate>::const_iterator gfirst,
+      typename std::vector<Gate>::const_iterator glast,
+      bool fuse_matrix = true) {
+    // No user-defined boundaries: only measurement gates split the circuit.
+    const std::vector<unsigned> no_split_times;
+
+    return FuseGates(
+        param, max_qubit1, gfirst, glast, no_split_times, fuse_matrix);
+  }
+
+  /**
+   * Stores sets of gates that can be applied together.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
+   *   (or pointers to gates) in. Gate times of the gates that act on the same
+   *   qubits should be ordered. Gates that are out of time order should not
+   *   cross the time boundaries set by `times_to_split_at` or by measurement
+   *   gates.
+   * @param times_to_split_at Ordered list of time steps (boundaries) at which
+   *   to separate fused gates. Each element of the output will contain gates
+   *   from a single 'window' in this list.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific set of qubits which can be applied as a group.
+   *   The vector is empty if the range is empty or if a gate fails
+   *   validation (qubit out of range or gate out of time order).
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param, unsigned max_qubit1,
+      typename std::vector<Gate>::const_iterator gfirst,
+      typename std::vector<Gate>::const_iterator glast,
+      const std::vector<unsigned>& times_to_split_at,
+      bool fuse_matrix = true) {
+    std::vector<GateFused> fused_gates;
+
+    // Nothing to fuse.
+    if (gfirst >= glast) return fused_gates;
+
+    std::size_t num_gates = glast - gfirst;
+
+    fused_gates.reserve(num_gates);
+
+    // Merge with measurement gate times to separate fused gates at.
+    auto epochs =
+        Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at);
+
+    // One link is created per (gate, target or control qubit) pair. Link
+    // pointers are stored below, so the reserved capacity must not be
+    // exceeded.
+    LinkManager link_manager(max_qubit1 * num_gates);
+
+    // Auxiliary data structures.
+    // Sequence of intermediate fused gates.
+    std::vector<GateF> gates_seq;
+    // Gate "lattice".
+    std::vector<Link*> gates_lat;
+    // Sequences of intermediate fused gates ordered by gate size.
+    std::vector<std::vector<GateF*>> fgates(max_qubit1 + 1);
+
+    // Pointers to elements of gates_seq are stored in links and in fgates,
+    // so gates_seq must never reallocate.
+    gates_seq.reserve(num_gates);
+    gates_lat.reserve(max_qubit1);
+
+    Scratch scratch;
+
+    scratch.data.reserve(1024);
+    scratch.prev1.reserve(32);
+    scratch.prev2.reserve(32);
+    scratch.next1.reserve(32);
+    scratch.next2.reserve(32);
+    scratch.longest_seq.reserve(8);
+    scratch.stack.reserve(8);
+
+    Stat stat;
+    stat.num_gates.resize(max_qubit1 + 1, 0);
+
+    // The fused gate size is limited to 6 and to the number of qubits.
+    unsigned max_fused_size = std::min(unsigned{6}, param.max_fused_size);
+    max_fused_size = std::min(max_fused_size, max_qubit1);
+
+    std::size_t last_fused_gate_index = 0;
+    auto gate_it = gfirst;
+
+    // Iterate over epochs.
+    for (std::size_t l = 0; l < epochs.size(); ++l) {
+      // Reset the per-epoch data structures.
+      gates_seq.resize(0);
+      gates_lat.resize(0);
+      gates_lat.resize(max_qubit1, nullptr);
+
+      for (unsigned i = 0; i <= max_qubit1; ++i) {
+        fgates[i].resize(0);
+      }
+
+      uint64_t max_gate_size = 0;
+      GateF* last_mea_gate = nullptr;
+
+      // Iterate over input gates.
+      for (; gate_it < glast; ++gate_it) {
+        const auto& gate = Base::GateToConstRef(*gate_it);
+
+        // The gate belongs to a later epoch.
+        if (gate.time > epochs[l]) break;
+
+        if (!ValidateGate(gate, max_qubit1, gates_lat)) {
+          fused_gates.resize(0);
+          return fused_gates;
+        }
+
+        // Fill in auxiliary data structures.
+
+        if (gate.kind == gate::kMeasurement) {
+          // Measurement gate.
+
+          // Consecutive measurement gates with equal times share one entry.
+          if (last_mea_gate == nullptr
+              || last_mea_gate->parent->time != gate.time) {
+            gates_seq.push_back({&gate, {}, {}, {}, 0, kMeaCnt});
+            last_mea_gate = &gates_seq.back();
+
+            last_mea_gate->qubits.reserve(max_qubit1);
+            last_mea_gate->links.reserve(max_qubit1);
+
+            ++stat.num_fused_mea_gates;
+          }
+
+          for (auto q : gate.qubits) {
+            last_mea_gate->qubits.push_back(q);
+            last_mea_gate->mask |= uint64_t{1} << q;
+            gates_lat[q] = link_manager.AddBack(last_mea_gate, gates_lat[q]);
+            last_mea_gate->links.push_back(gates_lat[q]);
+          }
+
+          last_mea_gate->gates.push_back(&gate);
+
+          ++stat.num_mea_gates;
+        } else {
+          gates_seq.push_back({&gate, {}, {}, {}, 0, kZero});
+          auto& fgate = gates_seq.back();
+
+          if (gate.controlled_by.size() == 0) {
+            if (max_gate_size < gate.qubits.size()) {
+              max_gate_size = gate.qubits.size();
+            }
+
+            unsigned num_gate_qubits = gate.qubits.size();
+            unsigned size = std::max(max_fused_size, num_gate_qubits);
+
+            // NOTE(review): fgate.links.reserve(size) appears twice below;
+            // the second call looks redundant.
+            fgate.qubits.reserve(size);
+            fgate.links.reserve(size);
+            fgate.gates.reserve(4 * size);
+            fgate.links.reserve(size);
+
+            // Bucket the gate by its number of qubits.
+            if (fgates[num_gate_qubits].empty()) {
+              fgates[num_gate_qubits].reserve(num_gates);
+            }
+            fgates[num_gate_qubits].push_back(&fgate);
+
+            ++stat.num_gates[num_gate_qubits];
+          } else {
+            // Controlled gate.
+            // Controlled gates are not fused with other gates.
+
+            uint64_t size = gate.qubits.size() + gate.controlled_by.size();
+
+            fgate.qubits.reserve(gate.qubits.size());
+            fgate.links.reserve(size);
+
+            fgate.visited = kMeaCnt;
+            fgate.gates.push_back(&gate);
+
+            ++stat.num_controlled_gates;
+          }
+
+          for (auto q : gate.qubits) {
+            fgate.qubits.push_back(q);
+            fgate.mask |= uint64_t{1} << q;
+            gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]);
+            fgate.links.push_back(gates_lat[q]);
+          }
+
+          // Control qubits contribute to the mask and the lattice but not to
+          // fgate.qubits.
+          for (auto q : gate.controlled_by) {
+            fgate.mask |= uint64_t{1} << q;
+            gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]);
+            fgate.links.push_back(gates_lat[q]);
+          }
+        }
+      }
+
+      // Fuse large gates with smaller gates.
+      FuseGates(max_gate_size, fgates);
+
+      if (max_fused_size > 2) {
+        FuseGateSequences(
+            max_fused_size, max_qubit1, scratch, gates_seq, stat, fused_gates);
+      } else {
+        // Emit the gates in input order; single-qubit gates that were not
+        // absorbed are collected per time step and fused as orphans.
+        unsigned prev_time = 0;
+
+        std::vector<GateF*> orphaned_gates;
+        orphaned_gates.reserve(max_qubit1);
+
+        for (auto& fgate : gates_seq) {
+          // Gates absorbed into another gate during the first pass have an
+          // empty gates list.
+          if (fgate.gates.size() == 0) continue;
+
+          if (prev_time != fgate.parent->time) {
+            if (orphaned_gates.size() > 0) {
+              FuseOrphanedGates(
+                  max_fused_size, stat, orphaned_gates, fused_gates);
+              orphaned_gates.resize(0);
+            }
+
+            prev_time = fgate.parent->time;
+          }
+
+          if (fgate.qubits.size() == 1 && max_fused_size > 1
+              && fgate.visited != kMeaCnt && !fgate.parent->unfusible) {
+            orphaned_gates.push_back(&fgate);
+            continue;
+          }
+
+          // Assume fgate.qubits (gate.qubits) are sorted.
+          fused_gates.push_back({fgate.parent->kind, fgate.parent->time,
+                                 std::move(fgate.qubits), fgate.parent,
+                                 std::move(fgate.gates), {}});
+
+          // Measurement and controlled gates are not counted as fused gates.
+          if (fgate.visited != kMeaCnt) {
+            ++stat.num_fused_gates;
+          }
+        }
+
+        if (orphaned_gates.size() > 0) {
+          FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
+        }
+      }
+
+      // Zero-qubit gates of this epoch are handled by the base class.
+      if (fgates[0].size() != 0) {
+        Base::FuseZeroQubitGates(fgates[0],
+                                 [](const GateF* g) { return g->parent; },
+                                 last_fused_gate_index, fused_gates);
+      }
+
+      last_fused_gate_index = fused_gates.size();
+    }
+
+    if (fuse_matrix) {
+      // Matrices are calculated for all fused gates except measurement and
+      // kDecomp gates.
+      for (auto& fgate : fused_gates) {
+        if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp) {
+          CalculateFusedMatrix(fgate);
+        }
+      }
+    }
+
+    PrintStat(param.verbosity, stat, fused_gates);
+
+    return fused_gates;
+  }
+
+ private:
+  // First fusion pass: fuse large gates with smaller gates. Buckets are
+  // visited from max_gate_size qubits down to one qubit. Every gate that is
+  // still unvisited absorbs its fusible neighbors backward and forward in
+  // time; gates that were already absorbed are dropped from their bucket.
+  static void FuseGates(uint64_t max_gate_size,
+                        std::vector<std::vector<GateF*>>& fgates) {
+    for (uint64_t size = max_gate_size; size > 0; --size) {
+      auto& bucket = fgates[size];
+
+      // Number of gates kept at the front of the bucket.
+      std::size_t kept = 0;
+
+      for (std::size_t j = 0; j < bucket.size(); ++j) {
+        GateF* fgate = bucket[j];
+
+        if (fgate->visited > kZero) continue;
+
+        bucket[kept++] = fgate;
+
+        fgate->visited = kFirst;
+
+        // Earlier gates first, then the gate itself, then later gates.
+        FusePrev(0, *fgate);
+        fgate->gates.push_back(fgate->parent);
+        FuseNext(0, *fgate);
+      }
+
+      bucket.resize(kept);
+    }
+  }
+
+  // Try to fuse gate sequences as follows. Gate time goes from bottom to top.
+  // Gates are fused either from left to right or from right to left.
+  //
+  // max_fused_size = 3: _-  or  -_
+  //
+  // max_fused_size = 4: _-_
+  //
+  // max_fused_size = 5: _-_-  or  -_-_
+  //
+  // max_fused_size = 6: _-_-_
+  //
+  // Gates are processed in the order of gates_seq and the results are
+  // appended to fused_gates. Gates that cannot start a sequence are collected
+  // as orphaned gates and fused whenever the gate time changes and once more
+  // at the end.
+  static void FuseGateSequences(unsigned max_fused_size,
+                                unsigned max_qubit1, Scratch& scratch,
+                                std::vector<GateF>& gates_seq, Stat& stat,
+                                std::vector<GateFused>& fused_gates) {
+    unsigned prev_time = 0;
+
+    std::vector<GateF*> orphaned_gates;
+    orphaned_gates.reserve(max_qubit1);
+
+    for (auto& fgate : gates_seq) {
+      // Flush orphaned gates collected for the previous time step.
+      if (prev_time != fgate.parent->time) {
+        if (orphaned_gates.size() > 0) {
+          FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
+          orphaned_gates.resize(0);
+        }
+
+        prev_time = fgate.parent->time;
+      }
+
+      // Skip gates that are already emitted or that were absorbed into
+      // another gate.
+      if (fgate.visited == kFinal || fgate.gates.size() == 0) continue;
+
+      // Measurement and controlled gates, unfusible gates and gates that are
+      // already large enough are emitted as they are.
+      if (fgate.visited == kMeaCnt || fgate.qubits.size() >= max_fused_size
+          || fgate.parent->unfusible) {
+        if (fgate.visited != kMeaCnt) {
+          ++stat.num_fused_gates;
+        }
+
+        fgate.visited = kFinal;
+
+        fused_gates.push_back({fgate.parent->kind, fgate.parent->time,
+                               std::move(fgate.qubits), fgate.parent,
+                               std::move(fgate.gates), {}});
+
+        continue;
+      }
+
+
+      if (fgate.qubits.size() == 1 && max_fused_size > 1) {
+        orphaned_gates.push_back(&fgate);
+        continue;
+      }
+
+      // Reset the scratch buffers before starting a new sequence search.
+      scratch.data.resize(0);
+      scratch.gates.resize(0);
+      scratch.count = 0;
+
+      MakeGateSequence(max_fused_size, scratch, fgate);
+
+      if (scratch.gates.size() == 0) {
+        // No sequence was built; treat the gate as orphaned.
+        orphaned_gates.push_back(&fgate);
+      } else {
+        // Note that this loop variable shadows the outer fgate.
+        for (auto fgate : scratch.gates) {
+          std::sort(fgate->qubits.begin(), fgate->qubits.end());
+
+          fused_gates.push_back({fgate->parent->kind, fgate->parent->time,
+                                 std::move(fgate->qubits), fgate->parent,
+                                 std::move(fgate->gates), {}});
+
+          ++stat.num_fused_gates;
+        }
+      }
+    }
+
+    if (orphaned_gates.size() > 0) {
+      FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
+    }
+  }
+
+  // Packs orphaned gates into fused gates. Each gate that is not yet final
+  // greedily takes in later orphaned gates as long as the combined number of
+  // qubits does not exceed max_fused_size, then absorbs fusible gates forward
+  // in time and is appended to fused_gates with sorted qubits.
+  static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat,
+                                std::vector<GateF*>& orphaned_gates,
+                                std::vector<GateFused>& fused_gates) {
+    const std::size_t count = orphaned_gates.size();
+
+    for (std::size_t i = 0; i < count; ++i) {
+      GateF* host = orphaned_gates[i];
+
+      if (host->visited == kFinal) continue;
+
+      host->visited = kFinal;
+
+      for (std::size_t j = i + 1; j < count; ++j) {
+        GateF* guest = orphaned_gates[j];
+
+        if (guest->visited == kFinal) continue;
+
+        unsigned cur_size = host->qubits.size() + guest->qubits.size();
+
+        // The guest does not fit; try the next one.
+        if (cur_size > max_fused_size) continue;
+
+        guest->visited = kFinal;
+
+        for (auto q : guest->qubits) {
+          host->mask |= uint64_t{1} << q;
+        }
+
+        host->qubits.insert(
+            host->qubits.end(), guest->qubits.begin(), guest->qubits.end());
+        host->links.insert(
+            host->links.end(), guest->links.begin(), guest->links.end());
+        host->gates.insert(
+            host->gates.end(), guest->gates.begin(), guest->gates.end());
+
+        // The host is full.
+        if (cur_size == max_fused_size) break;
+      }
+
+      FuseNext(1, *host);
+
+      std::sort(host->qubits.begin(), host->qubits.end());
+
+      fused_gates.push_back({host->parent->kind, host->parent->time,
+                             std::move(host->qubits), host->parent,
+                             std::move(host->gates), {}});
+
+      ++stat.num_fused_gates;
+    }
+  }
+
+  // Builds a fused gate starting at fgate: finds the longest gate sequence,
+  // merges the qubits and links of its members into fgate, collects their
+  // gates and records fgate in scratch.gates. If only fgate itself is found
+  // at the top level (scratch.count == 0), fgate is reset to kFirst and
+  // nothing is recorded. May call itself recursively via AddGatesFromPrev.
+  static void MakeGateSequence(
+      unsigned max_fused_size, Scratch& scratch, GateF& fgate) {
+    // Every sequence built since the last scratch reset gets its own level.
+    unsigned level = kSecond + scratch.count;
+
+    FindLongestGateSequence(max_fused_size, level, scratch, fgate);
+
+    // Copy: scratch.longest_seq is overwritten by recursive calls below.
+    auto longest_seq = scratch.longest_seq;
+
+    if (longest_seq.size() == 1 && scratch.count == 0) {
+      fgate.visited = kFirst;
+      return;
+    }
+
+    ++scratch.count;
+
+    // Merge the added qubits and links of all sequence members into fgate.
+    // Members are temporarily marked kCompress for the link compression.
+    for (auto p : longest_seq) {
+      p->gate->visited = kCompress;
+
+      for (auto q : p->qubits) {
+        fgate.qubits.push_back(q);
+        fgate.mask |= uint64_t{1} << q;
+      }
+
+      for (auto l : p->links) {
+        fgate.links.push_back(l);
+      }
+    }
+
+    // Compress links.
+    for (auto& link : fgate.links) {
+      // Move to the earliest link of a run of sequence members.
+      while (link->prev != nullptr && link->prev->val->visited == kCompress) {
+        link = link->prev;
+      }
+
+      // Unhook the following links that belong to sequence members.
+      while (link->next != nullptr && link->next->val->visited == kCompress) {
+        LinkManager::Delete(link->next);
+      }
+    }
+
+    for (auto p : longest_seq) {
+      p->gate->visited = level;
+    }
+
+    // Gates at even positions (2, 4) were reached as "next" gates; their
+    // gates are appended directly.
+    if (longest_seq.size() >= 3) {
+      AddGatesFromNext(longest_seq[2]->gate->gates, fgate);
+    }
+
+    if (longest_seq.size() >= 5) {
+      AddGatesFromNext(longest_seq[4]->gate->gates, fgate);
+    }
+
+    if (longest_seq.size() >= 2) {
+      // May call MakeGateSequence recursively.
+      AddGatesFromPrev(max_fused_size, *longest_seq[1]->gate, scratch, fgate);
+    }
+
+    if (longest_seq.size() >= 4) {
+      // May call MakeGateSequence recursively.
+      AddGatesFromPrev(max_fused_size, *longest_seq[3]->gate, scratch, fgate);
+    }
+
+    for (auto p : longest_seq) {
+      p->gate->visited = kFinal;
+    }
+
+    // Absorb fusible gates that follow the merged gate in time.
+    FuseNext(1, fgate);
+
+    scratch.gates.push_back(&fgate);
+  }
+
+  // Appends all entries of gates to the gate list of fgate, keeping their
+  // order.
+  static void AddGatesFromNext(std::vector<const RGate*>& gates, GateF& fgate) {
+    fgate.gates.insert(fgate.gates.end(), gates.begin(), gates.end());
+  }
+
+  // Appends the gates of pfgate to fgate and then builds a gate sequence for
+  // every direct predecessor of pfgate that is still marked kFirst.
+  static void AddGatesFromPrev(unsigned max_fused_size, const GateF& pfgate,
+                               Scratch& scratch, GateF& fgate) {
+    fgate.gates.insert(
+        fgate.gates.end(), pfgate.gates.begin(), pfgate.gates.end());
+
+    for (auto link : pfgate.links) {
+      const auto before = link->prev;
+
+      if (before == nullptr) continue;
+
+      GateF* predecessor = before->val;
+
+      if (predecessor->visited == kFirst) {
+        MakeGateSequence(max_fused_size, scratch, *predecessor);
+      }
+    }
+  }
+
+  // Searches for the gate sequence starting at fgate that covers the largest
+  // number of qubits without exceeding max_fused_size. The search alternates
+  // between following gates (next1, next2) and preceding gates (prev1, prev2)
+  // up to a depth of four gates after fgate. The result is stored in
+  // scratch.longest_seq; its first element always refers to fgate.
+  // The search returns immediately when a sequence with exactly
+  // max_fused_size qubits is found; in that case the gates on the stack keep
+  // visited == level (Pop is not called for them).
+  static void FindLongestGateSequence(
+      unsigned max_fused_size, unsigned level, Scratch& scratch, GateF& fgate) {
+    scratch.data.push_back({&fgate, {}, {}});
+
+    scratch.longest_seq.resize(0);
+    scratch.longest_seq.push_back(&scratch.data.back());
+
+    scratch.stack.resize(0);
+    scratch.stack.push_back(&scratch.data.back());
+
+    // cur_size is the number of qubits of the sequence on the stack;
+    // max_size is the number of qubits of the best sequence so far.
+    unsigned cur_size = fgate.qubits.size();
+    fgate.visited = level;
+
+    unsigned max_size = cur_size;
+
+    GetNextAvailableGates(max_fused_size, cur_size, fgate, nullptr,
+                          scratch.data, scratch.next1);
+
+    for (auto n1 : scratch.next1) {
+      unsigned cur_size2 = cur_size + n1->qubits.size();
+      if (cur_size2 > max_fused_size) continue;
+
+      // n1 can be used only if all of its predecessors are available.
+      bool feasible = GetPrevAvailableGates(max_fused_size, cur_size,
+                                            level, *n1->gate, nullptr,
+                                            scratch.data, scratch.prev1);
+
+      if (!feasible) continue;
+
+      if (scratch.prev1.size() == 0 && max_fused_size > 3) continue;
+
+      if (cur_size2 == max_fused_size) {
+        // The sequence is full: stack + n1 is the result.
+        std::swap(scratch.longest_seq, scratch.stack);
+        scratch.longest_seq.push_back(n1);
+        return;
+      }
+
+      Push(level, cur_size2, cur_size, max_size, scratch, n1);
+
+      for (auto p1 : scratch.prev1) {
+        unsigned cur_size2 = cur_size + p1->qubits.size();
+
+        if (cur_size2 > max_fused_size) {
+          continue;
+        } else if (cur_size2 == max_fused_size) {
+          std::swap(scratch.longest_seq, scratch.stack);
+          scratch.longest_seq.push_back(p1);
+          return;
+        }
+
+        Push(level, cur_size2, cur_size, max_size, scratch, p1);
+
+        GetNextAvailableGates(max_fused_size, cur_size, *p1->gate, &fgate,
+                              scratch.data, scratch.next2);
+
+        for (auto n2 : scratch.next2) {
+          unsigned cur_size2 = cur_size + n2->qubits.size();
+          if (cur_size2 > max_fused_size) continue;
+
+          bool feasible = GetPrevAvailableGates(max_fused_size, cur_size,
+                                                level, *n2->gate, n1->gate,
+                                                scratch.data, scratch.prev2);
+
+          if (!feasible) continue;
+
+          if (cur_size2 == max_fused_size) {
+            std::swap(scratch.longest_seq, scratch.stack);
+            scratch.longest_seq.push_back(n2);
+            return;
+          }
+
+          Push(level, cur_size2, cur_size, max_size, scratch, n2);
+
+          for (auto p2 : scratch.prev2) {
+            unsigned cur_size2 = cur_size + p2->qubits.size();
+
+            if (cur_size2 > max_fused_size) {
+              continue;
+            } else if (cur_size2 == max_fused_size) {
+              std::swap(scratch.longest_seq, scratch.stack);
+              scratch.longest_seq.push_back(p2);
+              return;
+            }
+
+            // Deepest level: p2 is only recorded in the best sequence and
+            // is not kept on the stack.
+            if (cur_size2 > max_size) {
+              scratch.stack.push_back(p2);
+              scratch.longest_seq = scratch.stack;
+              scratch.stack.pop_back();
+              max_size = cur_size2;
+            }
+          }
+
+          Pop(cur_size, scratch, n2);
+        }
+
+        Pop(cur_size, scratch, p1);
+      }
+
+      Pop(cur_size, scratch, n1);
+    }
+  }
+
+  // Puts agate on the search stack, marks its gate with level and updates
+  // the current size to cur_size2. If the stack now covers more qubits than
+  // the best sequence so far, the stack becomes the new best sequence.
+  static void Push(unsigned level, unsigned cur_size2, unsigned& cur_size,
+                   unsigned& max_size, Scratch& scratch, GateA* agate) {
+    scratch.stack.push_back(agate);
+    agate->gate->visited = level;
+    cur_size = cur_size2;
+
+    const bool is_new_best = cur_size > max_size;
+
+    if (is_new_best) {
+      max_size = cur_size;
+      scratch.longest_seq = scratch.stack;
+    }
+  }
+
+  // Undoes Push: removes the top of the search stack, subtracts the qubits
+  // added by agate from cur_size and resets the gate's mark to kFirst.
+  static void Pop(unsigned& cur_size, Scratch& scratch, GateA* agate) {
+    scratch.stack.pop_back();
+    cur_size -= agate->qubits.size();
+    agate->gate->visited = kFirst;
+  }
+
+  // Collects the gates that directly follow pgate1 on any of its links and
+  // that can still be added to the current sequence. A following gate is
+  // skipped if its visited value is greater than kFirst, if it is unfusible
+  // or if adding its new qubits would exceed max_fused_size. New qubits are
+  // those that are in neither pgate1 nor pgate2 (pgate2 may be nullptr).
+  // Accepted candidates are stored in scratch and pointers to them are
+  // returned in next_gates.
+  static void GetNextAvailableGates(unsigned max_fused_size, unsigned cur_size,
+                                    const GateF& pgate1, const GateF* pgate2,
+                                    std::vector<GateA>& scratch,
+                                    std::vector<GateA*>& next_gates) {
+    next_gates.resize(0);
+
+    for (auto link : pgate1.links) {
+      if (link->next == nullptr) continue;
+
+      auto ngate = link->next->val;
+
+      if (ngate->visited > kFirst || ngate->parent->unfusible) continue;
+
+      GateA next = {ngate, {}, {}};
+      next.qubits.reserve(8);
+      next.links.reserve(8);
+
+      GetAddedQubits(pgate1, pgate2, *ngate, next);
+
+      if (cur_size + next.qubits.size() > max_fused_size) continue;
+
+      // NOTE(review): the stored pointer stays valid only while scratch does
+      // not reallocate; presumably the capacity reserved by the caller is
+      // meant to guarantee this - confirm.
+      scratch.push_back(std::move(next));
+      next_gates.push_back(&scratch.back());
+    }
+  }
+
+  // Collects the gates that directly precede ngate1 on any of its links and
+  // that have to be added to the sequence before ngate1 can be used.
+  // Predecessors marked kFinal or level are ignored. The function returns
+  // false (with prev_gates emptied) if a predecessor is marked with a value
+  // greater than kFirst, is unfusible or itself has a predecessor whose
+  // visited value is at most kMeaCnt. Predecessors whose new qubits would
+  // exceed max_fused_size are left out of prev_gates. New qubits are those
+  // that are in neither ngate1 nor ngate2 (ngate2 may be nullptr).
+  static bool GetPrevAvailableGates(unsigned max_fused_size,
+                                    unsigned cur_size, unsigned level,
+                                    const GateF& ngate1, const GateF* ngate2,
+                                    std::vector<GateA>& scratch,
+                                    std::vector<GateA*>& prev_gates) {
+    // True if some direct predecessor of g has visited <= kMeaCnt.
+    auto has_pending_predecessor = [](const GateF& g) -> bool {
+      for (auto l : g.links) {
+        if (l->prev != nullptr && l->prev->val->visited <= kMeaCnt) {
+          return true;
+        }
+      }
+
+      return false;
+    };
+
+    prev_gates.resize(0);
+
+    for (auto link : ngate1.links) {
+      if (link->prev == nullptr) continue;
+
+      GateF* candidate = link->prev->val;
+      const unsigned mark = candidate->visited;
+
+      if (mark == kFinal || mark == level) continue;
+
+      if (mark > kFirst || candidate->parent->unfusible
+          || has_pending_predecessor(*candidate)) {
+        prev_gates.resize(0);
+        return false;
+      }
+
+      GateA prev = {candidate, {}, {}};
+      prev.qubits.reserve(8);
+      prev.links.reserve(8);
+
+      GetAddedQubits(ngate1, ngate2, *candidate, prev);
+
+      if (cur_size + prev.qubits.size() > max_fused_size) continue;
+
+      scratch.push_back(std::move(prev));
+      prev_gates.push_back(&scratch.back());
+    }
+
+    return true;
+  }
+
+  // Appends to added the qubits of fgate2 (together with the matching links)
+  // that are contained neither in fgate0 nor, if fgate1 is not nullptr, in
+  // fgate1.
+  static void GetAddedQubits(const GateF& fgate0, const GateF* fgate1,
+                             const GateF& fgate2, GateA& added) {
+    auto acts_on = [](const GateF& g, unsigned q) -> bool {
+      return std::find(g.qubits.begin(), g.qubits.end(), q) != g.qubits.end();
+    };
+
+    const std::size_t count = fgate2.qubits.size();
+
+    for (std::size_t i = 0; i < count; ++i) {
+      const unsigned q2 = fgate2.qubits[i];
+
+      const bool already_present =
+          acts_on(fgate0, q2) || (fgate1 != nullptr && acts_on(*fgate1, q2));
+
+      if (!already_present) {
+        added.qubits.push_back(q2);
+        added.links.push_back(fgate2.links[i]);
+      }
+    }
+  }
+
+  // Fuse smaller gates with fgate back in gate time. The gates are collected
+  // while walking backward and appended to fgate.gates in reverse order of
+  // collection.
+  static void FusePrev(unsigned pass, GateF& fgate) {
+    std::vector<const RGate*> collected;
+    collected.reserve(fgate.gates.capacity());
+
+    auto step_back = [](const Link* link) -> const Link* {
+      return link->prev;
+    };
+
+    FusePrevOrNext<std::greater<unsigned>>(pass, step_back, fgate, collected);
+
+    fgate.gates.insert(
+        fgate.gates.end(), collected.rbegin(), collected.rend());
+  }
+
+  // Fuse smaller gates with fgate forward in gate time. The fused gates are
+  // appended directly to fgate.gates.
+  static void FuseNext(unsigned pass, GateF& fgate) {
+    using EarliestFirst = std::less<unsigned>;
+
+    auto step_forward = [](const Link* link) -> const Link* {
+      return link->next;
+    };
+
+    FusePrevOrNext<EarliestFirst>(pass, step_forward, fgate, fgate.gates);
+  }
+
+  // Repeatedly absorbs neighboring gates into fgate in one time direction.
+  // neighb selects the neighbor of a link (previous or next) and R orders
+  // the neighbors by parent gate time. A neighbor is absorbed only if all of
+  // its qubits are qubits of fgate, none of its qubits is blocked by a
+  // neighbor that was rejected earlier, its visited value does not exceed
+  // pass and it is not unfusible. In pass 0 the parent gate of an absorbed
+  // neighbor is appended to gates; in later passes all gates already
+  // collected by the neighbor are appended.
+  template <typename R, typename Neighbor>
+  static void FusePrevOrNext(unsigned pass, Neighbor neighb, GateF& fgate,
+                             std::vector<const RGate*>& gates) {
+    // Qubits of neighbors that could not be absorbed.
+    uint64_t bad_mask = 0;
+    // Work on a copy: the links are reordered by the sort below.
+    auto links = fgate.links;
+
+    bool may_have_gates_to_fuse = true;
+
+    while (may_have_gates_to_fuse) {
+      may_have_gates_to_fuse = false;
+
+      std::sort(links.begin(), links.end(),
+                [&neighb](const Link* l, const Link* r) -> bool {
+                  auto ln = neighb(l);
+                  auto rn = neighb(r);
+
+                  if (ln != nullptr && rn != nullptr) {
+                    return R()(ln->val->parent->time, rn->val->parent->time);
+                  } else {
+                    // nullptrs are larger than everything else and
+                    // equivalent among each other.
+                    return ln != nullptr;
+                  }
+                });
+
+      for (auto link : links) {
+        auto n = neighb(link);
+
+        if (n == nullptr) continue;
+
+        auto g = n->val;
+
+        if (!QubitsAreIn(fgate.mask, g->mask) || (g->mask & bad_mask) != 0
+            || g->visited > pass || g->parent->unfusible) {
+          // The neighbor blocks its qubits for the rest of this call.
+          bad_mask |= g->mask;
+        } else {
+          g->visited = pass == 0 ? kFirst : kFinal;
+
+          if (pass == 0) {
+            gates.push_back(g->parent);
+          } else {
+            for (auto gate : g->gates) {
+              gates.push_back(gate);
+            }
+          }
+
+          // Unhook the absorbed gate so that its own neighbors become the
+          // neighbors of fgate.
+          for (auto link : g->links) {
+            LinkManager::Delete(link);
+          }
+
+          // The neighbors have changed; sort and scan again.
+          may_have_gates_to_fuse = true;
+          break;
+        }
+      }
+    }
+  }
+
+  // Returns true if every bit set in mask is also set in mask0.
+  static bool QubitsAreIn(uint64_t mask0, uint64_t mask) {
+    const uint64_t outside = mask & ~mask0;
+
+    return outside == 0;
+  }
+
+  // Prints fusion statistics through IO::messagef.
+  // Nothing is printed for verbosity < 3. For verbosity >= 3 the numbers of
+  // controlled, measurement and fused gates are printed. For verbosity >= 5
+  // the time and the qubits of every fused gate are printed as well
+  // ("m" marks measurement gates, "c ... t" lists control qubits before the
+  // target qubits).
+  static void PrintStat(unsigned verbosity, const Stat& stat,
+                        const std::vector<GateFused>& fused_gates) {
+    if (verbosity < 3) return;
+
+    // All counters in Stat are of type unsigned, so they are printed with
+    // %u (passing an unsigned for %lu is a format/argument mismatch).
+    if (stat.num_controlled_gates > 0) {
+      IO::messagef("%u controlled gates\n", stat.num_controlled_gates);
+    }
+
+    if (stat.num_mea_gates > 0) {
+      IO::messagef("%u measurement gates", stat.num_mea_gates);
+      if (stat.num_fused_mea_gates == stat.num_mea_gates) {
+        IO::messagef("\n");
+      } else {
+        IO::messagef(" are fused into %u gates\n", stat.num_fused_mea_gates);
+      }
+    }
+
+    // Comma-separated list of gate counts per gate size, starting with
+    // one-qubit gates.
+    bool first = true;
+    for (unsigned i = 1; i < stat.num_gates.size(); ++i) {
+      if (stat.num_gates[i] > 0) {
+        if (first) {
+          first = false;
+        } else {
+          IO::messagef(", ");
+        }
+        IO::messagef("%u %u-qubit", stat.num_gates[i], i);
+      }
+    }
+
+    IO::messagef(" gates are fused into %u gates\n", stat.num_fused_gates);
+
+    if (verbosity < 5) return;
+
+    IO::messagef("fused gate qubits:\n");
+    for (const auto& g : fused_gates) {
+      IO::messagef("%6u  ", g.parent->time);
+      if (g.parent->kind == gate::kMeasurement) {
+        IO::messagef("m");
+      } else if (g.parent->controlled_by.size() > 0) {
+        IO::messagef("c");
+        for (auto q : g.parent->controlled_by) {
+          IO::messagef("%3u", q);
+        }
+        IO::messagef("  t");
+      } else {
+        IO::messagef(" ");
+      }
+
+      for (auto q : g.qubits) {
+        IO::messagef("%3u", q);
+      }
+      IO::messagef("\n");
+    }
+  }
+
+  // Checks that every target and control qubit of gate is smaller than
+  // max_qubit1 and that gate.time is strictly greater than the time of the
+  // gate at the end of the lattice for that qubit (if there is one). The
+  // first violation is reported through IO::errorf and false is returned;
+  // otherwise true is returned. Target qubits are checked before controls.
+  template <typename Gate2, typename GatesLat>
+  static bool ValidateGate(const Gate2& gate, unsigned max_qubit1,
+                           const GatesLat& gates_lat) {
+    auto qubit_is_valid = [&](unsigned q) -> bool {
+      if (q >= max_qubit1) {
+        IO::errorf("fuser: gate qubit %u is out of range "
+                   "(should be smaller than %u).\n", q, max_qubit1);
+        return false;
+      }
+
+      auto last = gates_lat[q];
+
+      if (last != nullptr && gate.time <= last->val->parent->time) {
+        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
+        return false;
+      }
+
+      return true;
+    };
+
+    for (unsigned q : gate.qubits) {
+      if (!qubit_is_valid(q)) return false;
+    }
+
+    for (unsigned q : gate.controlled_by) {
+      if (!qubit_is_valid(q)) return false;
+    }
+
+    return true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // FUSER_MQUBIT_H_
diff --git a/qsim/gate.h b/qsim/gate.h
new file mode 100644
index 0000000..a457acb
--- /dev/null
+++ b/qsim/gate.h
@@ -0,0 +1,216 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GATE_H_
+#define GATE_H_
+
+#include <algorithm>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "matrix.h"
+
+namespace qsim {
+
+namespace detail {
+
+template <typename Gate, typename GateDef>
+inline void SortQubits(Gate& gate) {
+  // Nothing to do when the qubits are already in ascending order.
+  if (std::is_sorted(gate.qubits.begin(), gate.qubits.end())) return;
+
+  // For non-symmetric gates the matrix is shuffled with the permutation that
+  // is computed from the still-unsorted qubits.
+  if (!GateDef::symmetric) {
+    auto perm = NormalToGateOrderPermutation(gate.qubits);
+    MatrixShuffle(perm, gate.qubits.size(), gate.matrix);
+  }
+
+  gate.swapped = true;
+  std::sort(gate.qubits.begin(), gate.qubits.end());
+}
+
+}  // namespace detail
+
+template <typename Qubits = std::vector<unsigned>, typename Gate>
+inline Gate& MakeControlledGate(Qubits&& controlled_by, Gate& gate) {
+  gate.controlled_by = std::forward<Qubits>(controlled_by);
+  gate.cmask = (uint64_t{1} << gate.controlled_by.size()) - 1;
+  // One bit is set in cmask per control qubit; the controls are then sorted.
+  std::sort(gate.controlled_by.begin(), gate.controlled_by.end());
+  // NOTE(review): the shift above is undefined for 64 or more control qubits.
+  return gate;
+}
+
+/**
+ * Makes the gate controlled by the given qubits with the given control values.
+ * Control qubits are stored in ascending order and bit i of gate.cmask is the
+ * lowest bit of the control value that belongs to the i-th stored qubit.
+ * Assumes controlled_by.size() == control_values.size().
+ * @return The gate that was passed in.
+ */
+template <typename Qubits = std::vector<unsigned>, typename Gate>
+inline Gate& MakeControlledGate(Qubits&& controlled_by,
+                               const std::vector<unsigned>& control_values,
+                               Gate& gate) {
+  gate.cmask = 0;
+
+  if (std::is_sorted(controlled_by.begin(), controlled_by.end())) {
+    gate.controlled_by = std::forward<Qubits>(controlled_by);
+
+    for (std::size_t i = 0; i < control_values.size(); ++i) {
+      // Widen before shifting: cmask is 64-bit but control values are unsigned.
+      gate.cmask |= static_cast<uint64_t>(control_values[i] & 1) << i;
+    }
+  } else {
+    struct ControlPair {
+      unsigned q;
+      unsigned v;
+    };
+
+    std::vector<ControlPair> cpairs;
+    cpairs.reserve(controlled_by.size());
+
+    for (std::size_t i = 0; i < controlled_by.size(); ++i) {
+      cpairs.push_back({controlled_by[i], control_values[i]});
+    }
+
+    // Sort control qubits and control values.
+    std::sort(cpairs.begin(), cpairs.end(),
+              [](const ControlPair& l, const ControlPair& r) -> bool {
+                return l.q < r.q;
+              });
+
+    // Replace (rather than append to) any previously set control qubits, as
+    // the sorted branch does.
+    gate.controlled_by.clear();
+    gate.controlled_by.reserve(cpairs.size());
+
+    for (std::size_t i = 0; i < cpairs.size(); ++i) {
+      gate.cmask |= static_cast<uint64_t>(cpairs[i].v & 1) << i;
+      gate.controlled_by.push_back(cpairs[i].q);
+    }
+  }
+
+  return gate;
+}
+
+namespace gate {
+
+constexpr int kDecomp = 100001;       // gate from Schmidt decomposition
+constexpr int kMeasurement = 100002;  // measurement gate
+
+}  // namespace gate
+
+enum GateAnyKind {
+  kGateAny = -1,
+};
+
+/**
+ * A generic gate to make it easier to use qsim with external gate sets.
+ */
+template <typename FP, typename GK = GateAnyKind>
+struct Gate {
+  using fp_type = FP;
+  using GateKind = GK;
+
+  GateKind kind;
+  unsigned time;
+  std::vector<unsigned> qubits;
+  std::vector<unsigned> controlled_by;
+  uint64_t cmask;
+  std::vector<fp_type> params;
+  Matrix<fp_type> matrix;
+  bool unfusible;      // If true, the gate is fused as a parent.
+  bool swapped;        // If true, the gate qubits are swapped to make qubits
+                       // ordered in ascending order. This does not apply to
+                       // control qubits of explicitly-controlled gates.
+
+  template <typename Qubits = std::vector<unsigned>>
+  Gate&& ControlledBy(Qubits&& controlled_by) {
+    MakeControlledGate(std::forward<Qubits>(controlled_by), *this);
+    return std::move(*this);
+  }
+
+  template <typename Qubits = std::vector<unsigned>>
+  Gate&& ControlledBy(Qubits&& controlled_by,
+                      const std::vector<unsigned>& control_values) {
+    MakeControlledGate(
+        std::forward<Qubits>(controlled_by), control_values, *this);
+    return std::move(*this);
+  }
+};
+
+template <typename Gate, typename GateDef,
+          typename Qubits = std::vector<unsigned>,
+          typename M = Matrix<typename Gate::fp_type>>
+inline Gate CreateGate(unsigned time, Qubits&& qubits, M&& matrix = {},
+                       std::vector<typename Gate::fp_type>&& params = {}) {
+  Gate gate = {GateDef::kind, time, std::forward<Qubits>(qubits), {}, 0,
+               std::move(params), std::forward<M>(matrix), false, false};
+
+  // Qubits of measurement gates are left in the order they were given.
+  if (GateDef::kind == gate::kMeasurement) return gate;
+
+  const auto num_qubits = gate.qubits.size();
+  if (num_qubits == 2) {
+    // Two qubits: a single swap puts them in ascending order.
+    if (gate.qubits[0] > gate.qubits[1]) {
+      gate.swapped = true;
+      std::swap(gate.qubits[0], gate.qubits[1]);
+      if (!GateDef::symmetric) {
+        MatrixShuffle({1, 0}, 2, gate.matrix);
+      }
+    }
+  } else if (num_qubits != 1) {
+    // Any other qubit count except one goes through the generic sort.
+    detail::SortQubits<Gate, GateDef>(gate);
+  }
+
+  return gate;
+}
+
+namespace gate {
+
+/**
+ * A gate that simulates measurement of one or more qubits, collapsing the
+ * state vector and storing the measured results.
+ */
+template <typename Gate>
+struct Measurement {
+  using GateKind = typename Gate::GateKind;
+
+  static constexpr GateKind kind = GateKind::kMeasurement;
+  static constexpr char name[] = "m";
+  static constexpr bool symmetric = false;
+
+  template <typename Qubits = std::vector<unsigned>>
+  static Gate Create(unsigned time, Qubits&& qubits) {
+    return CreateGate<Gate, Measurement>(time, std::forward<Qubits>(qubits));
+  }
+};
+
+}  // namespace gate
+
+template <typename fp_type>
+using schmidt_decomp_type = std::vector<std::vector<std::vector<fp_type>>>;
+
+template <typename fp_type, typename GateKind>
+schmidt_decomp_type<fp_type> GetSchmidtDecomp(
+    GateKind kind, const std::vector<fp_type>& params);
+
+}  // namespace qsim
+
+#endif  // GATE_H_
diff --git a/qsim/gate_appl.h b/qsim/gate_appl.h
new file mode 100644
index 0000000..8601e6f
--- /dev/null
+++ b/qsim/gate_appl.h
@@ -0,0 +1,231 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GATE_APPL_H_
+#define GATE_APPL_H_
+
+#include <utility>
+#include <vector>
+
+#include "fuser.h"
+#include "gate.h"
+#include "matrix.h"
+
+namespace qsim {
+
+/**
+ * Applies the given gate to the simulator state. Ignores measurement gates.
+ * @param simulator Simulator object. Provides specific implementations for
+ *   applying gates.
+ * @param gate The gate to be applied.
+ * @param state The state of the system, to be updated by this method.
+ */
+template <typename Simulator, typename Gate>
+inline void ApplyGate(const Simulator& simulator, const Gate& gate,
+                      typename Simulator::State& state) {
+  // Measurement gates are not handled by this overload.
+  if (gate.kind == gate::kMeasurement) return;
+  if (gate.controlled_by.size() != 0) {
+    simulator.ApplyControlledGate(gate.qubits, gate.controlled_by,
+                                  gate.cmask, gate.matrix.data(), state);
+  } else {
+    simulator.ApplyGate(gate.qubits, gate.matrix.data(), state);
+  }
+}
+
+/**
+ * Applies the given gate dagger to the simulator state. If the gate matrix is
+ *   unitary then this is equivalent to applying the inverse gate. Ignores
+ *   measurement gates.
+ * @param simulator Simulator object. Provides specific implementations for
+ *   applying gates.
+ * @param gate The gate to be applied.
+ * @param state The state of the system, to be updated by this method.
+ */
+template <typename Simulator, typename Gate>
+inline void ApplyGateDagger(const Simulator& simulator, const Gate& gate,
+                            typename Simulator::State& state) {
+  if (gate.kind == gate::kMeasurement) return;
+  // Take the dagger of a copy; the gate's own matrix is not modified.
+  auto dagger = gate.matrix;
+  MatrixDagger(unsigned{1} << gate.qubits.size(), dagger);
+
+  if (gate.controlled_by.size() != 0) {
+    simulator.ApplyControlledGate(gate.qubits, gate.controlled_by,
+                                  gate.cmask, dagger.data(), state);
+  } else {
+    simulator.ApplyGate(gate.qubits, dagger.data(), state);
+  }
+}
+
+/**
+ * Applies the given gate to the simulator state.
+ * @param state_space StateSpace object required to perform measurements.
+ * @param simulator Simulator object. Provides specific implementations for
+ *   applying gates.
+ * @param gate The gate to be applied.
+ * @param rgen Random number generator to perform measurements.
+ * @param state The state of the system, to be updated by this method.
+ * @param mresults As an input parameter, this can be empty or this can
+ *   contain the results of the previous measurements. If gate is a measurement
+ *   gate then after a successful run, the measurement result will be added to
+ *   this.
+ * @return True if the measurement performed successfully; false otherwise.
+ */
+template <typename Simulator, typename Gate, typename Rgen>
+inline bool ApplyGate(
+    const typename Simulator::StateSpace& state_space,
+    const Simulator& simulator, const Gate& gate, Rgen& rgen,
+    typename Simulator::State& state,
+    std::vector<typename Simulator::StateSpace::MeasurementResult>& mresults) {
+  if (gate.kind != gate::kMeasurement) {
+    // Non-measurement gates always report success.
+    ApplyGate(simulator, gate, state);
+    return true;
+  }
+
+  auto outcome = state_space.Measure(gate.qubits, rgen, state);
+  if (!outcome.valid) return false;
+
+  // Append the new result after any results of previous measurements.
+  mresults.push_back(std::move(outcome));
+  return true;
+}
+
+/**
+ * Applies the given gate to the simulator state, discarding measurement
+ *   results.
+ * @param state_space StateSpace object required to perform measurements.
+ * @param simulator Simulator object. Provides specific implementations for
+ *   applying gates.
+ * @param gate The gate to be applied.
+ * @param rgen Random number generator to perform measurements.
+ * @param state The state of the system, to be updated by this method.
+ * @return True if the measurement performed successfully; false otherwise.
+ */
+template <typename Simulator, typename Gate, typename Rgen>
+inline bool ApplyGate(const typename Simulator::StateSpace& state_space,
+                      const Simulator& simulator, const Gate& gate, Rgen& rgen,
+                      typename Simulator::State& state) {
+  // Measurement results end up in a local vector that is dropped on return;
+  // only the success flag is passed on to the caller.
+  std::vector<typename Simulator::StateSpace::MeasurementResult> scratch;
+  return ApplyGate(state_space, simulator, gate, rgen, state, scratch);
+}
+
+/**
+ * Applies the given fused gate to the simulator state. Ignores measurement
+ *   gates.
+ * @param simulator Simulator object. Provides specific implementations for
+ *   applying gates.
+ * @param gate The gate to be applied.
+ * @param state The state of the system, to be updated by this method.
+ */
+template <typename Simulator, typename Gate>
+inline void ApplyFusedGate(const Simulator& simulator, const Gate& gate,
+                           typename Simulator::State& state) {
+  if (gate.kind == gate::kMeasurement) return;
+  // Control qubits and the control mask are read from the parent gate.
+  if (gate.parent->controlled_by.size() != 0) {
+    simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by,
+                                  gate.parent->cmask, gate.matrix.data(),
+                                  state);
+  } else {
+    simulator.ApplyGate(gate.qubits, gate.matrix.data(), state);
+  }
+}
+
+/**
+ * Applies the given fused gate dagger to the simulator state. If the gate
+ *   matrix is unitary then this is equivalent to applying the inverse gate.
+ *   Ignores measurement gates.
+ * @param simulator Simulator object. Provides specific implementations for
+ *   applying gates.
+ * @param gate The gate to be applied.
+ * @param state The state of the system, to be updated by this method.
+ */
+template <typename Simulator, typename Gate>
+inline void ApplyFusedGateDagger(const Simulator& simulator, const Gate& gate,
+                                 typename Simulator::State& state) {
+  if (gate.kind == gate::kMeasurement) return;
+  // Take the dagger of a copy; the fused gate's matrix is not modified.
+  auto dagger = gate.matrix;
+  MatrixDagger(unsigned{1} << gate.qubits.size(), dagger);
+
+  if (gate.parent->controlled_by.size() != 0) {
+    simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by,
+                                  gate.parent->cmask, dagger.data(), state);
+  } else {
+    simulator.ApplyGate(gate.qubits, dagger.data(), state);
+  }
+}
+
+/**
+ * Applies the given fused gate to the simulator state.
+ * @param state_space StateSpace object required to perform measurements.
+ * @param simulator Simulator object. Provides specific implementations for
+ *   applying gates.
+ * @param gate The gate to be applied.
+ * @param rgen Random number generator to perform measurements.
+ * @param state The state of the system, to be updated by this method.
+ * @param mresults As an input parameter, this can be empty or this can
+ *   contain the results of the previous measurements. If gate is a measurement
+ *   gate then after a successful run, the measurement result will be added to
+ *   this.
+ * @return True if the measurement performed successfully; false otherwise.
+ */
+template <typename Simulator, typename Gate, typename Rgen>
+inline bool ApplyFusedGate(
+    const typename Simulator::StateSpace& state_space,
+    const Simulator& simulator, const Gate& gate, Rgen& rgen,
+    typename Simulator::State& state,
+    std::vector<typename Simulator::StateSpace::MeasurementResult>& mresults) {
+  if (gate.kind != gate::kMeasurement) {
+    // Non-measurement gates always report success.
+    ApplyFusedGate(simulator, gate, state);
+    return true;
+  }
+
+  auto outcome = state_space.Measure(gate.qubits, rgen, state);
+  if (!outcome.valid) return false;
+
+  // Append the new result after any results of previous measurements.
+  mresults.push_back(std::move(outcome));
+  return true;
+}
+
+/**
+ * Applies the given fused gate to the simulator state, discarding measurement
+ *   results.
+ * @param state_space StateSpace object required to perform measurements.
+ * @param simulator Simulator object. Provides specific implementations for
+ *   applying gates.
+ * @param gate The gate to be applied.
+ * @param rgen Random number generator to perform measurements.
+ * @param state The state of the system, to be updated by this method.
+ * @return True if the measurement performed successfully; false otherwise.
+ */
+template <typename Simulator, typename Gate, typename Rgen>
+inline bool ApplyFusedGate(const typename Simulator::StateSpace& state_space,
+                           const Simulator& simulator, const Gate& gate,
+                           Rgen& rgen, typename Simulator::State& state) {
+  // Measurement results end up in a local vector that is dropped on return;
+  // only the success flag is passed on to the caller.
+  std::vector<typename Simulator::StateSpace::MeasurementResult> scratch;
+  return ApplyFusedGate(state_space, simulator, gate, rgen, state, scratch);
+}
+
+}  // namespace qsim
+
+#endif  // GATE_APPL_H_
diff --git a/qsim/gates_cirq.h b/qsim/gates_cirq.h
new file mode 100644
index 0000000..d767959
--- /dev/null
+++ b/qsim/gates_cirq.h
@@ -0,0 +1,1640 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GATES_CIRQ_H_
+#define GATES_CIRQ_H_
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <vector>
+
+#include "gate.h"
+#include "matrix.h"
+
+namespace qsim {
+
+namespace Cirq {
+
+enum GateKind {
+  kI1 = 0,     // One-qubit identity gate.
+  kI2,         // Two-qubit identity gate.
+  kI,          // Multi-qubit identity gate.
+  kXPowGate,
+  kYPowGate,
+  kZPowGate,
+  kHPowGate,
+  kCZPowGate,
+  kCXPowGate,
+  krx,
+  kry,
+  krz,
+  kH,
+  kS,
+  kCZ,
+  kCX,
+  kT,
+  kX,
+  kY,
+  kZ,
+  kPhasedXPowGate,
+  kPhasedXZGate,
+  kXXPowGate,
+  kYYPowGate,
+  kZZPowGate,
+  kXX,
+  kYY,
+  kZZ,
+  kSwapPowGate,
+  kISwapPowGate,
+  kriswap,
+  kSWAP,
+  kISWAP,
+  kPhasedISwapPowGate,
+  kgivens,
+  kFSimGate,
+  kTwoQubitDiagonalGate,
+  kThreeQubitDiagonalGate,
+  kCCZPowGate,
+  kCCXPowGate,
+  kCSwapGate,
+  kCCZ,
+  kCCX,
+  kMatrixGate1,  // One-qubit matrix gate.
+  kMatrixGate2,  // Two-qubit matrix gate.
+  kMatrixGate,   // Multi-qubit matrix gate.
+  kGlobalPhaseGate,
+  kDecomp = gate::kDecomp,
+  kMeasurement = gate::kMeasurement,
+};
+
+template <typename fp_type>
+using GateCirq = Gate<fp_type, GateKind>;
+
+constexpr double h_double = 0.5;
+constexpr double pi_double = 3.14159265358979323846264338327950288;
+constexpr double is2_double = 0.7071067811865475;
+
+// Gates from cirq/ops/global_phase_op.py:
+
+/**
+ * The global phase gate.
+ */
+template <typename fp_type>
+struct GlobalPhaseGate {
+  static constexpr GateKind kind = kGlobalPhaseGate;
+  static constexpr char name[] = "GlobalPhaseGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, fp_type phi) {
+    return Create(time, std::cos(phi), std::sin(phi));
+  }
+
+  static GateCirq<fp_type> Create(unsigned time, fp_type cp, fp_type sp) {
+    return CreateGate<GateCirq<fp_type>, GlobalPhaseGate>(
+        time, {}, {cp, sp}, {cp, sp});
+  }
+};
+
+template <typename fp_type>
+using global_phase_operation = GlobalPhaseGate<fp_type>;
+
+// Gates from cirq/ops/identity.py:
+
+/**
+ * A one-qubit identity gate.
+ */
+template <typename fp_type>
+struct I1 {
+  static constexpr GateKind kind = kI1;
+  static constexpr char name[] = "I1";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateCirq<fp_type>, I1>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0});
+  }
+};
+
+/**
+ * A two-qubit identity gate.
+ */
+template <typename fp_type>
+struct I2 {
+  static constexpr GateKind kind = kI2;
+  static constexpr char name[] = "I2";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateCirq<fp_type>, I2>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+    };
+  }
+};
+
+/**
+ * A multi-qubit identity gate.
+ */
+template <typename fp_type>
+struct I {
+  static constexpr GateKind kind = kI;
+  static constexpr char name[] = "I";
+  static constexpr bool symmetric = true;
+  // Create passes dimension 2^n to MatrixIdentity for the n given qubits.
+  static GateCirq<fp_type> Create(unsigned time,
+                                  const std::vector<unsigned>& qubits) {
+    Matrix<fp_type> matrix;
+    MatrixIdentity(unsigned{1} << qubits.size(), matrix);
+    return CreateGate<GateCirq<fp_type>, I>(time, qubits, std::move(matrix));
+  }
+};
+
+// Gates from cirq/ops/common_gates.py:
+
+/**
+ * A gate that rotates around the X axis of the Bloch sphere.
+ * This is a generalization of the X gate.
+ */
+template <typename fp_type>
+struct XPowGate {
+  static constexpr GateKind kind = kXPowGate;
+  static constexpr char name[] = "XPowGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));
+    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));
+
+    return CreateGate<GateCirq<fp_type>, XPowGate>(
+        time, {q0}, {c * gc, c * gs, s * gs, -s * gc,
+                     s * gs, -s * gc, c * gc, c * gs},
+        {exponent, global_shift});
+  }
+};
+
+/**
+ * A gate that rotates around the Y axis of the Bloch sphere.
+ * This is a generalization of the Y gate.
+ */
+template <typename fp_type>
+struct YPowGate {
+  static constexpr GateKind kind = kYPowGate;
+  static constexpr char name[] = "YPowGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));
+    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));
+
+    return CreateGate<GateCirq<fp_type>, YPowGate>(
+        time, {q0}, {c * gc, c * gs, -s * gc, -s * gs,
+                     s * gc, s * gs, c * gc, c * gs}, {exponent, global_shift});
+  }
+};
+
+/**
+ * A gate that rotates around the Z axis of the Bloch sphere.
+ * This is a generalization of the Z gate.
+ */
+template <typename fp_type>
+struct ZPowGate {
+  static constexpr GateKind kind = kZPowGate;
+  static constexpr char name[] = "ZPowGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type c = std::cos(pi * exponent);
+    fp_type s = std::sin(pi * exponent);
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+
+    return CreateGate<GateCirq<fp_type>, ZPowGate>(
+        time, {q0}, {gc, gs, 0, 0, 0, 0, c * gc - s * gs, c * gs + s * gc},
+        {exponent, global_shift});
+  }
+};
+
+/**
+ * A gate that rotates around the X+Z axis of the Bloch sphere.
+ * This is a generalization of the Hadamard gate.
+ */
+template <typename fp_type>
+struct HPowGate {
+  static constexpr GateKind kind = kHPowGate;
+  static constexpr char name[] = "HPowGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));
+    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));
+
+    fp_type a = s * gs * is2;
+    fp_type b = s * gc * is2;
+
+    return CreateGate<GateCirq<fp_type>, HPowGate>(
+        time, {q0}, {c * gc + a, c * gs - b, a, -b,
+                     a, -b, c * gc - a, c * gs + b}, {exponent, global_shift});
+  }
+};
+
+/**
+ * A gate that applies a phase to the |11⟩ state of two qubits.
+ * This is a generalization of the CZ gate.
+ */
+template <typename fp_type>
+struct CZPowGate {
+  static constexpr GateKind kind = kCZPowGate;
+  static constexpr char name[] = "CZPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type ec = std::cos(pi * exponent * (1 + global_shift));
+    fp_type es = std::sin(pi * exponent * (1 + global_shift));
+
+    return CreateGate<GateCirq<fp_type>, CZPowGate>(
+        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
+                         0, 0, gc, gs, 0, 0, 0, 0,
+                         0, 0, 0, 0, gc, gs, 0, 0,
+                         0, 0, 0, 0, 0, 0, ec, es}, {exponent, global_shift});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type ec = std::cos(pi * exponent * (1 + global_shift));
+    fp_type es = std::sin(pi * exponent * (1 + global_shift));
+
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}},
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {gc, gs, 0, 0, 0, 0, ec, es}},
+    };
+  }
+};
+
+/**
+ * A gate that applies a controlled power of an X gate.
+ * This is a generalization of the CX (or CNOT) gate.
+ */
+template <typename fp_type>
+struct CXPowGate {
+  static constexpr GateKind kind = kCXPowGate;
+  static constexpr char name[] = "CXPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
+    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
+
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    return CreateGate<GateCirq<fp_type>, CXPowGate>(
+        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
+                         0, 0, c * ec, c * es, 0, 0, s * es, -s * ec,
+                         0, 0, 0, 0, gc, gs, 0, 0,
+                         0, 0, s * es, -s * ec, 0, 0, c * ec, c * es},
+        {exponent, global_shift});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
+    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
+
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}},
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {c * ec, c * es, s * es, -s * ec,
+                                  s * es, -s * ec, c * ec, c * es}},
+    };
+  }
+};
+
+/**
+ * The `(exponent = phi/pi, global_shift = -0.5)` instance of XPowGate.
+ * This is a generalization of the X gate with a fixed global phase.
+ * This is a function in Cirq.
+ */
+template <typename fp_type>
+struct rx {
+  static constexpr GateKind kind = krx;
+  static constexpr char name[] = "rx";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    fp_type c = std::cos(-0.5 * phi);
+    fp_type s = std::sin(-0.5 * phi);
+
+    return CreateGate<GateCirq<fp_type>, rx>(
+        time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi});
+  }
+};
+
+/**
+ * The `(exponent = phi/pi, global_shift = -0.5)` instance of YPowGate.
+ * This is a generalization of the Y gate with a fixed global phase.
+ * This is a function in Cirq.
+ */
+template <typename fp_type>
+struct ry {
+  static constexpr GateKind kind = kry;
+  static constexpr char name[] = "ry";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    fp_type c = std::cos(-0.5 * phi);
+    fp_type s = std::sin(-0.5 * phi);
+
+    return CreateGate<GateCirq<fp_type>, ry>(
+        time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi});
+  }
+};
+
+/**
+ * The `(exponent = phi/pi, global_shift = -0.5)` instance of ZPowGate.
+ * This is a generalization of the Z gate with a fixed global phase.
+ * This is a function in Cirq.
+ */
+template <typename fp_type>
+struct rz {
+  static constexpr GateKind kind = krz;
+  static constexpr char name[] = "rz";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    fp_type c = std::cos(-0.5 * phi);
+    fp_type s = std::sin(-0.5 * phi);
+
+    return CreateGate<GateCirq<fp_type>, rz>(
+        time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi});
+  }
+};
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of HPowGate.
+ * This is the canonical Hadamard (or H) gate.
+ */
+template <typename fp_type>
+struct H {
+  static constexpr GateKind kind = kH;
+  static constexpr char name[] = "H";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateCirq<fp_type>, H>(
+        time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0});
+  }
+};
+
+/**
+ * The `(exponent = 0.5, global_shift = 0)` instance of ZPowGate.
+ * This is the canonical S gate.
+ */
+template <typename fp_type>
+struct S {
+  static constexpr GateKind kind = kS;
+  static constexpr char name[] = "S";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateCirq<fp_type>, S>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1});
+  }
+};
+
+/**
+ * The canonical T gate, i.e. the `(exponent = 0.25, global_shift = 0)`
+ * instance of ZPowGate. Its matrix is `diag(1, is2 + i * is2)`.
+ */
+template <typename fp_type>
+struct T {
+  static constexpr GateKind kind = kT;
+  static constexpr char name[] = "T";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  // File-level constant is2_double narrowed to fp_type
+  // (presumably 1/sqrt(2) -- confirm at its definition).
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    // One matrix row per line; every entry is a (real, imaginary) pair.
+    return CreateGate<GateCirq<fp_type>, T>(
+        time, {q0},
+        {1, 0, 0, 0,
+         0, 0, is2, is2});
+  }
+};
+
+/**
+ * The canonical CZ gate, i.e. the `(exponent = 1, global_shift = 0)`
+ * instance of CZPowGate. Its matrix is `diag(1, 1, 1, -1)`.
+ */
+template <typename fp_type>
+struct CZ {
+  static constexpr GateKind kind = kCZ;
+  static constexpr char name[] = "CZ";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    // 4x4 matrix, one row per line; every entry is a (real, imaginary) pair.
+    return CreateGate<GateCirq<fp_type>, CZ>(
+        time, {q0, q1},
+        {1, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 1, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 1, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, -1, 0});
+  }
+
+  // Returns two terms, each a pair of 2x2 matrices:
+  // (diag(1, 0), identity) and (diag(0, 1), diag(1, -1)).
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0},
+       {1, 0, 0, 0, 0, 0, 1, 0}},
+      {{0, 0, 0, 0, 0, 0, 1, 0},
+       {1, 0, 0, 0, 0, 0, -1, 0}},
+    };
+  }
+};
+
+// CNotPowGate is an alternative name for CXPowGate.
+template <typename fp_type>
+using CNotPowGate = CXPowGate<fp_type>;
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of CXPowGate.
+ * This is the canonical CX (or CNOT) gate.
+ *
+ * The gate is not symmetric in its two qubits (`symmetric = false`).
+ */
+template <typename fp_type>
+struct CX {
+  static constexpr GateKind kind = kCX;
+  // Plain gate name, matching the convention of every other gate struct
+  // (the enum-style spelling "kCX" belongs to `kind`, not to `name`).
+  static constexpr char name[] = "CX";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    // Entries are (real, imaginary) pairs, one matrix row per line.
+    return CreateGate<GateCirq<fp_type>, CX>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0});
+  }
+
+  // Returns two terms, each a pair of 2x2 matrices:
+  // (diag(1, 0), identity) and (diag(0, 1), [[0, 1], [1, 0]]).
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},
+    };
+  }
+};
+
+// CNOT is an alternative name for CX.
+template <typename fp_type>
+using CNOT = CX<fp_type>;
+
+// Gates from cirq/ops/pauli_gates.py:
+
+/**
+ * The canonical Pauli X gate, i.e. the `(exponent = 1, global_shift = 0)`
+ * instance of XPowGate. Its matrix is `[[0, 1], [1, 0]]`.
+ */
+template <typename fp_type>
+struct X : public XPowGate<fp_type> {
+  static constexpr GateKind kind = kX;
+  static constexpr char name[] = "X";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    // One matrix row per line; every entry is a (real, imaginary) pair.
+    return CreateGate<GateCirq<fp_type>, X>(
+        time, {q0},
+        {0, 0, 1, 0,
+         1, 0, 0, 0});
+  }
+};
+
+/**
+ * The canonical Pauli Y gate, i.e. the `(exponent = 1, global_shift = 0)`
+ * instance of YPowGate. Its matrix is `[[0, -i], [i, 0]]`.
+ */
+template <typename fp_type>
+struct Y : public YPowGate<fp_type> {
+  static constexpr GateKind kind = kY;
+  static constexpr char name[] = "Y";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    // One matrix row per line; every entry is a (real, imaginary) pair.
+    return CreateGate<GateCirq<fp_type>, Y>(
+        time, {q0},
+        {0, 0, 0, -1,
+         0, 1, 0, 0});
+  }
+};
+
+/**
+ * The canonical Pauli Z gate, i.e. the `(exponent = 1, global_shift = 0)`
+ * instance of ZPowGate. Its matrix is `diag(1, -1)`.
+ */
+template <typename fp_type>
+struct Z : public ZPowGate<fp_type> {
+  static constexpr GateKind kind = kZ;
+  static constexpr char name[] = "Z";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    // One matrix row per line; every entry is a (real, imaginary) pair.
+    return CreateGate<GateCirq<fp_type>, Z>(
+        time, {q0},
+        {1, 0, 0, 0,
+         0, 0, -1, 0});
+  }
+};
+
+// Gates from cirq/ops/phased_x_gate.py:
+
+/**
+ * An XPowGate conjugated by ZPowGate%s.
+ * Equivalent to the circuit `───Z^-p───X^t───Z^p───`.
+ *
+ * Create takes `phase_exponent` (p), `exponent` (t, default 1) and
+ * `global_shift` (default 0); all three are multiplied by pi before use and
+ * are recorded, in that order, as the gate's parameters.
+ */
+template <typename fp_type>
+struct PhasedXPowGate {
+  static constexpr GateKind kind = kPhasedXPowGate;
+  static constexpr char name[] = "PhasedXPowGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type phase_exponent, fp_type exponent = 1,
+                                  fp_type global_shift = 0) {
+    // Cosine/sine of the phase angle (p*), the exponent angle (e*) and the
+    // global-shift angle (g*).
+    fp_type pc = std::cos(pi * phase_exponent);
+    fp_type ps = std::sin(pi * phase_exponent);
+    fp_type ec = std::cos(pi * exponent);
+    fp_type es = std::sin(pi * exponent);
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+
+    // With e = ec + i*es and g = gc + i*gs:
+    //   ar + i*ai = 0.5 * (1 + e) * g   (both diagonal entries)
+    //   br + i*bi = 0.5 * (1 - e) * g   (off-diagonal entry before phasing)
+    fp_type ar = 0.5 * ((1 + ec) * gc - es * gs);
+    fp_type ai = 0.5 * ((1 + ec) * gs + es * gc);
+    fp_type br = -0.5 * ((-1 + ec) * gc - es * gs);
+    fp_type bi = -0.5 * ((-1 + ec) * gs + es * gc);
+
+    // Entries are (real, imaginary) pairs. The upper-right entry is
+    // (br + i*bi) * (pc - i*ps); the lower-left one is (br + i*bi) * (pc + i*ps).
+    return CreateGate<GateCirq<fp_type>, PhasedXPowGate>(
+        time, {q0}, {ar, ai, pc * br + ps * bi, pc * bi - ps * br,
+                     pc * br - ps * bi, pc * bi + ps * br, ar, ai},
+        {phase_exponent, exponent, global_shift});
+  }
+};
+
+// Gates from cirq/ops/phased_x_z_gate.py:
+
+/**
+ * A PhasedXPowGate followed by a ZPowGate.
+ * Equivalent to the circuit `───Z^(-a)──X^x──Z^a───Z^z───`.
+ *
+ * Create takes `x_exponent` (x), `z_exponent` (z) and `axis_phase_exponent`
+ * (a); all three are multiplied by pi before use and are recorded, in that
+ * order, as the gate's parameters.
+ */
+template <typename fp_type>
+struct PhasedXZGate {
+  static constexpr GateKind kind = kPhasedXZGate;
+  static constexpr char name[] = "PhasedXZGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type x_exponent, fp_type z_exponent,
+                                  fp_type axis_phase_exponent) {
+    // Cosine/sine of pi * x (x*), pi * z (z*) and pi * a (a*).
+    fp_type xc = std::cos(pi * x_exponent);
+    fp_type xs = std::sin(pi * x_exponent);
+    fp_type zc = std::cos(pi * z_exponent);
+    fp_type zs = std::sin(pi * z_exponent);
+    fp_type ac = std::cos(pi * axis_phase_exponent);
+    fp_type as = std::sin(pi * axis_phase_exponent);
+
+    // With ex = xc + i*xs:
+    //   br + i*bi = 0.5 * (1 + ex)
+    //   cr + i*ci = 0.5 * (1 - ex)
+    //   dr + i*di = (ac + i*as) * (zc + i*zs)
+    fp_type br = 0.5 * (1 + xc);
+    fp_type bi = 0.5 * xs;
+    fp_type cr = -0.5 * (-1 + xc);
+    fp_type ci = -0.5 * xs;
+    fp_type dr = ac * zc - as * zs;
+    fp_type di = ac * zs + as * zc;
+
+    // Entries are (real, imaginary) pairs, in row order:
+    //   b,            c * (ac - i*as),
+    //   c * d,        b * (zc + i*zs)
+    return CreateGate<GateCirq<fp_type>, PhasedXZGate>(
+        time, {q0}, {br, bi, ac * cr + as * ci, ac * ci - as * cr,
+                     dr * cr - di * ci, dr * ci + di * cr,
+                     zc * br - zs * bi, zc * bi + zs * br},
+        {x_exponent, z_exponent, axis_phase_exponent});
+  }
+};
+
+// Gates from cirq/ops/parity_gates.py:
+
+/**
+ * The tensor product of two X gates, possibly raised to an exponent.
+ *
+ * Both Create and SchmidtDecomp compute the same two complex coefficients.
+ * With e = exp(i pi exponent) and g = exp(i pi exponent global_shift):
+ *   ic + i*is = 0.5 * (1 + e) * g   (weight of the identity part)
+ *   xc + i*xs = 0.5 * (1 - e) * g   (weight of the X (x) X part)
+ */
+template <typename fp_type>
+struct XXPowGate {
+  static constexpr GateKind kind = kXXPowGate;
+  static constexpr char name[] = "XXPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  // Builds the 4x4 gate matrix ((real, imaginary) pairs, one row per line)
+  // and records {exponent, global_shift} as the gate's parameters.
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent);
+    fp_type s = std::sin(pi * exponent);
+    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
+    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
+    fp_type xc = 0.5 * ((1 - c) * gc + s * gs);
+    fp_type xs = 0.5 * ((1 - c) * gs - s * gc);
+
+    // Identity weight on the diagonal, X (x) X weight on the anti-diagonal.
+    return CreateGate<GateCirq<fp_type>, XXPowGate>(
+        time, {q0, q1}, {ic, is, 0, 0, 0, 0, xc, xs,
+                         0, 0, ic, is, xc, xs, 0, 0,
+                         0, 0, xc, xs, ic, is, 0, 0,
+                         xc, xs, 0, 0, 0, 0, ic, is}, {exponent, global_shift});
+  }
+
+  // Returns two terms, each a pair of 2x2 matrices:
+  // (identity, (ic + i*is) * identity) and (X, (xc + i*xs) * X).
+  // Unlike Create, global_shift has no default here.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent);
+    fp_type s = std::sin(pi * exponent);
+    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
+    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
+    fp_type xc = 0.5 * ((1 - c) * gc + s * gs);
+    fp_type xs = 0.5 * ((1 - c) * gs - s * gc);
+
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},
+      {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, xc, xs, xc, xs, 0, 0}},
+    };
+  }
+};
+
+/**
+ * The tensor product of two Y gates, possibly raised to an exponent.
+ *
+ * Both Create and SchmidtDecomp compute the same two complex coefficients.
+ * With e = exp(i pi exponent) and g = exp(i pi exponent global_shift):
+ *   ic + i*is = 0.5 * (1 + e) * g   (weight of the identity part)
+ *   yc + i*ys = 0.5 * (1 - e) * g   (weight of the Y (x) Y part)
+ */
+template <typename fp_type>
+struct YYPowGate {
+  static constexpr GateKind kind = kYYPowGate;
+  static constexpr char name[] = "YYPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  // Builds the 4x4 gate matrix ((real, imaginary) pairs, one row per line)
+  // and records {exponent, global_shift} as the gate's parameters.
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent);
+    fp_type s = std::sin(pi * exponent);
+    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
+    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
+    fp_type yc = 0.5 * ((1 - c) * gc + s * gs);
+    fp_type ys = 0.5 * ((1 - c) * gs - s * gc);
+
+    // Identity weight on the diagonal; on the anti-diagonal the Y (x) Y
+    // weight is negated in the two corners and unchanged in the middle.
+    return CreateGate<GateCirq<fp_type>, YYPowGate>(
+        time, {q0, q1}, {ic, is, 0, 0, 0, 0, -yc, -ys,
+                         0, 0, ic, is, yc, ys, 0, 0,
+                         0, 0, yc, ys, ic, is, 0, 0,
+                         -yc, -ys, 0, 0, 0, 0, ic, is},
+        {exponent, global_shift});
+  }
+
+  // Returns two terms, each a pair of 2x2 matrices:
+  // (identity, (ic + i*is) * identity) and (Y, (yc + i*ys) * Y),
+  // where Y = [[0, -i], [i, 0]].
+  // Unlike Create, global_shift has no default here.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent);
+    fp_type s = std::sin(pi * exponent);
+    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
+    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
+    fp_type yc = 0.5 * ((1 - c) * gc + s * gs);
+    fp_type ys = 0.5 * ((1 - c) * gs - s * gc);
+
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},
+      {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, ys, -yc, -ys, yc, 0, 0}},
+    };
+  }
+};
+
+/**
+ * The tensor product of two Z gates, possibly raised to an exponent.
+ *
+ * The gate matrix is diagonal. With g = exp(i pi exponent global_shift) and
+ * e = exp(i pi exponent), its diagonal is (g, g * e, g * e, g).
+ */
+template <typename fp_type>
+struct ZZPowGate {
+  static constexpr GateKind kind = kZZPowGate;
+  static constexpr char name[] = "ZZPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  // Builds the 4x4 gate matrix ((real, imaginary) pairs, one row per line)
+  // and records {exponent, global_shift} as the gate's parameters.
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    // gc + i*gs = g; zc + i*zs = g * e (the phase on the two middle entries).
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type zc = std::cos(pi * exponent * (1 + global_shift));
+    fp_type zs = std::sin(pi * exponent * (1 + global_shift));
+
+    return CreateGate<GateCirq<fp_type>, ZZPowGate>(
+        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
+                         0, 0, zc, zs, 0, 0, 0, 0,
+                         0, 0, 0, 0, zc, zs, 0, 0,
+                         0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift});
+  }
+
+  // Returns two terms, each a pair of 2x2 matrices:
+  // (identity, (ic + i*is) * identity) and (Z, (zc + i*zs) * Z), where
+  //   ic + i*is = 0.5 * (1 + e) * g  and  zc + i*zs = 0.5 * (1 - e) * g.
+  // Note that zc/zs here differ in meaning from the locals of the same name
+  // in Create. Unlike Create, global_shift has no default here.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent);
+    fp_type s = std::sin(pi * exponent);
+    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
+    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
+    fp_type zc = 0.5 * ((1 - c) * gc + s * gs);
+    fp_type zs = 0.5 * ((1 - c) * gs - s * gc);
+
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},
+      {{1, 0, 0, 0, 0, 0, -1, 0}, {zc, zs, 0, 0, 0, 0, -zc, -zs}},
+    };
+  }
+};
+
+/**
+ * The tensor product of two X gates, i.e. the
+ * `(exponent = 1, global_shift = 0)` instance of XXPowGate.
+ * Its matrix has ones on the anti-diagonal.
+ */
+template <typename fp_type>
+struct XX {
+  static constexpr GateKind kind = kXX;
+  static constexpr char name[] = "XX";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    // 4x4 matrix, one row per line; every entry is a (real, imaginary) pair.
+    return CreateGate<GateCirq<fp_type>, XX>(
+        time, {q0, q1},
+        {0, 0, 0, 0, 0, 0, 1, 0,
+         0, 0, 0, 0, 1, 0, 0, 0,
+         0, 0, 1, 0, 0, 0, 0, 0,
+         1, 0, 0, 0, 0, 0, 0, 0});
+  }
+
+  // Returns a single term: the pair (X, X) of 2x2 matrices.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{0, 0, 1, 0, 1, 0, 0, 0},
+       {0, 0, 1, 0, 1, 0, 0, 0}},
+    };
+  }
+};
+
+/**
+ * The tensor product of two Y gates, i.e. the
+ * `(exponent = 1, global_shift = 0)` instance of YYPowGate.
+ * Its matrix is anti-diagonal with entries (-1, 1, 1, -1) from the top row
+ * to the bottom row.
+ */
+template <typename fp_type>
+struct YY {
+  static constexpr GateKind kind = kYY;
+  static constexpr char name[] = "YY";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    // 4x4 matrix, one row per line; every entry is a (real, imaginary) pair.
+    return CreateGate<GateCirq<fp_type>, YY>(
+        time, {q0, q1},
+        {0, 0, 0, 0, 0, 0, -1, 0,
+         0, 0, 0, 0, 1, 0, 0, 0,
+         0, 0, 1, 0, 0, 0, 0, 0,
+         -1, 0, 0, 0, 0, 0, 0, 0});
+  }
+
+  // Returns a single term: the pair (Y, Y) with Y = [[0, -i], [i, 0]].
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{0, 0, 0, -1, 0, 1, 0, 0},
+       {0, 0, 0, -1, 0, 1, 0, 0}},
+    };
+  }
+};
+
+/**
+ * The tensor product of two Z gates, i.e. the
+ * `(exponent = 1, global_shift = 0)` instance of ZZPowGate.
+ * Its matrix is `diag(1, -1, -1, 1)`.
+ */
+template <typename fp_type>
+struct ZZ {
+  static constexpr GateKind kind = kZZ;
+  static constexpr char name[] = "ZZ";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    // 4x4 matrix, one row per line; every entry is a (real, imaginary) pair.
+    return CreateGate<GateCirq<fp_type>, ZZ>(
+        time, {q0, q1},
+        {1, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, -1, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, -1, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  // Returns a single term: the pair (Z, Z) with Z = diag(1, -1).
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, -1, 0},
+       {1, 0, 0, 0, 0, 0, -1, 0}},
+    };
+  }
+};
+
+// Gates from cirq/ops/swap_gates.py:
+
+/**
+ * The SWAP gate, possibly raised to a power. Exchanges qubits.
+ *
+ * Notation used in the comments below:
+ *   g = gc + i*gs = exp(i pi exponent global_shift)
+ *   w = ec + i*es = exp(i pi exponent (0.5 + global_shift))
+ *   c, s          = cos / sin of (pi exponent / 2)
+ */
+template <typename fp_type>
+struct SwapPowGate {
+  static constexpr GateKind kind = kSwapPowGate;
+  static constexpr char name[] = "SwapPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  // File-level constant h_double narrowed to fp_type
+  // (presumably 0.5 -- confirm at its definition).
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  // Builds the 4x4 gate matrix ((real, imaginary) pairs, one row per line)
+  // and records {exponent, global_shift} as the gate's parameters.
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
+    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
+
+    // Corners hold g. The middle 2x2 block has c * w on its diagonal and
+    // -i * s * w (written as s*es - i*s*ec) off its diagonal.
+    return CreateGate<GateCirq<fp_type>, SwapPowGate>(
+        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
+                         0, 0, c * ec, c * es, s * es, -s * ec, 0, 0,
+                         0, 0, s * es, -s * ec, c * ec, c * es, 0, 0,
+                         0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift});
+  }
+
+  // Returns four terms, each a pair of 2x2 matrices. The first matrix of
+  // each pair is h times, in order: the identity, [[0, 1], [1, 0]],
+  // [[0, -i], [i, 0]] and diag(1, -1). The second matrix has the same shape
+  // with a parameter-dependent complex weight.
+  // Unlike Create, global_shift has no default here.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
+    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
+
+    return schmidt_decomp_type<fp_type>{
+      {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * ec, gs + c * es, 0, 0,
+                                  0, 0, gc + c * ec, gs + c * es}},
+      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * es, -s * ec,
+                                  s * es, -s * ec, 0, 0}},
+      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, -s * ec, -s * es,
+                                   s * ec, s * es, 0, 0}},
+      {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * ec, gs - c * es, 0, 0,
+                                   0, 0, -gc + c * ec, -gs + c * es}},
+    };
+  }
+};
+
+/**
+ * Rotates the |01⟩ vs |10⟩ subspace of two qubits around its Bloch X-axis.
+ * This is a generalization of the ISWAP gate.
+ *
+ * Notation used in the comments below:
+ *   g = gc + i*gs = exp(i pi exponent global_shift)
+ *   c, s          = cos / sin of (pi exponent / 2)
+ */
+template <typename fp_type>
+struct ISwapPowGate {
+  static constexpr GateKind kind = kISwapPowGate;
+  static constexpr char name[] = "ISwapPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  // File-level constant h_double narrowed to fp_type
+  // (presumably 0.5 -- confirm at its definition).
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  // Builds the 4x4 gate matrix ((real, imaginary) pairs, one row per line)
+  // and records {exponent, global_shift} as the gate's parameters.
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+
+    // Corners hold g. The middle 2x2 block has c * g on its diagonal and
+    // i * s * g (written as -s*gs + i*s*gc) off its diagonal.
+    return CreateGate<GateCirq<fp_type>, ISwapPowGate>(
+        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
+                         0, 0, c * gc, c * gs, -s * gs, s * gc, 0, 0,
+                         0, 0, -s * gs, s * gc, c * gc, c * gs, 0, 0,
+                         0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift});
+  }
+
+  // Returns four terms, each a pair of 2x2 matrices. The first matrix of
+  // each pair is h times, in order: the identity, [[0, 1], [1, 0]],
+  // [[0, -i], [i, 0]] and diag(1, -1). The second matrix has the same shape
+  // with a parameter-dependent complex weight.
+  // Unlike Create, global_shift has no default here.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+
+    return schmidt_decomp_type<fp_type>{
+      {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * gc, gs + c * gs, 0, 0,
+                                  0, 0, gc + c * gc, gs + c * gs}},
+      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, -s * gs, s * gc,
+                                  -s * gs, s * gc, 0, 0}},
+      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * gc, s * gs,
+                                   -s * gc, -s * gs, 0, 0}},
+      {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * gc, gs - c * gs, 0, 0,
+                                   0, 0, -gc + c * gc, -gs + c * gs}},
+    };
+  }
+};
+
+/**
+ * A generalization of the ISWAP gate with a fixed global phase of zero:
+ * the `(exponent = 2*phi/pi, global_shift = 0)` instance of ISwapPowGate.
+ * This is a function in Cirq.
+ *
+ * `phi` is an angle in radians and is recorded as the gate's only parameter.
+ */
+template <typename fp_type>
+struct riswap {
+  static constexpr GateKind kind = kriswap;
+  static constexpr char name[] = "riswap";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  // File-level constant h_double narrowed to fp_type
+  // (presumably 0.5 -- confirm at its definition).
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type phi) {
+    fp_type cos_phi = std::cos(phi);
+    fp_type sin_phi = std::sin(phi);
+
+    // 4x4 matrix of (real, imaginary) pairs, one row per line. The middle
+    // 2x2 block has cos(phi) on its diagonal and i * sin(phi) off it.
+    return CreateGate<GateCirq<fp_type>, riswap>(
+        time, {q0, q1},
+        {1, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, cos_phi, 0, 0, sin_phi, 0, 0,
+         0, 0, 0, sin_phi, cos_phi, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 1, 0},
+        {phi});
+  }
+
+  // Returns four terms, each a pair of 2x2 matrices; the first matrix of
+  // each pair is constant and the second carries the phi-dependent weight.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
+    fp_type cos_phi = std::cos(phi);
+    fp_type sin_phi = std::sin(phi);
+
+    return schmidt_decomp_type<fp_type>{
+      {{h, 0, 0, 0, 0, 0, h, 0},
+       {1 + cos_phi, 0, 0, 0, 0, 0, 1 + cos_phi, 0}},
+      {{0, 0, h, 0, h, 0, 0, 0},
+       {0, 0, 0, sin_phi, 0, sin_phi, 0, 0}},
+      {{0, 0, 0, -h, 0, h, 0, 0},
+       {0, 0, sin_phi, 0, -sin_phi, 0, 0, 0}},
+      {{h, 0, 0, 0, 0, 0, -h, 0},
+       {1 - cos_phi, 0, 0, 0, 0, 0, -1 + cos_phi, 0}},
+    };
+  }
+};
+
+/**
+ * The canonical SWAP gate, i.e. the `(exponent = 1, global_shift = 0)`
+ * instance of SwapPowGate.
+ */
+template <typename fp_type>
+struct SWAP {
+  static constexpr GateKind kind = kSWAP;
+  static constexpr char name[] = "SWAP";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  // File-level constant is2_double narrowed to fp_type
+  // (presumably 1/sqrt(2) -- confirm at its definition).
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    // 4x4 matrix, one row per line; every entry is a (real, imaginary) pair.
+    return CreateGate<GateCirq<fp_type>, SWAP>(
+        time, {q0, q1},
+        {1, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 1, 0, 0, 0,
+         0, 0, 1, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  // Returns four terms; each pairs one 2x2 matrix with itself. In order the
+  // matrices are is2 times: the identity, [[0, 1], [1, 0]],
+  // [[0, -i], [i, 0]] and diag(1, -1).
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{is2, 0, 0, 0, 0, 0, is2, 0},
+       {is2, 0, 0, 0, 0, 0, is2, 0}},
+      {{0, 0, is2, 0, is2, 0, 0, 0},
+       {0, 0, is2, 0, is2, 0, 0, 0}},
+      {{0, 0, 0, -is2, 0, is2, 0, 0},
+       {0, 0, 0, -is2, 0, is2, 0, 0}},
+      {{is2, 0, 0, 0, 0, 0, -is2, 0},
+       {is2, 0, 0, 0, 0, 0, -is2, 0}},
+    };
+  }
+};
+
+/**
+ * The canonical ISWAP gate, i.e. the `(exponent = 1, global_shift = 0)`
+ * instance of ISwapPowGate.
+ */
+template <typename fp_type>
+struct ISWAP {
+  static constexpr GateKind kind = kISWAP;
+  static constexpr char name[] = "ISWAP";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  // File-level constants narrowed to fp_type (presumably 0.5 and 1/sqrt(2)
+  // respectively -- confirm at their definitions).
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    // 4x4 matrix, one row per line; every entry is a (real, imaginary) pair.
+    // The two off-diagonal entries of the middle block are purely imaginary.
+    return CreateGate<GateCirq<fp_type>, ISWAP>(
+        time, {q0, q1},
+        {1, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 1, 0, 0,
+         0, 0, 0, 1, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  // Returns four terms; each pairs one 2x2 matrix with itself.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{is2, 0, 0, 0, 0, 0, is2, 0},
+       {is2, 0, 0, 0, 0, 0, is2, 0}},
+      {{0, 0, h, h, h, h, 0, 0},
+       {0, 0, h, h, h, h, 0, 0}},
+      {{0, 0, h, -h, -h, h, 0, 0},
+       {0, 0, h, -h, -h, h, 0, 0}},
+      {{is2, 0, 0, 0, 0, 0, -is2, 0},
+       {is2, 0, 0, 0, 0, 0, -is2, 0}},
+    };
+  }
+};
+
+// Gates from cirq/ops/phased_iswap_gate.py:
+
+/**
+ * An ISwapPowGate conjugated by ZPowGate%s.
+ * Equivalent to the composition `(Z^-p ⊗ Z^p) ISWAP^t (Z^p ⊗ Z^-p)`.
+ *
+ * Notation used in the comments below:
+ *   f = fc + i*fs = exp(2 i pi phase_exponent)
+ *   c, s          = cos / sin of (pi exponent / 2)
+ */
+template <typename fp_type>
+struct PhasedISwapPowGate {
+  static constexpr GateKind kind = kPhasedISwapPowGate;
+  static constexpr char name[] = "PhasedISwapPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  // File-level constant h_double narrowed to fp_type
+  // (presumably 0.5 -- confirm at its definition).
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  // Builds the 4x4 gate matrix ((real, imaginary) pairs, one row per line)
+  // and records {phase_exponent, exponent} as the gate's parameters.
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type phase_exponent = 0.25,
+                                  fp_type exponent = 1.0) {
+    fp_type fc = std::cos(2 * pi * phase_exponent);
+    fp_type fs = std::sin(2 * pi * phase_exponent);
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    // The middle 2x2 block has c on its diagonal; its off-diagonal entries
+    // are i * s * conj(f) (second row) and i * s * f (third row).
+    return CreateGate<GateCirq<fp_type>, PhasedISwapPowGate>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, c, 0, s * fs, s * fc, 0, 0,
+                         0, 0, -s * fs, s * fc, c, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0}, {phase_exponent, exponent});
+  }
+
+  // Returns four terms, each a pair of 2x2 matrices. The first matrix of
+  // each pair is h times, in order: the identity, [[0, 1], [1, 0]],
+  // [[0, -i], [i, 0]] and diag(1, -1). The second matrix carries the
+  // parameter-dependent weights. Unlike Create, neither argument has a
+  // default here.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type phase_exponent, fp_type exponent) {
+    fp_type fc = std::cos(2 * pi * phase_exponent);
+    fp_type fs = std::sin(2 * pi * phase_exponent);
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+
+    return schmidt_decomp_type<fp_type>{
+      {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}},
+      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * fs, s * fc, -s * fs, s * fc, 0, 0}},
+      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * fc, -s * fs,
+                                   -s * fc, -s * fs, 0, 0}},
+      {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}},
+    };
+  }
+};
+
+/**
+ * The "Givens rotation" from numerical linear algebra: the
+ * `(phase_exponent = 0.25, exponent = 2*phi/pi)` instance of
+ * PhasedISwapPowGate.
+ * This is a function in Cirq.
+ *
+ * `phi` is an angle in radians and is recorded as the gate's only parameter.
+ */
+template <typename fp_type>
+struct givens {
+  static constexpr GateKind kind = kgivens;
+  static constexpr char name[] = "givens";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  // File-level constant h_double narrowed to fp_type
+  // (presumably 0.5 -- confirm at its definition).
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type phi) {
+    fp_type cos_phi = std::cos(phi);
+    fp_type sin_phi = std::sin(phi);
+
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    // 4x4 matrix of (real, imaginary) pairs, one row per line; the middle
+    // 2x2 block is the real rotation [[cos, sin], [-sin, cos]].
+    return CreateGate<GateCirq<fp_type>, givens>(
+        time, {q0, q1},
+        {1, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, cos_phi, 0, sin_phi, 0, 0, 0,
+         0, 0, -sin_phi, 0, cos_phi, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 1, 0},
+        {phi});
+  }
+
+  // Returns four terms, each a pair of 2x2 matrices; the first matrix of
+  // each pair is constant and the second carries the phi-dependent weight.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
+    fp_type cos_phi = std::cos(phi);
+    fp_type sin_phi = std::sin(phi);
+
+    return schmidt_decomp_type<fp_type>{
+      {{h, 0, 0, 0, 0, 0, h, 0},
+       {1 + cos_phi, 0, 0, 0, 0, 0, 1 + cos_phi, 0}},
+      {{0, 0, h, 0, h, 0, 0, 0},
+       {0, 0, sin_phi, 0, -sin_phi, 0, 0, 0}},
+      {{0, 0, 0, -h, 0, h, 0, 0},
+       {0, 0, 0, -sin_phi, 0, -sin_phi, 0, 0}},
+      {{h, 0, 0, 0, 0, 0, -h, 0},
+       {1 - cos_phi, 0, 0, 0, 0, 0, -1 + cos_phi, 0}},
+    };
+  }
+};
+
+// Gates from cirq/ops/fsim_gate.py:
+
+/**
+ * The fermionic simulation gate family. Contains all two-qubit interactions
+ * that preserve excitations, up to single-qubit rotations and global phase.
+ *
+ * `theta` and `phi` are angles in radians. The gate matrix is the identity
+ * on the first basis state, has `cos(theta)` on the diagonal and
+ * `-i sin(theta)` off the diagonal of the middle 2x2 block, and
+ * `exp(-i phi)` in the last diagonal entry.
+ */
+template <typename fp_type>
+struct FSimGate {
+  static constexpr GateKind kind = kFSimGate;
+  static constexpr char name[] = "FSimGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  // File-level constant is2_double narrowed to fp_type
+  // (presumably 1/sqrt(2) -- confirm at its definition).
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  // Builds the 4x4 gate matrix ((real, imaginary) pairs, one row per line).
+  // A negative phi is shifted up by 2*pi first, and the shifted value is
+  // what gets recorded in the gate's parameters {theta, phi}.
+  static GateCirq<fp_type> Create(
+      unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) {
+    if (phi < 0) {
+      phi += 2 * 3.141592653589793;
+    }
+
+    fp_type ct = std::cos(theta);
+    fp_type st = std::sin(theta);
+    fp_type cp = std::cos(phi);
+    fp_type sp = std::sin(phi);
+
+    return CreateGate<GateCirq<fp_type>, FSimGate>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, ct, 0, 0, -st, 0, 0,
+                         0, 0, 0, -st, ct, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi});
+  }
+
+  // Returns four terms; each pairs one 2x2 matrix with itself. The first and
+  // last terms are diagonal, the two middle terms are anti-diagonal and
+  // together reproduce the -i sin(theta) exchange part of the gate.
+  // Half and quarter angles of phi are used, so the result depends on which
+  // 2*pi branch phi is given in.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type theta, fp_type phi) {
+    fp_type ct = std::cos(theta);
+    fp_type st = std::sin(theta);
+
+    fp_type cp2 = std::cos(0.5 * phi);
+    fp_type sp2 = std::sin(0.5 * phi);
+    fp_type cp4 = std::cos(0.25 * phi);
+    fp_type sp4 = std::sin(0.25 * phi);
+
+    fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct));
+    fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct));
+
+    fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct);
+    fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct);
+
+    fp_type c0 = is2 * a0 * std::cos(p0);
+    fp_type s0 = is2 * a0 * std::sin(p0);
+
+    fp_type c1 = is2 * a1 * std::cos(p1);
+    fp_type s1 = is2 * a1 * std::sin(p1);
+
+    // The anti-diagonal factors are square roots of -i * sin(theta) / 2 (for
+    // the X-like term) and +i * sin(theta) / 2 (for the Y-like term). Taking
+    // sqrt(sin(theta)) directly yields NaN when sin(theta) < 0, so the root
+    // is taken of |sin(theta)| and the sign is moved into the imaginary
+    // parts: sqrt(-+i * |x|) = sqrt(|x| / 2) * (1 -+ i), while
+    // sqrt(-+i * -|x|) = sqrt(|x| / 2) * (1 +- i).
+    fp_type ast = st < 0 ? -st : st;
+    fp_type st2 = 0.5 * std::sqrt(ast);
+    fp_type sti = st < 0 ? -st2 : st2;
+
+    fp_type a = cp4 * c0 - sp4 * s0;
+    fp_type b = cp4 * s0 + sp4 * c0;
+    fp_type c = cp4 * c0 + sp4 * s0;
+    fp_type d = cp4 * s0 - sp4 * c0;
+
+    fp_type e = cp4 * c1 - sp4 * s1;
+    fp_type f = cp4 * s1 + sp4 * c1;
+    fp_type g = -(cp4 * c1 + sp4 * s1);
+    fp_type h = -(cp4 * s1 - sp4 * c1);
+
+    return schmidt_decomp_type<fp_type>{
+      {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}},
+      {{0, 0, st2, -sti, st2, -sti, 0, 0}, {0, 0, st2, -sti, st2, -sti, 0, 0}},
+      {{0, 0, -st2, -sti, st2, sti, 0, 0}, {0, 0, -st2, -sti, st2, sti, 0, 0}},
+      {{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}},
+    };
+  }
+};
+
+// Gates from cirq/ops/two_qubit_diagonal_gate.py:
+
+/**
+ * A two-qubit diagonal gate.
+ *
+ * Each diagonal entry is `exp(i * angle)` for the matching element of
+ * `angles` (radians). Missing angles are treated as zero and only the first
+ * four are used. The angles are not recorded as gate parameters.
+ */
+template <typename fp_type>
+struct TwoQubitDiagonalGate {
+  static constexpr GateKind kind = kTwoQubitDiagonalGate;
+  static constexpr char name[] = "TwoQubitDiagonalGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1,
+                                  const std::vector<fp_type>& angles) {
+    // Start from a zero angle (cos = 1, sin = 0) in every slot, then
+    // overwrite the slots for which an angle was supplied.
+    std::vector<fp_type> re(4, fp_type(1));
+    std::vector<fp_type> im(4, fp_type(0));
+
+    const std::size_t supplied = angles.size() < 4 ? angles.size() : 4;
+    for (std::size_t k = 0; k < supplied; ++k) {
+      re[k] = std::cos(angles[k]);
+      im[k] = std::sin(angles[k]);
+    }
+
+    // Matrix is in this form because the simulator uses inverse qubit order
+    // (angles 1 and 2 trade places on the diagonal).
+    return CreateGate<GateCirq<fp_type>, TwoQubitDiagonalGate>(
+        time, {q0, q1},
+        {re[0], im[0], 0, 0, 0, 0, 0, 0,
+         0, 0, re[2], im[2], 0, 0, 0, 0,
+         0, 0, 0, 0, re[1], im[1], 0, 0,
+         0, 0, 0, 0, 0, 0, re[3], im[3]});
+  }
+};
+
+// Gates from cirq/ops/three_qubit_gates.py:
+
+/**
+ * A three-qubit diagonal gate.
+ *
+ * Each diagonal entry is `exp(i * angle)` for the matching element of
+ * `angles` (radians). Missing angles are treated as zero and only the first
+ * eight are used. The angles are not recorded as gate parameters.
+ */
+template <typename fp_type>
+struct ThreeQubitDiagonalGate {
+  static constexpr GateKind kind = kThreeQubitDiagonalGate;
+  static constexpr char name[] = "ThreeQubitDiagonalGate";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2,
+                                  const std::vector<fp_type>& angles) {
+    // Start from a zero angle (cos = 1, sin = 0) in every slot, then
+    // overwrite the slots for which an angle was supplied.
+    std::vector<fp_type> re(8, fp_type(1));
+    std::vector<fp_type> im(8, fp_type(0));
+
+    const std::size_t supplied = angles.size() < 8 ? angles.size() : 8;
+    for (std::size_t k = 0; k < supplied; ++k) {
+      re[k] = std::cos(angles[k]);
+      im[k] = std::sin(angles[k]);
+    }
+
+    // Matrix is in this form because the simulator uses inverse qubit order
+    // (diagonal slots take angles 0, 4, 2, 6, 1, 5, 3, 7 in turn).
+    return CreateGate<GateCirq<fp_type>, ThreeQubitDiagonalGate>(
+        time, {q0, q1, q2},
+        {re[0], im[0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, re[4], im[4], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, re[2], im[2], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, re[6], im[6], 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, re[1], im[1], 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, re[5], im[5], 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, re[3], im[3], 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, re[7], im[7]});
+  }
+};
+
+/**
+ * Generalizes the CCZ gate. With t = pi * exponent, |111⟩ is multiplied by
+ * exp(i * t * (1 + global_shift)) and the rest by exp(i * t * global_shift).
+ */
+template <typename fp_type>
+struct CCZPowGate {
+  static constexpr GateKind kind = kCCZPowGate;
+  static constexpr char name[] = "CCZPowGate";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type ec = std::cos(pi * exponent * (1 + global_shift));
+    fp_type es = std::sin(pi * exponent * (1 + global_shift));
+
+    return CreateGate<GateCirq<fp_type>, CCZPowGate>(
+        time, {q0, q1, q2}, {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ec, es},
+                            {exponent, global_shift});
+  }
+};
+
+/**
+ * A gate that applies a doubly-controlled power of an X gate.
+ * Generalizes CCX (CCNOT); exponent = 1, global_shift = 0 gives the CCX matrix.
+ */
+template <typename fp_type>
+struct CCXPowGate {
+  static constexpr GateKind kind = kCCXPowGate;
+  static constexpr char name[] = "CCXPowGate";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
+    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
+
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    return CreateGate<GateCirq<fp_type>, CCXPowGate>(
+        time, {q0, q1, q2},
+        {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, c * ec, c * es, 0, 0, 0, 0, 0, 0, s * es, -s * ec,
+         0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0,
+         0, 0, 0, 0, 0, 0, s * es, -s * ec, 0, 0, 0, 0, 0, 0, c * ec, c * es},
+        {exponent, global_shift});
+  }
+};
+
+/**
+ * A controlled swap gate (the Fredkin gate): a real 8x8 permutation matrix.
+ */
+template <typename fp_type>
+struct CSwapGate {
+  static constexpr GateKind kind = kCSwapGate;
+  static constexpr char name[] = "CSwapGate";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2) {
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    return CreateGate<GateCirq<fp_type>, CSwapGate>(
+        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0});
+  }
+};
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of CCZPowGate.
+ * The canonical doubly-controlled Z gate: diag(1, 1, 1, 1, 1, 1, 1, -1).
+ */
+template <typename fp_type>
+struct CCZ {
+  static constexpr GateKind kind = kCCZ;
+  static constexpr char name[] = "CCZ";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2) {
+    return CreateGate<GateCirq<fp_type>, CCZ>(
+        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0});
+  }
+};
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of CCXPowGate.
+ * The doubly-controlled X (TOFFOLI) gate; the matrix swaps rows 3 and 7.
+ */
+template <typename fp_type>
+struct CCX {
+  static constexpr GateKind kind = kCCX;
+  static constexpr char name[] = "CCX";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2) {
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    return CreateGate<GateCirq<fp_type>, CCX>(
+        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+  }
+};
+
+template <typename fp_type>
+using CCNotPowGate = CCXPowGate<fp_type>;  // Alternative name for CCXPowGate.
+
+template <typename fp_type>
+using TOFFOLI = CCX<fp_type>;  // Alternative name for CCX.
+
+template <typename fp_type>
+using CCNOT = CCX<fp_type>;  // Alternative name for CCX.
+
+template <typename fp_type>
+using CSWAP = CSwapGate<fp_type>;  // Alternative name for CSwapGate.
+
+template <typename fp_type>
+using FREDKIN = CSwapGate<fp_type>;  // Alternative name for CSwapGate.
+
+// Gates from cirq/ops/matrix_gates.py:
+
+/**
+ * A one-qubit gate defined entirely by its matrix; Create copies `m`.
+ */
+template <typename fp_type>
+struct MatrixGate1 {
+  static constexpr GateKind kind = kMatrixGate1;
+  static constexpr char name[] = "MatrixGate1";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  const Matrix<fp_type>& m) {
+    auto m2 = m;
+    return
+        CreateGate<GateCirq<fp_type>, MatrixGate1>(time, {q0}, std::move(m2));
+  }
+};
+
+/**
+ * A two-qubit gate defined by its matrix; Create passes qubits as {q1, q0}.
+ */
+template <typename fp_type>
+struct MatrixGate2 {
+  static constexpr GateKind kind = kMatrixGate2;
+  static constexpr char name[] = "MatrixGate2";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  template <typename M = Matrix<fp_type>>
+  static GateCirq<fp_type> Create(
+      unsigned time, unsigned q0, unsigned q1, M&& m) {
+    return CreateGate<GateCirq<fp_type>, MatrixGate2>(time, {q1, q0},
+                                                      std::forward<M>(m));
+  }
+};
+
+/**
+ * A multi-qubit gate defined by its matrix; Create reverses the qubit order.
+ */
+template <typename fp_type>
+struct MatrixGate {
+  static constexpr GateKind kind = kMatrixGate;
+  static constexpr char name[] = "MatrixGate";
+  static constexpr bool symmetric = false;
+
+  template <typename M = Matrix<fp_type>>
+  static GateCirq<fp_type> Create(unsigned time,
+                                  std::vector<unsigned> qubits, M&& m) {
+    std::reverse(qubits.begin(), qubits.end());
+    return CreateGate<GateCirq<fp_type>, MatrixGate>(time, std::move(qubits),
+                                                     std::forward<M>(m));
+  }
+};
+
+}  // namespace Cirq
+
+template <typename fp_type>
+inline schmidt_decomp_type<fp_type> GetSchmidtDecomp(
+    Cirq::GateKind kind, const std::vector<fp_type>& params) {
+  switch (kind) {
+  case Cirq::kI2:
+    return Cirq::I2<fp_type>::SchmidtDecomp();
+  case Cirq::kCZPowGate:
+    return Cirq::CZPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kCXPowGate:
+    return Cirq::CXPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kCZ:
+    return Cirq::CZ<fp_type>::SchmidtDecomp();
+  case Cirq::kCX:
+    return Cirq::CX<fp_type>::SchmidtDecomp();
+  case Cirq::kXXPowGate:
+    return Cirq::XXPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kYYPowGate:
+    return Cirq::YYPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kZZPowGate:
+    return Cirq::ZZPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kXX:
+    return Cirq::XX<fp_type>::SchmidtDecomp();
+  case Cirq::kYY:
+    return Cirq::YY<fp_type>::SchmidtDecomp();
+  case Cirq::kZZ:
+    return Cirq::ZZ<fp_type>::SchmidtDecomp();
+  case Cirq::kSwapPowGate:
+    return Cirq::SwapPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kISwapPowGate:
+    return Cirq::ISwapPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kriswap:
+    return Cirq::riswap<fp_type>::SchmidtDecomp(params[0]);
+  case Cirq::kSWAP:
+    return Cirq::SWAP<fp_type>::SchmidtDecomp();
+  case Cirq::kISWAP:
+    return Cirq::ISWAP<fp_type>::SchmidtDecomp();
+  case Cirq::kPhasedISwapPowGate:
+    return Cirq::PhasedISwapPowGate<fp_type>::SchmidtDecomp(
+        params[0], params[1]);
+  case Cirq::kgivens:
+    return Cirq::givens<fp_type>::SchmidtDecomp(params[0]);
+  case Cirq::kFSimGate:
+    return Cirq::FSimGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  default:
+    // Single-qubit gates or gates with unimplemented Schmidt decomposition.
+    return schmidt_decomp_type<fp_type>{};
+  }
+}
+
+}  // namespace qsim
+
+#endif  // GATES_CIRQ_H_
diff --git a/qsim/gates_qsim.h b/qsim/gates_qsim.h
new file mode 100644
index 0000000..366c4f1
--- /dev/null
+++ b/qsim/gates_qsim.h
@@ -0,0 +1,661 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GATES_QSIM_H_
+#define GATES_QSIM_H_
+
+#include <array>
+#include <cmath>
+#include <vector>
+
+#include "gate.h"
+
+namespace qsim {
+
+// Gate set implemented in qsim contains the following gates.
+enum GateKind {
+  kGateId1 = 0, // one-qubit Id
+  kGateHd,      // Hadamard
+  kGateT,       // T
+  kGateX,       // X
+  kGateY,       // Y
+  kGateZ,       // Z
+  kGateX2,      // sqrt(X)
+  kGateY2,      // sqrt(Y)
+  kGateRX,      // X-rotation
+  kGateRY,      // Y-rotation
+  kGateRZ,      // Z-rotation
+  kGateRXY,     // XY-rotation (rotation around arbitrary axis in the XY plane)
+  kGateHZ2,     // pi / 2 rotation around the X + Y axis
+  kGateS,       // S
+  kGateId2,     // two-qubit Id
+  kGateCZ,      // CZ
+  kGateCNot,    // CNOT (CX)
+  kGateSwap,    // swap
+  kGateIS,      // iSwap
+  kGateFS,      // fSim
+  kGateCP,      // control phase
+  kGateMatrix1, // one-qubit matrix gate
+  kGateMatrix2, // two-qubit matrix gate
+  kGateGPh,     // global phase gate
+  kDecomp = gate::kDecomp,
+  kMeasurement = gate::kMeasurement,
+};
+
+// Specialization of Gate (defined in gate.h) for the qsim gate set.
+template <typename fp_type>
+using GateQSim = Gate<fp_type, GateKind>;
+
+constexpr double h_double = 0.5;                   // 1 / 2
+constexpr double is2_double = 0.7071067811865475;  // 1 / sqrt(2)
+
+// Zero-qubit gates:
+
+/**
+ * The global phase gate exp(i * phi); Create passes an empty qubit list.
+ */
+template <typename fp_type>
+struct GateGPh {
+  static constexpr GateKind kind = kGateGPh;
+  static constexpr char name[] = "p";
+  static constexpr unsigned num_qubits = 1;  // NOTE(review): Create uses {}.
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, fp_type phi) {
+    return Create(time, std::cos(phi), std::sin(phi));
+  }
+
+  static GateQSim<fp_type> Create(unsigned time, fp_type cp, fp_type sp) {
+    return CreateGate<GateQSim<fp_type>, GateGPh>(
+        time, {}, {cp, sp}, {cp, sp});
+  }
+};
+
+// One-qubit gates:
+
+/**
+ * The one-qubit identity gate; matrix values are interleaved (re, im) pairs.
+ */
+template <typename fp_type>
+struct GateId1 {
+  static constexpr GateKind kind = kGateId1;
+  static constexpr char name[] = "id1";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateId1>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0});
+  }
+};
+
+/**
+ * The Hadamard gate: (1 / sqrt(2)) * [[1, 1], [1, -1]].
+ */
+template <typename fp_type>
+struct GateHd {
+  static constexpr GateKind kind = kGateHd;
+  static constexpr char name[] = "h";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateHd>(
+        time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0});
+  }
+};
+
+/**
+ * The T gate, equivalent to `Z ^ 0.25`: diag(1, (1 + i) / sqrt(2)).
+ */
+template <typename fp_type>
+struct GateT {
+  static constexpr GateKind kind = kGateT;
+  static constexpr char name[] = "t";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateT>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2});
+  }
+};
+
+/**
+ * The Pauli X (or "NOT") gate: [[0, 1], [1, 0]].
+ */
+template <typename fp_type>
+struct GateX {
+  static constexpr GateKind kind = kGateX;
+  static constexpr char name[] = "x";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateX>(
+        time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0});
+  }
+};
+
+/**
+ * The Pauli Y gate: [[0, -i], [i, 0]].
+ */
+template <typename fp_type>
+struct GateY {
+  static constexpr GateKind kind = kGateY;
+  static constexpr char name[] = "y";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateY>(
+        time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0});
+  }
+};
+
+/**
+ * The Pauli Z gate: diag(1, -1).
+ */
+template <typename fp_type>
+struct GateZ {
+  static constexpr GateKind kind = kGateZ;
+  static constexpr char name[] = "z";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateZ>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0});
+  }
+};
+
+/**
+ * The "square root of X" gate: 0.5 * [[1 + i, 1 - i], [1 - i, 1 + i]].
+ */
+template <typename fp_type>
+struct GateX2 {
+  static constexpr GateKind kind = kGateX2;
+  static constexpr char name[] = "x_1_2";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateX2>(
+        time, {q0}, {h, h, h, -h, h, -h, h, h});
+  }
+};
+
+/**
+ * The "square root of Y" gate: 0.5 * [[1 + i, -1 - i], [1 + i, 1 + i]].
+ */
+template <typename fp_type>
+struct GateY2 {
+  static constexpr GateKind kind = kGateY2;
+  static constexpr char name[] = "y_1_2";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateY2>(
+        time, {q0}, {h, h, -h, -h, h, h, h, h});
+  }
+};
+
+/**
+ * Rotation by the angle `phi` about the X axis of the Bloch sphere.
+ * Generalizes the X gate.
+ */
+template <typename fp_type>
+struct GateRX {
+  static constexpr GateKind kind = kGateRX;
+  static constexpr char name[] = "rx";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    const fp_type half = -0.5 * phi;
+    const fp_type ch = std::cos(half);
+    const fp_type sh = std::sin(half);
+
+    return CreateGate<GateQSim<fp_type>, GateRX>(
+        time, {q0}, {ch, 0, 0, sh, 0, sh, ch, 0}, {phi});
+  }
+};
+
+/**
+ * Rotation by the angle `phi` about the Y axis of the Bloch sphere.
+ * Generalizes the Y gate.
+ */
+template <typename fp_type>
+struct GateRY {
+  static constexpr GateKind kind = kGateRY;
+  static constexpr char name[] = "ry";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    const fp_type half = -0.5 * phi;
+    const fp_type ch = std::cos(half);
+    const fp_type sh = std::sin(half);
+
+    return CreateGate<GateQSim<fp_type>, GateRY>(
+        time, {q0}, {ch, 0, sh, 0, -sh, 0, ch, 0}, {phi});
+  }
+};
+
+/**
+ * Rotation by the angle `phi` about the Z axis of the Bloch sphere.
+ * Generalizes the Z gate.
+ */
+template <typename fp_type>
+struct GateRZ {
+  static constexpr GateKind kind = kGateRZ;
+  static constexpr char name[] = "rz";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    const fp_type half = -0.5 * phi;
+    const fp_type ch = std::cos(half);
+    const fp_type sh = std::sin(half);
+
+    return CreateGate<GateQSim<fp_type>, GateRZ>(
+        time, {q0}, {ch, sh, 0, 0, 0, 0, ch, -sh}, {phi});
+  }
+};
+
+/**
+ * Rotation by `phi` about an axis in the XY-plane selected by `theta`.
+ */
+template <typename fp_type>
+struct GateRXY {
+  static constexpr GateKind kind = kGateRXY;
+  static constexpr char name[] = "rxy";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(
+      unsigned time, unsigned q0, fp_type theta, fp_type phi) {
+    const fp_type half = -0.5 * phi;
+    const fp_type ch = std::cos(half);
+    const fp_type sh = std::sin(half);
+    const fp_type axis_c = std::cos(theta) * sh;
+    const fp_type axis_s = std::sin(theta) * sh;
+
+    return CreateGate<GateQSim<fp_type>, GateRXY>(
+        time, {q0}, {ch, 0, axis_s, axis_c, -axis_s, axis_c, ch, 0}, {theta, phi});
+  }
+};
+
+/**
+ * A pi / 2 rotation around the X + Y axis; uses h = 1/2 and is2 = 1/sqrt(2).
+ */
+template <typename fp_type>
+struct GateHZ2 {
+  static constexpr GateKind kind = kGateHZ2;
+  static constexpr char name[] = "hz_1_2";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateHZ2>(
+        time, {q0}, {h, h, 0, -is2, is2, 0, h, h});
+  }
+};
+
+/**
+ * The S gate, equivalent to "square root of Z": diag(1, i).
+ */
+template <typename fp_type>
+struct GateS {
+  static constexpr GateKind kind = kGateS;
+  static constexpr char name[] = "s";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateS>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1});
+  }
+};
+
+/**
+ * A one-qubit gate defined entirely by its matrix; Create copies `m`.
+ */
+template <typename fp_type>
+struct GateMatrix1 {
+  static constexpr GateKind kind = kGateMatrix1;
+  static constexpr char name[] = "mat1";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0,
+                                  const Matrix<fp_type>& m) {
+    auto m2 = m;
+    return
+        CreateGate<GateQSim<fp_type>, GateMatrix1>(time, {q0}, std::move(m2));
+  }
+};
+
+// Two-qubit gates:
+
+/**
+ * The two-qubit identity gate; its Schmidt decomposition is the term I x I.
+ */
+template <typename fp_type>
+struct GateId2 {
+  static constexpr GateKind kind = kGateId2;
+  static constexpr char name[] = "id2";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateId2>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+    };
+  }
+};
+
+/**
+ * The controlled-Z (CZ) gate; Schmidt terms are |0><0| x I and |1><1| x Z.
+ */
+template <typename fp_type>
+struct GateCZ {
+  static constexpr GateKind kind = kGateCZ;
+  static constexpr char name[] = "cz";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateCZ>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, -1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}},
+    };
+  }
+};
+
+/**
+ * The controlled-X (CX or CNOT) gate; Schmidt terms: |0><0| x I, |1><1| x X.
+ */
+template <typename fp_type>
+struct GateCNot {
+  static constexpr GateKind kind = kGateCNot;
+  static constexpr char name[] = "cnot";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    return CreateGate<GateQSim<fp_type>, GateCNot>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},
+    };
+  }
+};
+
+/**
+ * The SWAP gate. Exchanges two qubits; equals (II + XX + YY + ZZ) / 2.
+ */
+template <typename fp_type>
+struct GateSwap {
+  static constexpr GateKind kind = kGateSwap;
+  static constexpr char name[] = "sw";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateSwap>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
+      {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}},
+      {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}},
+      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
+    };
+  }
+};
+
+/**
+ * The ISWAP gate: exchanges |01> and |10>, multiplying each by i.
+ */
+template <typename fp_type>
+struct GateIS {
+  static constexpr GateKind kind = kGateIS;
+  static constexpr char name[] = "is";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateIS>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 1, 0, 0,
+                         0, 0, 0, 1, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
+      {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}},
+      {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}},
+      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
+    };
+  }
+};
+
+/**
+ * The fermionic simulation (FSim) gate family. Contains all two-qubit
+ * interactions that preserve excitations, up to single-qubit rotations and
+ * global phase. SchmidtDecomp stays finite when sin(theta) is negative.
+ */
+template <typename fp_type>
+struct GateFS {
+  static constexpr GateKind kind = kGateFS;
+  static constexpr char name[] = "fs";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(
+      unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) {
+    if (phi < 0) {
+      phi += 2 * 3.141592653589793;
+    }
+
+    fp_type ct = std::cos(theta);
+    fp_type st = std::sin(theta);
+    fp_type cp = std::cos(phi);
+    fp_type sp = std::sin(phi);
+
+    return CreateGate<GateQSim<fp_type>, GateFS>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, ct, 0, 0, -st, 0, 0,
+                         0, 0, 0, -st, ct, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type theta, fp_type phi) {
+    fp_type ct = std::cos(theta);
+    fp_type st = std::sin(theta);
+
+    fp_type cp2 = std::cos(0.5 * phi);
+    fp_type sp2 = std::sin(0.5 * phi);
+    fp_type cp4 = std::cos(0.25 * phi);
+    fp_type sp4 = std::sin(0.25 * phi);
+
+    fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct));
+    fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct));
+
+    fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct);
+    fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct);
+
+    fp_type c0 = is2 * a0 * std::cos(p0);
+    fp_type s0 = is2 * a0 * std::sin(p0);
+    fp_type c1 = is2 * a1 * std::cos(p1);
+    fp_type s1 = is2 * a1 * std::sin(p1);
+
+    // st can be negative: use sqrt(|st|); the second factor carries sign(st).
+    fp_type st2 = 0.5 * std::sqrt(std::abs(st));
+    fp_type sg2 = st < 0 ? -st2 : st2;
+
+    fp_type a = cp4 * c0 - sp4 * s0;
+    fp_type b = cp4 * s0 + sp4 * c0;
+    fp_type c = cp4 * c0 + sp4 * s0;
+    fp_type d = cp4 * s0 - sp4 * c0;
+    fp_type e = cp4 * c1 - sp4 * s1;
+    fp_type f = cp4 * s1 + sp4 * c1;
+    fp_type g = -(cp4 * c1 + sp4 * s1);
+    fp_type h = -(cp4 * s1 - sp4 * c1);
+
+    return schmidt_decomp_type<fp_type>{
+      {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}},
+      {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, sg2, -sg2, sg2, -sg2, 0, 0}},
+      {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -sg2, -sg2, sg2, sg2, 0, 0}},
+      {{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}},
+    };
+  }
+};
+
+/**
+ * Controlled phase gate: multiplies |11> by exp(-i * phi). Generalizes GateCZ.
+ */
+template <typename fp_type>
+struct GateCP {
+  static constexpr GateKind kind = kGateCP;
+  static constexpr char name[] = "cp";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(
+      unsigned time, unsigned q0, unsigned q1, fp_type phi) {
+    const fp_type re = std::cos(phi);
+    const fp_type im = std::sin(phi);
+
+    return CreateGate<GateQSim<fp_type>, GateCP>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, re, -im}, {phi});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
+    const fp_type re = std::cos(phi);
+    const fp_type im = std::sin(phi);
+
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, re, -im}},
+    };
+  }
+};
+
+/**
+ * A two-qubit gate defined by its matrix; Create passes qubits as {q1, q0}.
+ */
+template <typename fp_type>
+struct GateMatrix2 {
+  static constexpr GateKind kind = kGateMatrix2;
+  static constexpr char name[] = "mat2";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  template <typename M = Matrix<fp_type>>
+  static GateQSim<fp_type> Create(
+      unsigned time, unsigned q0, unsigned q1, M&& m) {
+    return CreateGate<GateQSim<fp_type>, GateMatrix2>(time, {q1, q0},
+                                                      std::forward<M>(m));
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
+    // Not implemented: `phi` is ignored and the result is empty.
+    return schmidt_decomp_type<fp_type>{};
+  }
+};
+
+template <typename fp_type>
+inline schmidt_decomp_type<fp_type> GetSchmidtDecomp(
+    GateKind kind, const std::vector<fp_type>& params) {
+  switch (kind) {
+  case kGateId2:
+    return GateId2<fp_type>::SchmidtDecomp();
+  case kGateCZ:
+    return GateCZ<fp_type>::SchmidtDecomp();
+  case kGateCNot:
+    return GateCNot<fp_type>::SchmidtDecomp();
+  case kGateSwap:
+    return GateSwap<fp_type>::SchmidtDecomp();
+  case kGateIS:
+    return GateIS<fp_type>::SchmidtDecomp();
+  case kGateFS:
+    return GateFS<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case kGateCP:
+    return GateCP<fp_type>::SchmidtDecomp(params[0]);
+  default:
+    // Any other kind (e.g. single-qubit gates): empty Schmidt decomposition.
+    return schmidt_decomp_type<fp_type>{};
+  }
+}
+
+}  // namespace qsim
+
+#endif  // GATES_QSIM_H_
diff --git a/qsim/hybrid.h b/qsim/hybrid.h
new file mode 100644
index 0000000..44fad5b
--- /dev/null
+++ b/qsim/hybrid.h
@@ -0,0 +1,612 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HYBRID_H_
+#define HYBRID_H_
+
+#include <algorithm>
+#include <array>
+#include <complex>
+#include <vector>
+
+#include "gate.h"
+#include "gate_appl.h"
+
+namespace qsim {
+
+/**
+ * Hybrid Feynman-Schrodinger simulator.
+ */
+template <typename IO, typename GateT,
+          template <typename, typename> class FuserT, typename For>
+struct HybridSimulator final {
+ public:
+  using Gate = GateT;
+  using GateKind = typename Gate::GateKind;
+  using fp_type = typename Gate::fp_type;
+
+ private:
+  // Note that one can use "struct GateHybrid : public Gate {" in C++17.
+  struct GateHybrid {
+    using GateKind = HybridSimulator::GateKind;
+    using fp_type = HybridSimulator::fp_type;
+
+    GateKind kind;
+    unsigned time;
+    std::vector<unsigned> qubits;  // Local (per-part) qubit indices.
+    std::vector<unsigned> controlled_by;
+    uint64_t cmask;
+    std::vector<fp_type> params;
+    Matrix<fp_type> matrix;
+    bool unfusible;  // SplitLattice sets this to true for gates on the cut.
+    bool swapped;
+
+    const Gate* parent;  // Original gate for gates on the cut; else nullptr.
+    unsigned id;  // Index of the gate on the cut; 0 for all other gates.
+  };
+
+  struct GateX {  // A gate on the cut together with its two halves.
+    GateHybrid* decomposed0;  // Half of the gate stored in gates0.
+    GateHybrid* decomposed1;  // Half of the gate stored in gates1.
+    schmidt_decomp_type<fp_type> schmidt_decomp;
+    unsigned schmidt_bits;  // SchmidtBits(schmidt_decomp.size()).
+    unsigned swapped;  // Index of the Schmidt factor copied to decomposed0.
+  };
+
+ public:
+  using Fuser = FuserT<IO, GateHybrid>;
+  using GateFused = typename Fuser::GateFused;
+
+  /**
+   * Contextual data for hybrid simulation. Filled in by SplitLattice.
+   */
+  struct HybridData {
+    /**
+     * List of gates on the "0" side of the cut, including cut-gate halves.
+     */
+    std::vector<GateHybrid> gates0;
+    /**
+     * List of gates on the "1" side of the cut, including cut-gate halves.
+     */
+    std::vector<GateHybrid> gates1;
+    /**
+     * List of gates on the cut.
+     */
+    std::vector<GateX> gatexs;
+    /**
+     * Global qubit index to local (within its part) qubit index map.
+     */
+    std::vector<unsigned> qubit_map;
+    /**
+     * Number of qubits on the "0" side of the cut.
+     */
+    unsigned num_qubits0;
+    /**
+     * Number of qubits on the "1" side of the cut.
+     */
+    unsigned num_qubits1;
+    /**
+     * Number of gates on the cut.
+     */
+    unsigned num_gatexs;
+  };
+
+  /**
+   * User-specified parameters for gate fusion and hybrid simulation.
+   */
+  struct Parameter : public Fuser::Parameter {
+    /**
+     * Fixed bitstring indicating values to assign to Schmidt decomposition
+     * indices of prefix gates.
+     */
+    uint64_t prefix;
+    /**
+     * Number of gates on the cut that are part of the prefix. Indices of these
+     * gates are assigned the value indicated by `prefix`.
+     */
+    unsigned num_prefix_gatexs;
+    /**
+     * Number of gates on the cut that are part of the root. All gates that are
+     * not part of the prefix or root are part of the suffix.
+     */
+    unsigned num_root_gatexs;
+    unsigned num_threads;  // NOTE(review): not read in this header - confirm.
+  };
+
+  template <typename... Args>  // All arguments are used to construct for_.
+  explicit HybridSimulator(Args&&... args) : for_(args...) {}
+
+  /**
+   * Splits the lattice into two parts, using Schmidt decomposition for gates
+   * on the cut.
+   * @param parts Part (0 or 1) of each qubit, indexed by global qubit index.
+   * @param gates List of all gates in the circuit.
+   * @param hd Output data with split parts.
+   * @return True if the lattice was split successfully; false otherwise.
+   */
+  static bool SplitLattice(const std::vector<unsigned>& parts,
+                           const std::vector<Gate>& gates, HybridData& hd) {
+    hd.num_gatexs = 0;
+    hd.num_qubits0 = 0;
+    hd.num_qubits1 = 0;
+
+    hd.gates0.reserve(gates.size());
+    hd.gates1.reserve(gates.size());
+    hd.qubit_map.reserve(parts.size());
+
+    unsigned count0 = 0;
+    unsigned count1 = 0;
+
+    // Global qubit index to local qubit index map; also counts qubits per part.
+    for (std::size_t i = 0; i < parts.size(); ++i) {
+      parts[i] == 0 ? ++hd.num_qubits0 : ++hd.num_qubits1;
+      hd.qubit_map.push_back(parts[i] == 0 ? count0++ : count1++);
+    }
+
+    // Split the lattice: assign each gate to part 0, part 1, or the cut.
+    for (const auto& gate : gates) {
+      if (gate.kind == gate::kMeasurement) {
+        IO::errorf("measurement gates are not suported by qsimh.\n");
+        return false;
+      }
+
+      if (gate.controlled_by.size() > 0) {
+        IO::errorf("controlled gates are not suported by qsimh.\n");
+        return false;
+      }
+
+      switch (gate.qubits.size()) {
+      case 1:  // Single qubit gates.
+        switch (parts[gate.qubits[0]]) {
+        case 0:
+          hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time,
+            {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix,
+            false, false, nullptr, 0});
+          break;
+        case 1:
+          hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time,
+            {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix,
+            false, false, nullptr, 0});
+          break;
+        }
+        break;
+      case 2:  // Two qubit gates.
+        {
+          switch ((parts[gate.qubits[1]] << 1) | parts[gate.qubits[0]]) {
+          case 0:  // Both qubits in part 0.
+            hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time,
+              {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]},
+              {}, 0, gate.params, gate.matrix, false, gate.swapped,
+              nullptr, 0});
+            break;
+          case 1:  // Gate on the cut, qubit 0 in part 1, qubit 1 in part 0.
+            hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
+              {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {},
+              true, gate.swapped, &gate, hd.num_gatexs});
+            hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
+              {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {},
+              true, gate.swapped, &gate, hd.num_gatexs});
+
+            ++hd.num_gatexs;
+            break;
+          case 2:  // Gate on the cut, qubit 0 in part 0, qubit 1 in part 1.
+            hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
+              {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {},
+              true, gate.swapped, &gate, hd.num_gatexs});
+            hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
+              {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {},
+              true, gate.swapped, &gate, hd.num_gatexs});
+
+            ++hd.num_gatexs;
+            break;
+          case 3:  // Both qubits in part 1.
+            hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time,
+              {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]},
+              {}, 0, gate.params, gate.matrix, false, gate.swapped,
+              nullptr, 0});
+            break;
+          }
+        }
+        break;
+      default:
+        IO::errorf("multi-qubit gates are not suported by qsimh.\n");
+        return false;
+      }
+    }
+
+    auto compare = [](const GateHybrid& l, const GateHybrid& r) -> bool {
+      return l.time < r.time || (l.time == r.time &&
+          (l.parent < r.parent || (l.parent == r.parent && l.id < r.id)));
+    };
+
+    // Sort gates by time, then by parent pointer, then by id.
+    std::sort(hd.gates0.begin(), hd.gates0.end(), compare);
+    std::sort(hd.gates1.begin(), hd.gates1.end(), compare);
+
+    hd.gatexs.reserve(hd.num_gatexs);
+
+    // Get Schmidt decompositions of the gates on the cut (found via gates0).
+    for (auto& gate0 : hd.gates0) {
+      if (gate0.parent != nullptr) {
+        auto d = GetSchmidtDecomp(gate0.parent->kind, gate0.parent->params);
+        if (d.size() == 0) {
+          IO::errorf("no Schmidt decomposition for gate kind %u.\n",
+                     gate0.parent->kind);
+          return false;
+        }
+
+        unsigned schmidt_bits = SchmidtBits(d.size());
+        if (schmidt_bits > 2) {
+          IO::errorf("Schmidt rank is too large for gate kind %u.\n",
+                     gate0.parent->kind);
+          return false;
+        }
+
+        unsigned swapped = parts[gate0.parent->qubits[0]];
+        if (gate0.parent->swapped) swapped = 1 - swapped;
+        hd.gatexs.emplace_back(GateX{&gate0, nullptr, std::move(d),
+                                     schmidt_bits, swapped});
+      }
+    }
+
+    unsigned count = 0;  // Pairs each gatex with its half in sorted gates1.
+    for (auto& gate1 : hd.gates1) {
+      if (gate1.parent != nullptr) {
+        hd.gatexs[count++].decomposed1 = &gate1;
+      }
+    }
+
+    for (auto& gatex : hd.gatexs) {
+      if (gatex.schmidt_decomp.size() == 1) {  // Single term: fill it now.
+        FillSchmidtMatrices(0, gatex);
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Runs the hybrid simulator on a sectioned lattice.
+   * @param param Options for parallelism and logging. Also specifies the size
+   *   of the 'prefix' and 'root' sections of the lattice.
+   * @param factory Object to create simulators and state spaces.
+   * @param hd Container object for gates on the boundary between lattice
+   *   sections.
+   * @param parts Lattice sections to be simulated.
+   * @param fgates0 List of gates from one section of the lattice.
+   * @param fgates1 List of gates from the other section of the lattice.
+   * @param bitstrings List of output states to simulate, as bitstrings.
+   * @param results Output vector of amplitudes. After a successful run, this
+   *   will be populated with amplitudes for each state in 'bitstrings'.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Factory, typename Results>
+  bool Run(const Parameter& param, const Factory& factory,
+           HybridData& hd, const std::vector<unsigned>& parts,
+           const std::vector<GateFused>& fgates0,
+           const std::vector<GateFused>& fgates1,
+           const std::vector<uint64_t>& bitstrings, Results& results) const {
+    using Simulator = typename Factory::Simulator;
+    using StateSpace = typename Simulator::StateSpace;
+    using State = typename StateSpace::State;
+
+    unsigned num_p_gates = param.num_prefix_gatexs;
+    unsigned num_pr_gates = num_p_gates + param.num_root_gatexs;
+
+    auto bits = CountSchmidtBits(param, hd.gatexs);
+
+    uint64_t rmax = uint64_t{1} << bits.num_r_bits;
+    uint64_t smax = uint64_t{1} << bits.num_s_bits;
+
+    auto loc0 = CheckpointLocations(param, fgates0);
+    auto loc1 = CheckpointLocations(param, fgates1);
+
+    struct Index {
+      uint64_t i0;  // Amplitude index within part 0 (64-bit: no truncation).
+      uint64_t i1;  // Amplitude index within part 1 (64-bit: no truncation).
+    };
+
+    std::vector<Index> indices;
+    indices.reserve(bitstrings.size());
+
+    // Bitstring indices for part 0 and part 1. TODO: optimize.
+    for (const auto& bitstring : bitstrings) {
+      Index index{0, 0};
+
+      for (uint64_t i = 0; i < hd.qubit_map.size(); ++i) {
+        uint64_t m = ((bitstring >> i) & 1) << hd.qubit_map[i];
+        parts[i] ? index.i1 |= m : index.i0 |= m;
+      }
+
+      indices.push_back(index);
+    }
+
+    StateSpace state_space = factory.CreateStateSpace();
+
+    State* rstate0;
+    State* rstate1;
+
+    State state0p = state_space.Null();
+    State state1p = state_space.Null();
+    State state0r = state_space.Null();
+    State state1r = state_space.Null();
+    State state0s = state_space.Null();
+    State state1s = state_space.Null();
+
+    // Create states. The root and suffix states are created only if there is
+    // more than one root or suffix path; rstate0/1 track the last created.
+    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, true,
+                      state0p, state1p, rstate0, rstate1)) {
+      return false;
+    }
+
+    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, rmax > 1,
+                      state0r, state1r, rstate0, rstate1)) {
+      return false;
+    }
+
+    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, smax > 1,
+                      state0s, state1s, rstate0, rstate1)) {
+      return false;
+    }
+
+    state_space.SetStateZero(state0p);
+    state_space.SetStateZero(state1p);
+
+    Simulator simulator = factory.CreateSimulator();
+
+    std::vector<unsigned> prev(hd.num_gatexs, unsigned(-1));
+
+    // param.prefix encodes the prefix path.
+    unsigned gatex_index = SetSchmidtMatrices(
+        0, num_p_gates, param.prefix, prev, hd.gatexs);
+
+    if (gatex_index == 0) {
+      // Apply gates before the first checkpoint.
+      ApplyGates(fgates0, 0, loc0[0], simulator, state0p);
+      ApplyGates(fgates1, 0, loc1[0], simulator, state1p);
+    } else {
+      IO::errorf("invalid prefix %llu for prefix gate index %u.\n",
+                 (unsigned long long) param.prefix, gatex_index - 1);
+      return false;
+    }
+
+    // Branch over root gates on the cut. r encodes the root path.
+    for (uint64_t r = 0; r < rmax; ++r) {
+      if (rmax > 1) {
+        state_space.Copy(state0p, state0r);
+        state_space.Copy(state1p, state1r);
+      }
+      // NOTE(review): state0r/state1r stay null when rmax == 1 - confirm safe.
+      if (SetSchmidtMatrices(num_p_gates, num_pr_gates,
+                             r, prev, hd.gatexs) == 0) {
+        // Apply gates before the second checkpoint.
+        ApplyGates(fgates0, loc0[0], loc0[1], simulator, state0r);
+        ApplyGates(fgates1, loc1[0], loc1[1], simulator, state1r);
+      } else {
+        continue;
+      }
+
+      // Branch over suffix gates on the cut. s encodes the suffix path.
+      for (uint64_t s = 0; s < smax; ++s) {
+        if (smax > 1) {
+          state_space.Copy(rmax > 1 ? state0r : state0p, state0s);
+          state_space.Copy(rmax > 1 ? state1r : state1p, state1s);
+        }
+        // NOTE(review): state0s/state1s stay null when smax == 1 - confirm.
+        if (SetSchmidtMatrices(num_pr_gates, hd.num_gatexs,
+                               s, prev, hd.gatexs) == 0) {
+          // Apply the rest of the gates.
+          ApplyGates(fgates0, loc0[1], fgates0.size(), simulator, state0s);
+          ApplyGates(fgates1, loc1[1], fgates1.size(), simulator, state1s);
+        } else {
+          continue;
+        }
+
+        auto f = [](unsigned n, unsigned m, uint64_t i,
+                    const StateSpace& state_space,
+                    const State& state0, const State& state1,
+                    const std::vector<Index>& indices, Results& results) {
+          // TODO: make it faster for the CUDA state space.
+          auto a0 = state_space.GetAmpl(state0, indices[i].i0);
+          auto a1 = state_space.GetAmpl(state1, indices[i].i1);
+          results[i] += a0 * a1;
+        };
+
+        // Collect results: add the product of the two part amplitudes.
+        for_.Run(results.size(), f,
+                 state_space, *rstate0, *rstate1, indices, results);
+      }
+    }
+
+    return true;
+  }
+
+ private:
+  /**
+   * Finds the positions in a fused gate list at which simulation state is
+   * checkpointed, so that runs with different cut-index values can share the
+   * work done before each checkpoint.
+   * @param param Provides the number of prefix and root gates on the cut.
+   * @param fgates Fused gates for which to find checkpoint locations.
+   * @return Two counts: how many fused gates to apply before the first
+   *   checkpoint and before the second checkpoint, respectively.
+   */
+  static std::array<unsigned, 2> CheckpointLocations(
+      const Parameter& param, const std::vector<GateFused>& fgates) {
+    const unsigned prefix_end = param.num_prefix_gatexs;
+    const unsigned root_end = prefix_end + param.num_root_gatexs;
+    std::array<unsigned, 2> checkpoints{0, 0};
+
+    // Number of fused gates seen so far that contain a decomposed gate.
+    unsigned cut_gates_seen = 0;
+    std::size_t position = 0;
+
+    for (const auto& fgate : fgates) {
+      ++position;
+
+      // There should be only one decomposed gate in fused gate, so finding
+      // the first one is enough.
+      for (auto gate : fgate.gates) {
+        if (gate->parent != nullptr) {
+          ++cut_gates_seen;
+          break;
+        }
+      }
+
+      if (cut_gates_seen <= prefix_end) checkpoints[0] = position;
+      if (cut_gates_seen <= root_end) checkpoints[1] = position;
+    }
+
+    return checkpoints;
+  }
+
+  struct Bits {  // Schmidt bit totals, as computed by CountSchmidtBits.
+    unsigned num_p_bits;  // Sum of schmidt_bits over prefix gates.
+    unsigned num_r_bits;  // Sum of schmidt_bits over root gates.
+    unsigned num_s_bits;  // Sum of schmidt_bits over suffix gates.
+  };
+
+  static Bits CountSchmidtBits(
+      const Parameter& param, const std::vector<GateX>& gatexs) {
+    // Gates on the cut are split by index into prefix, root and suffix
+    // sections, in that order.
+    const unsigned prefix_end = param.num_prefix_gatexs;
+    const unsigned root_end = prefix_end + param.num_root_gatexs;
+
+    Bits totals{0, 0, 0};
+    std::size_t index = 0;
+
+    // Add the bits of each gate to the counter of its section.
+    for (const auto& gatex : gatexs) {
+      unsigned& counter = index < prefix_end ? totals.num_p_bits :
+          index < root_end ? totals.num_r_bits : totals.num_s_bits;
+      counter += gatex.schmidt_bits;
+      ++index;
+    }
+
+    return totals;
+  }
+
+  static unsigned SetSchmidtMatrices(std::size_t i0, std::size_t i1,
+                                     uint64_t path,
+                                     std::vector<unsigned>& prev_k,
+                                     std::vector<GateX>& gatexs) {
+    // Number of bits of path consumed so far.
+    unsigned consumed_bits = 0;
+
+    for (std::size_t index = i0; index < i1; ++index) {
+      const GateX& gatex = gatexs[index];
+      const unsigned width = gatex.schmidt_bits;
+      // Gates of Schmidt rank 1 take no bits from the path.
+      if (width == 0) continue;
+
+      // Extract the Schmidt index of this gate from the path.
+      const unsigned k = (path >> consumed_bits) & ((1 << width) - 1);
+      consumed_bits += width;
+
+      // The matrices are already in place if the index has not changed.
+      if (k == prev_k[index]) continue;
+
+      if (k >= gatex.schmidt_decomp.size()) {
+        // Invalid path. Returns gatex index plus one to report error in case
+        // of invalid prefix.
+        return index + 1;
+      }
+
+      FillSchmidtMatrices(k, gatex);
+      prev_k[index] = k;
+    }
+
+    return 0;
+  }
+
+  static void FillSchmidtMatrices(unsigned k, const GateX& gatex) {
+    // gatex.swapped selects which factor of term k goes to decomposed0.
+    const unsigned factor0 = gatex.swapped;
+    const unsigned factor1 = 1 - factor0;
+    const auto& term = gatex.schmidt_decomp[k];
+
+    const auto& source0 = term[factor0];
+    auto& target0 = gatex.decomposed0->matrix;
+    target0.resize(source0.size());
+    std::copy(source0.begin(), source0.end(), target0.begin());
+
+    const auto& source1 = term[factor1];
+    auto& target1 = gatex.decomposed1->matrix;
+    target1.resize(source1.size());
+    std::copy(source1.begin(), source1.end(), target1.begin());
+  }
+
+  template <typename Simulator>
+  static void ApplyGates(const std::vector<GateFused>& gates,
+                         std::size_t i0, std::size_t i1,
+                         const Simulator& simulator,
+                         typename Simulator::State& state) {
+    for (std::size_t pos = i0; pos < i1; ++pos) {
+      if (gates[pos].matrix.size() == 0) {  // No matrix yet: use a copy.
+        GateFused fused = gates[pos];
+        CalculateFusedMatrix(fused);
+        ApplyFusedGate(simulator, fused, state);
+      } else {  // The fused matrix is already available.
+        ApplyFusedGate(simulator, gates[pos], state);
+      }
+    }
+  }
+
+  static unsigned SchmidtBits(unsigned size) {
+    // Number of bits needed to index `size` Schmidt terms, for sizes 1 to 4.
+    if (size == 1) {
+      return 0;
+    }
+    if (size == 2) {
+      return 1;
+    }
+    if (size == 3 || size == 4) {
+      return 2;
+    }
+
+    // Not supported.
+    return 42;
+  }
+
+  template <typename StateSpace>
+  static bool CreateStates(unsigned num_qubits0,unsigned num_qubits1,
+                           const StateSpace& state_space, bool create,
+                           typename StateSpace::State& state0,
+                           typename StateSpace::State& state1,
+                           typename StateSpace::State* (&rstate0),
+                           typename StateSpace::State* (&rstate1)) {
+    if (!create) {
+      // Nothing to allocate: states and result pointers are left untouched.
+      return true;
+    }
+
+    state0 = state_space.Create(num_qubits0);
+    state1 = state_space.Create(num_qubits1);
+    if (state_space.IsNull(state0) || state_space.IsNull(state1)) {
+      IO::errorf("not enough memory: is the number of qubits too large?\n");
+      return false;
+    }
+    rstate0 = &state0;
+    rstate1 = &state1;
+    return true;
+  }
+
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // HYBRID_H_
diff --git a/qsim/io.h b/qsim/io.h
new file mode 100644
index 0000000..3b26c7c
--- /dev/null
+++ b/qsim/io.h
@@ -0,0 +1,44 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IO_H_
+#define IO_H_
+
+#include <cstdarg>
+#include <cstdio>
+
+namespace qsim {
+
+/**
+ * Basic controller for output logs: errors go to stderr, messages to stdout.
+ */
+struct IO {
+  static void errorf(const char* format, ...) {
+    va_list ap;
+    va_start(ap, format);
+    vfprintf(stderr, format, ap);
+    va_end(ap);
+  }
+
+  static void messagef(const char* format, ...) {
+    va_list ap;
+    va_start(ap, format);
+    vfprintf(stdout, format, ap);
+    va_end(ap);
+  }
+};
+
+}  // namespace qsim
+
+#endif  // IO_H_
diff --git a/qsim/io_file.h b/qsim/io_file.h
new file mode 100644
index 0000000..3cfac12
--- /dev/null
+++ b/qsim/io_file.h
@@ -0,0 +1,71 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IO_FILE_H_
+#define IO_FILE_H_
+
+#include <cstdint>
+#include <fstream>
+#include <string>
+
+#include "io.h"
+
+namespace qsim {
+
+/**
+ * Controller for output logs with methods for reading and writing files.
+ */
+struct IOFile : public IO {
+  // Opens a file for reading; logs an error (stream is failed) on failure.
+  static std::ifstream StreamFromFile(const std::string& file) {
+    std::ifstream fs;
+    fs.open(file);
+    if (!fs) {
+      errorf("cannot open %s for reading.\n", file.c_str());
+    }
+    return fs;
+  }
+
+  static void CloseStream(std::ifstream& fs) { fs.close(); }
+
+  // Writes a string to a file. Returns true on success.
+  static bool WriteToFile(
+      const std::string& file, const std::string& content) {
+    return WriteToFile(file, content.data(), content.size());
+  }
+
+  // Writes size bytes starting at data to a file in binary mode. Returns true
+  // only if the data was written and the file was closed without errors.
+  static bool WriteToFile(
+      const std::string& file, const void* data, uint64_t size) {
+    auto fs = std::fstream(file, std::ios::out | std::ios::binary);
+    if (!fs) {
+      errorf("cannot open %s for writing.\n", file.c_str());
+      return false;
+    }
+    fs.write((const char*) data, size);
+    // Output is buffered: close (and thereby flush) the file before checking
+    // the stream state so that failures of the flush are reported as well.
+    fs.close();
+    if (!fs) {
+      errorf("cannot write to %s.\n", file.c_str());
+      return false;
+    }
+    return true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // IO_FILE_H_
diff --git a/qsim/matrix.h b/qsim/matrix.h
new file mode 100644
index 0000000..a3c2640
--- /dev/null
+++ b/qsim/matrix.h
@@ -0,0 +1,296 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MATRIX_H_
+#define MATRIX_H_
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "bits.h"
+
+namespace qsim {
+
+/**
+ * Gate matrix type. Matrices are stored as vectors. The matrix elements are
+ * accessed as real(m[i][j]) <- vector[2 * (n * i + j)] and
+ * imag(m[i][j]) <- vector[2 * (n * i + j) + 1], where n is the number of rows
+ * or columns (n = 2^q, where q is the number of gate qubits).
+ */
+template <typename fp_type>
+using Matrix = std::vector<fp_type>;
+
+/**
+ * Zeroes every stored value (real and imaginary parts) of a matrix.
+ * @m Matrix to be cleared. Its size is not changed.
+ */
+template <typename fp_type>
+inline void MatrixClear(Matrix<fp_type>& m) {
+  for (auto& element : m) {
+    element = 0;
+  }
+}
+
+/**
+ * Resizes a matrix to n rows (columns) and makes it the identity matrix.
+ * @n Number of matrix rows (columns).
+ * @m Output identity matrix.
+ */
+template <typename fp_type>
+inline void MatrixIdentity(unsigned n, Matrix<fp_type>& m) {
+  // Allocate 2 * n * n values, all set to zero.
+  m.assign(2 * n * n, fp_type(0));
+
+  // The real part of the i-th diagonal element is at 2 * (n * i + i).
+  for (unsigned i = 0; i < n; ++i) {
+    m[2 * (n + 1) * i] = 1;
+  }
+}
+
+/**
+ * Computes the product of two gate matrices of equal size in place.
+ * @q Number of gate qubits. The number of matrix rows (columns) is 2^q.
+ * @m1 Left factor m1.
+ * @m2 Input right factor m2. Output product of matrices m2 = m1 m2.
+ */
+template <typename fp_type1, typename fp_type2>
+inline void MatrixMultiply(
+    unsigned q, const Matrix<fp_type1>& m1, Matrix<fp_type2>& m2) {
+  const Matrix<fp_type2> rhs = m2;
+  const unsigned dim = unsigned{1} << q;
+
+  for (unsigned row = 0; row < dim; ++row) {
+    for (unsigned col = 0; col < dim; ++col) {
+      fp_type2 sum_re = 0;
+      fp_type2 sum_im = 0;
+      for (unsigned k = 0; k < dim; ++k) {
+        const unsigned a = 2 * (dim * row + k);  // Element (row, k) of m1.
+        const unsigned b = 2 * (dim * k + col);  // Element (k, col) of rhs.
+        const fp_type2 a_re = m1[a];
+        const fp_type2 a_im = m1[a + 1];
+        const fp_type2 b_re = rhs[b];
+        const fp_type2 b_im = rhs[b + 1];
+        sum_re += a_re * b_re - a_im * b_im;
+        sum_im += a_re * b_im + a_im * b_re;
+      }
+
+      m2[2 * (dim * row + col)] = sum_re;
+      m2[2 * (dim * row + col) + 1] = sum_im;
+    }
+  }
+}
+
+/**
+ * Multiplies two gate matrices of equal size: m2 = m1^\dagger m2.
+ * @q Number of gate qubits. The number of matrix rows (columns) is 2^q.
+ * @m1 Matrix m1. Its conjugate transpose is the left factor.
+ * @m2 Input matrix m2. Output product of matrices m2 = m1^\dagger m2.
+ */
+template <typename fp_type1, typename fp_type2>
+inline void MatrixDaggerMultiply(
+    unsigned q, const Matrix<fp_type1>& m1, Matrix<fp_type2>& m2) {
+  const Matrix<fp_type2> rhs = m2;
+  const unsigned dim = unsigned{1} << q;
+
+  for (unsigned row = 0; row < dim; ++row) {
+    for (unsigned col = 0; col < dim; ++col) {
+      fp_type2 sum_re = 0;
+      fp_type2 sum_im = 0;
+      for (unsigned k = 0; k < dim; ++k) {
+        const unsigned a = 2 * (dim * k + row);  // Element (k, row) of m1.
+        const unsigned b = 2 * (dim * k + col);  // Element (k, col) of rhs.
+        const fp_type2 a_re = m1[a];
+        const fp_type2 a_im = m1[a + 1];
+        const fp_type2 b_re = rhs[b];
+        const fp_type2 b_im = rhs[b + 1];
+        sum_re += a_re * b_re + a_im * b_im;
+        sum_im += a_re * b_im - a_im * b_re;
+      }
+
+      m2[2 * (dim * row + col)] = sum_re;
+      m2[2 * (dim * row + col) + 1] = sum_im;
+    }
+  }
+}
+
+/**
+ * Multiplies two gate matrices: m2 = m1 m2. The size of m1 should not exceed
+ *   the size of m2.
+ * @mask1 Qubit mask that specifies the subset of qubits m1 acts on.
+ * @q1 Number of qubits of m1. The number of rows (columns) of m1 is 2^q1.
+ * @m1 Matrix m1.
+ * @q2 Number of qubits of m2. The number of rows (columns) of m2 is 2^q2.
+ * @m2 Input matrix m2. Output product of matrices m2 = m1 m2.
+ */
+template <typename fp_type1, typename fp_type2>
+inline void MatrixMultiply(unsigned mask1,
+                           unsigned q1, const Matrix<fp_type1>& m1,
+                           unsigned q2, Matrix<fp_type2>& m2) {
+  if (q1 == q2) {
+    MatrixMultiply(q1, m1, m2);  // Equal sizes: mask1 is not used.
+  } else {
+    Matrix<fp_type2> mt = m2;  // Copy of the input m2; m2 is overwritten.
+    unsigned n1 = unsigned{1} << q1;
+    unsigned n2 = unsigned{1} << q2;
+
+    for (unsigned i = 0; i < n2; ++i) {
+      unsigned si = bits::CompressBits(i, q2, mask1);  // Used as row of m1.
+
+      for (unsigned j = 0; j < n2; ++j) {
+        fp_type2 re = 0;
+        fp_type2 im = 0;
+
+        for (unsigned k = 0; k < n1; ++k) {
+          unsigned ek = bits::ExpandBits(k, q2, mask1) + (i & ~mask1);
+
+          fp_type2 r1 = m1[2 * (n1 * si + k)];
+          fp_type2 i1 = m1[2 * (n1 * si + k) + 1];
+          fp_type2 r2 = mt[2 * (n2 * ek + j)];
+          fp_type2 i2 = mt[2 * (n2 * ek + j) + 1];
+
+          re += r1 * r2 - i1 * i2;
+          im += r1 * i2 + i1 * r2;
+        }
+
+        m2[2 * (n2 * i + j)] = re;
+        m2[2 * (n2 * i + j) + 1] = im;
+      }
+    }
+  }
+}
+
+/**
+ * Scales every stored value of a matrix by a real scalar.
+ * @c Scalar value.
+ * @m Input matrix to be multiplied. Output matrix.
+ */
+template <typename fp_type1, typename fp_type2>
+inline void MatrixScalarMultiply(fp_type1 c, Matrix<fp_type2>& m) {
+  for (auto& element : m) {
+    element *= c;
+  }
+}
+
+/**
+ * Scales every complex element of a matrix by a complex scalar.
+ * @re Real part of scalar value.
+ * @im Imaginary part of scalar value.
+ * @m Input matrix to be multiplied. Output matrix.
+ */
+template <typename fp_type1, typename fp_type2>
+inline void MatrixScalarMultiply(
+    fp_type1 re, fp_type1 im, Matrix<fp_type2>& m) {
+  for (std::size_t k = 0; k + 1 < m.size(); k += 2) {
+    const fp_type2 old_re = m[k];
+    const fp_type2 old_im = m[k + 1];
+    m[k] = re * old_re - im * old_im;
+    m[k + 1] = re * old_im + im * old_re;
+  }
+}
+
+/**
+ * Replaces a matrix by its conjugate transpose in place.
+ * @n Number of matrix rows (columns).
+ * @m Input matrix. Output matrix.
+ */
+template <typename fp_type>
+inline void MatrixDagger(unsigned n, Matrix<fp_type>& m) {
+  for (unsigned i = 0; i < n; ++i) {
+    m[2 * (n * i + i) + 1] = -m[2 * (n * i + i) + 1];  // Diagonal: conjugate.
+    for (unsigned j = i + 1; j < n; ++j) {
+      const unsigned upper = 2 * (n * i + j), lower = 2 * (n * j + i);
+      const fp_type upper_im = m[upper + 1];
+      std::swap(m[upper], m[lower]);  // Real parts are exchanged as they are.
+      m[upper + 1] = -m[lower + 1];  // Imaginary parts: exchanged and negated.
+      m[lower + 1] = -upper_im;
+    }
+  }
+}
+
+/**
+ * Computes the permutation that rearranges qubits from "normal" order to
+ *   "gate" order. "Normal" order means increasing qubit indices; "gate"
+ *   order is whatever order the gate lists its qubits in. An empty vector
+ *   is returned if the qubits are already in "normal" order.
+ * @qubits Qubit indices in "gate" order.
+ * @return Permutation as a vector.
+ */
+inline std::vector<unsigned> NormalToGateOrderPermutation(
+    const std::vector<unsigned>& qubits) {
+  // Qubits are in "normal" order if no qubit is smaller than the one that
+  // precedes it.
+  if (std::is_sorted(qubits.begin(), qubits.end())) {
+    // An empty permutation signals that nothing has to be rearranged.
+    return {};
+  }
+
+  // Pair every qubit (first) with its position in "gate" order (second).
+  using QubitAndIndex = std::pair<unsigned, unsigned>;
+
+  std::vector<QubitAndIndex> tagged;
+  tagged.reserve(qubits.size());
+
+  unsigned position = 0;
+  for (unsigned qubit : qubits) {
+    tagged.push_back({qubit, position});
+    ++position;
+  }
+
+  // Order the pairs by qubit only; positions do not take part in the
+  // comparison.
+  auto by_qubit = [](const QubitAndIndex& l, const QubitAndIndex& r) {
+    return l.first < r.first;
+  };
+
+  std::sort(tagged.begin(), tagged.end(), by_qubit);
+
+  // Reading the positions in sorted order gives the permutation.
+  std::vector<unsigned> perm;
+  perm.reserve(tagged.size());
+
+  for (const auto& entry : tagged) {
+    perm.push_back(entry.second);
+  }
+
+  return perm;
+}
+
+/**
+ * Shuffles the gate matrix elements to get the matrix that acts on qubits
+ *   that are in "normal" order (in increasing order).
+ * @perm Permutation to rearrange qubits from "normal" order to "gate" order.
+ * @q Number of gate qubits. The number of matrix rows (columns) is 2^q.
+ * @m Input matrix. Output shuffled matrix.
+ */
+template <typename fp_type>
+inline void MatrixShuffle(const std::vector<unsigned>& perm,
+                          unsigned q, Matrix<fp_type>& m) {
+  const Matrix<fp_type> source = m;
+  const unsigned dim = unsigned{1} << q;
+  for (unsigned row = 0; row < dim; ++row) {
+    const unsigned src_row = bits::PermuteBits(row, q, perm);
+    for (unsigned col = 0; col < dim; ++col) {
+      const unsigned src_col = bits::PermuteBits(col, q, perm);
+      const unsigned dst = 2 * (dim * row + col);
+      const unsigned src = 2 * (dim * src_row + src_col);
+      m[dst] = source[src];
+      m[dst + 1] = source[src + 1];
+    }
+  }
+}
+
+}  // namespace qsim
+
+#endif  // MATRIX_H_
diff --git a/qsim/mps_simulator.h b/qsim/mps_simulator.h
new file mode 100644
index 0000000..8fbcbae
--- /dev/null
+++ b/qsim/mps_simulator.h
@@ -0,0 +1,246 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MPS_SIMULATOR_H_
+#define MPS_SIMULATOR_H_
+
+// For templates will take care of parallelization.
+#define EIGEN_DONT_PARALLELIZE 1
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "../eigen/Eigen/Dense"
+#include "../eigen/Eigen/SVD"
+#include "mps_statespace.h"
+
+namespace qsim {
+
+namespace mps {
+
+/**
+ *  Truncated Matrix Product State (MPS) circuit simulator w/ vectorization.
+ */
+template <typename For, typename FP = float>
+class MPSSimulator final {
+ public:
+  using MPSStateSpace_ = MPSStateSpace<For, FP>;
+  using State = typename MPSStateSpace_::MPS;
+  using fp_type = typename MPSStateSpace_::fp_type;
+
+  using Complex = std::complex<fp_type>;
+  using Matrix =
+      Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using ConstMatrixMap = Eigen::Map<const Matrix>;
+  using MatrixMap = Eigen::Map<Matrix>;
+
+  using OneQubitMatrix = Eigen::Matrix<Complex, 2, 2, Eigen::RowMajor>;
+  using ConstOneQubitMap = Eigen::Map<const OneQubitMatrix>;
+
+  // Passes args to the For member; for_ is otherwise unused in this class.
+  template <typename... ForArgs>
+  explicit MPSSimulator(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate to the state, dispatching on the number of gate qubits.
+   *
+   * Only one- and two-qubit gates are implemented (by ApplyGate1 and
+   * ApplyGate2). For any other number of qubits the call returns without
+   * modifying the state and without reporting an error.
+   *
+   * @param qs Indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs, const fp_type* matrix,
+                 State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    const auto num_gate_qubits = qs.size();
+
+    // Single-qubit gate.
+    if (num_gate_qubits == 1) {
+      ApplyGate1(qs, matrix, state);
+      return;
+    }
+
+    // Two-qubit gate.
+    if (num_gate_qubits == 2) {
+      ApplyGate2(qs, matrix, state);
+      return;
+    }
+
+    // Not implemented: there is no ApplyGate3 ... ApplyGate6 yet, so gates
+    // on three or more qubits reach this point and are ignored. The same
+    // holds for an empty qs.
+  }
+
+  /**
+   * Applies a controlled gate. Not implemented yet: the body is empty (TODO).
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cmask Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cmask,
+                           const fp_type* matrix, State& state) const {
+    // TODO: implement. Until then the state is left unchanged.
+  }
+
+  /**
+   * Computes the expectation value of an operator. Not implemented yet: the
+   * arguments are ignored and a fixed placeholder value is returned.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return Currently always the placeholder (-10, -10).
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // TODO.
+    return std::complex<double>(-10., -10.);
+  }
+
+ private:
+  void ApplyGate1(const std::vector<unsigned>& qs, const fp_type* matrix,
+                  State& state) const {
+    // The last MPS block has its own routine; all other blocks share one.
+    if (qs[0] != state.num_qubits() - 1) {
+      return Apply1LeftOrInterior(qs, matrix, state);
+    }
+    Apply1Right(qs, matrix, state);
+  }
+
+  void Apply1LeftOrInterior(const std::vector<unsigned>& qs,
+                            const fp_type* matrix, State& state) const {
+    // Each [2, bond_dim] slice of the qubit's block is multiplied by the gate
+    // into scratch space (located past the end of the state) and copied back.
+    const auto bond_dim = state.bond_dim();
+    const auto first = MPSStateSpace_::GetBlockOffset(state, qs[0]);
+    const auto last = MPSStateSpace_::GetBlockOffset(state, qs[0] + 1);
+    fp_type* const data = state.get();
+    fp_type* const scratch = data + MPSStateSpace_::Size(state);
+    ConstOneQubitMap gate((Complex*) matrix);
+    MatrixMap product((Complex*) scratch, 2, bond_dim);
+
+    for (unsigned pos = first; pos < last; pos += 4 * bond_dim) {
+      ConstMatrixMap slice((Complex*)(data + pos), 2, bond_dim);
+      product.noalias() = gate * slice;
+      memcpy(data + pos, scratch, sizeof(fp_type) * bond_dim * 4);
+    }
+  }
+
+  void Apply1Right(const std::vector<unsigned>& qs, const fp_type* matrix,
+                   State& state) const {
+    const auto bond_dim = state.bond_dim();
+    fp_type* const data = state.get();
+    fp_type* const block = data + MPSStateSpace_::GetBlockOffset(state, qs[0]);
+    fp_type* const scratch = data + MPSStateSpace_::Size(state);
+    ConstOneQubitMap gate((Complex*) matrix);
+    ConstMatrixMap tensor((Complex*) block, bond_dim, 2);
+    MatrixMap product((Complex*) scratch, bond_dim, 2);
+    product.noalias() = tensor * gate.transpose();
+    memcpy(block, scratch, sizeof(fp_type) * bond_dim * 4);
+  }
+
+  void ApplyGate2(const std::vector<unsigned>& qs, const fp_type* matrix,
+                  State& state) const {
+    // TODO: micro-benchmark this function and improve performance.
+    const auto bond_dim = state.bond_dim();
+    const auto num_qubits = state.num_qubits();
+    fp_type* raw_state = state.get();
+    // Merged-tensor dims; an outer dim is 1 when its block is a boundary one.
+    const auto i_dim = (qs[0] == 0) ? 1 : bond_dim;
+    const auto j_dim = 2;
+    const auto k_dim = bond_dim;
+    const auto l_dim = 2;
+    const auto m_dim = (qs[1] == num_qubits - 1) ? 1 : bond_dim;
+
+    const auto b_0_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]);
+    const auto b_1_offset = MPSStateSpace_::GetBlockOffset(state, qs[1]);
+    const auto end = MPSStateSpace_::Size(state);
+
+    MatrixMap block_0((Complex*)(raw_state + b_0_offset), i_dim * j_dim, k_dim);
+    MatrixMap block_1((Complex*)(raw_state + b_1_offset), k_dim, l_dim * m_dim);
+
+    // Merge both blocks into scratch space.
+    MatrixMap scratch_c((Complex*)(raw_state + end), i_dim * j_dim, l_dim * m_dim);
+    scratch_c.noalias() = block_0 * block_1;
+
+    // Transpose inner dims in-place.
+    MatrixMap scratch_c_t((Complex*)(raw_state + end), i_dim * j_dim * l_dim, m_dim);
+    for (unsigned i = 0; i < i_dim * j_dim * l_dim; i += 4) {
+      scratch_c_t.row(i + 1).swap(scratch_c_t.row(i + 2));
+    }
+    // Transpose gate matrix and place in 3rd (last) scratch block.
+    // NOTE(review): this writes 32 reals; check that it fits if bond_dim == 2.
+    const auto scratch3_offset = end + 8 * bond_dim * bond_dim;
+    ConstMatrixMap gate_matrix((Complex*) matrix, 4, 4);
+    MatrixMap gate_matrix_transpose((Complex*)(raw_state + scratch3_offset), 4, 4);
+    gate_matrix_transpose = gate_matrix.transpose();
+    gate_matrix_transpose.col(1).swap(gate_matrix_transpose.col(2));
+
+    // Contract gate and merged block tensors, placing result in B0B1.
+    for (unsigned i = 0; i < i_dim; ++i) {
+      fp_type* src_block = raw_state + end + i * 8 * m_dim;
+      fp_type* dest_block = raw_state + b_0_offset + i * 8 * m_dim;
+      MatrixMap block_b0b1((Complex*) dest_block, 4, m_dim);
+      ConstMatrixMap scratch_c_i((Complex*) src_block, 4, m_dim);
+      // [i, np, m] = [np, lj] * [i, lj, m]
+      block_b0b1.noalias() = gate_matrix_transpose * scratch_c_i;
+    }
+
+    // SVD B0B1.
+    MatrixMap full_b0b1((Complex*)(raw_state + b_0_offset), 2 * i_dim, 2 * m_dim);
+    Eigen::BDCSVD<Matrix> svd(full_b0b1, Eigen::ComputeThinU | Eigen::ComputeThinV);
+    const auto p = std::min(2 * i_dim, 2 * m_dim);
+
+    // Place U in scratch to truncate and then B0.
+    MatrixMap svd_u((Complex*)(raw_state + end), 2 * i_dim, p);
+    svd_u.noalias() = svd.matrixU();
+    block_0.fill(Complex(0, 0));
+    const auto keep_cols = (svd_u.cols() > bond_dim) ? bond_dim : svd_u.cols();
+    block_0.block(0, 0, svd_u.rows(), keep_cols).noalias() =
+        svd_u(Eigen::indexing::all, Eigen::seq(0, keep_cols - 1));
+
+    // Place row product of S V into scratch to truncate and then B1.
+    MatrixMap svd_v((Complex*)(raw_state + end), p, 2 * m_dim);
+    MatrixMap s_vector((Complex*)(raw_state + end + 8 * bond_dim * bond_dim), p, 1);
+    svd_v.noalias() = svd.matrixV().adjoint();
+    s_vector.noalias() = svd.singularValues();
+    block_1.fill(Complex(0, 0));
+    const auto keep_rows = (svd_v.rows() > bond_dim) ? bond_dim : svd_v.rows();
+    const auto row_seq = Eigen::seq(0, keep_rows - 1);
+    for (unsigned i = 0; i < keep_rows; ++i) {
+      svd_v.row(i) *= s_vector(i);
+    }
+    block_1.block(0, 0, keep_rows, svd_v.cols()).noalias() =
+        svd_v(row_seq, Eigen::indexing::all);
+  }
+
+  For for_;
+};
+
+}  // namespace mps
+}  // namespace qsim
+
+#endif  // MPS_SIMULATOR_H_
diff --git a/qsim/mps_statespace.h b/qsim/mps_statespace.h
new file mode 100644
index 0000000..9b3acf3
--- /dev/null
+++ b/qsim/mps_statespace.h
@@ -0,0 +1,597 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MPS_STATESPACE_H_
+#define MPS_STATESPACE_H_
+
+// For templates will take care of parallelization.
+#define EIGEN_DONT_PARALLELIZE 1
+
+#ifdef _WIN32
+#include <malloc.h>
+#endif
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <random>
+
+#include "../eigen/Eigen/Dense"
+#include "../eigen/unsupported/Eigen/CXX11/Tensor"
+
+namespace qsim {
+
+namespace mps {
+
+namespace detail {
+
+inline void do_not_free(void*) {}  // No-op: ignores the pointer it is given.
+
+inline void free(void* ptr) {  // Platform-specific release of ptr.
+#ifdef _WIN32
+  _aligned_free(ptr);
+#else
+  ::free(ptr);
+#endif
+}
+
+}  // namespace detail
+
+/**
+ * Class containing context and routines for fixed bond dimension
+ * truncated Matrix Product State (MPS) simulation.
+ */
+template <typename For, typename FP = float>
+class MPSStateSpace {
+ private:
+ public:
+  using fp_type = FP;
+  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
+
+  using Complex = std::complex<fp_type>;
+  using Matrix =
+      Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using ConstMatrixMap = Eigen::Map<const Matrix>;
+  using MatrixMap = Eigen::Map<Matrix>;
+
+  // Store MPS tensors with the following shape:
+  // [2, bond_dim], [bond_dim, 2, bond_dim], ... , [bond_dim, 2].
+  class MPS {
+   public:
+    MPS() = delete;
+    // Takes ownership of ptr and records the dimensions of the state.
+    MPS(Pointer&& ptr, unsigned num_qubits, unsigned bond_dim)
+        : ptr_(std::move(ptr)), num_qubits_(num_qubits), bond_dim_(bond_dim) {}
+
+    fp_type* get() { return ptr_.get(); }
+
+    const fp_type* get() const { return ptr_.get(); }
+    // Gives up ownership of the buffer; num_qubits becomes 0, bond_dim stays.
+    fp_type* release() {
+      num_qubits_ = 0;
+      return ptr_.release();
+    }
+
+    unsigned num_qubits() const { return num_qubits_; }
+
+    unsigned bond_dim() const { return bond_dim_; }
+
+   private:
+    Pointer ptr_;
+    unsigned num_qubits_;
+    unsigned bond_dim_;
+  };
+
+  // Note: args are only passed on to for_; for_ is not used by this class.
+  template <typename... ForArgs>
+  MPSStateSpace(ForArgs&&... args) : for_(args...) {}
+
+  // Allocates an MPS. Requires num_qubits >= 2 and bond_dim >= 2. If the
+  // allocation fails, the result has a null buffer and zero dimensions.
+  static MPS Create(unsigned num_qubits, unsigned bond_dim) {
+    const auto edge_reals = 2 * 4 * bond_dim;
+    // (num_qubits - 2) interior blocks plus three extra "internal style"
+    //   blocks past the end of the working allocation; the extras serve as
+    //   scratch space for gate application.
+    const auto inner_reals = 4 * bond_dim * bond_dim * (num_qubits + 1);
+    const auto num_bytes = sizeof(fp_type) * (edge_reals + inner_reals);
+
+#ifdef _WIN32
+    Pointer ptr{(fp_type*)_aligned_malloc(num_bytes, 64), &detail::free};
+    const bool allocated = (ptr.get() != nullptr);
+    return MPS{std::move(ptr), allocated ? num_qubits : 0,
+               allocated ? bond_dim : 0};
+#else
+    void* raw = nullptr;
+    if (posix_memalign(&raw, 64, num_bytes) != 0) {
+      return MPS{Pointer{nullptr, &detail::free}, 0, 0};
+    }
+    return MPS{Pointer{(fp_type*)raw, &detail::free}, num_qubits, bond_dim};
+#endif
+  }
+
+  static unsigned Size(const MPS& state) {
+    // Reals in the two boundary blocks plus the num_qubits - 2 interior ones.
+    const unsigned bd = state.bond_dim();
+    return 2 * 4 * bd + 4 * bd * bd * (state.num_qubits() - 2);
+  }
+
+  static unsigned RawSize(const MPS& state) {  // Size(state) in bytes.
+    return sizeof(fp_type) * Size(state);
+  }
+
+  // Returns the offset, in reals, from the start of the buffer to block i.
+  // Block 0 occupies 4 * bond_dim reals; every later block that precedes
+  // block i occupies 4 * bond_dim * bond_dim reals.
+  static unsigned GetBlockOffset(const MPS& state, unsigned i) {
+    const unsigned bd = state.bond_dim();
+    return i == 0 ? 0 : 4 * bd * (1 + bd * (i - 1));
+  }
+
+  // Copies the tensors of src into dest, leaving scratch data untouched.
+  // Returns false, copying nothing, when the two states differ in qubit
+  // count or bond dimension; returns true otherwise.
+  static bool Copy(const MPS& src, MPS& dest) {
+    const bool same_shape = src.num_qubits() == dest.num_qubits() &&
+                            src.bond_dim() == dest.bond_dim();
+    if (same_shape) {
+      memcpy(dest.get(), src.get(), RawSize(src));
+    }
+    return same_shape;
+  }
+
+  // Set the MPS to the |0> state: zero all, then set each block's first real.
+  static void SetStateZero(MPS& state) {
+    const unsigned bd = state.bond_dim();
+    const unsigned total = Size(state);
+    memset(state.get(), 0, sizeof(fp_type) * total);
+    state.get()[0] = 1.0;
+    for (unsigned pos = 4 * bd; pos < total; pos += 4 * bd * bd) {
+      state.get()[pos] = 1.0;
+    }
+  }
+
+  // Computes Re{<state1 | state2 >}, the real part of InnerProduct().
+  // Requires: state1.bond_dim() == state2.bond_dim() &&
+  //           state1.num_qubits() == state2.num_qubits()
+  static fp_type RealInnerProduct(MPS& state1, MPS& state2) {
+    return InnerProduct(state1, state2).real();
+  }
+
+  // Computes <state1 | state2 > for two equal sized MPS (uses state1 scratch).
+  // Requires: state1.bond_dim() == state2.bond_dim() &&
+  //           state1.num_qubits() == state2.num_qubits()
+  static std::complex<fp_type> InnerProduct(MPS& state1, MPS& state2) {
+    const auto num_qubits = state1.num_qubits();
+    const auto bond_dim = state1.bond_dim();
+    const auto end = Size(state1);
+    auto offset = 0;
+    fp_type* state1_raw = state1.get();
+    fp_type* state2_raw = state2.get();
+
+    // Contract leftmost blocks together, store result in state1 scratch.
+    ConstMatrixMap top((Complex*)state2_raw, 2, bond_dim);
+    ConstMatrixMap bot((Complex*)state1_raw, 2, bond_dim);
+    MatrixMap partial_contract((Complex*)(state1_raw + end), bond_dim,
+                               bond_dim);
+    MatrixMap partial_contract2(
+        (Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), bond_dim,
+        2 * bond_dim);
+    partial_contract.noalias() = top.adjoint() * bot;
+
+    // Contract all internal blocks together.
+    for (unsigned i = 1; i < num_qubits - 1; ++i) {
+      offset = GetBlockOffset(state1, i);
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim),
+                    bond_dim, 2 * bond_dim);
+
+      // Merge bot into left boundary merged tensor.
+      new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim,
+                                2 * bond_dim);
+      partial_contract2.noalias() = partial_contract * bot;
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim),
+                    2 * bond_dim, bond_dim);
+
+      // Merge top into partial_contract2.
+      new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim,
+                                bond_dim);
+      partial_contract.noalias() = top.adjoint() * partial_contract2;
+    }
+
+    // Contract rightmost bottom block.
+    offset = GetBlockOffset(state1, num_qubits - 1);
+    new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, 2);
+    new (&partial_contract2) MatrixMap(
+        (Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), bond_dim, 2);
+    partial_contract2.noalias() = partial_contract * bot;
+
+    // Contract rightmost top block.
+    new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, 1);
+    new (&partial_contract) MatrixMap((Complex*)(state1_raw + end), 1, 1);
+    new (&partial_contract2)
+        MatrixMap((Complex*)(state1_raw + end + 4 * bond_dim * bond_dim),
+                  2 * bond_dim, 1);
+    partial_contract.noalias() = top.adjoint() * partial_contract2;
+
+    return partial_contract(0, 0);
+  }
+
+  // Compute the 2x2 1-RDM of state on index. Result written to rdm, which is
+  // filled as a 2x2 complex matrix. Requires: scratch and rdm to be allocated.
+  static void ReduceDensityMatrix(MPS& state, MPS& scratch, int index,
+                                  fp_type* rdm) {
+    const auto num_qubits = state.num_qubits();
+    const auto bond_dim = state.bond_dim();
+    const auto end = Size(state);
+    const bool last_index = (index == num_qubits - 1);
+    const auto right_dim = (last_index ? 1 : bond_dim);
+    auto offset = 0;
+    fp_type* state_raw = state.get();
+    fp_type* scratch_raw = scratch.get();
+    fp_type* state_raw_workspace = state_raw + end + 2 * bond_dim * bond_dim;
+    fp_type* scratch_raw_workspace =
+        scratch_raw + end + 2 * bond_dim * bond_dim;
+
+    Copy(state, scratch);
+
+    // Contract leftmost blocks together, store result in state scratch.
+    ConstMatrixMap top((Complex*)scratch_raw, 2, bond_dim);
+    ConstMatrixMap bot((Complex*)state_raw, 2, bond_dim);
+    MatrixMap partial_contract((Complex*)(state_raw + end), bond_dim, bond_dim);
+    MatrixMap partial_contract2((Complex*)(state_raw_workspace), bond_dim,
+                                2 * bond_dim);
+
+    partial_contract.setZero();
+    partial_contract(0, 0) = 1;
+    if (index > 0) {
+      partial_contract.noalias() = top.adjoint() * bot;
+    }
+
+    // Contract all internal blocks together.
+    for (unsigned i = 1; i < index; ++i) {
+      offset = GetBlockOffset(state, i);
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state_raw_workspace), bond_dim, 2 * bond_dim);
+
+      // Merge bot into left boundary merged tensor.
+      new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
+                                2 * bond_dim);
+      partial_contract2.noalias() = partial_contract * bot;
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state_raw_workspace), 2 * bond_dim, bond_dim);
+
+      // Merge top into partial_contract2.
+      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
+                                bond_dim);
+      partial_contract.noalias() = top.adjoint() * partial_contract2;
+    }
+
+    // The [bond_dim, bond_dim] block in state_raw now contains the contraction
+    // up to, but not including index.
+    // Contract rightmost blocks.
+    offset = GetBlockOffset(state, num_qubits - 1);
+    new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, 2);
+    new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2);
+    new (&partial_contract)
+        MatrixMap((Complex*)(scratch_raw + end), bond_dim, bond_dim);
+    new (&partial_contract2)
+        MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim);
+
+    partial_contract.setZero();
+    partial_contract(0, 0) = 1;
+    if (index < num_qubits - 1) {
+      partial_contract.noalias() = top * bot.adjoint();
+    }
+
+    for (unsigned i = num_qubits - 2; i > index; --i) {
+      offset = GetBlockOffset(state, i);
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(scratch_raw_workspace), 2 * bond_dim, bond_dim);
+
+      // Merge bot into right boundary merged tensor.
+      new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), 2 * bond_dim,
+                                bond_dim);
+      partial_contract2.noalias() = bot * partial_contract.adjoint();
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim);
+
+      // Merge top into partial_contract2.
+      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim,
+                                2 * bond_dim);
+      // [bd, bd] = [bd, 2bd] @ adjoint([bd, 2bd])
+      partial_contract.noalias() = top * partial_contract2.adjoint();
+    }
+
+    // The [bond_dim, bond_dim] block in scratch_raw now contains the
+    // contraction down from the end, but not including the index. Begin final
+    // contraction steps.
+
+    // Get leftmost [bd, bd] contraction and contract with top.
+
+    offset = GetBlockOffset(state, index);
+    new (&partial_contract)
+        MatrixMap((Complex*)(state_raw + end), bond_dim, bond_dim);
+    new (&top)
+        ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2 * right_dim);
+    new (&partial_contract2)
+        MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * right_dim);
+    partial_contract2.noalias() = partial_contract * top.conjugate();
+    // copy the bottom contraction scratch_raw to state_raw to save space.
+    memcpy(state_raw + end, scratch_raw + end,
+           bond_dim * bond_dim * 2 * sizeof(fp_type));
+
+    // Contract top again for correct shape.
+    fp_type* contract3_target = (last_index ? rdm : scratch_raw);
+    MatrixMap partial_contract3((Complex*)contract3_target, 2 * right_dim,
+                                2 * right_dim);
+    partial_contract3.noalias() = top.transpose() * partial_contract2;
+
+    // If we are contracting the last index, all the needed transforms are done.
+    if (last_index) {
+      return;
+    }
+
+    // Conduct final tensor contraction operations. Cannot be easily compiled to
+    // matmul.
+    const Eigen::TensorMap<const Eigen::Tensor<Complex, 4, Eigen::RowMajor>>
+        t_4d((Complex*)scratch_raw, 2, bond_dim, 2, bond_dim);
+    const Eigen::TensorMap<const Eigen::Tensor<Complex, 2, Eigen::RowMajor>>
+        t_2d((Complex*)(state_raw + end), bond_dim, bond_dim);
+
+    const Eigen::array<Eigen::IndexPair<int>, 2> product_dims = {
+        Eigen::IndexPair<int>(1, 0),
+        Eigen::IndexPair<int>(3, 1),
+    };
+    Eigen::TensorMap<Eigen::Tensor<Complex, 2, Eigen::RowMajor>> out(
+        (Complex*)rdm, 2, 2);
+    out = t_4d.contract(t_2d, product_dims);
+  }
+
+  // Draw a single bitstring sample from state, appending one bit per qubit to
+  // sample. Uses scratch and scratch2 as working space.
+  static void SampleOnce(MPS& state, MPS& scratch, MPS& scratch2,
+                         std::mt19937* random_gen, std::vector<bool>* sample) {
+    // TODO: carefully profile with perf and optimize temp storage
+    //  locations for cache friendliness.
+    const auto bond_dim = state.bond_dim();
+    const auto num_qubits = state.num_qubits();
+    const auto end = Size(state);
+    const auto left_frontier_offset = GetBlockOffset(state, num_qubits + 1);
+    std::default_random_engine generator;  // NOTE: unused in this function.
+    fp_type* state_raw = state.get();
+    fp_type* scratch_raw = scratch.get();
+    fp_type* scratch2_raw = scratch2.get();
+    fp_type rdm[8];
+
+    sample->reserve(num_qubits);
+    Copy(state, scratch);
+    Copy(state, scratch2);
+
+    // Store prefix contractions in scratch2.
+    auto offset = GetBlockOffset(state, num_qubits - 1);
+    ConstMatrixMap top((Complex*)(state_raw + offset), bond_dim, 2);
+    ConstMatrixMap bot((Complex*)(scratch_raw + offset), bond_dim, 2);
+    MatrixMap partial_contract((Complex*)(scratch2_raw + offset), bond_dim,
+                               bond_dim);
+    MatrixMap partial_contract2((Complex*)(scratch_raw + end), bond_dim,
+                                2 * bond_dim);
+    partial_contract.noalias() = top * bot.adjoint();
+
+    for (unsigned i = num_qubits - 2; i > 0; --i) {
+      offset = GetBlockOffset(state, i);
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(scratch_raw + end), 2 * bond_dim, bond_dim);
+
+      // Merge bot into right boundary merged tensor.
+      new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
+                                bond_dim);
+      partial_contract2.noalias() = bot * partial_contract.adjoint();
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(scratch_raw + end), bond_dim, 2 * bond_dim);
+
+      // Merge top into partial_contract2.
+      new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
+                                2 * bond_dim);
+
+      // merge into partial_contract -> scratch2_raw.
+      new (&partial_contract)
+          MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
+      partial_contract.noalias() = top * partial_contract2.adjoint();
+    }
+
+    // Compute RDM-0 and draw first sample.
+    offset = GetBlockOffset(state, 1);
+    new (&top) ConstMatrixMap((Complex*)state_raw, 2, bond_dim);
+    new (&bot) ConstMatrixMap((Complex*)scratch_raw, 2, bond_dim);
+    new (&partial_contract)
+        MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
+    new (&partial_contract2)
+        MatrixMap((Complex*)(scratch_raw + end), 2, bond_dim);
+
+    partial_contract2.noalias() = bot * partial_contract.adjoint();
+
+    new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2);
+    partial_contract.noalias() = top * partial_contract2.adjoint();
+    auto p0 = rdm[0] / (rdm[0] + rdm[6]);
+    std::bernoulli_distribution distribution(1 - p0);
+    auto bit_val = distribution(*random_gen);
+    sample->push_back(bit_val);
+
+    // collapse state.
+    new (&partial_contract) MatrixMap((Complex*)scratch_raw, 2, bond_dim);
+    partial_contract.row(!bit_val).setZero();
+
+    // Prepare left contraction frontier.
+    new (&partial_contract2) MatrixMap(
+        (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
+    partial_contract2.noalias() =
+        partial_contract.transpose() * partial_contract.conjugate();
+
+    // Compute RDM-i and draw internal tensor samples.
+    for (unsigned i = 1; i < num_qubits - 1; i++) {
+      // Get leftmost [bd, bd] contraction and contract with top.
+      offset = GetBlockOffset(state, i);
+      new (&partial_contract) MatrixMap(
+          (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
+      new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
+                                2 * bond_dim);
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim);
+      partial_contract2.noalias() = partial_contract * top.conjugate();
+
+      // Contract top again for correct shape.
+      MatrixMap partial_contract3((Complex*)(scratch_raw + end), 2 * bond_dim,
+                                  2 * bond_dim);
+      partial_contract3.noalias() = top.transpose() * partial_contract2;
+
+      // Conduct final tensor contraction operations. Cannot be easily compiled
+      // to matmul. Perf reports shows only ~6% of runtime spent here on large
+      // systems.
+      offset = GetBlockOffset(state, i + 1);
+      const Eigen::TensorMap<const Eigen::Tensor<Complex, 4, Eigen::RowMajor>>
+          t_4d((Complex*)(scratch_raw + end), 2, bond_dim, 2, bond_dim);
+      const Eigen::TensorMap<const Eigen::Tensor<Complex, 2, Eigen::RowMajor>>
+          t_2d((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
+
+      const Eigen::array<Eigen::IndexPair<int>, 2> product_dims = {
+          Eigen::IndexPair<int>(1, 0),
+          Eigen::IndexPair<int>(3, 1),
+      };
+      Eigen::TensorMap<Eigen::Tensor<Complex, 2, Eigen::RowMajor>> out(
+          (Complex*)rdm, 2, 2);
+      out = t_4d.contract(t_2d, product_dims);
+
+      // Sample bit and collapse state.
+      p0 = rdm[0] / (rdm[0] + rdm[6]);
+      distribution = std::bernoulli_distribution(1 - p0);
+      bit_val = distribution(*random_gen);
+
+      sample->push_back(bit_val);
+      offset = GetBlockOffset(state, i);
+      new (&partial_contract)
+          MatrixMap((Complex*)(scratch_raw + offset), bond_dim * 2, bond_dim);
+      for (unsigned j = !bit_val; j < 2 * bond_dim; j += 2) {
+        partial_contract.row(j).setZero();
+      }
+
+      // Update left frontier.
+      new (&partial_contract) MatrixMap(
+          (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim);
+
+      // Merge bot into left boundary merged tensor.
+      new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim,
+                                2 * bond_dim);
+      partial_contract2.noalias() = partial_contract * bot.conjugate();
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state_raw + end), 2 * bond_dim, bond_dim);
+
+      // Merge top into partial_contract2.
+      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
+                                bond_dim);
+      partial_contract.noalias() = top.transpose() * partial_contract2;
+    }
+
+    // Compute RDM-(n-1) and sample.
+    offset = GetBlockOffset(state, num_qubits - 1);
+    new (&partial_contract2)
+        MatrixMap((Complex*)(state_raw + end), bond_dim, 2);
+
+    new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2);
+    partial_contract2.noalias() = partial_contract * top.conjugate();
+    new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2);
+    partial_contract.noalias() = top.transpose() * partial_contract2;
+
+    p0 = rdm[0] / (rdm[0] + rdm[6]);
+    distribution = std::bernoulli_distribution(1 - p0);
+    bit_val = distribution(*random_gen);
+    sample->push_back(bit_val);
+  }
+
+  // Draws num_samples samples into results; scratch/scratch2 are workspace.
+  static void Sample(MPS& state, MPS& scratch, MPS& scratch2,
+                     unsigned num_samples, unsigned seed,
+                     std::vector<std::vector<bool>>* results) {
+    std::mt19937 rand_source(seed);
+    // Elements must exist before indexing; reserve() alone does not add any.
+    if (results->size() < num_samples) results->resize(num_samples);
+    for (unsigned i = 0; i < num_samples; i++) {
+      SampleOnce(state, scratch, scratch2, &rand_source, &(*results)[i]);
+    }
+  }
+
+  // Testing only. Convert the MPS to a wavefunction under "normal" ordering.
+  // Requires: wf be allocated beforehand with bond_dim * 2 ^ num_qubits -1
+  // memory.
+  static void ToWaveFunction(MPS& state, fp_type* wf) {
+    const auto bond_dim = state.bond_dim();
+    const auto num_qubits = state.num_qubits();
+    fp_type* raw_state = state.get();
+
+    ConstMatrixMap accum = ConstMatrixMap((Complex*)(raw_state), 2, bond_dim);
+    ConstMatrixMap next_block = ConstMatrixMap(nullptr, 0, 0);
+    MatrixMap result2 = MatrixMap(nullptr, 0, 0);
+    auto offset = 0;
+    auto result2_size = 2;
+
+    for (unsigned i = 1; i < num_qubits - 1; i++) {
+      offset = GetBlockOffset(state, i);
+      // placement new only re-points the maps; no matrix data is copied.
+      new (&next_block) ConstMatrixMap((Complex*)(raw_state + offset), bond_dim,
+                                       2 * bond_dim);
+      new (&result2) MatrixMap((Complex*)(wf), result2_size, 2 * bond_dim);
+
+      // no noalias(): a temporary is needed as result2 and accum can overlap.
+      result2 = accum * next_block;
+      result2_size *= 2;
+      new (&accum) ConstMatrixMap((Complex*)(wf), result2_size, bond_dim);
+    }
+    offset = GetBlockOffset(state, num_qubits - 1);
+    new (&next_block)
+        ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, 2);
+    new (&result2) MatrixMap((Complex*)(wf), result2_size, 2);
+    result2 = accum * next_block;
+  }
+
+ protected:
+  For for_;
+};
+
+}  // namespace mps
+}  // namespace qsim
+
+#endif  // MPS_STATESPACE_H_
diff --git a/qsim/parfor.h b/qsim/parfor.h
new file mode 100644
index 0000000..8a3a4d6
--- /dev/null
+++ b/qsim/parfor.h
@@ -0,0 +1,123 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PARFOR_H_
+#define PARFOR_H_
+
+#include <omp.h>
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+namespace qsim {
+
+/**
+ * Helper struct for executing for-loops in parallel across multiple threads.
+ * Loops shorter than MIN_SIZE (or with a single thread) run sequentially.
+ */
+template <uint64_t MIN_SIZE>
+struct ParallelForT {
+  explicit ParallelForT(unsigned num_threads) : num_threads(num_threads) {}
+
+  // GetIndex0 and GetIndex1 are useful when we need to know how work was
+  // divided between threads, for instance, for reusing partial sums obtained
+  // by RunReduceP. Chunk thread_id covers [GetIndex0, GetIndex1).
+  uint64_t GetIndex0(uint64_t size, unsigned thread_id) const {
+    return size >= MIN_SIZE ? size * thread_id / num_threads : 0;
+  }
+
+  uint64_t GetIndex1(uint64_t size, unsigned thread_id) const {
+    return size >= MIN_SIZE ? size * (thread_id + 1) / num_threads : size;
+  }
+
+  // Calls func(n, m, i, args...) once for each i in [0, size), where n is the
+  // number of chunks and m the chunk containing i (n = 1, m = 0 if sequential).
+  template <typename Function, typename... Args>
+  void Run(uint64_t size, Function&& func, Args&&... args) const {
+    if (num_threads > 1 && size >= MIN_SIZE) {
+      #pragma omp parallel num_threads(num_threads)
+      {
+        // OpenMP may deliver fewer threads than requested, so each thread
+        // strides over chunk ids instead of assuming one chunk per thread.
+        unsigned n = omp_get_num_threads();
+        for (unsigned k = omp_get_thread_num(); k < num_threads; k += n) {
+          uint64_t i1 = GetIndex1(size, k);
+          for (uint64_t i = GetIndex0(size, k); i < i1; ++i) {
+            func(num_threads, k, i, args...);
+          }
+        }
+      }
+    } else {
+      for (uint64_t i = 0; i < size; ++i) {
+        func(1, 0, i, args...);
+      }
+    }
+  }
+
+  // Like Run, but folds func's results with op (starting from 0) chunk by
+  // chunk; returns one partial result per chunk (none if num_threads is 0).
+  template <typename Function, typename Op, typename... Args>
+  std::vector<typename Op::result_type> RunReduceP(
+      uint64_t size, Function&& func, Op&& op, Args&&... args) const {
+    std::vector<typename Op::result_type> partial_results;
+
+    if (num_threads > 1 && size >= MIN_SIZE) {
+      partial_results.resize(num_threads, 0);
+
+      #pragma omp parallel num_threads(num_threads)
+      {
+        // Stride over chunk ids: the team may be smaller than num_threads.
+        unsigned n = omp_get_num_threads();
+        for (unsigned k = omp_get_thread_num(); k < num_threads; k += n) {
+          uint64_t i1 = GetIndex1(size, k);
+          typename Op::result_type acc = 0;
+          for (uint64_t i = GetIndex0(size, k); i < i1; ++i) {
+            acc = op(acc, func(num_threads, k, i, args...));
+          }
+          partial_results[k] = acc;
+        }
+      }
+    } else if (num_threads > 0) {
+      typename Op::result_type result = 0;
+      for (uint64_t i = 0; i < size; ++i) {
+        result = op(result, func(1, 0, i, args...));
+      }
+      partial_results.resize(1, result);
+    }
+
+    return partial_results;
+  }
+
+  // Reduces all partial results of RunReduceP into a single value with op.
+  // op is only cast to an rvalue below (never moved from), so reuse is safe.
+  template <typename Function, typename Op, typename... Args>
+  typename Op::result_type RunReduce(uint64_t size, Function&& func,
+                                     Op&& op, Args&&... args) const {
+    auto partial_results = RunReduceP(size, func, std::move(op), args...);
+    typename Op::result_type result = 0;
+    for (auto partial_result : partial_results) {
+      result = op(result, partial_result);
+    }
+    return result;
+  }
+
+  unsigned num_threads;
+};
+
+using ParallelFor = ParallelForT<1024>;
+
+}  // namespace qsim
+
+#endif  // PARFOR_H_
diff --git a/qsim/qtrajectory.h b/qsim/qtrajectory.h
new file mode 100644
index 0000000..1da6692
--- /dev/null
+++ b/qsim/qtrajectory.h
@@ -0,0 +1,435 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef QTRAJECTORY_H_
+#define QTRAJECTORY_H_
+
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <random>
+#include <vector>
+
+#include "circuit_noisy.h"
+#include "gate.h"
+#include "gate_appl.h"
+
+namespace qsim {
+
+/**
+ * Quantum trajectory simulator.
+ */
+template <typename IO, typename Gate,
+          template <typename, typename> class FuserT, typename Simulator,
+          typename RGen = std::mt19937>
+class QuantumTrajectorySimulator {
+ public:
+  using Fuser = FuserT<IO, const Gate*>;
+  using StateSpace = typename Simulator::StateSpace;
+  using State = typename Simulator::State;
+  using MeasurementResult = typename StateSpace::MeasurementResult;
+
+  /**
+   * User-specified parameters for the simulator.
+   */
+  struct Parameter : public Fuser::Parameter {
+    /**
+     * If true, collect statistics of sampled Kraus operator indices.
+     */
+    bool collect_kop_stat = false;
+    /**
+     * If true, collect statistics of measured bitstrings.
+     */
+    bool collect_mea_stat = false;
+    /**
+     * If true, normalize the state vector before performing measurements.
+     */
+    bool normalize_before_mea_gates = true;
+    /**
+     * If false, do not apply deferred operators after the main loop for
+     * the "primary" noise trajectory, that is the trajectory in which
+     * the primary (the first operators in their respective channels) Kraus
+     * operators are sampled for each channel and there are no measurements
+     * in the computational basis. This can be used to speed up simulations
+     * of circuits with weak noise and without measurements by reusing
+     * the primary trajectory results. There is an additional condition for
+     * RunBatch. In this case, the deferred operators after the main loop are
+     * still applied for the first occurrence of the primary trajectory.
+     * The primary Kraus operators should have the highest sampling
+     * probabilities to achieve the highest speedup.
+     *
+     * It is the client's responsibility to collect the primary trajectory
+     * results and to reuse them.
+     */
+    bool apply_last_deferred_ops = true;
+  };
+
+  /**
+   * Struct with statistics to populate by RunBatch and RunOnce methods.
+   */
+  struct Stat {
+    /**
+     * Sampled Kraus operator indices and/or measured bitstrings.
+     */
+    std::vector<uint64_t> samples;
+    /**
+     * True if the "primary" noise trajectory is sampled, false otherwise.
+     */
+    bool primary;  // no initializer; assigned by RunIteration
+  };
+
+  /**
+   * Convenience overload of RunBatch that takes a whole noisy circuit and
+   * forwards its qubit count and channel range. Each repetition is seeded by
+   * its repetition ID.
+   * @param param Options for the quantum trajectory simulator.
+   * @param circuit The noisy circuit to be simulated.
+   * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions.
+   * @param state_space StateSpace object required to manipulate state vector.
+   * @param simulator Simulator object. Provides specific implementations for
+   *   applying gates.
+   * @param measure Function that performs measurements (in the sense of
+   *   computing expectation values, etc). It is invoked once per repetition
+   *   with the repetition ID (uint64_t), the final state vector (const State&)
+   *   and the collected statistics (const Stat&), followed by 'args'.
+   * @param args Optional arguments for the 'measure' function.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename MeasurementFunc, typename... Args>
+  static bool RunBatch(const Parameter& param,
+                       const NoisyCircuit<Gate>& circuit,
+                       uint64_t r0, uint64_t r1, const StateSpace& state_space,
+                       const Simulator& simulator, MeasurementFunc&& measure,
+                       Args&&... args) {
+    const auto& channels = circuit.channels;
+    return RunBatch(param, circuit.num_qubits, channels.begin(), channels.end(),
+                    r0, r1, state_space, simulator, measure, args...);
+  }
+
+  /**
+   * Runs repetitions r0, ..., r1 - 1 of the noisy circuit given as a range
+   * of channels. Each repetition is seeded by its repetition ID.
+   * @param param Options for the quantum trajectory simulator.
+   * @param num_qubits The number of qubits acted on by the circuit.
+   * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit.
+   * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions.
+   * @param state_space StateSpace object required to manipulate state vector.
+   * @param simulator Simulator object. Provides specific implementations for
+   *   applying gates.
+   * @param measure Function that performs measurements (in the sense of
+   *   computing expectation values, etc). This function should have three
+   *   required parameters [repetition ID (uint64_t), final state vector
+   *   (const State&), statistics of sampled Kraus operator indices and/or
+   *   measured bitstrings (const Stat&)] and any number of optional parameters.
+   * @param args Optional arguments for the 'measure' function.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename MeasurementFunc, typename... Args>
+  static bool RunBatch(const Parameter& param, unsigned num_qubits,
+                       ncircuit_iterator<Gate> cbeg,
+                       ncircuit_iterator<Gate> cend,
+                       uint64_t r0, uint64_t r1, const StateSpace& state_space,
+                       const Simulator& simulator, MeasurementFunc&& measure,
+                       Args&&... args) {
+    // Scratch list of deferred gate pointers, shared by all repetitions.
+    std::vector<const Gate*> deferred;
+    deferred.reserve(4 * std::size_t(cend - cbeg));
+
+    Stat stat;
+    State state = state_space.Null();
+    bool seen_primary = false;
+
+    for (uint64_t rep = r0; rep < r1; ++rep) {
+      // RunIteration allocates the state while it is null; once allocated it
+      // is reset here before every repetition.
+      if (!state_space.IsNull(state)) {
+        state_space.SetStateZero(state);
+      }
+
+      // Skipping the last deferred ops requires an earlier primary trajectory.
+      const bool apply_last = param.apply_last_deferred_ops || !seen_primary;
+
+      const bool ok = RunIteration(param, apply_last, num_qubits, cbeg, cend,
+                                   rep, state_space, simulator, deferred, state,
+                                   stat);
+      if (!ok) return false;
+
+      seen_primary = seen_primary || stat.primary;
+
+      measure(rep, state, stat, args...);
+    }
+
+    return true;
+  }
+
+  /**
+   * Runs a single repetition of the given noisy circuit.
+   * @param param Options for the quantum trajectory simulator.
+   * @param circuit The noisy circuit to be simulated.
+   * @param r The repetition ID. The random number generator is seeded by 'r'.
+   * @param state_space StateSpace object required to manipulate state vector.
+   * @param simulator Simulator object. Provides specific implementations for
+   *   applying gates.
+   * @param state The state of the system, to be updated by this method.
+   * @param stat Statistics of sampled Kraus operator indices and/or measured
+   *   bitstrings, to be populated by this method.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  static bool RunOnce(const Parameter& param,
+                      const NoisyCircuit<Gate>& circuit, uint64_t r,
+                      const StateSpace& state_space, const Simulator& simulator,
+                      State& state, Stat& stat) {
+    const auto& channels = circuit.channels;
+    return RunOnce(param, circuit.num_qubits, channels.begin(), channels.end(),
+                   r, state_space, simulator, state, stat);
+  }
+
+  /**
+   * Runs a single repetition of a noisy circuit given as a range of channels.
+   * @param param Options for the quantum trajectory simulator.
+   * @param num_qubits The number of qubits acted on by the circuit.
+   * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit.
+   * @param r The repetition ID. The random number generator is seeded by 'r'.
+   * @param state_space StateSpace object required to manipulate state vector.
+   * @param simulator Simulator object. Provides specific implementations for
+   *   applying gates.
+   * @param state The state of the system, to be updated by this method. A
+   *   null state is allocated and set to the zero state first.
+   * @param stat Statistics of sampled Kraus operator indices and/or measured
+   *   bitstrings, to be populated by this method.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  static bool RunOnce(const Parameter& param, unsigned num_qubits,
+                      ncircuit_iterator<Gate> cbeg,
+                      ncircuit_iterator<Gate> cend,
+                      uint64_t r, const StateSpace& state_space,
+                      const Simulator& simulator, State& state, Stat& stat) {
+    // Scratch list for deferred gate pointers; sized as in RunBatch.
+    std::vector<const Gate*> deferred;
+    deferred.reserve(4 * std::size_t(cend - cbeg));
+
+    // Unlike RunBatch, the caller's apply_last_deferred_ops is used as is.
+    const bool apply_last = param.apply_last_deferred_ops;
+
+    return RunIteration(param, apply_last, num_qubits, cbeg, cend, r,
+                        state_space, simulator, deferred, state, stat);
+  }
+
+ private:
+  static bool RunIteration(const Parameter& param,
+                           bool apply_last_deferred_ops, unsigned num_qubits,
+                           ncircuit_iterator<Gate> cbeg,
+                           ncircuit_iterator<Gate> cend,
+                           uint64_t rep, const StateSpace& state_space,
+                           const Simulator& simulator,
+                           std::vector<const Gate*>& gates,
+                           State& state, Stat& stat) {  // runs repetition rep
+    if (param.collect_kop_stat || param.collect_mea_stat) {
+      stat.samples.reserve(std::size_t(cend - cbeg));
+      stat.samples.resize(0);
+    }
+    // A null state is allocated here and set to the zero state.
+    if (state_space.IsNull(state)) {
+      state = CreateState(num_qubits, state_space);
+      if (state_space.IsNull(state)) {
+        return false;
+      }
+
+      state_space.SetStateZero(state);
+    }
+
+    gates.resize(0);
+    // The random number generator is seeded by the repetition ID.
+    RGen rgen(rep);
+    std::uniform_real_distribution<double> distr(0.0, 1.0);
+    // unitary: only unitary Kraus ops deferred since the last normalization.
+    bool unitary = true;
+    stat.primary = true;
+
+    for (auto it = cbeg; it != cend; ++it) {
+      const auto& channel = *it;
+
+      if (channel.size() == 0) continue;
+
+      if (channel[0].kind == gate::kMeasurement) {
+        // Measurement channel.
+        // Flush the deferred gates before measuring.
+        if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
+          return false;
+        }
+
+        bool normalize = !unitary && param.normalize_before_mea_gates;
+        NormalizeState(normalize, state_space, unitary, state);
+
+        auto mresult = ApplyMeasurementGate(state_space, channel[0].ops[0],
+                                            rgen, state);
+
+        if (!mresult.valid) {
+          return false;
+        }
+
+        CollectStat(param.collect_mea_stat, mresult.bits, stat);
+        // A trajectory containing a measurement is never primary.
+        stat.primary = false;
+
+        continue;
+      }
+
+      // "Normal" channel.
+
+      double r = distr(rgen);
+      double cp = 0;
+
+      // Perform sampling of Kraus operators using probability bounds.
+      for (std::size_t i = 0; i < channel.size(); ++i) {
+        const auto& kop = channel[i];
+
+        cp += kop.prob;
+
+        if (r < cp) {
+          DeferOps(kop.ops, gates);
+          CollectStat(param.collect_kop_stat, i, stat);
+
+          unitary = unitary && kop.unitary;
+
+          break;
+        }
+      }
+      // An operator was sampled from the bounds: go to the next channel.
+      if (r < cp) continue;
+      // Otherwise actual probabilities are needed: update the state first.
+      if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
+        return false;
+      }
+
+      NormalizeState(!unitary, state_space, unitary, state);
+
+      double max_prob = 0;
+      std::size_t max_prob_index = 0;
+
+      // Perform sampling of Kraus operators using norms of updated states.
+      for (std::size_t i = 0; i < channel.size(); ++i) {
+        const auto& kop = channel[i];
+
+        if (kop.unitary) continue;
+
+        double prob = std::real(
+            simulator.ExpectationValue(kop.qubits, kop.kd_k.data(), state));
+
+        if (prob > max_prob) {
+          max_prob = prob;
+          max_prob_index = i;
+        }
+        // Replace the bound by the computed probability in the running sum.
+        cp += prob - kop.prob;
+        // NOTE(review): fallback is skipped if the last operator is unitary.
+        if (r < cp || i == channel.size() - 1) {
+          // Sample ith Kraus operator if r < cp
+          // Sample the highest probability Kraus operator if r is greater
+          // than the sum of all probabilities due to round-off errors.
+          uint64_t k = r < cp ? i : max_prob_index;
+
+          DeferOps(channel[k].ops, gates);
+          CollectStat(param.collect_kop_stat, k, stat);
+
+          unitary = false;
+
+          break;
+        }
+      }
+    }
+    // The final flush may be skipped only for a primary trajectory.
+    if (apply_last_deferred_ops || !stat.primary) {
+      if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
+        return false;
+      }
+
+      NormalizeState(!unitary, state_space, unitary, state);
+    }
+
+    return true;
+  }
+
+  // Allocates a state for num_qubits qubits. On failure an error is logged
+  // and a null state is returned.
+  static State CreateState(unsigned num_qubits, const StateSpace& state_space) {
+    auto fresh = state_space.Create(num_qubits);
+    if (!state_space.IsNull(fresh)) return fresh;
+
+    IO::errorf("not enough memory: is the number of qubits too large?\n");
+    return state_space.Null();
+  }
+
+  // Fuses the deferred gates and applies them to state. The list is emptied.
+  // Returns false if fusing a non-empty list produces no fused gates.
+  static bool ApplyDeferredOps(
+      const Parameter& param, unsigned num_qubits, const Simulator& simulator,
+      std::vector<const Gate*>& gates, State& state) {
+    if (gates.empty()) return true;
+
+    auto fused = Fuser::FuseGates(param, num_qubits, gates);
+    // The list is consumed whether or not fusion succeeded.
+    gates.clear();
+
+    if (fused.size() == 0) return false;
+
+    for (const auto& fg : fused) {
+      ApplyFusedGate(simulator, fg, state);
+    }
+
+    return true;
+  }
+
+  // Measures gate.qubits on state using rgen. An invalid result is logged
+  // and still handed back, so callers must check result.valid.
+  static MeasurementResult ApplyMeasurementGate(
+      const StateSpace& state_space, const Gate& gate,
+      RGen& rgen, State& state) {
+    auto outcome = state_space.Measure(gate.qubits, rgen, state);
+    if (outcome.valid) return outcome;
+
+    IO::errorf("measurement failed.\n");
+    return outcome;
+  }
+
+  // Appends a pointer to every element of ops to gates. Only pointers are
+  // stored, so ops has to stay alive while gates refers to it.
+  static void DeferOps(
+      const std::vector<Gate>& ops, std::vector<const Gate*>& gates) {
+    for (auto it = ops.begin(); it != ops.end(); ++it) gates.push_back(&*it);
+  }
+
+  // Appends i to stat.samples when collect_stat is set. Independently of
+  // that, any non-zero i marks the trajectory as non-primary.
+  static void CollectStat(bool collect_stat, uint64_t i, Stat& stat) {
+    if (collect_stat) {
+      stat.samples.push_back(i);
+    }
+    if (i == 0) return;
+    stat.primary = false;
+  }
+
+  // When normalize is set: scales state by 1 / sqrt(Norm) and sets flag.
+  static void NormalizeState(bool normalize, const StateSpace& state_space,
+                             bool& flag, State& state) {
+    if (!normalize) return;
+    double scale = 1.0 / std::sqrt(state_space.Norm(state));
+    state_space.Multiply(scale, state);
+    flag = true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // QTRAJECTORY_H_
diff --git a/qsim/run_qsim.h b/qsim/run_qsim.h
new file mode 100644
index 0000000..3752915
--- /dev/null
+++ b/qsim/run_qsim.h
@@ -0,0 +1,262 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef RUN_QSIM_H_
+#define RUN_QSIM_H_
+
+#include <random>
+#include <string>
+#include <vector>
+
+#include "gate.h"
+#include "gate_appl.h"
+#include "util.h"
+
+namespace qsim {
+
+/**
+ * Helper struct for running qsim.
+ */
+template <typename IO, typename Fuser, typename Factory,
+          typename RGen = std::mt19937>
+struct QSimRunner final {
+ public:
+  using Simulator = typename Factory::Simulator;
+  using StateSpace = typename Simulator::StateSpace;
+  using State = typename StateSpace::State;
+  using MeasurementResult = typename StateSpace::MeasurementResult;
+
+  /**
+   * User-specified parameters for gate fusion and simulation.
+   */
+  struct Parameter : public Fuser::Parameter {
+    /**
+     * Random number generator seed to apply measurement gates (default 0).
+     */
+    uint64_t seed = 0;
+  };
+
+  /**
+   * Runs the given circuit, measuring only at the end (time 0 if no gates).
+   * @param param Options for gate fusion, parallelism and logging.
+   * @param factory Object to create simulators and state spaces.
+   * @param circuit The circuit to be simulated.
+   * @param measure Callback performing measurements (expectation values, etc).
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit, typename MeasurementFunc>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const Circuit& circuit, MeasurementFunc measure) {
+    const unsigned t = circuit.gates.empty() ? 0 : circuit.gates.back().time;
+    return Run(param, factory, {t}, circuit, measure);
+  }
+
+  /**
+   * Runs the given circuit, measuring at user-specified times.
+   * @param param Options for gate fusion, parallelism and logging.
+   * @param factory Object to create simulators and state spaces.
+   * @param times_to_measure_at Time steps at which to perform measurements.
+   * @param circuit The circuit to be simulated.
+   * @param measure Function that performs measurements (expectation values,
+   *   etc). Called as measure(k, state_space, state), k = callbacks so far.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit, typename MeasurementFunc>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const std::vector<unsigned>& times_to_measure_at,
+                  const Circuit& circuit, MeasurementFunc measure) {
+    double t0 = 0.0;
+    double t1 = 0.0;
+
+    if (param.verbosity > 1) {
+      t0 = GetTime();
+    }
+
+    RGen rgen(param.seed);
+
+    StateSpace state_space = factory.CreateStateSpace();
+
+    auto state = state_space.Create(circuit.num_qubits);
+    if (state_space.IsNull(state)) {
+      IO::errorf("not enough memory: is the number of qubits too large?\n");
+      return false;
+    }
+
+    state_space.SetStateZero(state);
+    Simulator simulator = factory.CreateSimulator();
+
+    if (param.verbosity > 1) {
+      t1 = GetTime();
+      IO::messagef("init time is %g seconds.\n", t1 - t0);
+      t0 = GetTime();
+    }
+
+    auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits,
+                                        circuit.gates, times_to_measure_at);
+
+    if (fused_gates.size() == 0 && circuit.gates.size() > 0) {
+      return false;
+    }
+
+    if (param.verbosity > 1) {
+      t1 = GetTime();
+      IO::messagef("fuse time is %g seconds.\n", t1 - t0);
+    }
+
+    if (param.verbosity > 0) {
+      t0 = GetTime();
+    }
+
+    unsigned cur_time_index = 0;
+
+    // Apply fused gates.
+    for (std::size_t i = 0; i < fused_gates.size(); ++i) {
+      if (param.verbosity > 3) {
+        t1 = GetTime();
+      }
+
+      if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen,
+                          state)) {
+        IO::errorf("measurement failed.\n");
+        return false;
+      }
+
+      if (param.verbosity > 3) {
+        state_space.DeviceSync();
+        double t2 = GetTime();
+        IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1);
+      }
+
+      // Guard the lookup: all requested times may already have been consumed.
+      if (i == fused_gates.size() - 1 ||
+          (cur_time_index < times_to_measure_at.size() &&
+           times_to_measure_at[cur_time_index] < fused_gates[i + 1].time)) {
+        measure(cur_time_index, state_space, state);
+        ++cur_time_index;
+      }
+    }
+
+    if (param.verbosity > 0) {
+      state_space.DeviceSync();
+      double t2 = GetTime();
+      IO::messagef("time is %g seconds.\n", t2 - t0);
+    }
+
+    return true;
+  }
+
+  /**
+   * Runs the given circuit and makes the final state available to the caller,
+   * recording the result of any intermediate measurements in the circuit.
+   * @param param Options for gate fusion, parallelism and logging.
+   * @param factory Object to create simulators and state spaces.
+   * @param circuit The circuit to be simulated.
+   * @param state As an input parameter, this should contain the initial state
+   *   of the system (it is neither allocated nor reset here). After a
+   *   successful run, it will be populated with the final state of the system.
+   * @param measure_results As an input parameter, this should be empty.
+   *   After a successful run, this will contain all measurement results from
+   *   the run, ordered by time and qubit index.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const Circuit& circuit, State& state,
+                  std::vector<MeasurementResult>& measure_results) {
+    double t0 = 0.0;
+    double t1 = 0.0;
+
+    if (param.verbosity > 1) {
+      t0 = GetTime();
+    }
+    // The generator handed to ApplyFusedGate is seeded with param.seed.
+    RGen rgen(param.seed);
+
+    StateSpace state_space = factory.CreateStateSpace();
+    Simulator simulator = factory.CreateSimulator();
+
+    if (param.verbosity > 1) {
+      t1 = GetTime();
+      IO::messagef("init time is %g seconds.\n", t1 - t0);
+      t0 = GetTime();
+    }
+
+    auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits,
+                                        circuit.gates);
+    // An empty result for a non-empty circuit is treated as a failure.
+    if (fused_gates.size() == 0 && circuit.gates.size() > 0) {
+      return false;
+    }
+
+    measure_results.reserve(fused_gates.size());
+
+    if (param.verbosity > 1) {
+      t1 = GetTime();
+      IO::messagef("fuse time is %g seconds.\n", t1 - t0);
+    }
+
+    if (param.verbosity > 0) {
+      t0 = GetTime();
+    }
+
+    // Apply fused gates in order; a false return is reported as a failure.
+    for (std::size_t i = 0; i < fused_gates.size(); ++i) {
+      if (param.verbosity > 3) {
+        t1 = GetTime();
+      }
+
+      if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, state,
+                          measure_results)) {
+        IO::errorf("measurement failed.\n");
+        return false;
+      }
+
+      if (param.verbosity > 3) {
+        state_space.DeviceSync();
+        double t2 = GetTime();
+        IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1);
+      }
+    }
+
+    if (param.verbosity > 0) {
+      state_space.DeviceSync();
+      double t2 = GetTime();
+      IO::messagef("simu time is %g seconds.\n", t2 - t0);
+    }
+
+    return true;
+  }
+
+  /**
+   * Runs the given circuit and makes the final state available to the caller.
+   * Results of intermediate measurements are collected internally and dropped.
+   * @param param Options for gate fusion, parallelism and logging.
+   * @param factory Object to create simulators and state spaces.
+   * @param circuit The circuit to be simulated.
+   * @param state As an input parameter, this should contain the initial state
+   *   of the system. After a successful run, it will be populated with the
+   *   final state of the system.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const Circuit& circuit, State& state) {
+    std::vector<MeasurementResult> ignored;
+    return Run(param, factory, circuit, state, ignored);
+  }
+};
+
+}  // namespace qsim
+
+#endif  // RUN_QSIM_H_
diff --git a/qsim/run_qsimh.h b/qsim/run_qsimh.h
new file mode 100644
index 0000000..c1534d3
--- /dev/null
+++ b/qsim/run_qsimh.h
@@ -0,0 +1,120 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef RUN_QSIMH_H_
+#define RUN_QSIMH_H_
+
+#include <string>
+#include <vector>
+
+#include "hybrid.h"
+#include "util.h"
+
+namespace qsim {
+
+/**
+ * Helper struct for running qsimh.
+ */
+template <typename IO, typename HybridSimulator>
+struct QSimHRunner final {
+  using Gate = typename HybridSimulator::Gate;
+  using fp_type = typename HybridSimulator::fp_type;
+
+  using Parameter = typename HybridSimulator::Parameter;
+  using HybridData = typename HybridSimulator::HybridData;
+  using Fuser = typename HybridSimulator::Fuser;
+
+  /**
+   * Evaluates the amplitudes for a given circuit and set of output states.
+   * @param param Options for gate fusion, parallelism and logging. Also
+   *   specifies the size of the 'prefix' and 'root' sections of the lattice.
+   * @param factory Object to create simulators and state spaces.
+   * @param circuit The circuit to be simulated.
+   * @param parts Lattice sections to be simulated.
+   * @param bitstrings List of output states to simulate, as bitstrings.
+   * @param results Output vector of amplitudes. After a successful run, this
+   *   will be populated with amplitudes for each state in 'bitstrings'.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Factory, typename Circuit>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const Circuit& circuit, const std::vector<unsigned>& parts,
+                  const std::vector<uint64_t>& bitstrings,
+                  std::vector<std::complex<fp_type>>& results) {
+    if (circuit.num_qubits != parts.size()) {
+      IO::errorf("parts size is not equal to the number of qubits.\n");
+      return false;
+    }
+
+    double t0 = 0.0;
+
+    if (param.verbosity > 0) {
+      t0 = GetTime();
+    }
+    // Split the gates according to 'parts'; hd receives the result.
+    HybridData hd;
+    bool rc = HybridSimulator::SplitLattice(parts, circuit.gates, hd);
+
+    if (!rc) {
+      return false;
+    }
+    // Prefix plus root gates must not exceed the gates on the cut.
+    if (hd.num_gatexs < param.num_prefix_gatexs + param.num_root_gatexs) {
+      IO::errorf("error: num_prefix_gates (%u) plus num_root gates (%u) is "
+                 "greater than num_gates_on_the_cut (%u).\n",
+                 param.num_prefix_gatexs, param.num_root_gatexs,
+                 hd.num_gatexs);
+      return false;
+    }
+
+    if (param.verbosity > 0) {
+      PrintInfo(param, hd);
+    }
+    // Fuse the gates of each part; empty output for non-empty input fails.
+    auto fgates0 = Fuser::FuseGates(param, hd.num_qubits0, hd.gates0);
+    if (fgates0.size() == 0 && hd.gates0.size() > 0) {
+      return false;
+    }
+
+    auto fgates1 = Fuser::FuseGates(param, hd.num_qubits1, hd.gates1);
+    if (fgates1.size() == 0 && hd.gates1.size() > 0) {
+      return false;
+    }
+
+    rc = HybridSimulator(param.num_threads).Run(
+        param, factory, hd, parts, fgates0, fgates1, bitstrings, results);
+
+    if (rc && param.verbosity > 0) {
+      double t1 = GetTime();
+      IO::messagef("time elapsed %g seconds.\n", t1 - t0);
+    }
+
+    return rc;
+  }
+
+ private:
+  static void PrintInfo(const Parameter& param, const HybridData& hd) {
+    unsigned num_suffix_gates =
+        hd.num_gatexs - param.num_prefix_gatexs - param.num_root_gatexs;
+    // No wrap-around: Run checks num_gatexs >= prefix + root before this.
+    IO::messagef("part 0: %u, part 1: %u\n", hd.num_qubits0, hd.num_qubits1);
+    IO::messagef("%u gates on the cut\n", hd.num_gatexs);
+    IO::messagef("breakup: %up+%ur+%us\n", param.num_prefix_gatexs,
+                 param.num_root_gatexs, num_suffix_gates);
+  }
+};
+
+}  // namespace qsim
+
+#endif  // RUN_QSIMH_H_
diff --git a/qsim/seqfor.h b/qsim/seqfor.h
new file mode 100644
index 0000000..3ebf07c
--- /dev/null
+++ b/qsim/seqfor.h
@@ -0,0 +1,68 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SEQFOR_H_
+#define SEQFOR_H_
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+namespace qsim {
+
+/**
+ * Serial counterpart of the threaded `for` helpers: every iteration runs, in
+ * index order, on the calling thread (passed to callbacks as thread 0 of 1).
+ */
+struct SequentialFor {
+  // Stateless: the thread count is ignored and every method is static.
+  explicit SequentialFor(unsigned num_threads) {}
+
+  // Lower bound of the index range of a thread: always 0.
+  static uint64_t GetIndex0(uint64_t size, unsigned thread_id) { return 0; }
+
+  // Upper bound of the index range of a thread: always `size`.
+  static uint64_t GetIndex1(uint64_t size, unsigned thread_id) { return size; }
+
+  // Calls func(1, 0, i, args...) for i = 0, ..., size - 1.
+  template <typename Function, typename... Args>
+  static void Run(uint64_t size, Function&& func, Args&&... args) {
+    for (uint64_t i = 0; i != size; ++i) {
+      func(1, 0, i, args...);
+    }
+  }
+
+  // Folds func(1, 0, i, args...) over all i with `op`, starting from 0, and
+  // returns the total wrapped in a one-element vector.
+  template <typename Function, typename Op, typename... Args>
+  static std::vector<typename Op::result_type> RunReduceP(
+      uint64_t size, Function&& func, Op&& op, Args&&... args) {
+    typename Op::result_type acc = 0;
+    for (uint64_t i = 0; i != size; ++i) {
+      acc = op(acc, func(1, 0, i, args...));
+    }
+    return std::vector<typename Op::result_type>(1, acc);
+  }
+
+  // Same fold as RunReduceP, returning the single total directly.
+  template <typename Function, typename Op, typename... Args>
+  static typename Op::result_type RunReduce(uint64_t size, Function&& func,
+                                            Op&& op, Args&&... args) {
+    return RunReduceP(size, func, std::move(op), args...).front();
+  }
+};
+
+}  // namespace qsim
+
+#endif  // SEQFOR_H_
diff --git a/qsim/simmux.h b/qsim/simmux.h
new file mode 100644
index 0000000..d3c4074
--- /dev/null
+++ b/qsim/simmux.h
@@ -0,0 +1,44 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMMUX_H_
+#define SIMMUX_H_
+
+#ifdef __AVX512F__
+# include "simulator_avx512.h"
+  namespace qsim {
+    template <typename For>
+    using Simulator = SimulatorAVX512<For>;
+  }
+#elif __AVX2__
+# include "simulator_avx.h"
+  namespace qsim {
+    template <typename For>
+    using Simulator = SimulatorAVX<For>;
+  }
+#elif __SSE4_1__
+# include "simulator_sse.h"
+  namespace qsim {
+    template <typename For>
+    using Simulator = SimulatorSSE<For>;
+  }
+#else
+# include "simulator_basic.h"
+  namespace qsim {
+    template <typename For>
+    using Simulator = SimulatorBasic<For>;
+  }
+#endif
+
+#endif  // SIMMUX_H_
diff --git a/qsim/simmux_gpu.h b/qsim/simmux_gpu.h
new file mode 100644
index 0000000..1f0bb59
--- /dev/null
+++ b/qsim/simmux_gpu.h
@@ -0,0 +1,30 @@
+// Copyright 2023 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMMUX_GPU_H_
+#define SIMMUX_GPU_H_
+
+#ifdef __CUSTATEVEC__  // SimulatorGpu is SimulatorCuStateVec when defined.
+# include "simulator_custatevec.h"
+  namespace qsim {
+    using SimulatorGpu = SimulatorCuStateVec<>;
+  }
+#else  // Otherwise SimulatorGpu is SimulatorCUDA.
+# include "simulator_cuda.h"
+  namespace qsim {
+    using SimulatorGpu = SimulatorCUDA<>;
+  }
+#endif
+
+#endif  // SIMMUX_GPU_H_
diff --git a/qsim/simulator.h b/qsim/simulator.h
new file mode 100644
index 0000000..eff5441
--- /dev/null
+++ b/qsim/simulator.h
@@ -0,0 +1,516 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_H_
+#define SIMULATOR_H_
+
+#include <cstdint>
+
+#include "bits.h"
+
+namespace qsim {
+
+/**
+ * Base class for simulator classes.
+ */
+class SimulatorBase {
+ protected:
+  // The following template parameters are used for functions below.
+  // H - the number of high (target) qubits.
+  // L - the number of low (target) qubits.
+  // R - SIMD register width in floats.
+
+  // Computes the lookup tables used by the matrix-vector multiplication
+  // functions in simulator_basic.h, simulator_sse.h and simulator_avx.h (no
+  // bmi2 version). `ms` receives H + 1 masks from which base state indices are
+  // assembled and `xss` receives 2^H offsets, one per combination of the high
+  // target qubits qs[L], ..., qs[L + H - 1]. With H == 0: ms = {~0}, xss = {0}.
+  template <unsigned H, unsigned L = 0>
+  static void FillIndices(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          uint64_t* ms, uint64_t* xss) {
+    if (H == 0) {
+      ms[0] = uint64_t(-1);
+      xss[0] = 0;
+      return;
+    }
+    // stride[k] is the state-index weight of the bit just above qubit qs[L + k].
+    uint64_t stride[H + 1];
+    uint64_t below = 0;  // all bits up to and including the previous high qubit
+
+    for (unsigned k = 0; k != H; ++k) {
+      const unsigned q = qs[L + k];
+      stride[k] = uint64_t{1} << (q + 1);
+      ms[k] = ((uint64_t{1} << q) - 1) ^ below;
+      below = stride[k] - 1;
+    }
+    ms[H] = ((uint64_t{1} << num_qubits) - 1) ^ below;
+
+    for (unsigned i = 0; i != (1u << H); ++i) {
+      uint64_t offset = 0;
+      for (unsigned k = 0; k != H; ++k) {
+        if ((i >> k) & 1) offset += stride[k];
+      }
+      xss[i] = offset;
+    }
+  }
+
+  // Fills gate matrix entries for gates with low qubits. For each (i, j) the
+  // output holds 2^R entries taken from matrix[p] followed by the 2^R entries
+  // matrix[p + 1] (presumably real and imaginary parts), one entry per lane.
+  template <unsigned H, unsigned L, unsigned R, typename fp_type>
+  static void FillMatrix(unsigned qmaskl, const fp_type* matrix, fp_type* w) {
+    constexpr unsigned dim = 1 << (H + L);
+    constexpr unsigned rows = 1 << H;
+    constexpr unsigned low = 1 << L;
+    constexpr unsigned lanes = 1 << R;
+
+    // `re` walks through w in steps of two registers (2 * lanes entries).
+    fp_type* re = w;
+
+    for (unsigned i = 0; i != rows; ++i) {
+      for (unsigned j = 0; j != dim; ++j, re += 2 * lanes) {
+        fp_type* const im = re + lanes;
+        const unsigned base = 2 * i * low * dim + 2 * low * (j / low);
+
+        for (unsigned lane = 0; lane != lanes; ++lane) {
+          // Presumably the qmaskl bits of the lane index, packed together.
+          const unsigned l = bits::CompressBits(lane, R, qmaskl);
+          const unsigned p = base + 2 * (dim * l + (j + l) % low);
+          re[lane] = matrix[p];
+          im[lane] = matrix[p + 1];
+        }
+      }
+    }
+  }
+
+  // Fills gate matrix entries for controlled gates with high target qubits
+  // and low control qubits. Lanes whose bits under cmaskl equal cvalsl receive
+  // the gate matrix entry; every other lane receives the identity entry.
+  template <unsigned H, unsigned R, typename fp_type>
+  static void FillControlledMatrixH(uint64_t cvalsl, uint64_t cmaskl,
+                                    const fp_type* matrix, fp_type* w) {
+    constexpr unsigned dim = 1 << H;
+    constexpr unsigned lanes = 1 << R;
+
+    // `re` walks through w in steps of two registers (2 * lanes entries).
+    fp_type* re = w;
+
+    for (unsigned row = 0; row != dim; ++row) {
+      for (unsigned col = 0; col != dim; ++col, re += 2 * lanes) {
+        fp_type* const im = re + lanes;
+        const fp_type* const entry = matrix + 2 * (dim * row + col);
+        const fp_type identity = row == col ? 1 : 0;
+
+        for (unsigned lane = 0; lane != lanes; ++lane) {
+          const bool active = cvalsl == (lane & cmaskl);
+          re[lane] = active ? entry[0] : identity;
+          im[lane] = active ? entry[1] : 0;
+        }
+      }
+    }
+  }
+
+  // Fills gate matrix entries for controlled gates with low target qubits
+  // and low control qubits. Lanes whose bits under cmaskl equal cvalsl receive
+  // the gate matrix entry; every other lane receives the identity entry.
+  template <unsigned H, unsigned L, unsigned R, typename fp_type>
+  static void FillControlledMatrixL(uint64_t cvalsl, uint64_t cmaskl,
+                                    unsigned qmaskl, const fp_type* matrix,
+                                    fp_type* w) {
+    constexpr unsigned dim = 1 << (H + L);
+    constexpr unsigned rows = 1 << H;
+    constexpr unsigned low = 1 << L;
+    constexpr unsigned lanes = 1 << R;
+
+    // `re` walks through w in steps of two registers (2 * lanes entries).
+    fp_type* re = w;
+
+    for (unsigned i = 0; i != rows; ++i) {
+      for (unsigned j = 0; j != dim; ++j, re += 2 * lanes) {
+        fp_type* const im = re + lanes;
+        const unsigned base = i * low * dim + low * (j / low);
+
+        for (unsigned lane = 0; lane != lanes; ++lane) {
+          const unsigned l = bits::CompressBits(lane, R, qmaskl);
+          const unsigned p = base + dim * l + (j + l) % low;
+
+          // p / dim is the matrix row and p % dim the column of entry p.
+          const fp_type identity = p / dim == p % dim ? 1 : 0;
+          const bool active = cvalsl == (lane & cmaskl);
+          re[lane] = active ? matrix[2 * p] : identity;
+          im[lane] = active ? matrix[2 * p + 1] : 0;
+        }
+      }
+    }
+  }
+
+/*
+  The GetMasks* functions below provide various masks and related values.
+  GetMasks1, GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6 are
+  used in simulator_avx.h (BMI2 version) and in simulator_avx512.h. GetMasks7,
+  GetMasks8, GetMasks9, GetMasks10 and GetMasks11 are used in simulator_avx.h
+  (no BMI2 version) and in simulator_sse.h.
+
+  imaskh - inverted mask of high qubits (high control and target qubits).
+  qmaskh - mask of high qubits (high target qubits).
+  cvalsh - control bit values of high control qubits placed in correct
+           positions.
+  cvalsl - control bit values of low control qubits placed in correct positions.
+  cmaskh - mask of high control qubits.
+  cmaskl - mask of low control qubits.
+  qmaskl - mask of low qubits (low target qubits).
+  cl - the number of low control qubits.
+
+  Note that imaskh, qmaskh and cvalsh are multiplied by two in GetMasks1,
+  GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6.
+*/
+
+  struct Masks1 {
+    uint64_t imaskh;  // doubled inverted mask of the high qubits
+    uint64_t qmaskh;  // doubled mask of the high target qubits
+  };
+
+  // Masks for an uncontrolled gate; qs[0..H) are taken as high target qubits.
+  template <unsigned H, unsigned R>
+  static Masks1 GetMasks1(const std::vector<unsigned>& qs) {
+    constexpr uint64_t lanes = (1 << R) - 1;  // the R lowest bits
+    uint64_t targets = 0;
+    for (unsigned k = 0; k != H; ++k) {
+      targets |= uint64_t{1} << qs[k];
+    }
+    return {2 * (~targets ^ lanes), 2 * targets};
+  }
+
+  struct Masks2 {
+    uint64_t imaskh;  // doubled inverted mask of the high qubits
+    uint64_t qmaskh;  // doubled mask of the high target qubits
+    unsigned qmaskl;  // mask of the low target qubits
+  };
+
+  // Masks for an uncontrolled gate: qs[0..L) are taken as low target qubits
+  // and qs[L..L+H) as high target qubits.
+  template <unsigned H, unsigned L, unsigned R>
+  static Masks2 GetMasks2(const std::vector<unsigned>& qs) {
+    constexpr uint64_t lanes = (1 << R) - 1;  // the R lowest bits
+    unsigned low = 0;
+    for (unsigned k = 0; k != L; ++k) {
+      low |= 1 << qs[k];
+    }
+    uint64_t high = 0;
+    for (unsigned k = L; k != H + L; ++k) {
+      high |= uint64_t{1} << qs[k];
+    }
+    return {2 * (~high ^ lanes), 2 * high, low};
+  }
+
+  struct Masks3 {
+    uint64_t imaskh;  // doubled inverted mask of high control and target qubits
+    uint64_t qmaskh;  // doubled mask of the high target qubits
+    uint64_t cvalsh;  // doubled values of the control qubits, in position
+  };
+
+  // Masks for a controlled gate; every qubit in cqs is treated as high.
+  template <unsigned H, unsigned R>
+  static Masks3 GetMasks3(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    constexpr uint64_t lanes = (1 << R) - 1;  // the R lowest bits
+
+    uint64_t targets = 0;
+    for (unsigned k = 0; k != H; ++k) {
+      targets |= uint64_t{1} << qs[k];
+    }
+
+    uint64_t controls = 0;
+    for (unsigned c : cqs) {
+      controls |= uint64_t{1} << c;
+    }
+
+    const uint64_t cbits = bits::ExpandBits(cvals, num_qubits, controls);
+    const uint64_t free_bits = ~(targets | controls) ^ lanes;
+    return {2 * free_bits, 2 * targets, 2 * cbits};
+  }
+
+  struct Masks4 {
+    uint64_t imaskh;  // doubled inverted mask of high control and target qubits
+    uint64_t qmaskh;  // doubled mask of the high target qubits
+    uint64_t cvalsh;  // doubled values of the high control qubits, in position
+    uint64_t cvalsl;  // values of the low control qubits, in position
+    uint64_t cmaskl;  // mask of the low control qubits
+    unsigned cl;      // number of low control qubits
+  };
+
+  // Masks for a controlled gate with high targets qs[0..H). Control qubits
+  // below R count as low; the low controls use the lowest bits of cvals.
+  template <unsigned H, unsigned R>
+  static Masks4 GetMasks4(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    constexpr uint64_t lanes = (1 << R) - 1;  // the R lowest bits
+    uint64_t targets = 0;
+    for (unsigned k = 0; k != H; ++k) {
+      targets |= uint64_t{1} << qs[k];
+    }
+
+    unsigned nlow = 0;
+    uint64_t ctl_hi = 0;
+    uint64_t ctl_lo = 0;
+    for (unsigned c : cqs) {
+      const uint64_t bit = uint64_t{1} << c;
+      if (c < R) {
+        ctl_lo |= bit;
+        ++nlow;
+      } else {
+        ctl_hi |= bit;
+      }
+    }
+    const uint64_t hi = bits::ExpandBits(cvals >> nlow, num_qubits, ctl_hi);
+    const uint64_t lo = bits::ExpandBits(cvals & ((1 << nlow) - 1), R, ctl_lo);
+    return {2 * (~(targets | ctl_hi) ^ lanes), 2 * targets, 2 * hi, lo, ctl_lo,
+            nlow};
+  }
+
+  struct Masks5 {
+    uint64_t imaskh;  // doubled inverted mask of high control and target qubits
+    uint64_t qmaskh;  // doubled mask of the high target qubits
+    uint64_t cvalsh;  // doubled values of the control qubits, in position
+    unsigned qmaskl;  // mask of the low target qubits
+  };
+
+  // Masks for a controlled gate: qs[0..L) are taken as low target qubits,
+  // qs[L..L+H) as high target qubits and every qubit in cqs as a high control.
+  template <unsigned H, unsigned L, unsigned R>
+  static Masks5 GetMasks5(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    constexpr uint64_t lanes = (1 << R) - 1;  // the R lowest bits
+
+    unsigned low = 0;
+    for (unsigned k = 0; k != L; ++k) {
+      low |= 1 << qs[k];
+    }
+
+    uint64_t high = 0;
+    for (unsigned k = L; k != H + L; ++k) {
+      high |= uint64_t{1} << qs[k];
+    }
+
+    uint64_t controls = 0;
+    for (unsigned c : cqs) {
+      controls |= uint64_t{1} << c;
+    }
+
+    const uint64_t cbits = bits::ExpandBits(cvals, num_qubits, controls);
+    return {2 * (~(high | controls) ^ lanes), 2 * high, 2 * cbits, low};
+  }
+
+  struct Masks6 {
+    uint64_t imaskh;  // doubled inverted mask of high control and target qubits
+    uint64_t qmaskh;  // doubled mask of the high target qubits
+    uint64_t cvalsh;  // doubled values of the high control qubits, in position
+    uint64_t cvalsl;  // values of the low control qubits, in position
+    uint64_t cmaskl;  // mask of the low control qubits
+    unsigned qmaskl;  // mask of the low target qubits
+    unsigned cl;      // number of low control qubits
+  };
+
+  // Masks for a controlled gate: qs[0..L) are taken as low targets, qs[L..L+H)
+  // as high targets, and control qubits below R as low controls.
+  template <unsigned H, unsigned L, unsigned R>
+  static Masks6 GetMasks6(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    constexpr uint64_t lanes = (1 << R) - 1;  // the R lowest bits
+    unsigned low = 0;
+    for (unsigned k = 0; k != L; ++k) {
+      low |= 1 << qs[k];
+    }
+
+    uint64_t high = 0;
+    for (unsigned k = L; k != H + L; ++k) {
+      high |= uint64_t{1} << qs[k];
+    }
+
+    unsigned nlow = 0;
+    uint64_t ctl_hi = 0;
+    uint64_t ctl_lo = 0;
+    for (unsigned c : cqs) {
+      const uint64_t bit = uint64_t{1} << c;
+      if (c < R) {
+        ctl_lo |= bit;
+        ++nlow;
+      } else {
+        ctl_hi |= bit;
+      }
+    }
+    const uint64_t hi = bits::ExpandBits(cvals >> nlow, num_qubits, ctl_hi);
+    const uint64_t lo = bits::ExpandBits(cvals & ((1 << nlow) - 1), R, ctl_lo);
+    return {2 * (~(high | ctl_hi) ^ lanes), 2 * high, 2 * hi, lo, ctl_lo, low,
+            nlow};
+  }
+
+  struct Masks7 {
+    uint64_t cvalsh;  // values of the control qubits, in position
+    uint64_t cmaskh;  // mask of the control qubits
+  };
+
+  // Masks for a controlled gate in which every qubit in cqs is handled as a
+  // high control. `qs` is not used.
+  static Masks7 GetMasks7(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    uint64_t controls = 0;
+    for (unsigned c : cqs) {
+      controls |= uint64_t{1} << c;
+    }
+
+    const uint64_t cbits = bits::ExpandBits(cvals, num_qubits, controls);
+    return {cbits, controls};
+  }
+
+  struct Masks8 {
+    uint64_t cvalsh;  // values of the high control qubits, in position
+    uint64_t cmaskh;  // mask of the high control qubits
+    uint64_t cvalsl;  // values of the low control qubits, in position
+    uint64_t cmaskl;  // mask of the low control qubits
+  };
+
+  // Masks for a controlled gate. Control qubits below R count as low and use
+  // the lowest bits of cvals. `qs` is not used.
+  template <unsigned R>
+  static Masks8 GetMasks8(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    unsigned nlow = 0;
+    uint64_t ctl_hi = 0;
+    uint64_t ctl_lo = 0;
+    for (unsigned c : cqs) {
+      const uint64_t bit = uint64_t{1} << c;
+      if (c < R) {
+        ctl_lo |= bit;
+        ++nlow;
+      } else {
+        ctl_hi |= bit;
+      }
+    }
+    const uint64_t hi = bits::ExpandBits(cvals >> nlow, num_qubits, ctl_hi);
+    const uint64_t lo = bits::ExpandBits(cvals & ((1 << nlow) - 1), R, ctl_lo);
+    return {hi, ctl_hi, lo, ctl_lo};
+  }
+
+  struct Masks9 {
+    uint64_t cvalsh;  // values of the control qubits, in position
+    uint64_t cmaskh;  // mask of the control qubits
+    unsigned qmaskl;  // mask of the low target qubits
+  };
+
+  // Masks for a controlled gate: qs[0..L) are taken as low target qubits and
+  // every qubit in cqs as a high control.
+  template <unsigned L>
+  static Masks9 GetMasks9(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    unsigned low = 0;
+    for (unsigned k = 0; k != L; ++k) {
+      low |= 1 << qs[k];
+    }
+
+    uint64_t controls = 0;
+    for (unsigned c : cqs) {
+      controls |= uint64_t{1} << c;
+    }
+
+    const uint64_t cbits = bits::ExpandBits(cvals, num_qubits, controls);
+    return {cbits, controls, low};
+  }
+
+  struct Masks10 {
+    uint64_t cvalsh;  // values of the high control qubits, in position
+    uint64_t cmaskh;  // mask of the high control qubits
+    uint64_t cvalsl;  // values of the low control qubits, in position
+    uint64_t cmaskl;  // mask of the low control qubits
+    unsigned qmaskl;  // mask of the low target qubits
+  };
+
+  // Masks for a controlled gate: qs[0..L) are taken as low target qubits and
+  // control qubits below R as low controls (using the lowest bits of cvals).
+  template <unsigned L, unsigned R>
+  static Masks10 GetMasks10(unsigned num_qubits,
+                            const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals) {
+    unsigned low = 0;
+    for (unsigned k = 0; k != L; ++k) {
+      low |= 1 << qs[k];
+    }
+
+    unsigned nlow = 0;
+    uint64_t ctl_hi = 0;
+    uint64_t ctl_lo = 0;
+    for (unsigned c : cqs) {
+      const uint64_t bit = uint64_t{1} << c;
+      if (c < R) {
+        ctl_lo |= bit;
+        ++nlow;
+      } else {
+        ctl_hi |= bit;
+      }
+    }
+    const uint64_t hi = bits::ExpandBits(cvals >> nlow, num_qubits, ctl_hi);
+    const uint64_t lo = bits::ExpandBits(cvals & ((1 << nlow) - 1), R, ctl_lo);
+    return {hi, ctl_hi, lo, ctl_lo, low};
+  }
+
+  struct Masks11 {
+    unsigned qmaskl;  // mask of the low target qubits
+  };
+
+  // Mask of the first L entries of qs, which are taken as low target qubits.
+  template <unsigned L>
+  static Masks11 GetMasks11(const std::vector<unsigned>& qs) {
+    unsigned low = 0;
+    for (unsigned k = 0; k != L; ++k) {
+      low |= 1 << qs[k];
+    }
+
+    return {low};
+  }
+
+  template <unsigned R>
+  static unsigned MaskedAdd(
+      unsigned a, unsigned b, unsigned mask, unsigned lsize) {
+    const unsigned packed = bits::CompressBits(a, R, mask);  // masked bits of a
+    return bits::ExpandBits((packed + b) % lsize, R, mask);  // add modulo lsize
+  }
+};
+
+template <>  // H == 0, L == 1: no high qubits, so one full mask and offset 0.
+inline void SimulatorBase::FillIndices<0, 1>(unsigned num_qubits,
+                                             const std::vector<unsigned>& qs,
+                                             uint64_t* ms, uint64_t* xss) {
+  xss[0] = 0;
+  ms[0] = uint64_t(-1);
+}
+
+template <>  // H == 0, L == 2: same tables as above.
+inline void SimulatorBase::FillIndices<0, 2>(unsigned num_qubits,
+                                             const std::vector<unsigned>& qs,
+                                             uint64_t* ms, uint64_t* xss) {
+  xss[0] = 0;
+  ms[0] = uint64_t(-1);
+}
+
+template <>  // H == 0, L == 3: same tables as above.
+inline void SimulatorBase::FillIndices<0, 3>(unsigned num_qubits,
+                                             const std::vector<unsigned>& qs,
+                                             uint64_t* ms, uint64_t* xss) {
+  xss[0] = 0;
+  ms[0] = uint64_t(-1);
+}
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_H_
diff --git a/qsim/simulator_avx.h b/qsim/simulator_avx.h
new file mode 100644
index 0000000..9742849
--- /dev/null
+++ b/qsim/simulator_avx.h
@@ -0,0 +1,1363 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_AVX_H_
+#define SIMULATOR_AVX_H_
+
+#include <immintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "statespace_avx.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator with AVX vectorization.
+ */
+template <typename For>
+class SimulatorAVX final : public SimulatorBase {
+ public:
+  using StateSpace = StateSpaceAVX<For>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  template <typename... ForArgs>  // Arguments used to initialize for_.
+  explicit SimulatorAVX(ForArgs&&... args) : for_(args...) {}  // passed as lvalues
+
+  /**
+   * Applies a gate on 0 to 6 qubits using AVX instructions (no-op otherwise).
+   * @param qs Indices of the qubits affected by this gate, sorted ascending.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... . Qubits 0..2 count as low qubits.
+    // Each case selects ApplyGateL<H, L> with L = number of low qubits in qs.
+    switch (qs.size()) {
+    case 0:
+      ApplyGateH<0>(qs, matrix, state);
+      break;
+    case 1:
+      if (qs[0] > 2) {
+        ApplyGateH<1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 2) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 2) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 2) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<1, 3>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 2) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<2, 3>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 2) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<3, 3>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: gates on more than 6 qubits leave the state unchanged.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate on 0 to 4 target qubits using AVX instructions.
+   * @param qs Indices of the target qubits, assumed sorted in ascending order.
+   * @param cqs Indices of control qubits, assumed sorted in ascending order.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
+    // Without control qubits this is an ordinary gate application.
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+    // cqs[0] > 2 means that no control qubit is one of the low qubits 0..2.
+    switch (qs.size()) {
+    case 0:
+      if (cqs[0] > 2) {
+        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
+      } else {
+        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
+      }
+      break;
+    case 1:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: more than 4 target qubits leave the state unchanged.
+      break;
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using AVX instructions.
+   * @param qs Indices of the qubits the operator acts on, sorted ascending.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The expectation value, or 0 if qs.size() is not between 1 and 6.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... . Qubits 0..2 count as low qubits.
+    // Each case selects ExpectationValueL<H, L> with L = number of low qubits.
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 2) {
+        return ExpectationValueH<1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 2) {
+        return ExpectationValueH<2>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        return ExpectationValueL<1, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 2) {
+        return ExpectationValueH<3>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        return ExpectationValueL<2, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        return ExpectationValueL<1, 2>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 2) {
+        return ExpectationValueH<4>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        return ExpectationValueL<3, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        return ExpectationValueL<2, 2>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<1, 3>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 2) {
+        return ExpectationValueH<5>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        return ExpectationValueL<4, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        return ExpectationValueL<3, 2>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<2, 3>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 2) {
+        return ExpectationValueH<6>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        return ExpectationValueL<5, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        return ExpectationValueL<4, 2>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<3, 3>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: falls through to the `return 0` below.
+      break;
+    }
+    // Reached only for unsupported operator sizes (0 or more than 6 qubits).
+    return 0;
+  }
+
+  /**
+   * Reports the SIMD register width this simulator works with.
+   * @return The size of SIMD register: 8, matching the 8-float stride used
+   *     by the load and store calls in this class.
+   */
+  static unsigned SIMDRegisterSize() { return 8; }
+
+ private:
+#ifdef __BMI2__
+
+  template <unsigned H>  // H: number of target qubits, all of them high.
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Base pointer for index i: the bits of i are deposited under imaskh.
+      auto p0 = rstate + _pdep_u64(i, imaskh);
+      // Load one (rs, is) register pair per combination of the target bits.
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);
+        is[k] = _mm256_load_ps(p0 + p + 8);
+      }
+
+      uint64_t j = 0;
+      // Row k of v times the loaded vector; v[j], v[j + 1] form one entry.
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+        // Write the result back to the location that rs[k], is[k] came from.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    auto m = GetMasks1<H, 3>(qs);
+    // Each call of f covers 3 + H qubits; the remaining ones index the calls.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // H high and L low target qubits.
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // Base pointer for index i: the bits of i are deposited under imaskh.
+      auto p0 = rstate + _pdep_u64(i, imaskh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm256_load_ps(p0 + p);
+        is[k2] = _mm256_load_ps(p0 + p + 8);
+        // The other lsize - 1 entries are lane permutations of the loaded pair.
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+
+      uint64_t j = 0;
+      // Row k: multiply-accumulate over all gsize entries; w[j], w[j + 1] pair.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // Write the result back to the location that was loaded for this k.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+    // idx is filled by FillPermutationIndices and w by FillMatrix below.
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks2<H, L, 3>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+    // Each call of f covers 3 + H qubits; the remaining ones index the calls.
+    unsigned r = 3 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get());
+  }
+
+  template <unsigned H>  // ApplyGateH variant with cvalsh fixed in the offset.
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // n and m are unused. ru/iu: broadcast entry; rn/in: accumulators.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Base offset: bits of i deposited via imaskh, with cvalsh OR-ed in.
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
+      // Load each input: 8 real parts at p0 + p, 8 imaginary parts after.
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);
+        is[k] = _mm256_load_ps(p0 + p + 8);
+      }
+      // j walks v sequentially: one (re, im) pair per entry, row by row.
+      uint64_t j = 0;
+      // Row k: (rn, in) = complex sum over l of v[k][l] * (rs[l], is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+        // rs/is are copies, so overwriting input k in place is safe.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    auto m = GetMasks3<H, 3>(state.num_qubits(), qs, cqs, cvals);
+    // k also counts the control qubits, so size halves per control.
+    unsigned k = 3 + H + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get());
+  }
+
+  template <unsigned H>  // Controlled kernel with per-lane coefficients w.
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // n and m are unused. w supplies a (re, im) coefficient vector pair.
+      __m256 rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Base offset: bits of i deposited via imaskh, with cvalsh OR-ed in.
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
+      // Load each input: 8 real parts at p0 + p, 8 imaginary parts after.
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);
+        is[k] = _mm256_load_ps(p0 + p + 8);
+      }
+      // j walks w sequentially: two vectors (re, im) per term.
+      uint64_t j = 0;
+      // Output k: (rn, in) = complex sum over l of w * (rs[l], is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // rs/is are copies, so overwriting input k in place is safe.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+    // f reads 2 * hsize * hsize vectors of w.
+    __m256 w[1 << (1 + 2 * H)];
+
+    auto m = GetMasks4<H, 3>(state.num_qubits(), qs, cqs, cvals);
+    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+    // NOTE(review): m.cl is presumably the count of in-lane controls; confirm.
+    unsigned r = 3 + H + cqs.size() - m.cl;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // CH selects how w is filled.
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                const __m256i* idx, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // n and m are unused. w supplies a (re, im) coefficient vector pair.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // Base offset: bits of i deposited via imaskh, with cvalsh OR-ed in.
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
+      // Slot lsize * k: loaded vector; next lsize - 1 slots: permuted copies.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm256_load_ps(p0 + p);
+        is[k2] = _mm256_load_ps(p0 + p + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // j walks w sequentially: two vectors (re, im) per term.
+      uint64_t j = 0;
+      // Output k: (rn, in) = complex sum over gsize slots of w * (rs, is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // rs/is are copies, so overwriting input k in place is safe.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+    // f reads lsize - 1 entries of idx and 2 * hsize * gsize vectors of w.
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+    // CH: w is the plain matrix; else w also takes m.cvalsl and m.cmaskl.
+    if (CH) {
+      auto m = GetMasks5<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+      // r counts every control qubit, so size halves per control.
+      unsigned r = 3 + H + cqs.size();
+      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
+    } else {
+      auto m = GetMasks6<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 3>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+      // NOTE(review): m.cl is presumably the count of in-lane controls.
+      unsigned r = 3 + H + cqs.size() - m.cl;
+      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
+    }
+  }
+
+  template <unsigned H>  // Sums conj(state) * (matrix * state); read-only.
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // n and m are unused. ru/iu: broadcast entry; rn/in: accumulators.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Base offset: the bits of i deposited at the set bits of imaskh.
+      auto p0 = rstate + _pdep_u64(i, imaskh);
+      // Load each input: 8 real parts at p0 + p, 8 imaginary parts after.
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);
+        is[k] = _mm256_load_ps(p0 + p + 8);
+      }
+      // re/im accumulate in double; j walks v one (re, im) pair per entry.
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // Row k: (rn, in) = complex sum over l of v[k][l] * (rs[l], is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+        // (v_re, v_im) = conj(rs[k], is[k]) * (rn, in), lane by lane.
+        __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn));
+        __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in));
+        // HorizontalSumAVX is defined elsewhere; presumably sums the lanes.
+        re += detail::HorizontalSumAVX(v_re);
+        im += detail::HorizontalSumAVX(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    auto m = GetMasks1<H, 3>(qs);
+    // Each call of f covers 8 * hsize = 2^(3 + H) amplitudes.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-call results are presumably folded by RunReduce with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return
+        for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // ExpectationValueH with lane permutes.
+  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
+                const fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // n and the parameter m are unused. w supplies (re, im) vector pairs.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // Base offset: the bits of i deposited at the set bits of imaskh.
+      auto p0 = rstate + _pdep_u64(i, imaskh);
+      // Slot lsize * k: loaded vector; next lsize - 1 slots: permuted copies.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm256_load_ps(p0 + p);
+        is[k2] = _mm256_load_ps(p0 + p + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // re/im accumulate in double; j walks w two vectors per term.
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // Output k: (rn, in) = complex sum over gsize slots of w * (rs, is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // Slot of the unpermuted vector k; this m shadows the parameter m.
+        unsigned m = lsize * k;
+        // (v_re, v_im) = conj(rs[m], is[m]) * (rn, in), lane by lane.
+        __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn));
+        __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in));
+        // HorizontalSumAVX is defined elsewhere; presumably sums the lanes.
+        re += detail::HorizontalSumAVX(v_re);
+        im += detail::HorizontalSumAVX(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+    // f reads lsize - 1 entries of idx and 2 * hsize * gsize vectors of w.
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks2<H, L, 3>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+    // Each call of f covers 8 * hsize = 2^(3 + H) stored amplitudes.
+    unsigned r = 3 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-call results are presumably folded by RunReduce with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return
+        for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get());
+  }
+
+#else  // __BMI2__
+
+  template <unsigned H>  // Applies an hsize x hsize matrix; hsize = 1 << H.
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // n and m are unused. ru/iu: broadcast entry; rn/in: accumulators.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Without BMI2: build ii by shifting i and masking with ms[0..H].
+      i *= 8;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // p0: base pointer for this call; xss[k] are float offsets from it.
+      auto p0 = rstate + 2 * ii;
+      // Load each input: 8 real parts, then 8 imaginary parts 8 floats on.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
+      }
+      // j walks v sequentially: one (re, im) pair per entry, row by row.
+      uint64_t j = 0;
+      // Row k: (rn, in) = complex sum over l of v[k][l] * (rs[l], is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+        // rs/is are copies, so overwriting input k in place is safe.
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+    // f reads ms[0..H] and xss[0..hsize - 1]; FillIndices fills both.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // Each call of f covers 8 * hsize = 2^(3 + H) amplitudes.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // Gate kernel that also permutes lanes.
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // n and m are unused. w supplies a (re, im) coefficient vector pair.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // Without BMI2: build ii by shifting i and masking with ms[0..H].
+      i *= 8;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // p0: base pointer for this call; xss[k] are float offsets from it.
+      auto p0 = rstate + 2 * ii;
+      // Slot lsize * k: loaded vector; next lsize - 1 slots: permuted copies.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        rs[k2] = _mm256_load_ps(p0 + xss[k]);
+        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // j walks w sequentially: two vectors (re, im) per term.
+      uint64_t j = 0;
+      // Output k: (rn, in) = complex sum over gsize slots of w * (rs, is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // rs/is are copies, so overwriting input k in place is safe.
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+    // f reads ms[0..H], hsize offsets, lsize - 1 of idx, 2 * hsize * gsize w.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks11<L>(qs);
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+    // Each call of f covers 8 * hsize = 2^(3 + H) stored amplitudes.
+    unsigned r = 3 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss, idx, state.get());
+  }
+
+  template <unsigned H>  // ApplyGateH variant that filters on control bits.
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // n and m are unused. ru/iu: broadcast entry; rn/in: accumulators.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Without BMI2: build ii by shifting i and masking with ms[0..H].
+      i *= 8;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Skip base indices whose bits under cmaskh differ from cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+      // p0: base pointer for this call; xss[k] are float offsets from it.
+      auto p0 = rstate + 2 * ii;
+      // Load each input: 8 real parts, then 8 imaginary parts 8 floats on.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
+      }
+      // j walks v sequentially: one (re, im) pair per entry, row by row.
+      uint64_t j = 0;
+      // Row k: (rn, in) = complex sum over l of v[k][l] * (rs[l], is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+        // rs/is are copies, so overwriting input k in place is safe.
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+    // f reads ms[0..H] and xss[0..hsize - 1]; FillIndices fills both.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // Controls do not shrink size here: f returns early on a mismatch.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  template <unsigned H>  // Controlled kernel with per-lane coefficients w.
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // n and m are unused. w supplies a (re, im) coefficient vector pair.
+      __m256 rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Without BMI2: build ii by shifting i and masking with ms[0..H].
+      i *= 8;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Skip base indices whose bits under cmaskh differ from cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+      // p0: base pointer for this call; xss[k] are float offsets from it.
+      auto p0 = rstate + 2 * ii;
+      // Load each input: 8 real parts, then 8 imaginary parts 8 floats on.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
+      }
+      // j walks w sequentially: two vectors (re, im) per term.
+      uint64_t j = 0;
+      // Output k: (rn, in) = complex sum over l of w * (rs[l], is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // rs/is are copies, so overwriting input k in place is safe.
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+    // f reads ms[0..H], hsize offsets and 2 * hsize * hsize vectors of w.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256 w[1 << (1 + 2 * H)];
+
+    auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+    // Controls do not shrink size here: f returns early on a mismatch.
+    unsigned r = 3 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // CH selects how w is filled.
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, const __m256i* idx, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // n and m are unused. w supplies a (re, im) coefficient vector pair.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // Without BMI2: build ii by shifting i and masking with ms[0..H].
+      i *= 8;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Skip base indices whose bits under cmaskh differ from cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+      // p0: base pointer for this call; xss[k] are float offsets from it.
+      auto p0 = rstate + 2 * ii;
+      // Slot lsize * k: loaded vector; next lsize - 1 slots: permuted copies.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm256_load_ps(p0 + xss[k]);
+        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // j walks w sequentially: two vectors (re, im) per term.
+      uint64_t j = 0;
+      // Output k: (rn, in) = complex sum over gsize slots of w * (rs, is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // rs/is are copies, so overwriting input k in place is safe.
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+    // f reads ms[0..H], hsize offsets, lsize - 1 of idx, 2 * hsize * gsize w.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    // Controls do not shrink size here: f returns early on a mismatch.
+    unsigned r = 3 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+    // CH: w is the plain matrix; else w also takes m.cvalsl and m.cmaskl.
+    if (CH) {
+      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get());
+    } else {
+      auto m = GetMasks10<L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 3>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get());
+    }
+  }
+
+  template <unsigned H>  // Sums conj(state) * (matrix * state); read-only.
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // n and m are unused. ru/iu: broadcast entry; rn/in: accumulators.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Without BMI2: build ii by shifting i and masking with ms[0..H].
+      i *= 8;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // p0: base pointer for this call; xss[k] are float offsets from it.
+      auto p0 = rstate + 2 * ii;
+      // Load each input: 8 real parts, then 8 imaginary parts 8 floats on.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
+      }
+      // re/im accumulate in double; j walks v one (re, im) pair per entry.
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // Row k: (rn, in) = complex sum over l of v[k][l] * (rs[l], is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+        // (v_re, v_im) = conj(rs[k], is[k]) * (rn, in), lane by lane.
+        __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn));
+        __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in));
+        // HorizontalSumAVX is defined elsewhere; presumably sums the lanes.
+        re += detail::HorizontalSumAVX(v_re);
+        im += detail::HorizontalSumAVX(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+    // f reads ms[0..H] and xss[0..hsize - 1]; FillIndices fills both.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // Each call of f covers 8 * hsize = 2^(3 + H) amplitudes.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-call results are presumably folded by RunReduce with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // ExpectationValueH with lane permutes.
+  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
+                const fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // n and the parameter m are unused. w supplies (re, im) vector pairs.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // Without BMI2: build ii by shifting i and masking with ms[0..H].
+      i *= 8;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // p0: base pointer for this call; xss[k] are float offsets from it.
+      auto p0 = rstate + 2 * ii;
+      // Slot lsize * k: loaded vector; next lsize - 1 slots: permuted copies.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm256_load_ps(p0 + xss[k]);
+        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // re/im accumulate in double; j walks w two vectors per term.
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // Output k: (rn, in) = complex sum over gsize slots of w * (rs, is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // Slot of the unpermuted vector k; this m shadows the parameter m.
+        unsigned m = lsize * k;
+        // (v_re, v_im) = conj(rs[m], is[m]) * (rn, in), lane by lane.
+        __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn));
+        __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in));
+        // HorizontalSumAVX is defined elsewhere; presumably sums the lanes.
+        re += detail::HorizontalSumAVX(v_re);
+        im += detail::HorizontalSumAVX(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+    // f reads ms[0..H], hsize offsets, lsize - 1 of idx, 2 * hsize * gsize w.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks11<L>(qs);
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+    // Each call of f covers 8 * hsize = 2^(3 + H) stored amplitudes.
+    unsigned r = 3 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-call results are presumably folded by RunReduce with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, state.get());
+  }
+
+#endif  // __BMI2__
+
+  template <unsigned L>
+  static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) {
+    // Fills idx[0 .. (1 << L) - 2]; entry s - 1 is derived from shift s.
+    constexpr unsigned lsize = 1 << L;
+    const unsigned keep = -1 ^ qmaskl;  // Lane-index bits outside qmaskl.
+    for (unsigned s = 1; s < lsize; ++s) {
+      unsigned lane[8];
+      for (unsigned src = 0; src < 8; ++src) {
+        lane[src] = MaskedAdd<3>(src, s, qmaskl, lsize) | (src & keep);
+      }
+      idx[s - 1] = _mm256_set_epi32(lane[7], lane[6], lane[5], lane[4],
+                                    lane[3], lane[2], lane[1], lane[0]);
+    }
+  }
+
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_AVX_H_
diff --git a/qsim/simulator_avx512.h b/qsim/simulator_avx512.h
new file mode 100644
index 0000000..21a2e9d
--- /dev/null
+++ b/qsim/simulator_avx512.h
@@ -0,0 +1,846 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_AVX512_H_
+#define SIMULATOR_AVX512_H_
+
+#include <immintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "statespace_avx512.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator with AVX512 vectorization.
+ */
+template <typename For>
+class SimulatorAVX512 final : public SimulatorBase {
+ public:
+  using StateSpace = StateSpaceAVX512<For>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  template <typename... ForArgs>  // args initialize for_ (passed as lvalues).
+  explicit SimulatorAVX512(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate on at most 6 qubits using AVX512 (no-op for larger gates).
+   * @param qs Indices of the qubits affected by this gate, in ascending order.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // ApplyGateL<H, L>: L qubits have index <= 3 and H qubits have index > 3.
+    switch (qs.size()) {
+    case 0:
+      ApplyGateH<0>(qs, matrix, state);
+      break;
+    case 1:
+      if (qs[0] > 3) {
+        ApplyGateH<1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<1, 3>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 4>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 3) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<2, 3>(qs, matrix, state);
+      } else {  // qs[4] > 3 here: only four distinct indices can be <= 3.
+        ApplyGateL<1, 4>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 3) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<3, 3>(qs, matrix, state);
+      } else {  // qs[4], qs[5] > 3: only four distinct indices can be <= 3.
+        ApplyGateL<2, 4>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: gates on more than 6 qubits are silently ignored.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate on at most 4 target qubits using AVX512.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
+
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+    // With controls: the variant depends on which indices are <= 3.
+    switch (qs.size()) {
+    case 0:
+      if (cqs[0] > 3) {  // cqs is sorted, so every control index is > 3.
+        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
+      } else {
+        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
+      }
+      break;
+    case 1:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {  // Third template argument (CH) is whether cqs[0] > 3.
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[3] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: gates on more than 4 target qubits are ignored.
+      break;
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using AVX512 instructions.
+   * @param qs Indices of the qubits the operator acts on, in ascending order.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value; 0 if qs.size() is not in 1..6.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // ExpectationValueL<H, L>: L qubits have index <= 3, H qubits index > 3.
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 3) {
+        return ExpectationValueH<1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        return ExpectationValueH<2>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValueL<1, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        return ExpectationValueH<3>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValueL<2, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValueL<1, 2>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        return ExpectationValueH<4>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValueL<3, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValueL<2, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        return ExpectationValueL<1, 3>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 4>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 3) {
+        return ExpectationValueH<5>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValueL<4, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValueL<3, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        return ExpectationValueL<2, 3>(qs, matrix, state);
+      } else {  // qs[4] > 3 here: only four distinct indices can be <= 3.
+        return ExpectationValueL<1, 4>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 3) {
+        return ExpectationValueH<6>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValueL<5, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValueL<4, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        return ExpectationValueL<3, 3>(qs, matrix, state);
+      } else {  // qs[4], qs[5] > 3: only four distinct indices can be <= 3.
+        return ExpectationValueL<2, 4>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented.
+      break;
+    }
+
+    return 0;  // Reached only for unsupported qs.size() (0 or more than 6).
+  }
+
+  /**
+   * @return The size of the SIMD register, if applicable.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 16;  // A 512-bit register holds 16 single-precision values.
+  }
+
+ private:
+  template <unsigned H>  // H = qs.size(); used when no qubit index is <= 3.
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Number of inputs and outputs.
+      // n and m are not referenced by this kernel.
+      __m512 ru, iu, rn, in;
+      __m512 rs[hsize], is[hsize];
+      // Base offset: bits of i deposited at the set bit positions of imaskh.
+      auto p0 = rstate + _pdep_u64(i, imaskh);
+      // Each input is 16 floats at p0 + p (rs) and the next 16 floats (is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm512_load_ps(p0 + p);
+        is[k] = _mm512_load_ps(p0 + p + 16);
+      }
+
+      uint64_t j = 0;  // Index into v, read as (ru, iu) pairs.
+      // For each k: complex sum over l of (ru + i*iu) * (rs[l] + i*is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[l], ru, rn);
+          in = _mm512_fmadd_ps(rs[l], iu, in);
+          rn = _mm512_fnmadd_ps(is[l], iu, rn);
+          in = _mm512_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    auto m = GetMasks1<H, 4>(qs);
+    // size = 2^(num_qubits - k), or 1 when num_qubits <= k.
+    unsigned k = 4 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // H qubits > 3 and L qubits <= 3.
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // Inputs per output.
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // n and m are not referenced by this kernel.
+      __m512 rn, in;
+      __m512 rs[gsize], is[gsize];
+      // Base offset: bits of i deposited at the set bit positions of imaskh.
+      auto p0 = rstate + _pdep_u64(i, imaskh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm512_load_ps(p0 + p);
+        is[k2] = _mm512_load_ps(p0 + p + 16);
+        // The remaining lsize - 1 entries are lane permutations of that pair.
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
+          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w, read as pairs of coefficient registers.
+      // One output per k: complex sum over all gsize inputs times w.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512i idx[1 << L];  // The kernel reads idx[0 .. 2^L - 2] only.
+    __m512 w[1 << (1 + 2 * H + L)];  // 2 * 2^H * 2^(H + L) registers.
+
+    auto m = GetMasks2<H, L, 4>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
+    // size = 2^(num_qubits - r), or 1 when num_qubits <= r.
+    unsigned r = 4 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get());
+  }
+
+  template <unsigned H>  // H = qs.size(); used when qs and cqs[0] are all > 3.
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Number of inputs and outputs.
+      // n and m are not referenced by this kernel.
+      __m512 ru, iu, rn, in;
+      __m512 rs[hsize], is[hsize];
+      // Base offset: bits of i deposited per imaskh, then OR-ed with cvalsh.
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
+      // Each input is 16 floats at p0 + p (rs) and the next 16 floats (is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm512_load_ps(p0 + p);
+        is[k] = _mm512_load_ps(p0 + p + 16);
+      }
+
+      uint64_t j = 0;  // Index into v, read as (ru, iu) pairs.
+      // For each k: complex sum over l of (ru + i*iu) * (rs[l] + i*is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[l], ru, rn);
+          in = _mm512_fmadd_ps(rs[l], iu, in);
+          rn = _mm512_fnmadd_ps(is[l], iu, rn);
+          in = _mm512_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    auto m = GetMasks3<H, 4>(state.num_qubits(), qs, cqs, cvals);
+    // size = 2^(num_qubits - k), or 1 when num_qubits <= k.
+    unsigned k = 4 + H + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get());
+  }
+
+  template <unsigned H>  // H = qs.size(); used when qs > 3 but cqs[0] <= 3.
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Number of inputs and outputs.
+      // n and m are not referenced by this kernel.
+      __m512 rn, in;
+      __m512 rs[hsize], is[hsize];
+      // Base offset: bits of i deposited per imaskh, then OR-ed with cvalsh.
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
+      // Each input is 16 floats at p0 + p (rs) and the next 16 floats (is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm512_load_ps(p0 + p);
+        is[k] = _mm512_load_ps(p0 + p + 16);
+      }
+
+      uint64_t j = 0;  // Index into w, read as pairs of coefficient registers.
+      // One output per k: complex sum over all hsize inputs times w.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512 w[1 << (1 + 2 * H)];  // 2 * 2^H * 2^H registers.
+
+    auto m = GetMasks4<H, 4>(state.num_qubits(), qs, cqs, cvals);
+    FillControlledMatrixH<H, 4>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+    // NOTE(review): m.cl looks like the count of controls <= 3 - confirm.
+    unsigned r = 4 + H + cqs.size() - m.cl;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // CH: whether cqs[0] > 3.
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                const __m512i* idx, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // Inputs per output.
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // n and m are not referenced by this kernel.
+      __m512 rn, in;
+      __m512 rs[gsize], is[gsize];
+      // Base offset: bits of i deposited per imaskh, then OR-ed with cvalsh.
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm512_load_ps(p0 + p);
+        is[k2] = _mm512_load_ps(p0 + p + 16);
+        // The remaining lsize - 1 entries are lane permutations of that pair.
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
+          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w, read as pairs of coefficient registers.
+      // One output per k: complex sum over all gsize inputs times w.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512i idx[1 << L];  // The kernel reads idx[0 .. 2^L - 2] only.
+    __m512 w[1 << (1 + 2 * H + L)];  // 2 * 2^H * 2^(H + L) registers.
+    // The branches differ in mask helper, matrix fill and iteration count.
+    if (CH) {
+      auto m = GetMasks5<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned r = 4 + H + cqs.size();
+      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
+    } else {
+      auto m = GetMasks6<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 4>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned r = 4 + H + cqs.size() - m.cl;
+      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
+    }
+  }
+
+  template <unsigned H>  // H = qs.size(); used when no qubit index is <= 3.
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Number of inputs and outputs.
+      // n and m are not referenced by this kernel.
+      __m512 ru, iu, rn, in;
+      __m512 rs[hsize], is[hsize];
+      // Base offset: bits of i deposited at the set bit positions of imaskh.
+      auto p0 = rstate + _pdep_u64(i, imaskh);
+      // Each input is 16 floats at p0 + p (rs) and the next 16 floats (is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm512_load_ps(p0 + p);
+        is[k] = _mm512_load_ps(p0 + p + 16);
+      }
+
+      double re = 0;  // Partial sums are accumulated in double.
+      double im = 0;
+      uint64_t j = 0;
+      // For each k: complex sum over l of (ru + i*iu) * (rs[l] + i*is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[l], ru, rn);
+          in = _mm512_fmadd_ps(rs[l], iu, in);
+          rn = _mm512_fnmadd_ps(is[l], iu, rn);
+          in = _mm512_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+        // Lane-wise (rs[k] - i*is[k]) * (rn + i*in), split into two parts.
+        __m512 v_re = _mm512_fmadd_ps(is[k], in, _mm512_mul_ps(rs[k], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[k], rn, _mm512_mul_ps(rs[k], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    auto m = GetMasks1<H, 4>(qs);
+    // size = 2^(num_qubits - k), or 1 when num_qubits <= k.
+    unsigned k = 4 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    using Op = std::plus<std::complex<double>>;
+    return
+        for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // H qubits > 3 and L qubits <= 3.
+  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
+                const fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // Inputs per output.
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // n is not referenced by this kernel.
+      __m512 rn, in;
+      __m512 rs[gsize], is[gsize];
+      // Base offset: bits of i deposited at the set bit positions of imaskh.
+      auto p0 = rstate + _pdep_u64(i, imaskh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm512_load_ps(p0 + p);
+        is[k2] = _mm512_load_ps(p0 + p + 16);
+        // The remaining lsize - 1 entries are lane permutations of that pair.
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
+          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
+        }
+      }
+
+      double re = 0;  // Partial sums are accumulated in double.
+      double im = 0;
+      uint64_t j = 0;
+      // One result per k: complex sum over all gsize inputs times w.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // NOTE(review): the local m below shadows the lambda parameter m.
+        unsigned m = lsize * k;
+        // Lane-wise (rs[m] - i*is[m]) * (rn + i*in), split into two parts.
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);
+        im += detail::HorizontalSumAVX512(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    __m512i idx[1 << L];  // The kernel reads idx[0 .. 2^L - 2] only.
+    __m512 w[1 << (1 + 2 * H + L)];  // 2 * 2^H * 2^(H + L) registers.
+
+    auto m = GetMasks2<H, L, 4>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
+    // size = 2^(num_qubits - r), or 1 when num_qubits <= r.
+    unsigned r = 4 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    using Op = std::plus<std::complex<double>>;
+    return
+        for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get());
+  }
+
+  template <unsigned L>
+  static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) {
+    constexpr unsigned lsize = 1 << L;
+    // Fills idx[0 .. lsize - 2] only: the kernels read idx[l - 1] for
+    // 1 <= l < lsize, so the last entry is never used (as in simulator_avx.h).
+    for (unsigned i = 0; i < lsize - 1; ++i) {
+      unsigned p[16];
+
+      for (unsigned j = 0; j < 16; ++j) {
+        p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+  }
+
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_AVX512_H_
diff --git a/qsim/simulator_basic.h b/qsim/simulator_basic.h
new file mode 100644
index 0000000..752eeb5
--- /dev/null
+++ b/qsim/simulator_basic.h
@@ -0,0 +1,349 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_BASIC_H_
+#define SIMULATOR_BASIC_H_
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "statespace_basic.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator without vectorization.
+ */
+template <typename For, typename FP = float>
+class SimulatorBasic final : public SimulatorBase {
+ public:
+  using StateSpace = StateSpaceBasic<For, FP>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  template <typename... ForArgs>
+  explicit SimulatorBasic(ForArgs&&... args) : for_(args...) {}  // Constructs for_ from args (passed on as lvalues).
+
+  /**
+   * Applies a gate using non-vectorized instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    switch (qs.size()) {  // The gate size selects the ApplyGateH<H> instantiation.
+    case 0:
+      ApplyGateH<0>(qs, matrix, state);
+      break;
+    case 1:
+      ApplyGateH<1>(qs, matrix, state);
+      break;
+    case 2:
+      ApplyGateH<2>(qs, matrix, state);
+      break;
+    case 3:
+      ApplyGateH<3>(qs, matrix, state);
+      break;
+    case 4:
+      ApplyGateH<4>(qs, matrix, state);
+      break;
+    case 5:
+      ApplyGateH<5>(qs, matrix, state);
+      break;
+    case 6:
+      ApplyGateH<6>(qs, matrix, state);
+      break;
+    default:
+      // Not implemented: gates on more than 6 qubits leave state untouched.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate using non-vectorized instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    if (cqs.size() == 0) {  // No control qubits: same as an uncontrolled gate.
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    switch (qs.size()) {  // The target count selects ApplyControlledGateH<H>.
+    case 0:
+      ApplyControlledGateH<0>(qs, cqs, cvals, matrix, state);
+      break;
+    case 1:
+      ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state);
+      break;
+    case 2:
+      ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state);
+      break;
+    case 3:
+      ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state);
+      break;
+    case 4:
+      ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state);
+      break;
+    default:
+      // Not implemented: more than 4 target qubits leave state untouched.
+      break;
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using non-vectorized
+   * instructions.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Dispatches on the number of operator qubits; each supported size
+    // maps to its own ExpectationValueH<H> instantiation.
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    const auto num_qs = qs.size();
+
+    switch (num_qs) {
+    case 1:
+      return ExpectationValueH<1>(qs, matrix, state);
+    case 2:
+      return ExpectationValueH<2>(qs, matrix, state);
+    case 3:
+      return ExpectationValueH<3>(qs, matrix, state);
+    case 4:
+      return ExpectationValueH<4>(qs, matrix, state);
+    case 5:
+      return ExpectationValueH<5>(qs, matrix, state);
+    case 6:
+      return ExpectationValueH<6>(qs, matrix, state);
+    default:
+      // Not implemented: empty qs or more than 6 qubits.
+      // Control continues to the zero result below.
+      break;
+    }
+
+    // Unsupported operator sizes yield zero.
+    return 0;
+  }
+
+  /**
+   * @return The size of SIMD register if applicable.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 1;  // This simulator is not vectorized, so the size is reported as 1.
+  }
+
+ private:
+  template <unsigned H>
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // n and m are not used here.
+                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Number of amplitudes mixed by the gate.
+
+      fp_type rn, in;
+      fp_type rs[hsize], is[hsize];
+
+      uint64_t ii = i & ms[0];  // Spread the bits of i over the masks ms[0..H].
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;  // Real and imaginary parts sit in adjacent entries.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // Load the hsize input amplitudes.
+        rs[k] = *(p0 + xss[k]);
+        is[k] = *(p0 + xss[k] + 1);
+      }
+
+      uint64_t j = 0;  // Running index into the interleaved (re, im) matrix v.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // One matrix row per output amplitude.
+        rn = rs[0] * v[j] - is[0] * v[j + 1];
+        in = rs[0] * v[j + 1] + is[0] * v[j];
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // Complex multiply-accumulate.
+          rn += rs[l] * v[j] - is[l] * v[j + 1];
+          in += rs[l] * v[j + 1] + is[l] * v[j];
+
+          j += 2;
+        }
+
+        *(p0 + xss[k]) = rn;  // Results are written back in place.
+        *(p0 + xss[k] + 1) = in;
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);  // Fills the ms and xss tables.
+
+    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;  // Clamped at 0.
+    uint64_t size = uint64_t{1} << n;  // Number of lambda invocations.
+
+    for_.Run(size, f, matrix, ms, xss, state.get());
+  }
+
+  template <unsigned H>
+  void ApplyControlledGateH(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs,
+                            uint64_t cvals, const fp_type* matrix,
+                            State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // n and m are not used here.
+                const uint64_t* ms, const uint64_t* xss,
+                uint64_t cvalsh, uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Number of amplitudes mixed by the gate.
+
+      fp_type rn, in;
+      fp_type rs[hsize], is[hsize];
+
+      uint64_t ii = i & ms[0];  // Spread the bits of i over the masks ms[0..H].
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      if ((ii & cmaskh) == cvalsh) {  // Only update when the control bits match.
+        auto p0 = rstate + 2 * ii;  // Real and imaginary parts sit in adjacent entries.
+
+        for (unsigned k = 0; k < hsize; ++k) {  // Load the hsize input amplitudes.
+          rs[k] = *(p0 + xss[k]);
+          is[k] = *(p0 + xss[k] + 1);
+        }
+
+        uint64_t j = 0;  // Running index into the interleaved (re, im) matrix v.
+
+        for (unsigned k = 0; k < hsize; ++k) {  // One matrix row per output amplitude.
+          rn = rs[0] * v[j] - is[0] * v[j + 1];
+          in = rs[0] * v[j + 1] + is[0] * v[j];
+
+          j += 2;
+
+          for (unsigned l = 1; l < hsize; ++l) {  // Complex multiply-accumulate.
+            rn += rs[l] * v[j] - is[l] * v[j + 1];
+            in += rs[l] * v[j + 1] + is[l] * v[j];
+
+            j += 2;
+          }
+
+          *(p0 + xss[k]) = rn;  // Results are written back in place.
+          *(p0 + xss[k] + 1) = in;
+        }
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);  // Fills the ms and xss tables.
+
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);  // Supplies cvalsh and cmaskh.
+
+    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;  // Clamped at 0.
+    uint64_t size = uint64_t{1} << n;  // Number of lambda invocations.
+
+    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  template <unsigned H>
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,  // n and m are not used here.
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Number of amplitudes the operator mixes.
+
+      fp_type rn, in;
+      fp_type rs[hsize], is[hsize];
+
+      uint64_t ii = i & ms[0];  // Spread the bits of i over the masks ms[0..H].
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;  // Real and imaginary parts sit in adjacent entries.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // Load the hsize amplitudes (read-only).
+        rs[k] = *(p0 + xss[k]);
+        is[k] = *(p0 + xss[k] + 1);
+      }
+
+      double re = 0;  // Partial sums are accumulated in double precision.
+      double im = 0;
+
+      uint64_t j = 0;  // Running index into the interleaved (re, im) matrix v.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // One matrix row per amplitude.
+        rn = rs[0] * v[j] - is[0] * v[j + 1];
+        in = rs[0] * v[j + 1] + is[0] * v[j];
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // Complex multiply-accumulate.
+          rn += rs[l] * v[j] - is[l] * v[j + 1];
+          in += rs[l] * v[j + 1] + is[l] * v[j];
+
+          j += 2;
+        }
+
+        re += rs[k] * rn + is[k] * in;  // conj(amplitude k) * (rn + i * in), real part.
+        im += rs[k] * in - is[k] * rn;  // Same product, imaginary part.
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);  // Fills the ms and xss tables.
+
+    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;  // Clamped at 0.
+    uint64_t size = uint64_t{1} << n;  // Number of lambda invocations.
+
+    using Op = std::plus<std::complex<double>>;  // Partial results are combined by addition.
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
+  }
+
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_BASIC_H_
diff --git a/qsim/simulator_cuda.h b/qsim/simulator_cuda.h
new file mode 100644
index 0000000..5743bea
--- /dev/null
+++ b/qsim/simulator_cuda.h
@@ -0,0 +1,923 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUDA_H_
+#define SIMULATOR_CUDA_H_
+
+#include "simulator_cuda_kernels.h"
+
+#include <algorithm>
+#include <complex>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "bits.h"
+#include "statespace_cuda.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator with GPU vectorization.
+ */
+template <typename FP = float>
+class SimulatorCUDA final {
+ private:
+  using idx_type = uint64_t;
+  using Complex = qsim::Complex<double>;
+
+  // The maximum buffer size for indices and gate matrices.
+  // The maximum gate matrix size (for 6-qubit gates) is
+  // 2 * 2^6 * 2^6 * sizeof(FP) = 8192 * sizeof(FP). The maximum index size is
+  // 128 * sizeof(idx_type) + 96 * sizeof(unsigned).
+  static constexpr unsigned max_buf_size = 8192 * sizeof(FP)
+      + 128 * sizeof(idx_type) + 96 * sizeof(unsigned);
+
+ public:
+  using StateSpace = StateSpaceCUDA<FP>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  SimulatorCUDA() : scratch_(nullptr), scratch_size_(0) {
+    ErrorCheck(cudaMalloc(&d_ws, max_buf_size));  // Device workspace for the gate matrix and index tables.
+  }
+
+  ~SimulatorCUDA() {
+    ErrorCheck(cudaFree(d_ws));  // Releases the workspace allocated in the constructor.
+
+    if (scratch_ != nullptr) {  // scratch_ starts as nullptr and is freed only once set.
+      ErrorCheck(cudaFree(scratch_));
+    }
+  }
+
+  /**
+   * Applies a gate using CUDA instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    if (qs.size() == 0) {  // Checked first so that qs[0] is never read on an empty list.
+      ApplyGateH<0>(qs, matrix, state);
+    } else if (qs[0] > 4) {  // Lowest gate qubit is 5 or higher: ApplyGateH path.
+      switch (qs.size()) {
+      case 1:
+        ApplyGateH<1>(qs, matrix, state);
+        break;
+      case 2:
+        ApplyGateH<2>(qs, matrix, state);
+        break;
+      case 3:
+        ApplyGateH<3>(qs, matrix, state);
+        break;
+      case 4:
+        ApplyGateH<4>(qs, matrix, state);
+        break;
+      case 5:
+        ApplyGateH<5>(qs, matrix, state);
+        break;
+      case 6:
+        ApplyGateH<6>(qs, matrix, state);
+        break;
+      default:
+        // Not implemented: gates on more than 6 qubits leave state untouched.
+        break;
+      }
+    } else {  // At least one gate qubit is below 5: ApplyGateL path.
+      switch (qs.size()) {
+      case 1:
+        ApplyGateL<1>(qs, matrix, state);
+        break;
+      case 2:
+        ApplyGateL<2>(qs, matrix, state);
+        break;
+      case 3:
+        ApplyGateL<3>(qs, matrix, state);
+        break;
+      case 4:
+        ApplyGateL<4>(qs, matrix, state);
+        break;
+      case 5:
+        ApplyGateL<5>(qs, matrix, state);
+        break;
+      case 6:
+        ApplyGateL<6>(qs, matrix, state);
+        break;
+      default:
+        // Not implemented: gates on more than 6 qubits leave state untouched.
+        break;
+      }
+    }
+  }
+
+  /**
+   * Applies a controlled gate using CUDA instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    if (cqs.size() == 0) {  // No control qubits: same as an uncontrolled gate.
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    if (cqs[0] < 5) {  // First control qubit is below 5: ApplyControlledGateL path.
+      switch (qs.size()) {
+      case 0:
+        ApplyControlledGateL<0>(qs, cqs, cvals, matrix, state);
+        break;
+      case 1:
+        ApplyControlledGateL<1>(qs, cqs, cvals, matrix, state);
+        break;
+      case 2:
+        ApplyControlledGateL<2>(qs, cqs, cvals, matrix, state);
+        break;
+      case 3:
+        ApplyControlledGateL<3>(qs, cqs, cvals, matrix, state);
+        break;
+      case 4:
+        ApplyControlledGateL<4>(qs, cqs, cvals, matrix, state);
+        break;
+      default:
+        // Not implemented: more than 4 target qubits leave state untouched.
+        break;
+      }
+    } else {  // First control qubit is 5 or higher.
+      if (qs.size() == 0) {  // Checked first so that qs[0] is never read on an empty list.
+        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
+      } else if (qs[0] > 4) {  // Lowest target qubit is 5 or higher as well.
+        switch (qs.size()) {
+        case 1:
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+          break;
+        case 2:
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+          break;
+        case 3:
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+          break;
+        case 4:
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+          break;
+        default:
+          // Not implemented: more than 4 target qubits leave state untouched.
+          break;
+        }
+      } else {  // At least one target qubit is below 5.
+        switch (qs.size()) {
+        case 1:
+          ApplyControlledGateLH<1>(qs, cqs, cvals, matrix, state);
+          break;
+        case 2:
+          ApplyControlledGateLH<2>(qs, cqs, cvals, matrix, state);
+          break;
+        case 3:
+          ApplyControlledGateLH<3>(qs, cqs, cvals, matrix, state);
+          break;
+        case 4:
+          ApplyControlledGateLH<4>(qs, cqs, cvals, matrix, state);
+          break;
+        default:
+          // Not implemented: more than 4 target qubits leave state untouched.
+          break;
+        }
+      }
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using CUDA instructions.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... . An empty qs yields 0.
+
+    if (!qs.empty() && qs[0] > 4) {  // Guarded: qs[0] must not be read on an empty list.
+      switch (qs.size()) {  // Lowest operator qubit is 5 or higher.
+      case 1:
+        return ExpectationValueH<1>(qs, matrix, state);
+      case 2:
+        return ExpectationValueH<2>(qs, matrix, state);
+      case 3:
+        return ExpectationValueH<3>(qs, matrix, state);
+      case 4:
+        return ExpectationValueH<4>(qs, matrix, state);
+      case 5:
+        return ExpectationValueH<5>(qs, matrix, state);
+      case 6:
+        return ExpectationValueH<6>(qs, matrix, state);
+      default:
+        // Not implemented: more than 6 qubits.
+        break;
+      }
+    } else {  // At least one operator qubit is below 5, or qs is empty.
+      switch (qs.size()) {
+      case 1:
+        return ExpectationValueL<1>(qs, matrix, state);
+      case 2:
+        return ExpectationValueL<2>(qs, matrix, state);
+      case 3:
+        return ExpectationValueL<3>(qs, matrix, state);
+      case 4:
+        return ExpectationValueL<4>(qs, matrix, state);
+      case 5:
+        return ExpectationValueL<5>(qs, matrix, state);
+      case 6:
+        return ExpectationValueL<6>(qs, matrix, state);
+      default:
+        // Not implemented: empty qs or more than 6 qubits.
+        break;
+      }
+    }
+
+    return 0;  // Unsupported operator sizes yield zero.
+  }
+
+  /**
+   * @return The size of SIMD register if applicable.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 32;  // NOTE(review): presumably the 32-thread group the L kernels are launched with - confirm.
+  }
+
+ private:
+  // The following indices are used in kernels.
+  // xss - indices to access the state vector entries in global memory.
+  // ms  - masks to access the state vector entries in global memory.
+  // tis - 32 indices to access the state vector entries in shared memory
+  //       in the presence of low gate qubits (built from the non-gate bits).
+  // qis - one index per gate matrix row to access the state vector entries in
+  //       shared memory in the presence of low gate qubits (gate-qubit bits).
+  // cis - additional indices to access the state vector entries in global
+  //       memory in the presence of low control qubits.
+
+  template <unsigned G>
+  struct IndicesH {  // Views into a workspace buffer laid out as [matrix | xss | ms].
+    static constexpr unsigned gsize = 1 << G;
+    static constexpr unsigned matrix_size = 2 * gsize * gsize * sizeof(fp_type);  // In bytes.
+    static constexpr unsigned xss_size = 32 * sizeof(idx_type) * (1 + (G == 6));  // Doubled when G == 6.
+    static constexpr unsigned ms_size = 32 * sizeof(idx_type);
+    static constexpr unsigned xss_offs = matrix_size;  // xss follows the matrix.
+    static constexpr unsigned ms_offs = xss_offs + xss_size;
+    static constexpr unsigned buf_size = ms_offs + ms_size;  // Total bytes used.
+
+    IndicesH(char* p)  // p is the start of the workspace buffer; it is not owned.
+        : xss((idx_type*) (p + xss_offs)), ms((idx_type*) (p + ms_offs)) {}
+
+    idx_type* xss;
+    idx_type* ms;
+  };
+
+  template <unsigned G>
+  struct IndicesL : public IndicesH<G> {  // Appends the qis and tis tables to the IndicesH layout.
+    using Base = IndicesH<G>;
+    static constexpr unsigned qis_size = 32 * sizeof(unsigned) * (1 + (G == 6));  // Doubled when G == 6.
+    static constexpr unsigned tis_size = 32 * sizeof(unsigned);
+    static constexpr unsigned qis_offs = Base::buf_size;  // qis follows the base tables.
+    static constexpr unsigned tis_offs = qis_offs + qis_size;
+    static constexpr unsigned buf_size = tis_offs + tis_size;  // Total bytes used.
+
+    IndicesL(char* p)  // p is the start of the workspace buffer; it is not owned.
+        : Base(p), qis((unsigned*) (p + qis_offs)),
+          tis((unsigned*) (p + tis_offs)) {}
+
+    unsigned* qis;
+    unsigned* tis;
+  };
+
+  template <unsigned G>
+  struct IndicesLC : public IndicesL<G> {  // Appends the cis table to the IndicesL layout.
+    using Base = IndicesL<G>;
+    static constexpr unsigned cis_size = 32 * sizeof(idx_type);
+    static constexpr unsigned cis_offs = Base::buf_size;  // cis follows the base tables.
+    static constexpr unsigned buf_size = cis_offs + cis_size;  // Total bytes used.
+
+    IndicesLC(char* p) : Base(p), cis((idx_type*) (p + cis_offs)) {}  // p is not owned.
+
+    idx_type* cis;
+  };
+
+  struct DataC {  // Values returned by GetIndicesLC / GetIndicesLCL for the controlled kernels.
+    idx_type cvalsh;  // Passed to the controlled-gate kernels as the control value argument.
+    unsigned num_aqs;  // Callers pass num_aqs + 1 to the kernel.
+    unsigned num_effective_qs;  // Callers pass 1 << num_effective_qs to the kernel.
+    unsigned remaining_low_cqs;  // Callers pass 1 << (5 - remaining_low_cqs) to the kernel.
+  };
+
+  template <unsigned G>
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    unsigned num_qubits = state.num_qubits();
+
+    IndicesH<G> h_i(h_ws);  // Index tables are built in the host workspace.
+    GetIndicesH(num_qubits, qs, qs.size(), h_i);
+
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);  // The matrix goes at the buffer start.
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));  // Matrix and tables in one copy.
+
+    unsigned k = 5 + G;
+    unsigned n = num_qubits > k ? num_qubits - k : 0;  // Clamped at 0.
+    unsigned size = unsigned{1} << n;
+    unsigned threads = 64U;
+    unsigned blocks = std::max(1U, size / 2);  // At least one block is launched.
+
+    IndicesH<G> d_i(d_ws);  // Same layout, addressed in the device workspace.
+
+    ApplyGateH_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, state.get());
+  }
+
+  template <unsigned G>
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    unsigned num_qubits = state.num_qubits();
+
+    IndicesL<G> h_i(h_ws);  // Index tables are built in the host workspace.
+    auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i);
+
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);  // The matrix goes at the buffer start.
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));  // Matrix and tables in one copy.
+
+    unsigned k = 5 + num_effective_qs;
+    unsigned n = num_qubits > k ? num_qubits - k : 0;  // Clamped at 0.
+    unsigned size = unsigned{1} << n;
+    unsigned threads = 32;
+    unsigned blocks = size;
+
+    IndicesL<G> d_i(d_ws);  // Same layout, addressed in the device workspace.
+
+    ApplyGateL_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
+        1 << num_effective_qs, state.get());
+  }
+
+  template <unsigned G>
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, idx_type cvals,
+                             const fp_type* matrix, State& state) const {
+    unsigned aqs[64];  // Merged list of target and control qubits.
+    idx_type cmaskh = 0;  // One bit per control qubit, set by GetHighQubits.
+    unsigned num_qubits = state.num_qubits();
+
+    IndicesH<G> h_i(h_ws);  // Index tables are built in the host workspace.
+
+    unsigned num_aqs = GetHighQubits(qs, 0, cqs, 0, 0, cmaskh, aqs);
+    GetMs(num_qubits, aqs, num_aqs, h_i.ms);  // Masks use targets and controls together.
+    GetXss(num_qubits, qs, qs.size(), h_i.xss);  // Offsets use the target qubits only.
+
+    idx_type cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
+
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);  // The matrix goes at the buffer start.
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));  // Matrix and tables in one copy.
+
+    unsigned k = 5 + G + cqs.size();
+    unsigned n = num_qubits > k ? num_qubits - k : 0;  // Clamped at 0.
+    unsigned size = unsigned{1} << n;
+    unsigned threads = 64U;
+    unsigned blocks = std::max(1U, size / 2);  // At least one block is launched.
+
+    IndicesH<G> d_i(d_ws);  // Same layout, addressed in the device workspace.
+
+    ApplyControlledGateH_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get());
+  }
+
+  template <unsigned G>
+  void ApplyControlledGateLH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    unsigned num_qubits = state.num_qubits();
+
+    IndicesL<G> h_i(h_ws);  // Index tables are built in the host workspace.
+    auto d = GetIndicesLC(num_qubits, qs, cqs, cvals, h_i);  // Also returns the kernel parameters.
+
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);  // The matrix goes at the buffer start.
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));  // Matrix and tables in one copy.
+
+    unsigned k = 5 + G + cqs.size();
+    unsigned n = num_qubits > k ? num_qubits - k : 0;  // Clamped at 0.
+    unsigned size = unsigned{1} << n;
+    unsigned threads = 32;
+    unsigned blocks = size;
+
+    IndicesL<G> d_i(d_ws);  // Same layout, addressed in the device workspace.
+
+    ApplyControlledGateLH_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
+        d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get());
+  }
+
+  template <unsigned G>
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    unsigned num_qubits = state.num_qubits();
+
+    IndicesLC<G> h_i(h_ws);  // Index tables (including cis) are built in the host workspace.
+    auto d = GetIndicesLCL(num_qubits, qs, cqs, cvals, h_i);  // Also returns the kernel parameters.
+
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);  // The matrix goes at the buffer start.
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));  // Matrix and tables in one copy.
+
+    unsigned k = 5 + G + cqs.size();
+    unsigned n = num_qubits > k ? num_qubits - k : 0;  // Clamped at 0.
+    unsigned size = unsigned{1} << n;
+    unsigned threads = 32;
+    unsigned blocks = size;
+
+    IndicesLC<G> d_i(d_ws);  // Same layout, addressed in the device workspace.
+
+    ApplyControlledGateL_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis,
+        d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs,
+        1 << (5 - d.remaining_low_cqs), state.get());
+  }
+
+  template <unsigned G>
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    unsigned num_qubits = state.num_qubits();
+
+    IndicesH<G> h_i(h_ws);  // Index tables are built in the host workspace.
+    GetIndicesH(num_qubits, qs, qs.size(), h_i);
+
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);  // The matrix goes at the buffer start.
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));  // Matrix and tables in one copy.
+
+    unsigned k = 5 + G;
+    unsigned n = num_qubits > k ? num_qubits - k : 0;  // Clamped at 0.
+    unsigned size = unsigned{1} << n;
+
+    unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U);  // 0 for n <= 14, at most 4.
+    unsigned threads = 64U;
+    unsigned blocks = std::max(1U, (size / 2) >> s);  // At least one block is launched.
+    unsigned num_iterations_per_block = 1 << s;  // Blocks shrink by 2^s, iterations grow by 2^s.
+
+    constexpr unsigned m = 16;  // Extra slots reserved for the second reduction stage.
+
+    Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex));
+    Complex* d_res2 = d_res1 + blocks;  // Second-stage results follow the per-block results.
+
+    IndicesH<G> d_i(d_ws);  // Same layout, addressed in the device workspace.
+
+    ExpectationValueH_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block,
+        state.get(), Plus<double>(), d_res1);
+
+    double mul = size == 1 ? 0.5 : 1.0;  // NOTE(review): presumably corrects for the blocks >= 1 clamp - confirm.
+
+    return ExpectationValueReduceFinal<m>(blocks, mul, d_res1, d_res2);
+  }
+
+  template <unsigned G>
+  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    unsigned num_qubits = state.num_qubits();
+
+    IndicesL<G> h_i(h_ws);  // Index tables are built in the host workspace.
+    auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i);
+
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);  // The matrix goes at the buffer start.
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));  // Matrix and tables in one copy.
+
+    unsigned k = 5 + num_effective_qs;
+    unsigned n = num_qubits > k ? num_qubits - k : 0;  // Clamped at 0.
+    unsigned size = unsigned{1} << n;
+
+    unsigned s = std::min(n >= 13 ? n - 13 : 0, 5U);  // 0 for n <= 13, at most 5.
+    unsigned threads = 32;
+    unsigned blocks = size >> s;
+    unsigned num_iterations_per_block = 1 << s;  // Blocks shrink by 2^s, iterations grow by 2^s.
+
+    constexpr unsigned m = 16;  // Extra slots reserved for the second reduction stage.
+
+    Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex));
+    Complex* d_res2 = d_res1 + blocks;  // Second-stage results follow the per-block results.
+
+    IndicesL<G> d_i(d_ws);  // Same layout, addressed in the device workspace.
+
+    ExpectationValueL_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
+        num_iterations_per_block, state.get(), Plus<double>(), d_res1);
+
+    double mul = double(1 << (5 + num_effective_qs - G)) / 32;  // Equals 2^(num_effective_qs - G).
+
+    return ExpectationValueReduceFinal<m>(blocks, mul, d_res1, d_res2);
+  }
+
+  template <unsigned m>
+  std::complex<double> ExpectationValueReduceFinal(
+      unsigned blocks, double mul,
+      const Complex* d_res1, Complex* d_res2) const {
+    Complex res2[m];  // Host buffer for at most m partial sums.
+
+    if (blocks <= m) {  // Few enough partial sums to fit res2: copy them directly.
+      ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex),
+                            cudaMemcpyDeviceToHost));
+    } else {  // Otherwise reduce on the device to at most m values first.
+      unsigned threads2 = std::min(1024U, blocks);
+      unsigned blocks2 = std::min(m, blocks / threads2);  // Never exceeds the size of res2.
+
+      unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2));
+      unsigned bytes = threads2 * sizeof(Complex);  // Dynamic shared memory size for the launch.
+
+      Reduce2Kernel<Complex><<<blocks2, threads2, bytes>>>(
+          dblocks, blocks, Plus<Complex>(), Plus<double>(), d_res1, d_res2);
+
+      ErrorCheck(cudaMemcpy(res2, d_res2, blocks2 * sizeof(Complex),
+                            cudaMemcpyDeviceToHost));
+
+      blocks = blocks2;  // Only blocks2 values remain to be summed on the host.
+    }
+
+    double re = 0;
+    double im = 0;
+
+    for (unsigned i = 0; i < blocks; ++i) {  // Final summation on the host.
+      re += res2[i].re;
+      im += res2[i].im;
+    }
+
+    return {mul * re, mul * im};  // mul is the caller-supplied scale factor.
+  }
+
+  template <typename AQ>
+  unsigned GetHighQubits(const std::vector<unsigned>& qs, unsigned qi,
+                         const std::vector<unsigned>& cqs, unsigned ci,
+                         unsigned ai, idx_type& cmaskh, AQ& aqs) const {
+    // Merges the remaining entries of qs and cqs into aqs, always taking the
+    // smaller candidate; every control qubit taken also sets its cmaskh bit.
+    for (;;) {
+      // Prefer the gate qubit when it is the smaller candidate.
+      bool take_q = qi < qs.size() && (ci == cqs.size() || qs[qi] < cqs[ci]);
+      // Both lists exhausted: report the number of entries written so far.
+      if (!take_q && !(ci < cqs.size())) return ai;
+
+      unsigned q = take_q ? qs[qi++] : cqs[ci++];
+      if (!take_q) cmaskh |= idx_type{1} << q;
+      aqs[ai++] = q;
+    }
+  }
+
+  template <typename QS>
+  void GetMs(unsigned num_qubits, const QS& qs, unsigned qs_size,
+             idx_type* ms) const {
+    if (qs_size == 0) {
+      ms[0] = idx_type(-1);  // No qubits given: a single all-ones mask.
+      return;
+    }
+    // covered holds all bits up to and including the previous qubit.
+    idx_type covered = 0;
+    for (unsigned k = 0; k < qs_size; ++k) {
+      ms[k] = ((idx_type{1} << qs[k]) - 1) ^ covered;
+      covered = (idx_type{1} << (qs[k] + 1)) - 1;
+    }
+    ms[qs_size] = ((idx_type{1} << num_qubits) - 1) ^ covered;
+  }
+
+  template <typename QS>
+  void GetXss(unsigned num_qubits, const QS& qs, unsigned qs_size,
+              idx_type* xss) const {
+    // Without qubits there is a single entry at offset zero.
+    if (qs_size == 0) {
+      xss[0] = 0;
+      return;
+    }
+
+    // stride[k] = 2^(qs[k] + 1).
+    idx_type stride[64];
+    for (unsigned k = 0; k < qs_size; ++k) {
+      stride[k] = idx_type{1} << (qs[k] + 1);
+    }
+
+    // Entry i is the sum of the strides selected by the set bits of i.
+    const unsigned num_entries = 1 << qs_size;
+    for (unsigned i = 0; i < num_entries; ++i) {
+      idx_type offset = 0;
+      for (unsigned k = 0; k < qs_size; ++k) {
+        if ((i >> k) & 1) offset += stride[k];
+      }
+      xss[i] = offset;
+    }
+  }
+
+  template <unsigned G, typename qs_type>
+  void GetIndicesH(unsigned num_qubits, const qs_type& qs, unsigned qs_size,
+                   IndicesH<G>& indices) const {
+    // Without qubits: a single all-ones mask and a single zero offset.
+    if (qs_size == 0) {
+      indices.ms[0] = idx_type(-1);
+      indices.xss[0] = 0;
+      return;
+    }
+    // stride[k] = 2^(qs[k] + 1); ms[k] covers the bits between qubits k-1, k.
+    idx_type stride[64];
+    idx_type covered = 0;
+    for (unsigned k = 0; k < qs_size; ++k) {
+      stride[k] = idx_type{1} << (qs[k] + 1);
+      indices.ms[k] = ((idx_type{1} << qs[k]) - 1) ^ covered;
+      covered = stride[k] - 1;
+    }
+    indices.ms[qs_size] = ((idx_type{1} << num_qubits) - 1) ^ covered;
+
+    // xss[i] is the sum of the strides selected by the set bits of i.
+    const unsigned num_entries = 1 << qs_size;
+    for (unsigned i = 0; i < num_entries; ++i) {
+      idx_type offset = 0;
+      for (unsigned k = 0; k < qs_size; ++k) {
+        if ((i >> k) & 1) offset += stride[k];
+      }
+      indices.xss[i] = offset;
+    }
+  }
+
+  template <unsigned G>
+  void GetIndicesL(unsigned num_effective_qs, unsigned qmask,
+                   IndicesL<G>& indices) const {
+    constexpr unsigned gsize = IndicesL<G>::gsize;
+    const unsigned width = 5 + num_effective_qs;
+    const unsigned tmask = ((1 << width) - 1) ^ qmask;
+
+    // Zero the mask and offset entries beyond the effective qubits.
+    for (unsigned k = num_effective_qs + 1; k <= G; ++k) indices.ms[k] = 0;
+    for (unsigned k = 1 << num_effective_qs; k < gsize; ++k) {
+      indices.xss[k] = 0;
+    }
+    // qis uses qmask; tis uses its complement within the low width bits.
+    for (unsigned k = 0; k < gsize; ++k) {
+      indices.qis[k] = bits::ExpandBits(k, width, qmask);
+    }
+    for (unsigned k = 0; k < 32; ++k) {
+      indices.tis[k] = bits::ExpandBits(k, width, tmask);
+    }
+  }
+
+  template <unsigned G>
+  unsigned GetIndicesL(unsigned num_qubits, const std::vector<unsigned>& qs,
+                       IndicesL<G>& indices) const {
+    unsigned eqs[32];  // Effective qubits handed to GetIndicesH below.
+
+    unsigned qmaskh = 0;  // Bits at positions 5 and above, set while building eqs.
+    unsigned qmaskl = 0;  // Bits of the gate qubits below 5.
+
+    unsigned qi = 0;
+
+    while (qi < qs.size() && qs[qi] < 5) {  // Collect the low gate qubits.
+      qmaskl |= 1 << qs[qi++];
+    }
+
+    unsigned nq = std::max(5U, num_qubits);  // Keeps nq - 5 from wrapping around.
+    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
+
+    unsigned l = 0;  // Counts effective qubits that are not gate qubits.
+    unsigned ei = 0;  // Next free slot in eqs.
+    unsigned num_low_qs = qi;
+
+    if (qs.size() == num_low_qs) {  // Every gate qubit is below 5.
+      while (ei < num_effective_qs && l++ < num_low_qs) {
+        eqs[ei] = ei + 5;  // Use qubits 5, 6, ... as effective qubits.
+        ++ei;
+      }
+    } else {  // Some gate qubits are 5 or higher.
+      while (ei < num_effective_qs && l < num_low_qs) {
+        unsigned ei5 = ei + 5;
+        eqs[ei] = ei5;
+        if (qi < qs.size() && qs[qi] == ei5) {  // This position is itself a gate qubit.
+          ++qi;
+          qmaskh |= 1 << ei5;
+        } else {
+          ++l;
+        }
+        ++ei;
+      }
+
+      while (ei < num_effective_qs) {  // Remaining slots take the remaining gate qubits.
+        eqs[ei] = qs[qi++];
+        qmaskh |= 1 << (ei + 5);
+        ++ei;
+      }
+    }
+
+    GetIndicesH(num_qubits, eqs, num_effective_qs, indices);  // Fills ms and xss from eqs.
+    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);  // Fills qis and tis from the masks.
+
+    return num_effective_qs;
+  }
+
+  template <unsigned G>
+  DataC GetIndicesLC(unsigned num_qubits, const std::vector<unsigned>& qs,
+                     const std::vector<unsigned>& cqs, uint64_t cvals,
+                     IndicesL<G>& indices) const {
+    unsigned aqs[64];  // qubit list passed to GetHighQubits and GetMs
+    unsigned eqs[32];  // qubit list passed to GetXss
+    // Fills `indices` for targets qs with controls cqs; returns a DataC.
+    unsigned qmaskh = 0;  // bits set at (position - controls seen so far)
+    unsigned qmaskl = 0;  // one bit per target qubit below 5
+    idx_type cmaskh = 0;  // one bit per control qubit met in the scan below
+    // NOTE(review): presumably no control qubit is below 5 here - confirm.
+    unsigned qi = 0;  // cursor into qs
+
+    while (qi < qs.size() && qs[qi] < 5) {  // leading targets below qubit 5
+      qmaskl |= 1 << qs[qi++];
+    }
+
+    unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size()));
+    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
+
+    unsigned l = 0;   // scanned qubits that are neither target nor control
+    unsigned ai = 5;  // qubit currently being scanned
+    unsigned ci = 0;  // cursor into cqs
+    unsigned ei = 0;  // number of eqs entries written
+    unsigned num_low_qs = qi;
+
+    while (ai < num_qubits && l < num_low_qs) {  // classify qubits from 5 up
+      aqs[ai - 5] = ai;
+      if (qi < qs.size() && qs[qi] == ai) {  // target qubit
+        ++qi;
+        eqs[ei++] = ai;
+        qmaskh |= 1 << (ai - ci);
+      } else if (ci < cqs.size() && cqs[ci] == ai) {  // control qubit
+        ++ci;
+        cmaskh |= idx_type{1} << ai;
+      } else {  // any other qubit
+        ++l;
+        eqs[ei++] = ai;
+      }
+      ++ai;
+    }
+
+    unsigned i = ai;  // local cursors, so that qi and ai stay unchanged
+    unsigned j = qi;  // for the GetHighQubits call below
+
+    while (ei < num_effective_qs) {  // remaining slots take the targets left
+      eqs[ei++] = qs[j++];
+      qmaskh |= 1 << (i++ - ci);
+    }
+
+    unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs);
+    GetMs(num_qubits, aqs, num_aqs, indices.ms);
+    GetXss(num_qubits, eqs, num_effective_qs, indices.xss);
+    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);  // qis and tis
+
+    idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh);
+
+    return {cvalsh, num_aqs, num_effective_qs};
+  }
+
+  template <unsigned G>
+  DataC GetIndicesLCL(unsigned num_qubits, const std::vector<unsigned>& qs,
+                      const std::vector<unsigned>& cqs, uint64_t cvals,
+                      IndicesLC<G>& indices) const {
+    unsigned aqs[64];  // qubit list passed to GetHighQubits and GetMs
+    unsigned eqs[32];  // qubit list passed to GetXss
+    // Like GetIndicesLC, but control qubits below 5 are handled via cis.
+    unsigned qmaskh = 0;    // target bits at positions above 4
+    unsigned qmaskl = 0;    // target bits at positions up to 4
+    idx_type cmaskh = 0;    // control qubits met from qubit 5 upwards
+    idx_type cmaskl = 0;    // control qubits below 5
+    idx_type cis_mask = 0;  // mask used to build indices.cis at the end
+
+    unsigned qi = 0;  // cursor into qs
+    unsigned ci = 0;  // cursor into cqs
+
+    for (unsigned k = 0; k < 5; ++k) {  // classify qubits 0..4
+      if (qi < qs.size() && qs[qi] == k) {
+        qmaskl |= 1 << (k - ci);  // position shifted down by controls seen
+        ++qi;
+      } else if (ci < cqs.size() && cqs[ci] == k) {
+        cmaskl |= idx_type{1} << k;
+        ++ci;
+      }
+    }
+
+    unsigned num_low_qs = qi;
+    unsigned num_low_cqs = ci;
+
+    unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size()));
+    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
+
+    unsigned l = 0;   // scanned qubits that are neither target nor control
+    unsigned ai = 5;  // qubit currently being scanned
+    unsigned ei = 0;  // number of eqs entries written
+    unsigned num_low = num_low_qs + num_low_cqs;
+    unsigned remaining_low_cqs = num_low_cqs;  // decremented per cis_mask bit
+    unsigned effective_low_qs = num_low_qs;  // targets landing at position <= 4
+    unsigned highest_cis_bit = 0;
+
+    while (ai < num_qubits && l < num_low) {  // classify qubits from 5 up
+      aqs[ai - 5] = ai;
+      if (qi < qs.size() && qs[qi] == ai) {  // target qubit
+        ++qi;
+        if ((ai - ci) > 4) {  // still above 4 after removing controls
+          eqs[ei++] = ai;
+          qmaskh |= 1 << (ai - ci);
+        } else {  // lands among the low five positions
+          highest_cis_bit = ai;
+          cis_mask |= idx_type{1} << ai;
+          qmaskl |= 1 << (ai - ci);
+          --remaining_low_cqs;
+          ++effective_low_qs;
+        }
+      } else if (ci < cqs.size() && cqs[ci] == ai) {  // control qubit
+        ++ci;
+        cmaskh |= idx_type{1} << ai;
+      } else {  // any other qubit
+        ++l;
+        if (remaining_low_cqs == 0) {
+          eqs[ei++] = ai;
+        } else {  // goes into cis_mask instead of eqs
+          highest_cis_bit = ai;
+          cis_mask |= idx_type{1} << ai;
+          --remaining_low_cqs;
+        }
+      }
+      ++ai;
+    }
+
+    unsigned i = ai;  // local cursor, so that ai stays unchanged below
+    unsigned j = effective_low_qs;  // index of the next target to place in eqs
+
+    while (ei < num_effective_qs) {  // remaining slots take the targets left
+      eqs[ei++] = qs[j++];
+      qmaskh |= 1 << (i++ - ci);
+    }
+
+    unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs);
+    GetMs(num_qubits, aqs, num_aqs, indices.ms);
+    GetXss(num_qubits, eqs, num_effective_qs, indices.xss);
+    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);  // qis and tis
+
+    idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh);
+    idx_type cvalsl = bits::ExpandBits(idx_type(cvals), 5, cmaskl);
+
+    cis_mask |= 31 ^ cmaskl;  // add the low bits that are not controls
+    highest_cis_bit = highest_cis_bit < 5 ? 5 : highest_cis_bit;
+    for (idx_type i = 0; i < 32; ++i) {  // one cis entry per thread of a block
+      auto c = bits::ExpandBits(i, highest_cis_bit + 1, cis_mask);
+      indices.cis[i] = 2 * (c & 0xffffffe0) | (c & 0x1f) | cvalsl;
+    }
+
+    return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs};
+  }
+
+
+  void* AllocScratch(uint64_t size) const {
+    // Grow-only: the current buffer is reused when it is already big enough.
+    if (size <= scratch_size_) return scratch_;
+
+    if (scratch_ != nullptr) {
+      ErrorCheck(cudaFree(scratch_));
+    }
+
+    // The members are written through const_cast since this method is const.
+    ErrorCheck(cudaMalloc(const_cast<void**>(&scratch_), size));
+    const_cast<uint64_t&>(scratch_size_) = size;
+    return scratch_;
+  }
+
+  char* d_ws;  // NOTE(review): presumably the device counterpart of h_ws
+  char h_ws0[max_buf_size];  // fixed-size buffer stored inside the object
+  char* h_ws = (char*) h_ws0;  // points at h_ws0
+
+  void* scratch_;  // buffer obtained with cudaMalloc in AllocScratch
+  uint64_t scratch_size_;  // size last requested for scratch_
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_CUDA_H_
diff --git a/qsim/simulator_cuda_kernels.h b/qsim/simulator_cuda_kernels.h
new file mode 100644
index 0000000..e21a9d6
--- /dev/null
+++ b/qsim/simulator_cuda_kernels.h
@@ -0,0 +1,683 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUDA_KERNELS_H_
+#define SIMULATOR_CUDA_KERNELS_H_
+
+#ifdef __NVCC__
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+
+  #include "util_cuda.h"
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
+
+namespace qsim {
+
+template <unsigned G, typename fp_type, typename idx_type>
+__global__ void ApplyGateH_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
+    const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) {
+  // Multiplies groups of gsize amplitudes of rstate by the matrix v0, in
+  // place. blockDim.x must be equal to 64.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;  // number of amplitudes per group
+  constexpr unsigned rows =  // matrix rows kept in shared memory at a time
+      G < 4 ? gsize : (sizeof(fp_type) == 4 ?
+                       (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16));
+
+  fp_type rs[gsize], is[gsize];  // real and imaginary parts of the inputs
+
+  __shared__ idx_type xss[64];  // block-local copy of xss0
+  __shared__ fp_type v[2 * gsize * rows];  // current slice of v0 (re, im pairs)
+
+  if (threadIdx.x < gsize) {
+    xss[threadIdx.x] = xss0[threadIdx.x];
+  }
+
+  if (G <= 2) {  // the whole matrix has fewer than 64 elements
+    if (threadIdx.x < 2 * gsize * gsize) {
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {  // 64 per step
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+
+  __syncthreads();  // xss and v are read by every thread below
+  // Clear the low 5 bits of the global thread index, then spread it over mss.
+  idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
+  idx_type ii = i & mss[0];
+  for (unsigned j = 1; j <= G; ++j) {
+    i *= 2;  // shift left by one bit before applying the next mask
+    ii |= i & mss[j];
+  }
+
+  auto p0 = rstate + 2 * ii + threadIdx.x % 32;  // this thread's base element
+
+  for (unsigned k = 0; k < gsize; ++k) {
+    rs[k] = *(p0 + xss[k]);
+    is[k] = *(p0 + xss[k] + 32);  // imag part is 32 elements after real
+  }
+
+  for (unsigned s = 0; s < gsize / rows; ++s) {  // one pass per matrix slice
+    if (s > 0) {  // slice 0 was loaded above
+      __syncthreads();  // every thread is done reading the previous slice
+
+      for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
+        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+      }
+
+      __syncthreads();  // the new slice is complete
+    }
+
+    unsigned j = 0;  // read position in v
+
+    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+      fp_type rn = 0;  // real part of output k
+      fp_type in = 0;  // imaginary part of output k
+
+      for (unsigned l = 0; l < gsize; ++l) {  // complex multiply-accumulate
+        fp_type rm = v[j++];
+        fp_type im = v[j++];
+        rn += rs[l] * rm;
+        rn -= is[l] * im;
+        in += rs[l] * im;
+        in += is[l] * rm;
+      }
+
+      *(p0 + xss[k]) = rn;  // inputs are in registers, so write in place
+      *(p0 + xss[k] + 32) = in;
+    }
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type>
+__global__ void ApplyGateL_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
+    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
+    const unsigned* __restrict__ tis, unsigned esize,
+    fp_type* __restrict__ rstate) {
+  // Multiplies amplitudes of rstate by the matrix v0, exchanging them between
+  // threads through shared memory. blockDim.x must be equal to 32.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;  // number of amplitudes per group
+  constexpr unsigned  // rows: matrix rows kept in shared memory at a time
+      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
+                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
+
+  fp_type rs[gsize], is[gsize];  // real and imaginary parts of the inputs
+
+  __shared__ fp_type v[2 * gsize * rows];  // current slice of v0 (re, im pairs)
+  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];  // staging area
+
+  if (G < 2) {  // the whole matrix has fewer than 32 elements
+    if (threadIdx.x < 2 * gsize * gsize) {
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {  // 32 per step
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+
+  idx_type i = 32 * idx_type{blockIdx.x};
+  idx_type ii = i & mss[0];
+  for (unsigned j = 1; j <= G; ++j) {
+    i *= 2;  // shift left by one bit before applying the next mask
+    ii |= i & mss[j];
+  }
+
+  auto p0 = rstate + 2 * ii + threadIdx.x;  // this thread's base element
+
+  for (unsigned k = 0; k < gsize; ++k) {  // stage this thread's elements
+    rs0[threadIdx.x][k] = *(p0 + xss[k]);
+    is0[threadIdx.x][k] = *(p0 + xss[k] + 32);  // imag is 32 elements later
+  }
+  // NOTE(review): no barrier before other threads' rows are read - confirm.
+  for (unsigned k = 0; k < gsize; ++k) {  // gather inputs via tis and qis
+    unsigned i = tis[threadIdx.x] | qis[k];
+    unsigned m = i & 0x1f;  // row of the staging area
+    unsigned n = i / 32;    // column of the staging area
+
+    rs[k] = rs0[m][n];
+    is[k] = is0[m][n];
+  }
+
+  for (unsigned s = 0; s < gsize / rows; ++s) {  // one pass per matrix slice
+    if (s > 0) {  // slice 0 was loaded above
+      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+      }
+    }
+
+    unsigned j = 0;  // read position in v
+
+    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+      fp_type rn = 0;  // real part of output k
+      fp_type in = 0;  // imaginary part of output k
+
+      for (unsigned l = 0; l < gsize; ++l) {  // complex multiply-accumulate
+        fp_type rm = v[j++];
+        fp_type im = v[j++];
+        rn += rs[l] * rm;
+        rn -= is[l] * im;
+        in += rs[l] * im;
+        in += is[l] * rm;
+      }
+
+      unsigned i = tis[threadIdx.x] | qis[k];  // same mapping as the gather
+      unsigned m = i & 0x1f;
+      unsigned n = i / 32;
+
+      rs0[m][n] = rn;  // scatter the result back into the staging area
+      is0[m][n] = in;
+    }
+  }
+
+  for (unsigned k = 0; k < esize; ++k) {  // only the first esize columns
+    *(p0 + xss[k]) = rs0[threadIdx.x][k];
+    *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type>
+__global__ void ApplyControlledGateH_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
+    const idx_type* __restrict__ mss, unsigned num_mss, idx_type cvalsh,
+    fp_type* __restrict__ rstate) {
+  // Same computation as ApplyGateH_Kernel, with num_mss masks and cvalsh
+  // OR-ed into the index. blockDim.x must be equal to 64.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;  // number of amplitudes per group
+  constexpr unsigned rows =  // matrix rows kept in shared memory at a time
+      G < 4 ? gsize : (sizeof(fp_type) == 4 ?
+                           (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16));
+
+  fp_type rs[gsize], is[gsize];  // real and imaginary parts of the inputs
+
+  __shared__ idx_type xss[64];  // block-local copy of xss0
+  __shared__ fp_type v[2 * gsize * rows];  // current slice of v0 (re, im pairs)
+
+  if (threadIdx.x < gsize) {
+    xss[threadIdx.x] = xss0[threadIdx.x];
+  }
+
+  if (G <= 2) {  // the whole matrix has fewer than 64 elements
+    if (threadIdx.x < 2 * gsize * gsize) {
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {  // 64 per step
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+
+  __syncthreads();  // xss and v are read by every thread below
+  // Clear the low 5 bits of the global thread index, then spread it over mss.
+  idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
+  idx_type ii = i & mss[0];
+  for (unsigned j = 1; j < num_mss; ++j) {  // mask count is a runtime value
+    i *= 2;  // shift left by one bit before applying the next mask
+    ii |= i & mss[j];
+  }
+
+  ii |= cvalsh;  // set the bits given by the caller in cvalsh
+
+  auto p0 = rstate + 2 * ii + threadIdx.x % 32;  // this thread's base element
+
+  for (unsigned k = 0; k < gsize; ++k) {
+    rs[k] = *(p0 + xss[k]);
+    is[k] = *(p0 + xss[k] + 32);  // imag part is 32 elements after real
+  }
+
+  for (unsigned s = 0; s < gsize / rows; ++s) {  // one pass per matrix slice
+    if (s > 0) {  // slice 0 was loaded above
+      __syncthreads();  // every thread is done reading the previous slice
+
+      for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
+        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+      }
+
+      __syncthreads();  // the new slice is complete
+    }
+
+    unsigned j = 0;  // read position in v
+
+    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+      fp_type rn = 0;  // real part of output k
+      fp_type in = 0;  // imaginary part of output k
+
+      for (unsigned l = 0; l < gsize; ++l) {  // complex multiply-accumulate
+        fp_type rm = v[j++];
+        fp_type im = v[j++];
+        rn += rs[l] * rm;
+        rn -= is[l] * im;
+        in += rs[l] * im;
+        in += is[l] * rm;
+      }
+
+      *(p0 + xss[k]) = rn;  // inputs are in registers, so write in place
+      *(p0 + xss[k] + 32) = in;
+    }
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type>
+__global__ void ApplyControlledGateLH_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
+    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
+    const unsigned* __restrict__ tis, unsigned num_mss, idx_type cvalsh,
+    unsigned esize, fp_type* __restrict__ rstate) {
+  // Same computation as ApplyGateL_Kernel, with num_mss masks and cvalsh
+  // OR-ed into the index. blockDim.x must be equal to 32.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;  // number of amplitudes per group
+  constexpr unsigned  // rows: matrix rows kept in shared memory at a time
+      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
+                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
+
+  fp_type rs[gsize], is[gsize];  // real and imaginary parts of the inputs
+
+  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];  // staging area
+  __shared__ fp_type v[2 * gsize * rows];  // current slice of v0 (re, im pairs)
+
+  idx_type i = 32 * idx_type{blockIdx.x};
+  idx_type ii = i & mss[0];
+  for (unsigned j = 1; j < num_mss; ++j) {  // mask count is a runtime value
+    i *= 2;  // shift left by one bit before applying the next mask
+    ii |= i & mss[j];
+  }
+
+  ii |= cvalsh;  // set the bits given by the caller in cvalsh
+
+  auto p0 = rstate + 2 * ii + threadIdx.x;  // this thread's base element
+
+  for (unsigned k = 0; k < gsize; ++k) {  // stage this thread's elements
+    rs0[threadIdx.x][k] = *(p0 + xss[k]);
+    is0[threadIdx.x][k] = *(p0 + xss[k] + 32);  // imag is 32 elements later
+  }
+
+  if (G < 2) {  // the whole matrix has fewer than 32 elements
+    if (threadIdx.x < 2 * gsize * gsize) {
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {  // 32 per step
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+  // NOTE(review): no barrier before other threads' rows are read - confirm.
+  for (unsigned k = 0; k < gsize; ++k) {  // gather inputs via tis and qis
+    unsigned i = tis[threadIdx.x] | qis[k];
+    unsigned m = i & 0x1f;  // row of the staging area
+    unsigned n = i / 32;    // column of the staging area
+
+    rs[k] = rs0[m][n];
+    is[k] = is0[m][n];
+  }
+
+  for (unsigned s = 0; s < gsize / rows; ++s) {  // one pass per matrix slice
+    if (s > 0) {  // slice 0 was loaded above
+      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+      }
+    }
+
+    unsigned j = 0;  // read position in v
+
+    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+      fp_type rn = 0;  // real part of output k
+      fp_type in = 0;  // imaginary part of output k
+
+      for (unsigned l = 0; l < gsize; ++l) {  // complex multiply-accumulate
+        fp_type rm = v[j++];
+        fp_type im = v[j++];
+        rn += rs[l] * rm;
+        rn -= is[l] * im;
+        in += rs[l] * im;
+        in += is[l] * rm;
+      }
+
+      unsigned i = tis[threadIdx.x] | qis[k];  // same mapping as the gather
+      unsigned m = i & 0x1f;
+      unsigned n = i / 32;
+
+      rs0[m][n] = rn;  // scatter the result back into the staging area
+      is0[m][n] = in;
+    }
+  }
+
+  for (unsigned k = 0; k < esize; ++k) {  // only the first esize columns
+    *(p0 + xss[k]) = rs0[threadIdx.x][k];
+    *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type>
+__global__ void ApplyControlledGateL_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
+    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
+    const unsigned* __restrict__ tis, const idx_type* __restrict__ cis,
+    unsigned num_mss, idx_type cvalsh, unsigned esize, unsigned rwthreads,
+    fp_type* __restrict__ rstate) {
+  // Like ApplyControlledGateLH_Kernel, but each thread's offset comes from
+  // cis and only rwthreads threads do I/O. blockDim.x must be equal to 32.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;  // number of amplitudes per group
+  constexpr unsigned  // rows: matrix rows kept in shared memory at a time
+      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
+                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
+
+  fp_type rs[gsize], is[gsize];  // real and imaginary parts of the inputs
+
+  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];  // staging area
+  __shared__ fp_type v[2 * gsize * rows];  // current slice of v0 (re, im pairs)
+
+  idx_type i = 32 * idx_type{blockIdx.x};
+  idx_type ii = i & mss[0];
+  for (unsigned j = 1; j < num_mss; ++j) {  // mask count is a runtime value
+    i *= 2;  // shift left by one bit before applying the next mask
+    ii |= i & mss[j];
+  }
+
+  ii |= cvalsh;  // set the bits given by the caller in cvalsh
+
+  auto p0 = rstate + 2 * ii + cis[threadIdx.x];  // per-thread offset from cis
+
+  if (threadIdx.x < rwthreads) {  // the other threads load nothing
+    for (unsigned k = 0; k < gsize; ++k) {
+      rs0[threadIdx.x][k] = *(p0 + xss[k]);
+      is0[threadIdx.x][k] = *(p0 + xss[k] + 32);  // imag is 32 elements later
+    }
+  }
+
+  if (G < 2) {  // the whole matrix has fewer than 32 elements
+    if (threadIdx.x < 2 * gsize * gsize) {
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {  // 32 per step
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+  // NOTE(review): no barrier before other threads' rows are read - confirm.
+  for (unsigned k = 0; k < gsize; ++k) {  // gather inputs via tis and qis
+    unsigned i = tis[threadIdx.x] | qis[k];
+    unsigned m = i & 0x1f;  // row of the staging area
+    unsigned n = i / 32;    // column of the staging area
+
+    rs[k] = rs0[m][n];
+    is[k] = is0[m][n];
+  }
+
+  for (unsigned s = 0; s < gsize / rows; ++s) {  // one pass per matrix slice
+    if (s > 0) {  // slice 0 was loaded above
+      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+      }
+    }
+
+    unsigned j = 0;  // read position in v
+
+    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+      fp_type rn = 0;  // real part of output k
+      fp_type in = 0;  // imaginary part of output k
+
+      for (unsigned l = 0; l < gsize; ++l) {  // complex multiply-accumulate
+        fp_type rm = v[j++];
+        fp_type im = v[j++];
+        rn += rs[l] * rm;
+        rn -= is[l] * im;
+        in += rs[l] * im;
+        in += is[l] * rm;
+      }
+
+      unsigned i = tis[threadIdx.x] | qis[k];  // same mapping as the gather
+      unsigned m = i & 0x1f;
+      unsigned n = i / 32;
+
+      rs0[m][n] = rn;  // scatter the result back into the staging area
+      is0[m][n] = in;
+    }
+  }
+
+  if (threadIdx.x < rwthreads) {  // same threads that loaded also store
+    for (unsigned k = 0; k < esize; ++k) {
+      *(p0 + xss[k]) = rs0[threadIdx.x][k];
+      *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
+    }
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type, typename Op,
+          typename cfp_type>
+__global__ void ExpectationValueH_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
+    const idx_type* __restrict__ mss, unsigned num_iterations_per_block,
+    const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
+  // Accumulates conj(a) * (v0 * a) over groups of amplitudes of rstate and
+  // writes one partial sum per block. blockDim.x must be equal to 64.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;  // number of amplitudes per group
+  constexpr unsigned rows =  // matrix rows kept in shared memory at a time
+      G < 5 ? gsize : (sizeof(fp_type) == 4 ? (G < 6 ? 4 : 8) : 8);
+
+  fp_type rs[gsize], is[gsize];  // real and imaginary parts of the inputs
+
+  __shared__ idx_type xss[64];  // block-local copy of xss0
+  __shared__ fp_type v[2 * gsize * rows];  // current slice of v0 (re, im pairs)
+
+  if (threadIdx.x < gsize) {
+    xss[threadIdx.x] = xss0[threadIdx.x];
+  }
+
+  if (G <= 2) {  // the whole matrix has fewer than 64 elements
+    if (threadIdx.x < 2 * gsize * gsize) {
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {  // 64 per step
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+
+  __syncthreads();  // xss and v are read by every thread below
+
+  double re = 0;  // per-thread sums, kept in double for any fp_type
+  double im = 0;
+
+  for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) {
+    idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter;
+    // Clear the low 5 bits of the index, then spread it over the masks mss.
+    idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0;
+    idx_type ii = i & mss[0];
+    for (unsigned j = 1; j <= G; ++j) {
+      i *= 2;  // shift left by one bit before applying the next mask
+      ii |= i & mss[j];
+    }
+
+    auto p0 = rstate + 2 * ii + threadIdx.x % 32;  // this thread's base element
+
+    for (unsigned k = 0; k < gsize; ++k) {
+      rs[k] = *(p0 + xss[k]);
+      is[k] = *(p0 + xss[k] + 32);  // imag part is 32 elements after real
+    }
+
+    for (unsigned s = 0; s < gsize / rows; ++s) {  // one pass per slice
+      // Reload only when v holds a different slice. With rows == gsize the
+      // matrix loaded above stays valid, and for G <= 2 the 64-wide copy
+      // below would write past the end of v.
+      if (s > 0 || (iter > 0 && rows < gsize)) {
+        __syncthreads();  // every thread is done reading the previous slice
+        for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
+          v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+        }
+        __syncthreads();  // the new slice is complete
+      }
+
+      unsigned j = 0;  // read position in v
+
+      for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+        fp_type rn = 0;  // real part of row k of the product
+        fp_type in = 0;  // imaginary part of row k of the product
+
+        for (unsigned l = 0; l < gsize; ++l) {  // complex multiply-accumulate
+          fp_type rm = v[j++];
+          fp_type im = v[j++];
+          rn += rs[l] * rm;
+          rn -= is[l] * im;
+          in += rs[l] * im;
+          in += is[l] * rm;
+        }
+
+        re += rs[k] * rn;  // conj(rs[k] + i is[k]) * (rn + i in)
+        re += is[k] * in;
+        im += rs[k] * in;
+        im -= is[k] * rn;
+      }
+    }
+  }
+
+  __shared__ cfp_type partial1[64];  // one entry per thread
+  __shared__ cfp_type partial2[2];   // one entry per group of 32 threads
+
+  partial1[threadIdx.x].re = re;
+  partial1[threadIdx.x].im = im;
+
+  auto val = WarpReduce(partial1[threadIdx.x], op);
+
+  if (threadIdx.x % 32 == 0) {  // threads 0 and 32 store the reduced values
+    partial2[threadIdx.x / 32] = val;
+  }
+
+  __syncthreads();  // partial2 is read by thread 0 below
+
+  if (threadIdx.x == 0) {
+    result[blockIdx.x].re = partial2[0].re + partial2[1].re;
+    result[blockIdx.x].im = partial2[0].im + partial2[1].im;
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type,
+          typename Op, typename cfp_type>
+__global__ void ExpectationValueL_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
+    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
+    const unsigned* __restrict__ tis, unsigned num_iterations_per_block,
+    const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
+  // Accumulates conj(a) * (v0 * a) with inputs gathered through shared
+  // memory; one partial sum per block. blockDim.x must be equal to 32.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;  // number of amplitudes per group
+  constexpr unsigned rows = G < 5 ? gsize : (sizeof(fp_type) == 4 ?
+                                             (G < 6 ? 4 : 2) : (G < 6 ? 2 : 1));
+
+  fp_type rs[gsize], is[gsize];  // real and imaginary parts of the inputs
+
+  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];  // staging area
+  __shared__ fp_type v[2 * gsize * rows];  // current slice of v0 (re, im pairs)
+
+  if (G < 2) {  // the whole matrix has fewer than 32 elements
+    if (threadIdx.x < 2 * gsize * gsize) {
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {  // 32 per step
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+
+  double re = 0;  // per-thread sums, kept in double for any fp_type
+  double im = 0;
+
+  for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) {
+    idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter);
+    idx_type ii = i & mss[0];
+    for (unsigned j = 1; j <= G; ++j) {
+      i *= 2;  // shift left by one bit before applying the next mask
+      ii |= i & mss[j];
+    }
+
+    auto p0 = rstate + 2 * ii + threadIdx.x;  // this thread's base element
+
+    for (unsigned k = 0; k < gsize; ++k) {  // stage this thread's elements
+      rs0[threadIdx.x][k] = *(p0 + xss[k]);
+      is0[threadIdx.x][k] = *(p0 + xss[k] + 32);  // imag is 32 elements later
+    }
+    // NOTE(review): no barrier before other threads' rows are read - confirm.
+    for (unsigned k = 0; k < gsize; ++k) {  // gather inputs via tis and qis
+      unsigned i = tis[threadIdx.x] | qis[k];
+      unsigned m = i & 0x1f;  // row of the staging area
+      unsigned n = i / 32;    // column of the staging area
+
+      rs[k] = rs0[m][n];
+      is[k] = is0[m][n];
+    }
+    // v needs reloading only when rows < gsize; see the condition below.
+    for (unsigned s = 0; s < gsize / rows; ++s) {  // one pass per slice
+      // With rows == gsize the initial load stays valid; for G < 2 the
+      // 32-wide copy would also write past the end of v.
+      if (s > 0 || (iter > 0 && rows < gsize)) {
+        for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+          v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+        }
+      }
+
+      unsigned j = 0;  // read position in v
+      for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+        fp_type rn = 0;  // real part of row k of the product
+        fp_type in = 0;  // imaginary part of row k of the product
+
+        for (unsigned l = 0; l < gsize; ++l) {  // complex multiply-accumulate
+          fp_type rm = v[j++];
+          fp_type im = v[j++];
+          rn += rs[l] * rm;
+          rn -= is[l] * im;
+          in += rs[l] * im;
+          in += is[l] * rm;
+        }
+
+        re += rs[k] * rn;  // conj(rs[k] + i is[k]) * (rn + i in)
+        re += is[k] * in;
+        im += rs[k] * in;
+        im -= is[k] * rn;
+      }
+    }
+  }
+
+  __shared__ cfp_type partial[32];  // one entry per thread
+
+  partial[threadIdx.x].re = re;
+  partial[threadIdx.x].im = im;
+
+  auto val = WarpReduce(partial[threadIdx.x], op);
+
+  if (threadIdx.x == 0) {  // thread 0 publishes the block's partial sum
+    result[blockIdx.x].re = val.re;
+    result[blockIdx.x].im = val.im;
+  }
+}
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_CUDA_KERNELS_H_
diff --git a/qsim/simulator_custatevec.h b/qsim/simulator_custatevec.h
new file mode 100644
index 0000000..40d1902
--- /dev/null
+++ b/qsim/simulator_custatevec.h
@@ -0,0 +1,209 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUSTATEVEC_H_
+#define SIMULATOR_CUSTATEVEC_H_
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+
+#include <cublas_v2.h>
+#include <cuComplex.h>
+#include <custatevec.h>
+
+#include "io.h"
+#include "statespace_custatevec.h"
+#include "util_custatevec.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator using the NVIDIA cuStateVec library.
+ */
+template <typename FP = float>
+class SimulatorCuStateVec final {
+ public:
+  using StateSpace = StateSpaceCuStateVec<FP>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  static constexpr auto kStateType = StateSpace::kStateType;
+  static constexpr auto kMatrixType = StateSpace::kMatrixType;
+  static constexpr auto kExpectType = StateSpace::kExpectType;
+  static constexpr auto kComputeType = StateSpace::kComputeType;
+  static constexpr auto kMatrixLayout = StateSpace::kMatrixLayout;
+
+  explicit SimulatorCuStateVec(const cublasHandle_t& cublas_handle,
+                               const custatevecHandle_t& custatevec_handle)
+      : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle),
+      workspace_(nullptr), workspace_size_(0) {}
+
+  ~SimulatorCuStateVec() {
+    ErrorCheck(cudaFree(workspace_));  // releases the lazily grown workspace
+  }
+
+  /**
+   * Applies a gate using the NVIDIA cuStateVec library.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    if (qs.size() == 0) {  // no target qubits: scale the state by one scalar
+      uint64_t size = uint64_t{1} << state.num_qubits();
+
+      if (StateSpace::is_float) {
+        cuComplex a = {matrix[0], matrix[1]};
+        auto p = (cuComplex*) state.get();
+        ErrorCheck(cublasCscal(cublas_handle_, size, &a, p, 1));
+      } else {
+        cuDoubleComplex a = {matrix[0], matrix[1]};
+        auto p = (cuDoubleComplex*) state.get();
+        ErrorCheck(cublasZscal(cublas_handle_, size, &a, p, 1));
+      }
+    } else {
+      auto workspace_size = ApplyGateWorkSpaceSize(
+          state.num_qubits(), qs.size(), 0, matrix);
+      AllocWorkSpace(workspace_size);  // grows workspace_ when needed
+
+      ErrorCheck(custatevecApplyMatrix(
+                     custatevec_handle_, state.get(), kStateType,
+                     state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0,
+                     (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0,
+                     kComputeType, workspace_, workspace_size));
+    }
+  }
+
+  /**
+   * Applies a controlled gate using the NVIDIA cuStateVec library.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cmask Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cmask,
+                           const fp_type* matrix, State& state) const {
+    if (qs.size() == 0) {
+      IO::errorf(
+          "error: controlled global phase gate is not implemented %s %d\n",
+          __FILE__, __LINE__);
+      exit(1);
+    } else {
+      std::vector<int32_t> control_bits;  // one 0/1 entry per control qubit
+      control_bits.reserve(cqs.size());
+
+      for (std::size_t i = 0; i < cqs.size(); ++i) {
+        control_bits.push_back((cmask >> i) & 1);  // bit i pairs with cqs[i]
+      }
+
+      auto workspace_size = ApplyGateWorkSpaceSize(
+          state.num_qubits(), qs.size(), cqs.size(), matrix);
+      AllocWorkSpace(workspace_size);  // grows workspace_ when needed
+
+      ErrorCheck(custatevecApplyMatrix(
+                     custatevec_handle_, state.get(), kStateType,
+                     state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0,
+                     (int32_t*) qs.data(), qs.size(),
+                     (int32_t*) cqs.data(), control_bits.data(), cqs.size(),
+                     kComputeType, workspace_, workspace_size));
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using the NVIDIA cuStateVec
+   * library.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    auto workspace_size = ExpectationValueWorkSpaceSize(
+        state.num_qubits(), qs.size(), matrix);
+    AllocWorkSpace(workspace_size);  // grows workspace_ when needed
+
+    cuDoubleComplex eval;  // receives the result regardless of fp_type
+
+    ErrorCheck(custatevecComputeExpectation(
+                   custatevec_handle_, state.get(), kStateType,
+                   state.num_qubits(), &eval, kExpectType, nullptr, matrix,
+                   kMatrixType, kMatrixLayout, (int32_t*) qs.data(), qs.size(),
+                   kComputeType, workspace_, workspace_size));
+
+    return {cuCreal(eval), cuCimag(eval)};
+  }
+
+  /**
+   * @return The size of SIMD register if applicable.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 32;
+  }
+
+ private:
+  size_t ApplyGateWorkSpaceSize(  // workspace bytes for custatevecApplyMatrix
+      unsigned num_qubits, unsigned num_targets, unsigned num_controls,
+      const fp_type* matrix) const {
+    size_t size;
+
+    ErrorCheck(custatevecApplyMatrixGetWorkspaceSize(
+                   custatevec_handle_, kStateType, num_qubits, matrix,
+                   kMatrixType, kMatrixLayout, 0, num_targets, num_controls,
+                   kComputeType, &size));
+
+    return size;
+  }
+
+  size_t ExpectationValueWorkSpaceSize(  // same, for the expectation call
+      unsigned num_qubits, unsigned num_targets, const fp_type* matrix) const {
+    size_t size;
+
+    ErrorCheck(custatevecComputeExpectationGetWorkspaceSize(
+                   custatevec_handle_, kStateType, num_qubits, matrix,
+                   kMatrixType, kMatrixLayout, num_targets, kComputeType,
+                   &size));
+
+    return size;
+  }
+
+  void* AllocWorkSpace(size_t size) const {  // grow-only device workspace
+    if (size > workspace_size_) {
+      if (workspace_ != nullptr) {
+        ErrorCheck(cudaFree(workspace_));
+      }
+
+      ErrorCheck(cudaMalloc(const_cast<void**>(&workspace_), size));
+      // const_cast may only drop const; the target must match the size_t
+      const_cast<size_t&>(workspace_size_) = size;  // type of the member.
+    }
+
+    return workspace_;
+  }
+
+  const cublasHandle_t cublas_handle_;
+  const custatevecHandle_t custatevec_handle_;
+
+  void* workspace_;        // device buffer obtained with cudaMalloc
+  size_t workspace_size_;  // size last requested for workspace_
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_CUSTATEVEC_H_
diff --git a/qsim/simulator_sse.h b/qsim/simulator_sse.h
new file mode 100644
index 0000000..5256c53
--- /dev/null
+++ b/qsim/simulator_sse.h
@@ -0,0 +1,864 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_SSE_H_
+#define SIMULATOR_SSE_H_
+
+#include <smmintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "statespace_sse.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator with SSE vectorization.
+ */
+template <typename For>
+class SimulatorSSE final : public SimulatorBase {
+ public:
+  using StateSpace = StateSpaceSSE<For>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  template <typename... ForArgs>
+  explicit SimulatorSSE(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate using SSE instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... . Targets with index <= 1 are "low";
+    // kernels are picked as ApplyGateH<all high> or ApplyGateL<high, low>.
+    switch (qs.size()) {
+    case 0:
+      ApplyGateH<0>(qs, matrix, state);
+      break;
+    case 1:
+      if (qs[0] > 1) {
+        ApplyGateH<1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 1) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 1) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 1) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 1) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 1) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: gates on more than 6 qubits are silently skipped.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate using SSE instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
+    // With no control qubits this reduces to a plain ApplyGate call.
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+    // Kernel choice depends on whether qs[0], qs[1] and cqs[0] exceed 1.
+    switch (qs.size()) {
+    case 0:
+      if (cqs[0] > 1) {
+        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
+      } else {
+        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
+      }
+      break;
+    case 1:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: more than 4 target qubits; the gate is skipped.
+      break;
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using SSE instructions.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Dispatch on how many of the sorted targets are low (index <= 1).
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 1) {
+        return ExpectationValueH<1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 1) {
+        return ExpectationValueH<2>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<1, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 1) {
+        return ExpectationValueH<3>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<2, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<1, 2>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 1) {
+        return ExpectationValueH<4>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<3, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<2, 2>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 1) {
+        return ExpectationValueH<5>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<4, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<3, 2>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 1) {
+        return ExpectationValueH<6>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<5, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<4, 2>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented (0 or more than 6 qubits): falls through to 0.
+      break;
+    }
+
+    return 0;
+  }
+
+  /**
+   * @return The size of SIMD register if applicable (fixed at 4 here).
+   */
+  static unsigned SIMDRegisterSize() {
+    return 4;
+  }
+
+ private:
+  template <unsigned H>
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // Work item i: multiplies 2^H amplitude groups by the matrix v in place.
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      i *= 4;
+      // Deposit the bits of i through masks ms[0..H], doubling i each step.
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+      // Each group is 4 real parts followed by 4 imaginary parts.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      uint64_t j = 0;
+      // v is consumed as consecutive (re, im) pairs, one matrix row per k.
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm_set1_ps(v[j]);
+        iu = _mm_set1_ps(v[j + 1]);
+        rn = _mm_mul_ps(rs[0], ru);
+        in = _mm_mul_ps(rs[0], iu);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+        // Store row k of the product over the k-th loaded group.
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // size = 2^(num_qubits - 2 - H) work items, clamped to at least one.
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, state.get());
+  }
+
+  template <unsigned H, unsigned L>
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned q0, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // Work item i: applies a gate with H high and L low target qubits.
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];
+
+      i *= 4;
+      // Deposit the bits of i through masks ms[0..H], doubling i each step.
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+      // Each group is 4 real parts followed by 4 imaginary parts.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
+        // Shuffles: 177 swaps lane pairs, 78 swaps halves, 57/147 rotate.
+        if (L == 1) {
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      uint64_t j = 0;
+      // w[j] / w[j + 1] are per-lane real / imaginary matrix coefficients.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+        // Store row k of the product over the k-th loaded group.
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H + L)];
+    // w is filled below by FillMatrix from `matrix` and the mask m.qmaskl.
+    auto m = GetMasks11<L>(qs);
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
+    // size = 2^(num_qubits - 2 - H) work items, clamped to at least one.
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss, qs[0], state.get());
+  }
+
+  template <unsigned H>
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // Work item i: applies the 2^H x 2^H matrix v where the controls match.
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      i *= 4;
+      // Deposit the bits of i through masks ms[0..H], doubling i each step.
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Skip items whose control bits (ii & cmaskh) differ from cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + 2 * ii;
+      // Each group is 4 real parts followed by 4 imaginary parts.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      uint64_t j = 0;
+      // v is consumed as consecutive (re, im) pairs, one matrix row per k.
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm_set1_ps(v[j]);
+        iu = _mm_set1_ps(v[j + 1]);
+        rn = _mm_mul_ps(rs[0], ru);
+        in = _mm_mul_ps(rs[0], iu);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+        // Store row k of the product over the k-th loaded group.
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // size = 2^(num_qubits - 2 - H) work items, clamped to at least one.
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  template <unsigned H>
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // Work item i: applies the per-lane matrix w where high controls match.
+      __m128 rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      i *= 4;
+      // Deposit the bits of i through masks ms[0..H], doubling i each step.
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Skip items whose control bits (ii & cmaskh) differ from cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + 2 * ii;
+      // Each group is 4 real parts followed by 4 imaginary parts.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      uint64_t j = 0;
+      // w[j] / w[j + 1] are per-lane real / imaginary matrix coefficients.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+        // Store row k of the product over the k-th loaded group.
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H)];
+    // FillControlledMatrixH builds w from matrix and the low-control masks.
+    auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    FillControlledMatrixH<H, 2>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+    // size = 2^(num_qubits - 2 - H) work items, clamped to at least one.
+    unsigned r = 2 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, unsigned q0, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // Work item i: controlled gate with H high and L low target qubits.
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];
+
+      i *= 4;
+      // Deposit the bits of i through masks ms[0..H], doubling i each step.
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Skip items whose control bits (ii & cmaskh) differ from cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + 2 * ii;
+      // Each group is 4 real parts followed by 4 imaginary parts.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
+        // Shuffles: 177 swaps lane pairs, 78 swaps halves, 57/147 rotate.
+        if (L == 1) {
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      uint64_t j = 0;
+      // w[j] / w[j + 1] are per-lane real / imaginary matrix coefficients.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+        // Store row k of the product over the k-th loaded group.
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H + L)];
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    // size = 2^(num_qubits - 2 - H) work items, clamped to at least one.
+    unsigned r = 2 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+    // CH: controls are all high; otherwise low controls are folded into w.
+    if (CH) {
+      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
+      FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get());
+    } else {
+      auto m = GetMasks10<L, 2>(state.num_qubits(), qs, cqs, cvals);
+      FillControlledMatrixL<H, L, 2>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get());
+    }
+  }
+
+  template <unsigned H>
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // Work item i: returns this item's partial sum; the state is only read.
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      i *= 4;
+      // Deposit the bits of i through masks ms[0..H], doubling i each step.
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+      // Each group is 4 real parts followed by 4 imaginary parts.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // v is consumed as consecutive (re, im) pairs, one matrix row per k.
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm_set1_ps(v[j]);
+        iu = _mm_set1_ps(v[j + 1]);
+        rn = _mm_mul_ps(rs[0], ru);
+        in = _mm_mul_ps(rs[0], iu);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+        // conj(amplitude k) times row k of the product, lane by lane.
+        __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[k], rn), _mm_mul_ps(is[k], in));
+        __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[k], in), _mm_mul_ps(is[k], rn));
+
+        re += detail::HorizontalSumSSE(v_re);
+        im += detail::HorizontalSumSSE(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // size = 2^(num_qubits - 2 - H) work items, clamped to at least one.
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-item results are combined by RunReduce using std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
+  }
+
+  template <unsigned H, unsigned L>
+  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, unsigned q0,
+                const fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // Work item i: returns this item's partial sum; the state is only read.
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];
+
+      i *= 4;
+      // Deposit the bits of i through masks ms[0..H], doubling i each step.
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+      // Each group is 4 real parts followed by 4 imaginary parts.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
+        // Shuffles: 177 swaps lane pairs, 78 swaps halves, 57/147 rotate.
+        if (L == 1) {
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // w[j] / w[j + 1] are per-lane real / imaginary matrix coefficients.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+        // rs[m] / is[m] are the unshuffled values loaded for group k.
+        unsigned m = lsize * k;
+        // conj(amplitude) times the matching row of the product, per lane.
+        __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in));
+        __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn));
+
+        re += detail::HorizontalSumSSE(v_re);
+        im += detail::HorizontalSumSSE(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H + L)];
+    // w is filled below by FillMatrix from `matrix` and the mask m.qmaskl.
+    auto m = GetMasks11<L>(qs);
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
+    // size = 2^(num_qubits - 2 - H) work items, clamped to at least one.
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Per-item results are combined by RunReduce using std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], state.get());
+  }
+
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_SSE_H_
diff --git a/qsim/statespace.h b/qsim/statespace.h
new file mode 100644
index 0000000..2b0c9af
--- /dev/null
+++ b/qsim/statespace.h
@@ -0,0 +1,145 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_H_
+#define STATESPACE_H_
+
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+
+#include "util.h"
+
+namespace qsim {
+
+/**
+ * Abstract class containing context and routines for general state-vector
+ * manipulations. "AVX", "AVX512", "Basic", and "SSE" implementations are
+ * provided.
+ */
+template <typename Impl,
+          template<typename...> class VectorSpace, typename... VSTypeParams>
+class StateSpace : public VectorSpace<Impl, VSTypeParams...> {
+ private:
+  using Base = VectorSpace<Impl, VSTypeParams...>;
+
+ public:
+  using fp_type = typename Base::fp_type;
+  using State = typename Base::Vector;
+
+  /**
+   * The observed state from a Measurement gate.
+   */
+  struct MeasurementResult {
+    /**
+     * A bitmask of all qubits measured in this result. In this format, if the
+     * qubit at index `i` is measured, the `i`th bit of `mask` is a one.
+     */
+    uint64_t mask;
+    /**
+     * A bitwise representation of the measured states. In this format, the
+     * qubit at index `i` is represented by the `i`th bit of `bits`.
+     * If `valid` is true, `mask` has already been applied to this field
+     * (i.e. `bits == bits & mask`).
+     */
+    uint64_t bits;
+    /**
+     * Observed states of the measured qubits. This vector only includes qubits
+     * specified by the associated Measurement gate.
+     */
+    std::vector<unsigned> bitstring;
+    /**
+     * Validation bit. If this is false, the measurement failed and all other
+     * fields of the result are invalid.
+     */
+    bool valid;
+  };
+
+  template <typename... Args>
+  StateSpace(Args&&... args) : Base(args...) {}
+
+  double Norm(const State& state) const {
+    auto partial_norms = static_cast<const Impl&>(*this).PartialNorms(state);
+    // Accumulate from zero so an empty partial-norm list yields 0, not UB.
+    double norm = 0;
+    for (std::size_t i = 0; i < partial_norms.size(); ++i) {
+      norm += partial_norms[i];
+    }
+
+    return norm;
+  }
+
+  template <typename RGen>
+  MeasurementResult Measure(const std::vector<unsigned>& qubits,
+                            RGen& rgen, State& state) const {
+    auto result =
+        static_cast<const Impl&>(*this).VirtualMeasure(qubits, rgen, state);
+    // Collapse the state onto the sampled outcome only if sampling succeeded.
+    if (result.valid) {
+      static_cast<const Impl&>(*this).Collapse(result, state);
+    }
+
+    return result;
+  }
+
+  template <typename RGen>
+  MeasurementResult VirtualMeasure(const std::vector<unsigned>& qubits,
+                                   RGen& rgen, const State& state) const {
+    // Samples a measurement outcome for `qubits`; `state` is only read.
+    MeasurementResult result{};  // value-init: no indeterminate fields
+    result.valid = true;
+    result.mask = 0;
+    // Build the mask of measured qubits, rejecting out-of-range indices.
+    for (auto q : qubits) {
+      if (q >= state.num_qubits()) {
+        result.valid = false;
+        return result;
+      }
+
+      result.mask |= uint64_t{1} << q;
+    }
+
+    auto partial_norms = static_cast<const Impl&>(*this).PartialNorms(state);
+    // Prefix-sum the partial norms so that back() is the total norm.
+    for (std::size_t i = 1; i < partial_norms.size(); ++i) {
+      partial_norms[i] += partial_norms[i - 1];
+    }
+
+    auto norm = partial_norms.back();
+    auto r = RandomValue(rgen, norm);
+    // Pick the first entry whose cumulative norm reaches r. The index is
+    // clamped to the last entry so the scan cannot run past the end.
+    unsigned m = 0;
+    while (m + 1 < partial_norms.size() && r > partial_norms[m]) ++m;
+    if (m > 0) r -= partial_norms[m - 1];  // residual within entry m
+
+    // Impl resolves (entry m, residual r) to the measured bit values.
+    result.bits = static_cast<const Impl&>(*this).FindMeasuredBits(
+        m, r, result.mask, state);
+
+    result.bitstring.reserve(qubits.size());
+    result.bitstring.resize(0);
+    // Report the outcome of each requested qubit, in the caller's order.
+    for (auto q : qubits) {
+      result.bitstring.push_back((result.bits >> q) & 1);
+    }
+
+    return result;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_H_
diff --git a/qsim/statespace_avx.h b/qsim/statespace_avx.h
new file mode 100644
index 0000000..876058b
--- /dev/null
+++ b/qsim/statespace_avx.h
@@ -0,0 +1,497 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_AVX_H_
+#define STATESPACE_AVX_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <functional>
+
+#include "statespace.h"
+#include "util.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace detail {
+
+inline __m256i GetZeroMaskAVX(uint64_t i, uint64_t mask, uint64_t bits) {
+  __m256i s1 = _mm256_setr_epi64x(i + 0, i + 2, i + 4, i + 6);
+  __m256i s2 = _mm256_setr_epi64x(i + 1, i + 3, i + 5, i + 7);
+  __m256i ma = _mm256_set1_epi64x(mask);
+  __m256i bi = _mm256_set1_epi64x(bits);
+  // s1/s2 hold the even/odd indices i..i+7 as 64-bit lanes; apply the mask.
+  s1 = _mm256_and_si256(s1, ma);
+  s2 = _mm256_and_si256(s2, ma);
+  // Each 64-bit lane becomes all ones where (index & mask) == bits, else 0.
+  s1 = _mm256_cmpeq_epi64(s1, bi);
+  s2 = _mm256_cmpeq_epi64(s2, bi);
+  // Interleave so that 32-bit lane j carries the result for index i + j.
+  return _mm256_blend_epi32(s1, s2, 170);  // 10101010
+}
+
+inline double HorizontalSumAVX(__m256 s) {
+  __m128 l = _mm256_castps256_ps128(s);
+  __m128 h = _mm256_extractf128_ps(s, 1);
+  __m128 s1  = _mm_add_ps(h, l);
+  __m128 s1s = _mm_movehdup_ps(s1);
+  __m128 s2 = _mm_add_ps(s1, s1s);
+  // Lane 0 of the final add is the float sum of all eight lanes of s.
+  return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2)));
+}
+
+}  // namespace detail
+
+/**
+ * Object containing context and routines for AVX state-vector manipulations.
+ * State is a vectorized sequence of eight real components followed by eight
+ * imaginary components. Eight single-precision floating-point numbers can be
+ * loaded into an AVX register.
+ */
+template <typename For>
+class StateSpaceAVX :
+    public StateSpace<StateSpaceAVX<For>, VectorSpace, For, float> {
+ private:
+  using Base = StateSpace<StateSpaceAVX<For>, qsim::VectorSpace, For, float>;
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit StateSpaceAVX(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits));
+  };
+
+  void InternalToNormalOrder(State& state) const {
+    if (state.num_qubits() == 1) {
+      fp_type* s = state.get();
+
+      s[2] = s[1];
+      s[1] = s[8];
+      s[3] = s[9];
+
+      for (uint64_t i = 4; i < 16; ++i) {
+        s[i] = 0;
+      }
+    } else if (state.num_qubits() == 2) {
+      fp_type* s = state.get();
+
+      s[6] = s[3];
+      s[4] = s[2];
+      s[2] = s[1];
+      s[1] = s[8];
+      s[3] = s[9];
+      s[5] = s[10];
+      s[7] = s[11];
+
+      for (uint64_t i = 8; i < 16; ++i) {
+        s[i] = 0;
+      }
+    } else {
+      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
+        fp_type* s = p + 16 * i;
+
+        fp_type re[7];
+        fp_type im[7];
+
+        for (uint64_t i = 0; i < 7; ++i) {
+          re[i] = s[i + 1];
+          im[i] = s[i + 8];
+        }
+
+        for (uint64_t i = 0; i < 7; ++i) {
+          s[2 * i + 1] = im[i];
+          s[2 * i + 2] = re[i];
+        }
+      };
+
+      Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get());
+    }
+  }
+
+  void NormalToInternalOrder(State& state) const {
+    if (state.num_qubits() == 1) {
+      fp_type* s = state.get();
+
+      s[8] = s[1];
+      s[1] = s[2];
+      s[9] = s[3];
+
+      for (uint64_t i = 2; i < 8; ++i) {
+        s[i] = 0;
+        s[i + 8] = 0;
+      }
+    } else if (state.num_qubits() == 2) {
+      fp_type* s = state.get();
+
+      s[8] = s[1];
+      s[9] = s[3];
+      s[10] = s[5];
+      s[11] = s[7];
+      s[1] = s[2];
+      s[2] = s[4];
+      s[3] = s[6];
+
+      for (uint64_t i = 4; i < 8; ++i) {
+        s[i] = 0;
+        s[i + 8] = 0;
+      }
+    } else {
+      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
+        fp_type* s = p + 16 * i;
+
+        fp_type re[7];
+        fp_type im[7];
+
+        for (uint64_t i = 0; i < 7; ++i) {
+          im[i] = s[2 * i + 1];
+          re[i] = s[2 * i + 2];
+        }
+
+        for (uint64_t i = 0; i < 7; ++i) {
+          s[i + 1] = re[i];
+          s[i + 8] = im[i];
+        }
+      };
+
+      Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get());
+    }
+  }
+
+  void SetAllZeros(State& state) const {
+    __m256 val0 = _mm256_setzero_ps();
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) {
+      _mm256_store_ps(p + 16 * i, val);
+      _mm256_store_ps(p + 16 * i + 8, val);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get());
+  }
+
+  // Uniform superposition.
+  void SetStateUniform(State& state) const {
+    __m256 val0 = _mm256_setzero_ps();
+    __m256 valu;
+
+    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
+
+    switch (state.num_qubits()) {
+    case 1:
+      valu = _mm256_set_ps(0, 0, 0, 0, 0, 0, v, v);
+      break;
+    case 2:
+      valu = _mm256_set_ps(0, 0, 0, 0, v, v, v, v);
+      break;
+    default:
+      valu = _mm256_set1_ps(v);
+      break;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                __m256& val0, __m256 valu, fp_type* p) {
+      _mm256_store_ps(p + 16 * i, valu);
+      _mm256_store_ps(p + 16 * i + 8, val0);
+    };
+
+    Base::for_.Run(
+        MinSize(state.num_qubits()) / 16, f, val0, valu, state.get());
+  }
+
+  // |0> state.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    state.get()[0] = 1;
+  }
+
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    uint64_t k = (16 * (i / 8)) + (i % 8);
+    return std::complex<fp_type>(state.get()[k], state.get()[k + 8]);
+  }
+
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    uint64_t k = (16 * (i / 8)) + (i % 8);
+    state.get()[k] = std::real(ampl);
+    state.get()[k + 8] = std::imag(ampl);
+  }
+
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    uint64_t k = (16 * (i / 8)) + (i % 8);
+    state.get()[k] = re;
+    state.get()[k + 8] = im;
+  }
+
+  // Sets state[i] = val where (i & mask) == bits.
+  // If `exclude` is true then the criterion becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // If `exclude` is true then the criterion becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    __m256 re_reg = _mm256_set1_ps(re);
+    __m256 im_reg = _mm256_set1_ps(im);
+
+    __m256i exclude_reg = _mm256_setzero_si256();
+    if (exclude) {
+      exclude_reg = _mm256_cmpeq_epi32(exclude_reg, exclude_reg);
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
+                uint64_t bitsv, __m256 re_n, __m256 im_n, __m256i exclude_n,
+                fp_type* p) {
+      __m256 ml = _mm256_castsi256_ps(_mm256_xor_si256(
+          detail::GetZeroMaskAVX(8 * i, maskv, bitsv), exclude_n));
+
+      __m256 re = _mm256_load_ps(p + 16 * i);
+      __m256 im = _mm256_load_ps(p + 16 * i + 8);
+
+      re = _mm256_blendv_ps(re, re_n, ml);
+      im = _mm256_blendv_ps(im, im_n, ml);
+
+      _mm256_store_ps(p + 16 * i, re);
+      _mm256_store_ps(p + 16 * i + 8, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, mask, bits, re_reg,
+                   im_reg, exclude_reg, state.get());
+  }
+
+  // Does the equivalent of dest += src elementwise.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, fp_type* p2) {
+      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
+      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
+      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
+      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
+
+      _mm256_store_ps(p2 + 16 * i, _mm256_add_ps(re1, re2));
+      _mm256_store_ps(p2 + 16 * i + 8, _mm256_add_ps(im1, im2));
+    };
+
+    Base::for_.Run(MinSize(src.num_qubits()) / 16, f, src.get(), dest.get());
+
+    return true;
+  }
+
+  // Does the equivalent of state *= a elementwise.
+  void Multiply(fp_type a, State& state) const {
+    __m256 r = _mm256_set1_ps(a);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m256 r, fp_type* p) {
+      __m256 re = _mm256_load_ps(p + 16 * i);
+      __m256 im = _mm256_load_ps(p + 16 * i + 8);
+
+      re = _mm256_mul_ps(re, r);
+      im = _mm256_mul_ps(im, r);
+
+      _mm256_store_ps(p + 16 * i, re);
+      _mm256_store_ps(p + 16 * i + 8, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, r, state.get());
+  }
+
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
+      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
+      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
+      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
+      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
+
+      __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2));
+      __m256 ip_im = _mm256_fnmadd_ps(im1, re2, _mm256_mul_ps(re1, im2));
+
+      double re = detail::HorizontalSumAVX(ip_re);
+      double im = detail::HorizontalSumAVX(ip_im);
+
+      return std::complex<double>{re, im};
+    };
+
+    using Op = std::plus<std::complex<double>>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> double {
+      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
+      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
+      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
+      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
+
+      __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2));
+
+      return detail::HorizontalSumAVX(ip_re);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+    // Samples basis-state indices against the cumulative sum of |amplitude|^2.
+    if (num_samples > 0) {
+      double norm = 0;
+      uint64_t size = MinSize(state.num_qubits()) / 16;
+      const fp_type* p = state.get();
+      // First pass: total of |amplitude|^2 (blocks of 8 re followed by 8 im).
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 8; ++j) {
+          double re = p[16 * k + j];
+          double im = p[16 * k + 8 + j];
+          norm += re * re + im * im;
+        }
+      }
+
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+      // NOTE(review): the single forward sweep below presumes rs is ascending.
+      uint64_t m = 0;
+      double csum = 0;
+      bitstrings.reserve(num_samples);
+      // Second pass: emit 8 * k + j for each rs[m] below the running sum.
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 8; ++j) {
+          double re = p[16 * k + j];
+          double im = p[16 * k + 8 + j];
+          csum += re * re + im * im;
+          while (m < num_samples && rs[m] < csum) {  // check m before rs[m]
+            bitstrings.emplace_back(8 * k + j);
+            ++m;
+          }
+        }
+      }
+      // Samples not reached by the running sum get the last basis state.
+      for (; m < num_samples; ++m) {
+        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
+      }
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    auto f1 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
+      __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits);
+      // Masked loads: lanes where (index & mask) != bits are read as zero.
+      __m256 re = _mm256_maskload_ps(p + 16 * i, ml);
+      __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml);
+      __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX(s1);
+    };
+    // Pass 1: sum |amplitude|^2 over the amplitudes consistent with mr.bits.
+    using Op = std::plus<double>;
+    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 16, f1,
+                                       Op(), mr.mask, mr.bits, state.get());
+    // NOTE(review): a zero norm yields an infinite scale factor here.
+    __m256 renorm = _mm256_set1_ps(1.0 / std::sqrt(norm));
+
+    auto f2 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, __m256 renorm, fp_type* p) {
+      __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits);
+      // Non-matching lanes load as zero, so the stores below clear them.
+      __m256 re = _mm256_maskload_ps(p + 16 * i, ml);
+      __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml);
+
+      re = _mm256_mul_ps(re, renorm);
+      im = _mm256_mul_ps(im, renorm);
+
+      _mm256_store_ps(p + 16 * i, re);
+      _mm256_store_ps(p + 16 * i + 8, im);
+    };
+    // Pass 2: rescale the surviving amplitudes and zero all the others.
+    Base::for_.Run(MinSize(state.num_qubits()) / 16, f2,
+                   mr.mask, mr.bits, renorm, state.get());
+  }
+
+  std::vector<double> PartialNorms(const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p) -> double {
+      __m256 re = _mm256_load_ps(p + 16 * i);
+      __m256 im = _mm256_load_ps(p + 16 * i + 8);
+      __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX(s1);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduceP(
+        MinSize(state.num_qubits()) / 16, f, Op(), state.get());
+  }
+
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    // Walks the block range [first, last) that for_ reports for m and returns
+    // the masked index of the first amplitude whose running sum exceeds r.
+    const uint64_t num_blocks = MinSize(state.num_qubits()) / 16;
+    const uint64_t first = Base::for_.GetIndex0(num_blocks, m);
+    const uint64_t last = Base::for_.GetIndex1(num_blocks, m);
+    const fp_type* data = state.get();
+    double running = 0;
+
+    for (uint64_t blk = first; blk < last; ++blk) {
+      const fp_type* re = data + 16 * blk;  // 8 real parts, then 8 imaginary
+      for (uint64_t lane = 0; lane < 8; ++lane) {
+        const fp_type x = re[lane];
+        const fp_type y = re[lane + 8];
+        running += x * x + y * y;
+        if (r < running) return (8 * blk + lane) & mask;
+      }
+    }
+
+    // Fall back to the last index of the range if r was never exceeded.
+    return (8 * last - 1) & mask;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_AVX_H_
diff --git a/qsim/statespace_avx512.h b/qsim/statespace_avx512.h
new file mode 100644
index 0000000..879fd89
--- /dev/null
+++ b/qsim/statespace_avx512.h
@@ -0,0 +1,448 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_AVX512_H_
+#define STATESPACE_AVX512_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <functional>
+
+#include "statespace.h"
+#include "util.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace detail {
+
+inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) {
+  __m512i s1 = _mm512_setr_epi64(
+      i + 0, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7);
+  __m512i s2 = _mm512_setr_epi64(
+      i + 8, i + 9, i + 10, i + 11, i + 12, i + 13, i + 14, i + 15);
+  __m512i ma = _mm512_set1_epi64(mask);
+  __m512i bi = _mm512_set1_epi64(bits);
+  // s1/s2 hold indices i..i+7 and i+8..i+15 as 64-bit lanes; apply the mask.
+  s1 = _mm512_and_si512(s1, ma);
+  s2 = _mm512_and_si512(s2, ma);
+  // Each compare yields 8 mask bits, set where (index & mask) == bits.
+  unsigned m1 = _mm512_cmpeq_epu64_mask(s1, bi);
+  unsigned m2 = _mm512_cmpeq_epu64_mask(s2, bi);
+  // Result bit j (0 <= j < 16) is set iff ((i + j) & mask) == bits.
+  return (m2 << 8) | m1;
+}
+
+inline double HorizontalSumAVX(__m256 s) {
+  __m128 l = _mm256_castps256_ps128(s);
+  __m128 h = _mm256_extractf128_ps(s, 1);
+  __m128 s1  = _mm_add_ps(h, l);
+  __m128 s1s = _mm_movehdup_ps(s1);
+  __m128 s2 = _mm_add_ps(s1, s1s);
+  // Lane 0 of the final add is the float sum of all eight lanes of s.
+  return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2)));
+}
+
+inline double HorizontalSumAVX512(__m512 s) {
+  __m256 l = _mm512_castps512_ps256(s);
+  __m512d sd = _mm512_castps_pd(s);
+  __m256d hd = _mm512_extractf64x4_pd(sd, 1);
+  __m256 h = _mm256_castpd_ps(hd);
+  __m256 p = _mm256_add_ps(h, l);
+  // p holds upper half + lower half lane-wise; finish with the 8-lane sum.
+  return HorizontalSumAVX(p);
+}
+
+}  // namespace detail
+
+/**
+ * Object containing context and routines for AVX512 state-vector manipulations.
+ * State is a vectorized sequence of sixteen real components followed by
+ * sixteen imaginary components. Sixteen single-precision floating-point numbers
+ * can be loaded into an AVX512 register.
+ */
+template <typename For>
+class StateSpaceAVX512 :
+    public StateSpace<StateSpaceAVX512<For>, VectorSpace, For, float> {
+ private:
+  using Base = StateSpace<StateSpaceAVX512<For>, qsim::VectorSpace, For, float>;
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit StateSpaceAVX512(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits));
+  };
+
+  void InternalToNormalOrder(State& state) const {
+    __m512i idx1 = _mm512_setr_epi32(
+        0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+    __m512i idx2 = _mm512_setr_epi32(
+        8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                __m512i idx1, __m512i idx2, fp_type* p) {
+      __m512 v1 = _mm512_load_ps(p + 32 * i);
+      __m512 v2 = _mm512_load_ps(p + 32 * i + 16);
+
+      _mm512_store_ps(p + 32 * i,  _mm512_permutex2var_ps(v1, idx1, v2));
+      _mm512_store_ps(p + 32 * i + 16,  _mm512_permutex2var_ps(v1, idx2, v2));
+    };
+
+    Base::for_.Run(
+        MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get());
+  }
+
+  void NormalToInternalOrder(State& state) const {
+    __m512i idx1 = _mm512_setr_epi32(
+        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+    __m512i idx2 = _mm512_setr_epi32(
+        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                __m512i idx1, __m512i idx2, fp_type* p) {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+
+      _mm512_store_ps(p + 32 * i,  _mm512_permutex2var_ps(re, idx1, im));
+      _mm512_store_ps(p + 32 * i + 16,  _mm512_permutex2var_ps(re, idx2, im));
+    };
+
+    Base::for_.Run(
+        MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get());
+  }
+
+  void SetAllZeros(State& state) const {
+    __m512 val0 = _mm512_setzero_ps();
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) {
+      _mm512_store_ps(p + 32 * i, val0);
+      _mm512_store_ps(p + 32 * i + 16, val0);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get());
+  }
+
+  // Uniform superposition.
+  void SetStateUniform(State& state) const {
+    __m512 val0 = _mm512_setzero_ps();
+    __m512 valu;
+
+    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
+
+    switch (state.num_qubits()) {
+    case 1:
+      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v);
+      break;
+    case 2:
+      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v);
+      break;
+    case 3:
+      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v, v, v, v, v);
+      break;
+    default:
+      valu = _mm512_set1_ps(v);
+      break;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const __m512& val0, const __m512& valu, fp_type* p) {
+      _mm512_store_ps(p + 32 * i, valu);
+      _mm512_store_ps(p + 32 * i + 16, val0);
+    };
+
+    Base::for_.Run(
+        MinSize(state.num_qubits()) / 32, f, val0, valu, state.get());
+  }
+
+  // |0> state.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    state.get()[0] = 1;
+  }
+
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    return std::complex<fp_type>(state.get()[p], state.get()[p + 16]);
+  }
+
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    state.get()[p] = std::real(ampl);
+    state.get()[p + 16] = std::imag(ampl);
+  }
+
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    state.get()[p] = re;
+    state.get()[p + 16] = im;
+  }
+
+  // Sets state[i] = val where (i & mask) == bits.
+  // If `exclude` is true then the criterion becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // If `exclude` is true then the criterion becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    __m512 re_reg = _mm512_set1_ps(re);
+    __m512 im_reg = _mm512_set1_ps(im);
+
+    __mmask16 exclude_n = exclude ? 0xffff : 0;
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
+                uint64_t bitsv, __m512 re_n, __m512 im_n, __mmask16 exclude_n,
+                fp_type* p) {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+
+      __mmask16 ml =
+          detail::GetZeroMaskAVX512(16 * i, maskv, bitsv) ^ exclude_n;
+
+      re = _mm512_mask_blend_ps(ml, re, re_n);
+      im = _mm512_mask_blend_ps(ml, im, im_n);
+
+      _mm512_store_ps(p + 32 * i, re);
+      _mm512_store_ps(p + 32 * i + 16, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, mask, bits,
+                   re_reg, im_reg, exclude_n, state.get());
+  }
+
+  // Does the equivalent of dest += src elementwise.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, fp_type* p2) {
+      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
+      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
+      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
+      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
+
+      _mm512_store_ps(p2 + 32 * i, _mm512_add_ps(re1, re2));
+      _mm512_store_ps(p2 + 32 * i + 16, _mm512_add_ps(im1, im2));
+    };
+
+    Base::for_.Run(MinSize(src.num_qubits()) / 32, f, src.get(), dest.get());
+
+    return true;
+  }
+
+  // Does the equivalent of state *= a elementwise.
+  void Multiply(fp_type a, State& state) const {
+    __m512 r = _mm512_set1_ps(a);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 r, fp_type* p) {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+
+      _mm512_store_ps(p + 32 * i, _mm512_mul_ps(re, r));
+      _mm512_store_ps(p + 32 * i + 16, _mm512_mul_ps(im, r));
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, r, state.get());
+  }
+
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
+      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
+      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
+      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
+      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
+
+      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
+      __m512 ip_im = _mm512_fnmadd_ps(im1, re2, _mm512_mul_ps(re1, im2));
+
+      double re = detail::HorizontalSumAVX512(ip_re);
+      double im = detail::HorizontalSumAVX512(ip_im);
+
+      return std::complex<double>{re, im};
+    };
+
+    using Op = std::plus<std::complex<double>>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> double {
+      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
+      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
+      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
+      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
+
+      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
+
+      return detail::HorizontalSumAVX512(ip_re);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+    // Samples basis-state indices against the cumulative sum of |amplitude|^2.
+    if (num_samples > 0) {
+      double norm = 0;
+      uint64_t size = MinSize(state.num_qubits()) / 32;
+      const fp_type* p = state.get();
+      // First pass: total of |amplitude|^2 (blocks of 16 re followed by 16 im).
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 16; ++j) {
+          double re = p[32 * k + j];
+          double im = p[32 * k + 16 + j];
+          norm += re * re + im * im;
+        }
+      }
+
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+      // NOTE(review): the single forward sweep below presumes rs is ascending.
+      uint64_t m = 0;
+      double csum = 0;
+      bitstrings.reserve(num_samples);
+      // Second pass: emit 16 * k + j for each rs[m] below the running sum.
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 16; ++j) {
+          double re = p[32 * k + j];
+          double im = p[32 * k + 16 + j];
+          csum += re * re + im * im;
+          while (m < num_samples && rs[m] < csum) {  // check m before rs[m]
+            bitstrings.emplace_back(16 * k + j);
+            ++m;
+          }
+        }
+      }
+      // Samples not reached by the running sum get the last basis state.
+      for (; m < num_samples; ++m) {
+        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
+      }
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    auto f1 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
+      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
+      // Zero-masked loads: lanes where (index & mask) != bits read as zero.
+      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
+      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
+      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX512(s1);
+    };
+    // Pass 1: sum |amplitude|^2 over the amplitudes consistent with mr.bits.
+    using Op = std::plus<double>;
+    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 32, f1,
+                                       Op(), mr.mask, mr.bits, state.get());
+    // NOTE(review): a zero norm yields an infinite scale factor here.
+    __m512 renorm = _mm512_set1_ps(1.0 / std::sqrt(norm));
+
+    auto f2 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, __m512 renorm, fp_type* p) {
+      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
+      // Non-matching lanes load as zero, so the stores below clear them.
+      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
+      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
+
+      re = _mm512_mul_ps(re, renorm);
+      im = _mm512_mul_ps(im, renorm);
+
+      _mm512_store_ps(p + 32 * i, re);
+      _mm512_store_ps(p + 32 * i + 16, im);
+    };
+    // Pass 2: rescale the surviving amplitudes and zero all the others.
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f2,
+                   mr.mask, mr.bits, renorm, state.get());
+  }
+
+  std::vector<double> PartialNorms(const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p) -> double {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX512(s1);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduceP(
+        MinSize(state.num_qubits()) / 32, f, Op(), state.get());
+  }
+
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    // Walks the block range [first, last) that for_ reports for m and returns
+    // the masked index of the first amplitude whose running sum exceeds r.
+    const uint64_t num_blocks = MinSize(state.num_qubits()) / 32;
+    const uint64_t first = Base::for_.GetIndex0(num_blocks, m);
+    const uint64_t last = Base::for_.GetIndex1(num_blocks, m);
+    const fp_type* data = state.get();
+    double running = 0;
+
+    for (uint64_t blk = first; blk < last; ++blk) {
+      const fp_type* re = data + 32 * blk;  // 16 real parts, then 16 imaginary
+      for (uint64_t lane = 0; lane < 16; ++lane) {
+        const fp_type x = re[lane];
+        const fp_type y = re[lane + 16];
+        running += x * x + y * y;
+        if (r < running) return (16 * blk + lane) & mask;
+      }
+    }
+
+    // Fall back to the last index of the range if r was never exceeded.
+    return (16 * last - 1) & mask;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_AVX512_H_
diff --git a/qsim/statespace_basic.h b/qsim/statespace_basic.h
new file mode 100644
index 0000000..6468483
--- /dev/null
+++ b/qsim/statespace_basic.h
@@ -0,0 +1,300 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_BASIC_H_
+#define STATESPACE_BASIC_H_
+
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <functional>
+
+#include "statespace.h"
+#include "util.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+/**
+ * Object containing context and routines for unoptimized state-vector
+ * manipulations. State is a non-vectorized sequence of one real amplitude
+ * followed by one imaginary amplitude.
+ */
+template <typename For, typename FP>
+class StateSpaceBasic :
+    public StateSpace<StateSpaceBasic<For, FP>, VectorSpace, For, FP> {
+ private:
+  using Base = StateSpace<StateSpaceBasic<For, FP>, qsim::VectorSpace, For, FP>;
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>  // Arguments are handed to Base as lvalues.
+  explicit StateSpaceBasic(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return 2 * (uint64_t{1} << num_qubits);  // One (re, im) pair per amplitude.
+  };
+
+  void InternalToNormalOrder(State& state) const {}  // No-op for this layout.
+
+  void NormalToInternalOrder(State& state) const {}  // No-op for this layout.
+
+  // Sets every amplitude of the state to 0 + 0i.
+  void SetAllZeros(State& state) const {
+    auto clear = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
+      fp_type* ampl = p + 2 * i;
+      ampl[0] = ampl[1] = 0;
+    };
+    Base::for_.Run(MinSize(state.num_qubits()) / 2, clear, state.get());
+  }
+
+  // Uniform superposition: every amplitude becomes 1 / sqrt(2^num_qubits)
+  // with a zero imaginary part.
+  void SetStateUniform(State& state) const {
+    const uint64_t dim = uint64_t{1} << state.num_qubits();
+    fp_type ampl = fp_type{1} / std::sqrt(dim);
+    auto fill = [](unsigned n, unsigned m, uint64_t i,
+                   fp_type ampl, fp_type* p) {
+      p[2 * i] = ampl;
+      p[2 * i + 1] = 0;
+    };
+    Base::for_.Run(MinSize(state.num_qubits()) / 2, fill, ampl, state.get());
+  }
+
+  // |0> state: clear everything, then set the real part of amplitude 0 to 1.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    *state.get() = 1;
+  }
+
+  // Reads amplitude i from the interleaved (re, im) storage.
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    return std::complex<fp_type>(state.get()[2 * i], state.get()[2 * i + 1]);
+  }
+
+  // Stores the complex value ampl as amplitude i.
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    // Performs the same two stores as the (re, im) overload.
+    SetAmpl(state, i, std::real(ampl), std::imag(ampl));
+  }
+
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    auto* slot = state.get() + 2 * i;  // Real part first, then imaginary part.
+    slot[0] = re;
+    slot[1] = im;
+  }
+
+  // Sets state[i] = val for every i with (i & mask) == bits; when `exclude`
+  // is true the condition is inverted to (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, val.real(), val.imag(), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) for every i with (i & mask) == bits; when
+  // `exclude` is true the condition is inverted to (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    auto assign = [](unsigned n, unsigned m, uint64_t i, uint64_t sel_mask,
+                     uint64_t sel_bits, fp_type new_re, fp_type new_im,
+                     bool invert, fp_type* p) {
+      // Comparing the match with `invert` by != is the same as XOR-ing them.
+      const bool selected = ((i & sel_mask) == sel_bits) != invert;
+      if (selected) {
+        p[2 * i] = new_re;
+        p[2 * i + 1] = new_im;
+      }
+    };
+    Base::for_.Run(MinSize(state.num_qubits()) / 2, assign, mask, bits, re, im,
+                   exclude, state.get());
+  }
+
+  // Adds src to dest amplitude by amplitude (dest += src). Returns false and
+  // leaves dest untouched when the two states differ in qubit count.
+  bool Add(const State& src, State& dest) const {
+    const bool same_size = src.num_qubits() == dest.num_qubits();
+    if (same_size) {
+      auto accumulate = [](unsigned n, unsigned m, uint64_t i,
+                           const fp_type* from, fp_type* to) {
+        const uint64_t re = 2 * i, im = re + 1;
+        to[re] += from[re];
+        to[im] += from[im];
+      };
+      Base::for_.Run(
+          MinSize(src.num_qubits()) / 2, accumulate, src.get(), dest.get());
+    }
+    return same_size;
+  }
+
+  // Scales every amplitude of state by the real factor a (state *= a).
+  void Multiply(fp_type a, State& state) const {
+    auto mul = [](unsigned n, unsigned m, uint64_t i, fp_type k, fp_type* p) {
+      fp_type* ampl = p + 2 * i;
+      ampl[0] *= k;
+      ampl[1] *= k;
+    };
+    Base::for_.Run(MinSize(state.num_qubits()) / 2, mul, a, state.get());
+  }
+
+  // Computes sum_i conj(state1[i]) * state2[i]; NaN if qubit counts differ.
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto term = [](unsigned n, unsigned m, uint64_t i, const fp_type* p1,
+                   const fp_type* p2) -> std::complex<double> {
+      const fp_type a_re = p1[2 * i], a_im = p1[2 * i + 1];
+      const fp_type b_re = p2[2 * i], b_im = p2[2 * i + 1];
+      // conj(a) * b; the products are formed in fp_type before widening.
+      const double re = a_re * b_re + a_im * b_im;
+      const double im = a_re * b_im - a_im * b_re;
+      return std::complex<double>(re, im);
+    };
+    using Sum = std::plus<std::complex<double>>;
+    return Base::for_.RunReduce(
+        MinSize(state1.num_qubits()) / 2, term, Sum(), state1.get(),
+        state2.get());
+  }
+
+  // Real part of the inner product: sum_i (re1 * re2 + im1 * im2). Returns
+  // NaN if the qubit counts differ.
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto term = [](unsigned n, unsigned m, uint64_t i,
+                   const fp_type* p1, const fp_type* p2) -> double {
+      const uint64_t re = 2 * i, im = re + 1;
+      return p1[re] * p2[re] + p1[im] * p2[im];
+    };
+
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 2, term,
+                                std::plus<double>(), state1.get(),
+                                state2.get());
+  }
+
+  // Draws num_samples bitstrings from the |amplitude|^2 distribution of state.
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+
+    if (num_samples > 0) {
+      const fp_type* p = state.get();
+      uint64_t size = MinSize(state.num_qubits()) / 2;
+      double norm = 0;
+      for (uint64_t k = 0; k < size; ++k) {
+        double re = p[2 * k];
+        double im = p[2 * k + 1];
+        norm += re * re + im * im;
+      }
+
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+      uint64_t m = 0;
+      double csum = 0;
+      bitstrings.reserve(num_samples);
+
+      for (uint64_t k = 0; k < size; ++k) {
+        double re = p[2 * k];
+        double im = p[2 * k + 1];
+        csum += re * re + im * im;
+        // Test m first so that rs[m] is never read past the end of rs.
+        while (m < num_samples && rs[m] < csum) {
+          bitstrings.emplace_back(k);
+          ++m;
+        }
+      }
+
+      // Samples whose random value csum never exceeded get the last bitstring.
+      for (; m < num_samples; ++m) {
+        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
+      }
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  // Zeroes amplitudes with (i & mr.mask) != mr.bits; renormalizes the rest.
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    const uint64_t size = MinSize(state.num_qubits()) / 2;
+    // Pass 1: squared norm of the amplitudes that match the measured bits.
+    auto weight = [](unsigned n, unsigned m, uint64_t i, uint64_t mask,
+                     uint64_t bits, const fp_type* p) -> double {
+      if ((i & mask) != bits) return 0;
+      return p[2 * i] * p[2 * i] + p[2 * i + 1] * p[2 * i + 1];
+    };
+    double norm = Base::for_.RunReduce(
+        size, weight, std::plus<double>(), mr.mask, mr.bits, state.get());
+    double renorm = 1.0 / std::sqrt(norm);
+    // Pass 2: rescale the matching amplitudes and zero out all the others.
+    auto project = [](unsigned n, unsigned m, uint64_t i, uint64_t mask,
+                      uint64_t bits, fp_type renorm, fp_type* p) {
+      fp_type* ampl = p + 2 * i;
+      if ((i & mask) == bits) {
+        ampl[0] *= renorm;
+        ampl[1] *= renorm;
+      } else {
+        ampl[0] = ampl[1] = 0;
+      }
+    };
+    Base::for_.Run(size, project, mr.mask, mr.bits, renorm, state.get());
+  }
+
+  // Returns the RunReduceP result over the values |amplitude|^2 of state.
+  std::vector<double> PartialNorms(const State& state) const {
+    auto sq_abs = [](unsigned n, unsigned m, uint64_t i,
+                     const fp_type* p) -> double {
+      const fp_type re = p[2 * i];
+      const fp_type im = p[2 * i + 1];
+      return re * re + im * im;
+    };
+    return Base::for_.RunReduceP(MinSize(state.num_qubits()) / 2, sq_abs,
+                                 std::plus<double>(), state.get());
+  }
+
+  // Accumulates |amplitude|^2 over the index range [GetIndex0, GetIndex1) of
+  // work item m and returns the first masked index whose running sum exceeds r.
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    const uint64_t size = MinSize(state.num_qubits()) / 2;
+    const uint64_t first = Base::for_.GetIndex0(size, m);
+    const uint64_t last = Base::for_.GetIndex1(size, m);
+
+    const fp_type* ampl = state.get() + 2 * first;
+    double running = 0;
+
+    for (uint64_t k = first; k < last; ++k, ampl += 2) {
+      running += ampl[0] * ampl[0] + ampl[1] * ampl[1];
+      if (r < running) {
+        return k & mask;
+      }
+    }
+
+    // Return the last bitstring in the unlikely case of underflow.
+    return (last - 1) & mask;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_BASIC_H_
diff --git a/qsim/statespace_cuda.h b/qsim/statespace_cuda.h
new file mode 100644
index 0000000..660db07
--- /dev/null
+++ b/qsim/statespace_cuda.h
@@ -0,0 +1,470 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_CUDA_H_
+#define STATESPACE_CUDA_H_
+
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
+
+#include <algorithm>
+#include <complex>
+#include <cstdint>
+
+#include "statespace.h"
+#include "statespace_cuda_kernels.h"
+#include "vectorspace_cuda.h"
+#include "util_cuda.h"
+
+namespace qsim {
+
+/**
+ * Object containing context and routines for CUDA state-vector manipulations.
+ * State is a vectorized sequence of 32 real components followed by 32
+ * imaginary components. 32 floating numbers can be processed in parallel by
+ * a single warp. It is not recommended to use `GetAmpl` and `SetAmpl`.
+ */
+template <typename FP = float>
+class StateSpaceCUDA :
+    public StateSpace<StateSpaceCUDA<FP>, VectorSpaceCUDA, FP> {
+ private:
+  using Base = StateSpace<StateSpaceCUDA<FP>, qsim::VectorSpaceCUDA, FP>;
+
+ protected:
+  struct Grid {
+    unsigned threads;  // Threads per block used for the kernel launch.
+    unsigned dblocks;  // Data blocks handled by each thread.
+    unsigned blocks;  // Number of thread blocks in the launch.
+  };
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  struct Parameter {
+    /**
+     * The number of threads per block.
+     * Should be 2 to the power of k, where k is in the range [5,10].
+     */
+    unsigned num_threads = 512;
+    /**
+     * The number of data blocks. Each thread processes num_dblocks data
+     * blocks in reductions (norms, inner products, etc).
+     */
+    unsigned num_dblocks = 16;
+  };
+
+  explicit StateSpaceCUDA(const Parameter& param)
+      : param_(param), scratch_(nullptr), scratch_size_(0) {}
+
+  virtual ~StateSpaceCUDA() {
+    if (scratch_ != nullptr) {
+      ErrorCheck(cudaFree(scratch_));
+    }
+  }
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits));
+  };
+
+  void InternalToNormalOrder(State& state) const {
+    uint64_t size = MinSize(state.num_qubits()) / 2;
+
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+    unsigned bytes = 2 * threads * sizeof(fp_type);
+
+    InternalToNormalOrderKernel<<<blocks, threads, bytes>>>(state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  void NormalToInternalOrder(State& state) const {
+    uint64_t size = MinSize(state.num_qubits()) / 2;
+
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+    unsigned bytes = 2 * threads * sizeof(fp_type);
+
+    NormalToInternalOrderKernel<<<blocks, threads, bytes>>>(state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  void SetAllZeros(State& state) const {
+    ErrorCheck(cudaMemset(state.get(), 0,
+               MinSize(state.num_qubits()) * sizeof(fp_type)));
+  }
+
+  // Uniform superposition.
+  void SetStateUniform(State& state) const {
+    uint64_t size = MinSize(state.num_qubits()) / 2;
+    uint64_t hsize = uint64_t{1} << state.num_qubits();
+
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+
+    fp_type v = double{1} / std::sqrt(hsize);
+
+    SetStateUniformKernel<<<blocks, threads>>>(v, hsize, state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  // |0> state.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    fp_type one[1] = {1};
+    ErrorCheck(
+        cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice));
+  }
+
+  // It is not recommended to use this function (two device-to-host copies).
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    const auto src = state.get() + 64 * (i / 32) + i % 32;
+    fp_type ampl[2];  // {re, im}; im is stored 32 values after re.
+    ErrorCheck(cudaMemcpy(ampl, src, sizeof(fp_type), cudaMemcpyDeviceToHost));
+    ErrorCheck(cudaMemcpy(
+        ampl + 1, src + 32, sizeof(fp_type), cudaMemcpyDeviceToHost));
+    return std::complex<fp_type>(ampl[0], ampl[1]);
+  }
+
+  // It is not recommended to use this function.
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    fp_type re = std::real(ampl);
+    fp_type im = std::imag(ampl);
+    auto p = state.get() + 64 * (i / 32) + i % 32;
+    ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice));
+    ErrorCheck(
+        cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice));
+  }
+
+  // It is not recommended to use this function.
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    auto p = state.get() + 64 * (i / 32) + i % 32;
+    ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice));
+    ErrorCheck(
+        cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice));
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    uint64_t size = MinSize(state.num_qubits()) / 2;
+
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+
+    BulkSetAmplKernel<<<blocks, threads>>>(
+        mask, bits, re, im, exclude, state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  // Adds src to dest value by value (dest += src). Returns false without
+  // launching a kernel when the states differ in qubit count.
+  bool Add(const State& src, State& dest) const {
+    const bool compatible = src.num_qubits() == dest.num_qubits();
+    if (compatible) {
+      // One thread per stored value; real and imaginary parts alike.
+      uint64_t size = MinSize(src.num_qubits());
+      unsigned threads = std::min(size, uint64_t{param_.num_threads});
+      unsigned blocks = size / threads;
+
+      AddKernel<<<blocks, threads>>>(src.get(), dest.get());
+      ErrorCheck(cudaPeekAtLastError());
+      ErrorCheck(cudaDeviceSynchronize());
+    }
+
+    return compatible;
+  }
+
+  // Does the equivalent of state *= a elementwise.
+  void Multiply(fp_type a, State& state) const {
+    uint64_t size = MinSize(state.num_qubits());
+
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+
+    MultiplyKernel<<<blocks, threads>>>(a, state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    using C = Complex<double>;
+    auto r = Reduce<C, C, Product<fp_type>>(state1, state2);
+
+    return {r.re, r.im};
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    return Reduce<double, double, RealProduct<fp_type>>(state1, state2);
+  }
+
+  double Norm(const State& state) const {
+    return Reduce<double, double, RealProduct<fp_type>>(state, state);
+  }
+
+  // Draws num_samples bitstrings from the |amplitude|^2 distribution of state.
+  // Returns an empty vector if num_samples is 0.
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+    if (num_samples > 0) {
+      Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 2);
+      unsigned bytes = g1.threads * sizeof(double);
+
+      // Scratch layout: total norm | per-block norms | bitstrings | random
+      // values. Sized in 64 bits so that large num_samples cannot wrap.
+      uint64_t scratch_size = (g1.blocks + 1) * sizeof(double)
+          + num_samples * (sizeof(uint64_t) + sizeof(DistrRealType));
+      void* scratch = AllocScratch(scratch_size);
+
+      double* d_res2 = (double*) scratch;
+      double* d_res1 = d_res2 + 1;
+      uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks);
+      DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples);
+
+      auto op1 = RealProduct<fp_type>();
+      auto op2 = Plus<double>();
+      Reduce1Kernel<double><<<g1.blocks, g1.threads, bytes>>>(
+          g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1);
+      ErrorCheck(cudaPeekAtLastError());
+      ErrorCheck(cudaDeviceSynchronize());
+
+      double norm;
+      if (g1.blocks == 1) {
+        ErrorCheck(
+            cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost));
+      } else {
+        Grid g2 = GetGrid2(g1.blocks);
+        unsigned bytes = g2.threads * sizeof(double);
+        auto op3 = Plus<double>();
+        Reduce2Kernel<double><<<g2.blocks, g2.threads, bytes>>>(
+            g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2);
+        ErrorCheck(cudaPeekAtLastError());
+        ErrorCheck(cudaDeviceSynchronize());
+        ErrorCheck(
+            cudaMemcpy(&norm, d_res2, sizeof(double), cudaMemcpyDeviceToHost));
+      }
+
+      // TODO: generate random values on the device.
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+      ErrorCheck(cudaMemcpy(d_rs, rs.data(),
+                            num_samples * sizeof(DistrRealType),
+                            cudaMemcpyHostToDevice));
+
+      // Pre-fill with the last bitstring: SampleKernel leaves entries unwritten
+      // for random values that the cumulative sum never exceeds.
+      bitstrings.assign(num_samples, (uint64_t{1} << state.num_qubits()) - 1);
+      ErrorCheck(cudaMemcpy(d_bitstrings, bitstrings.data(),
+                            num_samples * sizeof(uint64_t),
+                            cudaMemcpyHostToDevice));
+      SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples,
+                                      d_rs, d_res1, state.get(), d_bitstrings);
+      ErrorCheck(cudaPeekAtLastError());
+      ErrorCheck(cudaDeviceSynchronize());
+
+      ErrorCheck(cudaMemcpy(bitstrings.data(), d_bitstrings,
+                            num_samples * sizeof(uint64_t),
+                            cudaMemcpyDeviceToHost));
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  // Rescales amplitudes with (i & mr.mask) == mr.bits by 1 / sqrt(r), r being
+  // the masked reduction of state with itself; all other amplitudes become 0.
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    double norm2 = Reduce<double, double, RealProduct<fp_type>>(
+        mr.mask, mr.bits, state, state);
+    fp_type renorm = 1 / std::sqrt(norm2);
+    uint64_t num_ampls = MinSize(state.num_qubits()) / 2;
+    unsigned threads = std::min(num_ampls, uint64_t{param_.num_threads});
+    unsigned blocks = num_ampls / threads;
+
+    CollapseKernel<<<blocks, threads>>>(mr.mask, mr.bits, renorm, state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  // Returns one value per thread block of the GetGrid1 launch: the block's
+  // Reduce1Kernel result for the state paired with itself.
+  std::vector<double> PartialNorms(const State& state) const {
+    Grid grid = GetGrid1(MinSize(state.num_qubits()) / 2);
+
+    // Dynamic shared memory: one double per thread.
+    unsigned shared_bytes = grid.threads * sizeof(double);
+    unsigned result_bytes = grid.blocks * sizeof(double);
+    double* d_norms = (double*) AllocScratch(result_bytes);
+
+    auto product = RealProduct<fp_type>();
+    auto sum = Plus<double>();
+    Reduce1Kernel<double><<<grid.blocks, grid.threads, shared_bytes>>>(
+        grid.dblocks, product, sum, sum, state.get(), state.get(), d_norms);
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+
+    std::vector<double> norms(grid.blocks);
+    ErrorCheck(cudaMemcpy(
+        norms.data(), d_norms, result_bytes, cudaMemcpyDeviceToHost));
+
+    return norms;
+  }
+
+  // Runs FindMeasuredBitsKernel on block m of the GetGrid1 partition with
+  // threshold r and returns the index it reports, restricted to mask.
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    Grid grid = GetGrid1(MinSize(state.num_qubits()) / 2);
+    uint64_t* d_index = (uint64_t*) AllocScratch(sizeof(uint64_t));
+
+    FindMeasuredBitsKernel<<<1, grid.threads>>>(
+        m, grid.dblocks, r, state.get(), d_index);
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+
+    uint64_t index;
+    ErrorCheck(cudaMemcpy(
+        &index, d_index, sizeof(uint64_t), cudaMemcpyDeviceToHost));
+    return index & mask;
+  }
+
+ protected:
+  Parameter param_;
+
+  void* AllocScratch(uint64_t size) const {  // Grow-only device scratch.
+    if (size > scratch_size_) {  // Otherwise the current buffer is reused.
+      if (scratch_ != nullptr) {
+        ErrorCheck(cudaFree(scratch_));  // Old contents are not carried over.
+      }
+
+      ErrorCheck(cudaMalloc(const_cast<void**>(&scratch_), size));
+
+      const_cast<uint64_t&>(scratch_size_) = size;  // NOTE(review): mutable?
+    }
+
+    return scratch_;
+  }
+
+  // Splits `size` items into blocks of `threads` threads with `dblocks` items
+  // per thread, capped by param_.num_threads and param_.num_dblocks.
+  Grid GetGrid1(uint64_t size) const {
+    const unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    const unsigned dblocks =
+        std::min(size / threads, uint64_t{param_.num_dblocks});
+    const unsigned blocks = size / (threads * dblocks);
+    return Grid{threads, dblocks, blocks};
+  }
+
+  // Grid for reducing `size` partial results: max(32, size) threads, capped at
+  // param_.num_threads, each covering at least one value.
+  Grid GetGrid2(unsigned size) const {
+    const unsigned threads = std::min(param_.num_threads, std::max(32U, size));
+    const unsigned dblocks = std::max(1U, size / threads);
+
+    // Everything is reduced by a single thread block.
+    return Grid{threads, dblocks, 1U};
+  }
+
+  template <typename FP1, typename FP2, typename Op>
+  FP2 Reduce(const State& state1, const State& state2) const {
+    return Reduce<FP1, FP2, Op>(0, 0, state1, state2);
+  }
+
+  template <typename FP1, typename FP2, typename Op>
+  FP2 Reduce(uint64_t mask, uint64_t bits,
+             const State& state1, const State& state2) const {
+    uint64_t size = MinSize(state1.num_qubits()) / 2;
+
+    Grid g1 = GetGrid1(size);
+    unsigned bytes = g1.threads * sizeof(FP1);
+
+    FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2));
+    FP2* d_res1 = d_res2 + 1;
+
+    auto op1 = Op();
+    auto op2 = Plus<FP1>();
+    auto op3 = Plus<typename Scalar<FP1>::type>();
+
+    if (mask == 0) {
+      Reduce1Kernel<FP1><<<g1.blocks, g1.threads, bytes>>>(
+          g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1);
+    } else {
+      Reduce1MaskedKernel<FP1><<<g1.blocks, g1.threads, bytes>>>(
+          g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(),
+          d_res1);
+    }
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+
+    FP2 result;
+
+    if (g1.blocks == 1) {
+      ErrorCheck(
+          cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost));
+    } else {
+      Grid g2 = GetGrid2(g1.blocks);
+      unsigned bytes = g2.threads * sizeof(FP2);
+
+      auto op2 = Plus<FP2>();
+      auto op3 = Plus<typename Scalar<FP2>::type>();
+
+      Reduce2Kernel<FP2><<<g2.blocks, g2.threads, bytes>>>(
+          g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2);
+      ErrorCheck(cudaPeekAtLastError());
+      ErrorCheck(cudaDeviceSynchronize());
+
+      ErrorCheck(
+          cudaMemcpy(&result, d_res2, sizeof(FP2), cudaMemcpyDeviceToHost));
+    }
+
+    return result;
+  }
+
+ private:
+  void* scratch_;
+  uint64_t scratch_size_;
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_CUDA_H_
diff --git a/qsim/statespace_cuda_kernels.h b/qsim/statespace_cuda_kernels.h
new file mode 100644
index 0000000..b54ebca
--- /dev/null
+++ b/qsim/statespace_cuda_kernels.h
@@ -0,0 +1,355 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_CUDA_KERNELS_H_
+#define STATESPACE_CUDA_KERNELS_H_
+
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
+
+#include "util_cuda.h"
+
+namespace qsim {
+
+namespace detail {
+
+template <typename FP1, typename FP2,
+          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
+__device__ __forceinline__ FP1 BlockReduce1(
+    uint64_t n, Op1 op1, Op2 op2, Op3 op3, const FP2* s1, const FP2* s2) {
+  extern __shared__ float shared[];
+  FP1* partial1 = (FP1*) shared;
+
+  unsigned tid = threadIdx.x;
+  unsigned warp = threadIdx.x / warp_size;
+  unsigned lane = threadIdx.x % warp_size;
+
+  uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane;
+  uint64_t k1 = k0 + 2 * n * blockDim.x;
+
+  FP1 r;
+
+  r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]);
+  while ((k0 += 2 * blockDim.x) < k1) {
+    r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]));
+  }
+
+  partial1[tid] = r;
+
+  __shared__ FP1 partial2[warp_size];
+
+  if (tid < warp_size) {
+    partial2[tid] = 0;
+  }
+
+  __syncthreads();
+
+  FP1 val = WarpReduce(partial1[tid], op3);
+
+  if (lane == 0) {
+    partial2[warp] = val;
+  }
+
+  __syncthreads();
+
+  FP1 result = 0;
+
+  if (tid < warp_size) {
+    result = WarpReduce(partial2[tid], op3);
+  }
+
+  return result;
+}
+
+template <typename FP1, typename FP2,
+          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
+__device__ __forceinline__ FP1 BlockReduce1Masked(
+    uint64_t n, uint64_t mask, uint64_t bits, Op1 op1, Op2 op2, Op3 op3,
+    const FP2* s1, const FP2* s2) {
+  extern __shared__ float shared[];
+  FP1* partial1 = (FP1*) shared;
+
+  unsigned tid = threadIdx.x;
+  unsigned warp = threadIdx.x / warp_size;
+  unsigned lane = threadIdx.x % warp_size;
+
+  uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane;
+  uint64_t k1 = k0 + 2 * n * blockDim.x;
+
+  FP1 r = 0;
+
+  if (((k0 + lane) / 2 & mask) == bits) {
+    r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]);
+  }
+  while ((k0 += 2 * blockDim.x) < k1) {
+    if (((k0 + lane) / 2 & mask) == bits) {
+      r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]));
+    }
+  }
+
+  partial1[tid] = r;
+
+  __shared__ FP1 partial2[warp_size];
+
+  if (tid < warp_size) {
+    partial2[tid] = 0;
+  }
+
+  __syncthreads();
+
+  FP1 val = WarpReduce(partial1[tid], op3);
+
+  if (lane == 0) {
+    partial2[warp] = val;
+  }
+
+  __syncthreads();
+
+  FP1 result = 0;
+
+  if (tid < warp_size) {
+    result = WarpReduce(partial2[tid], op3);
+  }
+
+  return result;
+}
+
+template <typename FP1, typename FP2,
+          typename Op2, typename Op3, unsigned warp_size = 32>
+__device__ __forceinline__ FP1 BlockReduce2(
+    uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s) {
+  extern __shared__ float shared[];
+  FP1* partial1 = (FP1*) shared;
+
+  unsigned tid = threadIdx.x;
+  uint64_t k0 = n * blockIdx.x * blockDim.x + tid;
+  uint64_t k1 = k0 + n * blockDim.x;
+
+  FP1 r = 0;
+
+  if (tid < size) {
+    r = s[k0];
+    while ((k0 += blockDim.x) < k1) {
+      r = op2(r, s[k0]);
+    }
+  }
+
+  partial1[tid] = r;
+
+  __shared__ FP1 partial2[warp_size];
+
+  if (tid < warp_size) {
+    partial2[tid] = 0;
+  }
+
+  __syncthreads();
+
+  FP1 val = WarpReduce(partial1[tid], op3);
+
+  if (threadIdx.x % warp_size == 0) {
+    partial2[threadIdx.x / warp_size] = val;
+  }
+
+  __syncthreads();
+
+  FP1 result = 0;
+
+  if (tid < warp_size) {
+    result = WarpReduce(partial2[tid], op3);
+  }
+
+  return result;
+}
+
+}  // namespace detail
+
+template <typename FP1, typename FP2, typename FP3,
+          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
+__global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3,
+                              const FP2* s1, const FP2* s2, FP3* result) {
+  FP1 sum = detail::BlockReduce1<FP1>(n, op1, op2, op3, s1, s2);
+
+  if (threadIdx.x == 0) {
+    result[blockIdx.x] = sum;
+  }
+}
+
+template <typename FP1, typename FP2, typename FP3,
+          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
+__global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits,
+                                    Op1 op1, Op2 op2, Op3 op3,
+                                    const FP2* s1, const FP2* s2, FP3* result) {
+  FP1 sum =
+      detail::BlockReduce1Masked<FP1>(n, mask, bits, op1, op2, op3, s1, s2);
+
+  if (threadIdx.x == 0) {
+    result[blockIdx.x] = sum;
+  }
+}
+
+template <typename FP1, typename FP2, typename FP3,
+          typename Op2, typename Op3, unsigned warp_size = 32>
+__global__ void Reduce2Kernel(
+    uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s, FP3* result) {
+  FP1 sum = detail::BlockReduce2<FP1>(n, size, op2, op3, s);
+
+  if (threadIdx.x == 0) {
+    result[blockIdx.x] = sum;
+  }
+}
+
+template <typename FP, unsigned warp_size = 32>
+__global__ void InternalToNormalOrderKernel(FP* state) {
+  unsigned lane = threadIdx.x % warp_size;
+  unsigned l = 2 * threadIdx.x - lane;
+  uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l;
+
+  extern __shared__ float shared[];
+  FP* buf = (FP*) shared;
+
+  buf[l] = state[k];
+  buf[l + warp_size] = state[k + warp_size];
+
+  __syncthreads();
+
+  state[k + lane] = buf[l];
+  state[k + lane + 1] = buf[l + warp_size];
+}
+
+template <typename FP, unsigned warp_size = 32>
+__global__ void NormalToInternalOrderKernel(FP* state) {
+  unsigned lane = threadIdx.x % warp_size;
+  unsigned l = 2 * threadIdx.x - lane;
+  uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l;
+
+  extern __shared__ float shared[];
+  FP* buf = (FP*) shared;
+
+  buf[l] = state[k];
+  buf[l + warp_size] = state[k + warp_size];
+
+  __syncthreads();
+
+  state[k] = buf[l + lane];
+  state[k + warp_size] = buf[l + lane + 1];
+}
+
+// Real parts become v (0 where lane >= size); imaginary parts become 0.
+template <typename FP, unsigned warp_size = 32>
+__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) {
+  const unsigned lane = threadIdx.x % warp_size;
+  uint64_t re = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane;
+  state[re + warp_size] = 0;
+  state[re] = lane < size ? v : 0;
+}
+
+template <typename FP, unsigned warp_size = 32>  // state2[k] += state1[k].
+__global__ void AddKernel(const FP* state1, FP* state2) {
+  const uint64_t offset = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+  state2[offset] = state2[offset] + state1[offset];
+}
+
+template <typename FP, unsigned warp_size = 32>  // state[k] *= a, per thread.
+__global__ void MultiplyKernel(FP a, FP* state) {
+  FP* value = state + (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x);
+  *value = *value * a;
+}
+
+// Scales amplitude k1 by r when (k1 & mask) == bits and zeroes it otherwise.
+template <typename FP, unsigned warp_size = 32>
+__global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) {
+  const uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+  const bool keep = (k1 & mask) == bits;
+
+  // Real part first; the imaginary part is stored warp_size values later.
+  FP* re = state + (2 * k1 - threadIdx.x % warp_size);
+  FP* im = re + warp_size;
+
+  *re = keep ? *re * r : 0;
+  *im = keep ? *im * r : 0;
+}
+
+// Sets amplitude k1 to (re, im) if ((k1 & mask) == bits) != exclude.
+template <typename FP, unsigned warp_size = 32>
+__global__ void BulkSetAmplKernel(
+    uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) {
+  const uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+
+  // For two bools, != yields the same result as XOR.
+  if (((k1 & mask) == bits) != exclude) {
+    FP* ampl = state + (2 * k1 - threadIdx.x % warp_size);
+    ampl[0] = re;
+    ampl[warp_size] = im;
+  }
+}
+
+// Only thread 0 works (this can be slow): bitstrings[m] is the first index
+// whose running |amplitude|^2 sum exceeds rs[m] (rs presumably sorted).
+template <typename FP1, typename FP2, typename FP3, unsigned warp_size = 32>
+__global__ void SampleKernel(unsigned num_blocks,
+                             uint64_t n, uint64_t num_samples,
+                             const FP1* rs, const FP2* ps, const FP3* state,
+                             uint64_t *bitstrings) {
+  if (threadIdx.x == 0) {
+    uint64_t m = 0;
+    double csum = 0;
+    for (unsigned block_id = 0; block_id < num_blocks; ++block_id) {
+      uint64_t km = n * blockDim.x;
+      uint64_t k0 = block_id * km;
+      for (uint64_t k = 0; k < km; ++k) {
+        uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32;
+        FP3 re = state[l];
+        FP3 im = state[l + warp_size];
+        csum += re * re + im * im;
+        // Test m first so that rs[m] is never read out of bounds.
+        while (m < num_samples && rs[m] < csum) {
+          bitstrings[m++] = k0 + k;
+        }
+      }
+    }
+  }
+}
+
+template <typename FP, unsigned warp_size = 32>
+__global__ void FindMeasuredBitsKernel(
+    uint64_t block_id, uint64_t n, double r, const FP* state, uint64_t* res) {
+  // Use just one thread. This can be somewhat slow, however, this is
+  // more or less consistent with CPU implementations.
+  if (threadIdx.x != 0) return;
+  const uint64_t count = n * blockDim.x;
+  const uint64_t base = block_id * count;
+  uint64_t found = base + count - 1;  // Reported if r is never exceeded.
+  double running = 0;
+
+  for (uint64_t k = 0; k < count; ++k) {
+    // Real part at offset k % 32 of a 64-value group, imaginary warp_size on.
+    const FP* ampl = state + 2 * base + 64 * (k / 32) + k % 32;
+    const FP re = ampl[0];
+    const FP im = ampl[warp_size];
+    running += re * re + im * im;
+    if (r < running) {
+      found = base + k;
+      break;
+    }
+  }
+  *res = found;
+}
+
+}  // namespace qsim
+
+#endif  // STATESPACE_CUDA_KERNELS_H_
diff --git a/qsim/statespace_custatevec.h b/qsim/statespace_custatevec.h
new file mode 100644
index 0000000..f2f5de1
--- /dev/null
+++ b/qsim/statespace_custatevec.h
@@ -0,0 +1,376 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_CUSTATEVEC_H_
+#define STATESPACE_CUSTATEVEC_H_
+
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <cublas_v2.h>
+#include <cuComplex.h>
+#include <custatevec.h>
+
+#include "statespace.h"
+#include "util_custatevec.h"
+#include "vectorspace_cuda.h"
+
+namespace qsim {
+
+namespace detail {
+
+template <typename FP>
+__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) {
+  // One thread per amplitude; real and imaginary parts are interleaved.
+  const uint64_t idx = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+  if (idx >= size) return;
+
+  state[2 * idx] = v;
+  state[2 * idx + 1] = 0;
+}
+
+}  // namespace detail
+
+/**
+ * Object containing context and routines for cuStateVec state-vector
+ * manipulations. It is not recommended to use `GetAmpl` and `SetAmpl`.
+ */
+template <typename FP = float>
+class StateSpaceCuStateVec :
+    public StateSpace<StateSpaceCuStateVec<FP>, VectorSpaceCUDA, FP> {
+ private:
+  using Base = StateSpace<StateSpaceCuStateVec<FP>, qsim::VectorSpaceCUDA, FP>;
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  static constexpr auto is_float = std::is_same<fp_type, float>::value;
+
+  static constexpr auto kStateType = is_float ? CUDA_C_32F : CUDA_C_64F;
+  static constexpr auto kMatrixType = kStateType;
+  static constexpr auto kExpectType = CUDA_C_64F;
+  static constexpr auto kComputeType =
+      is_float ? CUSTATEVEC_COMPUTE_32F : CUSTATEVEC_COMPUTE_64F;
+  static constexpr auto kMatrixLayout = CUSTATEVEC_MATRIX_LAYOUT_ROW;
+
+  explicit StateSpaceCuStateVec(const cublasHandle_t& cublas_handle,
+                                const custatevecHandle_t& custatevec_handle)
+      : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle),
+        workspace_(nullptr), workspace_size_(0) {}  // Workspace starts empty.
+
+  virtual ~StateSpaceCuStateVec() {
+    // Release the scratch buffer if one was ever allocated.
+    if (workspace_ == nullptr) return;
+
+    ErrorCheck(cudaFree(workspace_));
+  }
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return uint64_t{2} << num_qubits;  // Two reals per amplitude.
+  }
+
+  void InternalToNormalOrder(State& state) const {
+  }  // Intentionally a no-op in this state space.
+
+  void NormalToInternalOrder(State& state) const {
+  }  // Intentionally a no-op in this state space.
+
+  void SetAllZeros(State& state) const {
+    const size_t bytes = MinSize(state.num_qubits()) * sizeof(fp_type);
+    ErrorCheck(cudaMemset(state.get(), 0, bytes));
+  }
+
+  // Uniform superposition: every amplitude becomes 1/sqrt(2^num_qubits) + 0i.
+  void SetStateUniform(State& state) const {
+    const uint64_t dim = uint64_t{1} << state.num_qubits();
+
+    // At most 256 threads per block; one thread per amplitude.
+    const unsigned block = dim < 256 ? dim : 256;
+    const unsigned grid = dim / block;
+    const fp_type ampl = double{1} / std::sqrt(dim);
+
+    detail::SetStateUniformKernel<<<grid, block>>>(ampl, dim, state.get());
+    ErrorCheck(cudaPeekAtLastError());
+  }
+
+  // |0> state: clears everything, then writes 1 into the first real slot.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    const fp_type unit = 1;
+    ErrorCheck(cudaMemcpy(
+        state.get(), &unit, sizeof(unit), cudaMemcpyHostToDevice));
+  }
+
+  // Not recommended: each call performs a device-to-host copy.
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    fp_type host[2];
+    const auto device = state.get() + 2 * i;
+    ErrorCheck(
+        cudaMemcpy(host, device, sizeof(host), cudaMemcpyDeviceToHost));
+    return {host[0], host[1]};
+  }
+
+  // Not recommended: each call performs a host-to-device copy.
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    const fp_type host[2] = {ampl.real(), ampl.imag()};
+    ErrorCheck(cudaMemcpy(state.get() + 2 * i, host, sizeof(host),
+                          cudaMemcpyHostToDevice));
+  }
+
+  // Not recommended: each call performs a host-to-device copy.
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    const fp_type host[2] = {re, im};
+    ErrorCheck(cudaMemcpy(state.get() + 2 * i, host, sizeof(host),
+                          cudaMemcpyHostToDevice));
+  }
+
+  // Intended to set state[i] = val where (i & mask) == bits, or where
+  // (i & mask) != bits if `exclude` is true.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    // Not implemented: this overload currently leaves `state` unchanged.
+  }
+
+  // Intended to set state[i] = complex(re, im) where (i & mask) == bits, or
+  // where (i & mask) != bits if `exclude` is true.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    // Not implemented: this overload currently leaves `state` unchanged.
+  }
+
+  // Computes dest += src elementwise. Returns false, leaving dest untouched,
+  // if the two states have different numbers of qubits.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) return false;
+
+    const uint64_t dim = uint64_t{1} << src.num_qubits();
+
+    // axpy with alpha = 1 + 0i computes dest = 1 * src + dest.
+    if (is_float) {
+      const cuComplex alpha = {1.0, 0.0};
+      auto x = reinterpret_cast<const cuComplex*>(src.get());
+      auto y = reinterpret_cast<cuComplex*>(dest.get());
+      ErrorCheck(cublasCaxpy(cublas_handle_, dim, &alpha, x, 1, y, 1));
+    } else {
+      const cuDoubleComplex alpha = {1.0, 0.0};
+      auto x = reinterpret_cast<const cuDoubleComplex*>(src.get());
+      auto y = reinterpret_cast<cuDoubleComplex*>(dest.get());
+      ErrorCheck(cublasZaxpy(cublas_handle_, dim, &alpha, x, 1, y, 1));
+    }
+
+    return true;
+  }
+
+  // Scales every amplitude of state by the real factor a (state *= a).
+  void Multiply(fp_type a, State& state) const {
+    const uint64_t dim = uint64_t{1} << state.num_qubits();
+
+    if (is_float) {
+      const float factor = a;
+      auto x = reinterpret_cast<cuComplex*>(state.get());
+      ErrorCheck(cublasCsscal(cublas_handle_, dim, &factor, x, 1));
+    } else {
+      const double factor = a;
+      auto x = reinterpret_cast<cuDoubleComplex*>(state.get());
+      ErrorCheck(cublasZdscal(cublas_handle_, dim, &factor, x, 1));
+    }
+  }
+
+  // Returns the cuBLAS dotc product of state1 and state2, or NaN if the two
+  // states have different numbers of qubits.
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) return std::nan("");
+
+    const uint64_t dim = uint64_t{1} << state1.num_qubits();
+
+    if (is_float) {
+      cuComplex dot;
+      auto x = reinterpret_cast<const cuComplex*>(state1.get());
+      auto y = reinterpret_cast<const cuComplex*>(state2.get());
+      ErrorCheck(cublasCdotc(cublas_handle_, dim, x, 1, y, 1, &dot));
+      return {cuCrealf(dot), cuCimagf(dot)};
+    } else {
+      cuDoubleComplex dot;
+      auto x = reinterpret_cast<const cuDoubleComplex*>(state1.get());
+      auto y = reinterpret_cast<const cuDoubleComplex*>(state2.get());
+      ErrorCheck(cublasZdotc(cublas_handle_, dim, x, 1, y, 1, &dot));
+      return {cuCreal(dot), cuCimag(dot)};
+    }
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    return InnerProduct(state1, state2).real();  // Imaginary part dropped.
+  }
+
+  double Norm(const State& state) const {
+    const uint64_t dim = uint64_t{1} << state.num_qubits();
+    // nrm2 yields the Euclidean norm; this method returns its square.
+    if (is_float) {
+      float nrm2;
+      auto x = reinterpret_cast<const cuComplex*>(state.get());
+      ErrorCheck(cublasScnrm2(cublas_handle_, dim, x, 1, &nrm2));
+      return nrm2 * nrm2;
+    } else {
+      double nrm2;
+      auto x = reinterpret_cast<const cuDoubleComplex*>(state.get());
+      ErrorCheck(cublasDznrm2(cublas_handle_, dim, x, 1, &nrm2));
+      return nrm2 * nrm2;
+    }
+  }
+
+  // Draws num_samples bitstrings with the cuStateVec sampler. DistrRealType
+  // is not used here: the random values are always generated as doubles.
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+
+    if (num_samples > 0) {
+      auto rs = GenerateRandomValues<double>(num_samples, seed, 1.0);
+      size_t workspace_size;
+      custatevecSamplerDescriptor_t sampler;
+
+      ErrorCheck(custatevecSamplerCreate(
+                     custatevec_handle_, state.get(), kStateType,
+                     state.num_qubits(), &sampler, num_samples,
+                     &workspace_size));
+
+      AllocWorkSpace(workspace_size);
+
+      ErrorCheck(custatevecSamplerPreprocess(
+                     custatevec_handle_, sampler, workspace_, workspace_size));
+
+      std::vector<custatevecIndex_t> bitstrings0(num_samples);
+      std::vector<int32_t> bitordering;
+      bitordering.reserve(state.num_qubits());
+      for (unsigned i = 0; i < state.num_qubits(); ++i) {
+        bitordering.push_back(i);
+      }
+
+      ErrorCheck(custatevecSamplerSample(
+                     custatevec_handle_, sampler, bitstrings0.data(),
+                     bitordering.data(), state.num_qubits(), rs.data(),
+                     num_samples, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER));
+
+      // The descriptor is no longer needed; free it so it does not leak.
+      ErrorCheck(custatevecSamplerDestroy(sampler));
+
+      bitstrings.assign(bitstrings0.begin(), bitstrings0.end());
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  // Measures `qubits` via cuStateVec; the state collapses unless no_collapse.
+  template <typename RGen>
+  MeasurementResult Measure(const std::vector<unsigned>& qubits,
+                            RGen& rgen, State& state,
+                            bool no_collapse = false) const {
+    auto r = RandomValue(rgen, 1.0);
+
+    MeasurementResult result;
+    result.valid = true;
+    result.mask = 0;
+    result.bits = 0;
+    result.bitstring.resize(qubits.size(), 0);
+
+    for (auto q : qubits) {
+      if (q >= state.num_qubits()) {
+        result.valid = false;
+        return result;
+      }
+      result.mask |= uint64_t{1} << q;
+    }
+
+    auto collapse = no_collapse ?
+        CUSTATEVEC_COLLAPSE_NONE : CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO;
+
+    ErrorCheck(custatevecBatchMeasure(
+                   custatevec_handle_, state.get(), kStateType,
+                   state.num_qubits(), (int*) result.bitstring.data(),
+                   (int*) qubits.data(), qubits.size(), r, collapse));
+
+    for (std::size_t i = 0; i < result.bitstring.size(); ++i) {
+      // Widen before shifting: a 32-bit shift breaks for qubit index >= 32.
+      result.bits |= static_cast<uint64_t>(result.bitstring[i]) << qubits[i];
+    }
+
+    return result;
+  }
+
+  template <typename RGen>
+  MeasurementResult VirtualMeasure(const std::vector<unsigned>& qubits,
+                                   RGen& rgen, const State& state) const {
+    return Measure(qubits, rgen, const_cast<State&>(state), true);
+  }  // The `true` argument is Measure()'s no_collapse flag.
+
+  // Collapses state to the outcome in mr (qubits flagged in mr.mask, values
+  // taken from mr.bits), then rescales it to unit norm.
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    const unsigned num_qubits = state.num_qubits();
+
+    std::vector<int> values;
+    std::vector<int> positions;
+    values.reserve(num_qubits);
+    positions.reserve(num_qubits);
+
+    for (unsigned q = 0; q < num_qubits; ++q) {
+      if (((mr.mask >> q) & 1) == 0) continue;
+
+      values.push_back((mr.bits >> q) & 1);
+      positions.push_back(q);
+    }
+
+    const unsigned count = positions.size();
+    ErrorCheck(custatevecCollapseByBitString(
+                   custatevec_handle_, state.get(), kStateType,
+                   num_qubits, values.data(), positions.data(), count, 1.0));
+
+    // TODO: do we need the following?
+    const double norm = Norm(state);
+    Multiply(1.0 / std::sqrt(norm), state);
+  }
+
+ private:
+  // Grows the cached device scratch buffer to at least `size` bytes (it never
+  // shrinks) and returns it. Old contents are not preserved on regrowth.
+  void* AllocWorkSpace(size_t size) const {
+    if (size > workspace_size_) {
+      if (workspace_ != nullptr) {
+        ErrorCheck(cudaFree(workspace_));
+      }
+      ErrorCheck(cudaMalloc(&workspace_, size));
+      workspace_size_ = size;
+    }
+
+    return workspace_;
+  }
+
+  const cublasHandle_t cublas_handle_;
+  const custatevecHandle_t custatevec_handle_;
+  // Mutable: the workspace is a cache that const methods may regrow.
+  mutable void* workspace_;
+  mutable size_t workspace_size_;
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_CUSTATEVEC_H_
diff --git a/qsim/statespace_sse.h b/qsim/statespace_sse.h
new file mode 100644
index 0000000..cf41a09
--- /dev/null
+++ b/qsim/statespace_sse.h
@@ -0,0 +1,462 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_SSE_H_
+#define STATESPACE_SSE_H_
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <functional>
+
+#include "statespace.h"
+#include "util.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace detail {
+
+inline __m128i GetZeroMaskSSE(uint64_t i, uint64_t mask, uint64_t bits) {
+  __m128i s1 = _mm_set_epi64x(i + 2, i + 0);  // 64-bit lanes: {i, i + 2}.
+  __m128i s2 = _mm_set_epi64x(i + 3, i + 1);  // 64-bit lanes: {i + 1, i + 3}.
+  __m128i ma = _mm_set1_epi64x(mask);
+  __m128i bi = _mm_set1_epi64x(bits);
+
+  s1 = _mm_and_si128(s1, ma);
+  s2 = _mm_and_si128(s2, ma);
+  // Each 64-bit lane becomes all ones where (index & mask) == bits, else 0.
+  s1 = _mm_cmpeq_epi64(s1, bi);
+  s2 = _mm_cmpeq_epi64(s2, bi);
+  // Interleave 32-bit pieces so that 32-bit lane j covers index i + j.
+  return _mm_blend_epi16(s1, s2, 204);  // 11001100
+}
+
+inline double HorizontalSumSSE(__m128 s) {
+  __m128 ss = _mm_movehdup_ps(s);  // {s[1], s[1], s[3], s[3]}
+  __m128 s1 = _mm_add_ps(s, ss);  // Lane 0: s[0] + s[1]; lane 2: s[2] + s[3].
+  // Add lane 2 to lane 0; the sum is formed in float, then widened to double.
+  return _mm_cvtss_f32(_mm_add_ss(s1, _mm_movehl_ps(ss, s1)));
+}
+
+}  // namespace detail
+
+/**
+ * Object containing context and routines for SSE state-vector manipulations.
+ * State is a vectorized sequence of four real components followed by four
+ * imaginary components. Four single-precision floating-point numbers can be
+ * loaded into an SSE register.
+ */
+template <typename For>
+class StateSpaceSSE :
+    public StateSpace<StateSpaceSSE<For>, VectorSpace, For, float> {
+ private:
+  using Base = StateSpace<StateSpaceSSE<For>, qsim::VectorSpace, For, float>;
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit StateSpaceSSE(ForArgs&&... args) : Base(args...) {}  // As lvalues.
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return std::max(uint64_t{2} << num_qubits, uint64_t{8});  // >= 8 floats.
+  }
+
+  void InternalToNormalOrder(State& state) const {
+    if (state.num_qubits() == 1) {
+      auto s = state.get();
+      // Two amplitudes: re0 re1 _ _ im0 im1 _ _  ->  re0 im0 re1 im1 0 0 0 0.
+      s[2] = s[1];
+      s[1] = s[4];
+      s[3] = s[5];
+
+      for (uint64_t i = 4; i < 8; ++i) {
+        s[i] = 0;
+      }
+    } else {
+      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
+        auto s = p + 8 * i;
+        // re0 re1 re2 re3 im0 im1 im2 im3 -> re0 im0 re1 im1 re2 im2 re3 im3.
+        fp_type re[3];
+        fp_type im[3];
+
+        for (uint64_t i = 0; i < 3; ++i) {
+          re[i] = s[i + 1];
+          im[i] = s[i + 4];
+        }
+        // s[0] (re0) and s[7] (im3) are already in place.
+        for (uint64_t i = 0; i < 3; ++i) {
+          s[2 * i + 1] = im[i];
+          s[2 * i + 2] = re[i];
+        }
+      };
+
+      Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get());
+    }
+  }
+
+  void NormalToInternalOrder(State& state) const {
+    if (state.num_qubits() == 1) {
+      auto s = state.get();
+      // Two amplitudes: re0 im0 re1 im1 _ _ _ _  ->  re0 re1 0 0 im0 im1 0 0.
+      s[4] = s[1];
+      s[1] = s[2];
+      s[5] = s[3];
+
+      s[2] = 0;
+      s[3] = 0;
+      s[6] = 0;
+      s[7] = 0;
+    } else {
+      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
+        auto s = p + 8 * i;
+        // re0 im0 re1 im1 re2 im2 re3 im3 -> re0 re1 re2 re3 im0 im1 im2 im3.
+        fp_type re[3];
+        fp_type im[3];
+
+        for (uint64_t i = 0; i < 3; ++i) {
+          im[i] = s[2 * i + 1];
+          re[i] = s[2 * i + 2];
+        }
+        // s[0] (re0) and s[7] (im3) are already in place.
+        for (uint64_t i = 0; i < 3; ++i) {
+          s[i + 1] = re[i];
+          s[i + 4] = im[i];
+        }
+      };
+
+      Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get());
+    }
+  }
+
+  void SetAllZeros(State& state) const {
+    __m128 zeros = _mm_setzero_ps();
+
+    // Each work item clears one group of 8 floats with two stores.
+    auto clear = [](unsigned n, unsigned m, uint64_t i, __m128 z, fp_type* p) {
+      _mm_store_ps(p + 8 * i + 4, z);
+      _mm_store_ps(p + 8 * i, z);
+    };
+    Base::for_.Run(MinSize(state.num_qubits()) / 8, clear, zeros, state.get());
+  }
+
+  // Uniform superposition: every amplitude is set to
+  // 1/sqrt(2^num_qubits) + 0i.
+  void SetStateUniform(State& state) const {
+    const unsigned num_qubits = state.num_qubits();
+    fp_type ampl = double{1} / std::sqrt(uint64_t{1} << num_qubits);
+
+    // A one-qubit state has only two amplitudes (lanes 0 and 1); the upper
+    // two lanes of its single group must stay zero.
+    __m128 re_fill = num_qubits == 1 ? _mm_set_ps(0, 0, ampl, ampl)
+                                     : _mm_set1_ps(ampl);
+    __m128 im_fill = _mm_setzero_ps();
+
+    auto fill = [](unsigned n, unsigned m, uint64_t i,
+                   __m128 im_fill, __m128 re_fill, fp_type* p) {
+      _mm_store_ps(p + 8 * i, re_fill);
+      _mm_store_ps(p + 8 * i + 4, im_fill);
+    };
+
+    const uint64_t groups = MinSize(num_qubits) / 8;
+    Base::for_.Run(groups, fill, im_fill, re_fill, state.get());
+  }
+
+  // |0> state: clears everything, then writes 1 into the first real slot.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    *state.get() = 1;
+  }
+
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    const fp_type* lane = state.get() + 8 * (i / 4) + i % 4;  // re; im at +4.
+    return {lane[0], lane[4]};
+  }
+
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    fp_type* lane = state.get() + 8 * (i / 4) + i % 4;  // re; im at +4.
+    lane[0] = ampl.real();
+    lane[4] = ampl.imag();
+  }
+
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    fp_type* lane = state.get() + 8 * (i / 4) + i % 4;  // re; im at +4.
+    lane[0] = re;
+    lane[4] = im;
+  }
+
+  // Sets state[i] = val where (i & mask) == bits.
+  // If `exclude` is true then the criterion becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) for every i with (i & mask) == bits.
+  // If `exclude` is true, the criterion becomes (i & mask) != bits instead.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    __m128 new_re = _mm_set1_ps(re);
+    __m128 new_im = _mm_set1_ps(im);
+
+    // All ones when excluding, so that the XOR below inverts the lane mask.
+    __m128i flip = _mm_setzero_si128();
+    if (exclude) flip = _mm_cmpeq_epi32(flip, flip);
+
+    // Each work item handles one group of four amplitudes.
+    auto update = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
+                     uint64_t bitsv, __m128 new_re, __m128 new_im,
+                     __m128i flip, fp_type* p) {
+      fp_type* group = p + 8 * i;
+      __m128i hit = detail::GetZeroMaskSSE(4 * i, maskv, bitsv);
+      __m128 select = _mm_castsi128_ps(_mm_xor_si128(hit, flip));
+
+      // blendv takes the new value in lanes whose sign bit is set in select.
+      __m128 re = _mm_blendv_ps(_mm_load_ps(group), new_re, select);
+      __m128 im = _mm_blendv_ps(_mm_load_ps(group + 4), new_im, select);
+
+      _mm_store_ps(group, re);
+      _mm_store_ps(group + 4, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 8, update, mask, bits,
+                   new_re, new_im, flip, state.get());
+  }
+
+  // Computes dest += src elementwise. Returns false, leaving dest untouched,
+  // if the two states have different numbers of qubits.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) return false;
+
+    auto accumulate = [](unsigned n, unsigned m, uint64_t i,
+                         const fp_type* from, fp_type* to) {
+      const fp_type* in = from + 8 * i;
+      fp_type* out = to + 8 * i;
+
+      __m128 re = _mm_add_ps(_mm_load_ps(in), _mm_load_ps(out));
+      __m128 im = _mm_add_ps(_mm_load_ps(in + 4), _mm_load_ps(out + 4));
+
+      _mm_store_ps(out, re);
+      _mm_store_ps(out + 4, im);
+    };
+
+    const uint64_t groups = MinSize(src.num_qubits()) / 8;
+    Base::for_.Run(groups, accumulate, src.get(), dest.get());
+    return true;
+  }
+
+  // Scales every amplitude of state by the real factor a (state *= a).
+  void Multiply(fp_type a, State& state) const {
+    __m128 factor = _mm_set1_ps(a);
+
+    auto scale = [](unsigned n, unsigned m, uint64_t i,
+                    __m128 factor, fp_type* p) {
+      fp_type* group = p + 8 * i;
+      __m128 re = _mm_mul_ps(_mm_load_ps(group), factor);
+      __m128 im = _mm_mul_ps(_mm_load_ps(group + 4), factor);
+
+      _mm_store_ps(group, re);
+      _mm_store_ps(group + 4, im);
+    };
+
+    const uint64_t groups = MinSize(state.num_qubits()) / 8;
+    Base::for_.Run(groups, scale, factor, state.get());
+  }
+
+  // Returns sum_i conj(state1[i]) * state2[i], or NaN if the two states have
+  // different numbers of qubits.
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) return std::nan("");
+
+    auto partial = [](unsigned n, unsigned m, uint64_t i,
+                      const fp_type* a, const fp_type* b)
+        -> std::complex<double> {
+      __m128 a_re = _mm_load_ps(a + 8 * i);
+      __m128 a_im = _mm_load_ps(a + 8 * i + 4);
+      __m128 b_re = _mm_load_ps(b + 8 * i);
+      __m128 b_im = _mm_load_ps(b + 8 * i + 4);
+
+      // conj(a) * b = (a_re b_re + a_im b_im) + i (a_re b_im - a_im b_re).
+      __m128 re4 = _mm_add_ps(_mm_mul_ps(a_re, b_re), _mm_mul_ps(a_im, b_im));
+      __m128 im4 = _mm_sub_ps(_mm_mul_ps(a_re, b_im), _mm_mul_ps(a_im, b_re));
+
+      return {detail::HorizontalSumSSE(re4), detail::HorizontalSumSSE(im4)};
+    };
+
+    using Sum = std::plus<std::complex<double>>;
+    const uint64_t groups = MinSize(state1.num_qubits()) / 8;
+    return Base::for_.RunReduce(
+        groups, partial, Sum(), state1.get(), state2.get());
+  }
+
+  // Returns the real part of the inner product of state1 and state2, or NaN
+  // if the two states have different numbers of qubits.
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) return std::nan("");
+
+    auto partial = [](unsigned n, unsigned m, uint64_t i,
+                      const fp_type* a, const fp_type* b) -> double {
+      __m128 a_re = _mm_load_ps(a + 8 * i);
+      __m128 a_im = _mm_load_ps(a + 8 * i + 4);
+      __m128 b_re = _mm_load_ps(b + 8 * i);
+      __m128 b_im = _mm_load_ps(b + 8 * i + 4);
+      __m128 re_re = _mm_mul_ps(a_re, b_re);
+      __m128 im_im = _mm_mul_ps(a_im, b_im);
+      return detail::HorizontalSumSSE(_mm_add_ps(re_re, im_im));
+    };
+
+    using Sum = std::plus<double>;
+    const uint64_t groups = MinSize(state1.num_qubits()) / 8;
+    return Base::for_.RunReduce(
+        groups, partial, Sum(), state1.get(), state2.get());
+  }
+
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+
+    if (num_samples > 0) {
+      double norm = 0;
+      uint64_t size = MinSize(state.num_qubits()) / 8;
+      const fp_type* p = state.get();
+      // First pass: total norm, used as the upper bound of the random values.
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 4; ++j) {
+          double re = p[8 * k + j];
+          double im = p[8 * k + 4 + j];
+          norm += re * re + im * im;
+        }
+      }
+
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+
+      uint64_t m = 0;
+      double csum = 0;
+      bitstrings.reserve(num_samples);
+      // Second pass: walk the cumulative sum (rs presumably sorted ascending).
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 4; ++j) {
+          double re = p[8 * k + j];
+          double im = p[8 * k + 4 + j];
+          csum += re * re + im * im;
+          while (m < num_samples && rs[m] < csum) {  // Bound check first.
+            bitstrings.emplace_back(4 * k + j);
+            ++m;
+          }
+        }
+      }
+      // Samples not matched above get the last bitstring.
+      for (; m < num_samples; ++m) {
+        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
+      }
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    __m128 zero = _mm_set1_ps(0);
+    // Pass 1: sum |amplitude|^2 over indices with (index & mask) == bits.
+    auto f1 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask,
+                 uint64_t bits, __m128 zero, const fp_type* p) -> double {
+      __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits));
+
+      __m128 re = _mm_load_ps(p + 8 * i);
+      __m128 im = _mm_load_ps(p + 8 * i + 4);
+      __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im));
+      // Keep only the lanes selected by ml; the others contribute zero.
+      s1 = _mm_blendv_ps(zero, s1, ml);
+
+      return detail::HorizontalSumSSE(s1);
+    };
+
+    using Op = std::plus<double>;
+    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 8, f1,
+                                       Op(), mr.mask, mr.bits, zero,
+                                       state.get());
+
+    __m128 renorm = _mm_set1_ps(1.0 / std::sqrt(norm));
+    // Pass 2: rescale the selected amplitudes and zero all the others.
+    auto f2 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask,
+                 uint64_t bits, __m128 renorm, __m128 zero, fp_type* p) {
+      __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits));
+
+      __m128 re = _mm_load_ps(p + 8 * i);
+      __m128 im = _mm_load_ps(p + 8 * i + 4);
+
+      re = _mm_blendv_ps(zero, _mm_mul_ps(re, renorm), ml);
+      im = _mm_blendv_ps(zero, _mm_mul_ps(im, renorm), ml);
+
+      _mm_store_ps(p + 8 * i, re);
+      _mm_store_ps(p + 8 * i + 4, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 8, f2,
+                   mr.mask, mr.bits, renorm, zero, state.get());
+  }
+
+  // Returns the partial sums of |amplitude|^2 that For::RunReduceP produces.
+  std::vector<double> PartialNorms(const State& state) const {
+    auto norm4 = [](unsigned n, unsigned m, uint64_t i,
+                    const fp_type* p) -> double {
+      __m128 re = _mm_load_ps(p + 8 * i);
+      __m128 im = _mm_load_ps(p + 8 * i + 4);
+      __m128 re2 = _mm_mul_ps(re, re);
+      __m128 im2 = _mm_mul_ps(im, im);
+      return detail::HorizontalSumSSE(_mm_add_ps(re2, im2));
+    };
+    using Sum = std::plus<double>;
+    const uint64_t groups = MinSize(state.num_qubits()) / 8;
+    return Base::for_.RunReduceP(groups, norm4, Sum(), state.get());
+  }
+
+  // Returns the masked index where the running probability first exceeds r.
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    const uint64_t groups = MinSize(state.num_qubits()) / 8;
+    const uint64_t begin = Base::for_.GetIndex0(groups, m);
+    const uint64_t end = Base::for_.GetIndex1(groups, m);
+
+    const fp_type* amps = state.get();
+    double total = 0;
+
+    // Scan amplitudes 4 * begin .. 4 * end - 1, i.e. For's range for m.
+    for (uint64_t index = 4 * begin; index < 4 * end; ++index) {
+      const uint64_t offset = 8 * (index / 4) + index % 4;
+      const fp_type re = amps[offset];
+      const fp_type im = amps[offset + 4];
+      total += re * re + im * im;
+      if (r < total) return index & mask;
+    }
+
+    // Return the last bitstring in the unlikely case of underflow.
+    return (4 * end - 1) & mask;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_SSE_H_
diff --git a/qsim/umux.h b/qsim/umux.h
new file mode 100644
index 0000000..83b951b
--- /dev/null
+++ b/qsim/umux.h
@@ -0,0 +1,52 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UMUX_H_
+#define UMUX_H_
+
+#ifdef __AVX512F__  // Checked first; each branch aliases UnitaryCalculator.
+# include "unitary_calculator_avx512.h"
+  namespace qsim {
+  namespace unitary {
+    template <typename For>
+    using UnitaryCalculator = UnitaryCalculatorAVX512<For>;
+  }
+  }
+#elif __AVX2__  // Used only if __AVX512F__ is not defined.
+# include "unitary_calculator_avx.h"
+  namespace qsim {
+  namespace unitary {
+    template <typename For>
+    using UnitaryCalculator = UnitaryCalculatorAVX<For>;
+  }
+  }
+#elif __SSE4_1__  // Used only if neither AVX branch above was taken.
+# include "unitary_calculator_sse.h"
+  namespace qsim {
+  namespace unitary {
+    template <typename For>
+    using UnitaryCalculator = UnitaryCalculatorSSE<For>;
+  }
+  }
+#else  // None of the macros above is set: basic implementation.
+# include "unitary_calculator_basic.h"
+  namespace qsim {
+  namespace unitary {
+    template <typename For>
+    using UnitaryCalculator = UnitaryCalculatorBasic<For>;
+  }
+  }
+#endif  // __AVX512F__ / __AVX2__ / __SSE4_1__
+
+#endif  // UMUX_H_
diff --git a/qsim/unitary_calculator_avx.h b/qsim/unitary_calculator_avx.h
new file mode 100644
index 0000000..5e566ca
--- /dev/null
+++ b/qsim/unitary_calculator_avx.h
@@ -0,0 +1,1028 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARY_CALCULATOR_AVX_H_
+#define UNITARY_CALCULATOR_AVX_H_
+
+#include <immintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "unitaryspace_avx.h"
+
+namespace qsim {
+namespace unitary {
+
+/**
+ * Quantum circuit unitary calculator with AVX vectorization.
+ */
+template <typename For>
+class UnitaryCalculatorAVX final : public SimulatorBase {
+ public:
+  using UnitarySpace = UnitarySpaceAVX<For>;
+  using Unitary = typename UnitarySpace::Unitary;
+  using fp_type = typename UnitarySpace::fp_type;
+
+  using StateSpace = UnitarySpace;
+  using State = Unitary;
+  // Constructs the calculator; all arguments are passed on to the For member.
+  template <typename... ForArgs>
+  explicit UnitaryCalculatorAVX(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate on 1 to 6 qubits using AVX instructions.
+   * @param qs Indices of the qubits affected by this gate, in ascending order.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The unitary to be updated in place by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Qubits 0..2 are "low"; ApplyGateL<H, L> takes H high and L low qubits.
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 2) {  // qs is sorted, so every qubit is high.
+        ApplyGateH<1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 2) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 2) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 2) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<1, 3>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 2) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<2, 3>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 2) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<3, 3>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: other gate sizes leave the state unchanged.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate on 1 to 4 target qubits using AVX instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
+
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);  // No controls: plain gate application.
+      return;
+    }
+    // cqs[0] > 2 means that all control qubits are high (cqs is sorted).
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: other target counts leave the state unchanged.
+      break;
+    }
+  }
+
+  /**
+   * @return The number of floats held by one 256-bit AVX register.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 8;
+  }
+
+ private:
+
+#ifdef __BMI2__
+  // Applies a gate whose H qubits are all high (index > 2); BMI2 variant.
+  template <unsigned H>
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // s selects the row of the unitary, r the work item within that row.
+      uint64_t r = i % size;
+      uint64_t s = i / size;
+
+      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
+      // Load the 2^H inputs: 8 real floats at p0 + p, 8 imaginary ones after.
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);
+        is[k] = _mm256_load_ps(p0 + p + 8);
+      }
+      // Complex matrix-vector product; v is row-major, interleaved (re, im).
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    auto m = GetMasks1<H, 3>(qs);
+    // size work items per row (8 * 2^H complex entries each), size2 rows.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             matrix, m.imaskh, m.qmaskh, size, raw_size, state.get());
+  }
+  // Applies a gate on H high and L low (index < 3) qubits; BMI2 variant.
+  template <unsigned H, unsigned L>
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // s selects the row of the unitary, r the work item within that row.
+      uint64_t r = i % size;
+      uint64_t s = i / size;
+
+      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
+      // Low qubits sit inside a register: lane permutations give those inputs.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm256_load_ps(p0 + p);
+        is[k2] = _mm256_load_ps(p0 + p + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // w[j] / w[j + 1] hold per-lane real / imaginary matrix coefficients.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks2<H, L, 3>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+    // size work items per row (8 * 2^H complex entries each), size2 rows.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get());
+  }
+  // Controlled gate, H high target qubits, all controls high; BMI2 variant.
+  template <unsigned H>
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // s selects the row of the unitary, r the work item within that row.
+      uint64_t r = i % size;
+      uint64_t s = i / size;
+      // cvalsh (from GetMasks3) is OR-ed into the offset within the row.
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);
+        is[k] = _mm256_load_ps(p0 + p + 8);
+      }
+      // Complex matrix-vector product; v is row-major, interleaved (re, im).
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    auto m = GetMasks3<H, 3>(state.num_qubits(), qs, cqs, cvals);
+    // Each control qubit removes one bit from the per-row iteration space.
+    unsigned k = 3 + H + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
+  }
+  // Controlled gate, H high target qubits, some control low; BMI2 variant.
+  template <unsigned H>
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+
+      __m256 rn, in;
+      __m256 rs[hsize], is[hsize];
+      // s selects the row of the unitary, r the work item within that row.
+      uint64_t r = i % size;
+      uint64_t s = i / size;
+
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);
+        is[k] = _mm256_load_ps(p0 + p + 8);
+      }
+      // w[j] / w[j + 1] hold per-lane real / imaginary matrix coefficients.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    __m256 w[1 << (1 + 2 * H)];
+    // Presumably the low controls are folded into w (m.cl of them) - confirm.
+    auto m = GetMasks4<H, 3>(state.num_qubits(), qs, cqs, cvals);
+    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 3 + H + cqs.size() - m.cl;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
+  }
+  // Controlled gate, H high + L low targets; CH: all controls high; BMI2.
+  template <unsigned H, unsigned L, bool CH>
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                const __m256i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // s selects the row of the unitary, r the work item within that row.
+      uint64_t r = i % size;
+      uint64_t s = i / size;
+
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+      // Low qubits sit inside a register: lane permutations give those inputs.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm256_load_ps(p0 + p);
+        is[k2] = _mm256_load_ps(p0 + p + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // w[j] / w[j + 1] hold per-lane real / imaginary matrix coefficients.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // CH picks the mask and matrix setup; the kernel f is shared by both cases.
+    if (CH) {
+      auto m = GetMasks5<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned k = 3 + H + cqs.size();
+      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
+               m.cvalsh, idx, size, raw_size, state.get());
+    } else {
+      auto m = GetMasks6<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 3>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned k = 3 + H + cqs.size() - m.cl;
+      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
+               m.cvalsh, idx, size, raw_size, state.get());
+    }
+  }
+
+#else  // __BMI2__
+  // Applies a gate whose H qubits are all high (index > 2); non-BMI2 variant.
+  template <unsigned H>
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // s selects the row; r starts as 8 * (work item index within the row).
+      uint64_t r = 8 * (i % size);
+      uint64_t s = i / size;
+      // Build t from shifted copies of r masked by ms (see FillIndices).
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      auto p0 = rstate + row_size * s + 2 * t;
+      // Load the 2^H inputs: 8 real floats at p0 + xss[k], 8 imaginary after.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
+      }
+      // Complex matrix-vector product; v is row-major, interleaved (re, im).
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // size work items per row (8 * 2^H complex entries each), size2 rows.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
+  }
+  // Applies a gate on H high and L low (index < 3) qubits; non-BMI2 variant.
+  template <unsigned H, unsigned L>
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // s selects the row; r starts as 8 * (work item index within the row).
+      uint64_t r = 8 * (i % size);
+      uint64_t s = i / size;
+      // Build t from shifted copies of r masked by ms (see FillIndices).
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      auto p0 = rstate + row_size * s + 2 * t;
+      // Low qubits sit inside a register: lane permutations give those inputs.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        rs[k2] = _mm256_load_ps(p0 + xss[k]);
+        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // w[j] / w[j + 1] hold per-lane real / imaginary matrix coefficients.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks11<L>(qs);
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+    // size work items per row (8 * 2^H complex entries each), size2 rows.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, state.get());
+  }
+  // Controlled gate, H high target qubits, all controls high; non-BMI2.
+  template <unsigned H>
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // s selects the row; r starts as 8 * (work item index within the row).
+      uint64_t r = 8 * (i % size);
+      uint64_t s = i / size;
+      // Build t from shifted copies of r masked by ms (see FillIndices).
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+      // Skip work items whose control bits (cmaskh) differ from cvalsh.
+      if ((t & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + row_size * s + 2 * t;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
+      }
+      // Complex matrix-vector product; v is row-major, interleaved (re, im).
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // All work items are visited; the kernel filters on the control bits.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
+  }
+  // Controlled gate, H high target qubits, some control low; non-BMI2.
+  template <unsigned H>
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+
+      __m256 rn, in;
+      __m256 rs[hsize], is[hsize];
+      // s selects the row; r starts as 8 * (work item index within the row).
+      uint64_t r = 8 * (i % size);
+      uint64_t s = i / size;
+      // Build t from shifted copies of r masked by ms (see FillIndices).
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+      // Skip work items whose control bits (cmaskh) differ from cvalsh.
+      if ((t & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + row_size * s + 2 * t;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
+      }
+      // w[j] / w[j + 1] hold per-lane real / imaginary matrix coefficients.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256 w[1 << (1 + 2 * H)];
+
+    auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+    // All work items are visited; the kernel filters on the control bits.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
+  }
+  // Controlled gate, H high + L low targets; CH: all controls high; non-BMI2.
+  template <unsigned H, unsigned L, bool CH>
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, const __m256i* idx, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // s selects the row; r starts as 8 * (work item index within the row).
+      uint64_t r = 8 * (i % size);
+      uint64_t s = i / size;
+      // Build t from shifted copies of r masked by ms (see FillIndices).
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+      // Skip work items whose control bits (cmaskh) differ from cvalsh.
+      if ((t & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + row_size * s + 2 * t;
+      // Low qubits sit inside a register: lane permutations give those inputs.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm256_load_ps(p0 + xss[k]);
+        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // w[j] / w[j + 1] hold per-lane real / imaginary matrix coefficients.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // CH picks the mask and matrix setup; the kernel f is shared by both cases.
+    if (CH) {
+      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size * size2, f, w, ms, xss, m.cvalsh,
+               m.cmaskh, idx, size, raw_size, state.get());
+    } else {
+      auto m = GetMasks10<L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 3>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size * size2, f, w, ms, xss, m.cvalsh,
+               m.cmaskh, idx, size, raw_size, state.get());
+    }
+  }
+
+#endif  // __BMI2__
+  // Fills idx[0 .. 2^L - 2] with the lane permutations used by the L kernels.
+  template <unsigned L>
+  static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) {
+    constexpr unsigned lsize = 1 << L;
+    const unsigned keep = ~qmaskl;  // Lane-index bits outside qmaskl.
+    // Entry n - 1 is built from MaskedAdd with offset n, for n = 1 .. lsize - 1.
+    for (unsigned n = 1; n < lsize; ++n) {
+      unsigned lane[8];
+      for (unsigned src = 0; src < 8; ++src) {
+        lane[src] = MaskedAdd<3>(src, n, qmaskl, lsize) | (src & keep);
+      }
+      idx[n - 1] = _mm256_setr_epi32(lane[0], lane[1], lane[2], lane[3],
+                                     lane[4], lane[5], lane[6], lane[7]);
+    }
+  }
+
+  For for_;
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARY_CALCULATOR_AVX_H_
diff --git a/qsim/unitary_calculator_avx512.h b/qsim/unitary_calculator_avx512.h
new file mode 100644
index 0000000..8105367
--- /dev/null
+++ b/qsim/unitary_calculator_avx512.h
@@ -0,0 +1,644 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARY_CALCULATOR_AVX512_H_
+#define UNITARY_CALCULATOR_AVX512_H_
+
+#include <immintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "unitaryspace_avx512.h"
+
+namespace qsim {
+namespace unitary {
+
+/**
+ * Quantum circuit unitary calculator with AVX512 vectorization.
+ */
+template <typename For>
+class UnitaryCalculatorAVX512 final : public SimulatorBase {
+ public:
+  using UnitarySpace = UnitarySpaceAVX512<For>;
+  using Unitary = typename UnitarySpace::Unitary;
+  using fp_type = typename UnitarySpace::fp_type;
+
+  using StateSpace = UnitarySpace;
+  using State = Unitary;
+
+  template <typename... ForArgs>  // ForArgs: arguments used to construct for_
+  explicit UnitaryCalculatorAVX512(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate on 1 to 6 qubits using AVX512; other sizes are ignored.
+   * @param qs Indices of the qubits affected by this gate, in ascending order.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The unitary to be updated in place by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    switch (qs.size()) {  // <H, L>: H qubits with index > 3, L with index <= 3
+    case 1:
+      if (qs[0] > 3) {
+        ApplyGateH<1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<1, 3>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 4>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 3) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<2, 3>(qs, matrix, state);
+      } else {  // qs[0..3] <= 3; with ascending qs only qs[4] can be > 3
+        ApplyGateL<1, 4>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 3) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<3, 3>(qs, matrix, state);
+      } else {  // qs[0..3] <= 3; with ascending qs the other two are > 3
+        ApplyGateL<2, 4>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: the gate is silently skipped, state is unchanged.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate on 1 to 4 target qubits using AVX512.
+   * @param qs Indices of the qubits affected by this gate, ascending.
+   * @param cqs Indices of control qubits, ascending; if empty, ApplyGate runs.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The unitary to be updated in place by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
+
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    switch (qs.size()) {  // <H, L, CH>: H, L as in ApplyGate; CH = cqs[0] > 3
+    case 1:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[3] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: with controls, other gate sizes are silently skipped.
+      break;
+    }
+  }
+
+  /**
+   * @return The SIMD register size in floats: 16 for AVX512 (512 bits).
+   */
+  static unsigned SIMDRegisterSize() {
+    return 16;
+  }
+
+ private:
+  template <unsigned H>  // H: number of gate qubits; callers pass all qs > 3
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // gate matrix dimension
+
+      __m512 ru, iu, rn, in;
+      __m512 rs[hsize], is[hsize];
+
+      uint64_t r = i % size;  // block index within a row
+      uint64_t s = i / size;  // row index (rows are row_size floats apart)
+
+      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);  // offset of gate-basis state k
+
+        rs[k] = _mm512_load_ps(p0 + p);  // 16 real parts
+        is[k] = _mm512_load_ps(p0 + p + 16);  // the matching 16 imaginary parts
+      }
+
+      uint64_t j = 0;  // walks v as interleaved (real, imaginary) pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {  // output k = matrix row k * inputs
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // accumulate remaining columns
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[l], ru, rn);
+          in = _mm512_fmadd_ps(rs[l], iu, in);
+          rn = _mm512_fnmadd_ps(is[l], iu, rn);
+          in = _mm512_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    auto m = GetMasks1<H, 4>(qs);
+
+    unsigned k = 4 + H;  // index bits not enumerated by r in the kernel
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // kernel invocations per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             matrix, m.imaskh, m.qmaskh, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // H gate qubits with index > 3, L with <= 3
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // gate matrix dimension
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+
+      __m512 rn, in;
+      __m512 rs[gsize], is[gsize];
+
+      uint64_t r = i % size;  // block index within a row
+      uint64_t s = i / size;  // row index (rows are row_size floats apart)
+
+      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm512_load_ps(p0 + p);  // 16 real parts
+        is[k2] = _mm512_load_ps(p0 + p + 16);  // the matching 16 imaginary parts
+
+        for (unsigned l = 1; l < lsize; ++l) {  // lane-permuted copies via idx
+          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
+          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
+        }
+      }
+
+      uint64_t j = 0;  // walks w as (real, imaginary) register pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {  // one stored register pair per k
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512i idx[1 << L];
+    __m512 w[1 << (1 + 2 * H + L)];  // 2 * hsize * gsize registers, read by f
+
+    auto m = GetMasks2<H, L, 4>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 4 + H;  // index bits not enumerated by r in the kernel
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // kernel invocations per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // H gate qubits; callers pass qs[0] > 3 and cqs[0] > 3
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // gate matrix dimension
+
+      __m512 ru, iu, rn, in;
+      __m512 rs[hsize], is[hsize];
+
+      uint64_t r = i % size;  // block index within a row
+      uint64_t s = i / size;  // row index (rows are row_size floats apart)
+
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm512_load_ps(p0 + p);  // 16 real parts
+        is[k] = _mm512_load_ps(p0 + p + 16);  // the matching 16 imaginary parts
+      }
+
+      uint64_t j = 0;  // walks v as interleaved (real, imaginary) pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {  // output k = matrix row k * inputs
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);
+        in = _mm512_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[l], ru, rn);
+          in = _mm512_fmadd_ps(rs[l], iu, in);
+          rn = _mm512_fnmadd_ps(is[l], iu, rn);
+          in = _mm512_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    auto m = GetMasks3<H, 4>(state.num_qubits(), qs, cqs, cvals);
+
+    unsigned k = 4 + H + cqs.size();  // control bits are not enumerated either
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // kernel invocations per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // H gate qubits; callers pass qs[0] > 3, cqs[0] <= 3
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // gate matrix dimension
+
+      __m512 rn, in;
+      __m512 rs[hsize], is[hsize];
+
+      uint64_t r = i % size;  // block index within a row
+      uint64_t s = i / size;  // row index (rows are row_size floats apart)
+
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm512_load_ps(p0 + p);  // 16 real parts
+        is[k] = _mm512_load_ps(p0 + p + 16);  // the matching 16 imaginary parts
+      }
+
+      uint64_t j = 0;  // walks w as (real, imaginary) register pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {  // one stored register pair per k
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512 w[1 << (1 + 2 * H)];  // 2 * hsize * hsize registers, read by f
+
+    auto m = GetMasks4<H, 4>(state.num_qubits(), qs, cqs, cvals);
+    FillControlledMatrixH<H, 4>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 4 + H + cqs.size() - m.cl;  // m.cl: presumably low controls
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // kernel invocations per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // CH: callers pass cqs[0] > 3
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // gate matrix dimension
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+
+      __m512 rn, in;
+      __m512 rs[gsize], is[gsize];
+
+      uint64_t r = i % size;  // block index within a row
+      uint64_t s = i / size;  // row index (rows are row_size floats apart)
+
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm512_load_ps(p0 + p);  // 16 real parts
+        is[k2] = _mm512_load_ps(p0 + p + 16);  // the matching 16 imaginary parts
+
+        for (unsigned l = 1; l < lsize; ++l) {  // lane-permuted copies via idx
+          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
+          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
+        }
+      }
+
+      uint64_t j = 0;  // walks w as (real, imaginary) register pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {  // one stored register pair per k
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm512_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512i idx[1 << L];
+    __m512 w[1 << (1 + 2 * H + L)];  // 2 * hsize * gsize registers, read by f
+
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    if (CH) {  // plain FillMatrix; the controls enter only through m.cvalsh
+      auto m = GetMasks5<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned k = 4 + H + cqs.size();
+      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
+               m.cvalsh, idx, size, raw_size, state.get());
+    } else {  // low control values/mask are passed to FillControlledMatrixL
+      auto m = GetMasks6<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 4>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned k = 4 + H + cqs.size() - m.cl;  // m.cl: presumably low controls
+      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
+               m.cvalsh, idx, size, raw_size, state.get());
+    }
+  }
+
+  template <unsigned L>
+  static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) {
+    constexpr unsigned lsize = 1 << L;
+    const unsigned keep = ~qmaskl;  // lane-index bits not covered by qmaskl
+    // idx[shift - 1] holds, per lane, MaskedAdd of the lane index and shift.
+    for (unsigned shift = 1; shift <= lsize; ++shift) {
+      unsigned lane[16];
+      for (unsigned src = 0; src < 16; ++src) {
+        lane[src] = MaskedAdd<4>(src, shift, qmaskl, lsize) | (src & keep);
+      }
+      idx[shift - 1] = _mm512_set_epi32(
+          lane[15], lane[14], lane[13], lane[12], lane[11], lane[10], lane[9],
+          lane[8], lane[7], lane[6], lane[5], lane[4], lane[3], lane[2],
+          lane[1], lane[0]);
+    }
+  }
+
+  For for_;
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARY_CALCULATOR_AVX512_H_
diff --git a/qsim/unitary_calculator_basic.h b/qsim/unitary_calculator_basic.h
new file mode 100644
index 0000000..6b1821a
--- /dev/null
+++ b/qsim/unitary_calculator_basic.h
@@ -0,0 +1,259 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARY_CALCULATOR_BASIC_H_
+#define UNITARY_CALCULATOR_BASIC_H_
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "unitaryspace_basic.h"
+
+namespace qsim {
+namespace unitary {
+
+/**
+ * Quantum circuit unitary calculator without vectorization.
+ */
+template <typename For, typename FP = float>
+class UnitaryCalculatorBasic final : public SimulatorBase {
+ public:
+  using UnitarySpace = UnitarySpaceBasic<For, FP>;
+  using Unitary = typename UnitarySpace::Unitary;
+  using fp_type = typename UnitarySpace::fp_type;
+
+  using StateSpace = UnitarySpace;
+  using State = Unitary;
+
+  template <typename... ForArgs>  // ForArgs: arguments used to construct for_
+  explicit UnitaryCalculatorBasic(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate on 1 to 6 qubits without vectorization; others are ignored.
+   * @param qs Indices of the qubits affected by this gate, in ascending order.
+   * @param matrix Gate matrix as interleaved (real, imaginary) values.
+   * @param state The unitary to be updated in place by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    switch (qs.size()) {  // the template argument is the number of gate qubits
+    case 1:
+      ApplyGateH<1>(qs, matrix, state);
+      break;
+    case 2:
+      ApplyGateH<2>(qs, matrix, state);
+      break;
+    case 3:
+      ApplyGateH<3>(qs, matrix, state);
+      break;
+    case 4:
+      ApplyGateH<4>(qs, matrix, state);
+      break;
+    case 5:
+      ApplyGateH<5>(qs, matrix, state);
+      break;
+    case 6:
+      ApplyGateH<6>(qs, matrix, state);
+      break;
+    default:
+      // Not implemented: the gate is silently skipped, state is unchanged.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate on 1 to 4 target qubits without vectorization.
+   * @param qs Indices of the qubits affected by this gate, ascending.
+   * @param cqs Indices of control qubits; if empty, ApplyGate runs instead.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Gate matrix as interleaved (real, imaginary) values.
+   * @param state The unitary to be updated in place by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    switch (qs.size()) {  // the template argument is the number of gate qubits
+    case 1:
+      ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state);
+      break;
+    case 2:
+      ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state);
+      break;
+    case 3:
+      ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state);
+      break;
+    case 4:
+      ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state);
+      break;
+    default:
+      // Not implemented: with controls, other gate sizes are silently skipped.
+      break;
+    }
+  }
+
+  /**
+   * @return 1, since this calculator does not use SIMD registers.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 1;
+  }
+
+ private:
+  template <unsigned H>  // H: number of gate qubits (qs.size() in ApplyGate)
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // gate matrix dimension
+
+      fp_type rn, in;
+      fp_type rs[hsize], is[hsize];
+
+      uint64_t r = i % size;  // block index within a row
+      uint64_t s = i / size;  // row index (rows are row_size values apart)
+
+      uint64_t t = r & ms[0];  // t: bits of r re-spaced under the masks ms
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      auto p0 = rstate + row_size * s + 2 * t;  // 2 values (re, im) per entry
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = *(p0 + xss[k]);  // real part
+        is[k] = *(p0 + xss[k] + 1);  // imaginary part
+      }
+
+      uint64_t j = 0;  // walks v as interleaved (real, imaginary) pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {  // output k = matrix row k * inputs
+        rn = rs[0] * v[j] - is[0] * v[j + 1];
+        in = rs[0] * v[j + 1] + is[0] * v[j];
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn += rs[l] * v[j] - is[l] * v[j + 1];
+          in += rs[l] * v[j + 1] + is[l] * v[j];
+
+          j += 2;
+        }
+
+        *(p0 + xss[k]) = rn;
+        *(p0 + xss[k] + 1) = in;
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+
+    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
+    uint64_t size = uint64_t{1} << n;  // kernel invocations per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // H: number of gate (target) qubits
+  void ApplyControlledGateH(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs,
+                            uint64_t cvals, const fp_type* matrix,
+                            State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // gate matrix dimension
+
+      fp_type rn, in;
+      fp_type rs[hsize], is[hsize];
+
+      uint64_t r = i % size;  // block index within a row
+      uint64_t s = i / size;  // row index (rows are row_size values apart)
+
+      uint64_t t = r & ms[0];  // t: bits of r re-spaced under the masks ms
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      if ((t & cmaskh) == cvalsh) {  // only entries whose control bits match
+        auto p0 = rstate + row_size * s + 2 * t;  // 2 values (re, im) per entry
+
+        for (unsigned k = 0; k < hsize; ++k) {
+          rs[k] = *(p0 + xss[k]);  // real part
+          is[k] = *(p0 + xss[k] + 1);  // imaginary part
+        }
+
+        uint64_t j = 0;  // walks v as interleaved (real, imaginary) pairs
+
+        for (unsigned k = 0; k < hsize; ++k) {
+          rn = rs[0] * v[j] - is[0] * v[j + 1];
+          in = rs[0] * v[j + 1] + is[0] * v[j];
+
+          j += 2;
+
+          for (unsigned l = 1; l < hsize; ++l) {
+            rn += rs[l] * v[j] - is[l] * v[j + 1];
+            in += rs[l] * v[j + 1] + is[l] * v[j];
+
+            j += 2;
+          }
+
+          *(p0 + xss[k]) = rn;
+          *(p0 + xss[k] + 1) = in;
+        }
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+
+    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
+    uint64_t size = uint64_t{1} << n;  // controls are filtered inside f
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
+  }
+
+  For for_;
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARY_CALCULATOR_BASIC_H_
diff --git a/qsim/unitary_calculator_sse.h b/qsim/unitary_calculator_sse.h
new file mode 100644
index 0000000..a3c3f2e
--- /dev/null
+++ b/qsim/unitary_calculator_sse.h
@@ -0,0 +1,639 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARY_CALCULATOR_SSE_H_
+#define UNITARY_CALCULATOR_SSE_H_
+
+#include <smmintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "unitaryspace_sse.h"
+
+namespace qsim {
+namespace unitary {
+
+/**
+ * Quantum circuit unitary calculator with SSE vectorization.
+ */
+template <typename For>
+class UnitaryCalculatorSSE final : public SimulatorBase {
+ public:
+  using UnitarySpace = UnitarySpaceSSE<For>;
+  using Unitary = typename UnitarySpace::Unitary;
+  using fp_type = typename UnitarySpace::fp_type;
+
+  using StateSpace = UnitarySpace;
+  using State = Unitary;
+
+  template <typename... ForArgs>
+  explicit UnitaryCalculatorSSE(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate on one to six qubits to the unitary using SSE kernels.
+   * @param qs Target qubit indices, expected in strictly ascending order.
+   * @param matrix Gate matrix, handed unchanged to the selected kernel.
+   * @param state The unitary to be updated in place by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Kernel choice depends on how many targets are "low" (index 0 or 1).
+
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] <= 1) {
+        ApplyGateL<0, 1>(qs, matrix, state);
+      } else {
+        ApplyGateH<1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] <= 1 && qs[1] <= 1) {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      } else if (qs[0] <= 1) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateH<2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] <= 1 && qs[1] <= 1) {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      } else if (qs[0] <= 1) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else {
+        ApplyGateH<3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] <= 1 && qs[1] <= 1) {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      } else if (qs[0] <= 1) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else {
+        ApplyGateH<4>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] <= 1 && qs[1] <= 1) {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      } else if (qs[0] <= 1) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else {
+        ApplyGateH<5>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] <= 1 && qs[1] <= 1) {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      } else if (qs[0] <= 1) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else {
+        ApplyGateH<6>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Any other number of target qubits is unsupported; nothing is done.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate on 1-4 targets (plain ApplyGate if cqs is empty).
+   * @param qs Target qubit indices, expected in strictly ascending order.
+   * @param cqs Control qubit indices, expected in strictly ascending order.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Gate matrix, handed unchanged to the selected kernel.
+   * @param state The unitary to be updated in place by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Kernels are picked from qs[0], qs[1] and cqs[0]; "low" means <= 1.
+
+    if (cqs.empty()) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    const bool low_ctrl = cqs[0] <= 1;
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] <= 1) {
+        if (low_ctrl) {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (low_ctrl) {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] <= 1 && qs[1] <= 1) {
+        if (low_ctrl) {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[0] <= 1) {
+        if (low_ctrl) {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (low_ctrl) {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] <= 1 && qs[1] <= 1) {
+        if (low_ctrl) {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[0] <= 1) {
+        if (low_ctrl) {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (low_ctrl) {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] <= 1 && qs[1] <= 1) {
+        if (low_ctrl) {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[0] <= 1) {
+        if (low_ctrl) {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (low_ctrl) {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Any other number of target qubits is unsupported; nothing is done.
+      break;
+    }
+  }
+
+  /**
+   * @return The SIMD register size, 4: the kernels move four floats per __m128.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 4;
+  }
+
+ private:
+  template <unsigned H>  // H = qs.size(); ApplyGate uses this if qs[0] > 1.
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Registers gathered per call.
+
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];  // Real / imaginary parts of the inputs.
+
+      uint64_t r = 4 * (i % size);  // Position inside the row, in steps of 4.
+      uint64_t s = i / size;  // Row index (multiplied by row_size below).
+
+      uint64_t t = r & ms[0];  // Scatter the bits of r through the masks ms.
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      auto p0 = rstate + row_size * s + 2 * t;  // Base address for this call.
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);  // Four real parts.
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);  // The four imaginary parts.
+      }
+
+      uint64_t j = 0;  // Index into v; each matrix entry is a (re, im) pair.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // Output register k.
+        ru = _mm_set1_ps(v[j]);  // Broadcast the real part of the entry.
+        iu = _mm_set1_ps(v[j + 1]);  // Broadcast the imaginary part.
+        rn = _mm_mul_ps(rs[0], ru);
+        in = _mm_mul_ps(rs[0], iu);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));  // re: rs*ru - is*iu
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));  // im: rs*iu + is*ru
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // Add remaining columns.
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);  // Inputs were all loaded above,
+        _mm_store_ps(p0 + xss[k] + 4, in);  // so storing in place is safe.
+      }
+    };
+
+    uint64_t ms[H + 1];  // Bit masks used to build t (set by FillIndices).
+    uint64_t xss[1 << H];  // Offsets of the 2^H gathered registers.
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+
+    unsigned k = 2 + H;  // Each call covers 4 * 2^H = 2^k entries of a row.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // Kernel calls per row.
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows.
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // H targets with index > 1, L with <= 1.
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, unsigned q0,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // Matrix dimension.
+      constexpr unsigned hsize = 1 << H;  // Registers loaded per call.
+      constexpr unsigned lsize = 1 << L;  // Lane arrangements per register.
+
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];  // Real / imaginary parts of the inputs.
+
+      uint64_t r = 4 * (i % size);  // Position inside the row, in steps of 4.
+      uint64_t s = i / size;  // Row index (multiplied by row_size below).
+
+      uint64_t t = r & ms[0];  // Scatter the bits of r through the masks ms.
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      auto p0 = rstate + row_size * s + 2 * t;  // Base address for this call.
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;  // Slot of the unpermuted register.
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);  // Four real parts.
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);  // The four imaginary parts.
+
+        if (L == 1) {  // 177 selects lanes (1,0,3,2); 78 selects (2,3,0,1).
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {  // 57, 78, 147: lanes rotated by 1, 2, 3.
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w; two registers (re, im) per entry.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // Output register k.
+        rn = _mm_mul_ps(rs[0], w[j]);  // w[j]: real, w[j + 1]: imaginary.
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {  // Add remaining columns.
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);  // Inputs were all loaded above,
+        _mm_store_ps(p0 + xss[k] + 4, in);  // so storing in place is safe.
+      }
+    };
+
+    uint64_t ms[H + 1];  // Bit masks used to build t (set by FillIndices).
+    uint64_t xss[1 << H];  // Offsets of the 2^H loaded registers.
+    __m128 w[1 << (1 + 2 * H + L)];  // (re, im) x 2^H rows x 2^(H+L) columns.
+
+    auto m = GetMasks11<L>(qs);
+    // NOTE(review): FillMatrix presumably expands matrix per lane into w.
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 2 + H;  // Each call covers 4 * 2^H = 2^k entries of a row.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // Kernel calls per row.
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows.
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,  w, ms, xss, qs[0], size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // H = qs.size(); used when qs[0] > 1, cqs[0] > 1.
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Registers gathered per call.
+
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];  // Real / imaginary parts of the inputs.
+
+      uint64_t r = 4 * (i % size);  // Position inside the row, in steps of 4.
+      uint64_t s = i / size;  // Row index (multiplied by row_size below).
+
+      uint64_t t = r & ms[0];  // Scatter the bits of r through the masks ms.
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      if ((t & cmaskh) != cvalsh) return;  // Control bits of t do not match.
+
+      auto p0 = rstate + row_size * s + 2 * t;  // Base address for this call.
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);  // Four real parts.
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);  // The four imaginary parts.
+      }
+
+      uint64_t j = 0;  // Index into v; each matrix entry is a (re, im) pair.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // Output register k.
+        ru = _mm_set1_ps(v[j]);  // Broadcast the real part of the entry.
+        iu = _mm_set1_ps(v[j + 1]);  // Broadcast the imaginary part.
+        rn = _mm_mul_ps(rs[0], ru);
+        in = _mm_mul_ps(rs[0], iu);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));  // re: rs*ru - is*iu
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));  // im: rs*iu + is*ru
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // Add remaining columns.
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);  // Inputs were all loaded above,
+        _mm_store_ps(p0 + xss[k] + 4, in);  // so storing in place is safe.
+      }
+    };
+
+    uint64_t ms[H + 1];  // Bit masks used to build t (set by FillIndices).
+    uint64_t xss[1 << H];  // Offsets of the 2^H gathered registers.
+
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+
+    unsigned k = 2 + H;  // Each call covers 4 * 2^H = 2^k entries of a row.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // Kernel calls per row.
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows.
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // H = qs.size(); used when qs[0] > 1, cqs[0] <= 1.
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Registers gathered per call.
+
+      __m128 rn, in;
+      __m128 rs[hsize], is[hsize];  // Real / imaginary parts of the inputs.
+
+      uint64_t r = 4 * (i % size);  // Position inside the row, in steps of 4.
+      uint64_t s = i / size;  // Row index (multiplied by row_size below).
+
+      uint64_t t = r & ms[0];  // Scatter the bits of r through the masks ms.
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      if ((t & cmaskh) != cvalsh) return;  // Control bits of t do not match.
+
+      auto p0 = rstate + row_size * s + 2 * t;  // Base address for this call.
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);  // Four real parts.
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);  // The four imaginary parts.
+      }
+
+      uint64_t j = 0;  // Index into w; two registers (re, im) per entry.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // Output register k.
+        rn = _mm_mul_ps(rs[0], w[j]);  // w[j]: real, w[j + 1]: imaginary.
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // Add remaining columns.
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);  // Inputs were all loaded above,
+        _mm_store_ps(p0 + xss[k] + 4, in);  // so storing in place is safe.
+      }
+    };
+
+    uint64_t ms[H + 1];  // Bit masks used to build t (set by FillIndices).
+    uint64_t xss[1 << H];  // Offsets of the 2^H gathered registers.
+    __m128 w[1 << (1 + 2 * H)];  // (re, im) x 2^H rows x 2^H columns.
+    // NOTE(review): w presumably encodes the low controls (cvalsl, cmaskl).
+    auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    FillControlledMatrixH<H, 2>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 2 + H;  // Each call covers 4 * 2^H = 2^k entries of a row.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // Kernel calls per row.
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows.
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // CH: true if cqs[0] > 1.
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, unsigned q0, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // Matrix dimension.
+      constexpr unsigned hsize = 1 << H;  // Registers loaded per call.
+      constexpr unsigned lsize = 1 << L;  // Lane arrangements per register.
+
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];  // Real / imaginary parts of the inputs.
+
+      uint64_t r = 4 * (i % size);  // Position inside the row, in steps of 4.
+      uint64_t s = i / size;  // Row index (multiplied by row_size below).
+
+      uint64_t t = r & ms[0];  // Scatter the bits of r through the masks ms.
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      if ((t & cmaskh) != cvalsh) return;  // Control bits of t do not match.
+
+      auto p0 = rstate + row_size * s + 2 * t;  // Base address for this call.
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;  // Slot of the unpermuted register.
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);  // Four real parts.
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);  // The four imaginary parts.
+
+        if (L == 1) {  // 177 selects lanes (1,0,3,2); 78 selects (2,3,0,1).
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {  // 57, 78, 147: lanes rotated by 1, 2, 3.
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      uint64_t j = 0;  // Index into w; two registers (re, im) per entry.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // Output register k.
+        rn = _mm_mul_ps(rs[0], w[j]);  // w[j]: real, w[j + 1]: imaginary.
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {  // Add remaining columns.
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);  // Inputs were all loaded above,
+        _mm_store_ps(p0 + xss[k] + 4, in);  // so storing in place is safe.
+      }
+    };
+
+    uint64_t ms[H + 1];  // Bit masks used to build t (set by FillIndices).
+    uint64_t xss[1 << H];  // Offsets of the 2^H loaded registers.
+    __m128 w[1 << (1 + 2 * H + L)];  // (re, im) x 2^H rows x 2^(H+L) columns.
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+
+    unsigned k = 2 + H;  // Each call covers 4 * 2^H = 2^k entries of a row.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // Kernel calls per row.
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // Number of rows.
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    if (CH) {  // cqs[0] > 1: w is filled by FillMatrix, as in ApplyGateL.
+      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
+      FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size * size2, f, w, ms, xss,
+               m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get());
+    } else {  // cqs[0] <= 1: w is filled by FillControlledMatrixL instead.
+      auto m = GetMasks10<L, 2>(state.num_qubits(), qs, cqs, cvals);
+      FillControlledMatrixL<H, L, 2>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size * size2, f, w, ms, xss,
+               m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get());
+    }
+  }
+
+  For for_;
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARY_CALCULATOR_SSE_H_
diff --git a/qsim/unitaryspace.h b/qsim/unitaryspace.h
new file mode 100644
index 0000000..b5e2691
--- /dev/null
+++ b/qsim/unitaryspace.h
@@ -0,0 +1,65 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARYSPACE_H_
+#define UNITARYSPACE_H_
+
+#include <cstdint>
+
+namespace qsim {
+
+namespace unitary {
+
+/**
+ * Base class template with routines shared by all unitary matrix containers.
+ * "AVX", "AVX512", "Basic", and "SSE" implementations are provided.
+ */
+template <typename Impl,
+          template<typename...> class VectorSpace, typename... VSTypeParams>
+class UnitarySpace : public VectorSpace<Impl, VSTypeParams...> {
+ private:
+  using Base = VectorSpace<Impl, VSTypeParams...>;
+
+ public:
+  using fp_type = typename Base::fp_type;
+  using Unitary = typename Base::Vector;
+
+  template <typename... ForArgs>  // Arguments are handed on to VectorSpace.
+  UnitarySpace(ForArgs&&... args) : Base(args...) {}
+
+  static Unitary CreateUnitary(unsigned num_qubits) {
+    return Base::Create(num_qubits);  // Delegates to the vector space.
+  }
+
+  static Unitary CreateUnitary(fp_type* p, unsigned num_qubits) {
+    return Base::Create(p, num_qubits);  // Same, for a caller-provided p.
+  }
+
+  static Unitary NullUnitary() {
+    return Base::Null();  // The vector space's null object.
+  }
+
+  static uint64_t Size(unsigned num_qubits) {
+    return static_cast<uint64_t>(1) << num_qubits;  // 2^num_qubits.
+  }
+
+  void CopyUnitary(const Unitary& src, Unitary& dest) const {
+    Base::Copy(src, dest);  // Any result of Copy is discarded.
+  }
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARYSPACE_H_
diff --git a/qsim/unitaryspace_avx.h b/qsim/unitaryspace_avx.h
new file mode 100644
index 0000000..c1ec59d
--- /dev/null
+++ b/qsim/unitaryspace_avx.h
@@ -0,0 +1,112 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARYSPACE_AVX_H_
+#define UNITARYSPACE_AVX_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+
+#include "unitaryspace.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace unitary {
+
+/**
+ * Storage layout and element access for unitaries on the AVX code path.
+ * Each row is stored in chunks of 16 floats: the real parts of eight
+ * consecutive entries followed by their eight imaginary parts. Eight
+ * single-precision numbers fit into one AVX register.
+ */
+template <typename For>
+struct UnitarySpaceAVX :
+    public UnitarySpace<UnitarySpaceAVX<For>, VectorSpace, For, float> {
+ private:
+  using Base = UnitarySpace<UnitarySpaceAVX<For>,
+                            qsim::VectorSpace, For, float>;
+
+ public:
+  using Unitary = typename Base::Unitary;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit UnitarySpaceAVX(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinRowSize(unsigned num_qubits) {  // Floats per row.
+    return std::max<uint64_t>(16, 2 * (uint64_t{1} << num_qubits));
+  }
+
+  static uint64_t MinSize(unsigned num_qubits) {  // Floats in all rows.
+    return MinRowSize(num_qubits) * Base::Size(num_qubits);
+  }
+
+  void SetAllZeros(Unitary& state) const {
+    auto clear = [](unsigned n, unsigned m, uint64_t i,
+                    __m256& zeros, fp_type* p) {
+      fp_type* chunk = p + 16 * i;  // 8 real parts, then 8 imaginary parts.
+      _mm256_store_ps(chunk, zeros);
+      _mm256_store_ps(chunk + 8, zeros);
+    };
+
+    __m256 zeros = _mm256_setzero_ps();
+    Base::for_.Run(MinSize(state.num_qubits()) / 16, clear, zeros, state.get());
+  }
+
+  void SetIdentity(Unitary& state) {
+    SetAllZeros(state);
+
+    // Sets the real part of diagonal entry (i, i): chunk i / 8, lane i % 8.
+    auto set_one = [](unsigned n, unsigned m, uint64_t i,
+                      uint64_t stride, fp_type* p) {
+      p[stride * i + 16 * (i / 8) + i % 8] = 1;
+    };
+
+    uint64_t num_rows = Base::Size(state.num_qubits());
+    uint64_t stride = MinRowSize(state.num_qubits());
+    Base::for_.Run(num_rows, set_one, stride, state.get());
+  }
+
+  static std::complex<fp_type> GetEntry(const Unitary& state,
+                                        uint64_t i, uint64_t j) {
+    // Real part of entry (i, j); the imaginary part is 8 floats further on.
+    const uint64_t pos =
+        MinRowSize(state.num_qubits()) * i + 16 * (j / 8) + j % 8;
+    return std::complex<fp_type>(state.get()[pos], state.get()[pos + 8]);
+  }
+
+  // Writes entry (i, j) by splitting ampl into its real and imaginary parts.
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
+                       const std::complex<fp_type>& ampl) {
+    SetEntry(state, i, j, std::real(ampl), std::imag(ampl));
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
+                       fp_type im) {
+    const uint64_t pos =
+        MinRowSize(state.num_qubits()) * i + 16 * (j / 8) + j % 8;
+    state.get()[pos] = re;
+    state.get()[pos + 8] = im;
+  }
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARYSPACE_AVX_H_
diff --git a/qsim/unitaryspace_avx512.h b/qsim/unitaryspace_avx512.h
new file mode 100644
index 0000000..4c23dc9
--- /dev/null
+++ b/qsim/unitaryspace_avx512.h
@@ -0,0 +1,112 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARYSPACE_AVX512_H_
+#define UNITARYSPACE_AVX512_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+
+#include "unitaryspace.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace unitary {
+
+/**
+ * Storage layout and element access for unitaries on the AVX512 code path.
+ * Each row is stored in chunks of 32 floats: the real parts of sixteen
+ * consecutive entries followed by their sixteen imaginary parts. Sixteen
+ * single-precision numbers fit into one AVX512 register.
+ */
+template <typename For>
+struct UnitarySpaceAVX512 :
+    public UnitarySpace<UnitarySpaceAVX512<For>, VectorSpace, For, float> {
+ private:
+  using Base = UnitarySpace<UnitarySpaceAVX512<For>,
+                            qsim::VectorSpace, For, float>;
+
+ public:
+  using Unitary = typename Base::Unitary;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit UnitarySpaceAVX512(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinRowSize(unsigned num_qubits) {  // Floats per row.
+    return std::max<uint64_t>(32, 2 * (uint64_t{1} << num_qubits));
+  }
+
+  static uint64_t MinSize(unsigned num_qubits) {  // Floats in all rows.
+    return MinRowSize(num_qubits) * Base::Size(num_qubits);
+  }
+
+  void SetAllZeros(Unitary& state) const {
+    auto clear = [](unsigned n, unsigned m, uint64_t i,
+                    __m512 zeros, fp_type* p) {
+      fp_type* chunk = p + 32 * i;  // 16 real parts, then 16 imaginary parts.
+      _mm512_store_ps(chunk, zeros);
+      _mm512_store_ps(chunk + 16, zeros);
+    };
+
+    __m512 zeros = _mm512_setzero_ps();
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, clear, zeros, state.get());
+  }
+
+  void SetIdentity(Unitary& state) {
+    SetAllZeros(state);
+
+    // Sets the real part of diagonal entry (i, i): chunk i / 16, lane i % 16.
+    auto set_one = [](unsigned n, unsigned m, uint64_t i,
+                      uint64_t stride, fp_type* p) {
+      p[stride * i + 32 * (i / 16) + i % 16] = 1;
+    };
+
+    uint64_t num_rows = Base::Size(state.num_qubits());
+    uint64_t stride = MinRowSize(state.num_qubits());
+    Base::for_.Run(num_rows, set_one, stride, state.get());
+  }
+
+  static std::complex<fp_type> GetEntry(const Unitary& state,
+                                        uint64_t i, uint64_t j) {
+    // Real part of entry (i, j); the imaginary part is 16 floats further on.
+    const uint64_t pos =
+        MinRowSize(state.num_qubits()) * i + 32 * (j / 16) + j % 16;
+    return std::complex<fp_type>(state.get()[pos], state.get()[pos + 16]);
+  }
+
+  // Writes entry (i, j) by splitting ampl into its real and imaginary parts.
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
+                       const std::complex<fp_type>& ampl) {
+    SetEntry(state, i, j, std::real(ampl), std::imag(ampl));
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
+                       fp_type im) {
+    const uint64_t pos =
+        MinRowSize(state.num_qubits()) * i + 32 * (j / 16) + j % 16;
+    state.get()[pos] = re;
+    state.get()[pos + 16] = im;
+  }
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARYSPACE_AVX512_H_
diff --git a/qsim/unitaryspace_basic.h b/qsim/unitaryspace_basic.h
new file mode 100644
index 0000000..2db14b6
--- /dev/null
+++ b/qsim/unitaryspace_basic.h
@@ -0,0 +1,103 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARYSPACE_BASIC_H_
+#define UNITARYSPACE_BASIC_H_
+
+#include <cmath>
+#include <complex>
+#include <cstdint>
+
+#include "unitaryspace.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace unitary {
+
+/**
+ * Object containing context and routines for unitary manipulations.
+ * Unitary is a non-vectorized sequence of one real amplitude followed by
+ * one imaginary amplitude.
+ */
+template <typename For, typename FP>
+struct UnitarySpaceBasic
+    : public UnitarySpace<UnitarySpaceBasic<For, FP>, VectorSpace, For, FP> {
+ private:
+  using Base = UnitarySpace<UnitarySpaceBasic<For, FP>,
+                            qsim::VectorSpace, For, FP>;
+
+ public:
+  using Unitary = typename Base::Unitary;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit UnitarySpaceBasic(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinRowSize(unsigned num_qubits) {
+    return 2 * (uint64_t{1} << num_qubits);  // re and im for each column
+  }
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return Base::Size(num_qubits) * MinRowSize(num_qubits);  // total floats
+  }
+
+  void SetAllZeros(Unitary& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
+      p[2 * i + 0] = 0;
+      p[2 * i + 1] = 0;
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get());
+  }
+
+  void SetIdentity(Unitary& state) {
+    SetAllZeros(state);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                uint64_t row_size, fp_type* p) {
+      p[row_size * i + 2 * i] = 1;
+    };
+
+    uint64_t size = Base::Size(state.num_qubits());
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    Base::for_.Run(size, f, row_size, state.get());
+  }
+
+  static std::complex<fp_type> GetEntry(const Unitary& state,
+                                        uint64_t i, uint64_t j) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    return std::complex<fp_type>(state.get()[row_size * i + 2 * j],
+                                 state.get()[row_size * i + 2 * j + 1]);
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
+                       const std::complex<fp_type>& ampl) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    state.get()[row_size * i + 2 * j] = std::real(ampl);
+    state.get()[row_size * i + 2 * j + 1] = std::imag(ampl);
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
+                       fp_type re, fp_type im) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    state.get()[row_size * i + 2 * j] = re;
+    state.get()[row_size * i + 2 * j + 1] = im;
+  }
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARYSPACE_BASIC_H_
diff --git a/qsim/unitaryspace_sse.h b/qsim/unitaryspace_sse.h
new file mode 100644
index 0000000..f3762fb
--- /dev/null
+++ b/qsim/unitaryspace_sse.h
@@ -0,0 +1,112 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARYSPACE_SSE_H_
+#define UNITARYSPACE_SSE_H_
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+
+#include "unitaryspace.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace unitary {
+
+/**
+ * Object containing context and routines for unitary manipulations.
+ * Unitary is a vectorized sequence of four real components followed by four
+ * imaginary components. Four single-precision floating numbers can be loaded
+ * into an SSE register.
+ */
+template <typename For>
+struct UnitarySpaceSSE :
+    public UnitarySpace<UnitarySpaceSSE<For>, VectorSpace, For, float> {
+ private:
+  using Base = UnitarySpace<UnitarySpaceSSE<For>,
+                            qsim::VectorSpace, For, float>;
+
+ public:
+  using Unitary = typename Base::Unitary;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit UnitarySpaceSSE(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinRowSize(unsigned num_qubits) {
+    return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits));
+  }
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return Base::Size(num_qubits) * MinRowSize(num_qubits);  // total floats
+  }
+
+  void SetAllZeros(Unitary& state) const {
+    __m128 val0 = _mm_setzero_ps();
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) {
+      _mm_store_ps(p + 8 * i, val0);
+      _mm_store_ps(p + 8 * i + 4, val0);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get());
+  }
+
+  void SetIdentity(Unitary& state) {
+    SetAllZeros(state);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                uint64_t row_size, fp_type* p) {
+      p[row_size * i + (8 * (i / 4)) + (i % 4)] = 1;
+    };
+
+    uint64_t size = Base::Size(state.num_qubits());
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    Base::for_.Run(size, f, row_size, state.get());
+  }
+
+  static std::complex<fp_type> GetEntry(const Unitary& state,
+                                        uint64_t i, uint64_t j) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (8 * (j / 4)) + (j % 4);
+    return std::complex<fp_type>(state.get()[row_size * i + k],
+                                 state.get()[row_size * i + k + 4]);
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
+                       const std::complex<fp_type>& ampl) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (8 * (j / 4)) + (j % 4);
+    state.get()[row_size * i + k] = std::real(ampl);
+    state.get()[row_size * i + k + 4] = std::imag(ampl);
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
+                       fp_type im) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (8 * (j / 4)) + (j % 4);
+    state.get()[row_size * i + k] = re;
+    state.get()[row_size * i + k + 4] = im;
+  }
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARYSPACE_SSE_H_
diff --git a/qsim/util.h b/qsim/util.h
new file mode 100644
index 0000000..726a019
--- /dev/null
+++ b/qsim/util.h
@@ -0,0 +1,89 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_H_
+#define UTIL_H_
+
+#include <algorithm>
+#include <chrono>
+#include <random>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace qsim {
+
+template <typename Container>
+inline void SplitString(
+    const std::string& str, char delim, Container& words) {
+  words.resize(0);
+
+  std::string word;
+  std::stringstream ss(str);
+
+  while (std::getline(ss, word, delim)) {
+    words.push_back(std::move(word));
+  }
+}
+
+template <typename Op, typename Container>
+inline void SplitString(
+    const std::string& str, char delim, Op op, Container& words) {
+  words.resize(0);
+
+  std::string word;
+  std::stringstream ss(str);
+
+  while (std::getline(ss, word, delim)) {
+    words.push_back(op(word));
+  }
+}
+
+inline double GetTime() {
+  using namespace std::chrono;
+  steady_clock::duration since_epoch = steady_clock::now().time_since_epoch();
+  return double(since_epoch.count() * steady_clock::period::num)
+                                    / steady_clock::period::den;
+}
+
+template <typename DistrRealType, typename RGen>
+inline DistrRealType RandomValue(RGen& rgen, DistrRealType max_value) {
+  std::uniform_real_distribution<DistrRealType> distr(0.0, max_value);
+  return distr(rgen);
+}
+
+template <typename DistrRealType>
+inline std::vector<DistrRealType> GenerateRandomValues(
+    uint64_t num_samples, unsigned seed, DistrRealType max_value) {
+  std::vector<DistrRealType> rs;
+  rs.reserve(num_samples + 1);
+
+  std::mt19937 rgen(seed);
+  std::uniform_real_distribution<DistrRealType> distr(0.0, max_value);
+
+  for (uint64_t i = 0; i < num_samples; ++i) {
+    rs.emplace_back(distr(rgen));
+  }
+
+  std::sort(rs.begin(), rs.end());
+  // Populate the final element to prevent sanitizer errors.
+  rs.emplace_back(max_value);
+
+  return rs;
+}
+
+}  // namespace qsim
+
+#endif  // UTIL_H_
diff --git a/qsim/util_cpu.h b/qsim/util_cpu.h
new file mode 100644
index 0000000..8e02425
--- /dev/null
+++ b/qsim/util_cpu.h
@@ -0,0 +1,43 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_CPU_H_
+#define UTIL_CPU_H_
+
+#ifdef __SSE2__
+# include <immintrin.h>
+#endif
+
+namespace qsim {
+
+// This function sets flush-to-zero and denormals-are-zeros MXCSR control
+// flags. This prevents rare cases of performance slowdown potentially at
+// the cost of a tiny precision loss.
+inline void SetFlushToZeroAndDenormalsAreZeros() {
+#ifdef __SSE2__
+  _mm_setcsr(_mm_getcsr() | 0x8040);
+#endif
+}
+
+// This function clears flush-to-zero and denormals-are-zeros MXCSR control
+// flags.
+inline void ClearFlushToZeroAndDenormalsAreZeros() {
+#ifdef __SSE2__
+  _mm_setcsr(_mm_getcsr() & ~unsigned{0x8040});
+#endif
+}
+
+}  // namespace qsim
+
+#endif  // UTIL_CPU_H_
diff --git a/qsim/util_cuda.h b/qsim/util_cuda.h
new file mode 100644
index 0000000..5d8cb5d
--- /dev/null
+++ b/qsim/util_cuda.h
@@ -0,0 +1,128 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_CUDA_H_
+#define UTIL_CUDA_H_
+
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+#endif
+
+#include <cstdlib>
+
+#include "io.h"
+
+namespace qsim {
+
+#define ErrorCheck(code) { ErrorAssert((code), __FILE__, __LINE__); }
+
+inline void ErrorAssert(cudaError_t code, const char* file, unsigned line) {
+  if (code != cudaSuccess) {
+    IO::errorf("CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
+    exit(code);
+  }
+}
+
+template <typename T>
+struct Complex {
+  __host__ __device__ __forceinline__ Complex() {}
+
+  __host__ __device__ __forceinline__ Complex(const T& re) : re(re), im(0) {}
+
+  __host__ __device__ __forceinline__ Complex(const T& re, const T& im)
+      : re(re), im(im) {}
+
+  template <typename U>
+  __host__ __device__ __forceinline__ Complex<T>& operator=(
+      const Complex<U>& r) {
+    re = r.re;
+    im = r.im;
+
+    return *this;
+  }
+
+  T re;
+  T im;
+};
+
+template <typename T>
+__host__ __device__ __forceinline__ Complex<T> operator+(
+    const Complex<T>& l, const Complex<T>& r) {
+  return Complex<T>(l.re + r.re, l.im + r.im);
+}
+
+template <typename T, typename U>
+__host__ __device__ __forceinline__ Complex<T> operator+(
+    const Complex<T>& l, const Complex<U>& r) {
+  return Complex<T>(l.re + r.re, l.im + r.im);
+}
+
+template <typename T>
+struct Scalar {
+  using type = T;
+};
+
+template <typename T>
+struct Scalar<Complex<T>> {
+  using type = T;
+};
+
+template <typename T>
+struct Plus {
+  template <typename U>
+  __device__ __forceinline__ T operator()(const T& v1, const U& v2) const {
+    return v1 + v2;
+  }
+};
+
+template <typename T>
+struct Product {
+  __device__ __forceinline__ Complex<T> operator()(
+      const T& re1, const T& im1, const T& re2, const T& im2) const {
+    return Complex<T>(re1 * re2 + im1 * im2, re1 * im2 - im1 * re2);
+  }
+};
+
+template <typename T>
+struct RealProduct {
+  __device__ __forceinline__ T operator()(
+      const T& re1, const T& im1, const T& re2, const T& im2) const {
+    return re1 * re2 + im1 * im2;
+  }
+};
+
+template <typename FP1, typename Op, unsigned warp_size = 32>
+__device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) {
+  for (unsigned i = warp_size / 2; i > 0; i /= 2) {
+    val = op(val, __shfl_down_sync(0xffffffff, val, i));
+  }
+
+  return val;
+}
+
+template <typename FP1, typename Op, unsigned warp_size = 32>
+__device__ __forceinline__ Complex<FP1> WarpReduce(Complex<FP1> val, Op op) {
+  for (unsigned i = warp_size / 2; i > 0; i /= 2) {
+    val.re = op(val.re, __shfl_down_sync(0xffffffff, val.re, i));
+    val.im = op(val.im, __shfl_down_sync(0xffffffff, val.im, i));
+  }
+
+  return val;
+}
+
+}  // namespace qsim
+
+#endif  // UTIL_CUDA_H_
diff --git a/qsim/util_custatevec.h b/qsim/util_custatevec.h
new file mode 100644
index 0000000..36f29ef
--- /dev/null
+++ b/qsim/util_custatevec.h
@@ -0,0 +1,44 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_CUSTATEVEC_H_
+#define UTIL_CUSTATEVEC_H_
+
+#include <cublas_v2.h>
+#include <custatevec.h>
+
+#include "io.h"
+#include "util_cuda.h"
+
+namespace qsim {
+
+inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) {
+  if (code != CUBLAS_STATUS_SUCCESS) {
+    IO::errorf("cuBLAS error %i: %s %d\n", code, file, line);
+    exit(code);
+  }
+}
+
+inline void ErrorAssert(
+    custatevecStatus_t code, const char* file, unsigned line) {
+  if (code != CUSTATEVEC_STATUS_SUCCESS) {
+    IO::errorf("custatevec error: %s %s %d\n",
+                custatevecGetErrorString(code), file, line);
+    exit(code);
+  }
+}
+
+}  // namespace qsim
+
+#endif  // UTIL_CUSTATEVEC_H_
diff --git a/qsim/vectorspace.h b/qsim/vectorspace.h
new file mode 100644
index 0000000..7b33a53
--- /dev/null
+++ b/qsim/vectorspace.h
@@ -0,0 +1,185 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef VECTORSPACE_H_
+#define VECTORSPACE_H_
+
+#ifdef _WIN32
+  #include <malloc.h>
+#endif
+
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <utility>
+
+namespace qsim {
+
+namespace detail {
+
+inline void do_not_free(void*) {}
+
+inline void free(void* ptr) {
+#ifdef _WIN32
+  _aligned_free(ptr);
+#else
+  ::free(ptr);
+#endif
+}
+
+}  // namespace detail
+
+// Routines for vector manipulations.
+template <typename Impl, typename For, typename FP>
+class VectorSpace {
+ public:
+  using fp_type = FP;
+
+ private:
+  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
+
+ public:
+  class Vector {
+   public:
+    Vector() = delete;
+
+    Vector(Pointer&& ptr, unsigned num_qubits)
+        : ptr_(std::move(ptr)), num_qubits_(num_qubits) {}
+
+    fp_type* get() {
+      return ptr_.get();
+    }
+
+    const fp_type* get() const {
+      return ptr_.get();
+    }
+
+    fp_type* release() {
+      num_qubits_ = 0;
+      return ptr_.release();
+    }
+
+    unsigned num_qubits() const {
+      return num_qubits_;
+    }
+
+    bool requires_copy_to_host() const {
+      return false;
+    }
+
+   private:
+    Pointer ptr_;
+    unsigned num_qubits_;
+  };
+
+  template <typename... ForArgs>
+  VectorSpace(ForArgs&&... args) : for_(args...) {}
+
+  // Allocates a 64-byte aligned vector; yields a null vector on failure.
+  // The result owns its memory and releases it through detail::free.
+  static Vector Create(unsigned num_qubits) {
+    auto size = sizeof(fp_type) * Impl::MinSize(num_qubits);
+    #ifdef _WIN32
+      // Test the raw pointer so nothing is read from a moved-from object.
+      auto* p = (fp_type*) _aligned_malloc(size, 64);
+      return Vector{Pointer{p, &detail::free}, p != nullptr ? num_qubits : 0};
+    #else
+      void* p = nullptr;
+      if (posix_memalign(&p, 64, size) != 0) return Null();
+      return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits};
+    #endif
+  }
+
+  // It is the client's responsibility to make sure that p has at least
+  // Impl::MinSize(num_qubits) elements.
+  static Vector Create(fp_type* p, unsigned num_qubits) {
+    return Vector{Pointer{p, &detail::do_not_free}, num_qubits};
+  }
+
+  static Vector Null() {
+    return Vector{Pointer{nullptr, &detail::free}, 0};
+  }
+
+  static bool IsNull(const Vector& vec) {
+    return vec.get() == nullptr;
+  }
+
+  static void Free(fp_type* ptr) {
+    detail::free(ptr);
+  }
+
+  bool Copy(const Vector& src, Vector& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* src, fp_type* dest) {
+      dest[i] = src[i];
+    };
+
+    for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest.get());
+
+    return true;
+  }
+
+  // It is the client's responsibility to make sure that dest has at least
+  // Impl::MinSize(src.num_qubits()) elements.
+  bool Copy(const Vector& src, fp_type* dest) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* src, fp_type* dest) {
+      dest[i] = src[i];
+    };
+
+    for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest);
+
+    return true;
+  }
+
+  // It is the client's responsibility to make sure that src has at least
+  // Impl::MinSize(dest.num_qubits()) elements.
+  bool Copy(const fp_type* src, Vector& dest) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* src, fp_type* dest) {
+      dest[i] = src[i];
+    };
+
+    for_.Run(Impl::MinSize(dest.num_qubits()), f, src, dest.get());
+
+    return true;
+  }
+
+  // Copies at most size elements from src into dest. The client must make
+  // sure src has min(size, Impl::MinSize(dest.num_qubits())) elements.
+  bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* src, fp_type* dest) {
+      dest[i] = src[i];
+    };
+
+    // Clamp by hand: this header does not include <algorithm> for std::min.
+    const uint64_t capacity = Impl::MinSize(dest.num_qubits());
+    for_.Run(size < capacity ? size : capacity, f, src, dest.get());
+    return true;
+  }
+
+  void DeviceSync() {}
+
+ protected:
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // VECTORSPACE_H_
diff --git a/qsim/vectorspace_cuda.h b/qsim/vectorspace_cuda.h
new file mode 100644
index 0000000..fd91553
--- /dev/null
+++ b/qsim/vectorspace_cuda.h
@@ -0,0 +1,172 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef VECTORSPACE_CUDA_H_
+#define VECTORSPACE_CUDA_H_
+
+#ifdef __NVCC__
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
+
+#include <memory>
+#include <utility>
+
+namespace qsim {
+
+namespace detail {
+
+inline void do_not_free(void*) {}
+
+inline void free(void* ptr) {
+  ErrorCheck(cudaFree(ptr));
+}
+
+}  // namespace detail
+
+// Routines for vector manipulations.
+template <typename Impl, typename FP>
+class VectorSpaceCUDA {
+ public:
+  using fp_type = FP;
+
+ private:
+  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
+
+ public:
+  class Vector {
+   public:
+    Vector() = delete;
+
+    Vector(Pointer&& ptr, unsigned num_qubits)
+        : ptr_(std::move(ptr)), num_qubits_(num_qubits) {}
+
+    fp_type* get() {
+      return ptr_.get();
+    }
+
+    const fp_type* get() const {
+      return ptr_.get();
+    }
+
+    fp_type* release() {
+      num_qubits_ = 0;
+      return ptr_.release();
+    }
+
+    unsigned num_qubits() const {
+      return num_qubits_;
+    }
+
+    bool requires_copy_to_host() const {
+      return true;
+    }
+
+   private:
+    Pointer ptr_;
+    unsigned num_qubits_;
+  };
+
+  template <typename... Args>
+  VectorSpaceCUDA(Args&&... args) {}
+
+  static Vector Create(unsigned num_qubits) {
+    fp_type* p;
+    auto size = sizeof(fp_type) * Impl::MinSize(num_qubits);
+    auto rc = cudaMalloc(&p, size);
+
+    if (rc == cudaSuccess) {
+      return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits};
+    } else {
+      return Null();
+    }
+  }
+
+  // It is the client's responsibility to make sure that p has at least
+  // Impl::MinSize(num_qubits) elements.
+  static Vector Create(fp_type* p, unsigned num_qubits) {
+    return Vector{Pointer{p, &detail::do_not_free}, num_qubits};
+  }
+
+  static Vector Null() {
+    return Vector{Pointer{nullptr, &detail::free}, 0};
+  }
+
+  static bool IsNull(const Vector& vector) {
+    return vector.get() == nullptr;
+  }
+
+  static void Free(fp_type* ptr) {
+    detail::free(ptr);
+  }
+
+  bool Copy(const Vector& src, Vector& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    ErrorCheck(
+        cudaMemcpy(dest.get(), src.get(),
+                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                   cudaMemcpyDeviceToDevice));
+
+    return true;
+  }
+
+  // It is the client's responsibility to make sure that dest has at least
+  // Impl::MinSize(src.num_qubits()) elements.
+  bool Copy(const Vector& src, fp_type* dest) const {
+    ErrorCheck(
+        cudaMemcpy(dest, src.get(),
+                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                   cudaMemcpyDeviceToHost));
+
+    return true;
+  }
+
+  // It is the client's responsibility to make sure that src has at least
+  // Impl::MinSize(dest.num_qubits()) elements.
+  bool Copy(const fp_type* src, Vector& dest) const {
+    ErrorCheck(
+        cudaMemcpy(dest.get(), src,
+                   sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
+                   cudaMemcpyHostToDevice));
+
+    return true;
+  }
+
+  // Copies at most size host elements from src to the device vector dest.
+  // The client must make sure src has that many elements. The clamp is
+  // written by hand because this header does not include <algorithm>.
+  bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
+    const uint64_t capacity = Impl::MinSize(dest.num_qubits());
+    const uint64_t count = size < capacity ? size : capacity;
+    ErrorCheck(cudaMemcpy(dest.get(), src, sizeof(fp_type) * count,
+                          cudaMemcpyHostToDevice));
+    return true;
+  }
+
+  void DeviceSync() {
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+ protected:
+};
+
+}  // namespace qsim
+
+#endif  // VECTORSPACE_CUDA_H_
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 845a29a..b01bf2f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -12,8 +12,13 @@ qiree_configure_file("qiree_config.h.in" "qiree_config.h" @ONLY)
 #----------------------------------------------------------------------------#
 
 add_subdirectory(qiree)
+
 if(QIREE_USE_XACC)
   add_subdirectory(qirxacc)
 endif()
 
+if(QIREE_USE_QSIM)
+  add_subdirectory(qirqsim)
+endif()
+
 #---------------------------------------------------------------------------##
diff --git a/src/qirqsim/BufferManager.cc b/src/qirqsim/BufferManager.cc
new file mode 100644
index 0000000..2e6f646
--- /dev/null
+++ b/src/qirqsim/BufferManager.cc
@@ -0,0 +1,33 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirqsim/BufferManager.cc
+//---------------------------------------------------------------------------//
+
+#include "BufferManager.hh"
+#include <unordered_map>
+#include <string>
+#include <optional>
+
+void BufferManager::updateBuffer(const std::string& qubit, const std::string& state, const int& value) {
+    // Accumulate the count for this (qubit, state) key across shots.
+    //
+    // std::unordered_map::operator[] value-initializes a missing entry to 0
+    // before returning a reference to it, so a single lookup covers both the
+    // "first time seen" and the "already present" cases: no separate find()
+    // is needed, and the key pair is built only once.
+    //
+    // `value` is added to the stored count; it does not overwrite it.
+    buffer[{qubit, state}] += value;
+}
+
+std::optional<int> BufferManager::getBufferValue(const std::string& qubit, const std::string& state) const {
+    std::pair<std::string, std::string> searchKey = {qubit, state};
+    auto it = buffer.find(searchKey);
+    if (it != buffer.end()) {
+        return it->second;  // Key found
+    }
+    return std::nullopt;  // Key not found
+}
\ No newline at end of file
diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh
new file mode 100644
index 0000000..dc03846
--- /dev/null
+++ b/src/qirqsim/BufferManager.hh
@@ -0,0 +1,45 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirqsim/BufferManager.hh
+//---------------------------------------------------------------------------//
+
+#ifndef BUFFER_MANAGER_H
+#define BUFFER_MANAGER_H
+
+#include <unordered_map>
+#include <string>
+#include <optional>
+#include <functional>
+#include <utility>
+
+// Define a hash function for std::pair
+
+struct pair_hash {
+    template <class T1, class T2>
+    std::size_t operator()(const std::pair<T1, T2>& pair) const {
+        auto hash1 = std::hash<T1>{}(pair.first);
+        auto hash2 = std::hash<T2>{}(pair.second);
+        // Combine the two hash values
+        return hash1 ^ (hash2 << 1);  // Shift and XOR
+    }
+};
+
+class BufferManager {
+public:
+    
+    // Method to update the buffer with a key-value pair
+    void updateBuffer(const std::string& qubit, const std::string& state, const int& value);
+    
+    // Retrieve buffer value for storage or evaluation
+    std::optional<int> getBufferValue(const std::string& qubit, const std::string& state) const;
+    
+private:
+    
+    // Dictionary to store key-value pairs
+    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> buffer;
+};
+
+#endif // BUFFER_MANAGER_H
diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt
new file mode 100644
index 0000000..09a0511
--- /dev/null
+++ b/src/qirqsim/CMakeLists.txt
@@ -0,0 +1,29 @@
+#---------------------------------*-CMake-*----------------------------------#
+# Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+# See the top-level COPYRIGHT file for details.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#----------------------------------------------------------------------------#
+
+# Adding qsim as a library to qiree
+qiree_add_library(qirqsim
+  qsimQuantum.cc
+  qsimDefaultRuntime.cc
+  qsimTupleRuntime.cc
+  BufferManager.cc
+)
+
+# Link qirqsim against qiree (and any other libraries it requires)
+target_link_libraries(qirqsim
+  PUBLIC QIREE::qiree  # Link to qiree
+)
+
+#----------------------------------------------------------------------------#
+# HEADERS
+#----------------------------------------------------------------------------#
+
+# Install headers, matching the relevant .hh files for qsim integration
+install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/qirqsim"
+  COMPONENT development
+  FILES_MATCHING REGEX ".*\\.hh?$"
+)
diff --git a/src/qirqsim/qsimDefaultRuntime.cc b/src/qirqsim/qsimDefaultRuntime.cc
new file mode 100644
index 0000000..955959d
--- /dev/null
+++ b/src/qirqsim/qsimDefaultRuntime.cc
@@ -0,0 +1,71 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirqsim/qsimDefaultRuntime.cc
+//---------------------------------------------------------------------------//
+#include "qsimDefaultRuntime.hh"
+#include <iostream>
+#include "qiree/Assert.hh"
+ 
+namespace qiree
+{
+//---------------------------------------------------------------------------//
+/*!
+ * Initialize the execution environment, resetting qubits.
+ */
+
+void qsimDefaultRuntime::initialize(OptionalCString env)
+{
+    // Only echo the argument when one was actually supplied
+    if (!env)
+        return;
+    output_ << "Argument to initialize: " << env << std::endl;
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Mark the following N results as being part of an array named tag.
+ *
+ * Currently a no-op: this runtime prints nothing for array headers.
+ */
+void qsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag)
+{
+    // Empty for now. A disabled draft executed the circuit here and then
+    // printed "array <tag> length <s>" (with "<null>" for a missing tag);
+    // neither argument is used at present.
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Mark the following N results as being part of a tuple named tag.
+ *
+ * Currently a no-op: this runtime prints nothing for tuple headers.
+ */
+void qsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag)
+{
+    // Empty for now. A disabled draft executed the circuit here and then
+    // printed "tuple <tag> length <s>" (with "<null>" for a missing tag);
+    // neither argument is used at present.
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Report the values buffered so far for one result, as "q<id> |s> freq: N".
+ */
+void qsimDefaultRuntime::result_record_output(Result r, OptionalCString tag)
+{
+    // TODO: This prints results every time result_record_output is called;
+    // it may be enough to print only the final time. `tag` is not used.
+    std::string const name = "q" + std::to_string(r.value);
+    for (char const* state : {"0", "1"})
+    {
+        // Write to the stream given at construction (not std::cout) so the
+        // caller decides where results go; states with no entry are skipped.
+        if (auto value = sim_.manager.getBufferValue(name, state))
+            output_ << name << " |" << state << "> freq: " << *value << "\n";
+    }
+}
+
+}  // namespace qiree
diff --git a/src/qirqsim/qsimDefaultRuntime.hh b/src/qirqsim/qsimDefaultRuntime.hh
new file mode 100644
index 0000000..70dfdd4
--- /dev/null
+++ b/src/qirqsim/qsimDefaultRuntime.hh
@@ -0,0 +1,61 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirqsim/qsimDefaultRuntime.hh
+//---------------------------------------------------------------------------//
+#pragma once
+
+#include "qsimQuantum.hh"
+
+namespace qiree
+{
+
+/*!
+ * Print per-qubit measurement statistics. 
+ *
+ * Example for three qubits:
+ * \code
+ * q0 |0> freq: 509
+ * q0 |1> freq: 515
+ * q1 |0> freq: 509
+ * q1 |1> freq: 515
+ * q2 |1> freq: 1024
+ * \endcode
+ */
+
+class qsimDefaultRuntime final : virtual public RuntimeInterface
+{
+  public:
+    /*!
+     * Construct from an output stream and the simulator whose buffered
+     * measurement values are reported; both are held by reference.
+     */
+    qsimDefaultRuntime(std::ostream& output, qsimQuantum& sim)
+        : output_(output)
+        , sim_(sim)
+    {
+    }
+
+    //!@{
+    //! \name Runtime interface
+    //! Initialize the execution environment (echoes the optional argument)
+    void initialize(OptionalCString env) override;
+
+    //! Mark the following N results as being part of an array named tag
+    void array_record_output(size_type, OptionalCString tag) final;
+
+    //! Mark the following N results as being part of a tuple named tag
+    void tuple_record_output(size_type, OptionalCString) final;
+
+    //! Report the buffered values for one result
+    void result_record_output(Result result, OptionalCString tag) final;
+    //!@}
+
+  private:
+    std::ostream& output_;  //!< destination stream supplied by the caller
+    qsimQuantum& sim_;  //!< simulator that owns the measurement buffer
+};
+
+}  // namespace qiree
diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/qsimQuantum.cc
new file mode 100644
index 0000000..81f40ef
--- /dev/null
+++ b/src/qirqsim/qsimQuantum.cc
@@ -0,0 +1,218 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirqsim/qsimQuantum.cc
+//---------------------------------------------------------------------------//
+
+#include "qsimQuantum.hh"
+
+#include <algorithm>
+#include <iostream>
+#include <utility>
+#include <stdexcept>
+#include <thread>
+#include <optional>
+#include <cassert>
+
+#include "qiree/Assert.hh"
+
+// Qsim
+#include "../../tpls/qsim/simulator_basic.h"
+#include "../../tpls/qsim/statespace_basic.h"
+#include "../../tpls/qsim/gates_qsim.h"
+#include "../../tpls/qsim/circuit.h"
+#include "../../tpls/qsim/run_qsim.h"
+#include "../../tpls/qsim/io.h"
+#include "../../tpls/qsim/fuser.h"
+#include "../../tpls/qsim/circuit_qsim_parser.h" 
+#include "../../tpls/qsim/fuser_mqubit.h"
+#include "../../tpls/qsim/io_file.h"
+#include "../../tpls/qsim/simmux.h"
+#include "../../tpls/qsim/util_cpu.h"
+#include "../../tpls/qsim/formux.h"
+#include "../../tpls/qsim/gate.h"
+//
+
+namespace qiree{
+//---------------------------------------------------------------------------//
+/*
+Initialize the qsim simulator
+*/
+
+qsimQuantum::State qsimQuantum::init_state_space() { // Configure qsim and allocate a zeroed state for num_qubits() qubits
+    // NOTE(review): execute_if_needed() overwrites qsimParam.seed before every run, so this seed is never the one used
+    std::srand(static_cast<unsigned int>(std::time(nullptr))); // Seed the random number generator
+    qsimParam.seed = std::rand(); // Set the seed for qsim parameters
+    numThreads = std::max(1, static_cast<int>(std::thread::hardware_concurrency())); // Use at least one thread
+    qsimParam.max_fused_size = 2; // Set the maximum size of fused gates
+    qsimParam.verbosity = 0; // see verbosity in run_qsim.h
+    // Allocate the state through the same factory type that the runner uses
+    qsimQuantum::StateSpace state_space = Factory(numThreads).CreateStateSpace();
+    State state = state_space.Create(this->num_qubits());
+    // Stop here on allocation failure: a null state must not be zeroed or handed back to the caller
+    QIREE_VALIDATE(!state_space.IsNull(state),
+                   << "not enough memory: is the number of qubits too large?");
+    state_space.SetStateZero(state); // TODO: the initial state is not necessarily zero
+    return state;
+}
+  
+  // Bind the output stream; NOTE(review): `shots` is accepted but not stored or used here
+  qsimQuantum::qsimQuantum(std::ostream& os, size_type shots)
+      : output_(os)
+  {
+  }
+
+//---------------------------------------------------------------------------//
+/*
+Prepare to build a quantum circuit for an entry point
+*/
+void qsimQuantum::set_up(EntryPointAttrs const& attrs) {
+    QIREE_VALIDATE(attrs.required_num_qubits > 0, << "input is not a quantum program");
+
+    // Start a fresh gate timeline sized for this entry point
+    execution_time = 0;
+    num_qubits_ = attrs.required_num_qubits;
+    q_circuit.num_qubits = num_qubits_;
+    // One slot per declared result (the results are assumed to map onto qubits; probably not true in general)
+    result_to_qubit_.resize(attrs.required_num_results);
+    // Allocate the state last: init_state_space() reads num_qubits_
+    state_ = std::make_shared<State>(init_state_space());
+}
+
+//---------------------------------------------------------------------------//
+/*
+Complete an execution
+*/
+void qsimQuantum::tear_down() {
+    // Leave the object ready for another run with the same register size
+    state_ = std::make_shared<State>(init_state_space()); // newly allocated, zeroed state
+    q_circuit = {}; // discard every queued gate...
+    q_circuit.num_qubits = num_qubits_; // ...but keep the qubit count
+}
+
+//---------------------------------------------------------------------------//
+/*
+Reset the qubit. NOTE(review): `q` is received by value, so the assignment below only changes this local copy; no gate is queued and the simulator state is not touched.
+*/
+void qsimQuantum::reset(Qubit q) {
+    q.value=0;
+}
+
+//----------------------------------------------------------------------------//
+/* 
+Read the value of a result. This utilizes the new BufferManager.
+*/
+QState qsimQuantum::read_result(Result r)
+{
+    // Run everything queued so far. Exactly one measurement of exactly one
+    // qubit is expected back from that run.
+    auto meas_results = execute_if_needed();
+    // Validate before indexing: meas_results[0].bitstring[0] below would be
+    // out of bounds if nothing (or something of another shape) was measured.
+    QIREE_VALIDATE(meas_results.size() == 1
+                       && meas_results[0].bitstring.size() == 1,
+                   << "Unexpected measurement results encountered.");
+    auto const bit = meas_results[0].bitstring[0];
+    QIREE_VALIDATE(bit == 0 || bit == 1,
+                   << "measurement outcome is not a single bit");
+
+    // Record the outcome under the key ("q<result id>", "0" or "1")
+    manager.updateBuffer("q" + std::to_string(r.value), bit == 1 ? "1" : "0", 1);
+    return static_cast<QState>(bit);
+}
+
+//---------------------------------------------------------------------------//
+/*
+Map a qubit to a result index 
+(TODO: find how to link the classical register to the quantum register in qsim)
+*/
+void qsimQuantum::mz(Qubit q, Result r) { // there is no classical register yet; the measurement is only queued here
+    QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in the set of qubits, e.g. with qubits {2,3,4,5} a value below num_qubits is not necessarily a member
+    // Remember which qubit feeds this result so result_to_qubit() can answer; ids outside the declared result table are skipped
+    if (r.value < result_to_qubit_.size()) { result_to_qubit_[r.value] = q; }
+    // Add measurement instruction
+    this->q_circuit.gates.push_back(qsim::gate::Measurement<qsim::GateQSim<float>>::Create(execution_time++, {this->getQubitIndex(q)}));
+}
+
+//---------------------------------------------------------------------------//
+/*
+Quantum Instruction Mapping
+*/
+// 1. Entangling gates
+void qsimQuantum::cx(Qubit q1, Qubit q2) { // queue a qsim CNot gate on (q1, q2) at the next time step
+    unsigned const first = this->getQubitIndex(q1), second = this->getQubitIndex(q2);
+    q_circuit.gates.push_back(qsim::GateCNot<float>::Create(execution_time++, first, second));
+}
+void qsimQuantum::cnot(Qubit q1, Qubit q2) {
+    // cnot is another name for cx: queue the identical gate
+    this->cx(q1, q2);
+}
+void qsimQuantum::cz(Qubit q1, Qubit q2) { // queue a qsim CZ gate on (q1, q2) at the next time step
+    unsigned const first = this->getQubitIndex(q1), second = this->getQubitIndex(q2);
+    q_circuit.gates.push_back(qsim::GateCZ<float>::Create(execution_time++, first, second));
+}
+// 2. Local gates
+void qsimQuantum::h(Qubit q) { // queue a qsim Hd gate on q at the next time step
+    unsigned const target = this->getQubitIndex(q);
+    q_circuit.gates.push_back(qsim::GateHd<float>::Create(execution_time++, target));
+}
+void qsimQuantum::s(Qubit q) { // queue a qsim S gate on q at the next time step
+    unsigned const target = this->getQubitIndex(q);
+    q_circuit.gates.push_back(qsim::GateS<float>::Create(execution_time++, target));
+}
+void qsimQuantum::t(Qubit q) { // queue a qsim T gate on q at the next time step
+    unsigned const target = this->getQubitIndex(q);
+    q_circuit.gates.push_back(qsim::GateT<float>::Create(execution_time++, target));
+}
+// 2.1 Pauli gates
+void qsimQuantum::x(Qubit q) { // queue a qsim X gate on q at the next time step
+    unsigned const target = this->getQubitIndex(q);
+    q_circuit.gates.push_back(qsim::GateX<float>::Create(execution_time++, target));
+}
+void qsimQuantum::y(Qubit q) { // queue a qsim Y gate on q at the next time step
+    unsigned const target = this->getQubitIndex(q);
+    q_circuit.gates.push_back(qsim::GateY<float>::Create(execution_time++, target));
+}
+void qsimQuantum::z(Qubit q) { // queue a qsim Z gate on q at the next time step
+    unsigned const target = this->getQubitIndex(q);
+    q_circuit.gates.push_back(qsim::GateZ<float>::Create(execution_time++, target));
+}
+// 2.2 rotation gates
+void qsimQuantum::rx(double theta, Qubit q) { // queue a qsim RX gate on q with angle theta
+    unsigned const target = this->getQubitIndex(q);
+    q_circuit.gates.push_back(qsim::GateRX<float>::Create(execution_time++, target, theta));
+}
+void qsimQuantum::ry(double theta, Qubit q) { // queue a qsim RY gate on q with angle theta
+    unsigned const target = this->getQubitIndex(q);
+    q_circuit.gates.push_back(qsim::GateRY<float>::Create(execution_time++, target, theta));
+}
+void qsimQuantum::rz(double theta, Qubit q) { // queue a qsim RZ gate on q with angle theta
+    unsigned const target = this->getQubitIndex(q);
+    q_circuit.gates.push_back(qsim::GateRZ<float>::Create(execution_time++, target, theta));
+}
+
+Qubit qsimQuantum::result_to_qubit(Result r) {
+    // TODO: reported as not working (gives 0 every time); this function may not be needed at all.
+    QIREE_EXPECT(r.value < this->num_results()); // the id must lie inside the mapping table
+    Qubit const mapped = result_to_qubit_[r.value]; // plain table lookup, copied from the qirxacc backend
+    return mapped;
+}
+
+void qsimQuantum::print_accelbuf() {
+    // TODO: not implemented yet (empty stub); a buffer class could be created to store the results to print
+}
+
+qsimQuantum::VecMeas qsimQuantum::execute_if_needed() { // Run the queued gates on *state_ and return the measurements
+    VecMeas meas_results; // filled in by Runner::Run; this must be empty before running
+    static unsigned long int seed = 0; // counter shared by all calls: each run gets the next value as its seed
+    qsimParam.seed = seed++;
+    const bool run_success = Runner::Run(qsimParam, Factory(numThreads), q_circuit, *state_, meas_results); // Run the simulation
+    // Checked in every build type: a bare assert() is compiled out under NDEBUG and would let a failed run pass
+    QIREE_VALIDATE(run_success, << "qsim failed to run the queued circuit");
+    // Reset the circuit so that the next call only runs gates queued after this point
+    q_circuit = {};
+    q_circuit.num_qubits = num_qubits_;
+    return meas_results;
+}
+
+} // namespace qiree
diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/qsimQuantum.hh
new file mode 100644
index 0000000..e720e8c
--- /dev/null
+++ b/src/qirqsim/qsimQuantum.hh
@@ -0,0 +1,175 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirqsim/qsimQuantum.hh
+//---------------------------------------------------------------------------//
+#pragma once
+
+#include <initializer_list>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <vector>
+#include <cassert>
+
+#include "qiree/Macros.hh"
+#include "qiree/QuantumNotImpl.hh"
+#include "qiree/RuntimeInterface.hh"
+#include "qiree/Types.hh"
+#include "BufferManager.hh"
+
+#include "../../tpls/qsim/simulator_basic.h"
+#include "../../tpls/qsim/statespace_basic.h"
+#include "../../tpls/qsim/gates_qsim.h"
+#include "../../tpls/qsim/circuit.h"
+#include "../../tpls/qsim/run_qsim.h"
+#include "../../tpls/qsim/io.h"
+#include "../../tpls/qsim/fuser.h"
+#include "../../tpls/qsim/circuit_qsim_parser.h" 
+#include "../../tpls/qsim/fuser_mqubit.h"
+#include "../../tpls/qsim/io_file.h"
+#include "../../tpls/qsim/simmux.h"
+#include "../../tpls/qsim/util_cpu.h"
+#include "../../tpls/qsim/formux.h"
+#include "../../tpls/qsim/gate.h"
+
+struct Factory { // Creates qsim state spaces and simulators that share one thread count
+    using Simulator = qsim::Simulator<qsim::For>;
+    using StateSpace = Simulator::StateSpace;
+    Factory(unsigned num_threads) : num_threads(num_threads) {}
+    Simulator CreateSimulator() const { return Simulator(num_threads); }
+    StateSpace CreateStateSpace() const { return StateSpace(num_threads); }
+    unsigned num_threads; // passed to every object created above
+};
+
+namespace qiree
+{
+    class qsimQuantum final : virtual public QuantumNotImpl
+    {
+    // Quantum backend that queues gates into a qsim circuit and runs them when a result is read.
+    public: 
+
+    // Construct with an output stream and a number of shots
+    qsimQuantum(std::ostream& os, size_type shots);
+
+    // qsim types used by this backend
+    using Simulator = qsim::Simulator<qsim::For>;
+    using StateSpace = Simulator::StateSpace;
+    using State = StateSpace::State;
+    using Fuser = qsim::MultiQubitGateFuser<qsim::IO, qsim::GateQSim<float>>;
+    using Runner = qsim::QSimRunner<qsim::IO, Fuser, Factory>;
+    using VecMeas = std::vector<StateSpace::MeasurementResult>;
+
+    State init_state_space(); // Configure qsim and allocate a zeroed state
+
+    QIREE_DELETE_COPY_MOVE(qsimQuantum); // Delete copy and move constructors
+
+    //!@{
+    //! \name Accessors
+    size_type num_results() const { return result_to_qubit_.size(); }
+    size_type num_qubits() const { return num_qubits_; }
+    
+    unsigned getQubitIndex(Qubit q) {
+    return static_cast<unsigned>(q.value); // The qubit's value is used directly as the qsim index
+    }
+    //!@}
+
+    //!@{
+    //! \name Quantum interface
+    // Prepare to build a quantum circuit for an entry point
+    void set_up(EntryPointAttrs const&) override;
+
+    // Complete an execution
+    void tear_down() override;
+
+    // Queue a measurement of a qubit for a result index
+    void mz(Qubit, Result) final;
+
+    // Run the queued circuit and read the value of a result.
+    QState read_result(Result) final;
+    //!@}
+
+    //!@{
+    //! \name Utilities for runtime
+    // Get runtime qubit corresponding to a runtime result
+    Qubit result_to_qubit(Result);
+
+    // Wrapper for qsim
+    //std::map<std::string, int>
+    //get_marginal_counts(std::vector<Qubit> const& qubits);
+
+    // Run the gates queued so far on the current state, empty the circuit,
+    // and return the measurement results produced by that run.
+    VecMeas execute_if_needed();
+
+    void print_accelbuf();
+    //!@}
+
+    //!@{
+    //! \name Circuit construction
+    // void ccx(Qubit, Qubit) final;
+    void ccnot(Qubit, Qubit, Qubit);  // TODO: not in examples or qir runner
+    void cnot(Qubit, Qubit) final;
+    void cx(Qubit, Qubit) final;
+    // void cy(Qubit, Qubit) final;
+    void cz(Qubit, Qubit) final;
+    void h(Qubit) final;
+    void reset(Qubit) final;
+    void rx(double, Qubit) final;
+    void ry(double, Qubit) final;
+    void rz(double, Qubit) final;
+    // void rzz(double, Qubit, Qubit) final;
+    void s(Qubit) final;
+    // void s_adj(Qubit) final;
+    // void swap(Qubit, Qubit) final;
+    void t(Qubit) final;
+    // void t_adj(Qubit) final;
+    void x(Qubit) final;
+    void y(Qubit) final;
+    void z(Qubit) final;
+    //!@}
+
+    // Get a copy of the quantum circuit
+    qsim::Circuit<qsim::GateQSim<float>> get_circuit() const { return q_circuit; } 
+    // Get the state (state_ must have been set by set_up or tear_down)
+    State const& get_state() const { return *state_; }
+    // Buffer of measurement values, updated by read_result
+    BufferManager manager;
+    
+    private:
+        //// TYPES ////
+        enum class Endianness
+        {
+            little,
+            big
+        };
+        unsigned numThreads{1}; // Number of threads to use; recomputed by init_state_space
+        unsigned max_fused_size{}; // Maximum size of fused gates
+        qsim::Circuit<qsim::GateQSim<float>> q_circuit; // Quantum circuit object
+        
+        Runner::Parameter qsimParam; // Parameters for qsim
+        size_t execution_time{0}; // time step given to the next queued gate
+
+        bool executed{false};
+        size_type num_qubits_{};
+        std::vector<Qubit> result_to_qubit_;
+        Endianness endian_{};
+
+        std::ostream& output_;
+        std::shared_ptr<Simulator> simulator_;
+        std::shared_ptr<StateSpace> statespace_;
+        std::shared_ptr<State> state_;
+
+    };
+
+    class buffer { // plain holder for a single size value
+    public:
+        size_t size; // the value passed to the constructor
+        buffer(size_t size) : size(size) {}
+    };
+
+}  // namespace qiree
+
+    
diff --git a/src/qirqsim/qsimTupleRuntime.cc b/src/qirqsim/qsimTupleRuntime.cc
new file mode 100644
index 0000000..5366b79
--- /dev/null
+++ b/src/qirqsim/qsimTupleRuntime.cc
@@ -0,0 +1,123 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirqsim/qsimTupleRuntime.cc
+//---------------------------------------------------------------------------//
+#include "qsimTupleRuntime.hh"
+
+#include "qiree/Assert.hh"
+
+namespace qiree
+{
+//---------------------------------------------------------------------------//
+/*!
+ * Initialize the execution environment, resetting qubits.
+ */
+void qsimTupleRuntime::initialize(OptionalCString env)
+{
+    // Only echo the argument when one was actually supplied
+    if (!env)
+        return;
+    output_ << "Argument to initialize: " << env << std::endl;
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Execute circuit and mark the following N results as being part of an array
+ * named tag
+ */
+void qsimTupleRuntime::array_record_output(size_type s, OptionalCString tag)
+{
+    execute_if_needed();
+    start_tracking(GroupingType::array, tag ? tag : "<null>", s);  // a null tag must not reach std::string
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Execute circuit and mark the following N results as being part of a tuple
+ * named tag
+ */
+void qsimTupleRuntime::tuple_record_output(size_type s, OptionalCString tag)
+{
+    execute_if_needed();
+    start_tracking(GroupingType::tuple, tag ? tag : "<null>", s);  // a null tag must not reach std::string
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Execute circuit and report a single measurement result
+ */
+void qsimTupleRuntime::result_record_output(Result r, OptionalCString tag)
+{
+    execute_if_needed();
+    // Translate the result id to its qubit and add it to the open grouping (`tag` is not used)
+    push_result(sim_.result_to_qubit(r));
+}
+
+//---------------------------------------------------------------------------//
+// PRIVATE FUNCTIONS
+//---------------------------------------------------------------------------//
+
+void qsimTupleRuntime::execute_if_needed()
+{
+    /* Currently a no-op. Disabled draft: run the simulator and, if requested, dump its buffer:
+    if (sim_.execute_if_needed() && print_accelbuf_)
+    {
+        sim_.print_accelbuf();
+    }
+    */
+}
+
+void qsimTupleRuntime::start_tracking(GroupingType type,
+                                      std::string tag,
+                                      size_type num_results)
+{
+    QIREE_EXPECT(!valid_);  // the previous grouping must already be closed
+    type_ = type;
+    tag_ = tag;
+    num_results_ = num_results;
+    qubits_.clear();  // forget the qubits of any earlier grouping
+    valid_ = true;
+
+    // A grouping with no results is complete at once: print just its header
+    if (num_results_ == 0)
+    {
+        print_header(0);
+        valid_ = false;
+    }
+}
+
+void qsimTupleRuntime::push_result(Qubit q)
+{
+    // A grouping must be open and must still have room for this result
+    QIREE_EXPECT(valid_);
+    QIREE_EXPECT(qubits_.size() < num_results_);
+    qubits_.push_back(q);
+    bool const is_complete = (qubits_.size() == num_results_);
+    if (is_complete)
+        finish_tuple();  // that was the last expected result
+}
+
+void qsimTupleRuntime::print_header(size_type num_distinct)
+{
+    // Format: "<tuple|array> <tag> length <qubits collected> distinct results <n>"
+    output_ << get_name() << " " << tag_ << " length " << qubits_.size();
+    output_ << " distinct results " << num_distinct << std::endl;
+}
+
+void qsimTupleRuntime::finish_tuple()
+{
+    // TODO: replace this placeholder with actual counts for qubits_ (e.g. a
+    // get_marginal_counts call); for now "0" and "1" are reported with count 0.
+    std::map<std::string, int> const counts = {{"0", 0}, {"1", 0}};
+    print_header(counts.size());
+
+    std::string const prefix = get_name() + " " + tag_ + " result ";
+    for (auto const& entry : counts)
+        output_ << prefix << entry.first << " count " << entry.second << std::endl;
+
+    valid_ = false;  // the grouping is closed
+}
+}  // namespace qiree
diff --git a/src/qirqsim/qsimTupleRuntime.hh b/src/qirqsim/qsimTupleRuntime.hh
new file mode 100644
index 0000000..fa153f4
--- /dev/null
+++ b/src/qirqsim/qsimTupleRuntime.hh
@@ -0,0 +1,93 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirqsim/qsimTupleRuntime.hh
+//---------------------------------------------------------------------------//
+#pragma once
+
+#include "qsimQuantum.hh"
+
+namespace qiree
+{
+
+/*!
+ * Print per-tuple (or per-array) measurement statistics. (Compare with \ref
+ * qsimDefaultRuntime.)
+ *
+ * Example:
+ * \code
+ * tuple ret length 2 distinct results 2
+ * tuple ret result 00 count 512
+ * tuple ret result 11 count 512
+ * \endcode
+ */
+
+class qsimTupleRuntime final : virtual public RuntimeInterface
+{
+  public:
+    /*!
+     * Construct from an output stream and a simulator, both held by
+     * reference. The \c print_accelbuf flag is meant to control whether the
+     * simulator buffer is dumped after execution.
+     */
+    qsimTupleRuntime(std::ostream& output,
+                     qsimQuantum& sim,
+                     bool print_accelbuf = true)
+        : output_(output)
+        , sim_(sim)
+        , print_accelbuf_(print_accelbuf)
+        , valid_(false)
+    {
+    }
+
+    //!@{
+    //! \name Runtime interface
+    //! Initialize the execution environment (echoes the optional argument)
+    void initialize(OptionalCString env) override;
+
+    //! Open an array grouping: the following N results belong to the array
+    //! named tag
+    void array_record_output(size_type, OptionalCString tag) final;
+
+    //! Open a tuple grouping: the following N results belong to the tuple
+    //! named tag
+    void tuple_record_output(size_type, OptionalCString) final;
+
+    //! Add one result to the grouping that is currently open
+    void result_record_output(Result result, OptionalCString tag) final;
+    //!@}
+
+  private:
+    enum class GroupingType
+    {
+        tuple,
+        array,
+    };
+
+    std::ostream& output_;  // destination of all printed lines
+    qsimQuantum& sim_;  // simulator queried for result-to-qubit mapping
+    bool const print_accelbuf_;
+    bool valid_;  // true while a grouping is open
+    GroupingType type_;  // kind of the current grouping
+    std::string tag_;  // label of the current grouping
+    size_type num_results_;  // results expected in the current grouping
+    std::vector<Qubit> qubits_;  // qubits collected so far
+
+    void execute_if_needed();
+    void
+    start_tracking(GroupingType type, std::string tag, size_type num_results);
+    void push_result(Qubit q);
+    void print_header(size_type num_distinct);
+    void finish_tuple();
+
+    inline std::string get_name()
+    {
+        if (type_ == GroupingType::tuple) { return "tuple"; }
+        if (type_ == GroupingType::array) { return "array"; }
+        return "grouping";
+    }
+};
+
+}  // namespace qiree
diff --git a/tpls/qsim/bits.h b/tpls/qsim/bits.h
new file mode 100644
index 0000000..080c866
--- /dev/null
+++ b/tpls/qsim/bits.h
@@ -0,0 +1,106 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BITS_H_
+#define BITS_H_
+
+#include <vector>
+
+#ifdef __BMI2__
+
+#include <immintrin.h>
+
+#include <cstdint>
+
+namespace qsim {
+namespace bits {
+// BMI2 build: thin wrappers over the PDEP/PEXT intrinsics; `n` is accepted but not used by these overloads.
+inline uint32_t ExpandBits(uint32_t bits, unsigned n, uint32_t mask) {
+  return _pdep_u32(bits, mask);
+}
+// 64-bit overload of ExpandBits (PDEP: deposit the low bits of `bits` at the set positions of `mask`).
+inline uint64_t ExpandBits(uint64_t bits, unsigned n, uint64_t mask) {
+  return _pdep_u64(bits, mask);
+}
+// 32-bit CompressBits (PEXT: gather the bits of `bits` at the set positions of `mask` into the low bits).
+inline uint32_t CompressBits(uint32_t bits, unsigned n, uint32_t mask) {
+  return _pext_u32(bits, mask);
+}
+// 64-bit overload of CompressBits (PEXT).
+inline uint64_t CompressBits(uint64_t bits, unsigned n, uint64_t mask) {
+  return _pext_u64(bits, mask);
+}
+
+}  // namespace bits
+}  // namespace qsim
+
+#else  // __BMI2__
+
+namespace qsim {
+namespace bits {
+
+template <typename Integer>
+inline Integer ExpandBits(Integer bits, unsigned n, Integer mask) {
+  // Scatter the low bits of `bits`, in order, onto the set positions of
+  // `mask` among its lowest n bits.
+  Integer expanded = 0;
+  unsigned src = 0;  // index of the next unread bit of `bits`
+
+  for (unsigned dst = 0; dst < n; ++dst) {
+    if (((mask >> dst) & 1) == 0) continue;  // position not selected by mask
+    expanded |= ((bits >> src) & 1) << dst;
+    ++src;
+  }
+  return expanded;
+}
+
+template <typename Integer>
+inline Integer CompressBits(Integer bits, unsigned n, Integer mask) {
+  // Gather the bits of `bits` found at the set positions of `mask` (among
+  // its lowest n bits) and pack them, in order, into the low bits.
+  Integer packed = 0;
+  unsigned dst = 0;  // index of the next free bit of the result
+
+  for (unsigned src = 0; src < n; ++src) {
+    if (((mask >> src) & 1) == 0) continue;  // position not selected by mask
+    packed |= ((bits >> src) & 1) << dst;
+    ++dst;
+  }
+  return packed;
+}
+
+}  // namespace bits
+}  // namespace qsim
+
+#endif  // __BMI2__
+
+namespace qsim {
+namespace bits {
+
+template <typename Integer>
+inline Integer PermuteBits(
+    Integer bits, unsigned n, const std::vector<unsigned>& perm) {
+  Integer permuted = 0;
+  for (unsigned src = 0; src != n; ++src) {
+    // bit `src` of the input lands at position perm[src] of the output
+    const Integer bit = (bits >> src) & 1;
+    permuted |= bit << perm[src];
+  }
+  return permuted;
+}
+
+}  // namespace bits
+}  // namespace qsim
+
+#endif  // BITS_H_
diff --git a/tpls/qsim/bitstring.h b/tpls/qsim/bitstring.h
new file mode 100644
index 0000000..b95584b
--- /dev/null
+++ b/tpls/qsim/bitstring.h
@@ -0,0 +1,97 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BITSTRING_H_
+#define BITSTRING_H_
+
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace qsim {
+
+using Bitstring = uint64_t;
+
+/**
+ * Reads bitstrings (representing initialized or measured states of qubits)
+ * from a provided stream object and stores them in a vector.
+ * @param num_qubits Number of qubits represented in each bitstring.
+ * @param provider Source of bitstrings; only used for error reporting.
+ * @param fs The stream to read bitstrings from.
+ * @param bitstrings Output vector of bitstrings. On success, this will contain
+ *   all bitstrings read in from 'fs'.
+ * @return True if reading succeeded; false otherwise.
+ */
+template <typename IO, typename Stream>
+bool BitstringsFromStream(unsigned num_qubits, const std::string& provider,
+                          Stream& fs, std::vector<Bitstring>& bitstrings) {
+  bitstrings.clear();
+  bitstrings.reserve(100000);
+
+  // Text input, one bitstring per line; the character at column c of a line
+  // sets bit c of the resulting value.
+  for (;;) {
+    char line[128];
+    fs.getline(line, 128);
+
+    // Stop as soon as the stream reports end of input or any failure.
+    if (!fs) {
+      break;
+    }
+
+    // Consume the leading run of '0'/'1' characters.
+    Bitstring value{0};
+    unsigned len = 0;
+    for (; len < 128 && (line[len] == '0' || line[len] == '1'); ++len) {
+      value |= uint64_t(line[len] - '0') << len;
+    }
+
+    if (len != num_qubits) {
+      IO::errorf("wrong bitstring length in %s: "
+                 "got %u; should be %u.\n", provider.c_str(), len, num_qubits);
+      bitstrings.clear();
+      return false;
+    }
+    bitstrings.push_back(value);
+  }
+  return true;
+}
+
+/**
+ * Reads bitstrings (representing initialized or measured states of qubits)
+ * from the given file and stores them in a vector.
+ * @param num_qubits Number of qubits represented in each bitstring.
+ * @param file The name of the file to read bitstrings from.
+ * @param bitstrings Output vector of bitstrings. On success, this will contain
+ *   all bitstrings read in from 'file'.
+ * @return True if reading succeeded; false otherwise.
+ */
+template <typename IO>
+inline bool BitstringsFromFile(unsigned num_qubits, const std::string& file,
+                               std::vector<Bitstring>& bitstrings) {
+  auto fs = IO::StreamFromFile(file);
+  if (!fs) {
+    // The stream could not be opened; `bitstrings` is left untouched.
+    return false;
+  }
+  // The file name doubles as the provider label used in error messages.
+  const bool ok = BitstringsFromStream<IO>(num_qubits, file, fs, bitstrings);
+  IO::CloseStream(fs);
+  return ok;
+}
+
+}  // namespace qsim
+
+#endif  // BITSTRING_H_
diff --git a/tpls/qsim/channel.h b/tpls/qsim/channel.h
new file mode 100644
index 0000000..372a174
--- /dev/null
+++ b/tpls/qsim/channel.h
@@ -0,0 +1,149 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CHANNEL_H_
+#define CHANNEL_H_
+
+#include <set>
+#include <vector>
+
+#include "gate.h"
+#include "matrix.h"
+
+namespace qsim {
+
+/**
+ * Kraus operator.
+ */
+template <typename Gate>
+struct KrausOperator {
+  using fp_type = typename Gate::fp_type;
+
+  enum Kind {
+    kNormal = 0,
+    kMeasurement = gate::kMeasurement,
+  };
+
+  /**
+   * Kind of the Kraus operator (normal or measurement).
+   */
+  Kind kind;
+
+  /**
+   * True when the Kraus operator is a unitary operator times a constant.
+   */
+  bool unitary;
+
+  /**
+   * Lower bound on the probability of this Kraus operator.
+   */
+  double prob;
+
+  /**
+   * Operations that together represent the Kraus operator. A single
+   * operation is allowed.
+   */
+  std::vector<Gate> ops;
+
+  /**
+   * The product K^\dagger K. May be left empty when unitary is true.
+   */
+  Matrix<fp_type> kd_k;
+
+  /**
+   * Qubits that kd_k acts on. May be left empty when unitary is true.
+   */
+  std::vector<unsigned> qubits;
+
+  /**
+   * Fills kd_k with "K^\dagger K" and qubits with the qubits it acts on.
+   */
+  void CalculateKdKMatrix() {
+    // Without operations there is nothing to compute; members are untouched.
+    if (ops.empty()) return;
+
+    if (ops.size() == 1) {
+      // One operation: its own matrix and qubits are used directly.
+      auto& only = ops[0];
+      kd_k = only.matrix;
+      MatrixDaggerMultiply(only.qubits.size(), only.matrix, kd_k);
+      qubits = only.qubits;
+      return;
+    }
+
+    // Several operations: gather every qubit they touch, in ascending order.
+    std::set<unsigned> touched;
+
+    for (const auto& op : ops) {
+      touched.insert(op.qubits.begin(), op.qubits.end());
+    }
+
+    unsigned num_qubits = touched.size();
+    qubits.assign(touched.begin(), touched.end());
+
+    // kd_k starts from MatrixIdentity; each operation is multiplied into it.
+    MatrixIdentity(unsigned{1} << num_qubits, kd_k);
+
+    for (const auto& op : ops) {
+      if (op.qubits.size() == num_qubits) {
+        MatrixMultiply(num_qubits, op.matrix, kd_k);
+        continue;
+      }
+
+      // Bit i of the mask is set when qubits[i] is one of this op's qubits.
+      unsigned mask = 0;
+
+      for (auto q : op.qubits) {
+        unsigned i = 0;
+        while (i < num_qubits && q != qubits[i]) ++i;
+        if (i < num_qubits) mask |= unsigned{1} << i;
+      }
+
+      MatrixMultiply(mask, op.qubits.size(), op.matrix, num_qubits, kd_k);
+    }
+
+    // Dagger-multiply a copy of the accumulated product into kd_k itself.
+    auto k = kd_k;
+    MatrixDaggerMultiply(num_qubits, k, kd_k);
+  }
+};
+
+/**
+ * Quantum channel: a sequence of Kraus operators over the given gate type.
+ */
+template <typename Gate>
+using Channel = std::vector<KrausOperator<Gate>>;
+
+/**
+ * Makes a channel from the gate.
+ * @param time The time to place the channel at.
+ * @param gate The input gate.
+ * @return The output channel.
+ */
+template <typename Gate>
+Channel<Gate> MakeChannelFromGate(unsigned time, const Gate& gate) {
+  using Kraus = KrausOperator<Gate>;
+  // Measurement gates keep their kind; every other gate is a normal operator.
+  typename Kraus::Kind kind = Kraus::kNormal;
+  if (gate.kind == gate::kMeasurement) kind = Kraus::kMeasurement;
+
+  // One Kraus operator, flagged unitary with probability 1, wrapping the gate.
+  Channel<Gate> wrapped = {{kind, true, 1, {gate}}};
+  wrapped[0].ops[0].time = time;  // Only the stored copy is re-timed.
+  return wrapped;
+}
+
+}  // namespace qsim
+
+#endif  // CHANNEL_H_
diff --git a/tpls/qsim/channels_cirq.h b/tpls/qsim/channels_cirq.h
new file mode 100644
index 0000000..69f1df9
--- /dev/null
+++ b/tpls/qsim/channels_cirq.h
@@ -0,0 +1,471 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CHANNELS_CIRQ_H_
+#define CHANNELS_CIRQ_H_
+
+#include <cmath>
+#include <cstdint>
+#include <vector>
+
+#include "channel.h"
+#include "gates_cirq.h"
+
+namespace qsim {
+
+namespace Cirq {
+
+template <typename fp_type>
+using Channel = qsim::Channel<GateCirq<fp_type>>;  // Channel of Cirq gates.
+
+/**
+ * Asymmetric depolarizing channel factory.
+ */
+template <typename fp_type>
+struct AsymmetricDepolarizingChannel {
+  static constexpr char name[] = "asymmetric_depolarize";
+
+  AsymmetricDepolarizingChannel(double p_x, double p_y, double p_z)
+      : p_x(p_x), p_y(p_y), p_z(p_z) {}
+
+  // Single-qubit channel on q: I, X, Y, Z with probabilities p1, p_x, p_y, p_z.
+  static Channel<fp_type> Create(unsigned time, unsigned q,
+                                 double p_x, double p_y, double p_z) {
+    double p1 = 1 - p_x - p_y - p_z;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+
+    return {{normal, 1, p1, {}},
+            {normal, 1, p_x, {X<fp_type>::Create(time, q)}},
+            {normal, 1, p_y, {Y<fp_type>::Create(time, q)}},
+            {normal, 1, p_z, {Z<fp_type>::Create(time, q)}}};
+  }
+
+  // Multi-qubit channel: one Kraus operator per assignment of I, X, Y or Z to
+  // each listed qubit, weighted by the product of the per-qubit probabilities.
+  static Channel<fp_type> Create(unsigned time,
+                                 const std::vector<unsigned>& qubits,
+                                 double p_x, double p_y, double p_z) {
+    double p1 = 1 - p_x - p_y - p_z;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+
+    uint64_t size = uint64_t{1} << (2 * qubits.size());
+
+    Channel<fp_type> channel;
+    channel.reserve(size);
+
+    for (uint64_t i = 0; i < size; ++i) {
+      channel.push_back({normal, 1, 0, {}});
+      auto& kop = channel.back();
+      kop.ops.reserve(qubits.size());
+      double prob = 1;
+
+      // Bits 2k and 2k+1 of i select the Pauli applied to qubits[k].
+      for (unsigned k = 0; k < qubits.size(); ++k) {
+        unsigned q = qubits[k];  // The qubit itself, not its list position.
+        unsigned pauli_index = (i >> (2 * k)) & 3;
+
+        switch (pauli_index) {
+        case 0:
+          prob *= p1;
+          break;
+        case 1:
+          prob *= p_x;
+          kop.ops.push_back(X<fp_type>::Create(time, q));
+          break;
+        case 2:
+          prob *= p_y;
+          kop.ops.push_back(Y<fp_type>::Create(time, q));
+          break;
+        case 3:
+          prob *= p_z;
+          kop.ops.push_back(Z<fp_type>::Create(time, q));
+          break;
+        }
+      }
+      kop.prob = prob;
+    }
+
+    return channel;
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, p_x, p_y, p_z);
+  }
+
+  Channel<fp_type> Create(
+      unsigned time, const std::vector<unsigned>& qubits) const {
+    return Create(time, qubits, p_x, p_y, p_z);
+  }
+
+  double p_x = 0;
+  double p_y = 0;
+  double p_z = 0;
+};
+
+/**
+ * Creates a factory object for asymmetric depolarizing channels.
+ */
+template <typename fp_type>
+inline AsymmetricDepolarizingChannel<fp_type> asymmetric_depolarize(
+    double p_x, double p_y, double p_z) {
+  return {p_x, p_y, p_z};
+}
+
+/**
+ * Depolarizing channel factory.
+ */
+template <typename fp_type>
+struct DepolarizingChannel {
+  static constexpr char name[] = "depolarize";
+
+  DepolarizingChannel(double p) : p(p) {}
+
+  // Single-qubit channel on q: I with probability 1 - p; X, Y, Z with p / 3.
+  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
+    double p1 = 1 - p;
+    double p2 = p / 3;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+
+    return {{normal, 1, p1, {}},
+            {normal, 1, p2, {X<fp_type>::Create(time, q)}},
+            {normal, 1, p2, {Y<fp_type>::Create(time, q)}},
+            {normal, 1, p2, {Z<fp_type>::Create(time, q)}}};
+  }
+
+  // Multi-qubit channel: one Kraus operator per assignment of I, X, Y or Z to
+  // each listed qubit, weighted by the product of the per-qubit probabilities.
+  static Channel<fp_type> Create(
+      unsigned time, const std::vector<unsigned>& qubits, double p) {
+    double p1 = 1 - p;
+    double p2 = p / 3;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+
+    uint64_t size = uint64_t{1} << (2 * qubits.size());
+
+    Channel<fp_type> channel;
+    channel.reserve(size);
+
+    for (uint64_t i = 0; i < size; ++i) {
+      channel.push_back({normal, 1, 0, {}});
+      auto& kop = channel.back();
+      kop.ops.reserve(qubits.size());
+      double prob = 1;
+
+      // Bits 2k and 2k+1 of i select the Pauli applied to qubits[k].
+      for (unsigned k = 0; k < qubits.size(); ++k) {
+        unsigned q = qubits[k];  // The qubit itself, not its list position.
+        unsigned pauli_index = (i >> (2 * k)) & 3;
+
+        switch (pauli_index) {
+        case 0:
+          prob *= p1;
+          break;
+        case 1:
+          prob *= p2;
+          kop.ops.push_back(X<fp_type>::Create(time, q));
+          break;
+        case 2:
+          prob *= p2;
+          kop.ops.push_back(Y<fp_type>::Create(time, q));
+          break;
+        case 3:
+          prob *= p2;
+          kop.ops.push_back(Z<fp_type>::Create(time, q));
+          break;
+        }
+      }
+      kop.prob = prob;
+    }
+
+    return channel;
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, p);
+  }
+
+  Channel<fp_type> Create(
+      unsigned time, const std::vector<unsigned>& qubits) const {
+    return Create(time, qubits, p);
+  }
+
+  double p = 0;
+};
+
+/**
+ * Creates a factory object for depolarizing channels with parameter p.
+ */
+template <typename fp_type>
+inline DepolarizingChannel<fp_type> depolarize(double p) {
+  return {p};
+}
+
+/**
+ * Generalized amplitude damping channel factory.
+ */
+template <typename fp_type>
+struct GeneralizedAmplitudeDampingChannel {
+  static constexpr char name[] = "generalized_amplitude_damp";
+
+  GeneralizedAmplitudeDampingChannel(double p, double gamma)
+      : p(p), gamma(gamma) {}
+
+  static Channel<fp_type> Create(
+      unsigned time, unsigned q, double p, double gamma) {
+    using M = Cirq::MatrixGate1<fp_type>;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+    // Probability lower bounds used for the four Kraus operators below.
+    double bound_a = p * (1 - gamma);
+    double bound_b = (1 - p) * (1 - gamma);
+    double bound_0 = 0;
+    // Matrix entries, converted to fp_type before being squared below.
+    fp_type root_p = std::sqrt(p);
+    fp_type diag_a = std::sqrt(p * (1 - gamma));
+    fp_type jump_a = std::sqrt(p * gamma);
+    fp_type root_q = std::sqrt(1 - p);
+    fp_type diag_b = std::sqrt((1 - p) * (1 - gamma));
+    fp_type jump_b = std::sqrt((1 - p) * gamma);
+
+    return {{normal, 0, bound_a,
+             {M::Create(time, q, {root_p, 0, 0, 0, 0, 0, diag_a, 0})},
+             {root_p * root_p, 0, 0, 0, 0, 0, diag_a * diag_a, 0}, {q},
+            },
+            {normal, 0, bound_b,
+             {M::Create(time, q, {diag_b, 0, 0, 0, 0, 0, root_q, 0})},
+             {diag_b * diag_b, 0, 0, 0, 0, 0, root_q * root_q, 0}, {q},
+            },
+            {normal, 0, bound_0,
+             {M::Create(time, q, {0, 0, jump_a, 0, 0, 0, 0, 0})},
+             {0, 0, 0, 0, 0, 0, jump_a * jump_a, 0}, {q},
+            },
+            {normal, 0, bound_0,
+             {M::Create(time, q, {0, 0, 0, 0, jump_b, 0, 0, 0})},
+             {jump_b * jump_b, 0, 0, 0, 0, 0, 0, 0}, {q},
+            },
+           };
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, p, gamma);
+  }
+
+  double p = 1;
+  double gamma = 0;
+};
+
+/**
+ * Creates a factory object for generalized amplitude damping channels.
+ */
+template <typename fp_type>
+inline GeneralizedAmplitudeDampingChannel<fp_type> generalized_amplitude_damp(
+    double p, double gamma) {
+  return {p, gamma};
+}
+
+/**
+ * Amplitude damping channel factory.
+ */
+template <typename fp_type>
+struct AmplitudeDampingChannel {
+  static constexpr char name[] = "amplitude_damp";
+
+  AmplitudeDampingChannel(double gamma) : gamma(gamma) {}
+
+  static Channel<fp_type> Create(unsigned time, unsigned q, double gamma) {
+    using M = Cirq::MatrixGate1<fp_type>;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+
+    // Probability lower bounds of the two Kraus operators.
+    double keep_bound = 1 - gamma;
+    double decay_bound = 0;
+    fp_type root_keep = std::sqrt(keep_bound);
+    fp_type root_gamma = std::sqrt(gamma);
+
+    return {{normal, 0, keep_bound,
+             {M::Create(time, q, {1, 0, 0, 0, 0, 0, root_keep, 0})},
+             {1, 0, 0, 0, 0, 0, root_keep * root_keep, 0}, {q},
+            },
+            {normal, 0, decay_bound,
+             {M::Create(time, q, {0, 0, root_gamma, 0, 0, 0, 0, 0})},
+             {0, 0, 0, 0, 0, 0, root_gamma * root_gamma, 0}, {q},
+            },
+           };
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, gamma);
+  }
+
+  double gamma = 0;
+};
+
+/**
+ * Creates a factory object for amplitude damping channels.
+ */
+template <typename fp_type>
+inline AmplitudeDampingChannel<fp_type> amplitude_damp(double gamma) {
+  return {gamma};
+}
+
+/**
+ *  Phase damping channel factory.
+ */
+template <typename fp_type>
+struct PhaseDampingChannel {
+  // NOTE(review): "phase_dump" may be a typo of "phase_damp" - confirm.
+  static constexpr char name[] = "phase_dump";
+
+  PhaseDampingChannel(double gamma) : gamma(gamma) {}
+
+  static Channel<fp_type> Create(unsigned time, unsigned q, double gamma) {
+    using M = Cirq::MatrixGate1<fp_type>;
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+    // Probability lower bounds of the two Kraus operators.
+    double keep_bound = 1 - gamma;
+    double loss_bound = 0;
+    fp_type root_keep = std::sqrt(keep_bound);
+    fp_type root_gamma = std::sqrt(gamma);
+
+    return {{normal, 0, keep_bound,
+             {M::Create(time, q, {1, 0, 0, 0, 0, 0, root_keep, 0})},
+             {1, 0, 0, 0, 0, 0, root_keep * root_keep, 0}, {q},
+            },
+            {normal, 0, loss_bound,
+             {M::Create(time, q, {0, 0, 0, 0, 0, 0, root_gamma, 0})},
+             {0, 0, 0, 0, 0, 0, root_gamma * root_gamma, 0}, {q},
+            },
+           };
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, gamma);
+  }
+
+  double gamma = 0;
+};
+
+/**
+ * Creates a factory object for phase damping channels.
+ */
+template <typename fp_type>
+inline PhaseDampingChannel<fp_type> phase_damp(double gamma) {
+  return {gamma};
+}
+
+/**
+ *  Reset channel factory.
+ */
+template <typename fp_type>
+struct ResetChannel {
+  static constexpr char name[] = "reset";
+
+  static Channel<fp_type> Create(unsigned time, unsigned q) {
+    using Kraus = KrausOperator<GateCirq<fp_type>>;
+    using M = Cirq::MatrixGate1<fp_type>;
+    // Both operators are non-unitary with a probability lower bound of zero.
+    return {{Kraus::kNormal, 0, 0,
+             {M::Create(time, q, {1, 0, 0, 0, 0, 0, 0, 0})},
+             {1, 0, 0, 0, 0, 0, 0, 0}, {q},
+            },
+            {Kraus::kNormal, 0, 0,
+             {M::Create(time, q, {0, 0, 1, 0, 0, 0, 0, 0})},
+             {0, 0, 0, 0, 0, 0, 1, 0}, {q},
+            },
+           };
+  }
+};
+
+/**
+ * Creates a factory object for reset channels.
+ */
+template <typename fp_type>
+inline ResetChannel<fp_type> reset() {
+  return {};
+}
+
+/**
+ *  Phase flip channel factory.
+ */
+template <typename fp_type>
+struct PhaseFlipChannel {
+  static constexpr char name[] = "phase_flip";
+
+  PhaseFlipChannel(double p) : p(p) {}
+
+  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
+    using Kraus = KrausOperator<GateCirq<fp_type>>;
+    // No operation with probability 1 - p, a Z gate on q with probability p.
+    const double p_identity = 1 - p;
+    const double p_flip = p;
+
+    return {{Kraus::kNormal, 1, p_identity, {}},
+            {Kraus::kNormal, 1, p_flip, {Z<fp_type>::Create(time, q)}}
+           };
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, p);
+  }
+
+  double p = 0;
+};
+
+/**
+ * Creates a factory object for phase flip channels with probability p.
+ */
+template <typename fp_type>
+inline PhaseFlipChannel<fp_type> phase_flip(double p) {
+  return {p};
+}
+
+/**
+ *  Bit flip channel factory.
+ */
+template <typename fp_type>
+struct BitFlipChannel {
+  static constexpr char name[] = "bit_flip";
+
+  BitFlipChannel(double p) : p(p) {}
+
+  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
+    using Kraus = KrausOperator<GateCirq<fp_type>>;
+    // No operation with probability 1 - p, an X gate on q with probability p.
+    const double p_identity = 1 - p;
+    const double p_flip = p;
+
+    return {{Kraus::kNormal, 1, p_identity, {}},
+            {Kraus::kNormal, 1, p_flip, {X<fp_type>::Create(time, q)}}
+           };
+  }
+
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, p);
+  }
+
+  double p = 0;
+};
+
+/**
+ * Creates a factory object for bit flip channels with probability p.
+ */
+template <typename fp_type>
+inline BitFlipChannel<fp_type> bit_flip(double p) {
+  return {p};
+}
+
+}  // namespace Cirq
+
+}  // namespace qsim
+
+#endif  // CHANNELS_CIRQ_H_
diff --git a/tpls/qsim/channels_qsim.h b/tpls/qsim/channels_qsim.h
new file mode 100644
index 0000000..5c07bcc
--- /dev/null
+++ b/tpls/qsim/channels_qsim.h
@@ -0,0 +1,117 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CHANNELS_QSIM_H_
+#define CHANNELS_QSIM_H_
+
+#include <cmath>
+#include <cstdint>
+#include <vector>
+
+#include "channel.h"
+#include "gates_qsim.h"
+
+namespace qsim {
+
+/**
+ * Amplitude damping channel factory.
+ */
+template <typename fp_type>
+struct AmplitudeDampingChannel {
+  AmplitudeDampingChannel(double gamma) : gamma(gamma) {}
+
+  static Channel<GateQSim<fp_type>> Create(
+      unsigned time, unsigned q, double gamma) {
+    using M = GateMatrix1<fp_type>;
+    auto normal = KrausOperator<GateQSim<fp_type>>::kNormal;
+
+    // Probability lower bounds of the two Kraus operators.
+    double keep_bound = 1 - gamma;
+    double decay_bound = 0;
+    fp_type root_keep = std::sqrt(keep_bound);
+    fp_type root_gamma = std::sqrt(gamma);
+
+    return {{normal, 0, keep_bound,
+             {M::Create(time, q, {1, 0, 0, 0, 0, 0, root_keep, 0})},
+             {1, 0, 0, 0, 0, 0, root_keep * root_keep, 0}, {q},
+            },
+            {normal, 0, decay_bound,
+             {M::Create(time, q, {0, 0, root_gamma, 0, 0, 0, 0, 0})},
+             {0, 0, 0, 0, 0, 0, root_gamma * root_gamma, 0}, {q},
+            },
+           };
+  }
+
+  Channel<GateQSim<fp_type>> Create(unsigned time, unsigned q) const {
+    return Create(time, q, gamma);
+  }
+
+  double gamma = 0;
+};
+
+/**
+ * Creates a factory object for amplitude damping channels.
+ */
+template <typename fp_type>
+inline AmplitudeDampingChannel<fp_type> amplitude_damp(double gamma) {
+  return {gamma};
+}
+
+/**
+ *  Phase damping channel factory.
+ */
+template <typename fp_type>
+struct PhaseDampingChannel {
+  PhaseDampingChannel(double gamma) : gamma(gamma) {}
+
+  static Channel<GateQSim<fp_type>> Create(
+      unsigned time, unsigned q, double gamma) {
+    using M = GateMatrix1<fp_type>;
+    auto normal = KrausOperator<GateQSim<fp_type>>::kNormal;
+
+    // Probability lower bounds of the two Kraus operators.
+    double keep_bound = 1 - gamma;
+    double loss_bound = 0;
+    fp_type root_keep = std::sqrt(keep_bound);
+    fp_type root_gamma = std::sqrt(gamma);
+
+    return {{normal, 0, keep_bound,
+             {M::Create(time, q, {1, 0, 0, 0, 0, 0, root_keep, 0})},
+             {1, 0, 0, 0, 0, 0, root_keep * root_keep, 0}, {q},
+            },
+            {normal, 0, loss_bound,
+             {M::Create(time, q, {0, 0, 0, 0, 0, 0, root_gamma, 0})},
+             {0, 0, 0, 0, 0, 0, root_gamma * root_gamma, 0}, {q},
+            },
+           };
+  }
+
+  Channel<GateQSim<fp_type>> Create(unsigned time, unsigned q) const {
+    return Create(time, q, gamma);
+  }
+
+  double gamma = 0;
+};
+
+/**
+ * Creates a factory object for phase damping channels.
+ */
+template <typename fp_type>
+inline PhaseDampingChannel<fp_type> phase_damp(double gamma) {
+  return {gamma};
+}
+
+}  // namespace qsim
+
+#endif  // CHANNELS_QSIM_H_
diff --git a/tpls/qsim/circuit.h b/tpls/qsim/circuit.h
new file mode 100644
index 0000000..59018ee
--- /dev/null
+++ b/tpls/qsim/circuit.h
@@ -0,0 +1,36 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CIRCUIT_H_
+#define CIRCUIT_H_
+
+#include <vector>
+
+namespace qsim {
+
+/**
+ * A collection of gates. This object is consumed by `QSim[h]Runner.Run()`.
+ */
+template <typename Gate>
+struct Circuit {
+  unsigned num_qubits;  // Number of qubits in the circuit.
+  /**
+   * The gates to be run, in order. Gate times should be ordered.
+   */
+  std::vector<Gate> gates;
+};
+
+}  // namespace qsim
+
+#endif  // CIRCUIT_H_
diff --git a/tpls/qsim/circuit_noisy.h b/tpls/qsim/circuit_noisy.h
new file mode 100644
index 0000000..40a228d
--- /dev/null
+++ b/tpls/qsim/circuit_noisy.h
@@ -0,0 +1,108 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CIRCUIT_NOISY_H_
+#define CIRCUIT_NOISY_H_
+
+#include <vector>
+
+#include "circuit.h"
+#include "channel.h"
+
+namespace qsim {
+
+/**
+ * Noisy circuit: a qubit count together with a list of channels.
+ */
+template <typename Gate>
+struct NoisyCircuit {
+  unsigned num_qubits;  // Number of qubits in the circuit.
+  std::vector<Channel<Gate>> channels;  // Channels of the circuit.
+};
+// Const iterator over the channel list of a NoisyCircuit.
+template <typename Gate>
+using ncircuit_iterator = typename std::vector<Channel<Gate>>::const_iterator;
+
+/**
+ * Builds a noisy circuit from a clean one.
+ * After every gate, one channel is appended for each of its qubits and each of
+ * its control qubits. Roughly equivalent to cirq.Circuit.with_noise.
+ * @param num_qubits The number of circuit qubits.
+ * @param gbeg, gend The iterator range [gbeg, gend) of circuit gates.
+ * @param channel_factory A channel factory to construct channels.
+ * @return The output noisy circuit.
+ */
+template <typename Gate, typename ChannelFactory>
+inline NoisyCircuit<Gate> MakeNoisy(
+    unsigned num_qubits,
+    typename std::vector<Gate>::const_iterator gbeg,
+    typename std::vector<Gate>::const_iterator gend,
+    const ChannelFactory& channel_factory) {
+  NoisyCircuit<Gate> noisy;
+  noisy.num_qubits = num_qubits;
+  noisy.channels.reserve(4 * std::size_t(gend - gbeg));
+
+  auto& out = noisy.channels;
+  for (; gbeg != gend; ++gbeg) {
+    const auto& gate = *gbeg;
+    // Gates go to even times and their noise to the following odd time.
+    const auto noise_time = 2 * gate.time + 1;
+    out.push_back(MakeChannelFromGate(2 * gate.time, gate));
+
+    for (auto q : gate.qubits) {
+      out.push_back(channel_factory.Create(noise_time, q));
+    }
+    for (auto q : gate.controlled_by) {
+      out.push_back(channel_factory.Create(noise_time, q));
+    }
+  }
+
+  return noisy;
+}
+
+/**
+ * Builds a noisy circuit from the gates of a clean circuit.
+ * Channels are added after each qubit of each gate of the clean circuit.
+ * Roughly equivalent to cirq.Circuit.with_noise.
+ * @param num_qubits The number of circuit qubits.
+ * @param gates The circuit gates.
+ * @param channel_factory A channel factory to construct channels.
+ * @return The output noisy circuit.
+ */
+template <typename Gate, typename ChannelFactory>
+inline NoisyCircuit<Gate> MakeNoisy(unsigned num_qubits,
+                                    const std::vector<Gate>& gates,
+                                    const ChannelFactory& channel_factory) {
+  return MakeNoisy<Gate>(num_qubits, gates.cbegin(), gates.cend(),
+                         channel_factory);
+}
+
+/**
+ * Builds a noisy circuit from a clean Circuit object.
+ * Channels are added after each qubit of each gate of the clean circuit.
+ * Roughly equivalent to cirq.Circuit.with_noise.
+ * @param circuit The input circuit.
+ * @param channel_factory A channel factory to construct channels.
+ * @return The output noisy circuit.
+ */
+template <typename Gate, typename ChannelFactory>
+inline NoisyCircuit<Gate> MakeNoisy(const Circuit<Gate>& circuit,
+                                    const ChannelFactory& channel_factory) {
+  const auto& gates = circuit.gates;
+  return MakeNoisy<Gate>(circuit.num_qubits, gates, channel_factory);
+}
+
+}  // namespace qsim
+
+#endif  // CIRCUIT_NOISY_H_
diff --git a/tpls/qsim/circuit_qsim_parser.h b/tpls/qsim/circuit_qsim_parser.h
new file mode 100644
index 0000000..de7bd89
--- /dev/null
+++ b/tpls/qsim/circuit_qsim_parser.h
@@ -0,0 +1,442 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CIRCUIT_QSIM_PARSER_H_
+#define CIRCUIT_QSIM_PARSER_H_
+
+#include <algorithm>
+#include <cctype>
+#include <string>
+#include <sstream>
+#include <vector>
+
+#include "circuit.h"
+#include "gates_qsim.h"
+
+namespace qsim {
+
+/**
+ * Parser for the (deprecated) qsim <a href="https://github.com/quantumlib/qsim/blob/master/docs/input_format.md">file input format</a>.
+ * The primary supported interface for designing circuits to simulate with qsim
+ * is <a href="https://github.com/quantumlib/Cirq">Cirq</a>, which relies on
+ * the Python-based qsimcirq interface. For C++ applications, Cirq gates can be
+ * explicitly constructed in code.
+ */
+template <typename IO>
+class CircuitQsimParser final {
+ public:
+  /**
+   * Parses the given input stream into a Circuit object, following the rules
+   * defined in "docs/input_format.md".
+   * @param maxtime Maximum gate "time" to read operations for (inclusive).
+   * @param provider Circuit source; only used for error reporting.
+   * @param fs The stream to read the circuit from.
+   * @param circuit Output circuit object. If parsing is successful, this will
+   *   contain the circuit defined in 'fs'.
+   * @return True if parsing succeeds; false otherwise.
+   */
+  template <typename Stream, typename fp_type>
+  static bool FromStream(unsigned maxtime, const std::string& provider,
+                         Stream& fs, Circuit<GateQSim<fp_type>>& circuit) {
+    circuit.num_qubits = 0;
+
+    circuit.gates.resize(0);
+    circuit.gates.reserve(1024);
+
+    unsigned k = 0;  // Line number within the input, used in error messages.
+
+    std::string line;
+    line.reserve(128);
+
+    unsigned time;
+    std::string gate_name;
+    gate_name.reserve(16);
+
+    unsigned max_time = 0;  // Largest gate time accepted so far.
+    unsigned prev_mea_time = 0;  // Time of the latest measurement gate.
+
+    std::vector<unsigned> last_times;  // Per qubit: time of its latest gate.
+
+    while (std::getline(fs, line)) {
+      ++k;
+
+      if (line.size() == 0 || line[0] == '#') continue;  // Empty or '#' line.
+
+      std::stringstream ss(line);
+
+      if (circuit.num_qubits == 0) {  // First data line holds the qubit count.
+        ss >> circuit.num_qubits;
+        if (circuit.num_qubits == 0) {
+          IO::errorf("invalid number of qubits in %s in line %u.\n",
+                     provider.c_str(), k);
+          return false;
+        }
+
+        last_times.resize(circuit.num_qubits, unsigned(-1));
+
+        continue;
+      }
+
+      ss >> time >> gate_name;
+
+      if (!ss) {
+        InvalidGateError(provider, k);
+        return false;
+      }
+
+      if (time > maxtime) {  // Stop at the first gate later than maxtime.
+        break;
+      }
+
+      if (gate_name == "c") {
+        if (!ParseControlledGate<fp_type>(ss, time,
+                                          circuit.num_qubits, circuit.gates)) {
+          InvalidGateError(provider, k);
+          return false;
+        }
+      } else if (!ParseGate<fp_type>(ss, time, circuit.num_qubits,
+                                     gate_name, circuit.gates)) {
+        InvalidGateError(provider, k);
+        return false;
+      }
+
+      const auto& gate = circuit.gates.back();
+
+      if (time < prev_mea_time
+          || (gate.kind == gate::kMeasurement && time < max_time)) {
+        IO::errorf("gate crosses the time boundary set by measurement "
+                   "gates in line %u in %s.\n", k, provider.c_str());
+        return false;
+      }
+
+      if (gate.kind == gate::kMeasurement) {
+        prev_mea_time = time;
+      }
+
+      if (GateIsOutOfOrder(time, gate.qubits, last_times)
+          || GateIsOutOfOrder(time, gate.controlled_by, last_times)) {
+        IO::errorf("gate is out of time order in line %u in %s.\n",
+                   k, provider.c_str());
+        return false;
+      }
+
+      if (time > max_time) {
+        max_time = time;
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Parses the given file into a Circuit object, following the rules defined
+   * in "docs/input_format.md".
+   * @param maxtime Maximum gate "time" to read operations for (inclusive).
+   * @param file The name of the file to read the circuit from.
+   * @param circuit Output circuit object. If parsing is successful, this will
+   *   contain the circuit defined in 'file'.
+   * @return True if parsing succeeds; false otherwise.
+   */
+  template <typename fp_type>
+  static bool FromFile(unsigned maxtime, const std::string& file,
+                       Circuit<GateQSim<fp_type>>& circuit) {
+    auto stream = IO::StreamFromFile(file);
+
+    // Nothing to parse (or close) when the file could not be opened.
+    if (!stream) return false;
+
+    // The file name doubles as the provider label in error messages.
+    const bool parsed = FromStream(maxtime, file, stream, circuit);
+    IO::CloseStream(stream);
+    return parsed;
+  }
+
+ private:
+  static void InvalidGateError(const std::string& provider, unsigned line) {
+    IO::errorf("invalid gate in %s in line %u.\n", provider.c_str(), line);
+  }  // Only reports through IO::errorf; nothing is returned to the caller.
+
+  /**
+   * Accepts a zero-qubit gate only if 'ss' is error-free and fully consumed.
+   * @param ss Input stream containing the gate specification.
+   */
+  static bool ValidateGate(std::stringstream& ss) {
+    return !ss.fail() && ss.peek() == std::char_traits<char>::eof();
+  }
+
+  /**
+   * Checks a single-qubit gate: clean, fully read stream and valid qubit.
+   * @param ss Input stream containing the gate specification.
+   * @param num_qubits Number of qubits, as defined at the start of the file.
+   * @param q0 Index of the affected qubit.
+   */
+  static bool ValidateGate(std::stringstream& ss,
+                           unsigned num_qubits, unsigned q0) {
+    if (ss.fail() || ss.peek() != std::char_traits<char>::eof()) return false;
+    return q0 < num_qubits;
+  }
+
+  /**
+   * Checks a two-qubit gate: clean, fully read stream and two distinct qubits.
+   * @param ss Input stream containing the gate specification.
+   * @param num_qubits Number of qubits, as defined at the start of the file.
+   * @param q0 Index of the first affected qubit.
+   * @param q1 Index of the second affected qubit.
+   */
+  static bool ValidateGate(std::stringstream& ss,
+                           unsigned num_qubits, unsigned q0, unsigned q1) {
+    if (ss.fail() || ss.peek() != std::char_traits<char>::eof()) return false;
+    return q0 < num_qubits && q1 < num_qubits && q0 != q1;
+  }
+
+  /**
+   * Checks a multiqubit gate: error-free stream and valid qubit indices.
+   * @param ss Input stream containing the gate specification.
+   * @param num_qubits Number of qubits, as defined at the start of the file.
+   * @param qubits Indices of affected qubits.
+   */
+  static bool ValidateGate(std::stringstream& ss, unsigned num_qubits,
+                           const std::vector<unsigned>& qubits) {
+    return !ss.fail() && ValidateQubits(num_qubits, qubits);
+  }
+
+  static bool ValidateControlledGate(
+      unsigned num_qubits, const std::vector<unsigned>& qubits,
+      const std::vector<unsigned>& controlled_by) {
+    if (!ValidateQubits(num_qubits, controlled_by)) return false;
+
+    // Merge-style walk over both lists (this assumes each one is sorted in
+    // ascending order): no qubit may be both a target and a control.
+    auto target = qubits.begin();
+    auto control = controlled_by.begin();
+    while (target != qubits.end() && control != controlled_by.end()) {
+      if (*target == *control) return false;
+      if (*target < *control) {
+        ++target;
+      } else {
+        ++control;
+      }
+    }
+    return true;
+  }
+
+  static bool ValidateQubits(unsigned num_qubits,
+                             const std::vector<unsigned>& qubits) {
+    if (qubits.empty()) return false;
+
+    // Every index must be in range and unique. Sortedness is not assumed:
+    // neighbour comparison alone would accept a list such as {0, 1, 0}.
+    std::vector<bool> seen(num_qubits, false);
+    for (auto q : qubits) {
+      if (q >= num_qubits || seen[q]) return false;
+      seen[q] = true;
+    }
+
+    return true;
+  }
+
+  static bool GateIsOutOfOrder(unsigned time,
+                               const std::vector<unsigned>& qubits,
+                               std::vector<unsigned>& last_times) {
+    // Entries equal to unsigned(-1) are exempt from the ordering check.
+    const unsigned unset = unsigned(-1);
+    for (std::size_t k = 0; k < qubits.size(); ++k) {
+      unsigned& previous = last_times[qubits[k]];
+      if (previous != unset && time <= previous) return true;
+      previous = time;
+    }
+
+    return false;
+  }
+
+  template <typename fp_type, typename Stream, typename Gate>
+  static bool ParseGate(Stream& ss, unsigned time, unsigned num_qubits,
+                        const std::string& gate_name,
+                        std::vector<Gate>& gates) {
+    unsigned q0, q1;  // Qubit indices extracted from 'ss'.
+    fp_type phi, theta;  // Gate parameters extracted from 'ss'.
+    // Per gate name: extract the arguments, validate, append to 'gates'.
+    if (gate_name == "p") {
+      ss >> phi;
+      if (!ValidateGate(ss)) return false;
+      gates.push_back(GateGPh<fp_type>::Create(time, phi));
+    } else if (gate_name == "id1") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateId1<fp_type>::Create(time, q0));
+    } else if (gate_name == "h") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateHd<fp_type>::Create(time, q0));
+    } else if (gate_name == "t") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateT<fp_type>::Create(time, q0));
+    } else if (gate_name == "x") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateX<fp_type>::Create(time, q0));
+    } else if (gate_name == "y") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateY<fp_type>::Create(time, q0));
+    } else if (gate_name == "z") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateZ<fp_type>::Create(time, q0));
+    } else if (gate_name == "x_1_2") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateX2<fp_type>::Create(time, q0));
+    } else if (gate_name == "y_1_2") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateY2<fp_type>::Create(time, q0));
+    } else if (gate_name == "rx") {
+      ss >> q0 >> phi;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateRX<fp_type>::Create(time, q0, phi));
+    } else if (gate_name == "ry") {
+      ss >> q0 >> phi;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateRY<fp_type>::Create(time, q0, phi));
+    } else if (gate_name == "rz") {
+      ss >> q0 >> phi;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateRZ<fp_type>::Create(time, q0, phi));
+    } else if (gate_name == "rxy") {
+      ss >> q0 >> theta >> phi;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateRXY<fp_type>::Create(time, q0, theta, phi));
+    } else if (gate_name == "hz_1_2") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateHZ2<fp_type>::Create(time, q0));
+    } else if (gate_name == "s") {
+      ss >> q0;
+      if (!ValidateGate(ss, num_qubits, q0)) return false;
+      gates.push_back(GateS<fp_type>::Create(time, q0));
+    } else if (gate_name == "id2") {
+      ss >> q0 >> q1;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateId2<fp_type>::Create(time, q0, q1));
+    } else if (gate_name == "cz") {
+      ss >> q0 >> q1;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateCZ<fp_type>::Create(time, q0, q1));
+    } else if (gate_name == "cnot" || gate_name == "cx") {
+      ss >> q0 >> q1;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateCNot<fp_type>::Create(time, q0, q1));
+    } else if (gate_name == "sw") {
+      ss >> q0 >> q1;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateSwap<fp_type>::Create(time, q0, q1));
+    } else if (gate_name == "is") {
+      ss >> q0 >> q1;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateIS<fp_type>::Create(time, q0, q1));
+    } else if (gate_name == "fs") {
+      ss >> q0 >> q1 >> theta >> phi;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateFS<fp_type>::Create(time, q0, q1, theta, phi));
+    } else if (gate_name == "cp") {
+      ss >> q0 >> q1 >> phi;
+      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
+      gates.push_back(GateCP<fp_type>::Create(time, q0, q1, phi));
+    } else if (gate_name == "m") {
+      std::vector<unsigned> qubits;
+      qubits.reserve(num_qubits);
+      // Read qubit indices until the stream is exhausted or a read fails.
+      while (ss.good()) {
+        ss >> q0;
+        if (ss) {
+          qubits.push_back(q0);
+        } else {
+          return false;
+        }
+      }
+
+      gates.push_back(gate::Measurement<GateQSim<fp_type>>::Create(
+          time, std::move(qubits)));
+      // NOTE(review): the gate is appended before its qubits are validated.
+      if (!ValidateQubits(num_qubits, gates.back().qubits)) return false;
+    } else {
+      return false;  // Unrecognized gate name.
+    }
+
+    return true;  // Exactly one gate was appended.
+  }
+
+  template <typename fp_type, typename Stream, typename Gate>
+  static bool ParseControlledGate(Stream& ss, unsigned time,
+                                  unsigned num_qubits,
+                                  std::vector<Gate>& gates) {
+    std::vector<unsigned> controlled_by;
+    controlled_by.reserve(64);
+
+    std::string gate_name;
+    gate_name.reserve(16);
+
+    while (1) {  // Each pass consumes one control-qubit index.
+      while (ss.good()) {  // Skip leading blanks.
+        if (!std::isblank(ss.get())) {
+          ss.unget();
+          break;
+        }
+      }
+
+      if (!ss.good()) {  // Stream ended or failed before a gate name.
+        return false;
+      }
+
+      if (!std::isdigit(ss.peek())) {
+        break;  // Not a digit: the gate name begins here.
+      } else {
+        unsigned q;
+        ss >> q;
+        // The index must be followed by a blank and by more input.
+        if (!ss.good() || !std::isblank(ss.get())) {
+          return false;
+        }
+
+        controlled_by.push_back(q);
+      }
+    }
+
+    if (controlled_by.size() == 0) {  // At least one control is required.
+      return false;
+    }
+
+    ss >> gate_name;
+
+    if (!ss.good() || !ParseGate<fp_type>(ss, time,  // Arguments must follow.
+                                          num_qubits, gate_name, gates)) {
+      return false;
+    }
+
+    gates.back().ControlledBy(std::move(controlled_by));  // Attach controls.
+
+    if (!ValidateControlledGate(num_qubits, gates.back().qubits,
+                                gates.back().controlled_by)) {
+      return false;
+    }
+
+    return true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // CIRCUIT_QSIM_PARSER_H_
diff --git a/tpls/qsim/cuda2hip.h b/tpls/qsim/cuda2hip.h
new file mode 100644
index 0000000..da2d074
--- /dev/null
+++ b/tpls/qsim/cuda2hip.h
@@ -0,0 +1,61 @@
+// Copyright 2023 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUDA2HIP_H_
+#define SIMULATOR_CUDA2HIP_H_
+
+#define cublasCaxpy              hipblasCaxpy
+#define cublasCdotc              hipblasCdotc
+#define cublasCreate             hipblasCreate
+#define cublasCscal              hipblasCscal
+#define cublasCsscal             hipblasCsscal
+#define cublasDestroy            hipblasDestroy
+#define cublasDznrm2             hipblasDznrm2
+#define cublasHandle_t           hipblasHandle_t
+#define cublasScnrm2             hipblasScnrm2
+#define CUBLAS_STATUS_SUCCESS    HIPBLAS_STATUS_SUCCESS
+#define cublasStatus_t           hipblasStatus_t
+#define cublasZaxpy              hipblasZaxpy
+#define cublasZdotc              hipblasZdotc
+#define cublasZdscal             hipblasZdscal
+#define cublasZscal              hipblasZscal
+#define cuCimagf                 hipCimagf
+#define cuCimag                  hipCimag
+#define cuComplex                hipComplex
+#define cuCrealf                 hipCrealf
+#define cuCreal                  hipCreal
+#define CUDA_C_32F               HIPBLAS_C_32F
+#define CUDA_C_64F               HIPBLAS_C_64F
+#define cudaDeviceSynchronize    hipDeviceSynchronize
+#define cudaError_t              hipError_t
+#define cudaFree                 hipFree
+#define cudaGetErrorString       hipGetErrorString
+#define cudaMalloc               hipMalloc
+#define cudaMemcpyAsync          hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost   hipMemcpyDeviceToHost
+#define cudaMemcpy               hipMemcpy
+#define cudaMemcpyHostToDevice   hipMemcpyHostToDevice
+#define cudaMemset               hipMemset
+#define cudaPeekAtLastError      hipPeekAtLastError
+#define cudaSuccess              hipSuccess
+#define cuDoubleComplex          hipDoubleComplex
+
+template <typename T>
+__device__ __forceinline__ T __shfl_down_sync(
+    unsigned mask, T var, unsigned int delta, int width = warpSize) {
+  return __shfl_down(var, delta, width);  // 'mask' is not forwarded.
+}
+
+#endif  // SIMULATOR_CUDA2HIP_H_
diff --git a/tpls/qsim/expect.h b/tpls/qsim/expect.h
new file mode 100644
index 0000000..518d516
--- /dev/null
+++ b/tpls/qsim/expect.h
@@ -0,0 +1,148 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef EXPECT_H_
+#define EXPECT_H_
+
+#include <complex>
+
+#include "fuser.h"
+#include "gate_appl.h"
+
+namespace qsim {
+
+template <typename Gate>
+struct OpString {
+  std::complex<double> weight;  // Scalar factor applied to the string's value.
+  std::vector<Gate> ops;        // Operators (gates) that make up the string.
+};
+
+/**
+ * Evaluates the expectation value of a weighted sum of operator strings
+ * (operator sequences). The operators may act on any qubits and may be any
+ * supported gates. A temporary state vector is used as scratch space.
+ * @param param Options for gate fusion.
+ * @param strings Operator strings.
+ * @param state_space StateSpace object used to allocate and copy state
+ *   vectors and to compute inner products.
+ * @param simulator Simulator object that applies gates to a state vector.
+ * @param state The state vector of the system.
+ * @param ket Temporary state vector; (re)allocated here if it is null or
+ *   has fewer qubits than 'state'.
+ * @return The computed expectation value; zero if allocation or fusion fails.
+ */
+template <typename IO, typename Fuser, typename Gate, typename Simulator>
+std::complex<double> ExpectationValue(
+    const typename Fuser::Parameter& param,
+    const std::vector<OpString<Gate>>& strings,
+    const typename Simulator::StateSpace& state_space,
+    const Simulator& simulator, const typename Simulator::State& state,
+    typename Simulator::State& ket) {
+  std::complex<double> result = 0;
+
+  // Make sure the scratch vector exists and is large enough.
+  if (state_space.IsNull(ket) || ket.num_qubits() < state.num_qubits()) {
+    ket = state_space.Create(state.num_qubits());
+    if (state_space.IsNull(ket)) {
+      IO::errorf("not enough memory: is the number of qubits too large?\n");
+      return result;
+    }
+  }
+
+  for (const auto& term : strings) {
+    // A string without operators contributes only its weight.
+    if (term.ops.empty()) {
+      result += term.weight;
+      continue;
+    }
+
+    state_space.Copy(state, ket);
+
+    if (term.ops.size() > 1) {
+      auto fused = Fuser::FuseGates(param, state.num_qubits(), term.ops);
+      if (fused.empty()) {
+        // Fusion produced nothing: discard everything accumulated so far.
+        result = 0;
+        break;
+      }
+
+      for (const auto& fgate : fused) ApplyFusedGate(simulator, fgate, ket);
+    } else {
+      simulator.ApplyGate(term.ops[0].qubits, term.ops[0].matrix.data(), ket);
+    }
+
+    result += term.weight * state_space.InnerProduct(state, ket);
+  }
+
+  return result;
+}
+
+/**
+ * Evaluates the expectation value of a weighted sum of operator strings
+ * (operator sequences) in place, through the simulator's own
+ * ExpectationValue routine; no temporary state vector is created here.
+ * Operators can act on any qubits and can be any supported gates except for
+ * user-defined controlled gates. Each string must fuse into a single gate
+ * acting on no more than six qubits.
+ * @param strings Operator strings.
+ * @param simulator Simulator object that computes expectation values.
+ * @param state The state of the system.
+ * @return The computed expectation value; zero if a string breaks that limit.
+ */
+template <typename IO, typename Fuser, typename Gate, typename Simulator>
+std::complex<double> ExpectationValue(
+    const std::vector<OpString<Gate>>& strings,
+    const Simulator& simulator, const typename Simulator::State& state) {
+  std::complex<double> result = 0;
+
+  typename Fuser::Parameter param;
+  param.max_fused_size = 6;
+  for (const auto& term : strings) {
+    if (term.ops.empty()) {
+      result += term.weight;
+      continue;
+    }
+
+    if (term.ops.size() == 1) {
+      const auto& op = term.ops.front();
+      result += term.weight
+          * simulator.ExpectationValue(op.qubits, op.matrix.data(), state);
+      continue;
+    }
+
+    auto fused = Fuser::FuseGates(param, state.num_qubits(), term.ops);
+    if (fused.size() != 1) {
+      IO::errorf("too many fused gates; "
+                 "cannot compute the expectation value.\n");
+      result = 0;
+      break;
+    }
+
+    if (fused[0].qubits.size() > 6) {
+      IO::errorf("operator string acts on too many qubits; "
+                 "cannot compute the expectation value.\n");
+      result = 0;
+      break;
+    }
+
+    result += term.weight * simulator.ExpectationValue(
+        fused[0].qubits, fused[0].matrix.data(), state);
+  }
+
+  return result;
+}
+
+}  // namespace qsim
+
+#endif  // EXPECT_H_
diff --git a/tpls/qsim/formux.h b/tpls/qsim/formux.h
new file mode 100644
index 0000000..4401e9b
--- /dev/null
+++ b/tpls/qsim/formux.h
@@ -0,0 +1,30 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FORMUX_H_
+#define FORMUX_H_
+
+#ifdef _OPENMP
+# include "parfor.h"
+  namespace qsim {
+    using For = ParallelFor;  // Chosen when _OPENMP is defined.
+  }
+#else
+# include "seqfor.h"
+  namespace qsim {
+    using For = SequentialFor;  // Chosen when _OPENMP is not defined.
+  }
+#endif
+
+#endif  // FORMUX_H_
diff --git a/tpls/qsim/fuser.h b/tpls/qsim/fuser.h
new file mode 100644
index 0000000..e4f3c3b
--- /dev/null
+++ b/tpls/qsim/fuser.h
@@ -0,0 +1,225 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FUSER_H_
+#define FUSER_H_
+
+#include <cstdint>
+#include <vector>
+
+#include "gate.h"
+#include "matrix.h"
+
+namespace qsim {
+
+/**
+ * A collection of "fused" gates which can be multiplied together before being
+ * applied to the state vector.
+ */
+template <typename Gate>
+struct GateFused {  // Brace-initialized positionally: keep the field order.
+  /**
+   * Kind of the first ("parent") gate.
+   */
+  typename Gate::GateKind kind;
+  /**
+   * The time index of the first ("parent") gate.
+   */
+  unsigned time;
+  /**
+   * A list of qubits these gates act upon. Control qubits for
+   * explicitly-controlled gates are excluded from this list.
+   */
+  std::vector<unsigned> qubits;
+  /**
+   * Pointer to the first ("parent") gate.
+   */
+  const Gate* parent;
+  /**
+   * Ordered list of component gates.
+   */
+  std::vector<const Gate*> gates;
+  /**
+   * Fused gate matrix.
+   */
+  Matrix<typename Gate::fp_type> matrix;
+};
+
+/**
+ * A base class for fuser classes with some common functions.
+ */
+template <typename IO, typename Gate>
+class Fuser {
+ protected:
+  using RGate = typename std::remove_pointer<Gate>::type;  // Gate sans '*'.
+
+  static const RGate& GateToConstRef(const RGate& gate) {  // By value.
+    return gate;
+  }
+
+  static const RGate& GateToConstRef(const RGate* gate) {  // By pointer.
+    return *gate;
+  }
+  // Returns the merged list of time boundaries ("epochs"); empty on error.
+  static std::vector<unsigned> MergeWithMeasurementTimes(
+      typename std::vector<Gate>::const_iterator gfirst,
+      typename std::vector<Gate>::const_iterator glast,
+      const std::vector<unsigned>& times) {
+    std::vector<unsigned> epochs;
+    epochs.reserve(glast - gfirst + times.size());
+
+    std::size_t last = 0;  // Index of the next unconsumed entry of 'times'.
+    unsigned max_time = 0;  // Largest gate time seen so far.
+
+    for (auto gate_it = gfirst; gate_it < glast; ++gate_it) {
+      const auto& gate = GateToConstRef(*gate_it);
+
+      if (gate.time > max_time) {
+        max_time = gate.time;
+      }
+
+      if (epochs.size() > 0 && gate.time < epochs.back()) {
+        IO::errorf("gate crosses the time boundary.\n");
+        epochs.resize(0);
+        return epochs;
+      }
+
+      if (gate.kind == gate::kMeasurement) {
+        if (epochs.size() == 0 || epochs.back() < gate.time) {
+          if (!AddBoundary(gate.time, max_time, epochs)) {
+            epochs.resize(0);
+            return epochs;
+          }
+        }
+      }
+
+      while (last < times.size() && times[last] <= gate.time) {
+        unsigned prev = times[last++];
+        epochs.push_back(prev);  // NOTE(review): AddBoundary pushes prev too.
+        if (!AddBoundary(prev, max_time, epochs)) {
+          epochs.resize(0);
+          return epochs;
+        }
+        while (last < times.size() && times[last] <= prev) ++last;
+      }
+    }
+
+    if (epochs.size() == 0 || epochs.back() < max_time) {
+      epochs.push_back(max_time);
+    }
+
+    return epochs;
+  }
+  // Fuses zero-qubit gates into a fusible gate or emits a standalone one.
+  template <typename GateSeq0, typename Parent, typename GateFused>
+  static void FuseZeroQubitGates(const GateSeq0& gate_seq0,
+                                 Parent parent, std::size_t first,
+                                 std::vector<GateFused>& fused_gates) {
+    GateFused* fuse_to = nullptr;
+
+    for (std::size_t i = first; i < fused_gates.size(); ++i) {
+      auto& fgate = fused_gates[i];
+
+      if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp
+          && fgate.parent->controlled_by.size() == 0
+          && !fgate.parent->unfusible) {
+        fuse_to = &fgate;
+        break;
+      }
+    }
+
+    if (fuse_to != nullptr) {
+      // Fuse zero-qubit gates with the first available fused gate.
+      for (const auto& g : gate_seq0) {
+        fuse_to->gates.push_back(parent(g));
+      }
+    } else {
+      auto g0 = parent(gate_seq0[0]);  // NOTE(review): assumes non-empty input.
+      fused_gates.push_back({g0->kind, g0->time, {}, g0, {g0}, {}});
+
+      for (std::size_t i = 1; i < gate_seq0.size(); ++i) {
+        fused_gates.back().gates.push_back(parent(gate_seq0[i]));
+      }
+    }
+  }
+
+ private:
+  static bool AddBoundary(unsigned time, unsigned max_time,
+                          std::vector<unsigned>& boundaries) {
+    if (max_time > time) {  // A gate later than this boundary was seen.
+      IO::errorf("gate crosses the time boundary.\n");
+      return false;
+    }
+
+    boundaries.push_back(time);
+    return true;
+  }
+};
+
+/**
+ * Builds the matrix of a fused gate from its component gate matrices.
+ * @param gate Fused gate; its 'matrix' member is overwritten.
+ */
+template <typename FusedGate>
+inline void CalculateFusedMatrix(FusedGate& gate) {
+  MatrixIdentity(unsigned{1} << gate.qubits.size(), gate.matrix);
+
+  for (auto part : gate.gates) {
+    if (part->qubits.empty()) {
+      MatrixScalarMultiply(part->matrix[0], part->matrix[1], gate.matrix);
+      continue;
+    }
+    if (part->qubits.size() == gate.qubits.size()) {
+      MatrixMultiply(gate.qubits.size(), part->matrix, gate.matrix);
+      continue;
+    }
+
+    // Bit k of the mask is set when 'part' acts on gate.qubits[k].
+    unsigned mask = 0;
+    for (auto target : part->qubits) {
+      std::size_t k = 0;
+      while (k < gate.qubits.size() && gate.qubits[k] != target) ++k;
+      if (k < gate.qubits.size()) mask |= unsigned{1} << k;
+    }
+
+    MatrixMultiply(mask, part->qubits.size(), part->matrix,
+                   gate.qubits.size(), gate.matrix);
+  }
+}
+
+/**
+ * Calls CalculateFusedMatrix on every non-measurement gate in a range.
+ * @param gbeg, gend The iterator range [gbeg, gend) of fused gates.
+ */
+template <typename Iterator>
+inline void CalculateFusedMatrices(Iterator gbeg, Iterator gend) {
+  for (; gbeg != gend; ++gbeg) {
+    // Measurement gates are skipped.
+    if (gbeg->kind == gate::kMeasurement) continue;
+    CalculateFusedMatrix(*gbeg);
+  }
+}
+
+/**
+ * Convenience overload: processes every fused gate stored in a vector.
+ * @param gates The vector of fused gates.
+ */
+template <typename FusedGate>
+inline void CalculateFusedMatrices(std::vector<FusedGate>& gates) {
+  CalculateFusedMatrices(std::begin(gates), std::end(gates));
+}
+
+}  // namespace qsim
+
+#endif  // FUSER_H_
diff --git a/tpls/qsim/fuser_basic.h b/tpls/qsim/fuser_basic.h
new file mode 100644
index 0000000..3191bd2
--- /dev/null
+++ b/tpls/qsim/fuser_basic.h
@@ -0,0 +1,411 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FUSER_BASIC_H_
+#define FUSER_BASIC_H_
+
+#include <map>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "gate.h"
+#include "fuser.h"
+
+namespace qsim {
+
+/**
+ * Stateless object with methods for aggregating `Gate`s into `GateFused`.
+ * Measurement gates with equal times are fused together.
+ * User-defined controlled gates (controlled_by.size() > 0) and gates acting on
+ * more than two qubits are not fused.
+ * The template parameter Gate can be Gate type or a pointer to Gate type.
+ * This class is deprecated. It is recommended to use MultiQubitGateFuser
+ * from fuser_mqubit.h.
+ */
+template <typename IO, typename Gate>
+class BasicGateFuser final : public Fuser<IO, Gate> {
+ private:
+  using Base = Fuser<IO, Gate>;
+  using RGate = typename Base::RGate;
+
+ public:
+  using GateFused = qsim::GateFused<RGate>;
+
+  /**
+   * User-specified parameters for gate fusion.
+   * BasicGateFuser does not use any parameters.
+   */
+  struct Parameter {
+    unsigned verbosity = 0;
+  };
+
+  /**
+   * Stores sets of gates that can be applied together. Only one- and
+   * two-qubit gates will get fused. Forwards the whole vector with no extra
+   * split times; use the overload below to respect specific time boundaries.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gates The gates (or pointers to the gates) to be fused.
+   *   Gate times of the gates that act on the same qubits should be ordered.
+   *   Gates that are out of time order should not cross the time boundaries
+   *   set by measurement gates.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific pair of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(const Parameter& param,
+                                          unsigned max_qubit1,
+                                          const std::vector<Gate>& gates,
+                                          bool fuse_matrix = true) {
+    return FuseGates(
+        param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix);
+  }
+
+  /**
+   * Stores sets of gates that can be applied together. Only one- and
+   * two-qubit gates will get fused. Forwards to the iterator-range overload.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gates The gates (or pointers to the gates) to be fused.
+   *   Gate times of the gates that act on the same qubits should be ordered.
+   *   Gates that are out of time order should not cross the time boundaries
+   *   set by `times_to_split_at` or by measurement gates.
+   * @param times_to_split_at Ordered list of time steps (boundaries) at which
+   *   to separate fused gates. Each element of the output will contain gates
+   *   from a single 'window' in this list.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific pair of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param,
+      unsigned max_qubit1, const std::vector<Gate>& gates,
+      const std::vector<unsigned>& times_to_split_at,
+      bool fuse_matrix = true) {
+    return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(),
+                     times_to_split_at, fuse_matrix);
+  }
+
+  /**
+   * Stores sets of gates that can be applied together. Only one- and
+   * two-qubit gates will get fused. Forwards the range with no extra split
+   * times; use the overload below to respect specific time boundaries.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
+   *   (or pointers to gates) in. Gate times of the gates that act on the same
+   *   qubits should be ordered. Gates that are out of time order should not
+   *   cross the time boundaries set by measurement gates.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific pair of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param, unsigned max_qubit1,
+      typename std::vector<Gate>::const_iterator gfirst,
+      typename std::vector<Gate>::const_iterator glast,
+      bool fuse_matrix = true) {
+    return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix);
+  }
+
+  /**
+   * Stores sets of gates that can be applied together. Only one- and
+   * two-qubit gates will get fused.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
+   *   (or pointers to gates) in. Gate times of the gates that act on the same
+   *   qubits should be ordered. Gates that are out of time order should not
+   *   cross the time boundaries set by `times_to_split_at` or by measurement
+   *   gates.
+   * @param times_to_split_at Ordered list of time steps (boundaries) at which
+   *   to separate fused gates. Each element of the output will contain gates
+   *   from a single 'window' in this list.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific pair of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param, unsigned max_qubit1,
+      typename std::vector<Gate>::const_iterator gfirst,
+      typename std::vector<Gate>::const_iterator glast,
+      const std::vector<unsigned>& times_to_split_at,
+      bool fuse_matrix = true) {
+    std::vector<GateFused> gates_fused;
+
+    if (gfirst >= glast) return gates_fused;
+
+    std::size_t num_gates = glast - gfirst;
+
+    gates_fused.reserve(num_gates);
+
+    // Merge with measurement gate times to separate fused gates at.
+    auto times =
+        Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at);
+
+    // Map to keep track of measurement gates with equal times.
+    std::map<unsigned, std::vector<const RGate*>> measurement_gates;
+
+    // Sequence of top level gates the other gates get fused to.
+    std::vector<const RGate*> gates_seq;
+
+    // Sequence of zero-qubit gates.
+    std::vector<const RGate*> gates_seq0;
+
+    // Lattice of gates: qubits "hyperplane" and time direction.
+    std::vector<std::vector<const RGate*>> gates_lat(max_qubit1);
+
+    // Current unfused gate.
+    auto gate_it = gfirst;
+
+    std::size_t last_fused_gate_index = 0;
+
+    for (std::size_t l = 0; l < times.size(); ++l) {
+      gates_seq.resize(0);
+      gates_seq.reserve(num_gates);
+
+      gates_seq0.resize(0);
+      gates_seq0.reserve(num_gates);
+
+      for (unsigned k = 0; k < max_qubit1; ++k) {
+        gates_lat[k].resize(0);
+        gates_lat[k].reserve(128);
+      }
+
+      // Fill gates_seq and gates_lat in.
+      for (; gate_it < glast; ++gate_it) {
+        const auto& gate = Base::GateToConstRef(*gate_it);
+
+        if (gate.time > times[l]) break;
+
+        if (!ValidateGate(gate, max_qubit1, gates_lat)) {
+          gates_fused.resize(0);
+          return gates_fused;
+        }
+
+        if (gate.kind == gate::kMeasurement) {
+          auto& mea_gates_at_time = measurement_gates[gate.time];
+          if (mea_gates_at_time.size() == 0) {
+            gates_seq.push_back(&gate);
+            mea_gates_at_time.reserve(max_qubit1);
+          }
+
+          mea_gates_at_time.push_back(&gate);
+        } else if (gate.controlled_by.size() > 0 || gate.qubits.size() > 2) {
+          for (auto q : gate.qubits) {
+            gates_lat[q].push_back(&gate);
+          }
+          for (auto q : gate.controlled_by) {
+            gates_lat[q].push_back(&gate);
+          }
+          gates_seq.push_back(&gate);
+        } else if (gate.qubits.size() == 1) {
+          gates_lat[gate.qubits[0]].push_back(&gate);
+          if (gate.unfusible) {
+            gates_seq.push_back(&gate);
+          }
+        } else if (gate.qubits.size() == 2) {
+          gates_lat[gate.qubits[0]].push_back(&gate);
+          gates_lat[gate.qubits[1]].push_back(&gate);
+          gates_seq.push_back(&gate);
+        } else {
+          gates_seq0.push_back(&gate);
+        }
+      }
+
+      std::vector<unsigned> last(max_qubit1, 0);
+
+      const RGate* delayed_measurement_gate = nullptr;
+
+      // Fuse gates.
+      for (auto pgate : gates_seq) {
+        if (pgate->kind == gate::kMeasurement) {
+          delayed_measurement_gate = pgate;
+        } else if (pgate->qubits.size() > 2
+                   || pgate->controlled_by.size() > 0) {
+          // Multi-qubit or controlled gate.
+
+          for (auto q : pgate->qubits) {
+            unsigned l = last[q];
+            if (gates_lat[q][l] != pgate) {
+              last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused);
+            }
+            ++last[q];
+          }
+
+          for (auto q : pgate->controlled_by) {
+            unsigned l = last[q];
+            if (gates_lat[q][l] != pgate) {
+              last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused);
+            }
+            ++last[q];
+          }
+
+          gates_fused.push_back({pgate->kind, pgate->time, pgate->qubits,
+                                 pgate, {pgate}, {}});
+        } else if (pgate->qubits.size() == 1) {
+          unsigned q0 = pgate->qubits[0];
+
+          GateFused gate_f = {pgate->kind, pgate->time, {q0}, pgate, {}, {}};
+
+          last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates);
+          gate_f.gates.push_back(gates_lat[q0][last[q0]]);
+          last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates);
+
+          gates_fused.push_back(std::move(gate_f));
+        } else if (pgate->qubits.size() == 2) {
+          unsigned q0 = pgate->qubits[0];
+          unsigned q1 = pgate->qubits[1];
+
+          if (Done(last[q0], pgate->time, gates_lat[q0])) continue;
+
+          GateFused gate_f =
+              {pgate->kind, pgate->time, {q0, q1}, pgate, {}, {}};
+
+          do {
+            last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates);
+            last[q1] = Advance(last[q1], gates_lat[q1], gate_f.gates);
+            // Here gates_lat[q0][last[q0]] == gates_lat[q1][last[q1]].
+
+            gate_f.gates.push_back(gates_lat[q0][last[q0]]);
+
+            last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates);
+            last[q1] = Advance(last[q1] + 1, gates_lat[q1], gate_f.gates);
+          } while (NextGate(last[q0], gates_lat[q0], last[q1], gates_lat[q1]));
+
+          gates_fused.push_back(std::move(gate_f));
+        }
+      }
+
+      for (unsigned q = 0; q < max_qubit1; ++q) {
+        auto l = last[q];
+        if (l == gates_lat[q].size()) continue;
+
+        // Orphaned qubit.
+        AddOrphanedQubit(q, l, gates_lat, gates_fused);
+      }
+
+      if (delayed_measurement_gate != nullptr) {
+        auto pgate = delayed_measurement_gate;
+
+        const auto& mea_gates_at_time = measurement_gates[pgate->time];
+
+        GateFused gate_f = {pgate->kind, pgate->time, {}, pgate, {}, {}};
+        gate_f.gates.reserve(mea_gates_at_time.size());
+
+        // Fuse measurement gates with equal times.
+
+        for (const auto* pgate : mea_gates_at_time) {
+          gate_f.qubits.insert(gate_f.qubits.end(),
+                               pgate->qubits.begin(), pgate->qubits.end());
+          gate_f.gates.push_back(pgate);
+        }
+
+        gates_fused.push_back(std::move(gate_f));
+      }
+
+      if (gates_seq0.size() != 0) {
+        Base::FuseZeroQubitGates(gates_seq0, [](const RGate* g) { return g; },
+                                 last_fused_gate_index, gates_fused);
+      }
+
+      if (gate_it == glast) break;
+
+      last_fused_gate_index = gates_fused.size();
+    }
+
+    if (fuse_matrix) {
+      for (auto& gate_f : gates_fused) {
+        if (gate_f.kind != gate::kMeasurement && gate_f.kind != gate::kDecomp) {
+          CalculateFusedMatrix(gate_f);
+        }
+      }
+    }
+
+    return gates_fused;
+  }
+
+ private:
+  static unsigned Advance(unsigned k, const std::vector<const RGate*>& wl,
+                          std::vector<const RGate*>& gates) {
+    for (; k < wl.size(); ++k) {  // Consume the run of fusible 1-qubit gates.
+      if (wl[k]->qubits.size() != 1 || wl[k]->controlled_by.size() != 0
+          || wl[k]->unfusible) break;
+      gates.push_back(wl[k]);
+    }
+    return k;  // First position that was not consumed (may be wl.size()).
+  }
+
+  static bool Done(
+      unsigned k, unsigned t, const std::vector<const RGate*>& wl) {
+    return !(k < wl.size() && wl[k]->time <= t);  // No gate at k up to time t.
+  }
+
+  static bool NextGate(unsigned k1, const std::vector<const RGate*>& wl1,
+                       unsigned k2, const std::vector<const RGate*>& wl2) {
+    if (k1 >= wl1.size() || k2 >= wl2.size() || wl1[k1] != wl2[k2]) return false;
+    return wl1[k1]->qubits.size() < 3 && wl1[k1]->controlled_by.size() == 0;
+  }
+
+  template <typename GatesLat>
+  static unsigned AddOrphanedQubit(unsigned q, unsigned k,
+                                   const GatesLat& gates_lat,
+                                   std::vector<GateFused>& gates_fused) {
+    // Emit a one-qubit fused gate on q: the gate at position k of the qubit's
+    // lattice column followed by the fusible one-qubit gates right after it.
+    const auto& column = gates_lat[q];
+    auto head = column[k];
+
+    gates_fused.push_back({head->kind, head->time, {q}, head, {head}, {}});
+    unsigned next = Advance(k + 1, column, gates_fused.back().gates);
+
+    // Position in the column of the first gate that was not consumed.
+    return next;
+  }
+
+  // Checks that every target and control qubit of gate is below max_qubit1
+  // and that gate is strictly later than the last gate already stored in
+  // gates_lat for that qubit. Violations are reported through IO::errorf.
+  template <typename Gate2, typename GatesLat>
+  static bool ValidateGate(const Gate2& gate, unsigned max_qubit1,
+                           const GatesLat& gates_lat) {
+    auto valid = [&](unsigned q) -> bool {
+      if (q >= max_qubit1) {
+        IO::errorf("fuser: gate qubit %u is out of range "
+                   "(should be smaller than %u).\n", q, max_qubit1);
+        return false;
+      }
+      const auto& column = gates_lat[q];
+      if (!column.empty() && gate.time <= column.back()->time) {
+        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
+        return false;
+      }
+      return true;
+    };
+
+    for (unsigned q : gate.qubits) {
+      if (!valid(q)) return false;
+    }
+    for (unsigned q : gate.controlled_by) {
+      if (!valid(q)) return false;
+    }
+
+    return true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // FUSER_BASIC_H_
diff --git a/tpls/qsim/fuser_mqubit.h b/tpls/qsim/fuser_mqubit.h
new file mode 100644
index 0000000..c75b1a0
--- /dev/null
+++ b/tpls/qsim/fuser_mqubit.h
@@ -0,0 +1,1095 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef FUSER_MQUBIT_H_
+#define FUSER_MQUBIT_H_
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "gate.h"
+#include "fuser.h"
+
+namespace qsim {
+
+/**
+ * Multi-qubit gate fuser.
+ * Measurement gates with equal times are fused together.
+ * User-defined controlled gates (controlled_by.size() > 0) are not fused.
+ * The template parameter Gate can be Gate type or a pointer to Gate type.
+ */
+template <typename IO, typename Gate>
+class MultiQubitGateFuser final : public Fuser<IO, Gate> {
+ private:
+  using Base = Fuser<IO, Gate>;
+  using RGate = typename Base::RGate;
+
+  // Auxiliary classes and structs.
+
+  // Manages doubly-linked lists whose links live in one pre-reserved vector.
+  template <typename T>
+  class LinkManagerT {
+   public:
+    struct Link {
+      T val;
+      Link* next;
+      Link* prev;
+    };
+
+    // size must bound the number of AddBack calls: growing links_ past its
+    // reserved capacity would relocate the links and invalidate all Link*.
+    explicit LinkManagerT(uint64_t size) {
+      links_.reserve(size);
+    }
+
+    // Inserts a new link holding t right after link (or starts a new list if
+    // link is nullptr) and returns a pointer to it.
+    Link* AddBack(const T& t, Link* link) {
+      Link* next = link == nullptr ? nullptr : link->next;
+      links_.push_back({t, next, link});
+      Link* added = &links_.back();
+      if (link != nullptr) link->next = added;
+      // Keep the back pointer of the old successor consistent as well.
+      if (next != nullptr) next->prev = added;
+      return added;
+    }
+
+    // Unlinks link from its neighbors; the link itself is left unchanged.
+    static void Delete(const Link* link) {
+      if (link->prev != nullptr) link->prev->next = link->next;
+      if (link->next != nullptr) link->next->prev = link->prev;
+    }
+
+   private:
+    std::vector<Link> links_;
+  };
+
+  struct GateF;
+
+  using LinkManager = LinkManagerT<GateF*>;
+  using Link = typename LinkManager::Link;
+
+  // Intermediate representation of a fused gate.
+  struct GateF {
+    const RGate* parent;              // Input gate this entry was made for.
+    std::vector<unsigned> qubits;     // Qubits acted on (controls excluded).
+    std::vector<const RGate*> gates;  // Gates that get fused to this gate.
+    std::vector<Link*> links;         // Gate "lattice" links.
+    uint64_t mask;                    // Qubit mask (targets and controls).
+    unsigned visited;                 // Fusion state, see Visited.
+  };
+
+  // Possible values for visited in GateF.
+  // Note that MakeGateSequence assigns values from kSecond to the number of
+  // gates in the sequence plus one, see below.
+  enum Visited {
+    kZero = 0,             // Start value for "normal" gates.
+    kFirst = 1,            // Value after the first pass for partially fused
+                           // "normal" gates.
+    kSecond = 2,           // Start value to assign values in MakeGateSequence.
+    kCompress = 99999997,  // Used to compress links.
+    kMeaCnt = 99999998,    // Start value for controlled or measurement gates.
+    kFinal = 99999999,     // Value after the second pass for fused "normal"
+                           // gates or for controlled and measurement gates.
+  };
+
+  struct Stat {
+    unsigned num_mea_gates = 0;         // Input measurement gates.
+    unsigned num_fused_mea_gates = 0;   // Fused measurement gates created.
+    unsigned num_fused_gates = 0;       // Emitted fused "normal" gates.
+    unsigned num_controlled_gates = 0;  // Input controlled gates.
+    std::vector<unsigned> num_gates;    // Uncontrolled input gates by size.
+  };
+
+  // Gate that is added to a sequence of gates to fuse together.
+  struct GateA {
+    GateF* gate;                   // Gate to add.
+    std::vector<unsigned> qubits;  // Added qubits.
+    std::vector<Link*> links;      // Added lattice links.
+  };
+
+  struct Scratch {
+    std::vector<GateA> data;          // Storage the GateA pointers refer to.
+    std::vector<GateA*> prev1;        // Candidate lists used by
+    std::vector<GateA*> prev2;        // FindLongestGateSequence, one per
+    std::vector<GateA*> next1;        // search level (passed to
+    std::vector<GateA*> next2;        // Get{Prev,Next}AvailableGates).
+    std::vector<GateA*> longest_seq;  // Best sequence found so far.
+    std::vector<GateA*> stack;        // Sequence currently being explored.
+    std::vector<GateF*> gates;        // Gates finished by MakeGateSequence.
+    unsigned count = 0;               // Number of sequences built so far.
+  };
+
+ public:
+  using GateFused = qsim::GateFused<RGate>;
+
+  /**
+   * User-specified parameters for gate fusion.
+   */
+  struct Parameter {
+    /**
+     * Maximum number of qubits in a fused gate. It can take values from 2 to
+     * 6 (0 and 1 are equivalent to 2). It is not recommended to use 5 or 6 as
+     * that might degrade performance for not very fast machines.
+     */
+    unsigned max_fused_size = 2;
+    unsigned verbosity = 0;  // Forwarded to PrintStat after fusion.
+  };
+
+  /**
+   * Stores sets of gates that can be applied together. To respect specific
+   * time boundaries, use the overload below taking `times_to_split_at`.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gates The gates (or pointers to the gates) to be fused.
+   *   Gate times of the gates that act on the same qubits should be ordered.
+   *   Gates that are out of time order should not cross the time boundaries
+   *   set by measurement gates.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific set of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(const Parameter& param,
+                                          unsigned max_qubit1,
+                                          const std::vector<Gate>& gates,
+                                          bool fuse_matrix = true) {
+    const std::vector<unsigned> no_split_times;
+    return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(),
+                     no_split_times, fuse_matrix);
+  }
+
+  /**
+   * Stores sets of gates that can be applied together.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gates The gates (or pointers to the gates) to be fused. Gate times
+   *   on the same qubits should be ordered; out-of-order gates should not
+   *   cross boundaries set by `times_to_split_at` or by measurement gates.
+   * @param times_to_split_at Ordered list of time steps (boundaries) at which
+   *   to separate fused gates; each output element stays within one window.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific set of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param,
+      unsigned max_qubit1, const std::vector<Gate>& gates,
+      const std::vector<unsigned>& times_to_split_at,
+      bool fuse_matrix = true) {
+    const auto gfirst = gates.cbegin();
+    const auto glast = gates.cend();
+    return FuseGates(
+        param, max_qubit1, gfirst, glast, times_to_split_at, fuse_matrix);
+  }
+
+  /**
+   * Stores sets of gates that can be applied together. To respect specific
+   * time boundaries, use the overload below taking `times_to_split_at`.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gfirst, glast The iterator range [gfirst, glast) of gates (or
+   *   pointers to gates) to fuse. Gate times on the same qubits should be
+   *   ordered; out-of-order gates should not cross measurement boundaries.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects. Each element is a set of gates
+   *   acting on a specific set of qubits which can be applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param, unsigned max_qubit1,
+      typename std::vector<Gate>::const_iterator gfirst,
+      typename std::vector<Gate>::const_iterator glast,
+      bool fuse_matrix = true) {
+    const std::vector<unsigned> no_split_times;
+    return FuseGates(
+        param, max_qubit1, gfirst, glast, no_split_times, fuse_matrix);
+  }
+
+  /**
+   * Stores sets of gates that can be applied together.
+   * @param param Options for gate fusion.
+   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
+   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
+   *   (or pointers to gates) in. Gate times of the gates that act on the same
+   *   qubits should be ordered. Gates that are out of time order should not
+   *   cross the time boundaries set by `times_to_split_at` or by measurement
+   *   gates.
+   * @param times_to_split_at Ordered list of time steps (boundaries) at which
+   *   to separate fused gates. Each element of the output will contain gates
+   *   from a single 'window' in this list.
+   * @param fuse_matrix If true, multiply gate matrices together.
+   * @return A vector of fused gate objects, empty if a gate fails validation.
+   *   Each element is a set of gates on a set of qubits applied as a group.
+   */
+  static std::vector<GateFused> FuseGates(
+      const Parameter& param, unsigned max_qubit1,
+      typename std::vector<Gate>::const_iterator gfirst,
+      typename std::vector<Gate>::const_iterator glast,
+      const std::vector<unsigned>& times_to_split_at,
+      bool fuse_matrix = true) {
+    std::vector<GateFused> fused_gates;
+
+    if (gfirst >= glast) return fused_gates;
+
+    std::size_t num_gates = glast - gfirst;
+
+    fused_gates.reserve(num_gates);
+
+    // Merge with measurement gate times to separate fused gates at.
+    auto epochs =
+        Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at);
+
+    LinkManager link_manager(max_qubit1 * num_gates);
+
+    // Auxiliary data structures.
+    // Sequence of intermediate fused gates.
+    std::vector<GateF> gates_seq;
+    // Gate "lattice".
+    std::vector<Link*> gates_lat;
+    // Sequences of intermediate fused gates ordered by gate size.
+    std::vector<std::vector<GateF*>> fgates(max_qubit1 + 1);
+
+    gates_seq.reserve(num_gates);
+    gates_lat.reserve(max_qubit1);
+
+    Scratch scratch;
+
+    scratch.data.reserve(1024);
+    scratch.prev1.reserve(32);
+    scratch.prev2.reserve(32);
+    scratch.next1.reserve(32);
+    scratch.next2.reserve(32);
+    scratch.longest_seq.reserve(8);
+    scratch.stack.reserve(8);
+
+    Stat stat;
+    stat.num_gates.resize(max_qubit1 + 1, 0);
+
+    unsigned max_fused_size = std::min(unsigned{6}, param.max_fused_size);
+    max_fused_size = std::min(max_fused_size, max_qubit1);
+
+    std::size_t last_fused_gate_index = 0;
+    auto gate_it = gfirst;
+
+    // Iterate over epochs.
+    for (std::size_t l = 0; l < epochs.size(); ++l) {
+      gates_seq.resize(0);
+      gates_lat.resize(0);
+      gates_lat.resize(max_qubit1, nullptr);
+
+      for (unsigned i = 0; i <= max_qubit1; ++i) {
+        fgates[i].resize(0);
+      }
+
+      uint64_t max_gate_size = 0;
+      GateF* last_mea_gate = nullptr;
+
+      // Iterate over input gates.
+      for (; gate_it < glast; ++gate_it) {
+        const auto& gate = Base::GateToConstRef(*gate_it);
+
+        if (gate.time > epochs[l]) break;
+
+        if (!ValidateGate(gate, max_qubit1, gates_lat)) {
+          fused_gates.resize(0);
+          return fused_gates;
+        }
+
+        // Fill in auxiliary data structures.
+
+        if (gate.kind == gate::kMeasurement) {
+          // Measurement gate.
+
+          if (last_mea_gate == nullptr
+              || last_mea_gate->parent->time != gate.time) {
+            gates_seq.push_back({&gate, {}, {}, {}, 0, kMeaCnt});
+            last_mea_gate = &gates_seq.back();
+
+            last_mea_gate->qubits.reserve(max_qubit1);
+            last_mea_gate->links.reserve(max_qubit1);
+
+            ++stat.num_fused_mea_gates;
+          }
+
+          for (auto q : gate.qubits) {
+            last_mea_gate->qubits.push_back(q);
+            last_mea_gate->mask |= uint64_t{1} << q;
+            gates_lat[q] = link_manager.AddBack(last_mea_gate, gates_lat[q]);
+            last_mea_gate->links.push_back(gates_lat[q]);
+          }
+
+          last_mea_gate->gates.push_back(&gate);
+
+          ++stat.num_mea_gates;
+        } else {
+          gates_seq.push_back({&gate, {}, {}, {}, 0, kZero});
+          auto& fgate = gates_seq.back();
+
+          if (gate.controlled_by.size() == 0) {
+            if (max_gate_size < gate.qubits.size()) {
+              max_gate_size = gate.qubits.size();
+            }
+
+            unsigned num_gate_qubits = gate.qubits.size();
+            unsigned size = std::max(max_fused_size, num_gate_qubits);
+
+            // Reserve up front for a gate fused up to `size` qubits.
+            fgate.qubits.reserve(size);
+            fgate.links.reserve(size);
+            fgate.gates.reserve(4 * size);
+
+            if (fgates[num_gate_qubits].empty()) {
+              fgates[num_gate_qubits].reserve(num_gates);
+            }
+            fgates[num_gate_qubits].push_back(&fgate);
+
+            ++stat.num_gates[num_gate_qubits];
+          } else {
+            // Controlled gate.
+            // Controlled gates are not fused with other gates.
+
+            uint64_t size = gate.qubits.size() + gate.controlled_by.size();
+
+            fgate.qubits.reserve(gate.qubits.size());
+            fgate.links.reserve(size);
+
+            fgate.visited = kMeaCnt;
+            fgate.gates.push_back(&gate);
+
+            ++stat.num_controlled_gates;
+          }
+
+          for (auto q : gate.qubits) {
+            fgate.qubits.push_back(q);
+            fgate.mask |= uint64_t{1} << q;
+            gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]);
+            fgate.links.push_back(gates_lat[q]);
+          }
+
+          for (auto q : gate.controlled_by) {
+            fgate.mask |= uint64_t{1} << q;
+            gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]);
+            fgate.links.push_back(gates_lat[q]);
+          }
+        }
+      }
+
+      // Fuse large gates with smaller gates.
+      FuseGates(max_gate_size, fgates);
+
+      if (max_fused_size > 2) {
+        FuseGateSequences(
+            max_fused_size, max_qubit1, scratch, gates_seq, stat, fused_gates);
+      } else {
+        unsigned prev_time = 0;
+
+        std::vector<GateF*> orphaned_gates;
+        orphaned_gates.reserve(max_qubit1);
+
+        for (auto& fgate : gates_seq) {
+          if (fgate.gates.size() == 0) continue;
+
+          if (prev_time != fgate.parent->time) {
+            if (orphaned_gates.size() > 0) {
+              FuseOrphanedGates(
+                  max_fused_size, stat, orphaned_gates, fused_gates);
+              orphaned_gates.resize(0);
+            }
+
+            prev_time = fgate.parent->time;
+          }
+
+          if (fgate.qubits.size() == 1 && max_fused_size > 1
+              && fgate.visited != kMeaCnt && !fgate.parent->unfusible) {
+            orphaned_gates.push_back(&fgate);
+            continue;
+          }
+
+          // Assume fgate.qubits (gate.qubits) are sorted.
+          fused_gates.push_back({fgate.parent->kind, fgate.parent->time,
+                                 std::move(fgate.qubits), fgate.parent,
+                                 std::move(fgate.gates), {}});
+
+          if (fgate.visited != kMeaCnt) {
+            ++stat.num_fused_gates;
+          }
+        }
+
+        if (orphaned_gates.size() > 0) {
+          FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
+        }
+      }
+
+      if (fgates[0].size() != 0) {
+        Base::FuseZeroQubitGates(fgates[0],
+                                 [](const GateF* g) { return g->parent; },
+                                 last_fused_gate_index, fused_gates);
+      }
+
+      last_fused_gate_index = fused_gates.size();
+    }
+
+    if (fuse_matrix) {
+      for (auto& fgate : fused_gates) {
+        if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp) {
+          CalculateFusedMatrix(fgate);
+        }
+      }
+    }
+
+    PrintStat(param.verbosity, stat, fused_gates);
+
+    return fused_gates;
+  }
+
+ private:
+  // Fuse large gates with smaller gates: every still-unvisited gate, largest
+  // size first, is marked kFirst and run through FusePrev and FuseNext.
+  static void FuseGates(uint64_t max_gate_size,
+                        std::vector<std::vector<GateF*>>& fgates) {
+    for (uint64_t size = max_gate_size; size > 0; --size) {
+      auto& bucket = fgates[size];
+      std::size_t kept = 0;
+
+      for (auto fgate : bucket) {
+        if (fgate->visited > kZero) continue;
+
+        bucket[kept++] = fgate;
+        fgate->visited = kFirst;
+
+        FusePrev(0, *fgate);
+        fgate->gates.push_back(fgate->parent);
+        FuseNext(0, *fgate);
+      }
+
+      bucket.resize(kept);
+    }
+  }
+
+  // Try to fuse gate sequences as follows. Gate time goes from bottom to top.
+  // Gates are fused either from left to right or from right to left.
+  //
+  // max_fused_size = 3: _-  or  -_
+  //
+  // max_fused_size = 4: _-_
+  //
+  // max_fused_size = 5: _-_-  or  -_-_
+  //
+  // max_fused_size = 6: _-_-_
+  static void FuseGateSequences(unsigned max_fused_size,
+                                unsigned max_qubit1, Scratch& scratch,
+                                std::vector<GateF>& gates_seq, Stat& stat,
+                                std::vector<GateFused>& fused_gates) {
+    unsigned prev_time = 0;
+
+    std::vector<GateF*> orphaned_gates;
+    orphaned_gates.reserve(max_qubit1);
+
+    for (auto& fgate : gates_seq) {
+      if (prev_time != fgate.parent->time) {
+        if (orphaned_gates.size() > 0) {
+          FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
+          orphaned_gates.resize(0);
+        }
+
+        prev_time = fgate.parent->time;
+      }
+
+      if (fgate.visited == kFinal || fgate.gates.size() == 0) continue;
+
+      if (fgate.visited == kMeaCnt || fgate.qubits.size() >= max_fused_size
+          || fgate.parent->unfusible) {
+        if (fgate.visited != kMeaCnt) {
+          ++stat.num_fused_gates;
+        }
+
+        fgate.visited = kFinal;
+
+        fused_gates.push_back({fgate.parent->kind, fgate.parent->time,
+                               std::move(fgate.qubits), fgate.parent,
+                               std::move(fgate.gates), {}});
+
+        continue;
+      }
+
+      // Single-qubit gates are set aside and fused together as orphans.
+      if (fgate.qubits.size() == 1 && max_fused_size > 1) {
+        orphaned_gates.push_back(&fgate);
+        continue;
+      }
+
+      scratch.data.resize(0);
+      scratch.gates.resize(0);
+      scratch.count = 0;
+
+      MakeGateSequence(max_fused_size, scratch, fgate);
+      // An empty result means no sequence was built: treat fgate as orphan.
+      if (scratch.gates.size() == 0) {
+        orphaned_gates.push_back(&fgate);
+      } else {
+        for (auto fgate : scratch.gates) {
+          std::sort(fgate->qubits.begin(), fgate->qubits.end());
+
+          fused_gates.push_back({fgate->parent->kind, fgate->parent->time,
+                                 std::move(fgate->qubits), fgate->parent,
+                                 std::move(fgate->gates), {}});
+
+          ++stat.num_fused_gates;
+        }
+      }
+    }
+
+    if (orphaned_gates.size() > 0) {
+      FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
+    }
+  }
+
+  static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat,
+                                std::vector<GateF*>& orphaned_gates,
+                                std::vector<GateFused>& fused_gates) {
+    for (std::size_t i = 0; i < orphaned_gates.size(); ++i) {
+      auto ogate1 = orphaned_gates[i];
+      // Skip orphans that were already merged into an earlier one.
+      if (ogate1->visited == kFinal) continue;
+
+      ogate1->visited = kFinal;
+
+      for (std::size_t j = i + 1; j < orphaned_gates.size(); ++j) {
+        auto ogate2 = orphaned_gates[j];
+
+        if (ogate2->visited == kFinal) continue;
+        // Size is checked against ogate1 as grown by the merges so far.
+        unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size();
+
+        if (cur_size <= max_fused_size) {
+          ogate2->visited = kFinal;
+
+          for (auto q : ogate2->qubits) {
+            ogate1->qubits.push_back(q);
+            ogate1->mask |= uint64_t{1} << q;
+          }
+
+          for (auto l : ogate2->links) {
+            ogate1->links.push_back(l);
+          }
+
+          for (auto gate : ogate2->gates) {
+            ogate1->gates.push_back(gate);
+          }
+        }
+
+        if (cur_size == max_fused_size) {
+          break;
+        }
+      }
+
+      FuseNext(1, *ogate1);
+
+      std::sort(ogate1->qubits.begin(), ogate1->qubits.end());
+
+      fused_gates.push_back({ogate1->parent->kind, ogate1->parent->time,
+                             std::move(ogate1->qubits), ogate1->parent,
+                             std::move(ogate1->gates), {}});
+
+      ++stat.num_fused_gates;
+    }
+  }
+
+  static void MakeGateSequence(
+      unsigned max_fused_size, Scratch& scratch, GateF& fgate) {
+    unsigned level = kSecond + scratch.count;
+
+    FindLongestGateSequence(max_fused_size, level, scratch, fgate);
+    // Copy: recursive calls below overwrite scratch.longest_seq.
+    auto longest_seq = scratch.longest_seq;
+    // A lone top-level gate is left as kFirst for the caller to handle.
+    if (longest_seq.size() == 1 && scratch.count == 0) {
+      fgate.visited = kFirst;
+      return;
+    }
+
+    ++scratch.count;
+    // Merge the qubits and links added by each gate of the sequence.
+    for (auto p : longest_seq) {
+      p->gate->visited = kCompress;
+
+      for (auto q : p->qubits) {
+        fgate.qubits.push_back(q);
+        fgate.mask |= uint64_t{1} << q;
+      }
+
+      for (auto l : p->links) {
+        fgate.links.push_back(l);
+      }
+    }
+
+    // Compress links.
+    for (auto& link : fgate.links) {
+      while (link->prev != nullptr && link->prev->val->visited == kCompress) {
+        link = link->prev;
+      }
+
+      while (link->next != nullptr && link->next->val->visited == kCompress) {
+        LinkManager::Delete(link->next);
+      }
+    }
+
+    for (auto p : longest_seq) {
+      p->gate->visited = level;
+    }
+
+    if (longest_seq.size() >= 3) {
+      AddGatesFromNext(longest_seq[2]->gate->gates, fgate);
+    }
+
+    if (longest_seq.size() >= 5) {
+      AddGatesFromNext(longest_seq[4]->gate->gates, fgate);
+    }
+
+    if (longest_seq.size() >= 2) {
+      // May call MakeGateSequence recursively.
+      AddGatesFromPrev(max_fused_size, *longest_seq[1]->gate, scratch, fgate);
+    }
+
+    if (longest_seq.size() >= 4) {
+      // May call MakeGateSequence recursively.
+      AddGatesFromPrev(max_fused_size, *longest_seq[3]->gate, scratch, fgate);
+    }
+
+    for (auto p : longest_seq) {
+      p->gate->visited = kFinal;
+    }
+
+    FuseNext(1, fgate);
+
+    scratch.gates.push_back(&fgate);
+  }
+
+  static void AddGatesFromNext(std::vector<const RGate*>& gates, GateF& fgate) {
+    // Append all of gates, in order, to the gates already fused into fgate.
+    auto& fused = fgate.gates;
+    fused.insert(fused.end(), gates.begin(), gates.end());
+  }
+
+  static void AddGatesFromPrev(unsigned max_fused_size, const GateF& pfgate,
+                               Scratch& scratch, GateF& fgate) {
+    // Take over the gates already fused into pfgate.
+    fgate.gates.insert(
+        fgate.gates.end(), pfgate.gates.begin(), pfgate.gates.end());
+
+    // Gates that directly precede pfgate in the lattice and are still only
+    // partially fused (kFirst) get sequences of their own.
+    for (auto link : pfgate.links) {
+      auto before = link->prev;
+
+      if (before != nullptr && before->val->visited == kFirst) {
+        MakeGateSequence(max_fused_size, scratch, *before->val);
+      }
+    }
+  }
+
+  static void FindLongestGateSequence(
+      unsigned max_fused_size, unsigned level, Scratch& scratch, GateF& fgate) {
+    scratch.data.push_back({&fgate, {}, {}});
+    // NOTE(review): assumes scratch.data never reallocates (GateA* kept).
+    scratch.longest_seq.resize(0);
+    scratch.longest_seq.push_back(&scratch.data.back());
+
+    scratch.stack.resize(0);
+    scratch.stack.push_back(&scratch.data.back());
+
+    unsigned cur_size = fgate.qubits.size();
+    fgate.visited = level;
+
+    unsigned max_size = cur_size;
+
+    GetNextAvailableGates(max_fused_size, cur_size, fgate, nullptr,
+                          scratch.data, scratch.next1);
+
+    for (auto n1 : scratch.next1) {
+      unsigned cur_size2 = cur_size + n1->qubits.size();
+      if (cur_size2 > max_fused_size) continue;
+
+      bool feasible = GetPrevAvailableGates(max_fused_size, cur_size,
+                                            level, *n1->gate, nullptr,
+                                            scratch.data, scratch.prev1);
+
+      if (!feasible) continue;
+
+      if (scratch.prev1.size() == 0 && max_fused_size > 3) continue;
+      // Reaching max_fused_size exactly cannot be improved upon.
+      if (cur_size2 == max_fused_size) {
+        std::swap(scratch.longest_seq, scratch.stack);
+        scratch.longest_seq.push_back(n1);
+        return;
+      }
+
+      Push(level, cur_size2, cur_size, max_size, scratch, n1);
+
+      for (auto p1 : scratch.prev1) {
+        unsigned cur_size2 = cur_size + p1->qubits.size();
+
+        if (cur_size2 > max_fused_size) {
+          continue;
+        } else if (cur_size2 == max_fused_size) {
+          std::swap(scratch.longest_seq, scratch.stack);
+          scratch.longest_seq.push_back(p1);
+          return;
+        }
+
+        Push(level, cur_size2, cur_size, max_size, scratch, p1);
+
+        GetNextAvailableGates(max_fused_size, cur_size, *p1->gate, &fgate,
+                              scratch.data, scratch.next2);
+
+        for (auto n2 : scratch.next2) {
+          unsigned cur_size2 = cur_size + n2->qubits.size();
+          if (cur_size2 > max_fused_size) continue;
+
+          bool feasible = GetPrevAvailableGates(max_fused_size, cur_size,
+                                                level, *n2->gate, n1->gate,
+                                                scratch.data, scratch.prev2);
+
+          if (!feasible) continue;
+
+          if (cur_size2 == max_fused_size) {
+            std::swap(scratch.longest_seq, scratch.stack);
+            scratch.longest_seq.push_back(n2);
+            return;
+          }
+
+          Push(level, cur_size2, cur_size, max_size, scratch, n2);
+
+          for (auto p2 : scratch.prev2) {
+            unsigned cur_size2 = cur_size + p2->qubits.size();
+
+            if (cur_size2 > max_fused_size) {
+              continue;
+            } else if (cur_size2 == max_fused_size) {
+              std::swap(scratch.longest_seq, scratch.stack);
+              scratch.longest_seq.push_back(p2);
+              return;
+            }
+            // Deepest level: just remember the best sequence so far.
+            if (cur_size2 > max_size) {
+              scratch.stack.push_back(p2);
+              scratch.longest_seq = scratch.stack;
+              scratch.stack.pop_back();
+              max_size = cur_size2;
+            }
+          }
+
+          Pop(cur_size, scratch, n2);
+        }
+
+        Pop(cur_size, scratch, p1);
+      }
+
+      Pop(cur_size, scratch, n1);
+    }
+  }
+
+  static void Push(unsigned level, unsigned cur_size2, unsigned& cur_size,
+                   unsigned& max_size, Scratch& scratch, GateA* agate) {
+    // Put agate on the stack and mark its gate with the current level.
+    scratch.stack.push_back(agate);
+    agate->gate->visited = level;
+    cur_size = cur_size2;
+    if (cur_size <= max_size) return;
+
+    max_size = cur_size;  // New record: remember the current stack.
+    scratch.longest_seq = scratch.stack;
+  }
+
+  static void Pop(unsigned& cur_size, Scratch& scratch, GateA* agate) {
+    scratch.stack.pop_back();  // agate is expected to be the top entry.
+    cur_size -= agate->qubits.size();
+    agate->gate->visited = kFirst;  // Reset the mark to kFirst.
+  }
+
+  static void GetNextAvailableGates(unsigned max_fused_size, unsigned cur_size,
+                                    const GateF& pgate1, const GateF* pgate2,
+                                    std::vector<GateA>& scratch,
+                                    std::vector<GateA*>& next_gates) {
+    // Gathers the successors of pgate1 that are still candidates for fusion.
+    next_gates.clear();
+
+    for (auto lnk : pgate1.links) {
+      if (lnk->next == nullptr) continue;
+      auto succ = lnk->next->val;
+
+      // Skip successors that are already marked or must not be fused.
+      if (succ->visited > kFirst || succ->parent->unfusible) continue;
+
+      GateA cand = {succ, {}, {}};
+      cand.qubits.reserve(8);
+      cand.links.reserve(8);
+      GetAddedQubits(pgate1, pgate2, *succ, cand);
+      // Keep the candidate only if its additional qubits still fit.
+      if (cur_size + cand.qubits.size() <= max_fused_size) {
+        scratch.push_back(std::move(cand));
+        next_gates.push_back(&scratch.back());
+      }
+    }
+  }
+
+  // Collects into prev_gates the predecessors of ngate1 that can be added to
+  // the current sequence without exceeding max_fused_size. Returns false and
+  // leaves prev_gates empty as soon as one predecessor rules ngate1 out.
+  static bool GetPrevAvailableGates(unsigned max_fused_size,
+                                    unsigned cur_size, unsigned level,
+                                    const GateF& ngate1, const GateF* ngate2,
+                                    std::vector<GateA>& scratch,
+                                    std::vector<GateA*>& prev_gates) {
+    prev_gates.resize(0);
+
+    for (auto link : ngate1.links) {
+      if (link->prev == nullptr) continue;
+
+      auto pgate = link->prev->val;
+
+      // Gates marked kFinal or marked with the current level are skipped.
+      if (pgate->visited == kFinal || pgate->visited == level) continue;
+
+      if (pgate->visited > kFirst || pgate->parent->unfusible) {
+        prev_gates.resize(0);
+        return false;
+      }
+
+      // Every predecessor of pgate must carry a visited mark above kMeaCnt.
+      // This is checked before building the candidate so that no work is
+      // wasted on the infeasible path.
+      for (auto plink : pgate->links) {
+        if (plink->prev == nullptr) continue;
+
+        if (plink->prev->val->visited <= kMeaCnt) {
+          prev_gates.resize(0);
+          return false;
+        }
+      }
+
+      GateA prev = {pgate, {}, {}};
+      prev.qubits.reserve(8);
+      prev.links.reserve(8);
+
+      GetAddedQubits(ngate1, ngate2, *pgate, prev);
+
+      // Too many new qubits: skip this predecessor but keep looking.
+      if (cur_size + prev.qubits.size() > max_fused_size) continue;
+
+      // prev_gates holds pointers into scratch (the caller's GateA storage).
+      scratch.push_back(std::move(prev));
+      prev_gates.push_back(&scratch.back());
+    }
+
+    return true;
+  }
+
+  static void GetAddedQubits(const GateF& fgate0, const GateF* fgate1,
+                             const GateF& fgate2, GateA& added) {
+    // True if fused gate g acts on qubit q.
+    auto acts_on = [](const GateF& g, unsigned q) -> bool {
+      return std::find(g.qubits.begin(), g.qubits.end(), q) != g.qubits.end();
+    };
+
+    // Keep the qubits/links of fgate2 not covered by fgate0 or fgate1.
+    for (std::size_t k = 0; k < fgate2.qubits.size(); ++k) {
+      unsigned qubit = fgate2.qubits[k];
+      if (acts_on(fgate0, qubit)) continue;
+      if (fgate1 != nullptr && acts_on(*fgate1, qubit)) continue;
+      added.qubits.push_back(qubit);
+      added.links.push_back(fgate2.links[k]);
+    }
+  }
+
+  // Fuses smaller gates into fgate, walking backwards in gate time (via the
+  // prev pointers of the links).
+  static void FusePrev(unsigned pass, GateF& fgate) {
+    // Gather the fused gates in a temporary vector first.
+    std::vector<const RGate*> collected;
+    collected.reserve(fgate.gates.capacity());
+
+    auto prev_of = [](const Link* l) -> const Link* { return l->prev; };
+
+    FusePrevOrNext<std::greater<unsigned>>(pass, prev_of, fgate, collected);
+
+    // Append what was collected to fgate.gates in reverse order of collection.
+    fgate.gates.insert(
+        fgate.gates.end(), collected.rbegin(), collected.rend());
+  }
+
+  // Fuses smaller gates into fgate, walking forward in gate time.
+  static void FuseNext(unsigned pass, GateF& fgate) {
+    // Step from a link to its successor; the fused gates are appended
+    // directly to fgate.gates.
+    auto next_of = [](const Link* l) -> const Link* { return l->next; };
+
+    FusePrevOrNext<std::less<unsigned>>(pass, next_of, fgate, fgate.gates);
+  }
+
+  template <typename R, typename Neighbor>
+  static void FusePrevOrNext(unsigned pass, Neighbor neighb, GateF& fgate,
+                             std::vector<const RGate*>& gates) {
+    uint64_t bad_mask = 0;  // Qubits of neighbors that were rejected.
+    auto links = fgate.links;  // Local copy, re-sorted on every round.
+    // Each round fuses at most one neighbor of fgate, then starts over.
+    bool may_have_gates_to_fuse = true;
+
+    while (may_have_gates_to_fuse) {
+      may_have_gates_to_fuse = false;
+      // Order links by neighbor time using R; links without neighbor go last.
+      std::sort(links.begin(), links.end(),
+                [&neighb](const Link* l, const Link* r) -> bool {
+                  auto ln = neighb(l);
+                  auto rn = neighb(r);
+
+                  if (ln != nullptr && rn != nullptr) {
+                    return R()(ln->val->parent->time, rn->val->parent->time);
+                  } else {
+                    // nullptrs are larger than everything else and
+                    // equivalent among each other.
+                    return ln != nullptr;
+                  }
+                });
+
+      for (auto link : links) {
+        auto n = neighb(link);
+
+        if (n == nullptr) continue;
+
+        auto g = n->val;
+        // A neighbor that cannot be fused only adds its qubits to bad_mask.
+        if (!QubitsAreIn(fgate.mask, g->mask) || (g->mask & bad_mask) != 0
+            || g->visited > pass || g->parent->unfusible) {
+          bad_mask |= g->mask;
+        } else {
+          g->visited = pass == 0 ? kFirst : kFinal;
+          // Pass 0 collects g->parent; later passes collect all of g->gates.
+          if (pass == 0) {
+            gates.push_back(g->parent);
+          } else {
+            for (auto gate : g->gates) {
+              gates.push_back(gate);
+            }
+          }
+
+          for (auto link : g->links) {
+            LinkManager::Delete(link);
+          }
+          // One gate was fused: go back to the sort and scan again.
+          may_have_gates_to_fuse = true;
+          break;
+        }
+      }
+    }
+  }
+
+  static bool QubitsAreIn(uint64_t mask0, uint64_t mask) {
+    return (mask & ~mask0) == 0;  // True if mask has no bit outside mask0.
+  }
+
+  static void PrintStat(unsigned verbosity, const Stat& stat,
+                        const std::vector<GateFused>& fused_gates) {
+    // Summary statistics are printed from verbosity level 3.
+    if (verbosity < 3) return;
+
+    if (stat.num_controlled_gates > 0) {
+      IO::messagef("%lu controlled gates\n", stat.num_controlled_gates);
+    }
+
+    if (stat.num_mea_gates > 0) {
+      IO::messagef("%lu measurement gates", stat.num_mea_gates);
+      if (stat.num_fused_mea_gates != stat.num_mea_gates) {
+        IO::messagef(" are fused into %lu gates\n", stat.num_fused_mea_gates);
+      } else {
+        IO::messagef("\n");
+      }
+    }
+
+    // Comma-separated list of "<count> <n>-qubit" entries for non-zero counts.
+    bool need_separator = false;
+    for (unsigned n = 1; n < stat.num_gates.size(); ++n) {
+      if (stat.num_gates[n] > 0) {
+        if (need_separator) IO::messagef(", ");
+        need_separator = true;
+        IO::messagef("%u %u-qubit", stat.num_gates[n], n);
+      }
+    }
+
+    IO::messagef(" gates are fused into %lu gates\n", stat.num_fused_gates);
+    // The per-gate qubit listing is printed from verbosity level 5.
+    if (verbosity < 5) return;
+
+    IO::messagef("fused gate qubits:\n");
+    for (const auto& fused : fused_gates) {
+      const auto& parent = *fused.parent;
+      IO::messagef("%6u  ", parent.time);
+      // Prefix: "m" for measurements, "c <controls>  t" for controlled gates.
+      if (parent.kind == gate::kMeasurement) {
+        IO::messagef("m");
+      } else if (!parent.controlled_by.empty()) {
+        IO::messagef("c");
+        for (auto control : parent.controlled_by) {
+          IO::messagef("%3u", control);
+        }
+        IO::messagef("  t");
+      } else {
+        IO::messagef(" ");
+      }
+      for (auto qubit : fused.qubits) {
+        IO::messagef("%3u", qubit);
+      }
+      IO::messagef("\n");
+    }
+  }
+
+  // Checks that all qubits and control qubits of gate are below max_qubit1 and
+  // that gate is strictly later in time than the gate recorded in gates_lat
+  // for each of them (if any). Reports the first violation via IO::errorf
+  // and returns false; returns true if no violation is found.
+  template <typename Gate2, typename GatesLat>
+  static bool ValidateGate(const Gate2& gate, unsigned max_qubit1,
+                           const GatesLat& gates_lat) {
+    auto valid = [&gate, max_qubit1, &gates_lat](unsigned q) -> bool {
+      if (q >= max_qubit1) {
+        IO::errorf("fuser: gate qubit %u is out of range "
+                   "(should be smaller than %u).\n", q, max_qubit1);
+        return false;
+      }
+
+      if (gates_lat[q] != nullptr
+          && gate.time <= gates_lat[q]->val->parent->time) {
+        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
+        return false;
+      }
+      return true;
+    };
+
+    // Target qubits are validated before control qubits.
+    for (unsigned q : gate.qubits) {
+      if (!valid(q)) return false;
+    }
+    for (unsigned q : gate.controlled_by) {
+      if (!valid(q)) return false;
+    }
+    return true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // FUSER_MQUBIT_H_
diff --git a/tpls/qsim/gate.h b/tpls/qsim/gate.h
new file mode 100644
index 0000000..a457acb
--- /dev/null
+++ b/tpls/qsim/gate.h
@@ -0,0 +1,216 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GATE_H_
+#define GATE_H_
+
+#include <algorithm>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "matrix.h"
+
+namespace qsim {
+
+namespace detail {
+
+template <typename Gate, typename GateDef>
+inline void SortQubits(Gate& gate) {
+  // Nothing to do if the qubits are already in ascending order.
+  if (std::is_sorted(gate.qubits.begin(), gate.qubits.end())) return;
+
+  // Non-symmetric gates get their matrix shuffled to match the new qubit
+  // order; this must happen while gate.qubits still holds the old order.
+  if (!GateDef::symmetric) {
+    auto perm = NormalToGateOrderPermutation(gate.qubits);
+    MatrixShuffle(perm, gate.qubits.size(), gate.matrix);
+  }
+
+  gate.swapped = true;
+  std::sort(gate.qubits.begin(), gate.qubits.end());
+}
+
+}  // namespace detail
+
+template <typename Qubits = std::vector<unsigned>, typename Gate>
+inline Gate& MakeControlledGate(Qubits&& controlled_by, Gate& gate) {
+  // Take over the control qubits and keep them in ascending order.
+  gate.controlled_by = std::forward<Qubits>(controlled_by);
+  std::sort(gate.controlled_by.begin(), gate.controlled_by.end());
+  // No explicit control values: set one cmask bit per control qubit.
+  gate.cmask = (uint64_t{1} << gate.controlled_by.size()) - 1;
+  return gate;
+}
+
+// Makes gate controlled by the given qubits with explicit control values.
+// controlled_by and control_values are assumed to have the same size. Bit i
+// of gate.cmask is the control value of the i-th smallest control qubit.
+template <typename Qubits = std::vector<unsigned>, typename Gate>
+inline Gate& MakeControlledGate(Qubits&& controlled_by,
+                               const std::vector<unsigned>& control_values,
+                               Gate& gate) {
+  bool sorted = true;
+  for (std::size_t i = 1; i < controlled_by.size(); ++i) {
+    if (controlled_by[i - 1] > controlled_by[i]) {
+      sorted = false;
+      break;
+    }
+  }
+
+  if (sorted) {
+    gate.controlled_by = std::forward<Qubits>(controlled_by);
+    gate.cmask = 0;
+    // Widen to 64 bits before shifting so that bits 32..63 are not lost.
+    for (std::size_t i = 0; i < control_values.size(); ++i) {
+      gate.cmask |= uint64_t{control_values[i] & 1} << i;
+    }
+  } else {
+    struct ControlPair {
+      unsigned q;
+      unsigned v;
+    };
+
+    std::vector<ControlPair> cpairs;
+    cpairs.reserve(controlled_by.size());
+
+    for (std::size_t i = 0; i < controlled_by.size(); ++i) {
+      cpairs.push_back({controlled_by[i], control_values[i]});
+    }
+
+    // Sort control qubits and control values.
+    std::sort(cpairs.begin(), cpairs.end(),
+              [](const ControlPair& l, const ControlPair& r) -> bool {
+                return l.q < r.q;
+              });
+    // Replace any previous controls, exactly as the sorted branch does.
+    gate.cmask = 0;
+    gate.controlled_by.clear();
+    gate.controlled_by.reserve(cpairs.size());
+    for (std::size_t i = 0; i < cpairs.size(); ++i) {
+      gate.cmask |= uint64_t{cpairs[i].v & 1} << i;
+      gate.controlled_by.push_back(cpairs[i].q);
+    }
+  }
+
+  return gate;
+}
+
+namespace gate {
+// Kind values for special gates; Cirq::GateKind reuses these values.
+constexpr int kDecomp = 100001;       // gate from Schmidt decomposition
+constexpr int kMeasurement = 100002;  // measurement gate
+
+}  // namespace gate
+
+enum GateAnyKind {  // Default kind enumeration (GK) of the Gate template.
+  kGateAny = -1,
+};
+
+/**
+ * A generic gate to make it easier to use qsim with external gate sets.
+ */
+template <typename FP, typename GK = GateAnyKind>
+struct Gate {
+  using fp_type = FP;
+  using GateKind = GK;
+  // NOTE: CreateGate aggregate-initializes Gate, so member order matters.
+  GateKind kind;
+  unsigned time;
+  std::vector<unsigned> qubits;         // Qubits the gate acts on.
+  std::vector<unsigned> controlled_by;  // Control qubits, if any.
+  uint64_t cmask;      // Control values: bit i belongs to controlled_by[i].
+  std::vector<fp_type> params;
+  Matrix<fp_type> matrix;
+  bool unfusible;      // If true, the gate is fused as a parent.
+  bool swapped;        // If true, the gate qubits are swapped to make qubits
+                       // ordered in ascending order. This does not apply to
+                       // control qubits of explicitly-controlled gates.
+  // Sets the control qubits (one cmask bit each); returns *this as an rvalue.
+  template <typename Qubits = std::vector<unsigned>>
+  Gate&& ControlledBy(Qubits&& controlled_by) {
+    MakeControlledGate(std::forward<Qubits>(controlled_by), *this);
+    return std::move(*this);
+  }
+  // As above, but with an explicit control value for each control qubit.
+  template <typename Qubits = std::vector<unsigned>>
+  Gate&& ControlledBy(Qubits&& controlled_by,
+                      const std::vector<unsigned>& control_values) {
+    MakeControlledGate(
+        std::forward<Qubits>(controlled_by), control_values, *this);
+    return std::move(*this);
+  }
+};
+
+// Creates a gate of the kind given by GateDef. Except for measurement gates,
+// the qubits are put in ascending order; gate.swapped records a reordering
+// and the matrix of a non-symmetric gate is shuffled accordingly.
+template <typename Gate, typename GateDef,
+          typename Qubits = std::vector<unsigned>,
+          typename M = Matrix<typename Gate::fp_type>>
+inline Gate CreateGate(unsigned time, Qubits&& qubits, M&& matrix = {},
+                       std::vector<typename Gate::fp_type>&& params = {}) {
+  Gate gate = {GateDef::kind, time, std::forward<Qubits>(qubits), {}, 0,
+               std::move(params), std::forward<M>(matrix), false, false};
+
+  if (GateDef::kind == gate::kMeasurement) return gate;
+
+  const auto num_qubits = gate.qubits.size();
+  if (num_qubits == 2) {
+    // Two qubits: a plain swap is enough.
+    if (gate.qubits[0] > gate.qubits[1]) {
+      gate.swapped = true;
+      std::swap(gate.qubits[0], gate.qubits[1]);
+      if (!GateDef::symmetric) MatrixShuffle({1, 0}, 2, gate.matrix);
+    }
+  } else if (num_qubits != 1) {
+    // Single-qubit gates need no sorting; everything else is sorted here.
+    detail::SortQubits<Gate, GateDef>(gate);
+  }
+
+  return gate;
+}
+
+namespace gate {
+
+/**
+ * A gate that simulates measurement of one or more qubits, collapsing the
+ * state vector and storing the measured results.
+ */
+template <typename Gate>
+struct Measurement {
+  using GateKind = typename Gate::GateKind;
+
+  static constexpr GateKind kind = GateKind::kMeasurement;
+  static constexpr char name[] = "m";
+  static constexpr bool symmetric = false;
+  // Creates a measurement gate; CreateGate leaves its qubits unsorted.
+  template <typename Qubits = std::vector<unsigned>>
+  static Gate Create(unsigned time, Qubits&& qubits) {
+    return CreateGate<Gate, Measurement>(time, std::forward<Qubits>(qubits));
+  }
+};
+
+}  // namespace gate
+
+template <typename fp_type>
+using schmidt_decomp_type = std::vector<std::vector<std::vector<fp_type>>>;
+// Declared here only (no definition in this header); presumably per gate set.
+template <typename fp_type, typename GateKind>
+schmidt_decomp_type<fp_type> GetSchmidtDecomp(
+    GateKind kind, const std::vector<fp_type>& params);
+
+}  // namespace qsim
+
+#endif  // GATE_H_
diff --git a/tpls/qsim/gate_appl.h b/tpls/qsim/gate_appl.h
new file mode 100644
index 0000000..8601e6f
--- /dev/null
+++ b/tpls/qsim/gate_appl.h
@@ -0,0 +1,231 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GATE_APPL_H_
+#define GATE_APPL_H_
+
+#include <utility>
+#include <vector>
+
+#include "fuser.h"
+#include "gate.h"
+#include "matrix.h"
+
+namespace qsim {
+
+/**
+ * Applies a gate to the simulator state. Measurement gates are skipped.
+ * @param simulator Simulator object that provides the concrete gate
+ *   application routines.
+ * @param gate The gate to apply.
+ * @param state The system state; modified in place.
+ */
+template <typename Simulator, typename Gate>
+inline void ApplyGate(const Simulator& simulator, const Gate& gate,
+                      typename Simulator::State& state) {
+  if (gate.kind == gate::kMeasurement) return;
+
+  if (gate.controlled_by.empty()) {
+    simulator.ApplyGate(gate.qubits, gate.matrix.data(), state);
+  } else {
+    simulator.ApplyControlledGate(gate.qubits, gate.controlled_by,
+                                  gate.cmask, gate.matrix.data(), state);
+  }
+}
+
+/**
+ * Applies the dagger of a gate to the simulator state. For a unitary gate
+ *   matrix this is the same as applying the inverse gate. Measurement gates
+ *   are skipped.
+ * @param simulator Simulator object that provides the concrete gate
+ *   application routines.
+ * @param gate The gate whose dagger is applied.
+ * @param state The system state; modified in place.
+ */
+template <typename Simulator, typename Gate>
+inline void ApplyGateDagger(const Simulator& simulator, const Gate& gate,
+                            typename Simulator::State& state) {
+  if (gate.kind == gate::kMeasurement) return;
+
+  // Dagger a copy of the matrix; the gate itself is left untouched.
+  auto dagger = gate.matrix;
+  MatrixDagger(unsigned{1} << gate.qubits.size(), dagger);
+  if (gate.controlled_by.empty()) {
+    simulator.ApplyGate(gate.qubits, dagger.data(), state);
+  } else {
+    simulator.ApplyControlledGate(gate.qubits, gate.controlled_by,
+                                  gate.cmask, dagger.data(), state);
+  }
+}
+
+/**
+ * Applies a gate to the simulator state, performing measurements.
+ * @param state_space StateSpace object used to perform measurements.
+ * @param simulator Simulator object that provides the concrete gate
+ *   application routines.
+ * @param gate The gate to apply.
+ * @param rgen Random number generator used for measurements.
+ * @param state The system state; modified in place.
+ * @param mresults Results of earlier measurements (may be empty). If gate
+ *   is a measurement gate and the measurement succeeds, its result is
+ *   appended to this vector.
+ * @return False if gate is a measurement gate and the measurement failed;
+ *   true otherwise.
+ */
+template <typename Simulator, typename Gate, typename Rgen>
+inline bool ApplyGate(
+    const typename Simulator::StateSpace& state_space,
+    const Simulator& simulator, const Gate& gate, Rgen& rgen,
+    typename Simulator::State& state,
+    std::vector<typename Simulator::StateSpace::MeasurementResult>& mresults) {
+  if (gate.kind != gate::kMeasurement) {
+    ApplyGate(simulator, gate, state);
+    return true;
+  }
+
+  auto outcome = state_space.Measure(gate.qubits, rgen, state);
+
+  // An invalid result signals a failed measurement; nothing is recorded.
+  if (!outcome.valid) return false;
+
+  mresults.push_back(std::move(outcome));
+  return true;
+}
+
+/**
+ * Applies a gate to the simulator state and throws away any measurement
+ *   result.
+ * @param state_space StateSpace object used to perform measurements.
+ * @param simulator Simulator object that provides the concrete gate
+ *   application routines.
+ * @param gate The gate to apply.
+ * @param rgen Random number generator used for measurements.
+ * @param state The system state; modified in place.
+ * @return False if a measurement was attempted and failed; true otherwise.
+ */
+template <typename Simulator, typename Gate, typename Rgen>
+inline bool ApplyGate(const typename Simulator::StateSpace& state_space,
+                      const Simulator& simulator, const Gate& gate, Rgen& rgen,
+                      typename Simulator::State& state) {
+  // Measurement results go into a local vector that is dropped on return.
+  std::vector<typename Simulator::StateSpace::MeasurementResult> unused;
+
+  return ApplyGate(state_space, simulator, gate, rgen, state, unused);
+}
+
+/**
+ * Applies a fused gate to the simulator state. Measurement gates are
+ *   skipped.
+ * @param simulator Simulator object that provides the concrete gate
+ *   application routines.
+ * @param gate The fused gate to apply.
+ * @param state The system state; modified in place.
+ */
+template <typename Simulator, typename Gate>
+inline void ApplyFusedGate(const Simulator& simulator, const Gate& gate,
+                           typename Simulator::State& state) {
+  if (gate.kind == gate::kMeasurement) return;
+  // Control qubits and control mask are read from the parent gate.
+  const auto& parent = *gate.parent;
+  if (parent.controlled_by.empty()) {
+    simulator.ApplyGate(gate.qubits, gate.matrix.data(), state);
+  } else {
+    simulator.ApplyControlledGate(gate.qubits, parent.controlled_by,
+                                  parent.cmask, gate.matrix.data(), state);
+  }
+}
+
+/**
+ * Applies the dagger of a fused gate to the simulator state. For a unitary
+ *   gate matrix this is the same as applying the inverse gate. Measurement
+ *   gates are skipped.
+ * @param simulator Simulator object that provides the concrete gate
+ *   application routines.
+ * @param gate The fused gate whose dagger is applied.
+ * @param state The system state; modified in place.
+ */
+template <typename Simulator, typename Gate>
+inline void ApplyFusedGateDagger(const Simulator& simulator, const Gate& gate,
+                                 typename Simulator::State& state) {
+  if (gate.kind == gate::kMeasurement) return;
+
+  // Dagger a copy of the matrix; the fused gate itself is left untouched.
+  auto dagger = gate.matrix;
+  MatrixDagger(unsigned{1} << gate.qubits.size(), dagger);
+  if (gate.parent->controlled_by.empty()) {
+    simulator.ApplyGate(gate.qubits, dagger.data(), state);
+  } else {
+    simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by,
+                                  gate.parent->cmask, dagger.data(), state);
+  }
+}
+
+/**
+ * Applies a fused gate to the simulator state, performing measurements.
+ * @param state_space StateSpace object used to perform measurements.
+ * @param simulator Simulator object that provides the concrete gate
+ *   application routines.
+ * @param gate The fused gate to apply.
+ * @param rgen Random number generator used for measurements.
+ * @param state The system state; modified in place.
+ * @param mresults Results of earlier measurements (may be empty). If gate
+ *   is a measurement gate and the measurement succeeds, its result is
+ *   appended to this vector.
+ * @return False if gate is a measurement gate and the measurement failed;
+ *   true otherwise.
+ */
+template <typename Simulator, typename Gate, typename Rgen>
+inline bool ApplyFusedGate(
+    const typename Simulator::StateSpace& state_space,
+    const Simulator& simulator, const Gate& gate, Rgen& rgen,
+    typename Simulator::State& state,
+    std::vector<typename Simulator::StateSpace::MeasurementResult>& mresults) {
+  if (gate.kind != gate::kMeasurement) {
+    ApplyFusedGate(simulator, gate, state);
+    return true;
+  }
+
+  auto outcome = state_space.Measure(gate.qubits, rgen, state);
+
+  // An invalid result signals a failed measurement; nothing is recorded.
+  if (!outcome.valid) return false;
+
+  mresults.push_back(std::move(outcome));
+  return true;
+}
+
+/**
+ * Applies a fused gate to the simulator state and throws away any
+ *   measurement result.
+ * @param state_space StateSpace object used to perform measurements.
+ * @param simulator Simulator object that provides the concrete gate
+ *   application routines.
+ * @param gate The fused gate to apply.
+ * @param rgen Random number generator used for measurements.
+ * @param state The system state; modified in place.
+ * @return False if a measurement was attempted and failed; true otherwise.
+ */
+template <typename Simulator, typename Gate, typename Rgen>
+inline bool ApplyFusedGate(const typename Simulator::StateSpace& state_space,
+                           const Simulator& simulator, const Gate& gate,
+                           Rgen& rgen, typename Simulator::State& state) {
+  // Measurement results go into a local vector that is dropped on return.
+  std::vector<typename Simulator::StateSpace::MeasurementResult> unused;
+
+  return ApplyFusedGate(state_space, simulator, gate, rgen, state, unused);
+}
+
+}  // namespace qsim
+
+#endif  // GATE_APPL_H_
diff --git a/tpls/qsim/gates_cirq.h b/tpls/qsim/gates_cirq.h
new file mode 100644
index 0000000..d767959
--- /dev/null
+++ b/tpls/qsim/gates_cirq.h
@@ -0,0 +1,1640 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GATES_CIRQ_H_
+#define GATES_CIRQ_H_
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <vector>
+
+#include "gate.h"
+#include "matrix.h"
+
+namespace qsim {
+
+namespace Cirq {
+
+enum GateKind {  // Tags used as the `kind` member of the gate structs.
+  kI1 = 0,     // One-qubit identity gate.
+  kI2,         // Two-qubit identity gate.
+  kI,          // Multi-qubit identity gate.
+  kXPowGate,
+  kYPowGate,
+  kZPowGate,
+  kHPowGate,
+  kCZPowGate,
+  kCXPowGate,
+  krx,
+  kry,
+  krz,
+  kH,
+  kS,
+  kCZ,
+  kCX,
+  kT,
+  kX,
+  kY,
+  kZ,
+  kPhasedXPowGate,
+  kPhasedXZGate,
+  kXXPowGate,
+  kYYPowGate,
+  kZZPowGate,
+  kXX,
+  kYY,
+  kZZ,
+  kSwapPowGate,
+  kISwapPowGate,
+  kriswap,
+  kSWAP,
+  kISWAP,
+  kPhasedISwapPowGate,
+  kgivens,
+  kFSimGate,
+  kTwoQubitDiagonalGate,
+  kThreeQubitDiagonalGate,
+  kCCZPowGate,
+  kCCXPowGate,
+  kCSwapGate,
+  kCCZ,
+  kCCX,
+  kMatrixGate1,  // One-qubit matrix gate.
+  kMatrixGate2,  // Two-qubit matrix gate.
+  kMatrixGate,   // Multi-qubit matrix gate.
+  kGlobalPhaseGate,
+  kDecomp = gate::kDecomp,  // Shares its value with gate::kDecomp.
+  kMeasurement = gate::kMeasurement,  // Shares its value with gate::kMeasurement.
+};
+
+template <typename fp_type>
+using GateCirq = Gate<fp_type, GateKind>;  // Gate keyed by the Cirq GateKind.
+
+constexpr double h_double = 0.5;  // One half.
+constexpr double pi_double = 3.14159265358979323846264338327950288;  // pi.
+constexpr double is2_double = 0.7071067811865475;  // 1 / sqrt(2).
+
+// Gates from cirq/ops/global_phase_op.py:
+
+/**
+ * The global phase gate: no qubits; (cp, sp) is passed as matrix and params.
+ */
+template <typename fp_type>
+struct GlobalPhaseGate {
+  static constexpr GateKind kind = kGlobalPhaseGate;
+  static constexpr char name[] = "GlobalPhaseGate";
+  static constexpr unsigned num_qubits = 1;  // NOTE(review): Create() passes no qubits; confirm.
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, fp_type phi) {  // phi: phase angle.
+    return Create(time, std::cos(phi), std::sin(phi));  // cp = cos(phi), sp = sin(phi).
+  }
+
+  static GateCirq<fp_type> Create(unsigned time, fp_type cp, fp_type sp) {
+    return CreateGate<GateCirq<fp_type>, GlobalPhaseGate>(
+        time, {}, {cp, sp}, {cp, sp});  // Empty qubit list; single entry cp + i*sp.
+  }
+};
+
+template <typename fp_type>
+using global_phase_operation = GlobalPhaseGate<fp_type>;  // Alternative name for the same gate.
+
+// Gates from cirq/ops/identity.py:
+
+/**
+ * A one-qubit identity gate. Matrix entries are row-major (re, im) pairs.
+ */
+template <typename fp_type>
+struct I1 {
+  static constexpr GateKind kind = kI1;
+  static constexpr char name[] = "I1";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateCirq<fp_type>, I1>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0});  // [[1, 0], [0, 1]]
+  }
+};
+
+/**
+ * A two-qubit identity gate (4x4 identity, row-major (re, im) pairs).
+ */
+template <typename fp_type>
+struct I2 {
+  static constexpr GateKind kind = kI2;
+  static constexpr char name[] = "I2";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateCirq<fp_type>, I2>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {  // Single term.
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},  // Two 2x2 identities.
+    };
+  }
+};
+
+/**
+ * A multi-qubit identity gate; acts on however many qubits are passed in.
+ */
+template <typename fp_type>
+struct I {
+  static constexpr GateKind kind = kI;
+  static constexpr char name[] = "I";
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  const std::vector<unsigned>& qubits) {
+    Matrix<fp_type> matrix;
+    MatrixIdentity(1 << qubits.size(), matrix);  // Size argument is 2^n for n qubits.
+    return CreateGate<GateCirq<fp_type>, I>(time, qubits, std::move(matrix));
+  }
+};
+
+// Gates from cirq/ops/common_gates.py:
+
+/**
+ * A gate that rotates around the X axis of the Bloch sphere.
+ * This is a generalization of the X gate. Below, g = gc + i*gs.
+ */
+template <typename fp_type>
+struct XPowGate {
+  static constexpr GateKind kind = kXPowGate;
+  static constexpr char name[] = "XPowGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));  // Re of phase g.
+    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));  // Im of phase g.
+
+    return CreateGate<GateCirq<fp_type>, XPowGate>(
+        time, {q0}, {c * gc, c * gs, s * gs, -s * gc,  // Row 0: c*g, -i*s*g.
+                     s * gs, -s * gc, c * gc, c * gs},  // Row 1: -i*s*g, c*g.
+        {exponent, global_shift});
+  }
+};
+
+/**
+ * A gate that rotates around the Y axis of the Bloch sphere.
+ * This is a generalization of the Y gate. Below, g = gc + i*gs.
+ */
+template <typename fp_type>
+struct YPowGate {
+  static constexpr GateKind kind = kYPowGate;
+  static constexpr char name[] = "YPowGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));  // Re of phase g.
+    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));  // Im of phase g.
+
+    return CreateGate<GateCirq<fp_type>, YPowGate>(
+        time, {q0}, {c * gc, c * gs, -s * gc, -s * gs,  // Row 0: c*g, -s*g.
+                     s * gc, s * gs, c * gc, c * gs}, {exponent, global_shift});
+  }
+};
+
+/**
+ * A gate that rotates around the Z axis of the Bloch sphere.
+ * This is a generalization of the Z gate. Below, g = gc + i*gs.
+ */
+template <typename fp_type>
+struct ZPowGate {
+  static constexpr GateKind kind = kZPowGate;
+  static constexpr char name[] = "ZPowGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type c = std::cos(pi * exponent);
+    fp_type s = std::sin(pi * exponent);
+    fp_type gc = std::cos(pi * exponent * global_shift);  // Re of phase g.
+    fp_type gs = std::sin(pi * exponent * global_shift);  // Im of phase g.
+
+    return CreateGate<GateCirq<fp_type>, ZPowGate>(  // diag(g, g * (c + i*s)).
+        time, {q0}, {gc, gs, 0, 0, 0, 0, c * gc - s * gs, c * gs + s * gc},
+        {exponent, global_shift});
+  }
+};
+
+/**
+ * A gate that rotates around the X+Z axis of the Bloch sphere.
+ * This is a generalization of the Hadamard gate. Below, g = gc + i*gs.
+ */
+template <typename fp_type>
+struct HPowGate {
+  static constexpr GateKind kind = kHPowGate;
+  static constexpr char name[] = "HPowGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));  // Re of phase g.
+    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));  // Im of phase g.
+
+    fp_type a = s * gs * is2;  // a - i*b equals -i * s * g / sqrt(2).
+    fp_type b = s * gc * is2;
+
+    return CreateGate<GateCirq<fp_type>, HPowGate>(
+        time, {q0}, {c * gc + a, c * gs - b, a, -b,
+                     a, -b, c * gc - a, c * gs + b}, {exponent, global_shift});
+  }
+};
+
+/**
+ * A gate that applies a phase to the |11⟩ state of two qubits.
+ * This is a generalization of the CZ gate. Matrix is diag(g, g, g, e).
+ */
+template <typename fp_type>
+struct CZPowGate {
+  static constexpr GateKind kind = kCZPowGate;
+  static constexpr char name[] = "CZPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);  // g = gc + i*gs.
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type ec = std::cos(pi * exponent * (1 + global_shift));  // e = ec + i*es.
+    fp_type es = std::sin(pi * exponent * (1 + global_shift));
+
+    return CreateGate<GateCirq<fp_type>, CZPowGate>(
+        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
+                         0, 0, gc, gs, 0, 0, 0, 0,
+                         0, 0, 0, 0, gc, gs, 0, 0,
+                         0, 0, 0, 0, 0, 0, ec, es}, {exponent, global_shift});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type ec = std::cos(pi * exponent * (1 + global_shift));
+    fp_type es = std::sin(pi * exponent * (1 + global_shift));
+
+    return schmidt_decomp_type<fp_type>{  // Two terms, each a pair of 2x2 matrices.
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}},  // diag(1, 0) with g * I.
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {gc, gs, 0, 0, 0, 0, ec, es}},  // diag(0, 1) with diag(g, e).
+    };
+  }
+};
+
+/**
+ * A gate that applies a controlled power of an X gate.
+ * This is a generalization of the CX (or CNOT) gate. Below, g = gc + i*gs.
+ */
+template <typename fp_type>
+struct CXPowGate {
+  static constexpr GateKind kind = kCXPowGate;
+  static constexpr char name[] = "CXPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type gc = std::cos(pi * exponent * global_shift);  // Re of phase g.
+    fp_type gs = std::sin(pi * exponent * global_shift);  // Im of phase g.
+    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
+    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
+
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    return CreateGate<GateCirq<fp_type>, CXPowGate>(
+        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
+                         0, 0, c * ec, c * es, 0, 0, s * es, -s * ec,
+                         0, 0, 0, 0, gc, gs, 0, 0,
+                         0, 0, s * es, -s * ec, 0, 0, c * ec, c * es},
+        {exponent, global_shift});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
+    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
+
+    return schmidt_decomp_type<fp_type>{  // Two terms, each a pair of 2x2 matrices.
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}},  // diag(1, 0) with g * I.
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {c * ec, c * es, s * es, -s * ec,
+                                  s * es, -s * ec, c * ec, c * es}},
+    };
+  }
+};
+
+/**
+ * The `(exponent = phi/pi, global_shift = -0.5)` instance of XPowGate.
+ * This is a generalization of the X gate with a fixed global phase.
+ * This is a function in Cirq. Only {phi} is passed on as a parameter.
+ */
+template <typename fp_type>
+struct rx {
+  static constexpr GateKind kind = krx;
+  static constexpr char name[] = "rx";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    fp_type c = std::cos(-0.5 * phi);  // Equals cos(phi / 2).
+    fp_type s = std::sin(-0.5 * phi);  // Equals -sin(phi / 2).
+
+    return CreateGate<GateCirq<fp_type>, rx>(
+        time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi});  // [[c, i*s], [i*s, c]]
+  }
+};
+
+/**
+ * The `(exponent = phi/pi, global_shift = -0.5)` instance of YPowGate.
+ * This is a generalization of the Y gate with a fixed global phase.
+ * This is a function in Cirq. Only {phi} is passed on as a parameter.
+ */
+template <typename fp_type>
+struct ry {
+  static constexpr GateKind kind = kry;
+  static constexpr char name[] = "ry";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    fp_type c = std::cos(-0.5 * phi);  // Equals cos(phi / 2).
+    fp_type s = std::sin(-0.5 * phi);  // Equals -sin(phi / 2).
+
+    return CreateGate<GateCirq<fp_type>, ry>(
+        time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi});  // [[c, s], [-s, c]]
+  }
+};
+
+/**
+ * The `(exponent = phi/pi, global_shift = -0.5)` instance of ZPowGate.
+ * This is a generalization of the Z gate with a fixed global phase.
+ * This is a function in Cirq. Only {phi} is passed on as a parameter.
+ */
+template <typename fp_type>
+struct rz {
+  static constexpr GateKind kind = krz;
+  static constexpr char name[] = "rz";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    fp_type c = std::cos(-0.5 * phi);  // Equals cos(phi / 2).
+    fp_type s = std::sin(-0.5 * phi);  // Equals -sin(phi / 2).
+
+    return CreateGate<GateCirq<fp_type>, rz>(
+        time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi});  // diag(c + i*s, c - i*s)
+  }
+};
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of HPowGate.
+ * This is the canonical Hadamard (or H) gate. Matrix is fixed; no parameters.
+ */
+template <typename fp_type>
+struct H {
+  static constexpr GateKind kind = kH;
+  static constexpr char name[] = "H";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);  // 1 / sqrt(2).
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateCirq<fp_type>, H>(
+        time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0});  // [[1, 1], [1, -1]] * is2
+  }
+};
+
+/**
+ * The `(exponent = 0.5, global_shift = 0)` instance of ZPowGate.
+ * This is the canonical S gate. Matrix is fixed; no parameters.
+ */
+template <typename fp_type>
+struct S {
+  static constexpr GateKind kind = kS;
+  static constexpr char name[] = "S";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateCirq<fp_type>, S>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1});  // diag(1, i)
+  }
+};
+
+/**
+ * The `(exponent = 0.25, global_shift = 0)` instance of ZPowGate.
+ * This is the canonical T gate. Matrix is fixed; no parameters.
+ */
+template <typename fp_type>
+struct T {
+  static constexpr GateKind kind = kT;
+  static constexpr char name[] = "T";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);  // 1 / sqrt(2).
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateCirq<fp_type>, T>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2});  // diag(1, (1 + i) / sqrt(2))
+  }
+};
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of CZPowGate.
+ * This is the canonical CZ gate: diag(1, 1, 1, -1).
+ */
+template <typename fp_type>
+struct CZ {
+  static constexpr GateKind kind = kCZ;
+  static constexpr char name[] = "CZ";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateCirq<fp_type>, CZ>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, -1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {  // Two terms.
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},  // diag(1, 0) with I.
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}},  // diag(0, 1) with Z.
+    };
+  }
+};
+
+template <typename fp_type>
+using CNotPowGate = CXPowGate<fp_type>;  // Alternative name for CXPowGate.
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of CXPowGate.
+ * This is the canonical CX (or CNOT) gate. Matrix is fixed; no parameters.
+ */
+template <typename fp_type>
+struct CX {
+  static constexpr GateKind kind = kCX;
+  static constexpr char name[] = "CX";  // Struct name, like every other gate here.
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    return CreateGate<GateCirq<fp_type>, CX>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {  // Two terms.
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},  // diag(1, 0) with I.
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},  // diag(0, 1) with X.
+    };
+  }
+};
+
+template <typename fp_type>
+using CNOT = CX<fp_type>;  // Alternative name for CX.
+
+// Gates from cirq/ops/pauli_gates.py:
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of XPowGate.
+ * This is the canonical Pauli X gate. Redeclares kind, name and Create.
+ */
+template <typename fp_type>
+struct X : public XPowGate<fp_type> {
+  static constexpr GateKind kind = kX;
+  static constexpr char name[] = "X";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateCirq<fp_type>, X>(
+        time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0});  // [[0, 1], [1, 0]]
+  }
+};
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of YPowGate.
+ * This is the canonical Pauli Y gate. Redeclares kind, name and Create.
+ */
+template <typename fp_type>
+struct Y : public YPowGate<fp_type> {
+  static constexpr GateKind kind = kY;
+  static constexpr char name[] = "Y";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateCirq<fp_type>, Y>(
+        time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0});  // [[0, -i], [i, 0]]
+  }
+};
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of ZPowGate.
+ * This is the canonical Pauli Z gate. Redeclares kind, name and Create.
+ */
+template <typename fp_type>
+struct Z : public ZPowGate<fp_type> {
+  static constexpr GateKind kind = kZ;
+  static constexpr char name[] = "Z";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateCirq<fp_type>, Z>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0});  // diag(1, -1)
+  }
+};
+
+// Gates from cirq/ops/phased_x_gate.py:
+
+/**
+ * An XPowGate conjugated by ZPowGate%s.
+ * Equivalent to the circuit `───Z^-p───X^t───Z^p───`.
+ */
+template <typename fp_type>
+struct PhasedXPowGate {
+  static constexpr GateKind kind = kPhasedXPowGate;
+  static constexpr char name[] = "PhasedXPowGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type phase_exponent, fp_type exponent = 1,
+                                  fp_type global_shift = 0) {
+    fp_type pc = std::cos(pi * phase_exponent);  // p = pc + i*ps.
+    fp_type ps = std::sin(pi * phase_exponent);
+    fp_type ec = std::cos(pi * exponent);  // e = ec + i*es.
+    fp_type es = std::sin(pi * exponent);
+    fp_type gc = std::cos(pi * exponent * global_shift);  // g = gc + i*gs.
+    fp_type gs = std::sin(pi * exponent * global_shift);
+
+    fp_type ar = 0.5 * ((1 + ec) * gc - es * gs);  // a = (1 + e) * g / 2.
+    fp_type ai = 0.5 * ((1 + ec) * gs + es * gc);
+    fp_type br = -0.5 * ((-1 + ec) * gc - es * gs);  // b = (1 - e) * g / 2.
+    fp_type bi = -0.5 * ((-1 + ec) * gs + es * gc);
+
+    return CreateGate<GateCirq<fp_type>, PhasedXPowGate>(
+        time, {q0}, {ar, ai, pc * br + ps * bi, pc * bi - ps * br,  // a, conj(p) * b.
+                     pc * br - ps * bi, pc * bi + ps * br, ar, ai},  // p * b, a.
+        {phase_exponent, exponent, global_shift});
+  }
+};
+
+// Gates from cirq/ops/phased_x_z_gate.py:
+
+/**
+ * A PhasedXPowGate followed by a ZPowGate.
+ * Equivalent to the circuit `───Z^(-a)──X^x──Z^a───Z^z───`.
+ */
+template <typename fp_type>
+struct PhasedXZGate {
+  static constexpr GateKind kind = kPhasedXZGate;
+  static constexpr char name[] = "PhasedXZGate";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  fp_type x_exponent, fp_type z_exponent,
+                                  fp_type axis_phase_exponent) {
+    fp_type xc = std::cos(pi * x_exponent);  // x = xc + i*xs.
+    fp_type xs = std::sin(pi * x_exponent);
+    fp_type zc = std::cos(pi * z_exponent);  // z = zc + i*zs.
+    fp_type zs = std::sin(pi * z_exponent);
+    fp_type ac = std::cos(pi * axis_phase_exponent);  // a = ac + i*as.
+    fp_type as = std::sin(pi * axis_phase_exponent);
+
+    fp_type br = 0.5 * (1 + xc);  // b = (1 + x) / 2.
+    fp_type bi = 0.5 * xs;
+    fp_type cr = -0.5 * (-1 + xc);  // c = (1 - x) / 2.
+    fp_type ci = -0.5 * xs;
+    fp_type dr = ac * zc - as * zs;  // d = a * z.
+    fp_type di = ac * zs + as * zc;
+
+    return CreateGate<GateCirq<fp_type>, PhasedXZGate>(
+        time, {q0}, {br, bi, ac * cr + as * ci, ac * ci - as * cr,  // b, conj(a) * c.
+                     dr * cr - di * ci, dr * ci + di * cr,  // d * c.
+                     zc * br - zs * bi, zc * bi + zs * br},  // z * b.
+        {x_exponent, z_exponent, axis_phase_exponent});
+  }
+};
+
+// Gates from cirq/ops/parity_gates.py:
+
+/**
+ * The tensor product of two X gates, possibly raised to an exponent.
+ */
+template <typename fp_type>
+struct XXPowGate {
+  static constexpr GateKind kind = kXXPowGate;
+  static constexpr char name[] = "XXPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);  // g = gc + i*gs.
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent);  // e = c + i*s.
+    fp_type s = std::sin(pi * exponent);
+    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);  // ic + i*is = (1 + e) * g / 2.
+    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
+    fp_type xc = 0.5 * ((1 - c) * gc + s * gs);  // xc + i*xs = (1 - e) * g / 2.
+    fp_type xs = 0.5 * ((1 - c) * gs - s * gc);
+
+    return CreateGate<GateCirq<fp_type>, XXPowGate>(
+        time, {q0, q1}, {ic, is, 0, 0, 0, 0, xc, xs,
+                         0, 0, ic, is, xc, xs, 0, 0,
+                         0, 0, xc, xs, ic, is, 0, 0,
+                         xc, xs, 0, 0, 0, 0, ic, is}, {exponent, global_shift});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent);
+    fp_type s = std::sin(pi * exponent);
+    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
+    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
+    fp_type xc = 0.5 * ((1 - c) * gc + s * gs);
+    fp_type xs = 0.5 * ((1 - c) * gs - s * gc);
+
+    return schmidt_decomp_type<fp_type>{  // Two terms, each a pair of 2x2 matrices.
+      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},  // I with scaled I.
+      {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, xc, xs, xc, xs, 0, 0}},  // X with scaled X.
+    };
+  }
+};
+
+/**
+ * The tensor product of two Y gates, possibly raised to an exponent.
+ */
+template <typename fp_type>
+struct YYPowGate {
+  static constexpr GateKind kind = kYYPowGate;
+  static constexpr char name[] = "YYPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);  // g = gc + i*gs.
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent);  // e = c + i*s.
+    fp_type s = std::sin(pi * exponent);
+    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);  // ic + i*is = (1 + e) * g / 2.
+    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
+    fp_type yc = 0.5 * ((1 - c) * gc + s * gs);  // yc + i*ys = (1 - e) * g / 2.
+    fp_type ys = 0.5 * ((1 - c) * gs - s * gc);
+
+    return CreateGate<GateCirq<fp_type>, YYPowGate>(
+        time, {q0, q1}, {ic, is, 0, 0, 0, 0, -yc, -ys,
+                         0, 0, ic, is, yc, ys, 0, 0,
+                         0, 0, yc, ys, ic, is, 0, 0,
+                         -yc, -ys, 0, 0, 0, 0, ic, is},
+        {exponent, global_shift});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent);
+    fp_type s = std::sin(pi * exponent);
+    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
+    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
+    fp_type yc = 0.5 * ((1 - c) * gc + s * gs);
+    fp_type ys = 0.5 * ((1 - c) * gs - s * gc);
+
+    return schmidt_decomp_type<fp_type>{  // Two terms, each a pair of 2x2 matrices.
+      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},  // I with scaled I.
+      {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, ys, -yc, -ys, yc, 0, 0}},  // Y with scaled Y.
+    };
+  }
+};
+
+/**
+ * The tensor product of two Z gates, possibly raised to an exponent.
+ */
+template <typename fp_type>
+struct ZZPowGate {
+  static constexpr GateKind kind = kZZPowGate;
+  static constexpr char name[] = "ZZPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);  // g = gc + i*gs.
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type zc = std::cos(pi * exponent * (1 + global_shift));  // z = zc + i*zs.
+    fp_type zs = std::sin(pi * exponent * (1 + global_shift));
+
+    return CreateGate<GateCirq<fp_type>, ZZPowGate>(  // diag(g, z, z, g).
+        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
+                         0, 0, zc, zs, 0, 0, 0, 0,
+                         0, 0, 0, 0, zc, zs, 0, 0,
+                         0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent);
+    fp_type s = std::sin(pi * exponent);
+    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);  // ic + i*is = (1 + e) * g / 2.
+    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
+    fp_type zc = 0.5 * ((1 - c) * gc + s * gs);  // Here zc + i*zs = (1 - e) * g / 2.
+    fp_type zs = 0.5 * ((1 - c) * gs - s * gc);  // (Not the same zc, zs as in Create.)
+
+    return schmidt_decomp_type<fp_type>{  // Two terms, each a pair of 2x2 matrices.
+      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},  // I with scaled I.
+      {{1, 0, 0, 0, 0, 0, -1, 0}, {zc, zs, 0, 0, 0, 0, -zc, -zs}},  // Z with scaled Z.
+    };
+  }
+};
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of XXPowGate.
+ * This is the tensor product of two X gates. Matrix is fixed; no parameters.
+ */
+template <typename fp_type>
+struct XX {
+  static constexpr GateKind kind = kXX;
+  static constexpr char name[] = "XX";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateCirq<fp_type>, XX>(  // Ones on the anti-diagonal.
+        time, {q0, q1}, {0, 0, 0, 0, 0, 0, 1, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         1, 0, 0, 0, 0, 0, 0, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {  // Single term.
+    return schmidt_decomp_type<fp_type>{
+      {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},  // X with X.
+    };
+  }
+};
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of YYPowGate.
+ * This is the tensor product of two Y gates. Matrix is fixed; no parameters.
+ */
+template <typename fp_type>
+struct YY {
+  static constexpr GateKind kind = kYY;
+  static constexpr char name[] = "YY";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateCirq<fp_type>, YY>(  // Anti-diagonal: -1, 1, 1, -1.
+        time, {q0, q1}, {0, 0, 0, 0, 0, 0, -1, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         -1, 0, 0, 0, 0, 0, 0, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {  // Single term.
+    return schmidt_decomp_type<fp_type>{
+      {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, 0, -1, 0, 1, 0, 0}},  // Y with Y.
+    };
+  }
+};
+
+/**
+ * The `(exponent = 1, global_shift = 0)` instance of ZZPowGate.
+ * This is the tensor product of two Z gates. Matrix is fixed; no parameters.
+ */
+template <typename fp_type>
+struct ZZ {
+  static constexpr GateKind kind = kZZ;
+  static constexpr char name[] = "ZZ";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateCirq<fp_type>, ZZ>(  // diag(1, -1, -1, 1).
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, -1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, -1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {  // Single term.
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, -1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}},  // Z with Z.
+    };
+  }
+};
+
+// Gates from cirq/ops/swap_gates.py:
+
+/**
+ * The SWAP gate, possibly raised to a power. Exchanges qubits.
+ */
+template <typename fp_type>
+struct SwapPowGate {
+  static constexpr GateKind kind = kSwapPowGate;
+  static constexpr char name[] = "SwapPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  static constexpr fp_type h = static_cast<fp_type>(h_double);  // One half.
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);  // g = gc + i*gs.
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));  // e = ec + i*es.
+    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
+
+    return CreateGate<GateCirq<fp_type>, SwapPowGate>(
+        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
+                         0, 0, c * ec, c * es, s * es, -s * ec, 0, 0,
+                         0, 0, s * es, -s * ec, c * ec, c * es, 0, 0,
+                         0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type c = std::cos(pi * exponent * 0.5);
+    fp_type s = std::sin(pi * exponent * 0.5);
+    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
+    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
+
+    return schmidt_decomp_type<fp_type>{  // Four terms, each a pair of 2x2 matrices.
+      {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * ec, gs + c * es, 0, 0,
+                                  0, 0, gc + c * ec, gs + c * es}},
+      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * es, -s * ec,
+                                  s * es, -s * ec, 0, 0}},
+      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, -s * ec, -s * es,
+                                   s * ec, s * es, 0, 0}},
+      {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * ec, gs - c * es, 0, 0,
+                                   0, 0, -gc + c * ec, -gs + c * es}},
+    };
+  }
+};
+
+/**
+ * Rotates the |01⟩ vs |10⟩ subspace of two qubits around its Bloch X-axis.
+ * Generalizes the ISWAP gate to an arbitrary `exponent`.
+ */
+template <typename fp_type>
+struct ISwapPowGate {
+  static constexpr GateKind kind = kISwapPowGate;
+  static constexpr char name[] = "ISwapPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    // cos/sin of the half angle (hc, hs) and of the shift angle (gr, gi).
+    const fp_type hc = std::cos(pi * exponent * 0.5);
+    const fp_type hs = std::sin(pi * exponent * 0.5);
+    const fp_type gr = std::cos(pi * exponent * global_shift);
+    const fp_type gi = std::sin(pi * exponent * global_shift);
+    return CreateGate<GateCirq<fp_type>, ISwapPowGate>(
+        time, {q0, q1}, {gr, gi, 0, 0, 0, 0, 0, 0,
+                         0, 0, hc * gr, hc * gi, -hs * gi, hs * gr, 0, 0,
+                         0, 0, -hs * gi, hs * gr, hc * gr, hc * gi, 0, 0,
+                         0, 0, 0, 0, 0, 0, gr, gi}, {exponent, global_shift});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type exponent, fp_type global_shift) {
+    const fp_type hc = std::cos(pi * exponent * 0.5);
+    const fp_type hs = std::sin(pi * exponent * 0.5);
+    const fp_type gr = std::cos(pi * exponent * global_shift);
+    const fp_type gi = std::sin(pi * exponent * global_shift);
+    // Four terms, each a pair of 2x2 matrices in interleaved (re, im) form.
+    return schmidt_decomp_type<fp_type>{
+      {{h, 0, 0, 0, 0, 0, h, 0}, {gr + hc * gr, gi + hc * gi, 0, 0,
+                                  0, 0, gr + hc * gr, gi + hc * gi}},
+      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, -hs * gi, hs * gr,
+                                  -hs * gi, hs * gr, 0, 0}},
+      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, hs * gr, hs * gi,
+                                   -hs * gr, -hs * gi, 0, 0}},
+      {{h, 0, 0, 0, 0, 0, -h, 0}, {gr - hc * gr, gi - hc * gi, 0, 0,
+                                   0, 0, -gr + hc * gr, -gi + hc * gi}},
+    };
+  }
+};
+
+/**
+ * The `(exponent = 2*phi/pi, global_shift = 0)` instance of ISwapPowGate,
+ * i.e. an ISWAP-like rotation by the angle `phi` with no global phase.
+ * This is a function in Cirq.
+ */
+template <typename fp_type>
+struct riswap {
+  static constexpr GateKind kind = kriswap;
+  static constexpr char name[] = "riswap";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type phi) {
+    // (cp, sp) = (cos(phi), sin(phi)); phi is used directly, without pi.
+    const fp_type sp = std::sin(phi);
+    const fp_type cp = std::cos(phi);
+    return CreateGate<GateCirq<fp_type>, riswap>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, cp, 0, 0, sp, 0, 0,
+                         0, 0, 0, sp, cp, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0}, {phi});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
+    const fp_type sp = std::sin(phi);
+    const fp_type cp = std::cos(phi);
+    // Four terms, each a pair of 2x2 matrices in interleaved (re, im) form.
+    return schmidt_decomp_type<fp_type>{
+      {{h, 0, 0, 0, 0, 0, h, 0}, {1 + cp, 0, 0, 0, 0, 0, 1 + cp, 0}},
+      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, 0, sp, 0, sp, 0, 0}},
+      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, sp, 0, -sp, 0, 0, 0}},
+      {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - cp, 0, 0, 0, 0, 0, -1 + cp, 0}},
+    };
+  }
+};
+
+/**
+ * The canonical SWAP gate: SwapPowGate with exponent = 1, global_shift = 0.
+ * Create() takes no gate parameters; SchmidtDecomp() returns four fixed terms.
+ */
+template <typename fp_type>
+struct SWAP {
+  static constexpr GateKind kind = kSWAP;
+  static constexpr char name[] = "SWAP";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateCirq<fp_type>, SWAP>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
+      {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}},
+      {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}},
+      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
+    };
+  }
+};
+
+/**
+ * The canonical ISWAP gate: ISwapPowGate with exponent = 1, global_shift = 0.
+ * Create() takes no gate parameters; SchmidtDecomp() returns four fixed terms.
+ */
+template <typename fp_type>
+struct ISWAP {
+  static constexpr GateKind kind = kISWAP;
+  static constexpr char name[] = "ISWAP";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateCirq<fp_type>, ISWAP>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 1, 0, 0,
+                         0, 0, 0, 1, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
+      {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}},
+      {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}},
+      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
+    };
+  }
+};
+
+// Gates from cirq/ops/phased_iswap_gate.py:
+
+/**
+ * An ISwapPowGate conjugated by ZPowGates.
+ * Equivalent to the composition `(Z^-p ⊗ Z^p) ISWAP^t (Z^p ⊗ Z^-p)`.
+ */
+template <typename fp_type>
+struct PhasedISwapPowGate {
+  static constexpr GateKind kind = kPhasedISwapPowGate;
+  static constexpr char name[] = "PhasedISwapPowGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type phase_exponent = 0.25,
+                                  fp_type exponent = 1.0) {
+    const fp_type c = std::cos(pi * exponent * 0.5);
+    const fp_type s = std::sin(pi * exponent * 0.5);
+    const fp_type zr = std::cos(2 * pi * phase_exponent);
+    const fp_type zi = std::sin(2 * pi * phase_exponent);
+
+    // The simulator uses inverse qubit order, hence this matrix layout.
+    return CreateGate<GateCirq<fp_type>, PhasedISwapPowGate>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, c, 0, s * zi, s * zr, 0, 0,
+                         0, 0, -s * zi, s * zr, c, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0}, {phase_exponent, exponent});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type phase_exponent, fp_type exponent) {
+    const fp_type c = std::cos(pi * exponent * 0.5);
+    const fp_type s = std::sin(pi * exponent * 0.5);
+    const fp_type zr = std::cos(2 * pi * phase_exponent);
+    const fp_type zi = std::sin(2 * pi * phase_exponent);
+    // Four terms, each a pair of 2x2 matrices in interleaved (re, im) form.
+    return schmidt_decomp_type<fp_type>{
+      {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}},
+      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * zi, s * zr, -s * zi, s * zr, 0, 0}},
+      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * zr, -s * zi,
+                                   -s * zr, -s * zi, 0, 0}},
+      {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}},
+    };
+  }
+};
+
+/**
+ * The `(phase_exponent = 0.25, exponent = 2*phi/pi)` instance of
+ * PhasedISwapPowGate, parameterized directly by the angle `phi`.
+ * This is the "Givens rotation" from numerical linear algebra.
+ * This is a function in Cirq.
+ */
+template <typename fp_type>
+struct givens {
+  static constexpr GateKind kind = kgivens;
+  static constexpr char name[] = "givens";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type phi) {
+    const fp_type sp = std::sin(phi);
+    const fp_type cp = std::cos(phi);
+
+    // The simulator uses inverse qubit order, hence this matrix layout.
+    return CreateGate<GateCirq<fp_type>, givens>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, cp, 0, sp, 0, 0, 0,
+                         0, 0, -sp, 0, cp, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0}, {phi});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
+    const fp_type sp = std::sin(phi);
+    const fp_type cp = std::cos(phi);
+    // Four terms, each a pair of 2x2 matrices in interleaved (re, im) form.
+    return schmidt_decomp_type<fp_type>{
+      {{h, 0, 0, 0, 0, 0, h, 0}, {1 + cp, 0, 0, 0, 0, 0, 1 + cp, 0}},
+      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, sp, 0, -sp, 0, 0, 0}},
+      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, 0, -sp, 0, -sp, 0, 0}},
+      {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - cp, 0, 0, 0, 0, 0, -1 + cp, 0}},
+    };
+  }
+};
+
+// Gates from cirq/ops/fsim_gate.py:
+
+/**
+ * The fermionic simulation gate family. Contains all two-qubit interactions
+ * that preserve excitations, up to single-qubit rotations and global phase.
+ */
+template <typename fp_type>
+struct FSimGate {
+  static constexpr GateKind kind = kFSimGate;
+  static constexpr char name[] = "FSimGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateCirq<fp_type> Create(
+      unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) {
+    // A negative phi is moved up by one full turn (2 * pi) before it is stored.
+    if (phi < 0) {
+      phi += 2 * 3.141592653589793;
+    }
+
+    fp_type ct = std::cos(theta);
+    fp_type st = std::sin(theta);
+    fp_type cp = std::cos(phi);
+    fp_type sp = std::sin(phi);
+
+    return CreateGate<GateCirq<fp_type>, FSimGate>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, ct, 0, 0, -st, 0, 0,
+                         0, 0, 0, -st, ct, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi});
+  }
+
+  // Four {first, second} pairs of 2x2 matrices in interleaved (re, im) form.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type theta, fp_type phi) {
+    fp_type ct = std::cos(theta);
+    fp_type st = std::sin(theta);
+    fp_type cp2 = std::cos(0.5 * phi);
+    fp_type sp2 = std::sin(0.5 * phi);
+    fp_type cp4 = std::cos(0.25 * phi);
+    fp_type sp4 = std::sin(0.25 * phi);
+
+    fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct));
+    fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct));
+    fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct);
+    fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct);
+
+    fp_type c0 = is2 * a0 * std::cos(p0);
+    fp_type s0 = is2 * a0 * std::sin(p0);
+    fp_type c1 = is2 * a1 * std::cos(p1);
+    fp_type s1 = is2 * a1 * std::sin(p1);
+    // sin(theta) can be negative: take the root of its magnitude and carry the
+    // sign in the second factor only, so each product stays linear in st.
+    fp_type st2 = 0.5 * std::sqrt(std::abs(st));
+    fp_type ss2 = st < 0 ? -st2 : st2;
+
+    fp_type a = cp4 * c0 - sp4 * s0;
+    fp_type b = cp4 * s0 + sp4 * c0;
+    fp_type c = cp4 * c0 + sp4 * s0;
+    fp_type d = cp4 * s0 - sp4 * c0;
+    fp_type e = cp4 * c1 - sp4 * s1;
+    fp_type f = cp4 * s1 + sp4 * c1;
+    fp_type g = -(cp4 * c1 + sp4 * s1);
+    fp_type h = -(cp4 * s1 - sp4 * c1);
+
+    return schmidt_decomp_type<fp_type>{
+      {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}},
+      {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, ss2, -ss2, ss2, -ss2, 0, 0}},
+      {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -ss2, -ss2, ss2, ss2, 0, 0}},
+      {{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}},
+    };
+  }
+};
+
+// Gates from cirq/ops/two_qubit_diagonal_gate.py:
+
+/**
+ * A two-qubit diagonal gate built from up to four phase angles (radians).
+ */
+template <typename fp_type>
+struct TwoQubitDiagonalGate {
+  static constexpr GateKind kind = kTwoQubitDiagonalGate;
+  static constexpr char name[] = "TwoQubitDiagonalGate";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1,
+                                  const std::vector<fp_type>& angles) {
+    std::vector<fp_type> re;
+    std::vector<fp_type> im;
+    re.reserve(4);
+    im.reserve(4);
+
+    for (const fp_type& angle : angles) {
+      re.push_back(std::cos(angle));
+      im.push_back(std::sin(angle));
+    }
+    // Angles that were not supplied count as 0, i.e. the entry (1, 0).
+    while (re.size() < 4) {
+      re.push_back(1);
+      im.push_back(0);
+    }
+
+    // The simulator uses inverse qubit order, hence entries 1 and 2 trade places.
+    return CreateGate<GateCirq<fp_type>, TwoQubitDiagonalGate>(
+        time, {q0, q1}, {re[0], im[0], 0, 0, 0, 0, 0, 0,
+                         0, 0, re[2], im[2], 0, 0, 0, 0,
+                         0, 0, 0, 0, re[1], im[1], 0, 0,
+                         0, 0, 0, 0, 0, 0, re[3], im[3]});
+  }
+};
+
+// Gates from cirq/ops/three_qubit_gates.py:
+
+/**
+ * A three-qubit diagonal gate built from up to eight phase angles (radians).
+ */
+template <typename fp_type>
+struct ThreeQubitDiagonalGate {
+  static constexpr GateKind kind = kThreeQubitDiagonalGate;
+  static constexpr char name[] = "ThreeQubitDiagonalGate";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2,
+                                  const std::vector<fp_type>& angles) {
+    std::vector<fp_type> re;
+    std::vector<fp_type> im;
+    re.reserve(8);
+    im.reserve(8);
+
+    for (const fp_type& angle : angles) {
+      re.push_back(std::cos(angle));
+      im.push_back(std::sin(angle));
+    }
+    // Angles that were not supplied count as 0, i.e. the entry (1, 0).
+    while (re.size() < 8) {
+      re.push_back(1);
+      im.push_back(0);
+    }
+
+    // The simulator uses inverse qubit order, hence the permuted diagonal.
+    return CreateGate<GateCirq<fp_type>, ThreeQubitDiagonalGate>(
+        time, {q0, q1, q2},
+        {re[0], im[0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, re[4], im[4], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, re[2], im[2], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, re[6], im[6], 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, re[1], im[1], 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, re[5], im[5], 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, re[3], im[3], 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, re[7], im[7]});
+  }
+};
+
+/**
+ * A gate that applies a phase to the |111⟩ state of three qubits.
+ * Generalizes the CCZ gate to an arbitrary `exponent`.
+ */
+template <typename fp_type>
+struct CCZPowGate {
+  static constexpr GateKind kind = kCCZPowGate;
+  static constexpr char name[] = "CCZPowGate";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    // (gr, gi) fills the first seven diagonal entries, (er, ei) the last one.
+    const fp_type er = std::cos(pi * exponent * (1 + global_shift));
+    const fp_type ei = std::sin(pi * exponent * (1 + global_shift));
+    const fp_type gr = std::cos(pi * exponent * global_shift);
+    const fp_type gi = std::sin(pi * exponent * global_shift);
+    return CreateGate<GateCirq<fp_type>, CCZPowGate>(
+        time, {q0, q1, q2}, {gr, gi, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, gr, gi, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, gr, gi, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, gr, gi, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, gr, gi, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gr, gi, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gr, gi, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, er, ei},
+                            {exponent, global_shift});
+  }
+};
+
+/**
+ * A gate that applies a doubly-controlled power of an X gate.
+ * Generalizes the CCX (or CCNOT) gate to an arbitrary `exponent`.
+ */
+template <typename fp_type>
+struct CCXPowGate {
+  static constexpr GateKind kind = kCCXPowGate;
+  static constexpr char name[] = "CCXPowGate";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    const fp_type gr = std::cos(pi * exponent * global_shift);
+    const fp_type gi = std::sin(pi * exponent * global_shift);
+    const fp_type er = std::cos(pi * exponent * (0.5 + global_shift));
+    const fp_type ei = std::sin(pi * exponent * (0.5 + global_shift));
+    const fp_type c = std::cos(pi * exponent * 0.5);
+    const fp_type s = std::sin(pi * exponent * 0.5);
+
+    // The simulator uses inverse qubit order, so rows 3 and 7 hold the X block.
+    return CreateGate<GateCirq<fp_type>, CCXPowGate>(
+        time, {q0, q1, q2},
+        {gr, gi, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, gr, gi, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, gr, gi, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, c * er, c * ei, 0, 0, 0, 0, 0, 0, s * ei, -s * er,
+         0, 0, 0, 0, 0, 0, 0, 0, gr, gi, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gr, gi, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gr, gi, 0, 0,
+         0, 0, 0, 0, 0, 0, s * ei, -s * er, 0, 0, 0, 0, 0, 0, c * er, c * ei},
+        {exponent, global_shift});
+  }
+};
+
+/**
+ * A controlled swap (Fredkin) gate: identity with rows 3 and 5 exchanged.
+ */
+template <typename fp_type>
+struct CSwapGate {
+  static constexpr GateKind kind = kCSwapGate;
+  static constexpr char name[] = "CSwapGate";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2) {
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    return CreateGate<GateCirq<fp_type>, CSwapGate>(
+        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0});
+  }
+};
+
+/**
+ * The canonical doubly-controlled Z gate: CCZPowGate with exponent = 1 and
+ * global_shift = 0. The matrix is the identity except for a final -1 entry.
+ */
+template <typename fp_type>
+struct CCZ {
+  static constexpr GateKind kind = kCCZ;
+  static constexpr char name[] = "CCZ";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2) {
+    return CreateGate<GateCirq<fp_type>, CCZ>(
+        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0});
+  }
+};
+
+/**
+ * The canonical doubly-controlled X gate (the TOFFOLI gate): CCXPowGate with
+ * exponent = 1, global_shift = 0. Identity with rows 3 and 7 exchanged.
+ */
+template <typename fp_type>
+struct CCX {
+  static constexpr GateKind kind = kCCX;
+  static constexpr char name[] = "CCX";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2) {
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    return CreateGate<GateCirq<fp_type>, CCX>(
+        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
+                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+  }
+};
+// Alternative names for the three-qubit gate templates defined above.
+template <typename fp_type>
+using CCNotPowGate = CCXPowGate<fp_type>;
+
+template <typename fp_type>
+using TOFFOLI = CCX<fp_type>;
+
+template <typename fp_type>
+using CCNOT = CCX<fp_type>;
+
+template <typename fp_type>
+using CSWAP = CSwapGate<fp_type>;
+
+template <typename fp_type>
+using FREDKIN = CSwapGate<fp_type>;
+
+// Gates from cirq/ops/matrix_gates.py:
+
+/**
+ * A one-qubit gate whose matrix is supplied directly by the caller.
+ */
+template <typename fp_type>
+struct MatrixGate1 {
+  static constexpr GateKind kind = kMatrixGate1;
+  static constexpr char name[] = "MatrixGate1";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
+                                  const Matrix<fp_type>& m) {
+    Matrix<fp_type> owned(m);  // copy, so that `m` is not the object moved from
+    return CreateGate<GateCirq<fp_type>, MatrixGate1>(
+        time, {q0}, std::move(owned));
+  }
+};
+
+/**
+ * A two-qubit gate defined entirely by its matrix (qubits passed as {q1, q0}).
+ */
+template <typename fp_type>
+struct MatrixGate2 {
+  static constexpr GateKind kind = kMatrixGate2;
+  static constexpr char name[] = "MatrixGate2";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  template <typename M = Matrix<fp_type>>
+  static GateCirq<fp_type> Create(
+      unsigned time, unsigned q0, unsigned q1, M&& m) {
+    return CreateGate<GateCirq<fp_type>, MatrixGate2>(time, {q1, q0},
+                                                      std::forward<M>(m));
+  }
+};
+
+/**
+ * A multi-qubit gate defined entirely by its matrix (qubit order is reversed).
+ */
+template <typename fp_type>
+struct MatrixGate {
+  static constexpr GateKind kind = kMatrixGate;
+  static constexpr char name[] = "MatrixGate";
+  static constexpr bool symmetric = false;
+
+  template <typename M = Matrix<fp_type>>
+  static GateCirq<fp_type> Create(unsigned time,
+                                  std::vector<unsigned> qubits, M&& m) {
+    std::vector<unsigned> reversed(qubits.rbegin(), qubits.rend());
+    return CreateGate<GateCirq<fp_type>, MatrixGate>(
+        time, std::move(reversed), std::forward<M>(m));
+  }
+};
+
+}  // namespace Cirq
+// Maps a Cirq gate kind (plus its stored params) to its Schmidt decomposition.
+template <typename fp_type>
+inline schmidt_decomp_type<fp_type> GetSchmidtDecomp(
+    Cirq::GateKind kind, const std::vector<fp_type>& params) {
+  switch (kind) {
+  case Cirq::kI2:
+    return Cirq::I2<fp_type>::SchmidtDecomp();
+  case Cirq::kCZPowGate:
+    return Cirq::CZPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kCXPowGate:
+    return Cirq::CXPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kCZ:
+    return Cirq::CZ<fp_type>::SchmidtDecomp();
+  case Cirq::kCX:
+    return Cirq::CX<fp_type>::SchmidtDecomp();
+  case Cirq::kXXPowGate:
+    return Cirq::XXPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kYYPowGate:
+    return Cirq::YYPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kZZPowGate:
+    return Cirq::ZZPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kXX:
+    return Cirq::XX<fp_type>::SchmidtDecomp();
+  case Cirq::kYY:
+    return Cirq::YY<fp_type>::SchmidtDecomp();
+  case Cirq::kZZ:
+    return Cirq::ZZ<fp_type>::SchmidtDecomp();
+  case Cirq::kSwapPowGate:
+    return Cirq::SwapPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kISwapPowGate:
+    return Cirq::ISwapPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kriswap:
+    return Cirq::riswap<fp_type>::SchmidtDecomp(params[0]);
+  case Cirq::kSWAP:
+    return Cirq::SWAP<fp_type>::SchmidtDecomp();
+  case Cirq::kISWAP:
+    return Cirq::ISWAP<fp_type>::SchmidtDecomp();
+  case Cirq::kPhasedISwapPowGate:
+    return Cirq::PhasedISwapPowGate<fp_type>::SchmidtDecomp(
+        params[0], params[1]);
+  case Cirq::kgivens:
+    return Cirq::givens<fp_type>::SchmidtDecomp(params[0]);
+  case Cirq::kFSimGate:
+    return Cirq::FSimGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  default:
+    // Single-qubit gates or gates with unimplemented Schmidt decomposition.
+    return schmidt_decomp_type<fp_type>{};
+  }
+}
+
+}  // namespace qsim
+
+#endif  // GATES_CIRQ_H_
diff --git a/tpls/qsim/gates_qsim.h b/tpls/qsim/gates_qsim.h
new file mode 100644
index 0000000..366c4f1
--- /dev/null
+++ b/tpls/qsim/gates_qsim.h
@@ -0,0 +1,661 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GATES_QSIM_H_
+#define GATES_QSIM_H_
+
+#include <array>
+#include <cmath>
+#include <vector>
+
+#include "gate.h"
+
+namespace qsim {
+
+// Gate kinds of the qsim gate set; the named gates are numbered from 0.
+enum GateKind {
+  kGateId1 = 0, // one-qubit Id
+  kGateHd,      // Hadamard
+  kGateT,       // T
+  kGateX,       // X
+  kGateY,       // Y
+  kGateZ,       // Z
+  kGateX2,      // sqrt(X)
+  kGateY2,      // sqrt(Y)
+  kGateRX,      // X-rotation
+  kGateRY,      // Y-rotation
+  kGateRZ,      // Z-rotation
+  kGateRXY,     // XY-rotation (rotation around arbitrary axis in the XY plane)
+  kGateHZ2,     // pi / 2 rotation around the X + Y axis
+  kGateS,       // S
+  kGateId2,     // two-qubit Id
+  kGateCZ,      // CZ
+  kGateCNot,    // CNOT (CX)
+  kGateSwap,    // swap
+  kGateIS,      // iSwap
+  kGateFS,      // fSim
+  kGateCP,      // controlled phase
+  kGateMatrix1, // one-qubit matrix gate
+  kGateMatrix2, // two-qubit matrix gate
+  kGateGPh,     // global phase gate
+  kDecomp = gate::kDecomp,            // same value as gate::kDecomp
+  kMeasurement = gate::kMeasurement,  // same value as gate::kMeasurement
+};
+
+// Gate type (template defined in gate.h) specialized for the qsim gate set.
+template <typename fp_type>
+using GateQSim = Gate<fp_type, GateKind>;
+
+constexpr double h_double = 0.5;  // 1 / 2
+constexpr double is2_double = 0.7071067811865475;  // 1 / sqrt(2)
+
+// Zero-qubit gates:
+
+/**
+ * The global phase gate: the 1x1 matrix exp(i * phi) on an empty qubit list.
+ */
+template <typename fp_type>
+struct GateGPh {
+  static constexpr GateKind kind = kGateGPh;
+  static constexpr char name[] = "p";
+  static constexpr unsigned num_qubits = 1;  // NOTE(review): Create passes {}
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, fp_type phi) {
+    return Create(time, std::cos(phi), std::sin(phi));
+  }
+
+  static GateQSim<fp_type> Create(unsigned time, fp_type cp, fp_type sp) {
+    return CreateGate<GateQSim<fp_type>, GateGPh>(
+        time, {}, {cp, sp}, {cp, sp});
+  }
+};
+
+// One-qubit gates:
+
+/**
+ * The one-qubit identity gate, [[1, 0], [0, 1]].
+ */
+template <typename fp_type>
+struct GateId1 {
+  static constexpr GateKind kind = kGateId1;
+  static constexpr char name[] = "id1";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateId1>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0});
+  }
+};
+
+/**
+ * The Hadamard gate, (1 / sqrt(2)) * [[1, 1], [1, -1]].
+ */
+template <typename fp_type>
+struct GateHd {
+  static constexpr GateKind kind = kGateHd;
+  static constexpr char name[] = "h";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateHd>(
+        time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0});
+  }
+};
+
+/**
+ * The T gate, equivalent to `Z ^ 0.25`: diag(1, exp(i * pi / 4)).
+ */
+template <typename fp_type>
+struct GateT {
+  static constexpr GateKind kind = kGateT;
+  static constexpr char name[] = "t";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateT>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2});
+  }
+};
+
+/**
+ * The Pauli X (or "NOT") gate, [[0, 1], [1, 0]].
+ */
+template <typename fp_type>
+struct GateX {
+  static constexpr GateKind kind = kGateX;
+  static constexpr char name[] = "x";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateX>(
+        time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0});
+  }
+};
+
+/**
+ * The Pauli Y gate, [[0, -i], [i, 0]] (entries stored as re, im pairs).
+ */
+template <typename fp_type>
+struct GateY {
+  static constexpr GateKind kind = kGateY;
+  static constexpr char name[] = "y";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateY>(
+        time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0});
+  }
+};
+
+/**
+ * The Pauli Z gate, diag(1, -1).
+ */
+template <typename fp_type>
+struct GateZ {
+  static constexpr GateKind kind = kGateZ;
+  static constexpr char name[] = "z";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateZ>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0});
+  }
+};
+
+/**
+ * The "square root of X" gate, (1/2) * [[1+i, 1-i], [1-i, 1+i]].
+ */
+template <typename fp_type>
+struct GateX2 {
+  static constexpr GateKind kind = kGateX2;
+  static constexpr char name[] = "x_1_2";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateX2>(
+        time, {q0}, {h, h, h, -h, h, -h, h, h});
+  }
+};
+
+/**
+ * The "square root of Y" gate, (1/2) * [[1+i, -1-i], [1+i, 1+i]].
+ */
+template <typename fp_type>
+struct GateY2 {
+  static constexpr GateKind kind = kGateY2;
+  static constexpr char name[] = "y_1_2";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateY2>(
+        time, {q0}, {h, h, -h, -h, h, h, h, h});
+  }
+};
+
+/**
+ * Rotation by the angle phi about the X axis of the Bloch sphere,
+ * cos(phi / 2) * I - i * sin(phi / 2) * X. Generalizes the X gate.
+ */
+template <typename fp_type>
+struct GateRX {
+  static constexpr GateKind kind = kGateRX;
+  static constexpr char name[] = "rx";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    const fp_type half = -0.5 * phi;
+    const fp_type re = std::cos(half);
+    const fp_type im = std::sin(half);
+
+    return CreateGate<GateQSim<fp_type>, GateRX>(
+        time, {q0}, {re, 0, 0, im, 0, im, re, 0}, {phi});
+  }
+};
+
+/**
+ * Rotation by the angle phi about the Y axis of the Bloch sphere,
+ * cos(phi / 2) * I - i * sin(phi / 2) * Y. Generalizes the Y gate.
+ */
+template <typename fp_type>
+struct GateRY {
+  static constexpr GateKind kind = kGateRY;
+  static constexpr char name[] = "ry";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    const fp_type half = -0.5 * phi;
+    const fp_type co = std::cos(half);
+    const fp_type si = std::sin(half);
+
+    return CreateGate<GateQSim<fp_type>, GateRY>(
+        time, {q0}, {co, 0, si, 0, -si, 0, co, 0}, {phi});
+  }
+};
+
+/**
+ * Rotation by the angle phi about the Z axis of the Bloch sphere,
+ * diag(exp(-i * phi / 2), exp(i * phi / 2)). Generalizes the Z gate.
+ */
+template <typename fp_type>
+struct GateRZ {
+  static constexpr GateKind kind = kGateRZ;
+  static constexpr char name[] = "rz";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    const fp_type half = -0.5 * phi;
+    const fp_type re = std::cos(half);
+    const fp_type im = std::sin(half);
+
+    return CreateGate<GateQSim<fp_type>, GateRZ>(
+        time, {q0}, {re, im, 0, 0, 0, 0, re, -im}, {phi});
+  }
+};
+
+/**
+ * Rotation by phi about the axis cos(theta) * X + sin(theta) * Y.
+ */
+template <typename fp_type>
+struct GateRXY {
+  static constexpr GateKind kind = kGateRXY;
+  static constexpr char name[] = "rxy";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(
+      unsigned time, unsigned q0, fp_type theta, fp_type phi) {
+    const fp_type half = -0.5 * phi;
+    const fp_type ch = std::cos(half);
+    const fp_type sh = std::sin(half);
+    const fp_type ax = std::cos(theta) * sh;
+    const fp_type ay = std::sin(theta) * sh;
+
+    return CreateGate<GateQSim<fp_type>, GateRXY>(
+        time, {q0}, {ch, 0, ay, ax, -ay, ax, ch, 0}, {theta, phi});
+  }
+};
+
+/**
+ * A pi / 2 rotation around the X + Y axis (the "hz_1_2" gate).
+ */
+template <typename fp_type>
+struct GateHZ2 {
+  static constexpr GateKind kind = kGateHZ2;
+  static constexpr char name[] = "hz_1_2";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateHZ2>(
+        time, {q0}, {h, h, 0, -is2, is2, 0, h, h});
+  }
+};
+
+/**
+ * The S gate, equivalent to "square root of Z": diag(1, i).
+ */
+template <typename fp_type>
+struct GateS {
+  static constexpr GateKind kind = kGateS;
+  static constexpr char name[] = "s";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateS>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1});
+  }
+};
+
+/**
+ * A one-qubit gate whose matrix is supplied by the caller.
+ */
+template <typename fp_type>
+struct GateMatrix1 {
+  static constexpr GateKind kind = kGateMatrix1;
+  static constexpr char name[] = "mat1";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0,
+                                  const Matrix<fp_type>& m) {
+    Matrix<fp_type> owned(m);
+    return CreateGate<GateQSim<fp_type>, GateMatrix1>(
+        time, {q0}, std::move(owned));
+  }
+};
+
+// Two-qubit gates:
+
+/**
+ * The two-qubit identity gate (Schmidt decomposition: one term, I x I).
+ */
+template <typename fp_type>
+struct GateId2 {
+  static constexpr GateKind kind = kGateId2;
+  static constexpr char name[] = "id2";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateId2>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+    };
+  }
+};
+
+/**
+ * The controlled-Z (CZ) gate; Schmidt sum is |0><0| x I + |1><1| x Z.
+ */
+template <typename fp_type>
+struct GateCZ {
+  static constexpr GateKind kind = kGateCZ;
+  static constexpr char name[] = "cz";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateCZ>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, -1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}},
+    };
+  }
+};
+
+/**
+ * The controlled-X (CX or CNOT) gate; not symmetric in its two qubits.
+ */
+template <typename fp_type>
+struct GateCNot {
+  static constexpr GateKind kind = kGateCNot;
+  static constexpr char name[] = "cnot";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    return CreateGate<GateQSim<fp_type>, GateCNot>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},
+    };
+  }
+};
+
+/**
+ * The SWAP gate. Exchanges two qubits; Schmidt sum is (II + XX + YY + ZZ) / 2.
+ */
+template <typename fp_type>
+struct GateSwap {
+  static constexpr GateKind kind = kGateSwap;
+  static constexpr char name[] = "sw";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateSwap>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
+      {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}},
+      {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}},
+      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
+    };
+  }
+};
+
+/**
+ * The ISWAP gate; Schmidt sum is (II + i * XX + i * YY + ZZ) / 2.
+ */
+template <typename fp_type>
+struct GateIS {
+  static constexpr GateKind kind = kGateIS;
+  static constexpr char name[] = "is";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateIS>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 1, 0, 0,
+                         0, 0, 0, 1, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
+      {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}},
+      {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}},
+      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
+    };
+  }
+};
+
+/**
+ * The fermionic simulation (FSim) gate family: all two-qubit interactions
+ * that preserve excitations, up to single-qubit rotations and global phase.
+ * SchmidtDecomp() is valid for either sign of sin(theta).
+ */
+template <typename fp_type>
+struct GateFS {
+  static constexpr GateKind kind = kGateFS;
+  static constexpr char name[] = "fs";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(
+      unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) {
+    if (phi < 0) {  // a negative phi is replaced by phi + 2 * pi
+      phi += 2 * 3.141592653589793;
+    }
+
+    fp_type ct = std::cos(theta);
+    fp_type st = std::sin(theta);
+    fp_type cp = std::cos(phi);
+    fp_type sp = std::sin(phi);
+
+    return CreateGate<GateQSim<fp_type>, GateFS>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, ct, 0, 0, -st, 0, 0,
+                         0, 0, 0, -st, ct, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(
+      fp_type theta, fp_type phi) {
+    fp_type ct = std::cos(theta);
+    fp_type st = std::sin(theta);
+
+    fp_type cp2 = std::cos(0.5 * phi);
+    fp_type sp2 = std::sin(0.5 * phi);
+    fp_type cp4 = std::cos(0.25 * phi);
+    fp_type sp4 = std::sin(0.25 * phi);
+
+    fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct));
+    fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct));
+
+    fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct);
+    fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct);
+
+    fp_type c0 = is2 * a0 * std::cos(p0);
+    fp_type s0 = is2 * a0 * std::sin(p0);
+
+    fp_type c1 = is2 * a1 * std::cos(p1);
+    fp_type s1 = is2 * a1 * std::sin(p1);
+    // sqrt(st) is NaN for st < 0, so use |st| and carry the sign in sg2.
+    fp_type st2 = 0.5 * std::sqrt(std::abs(st));
+    fp_type sg2 = st < 0 ? -st2 : st2;
+    fp_type a = cp4 * c0 - sp4 * s0;
+    fp_type b = cp4 * s0 + sp4 * c0;
+    fp_type c = cp4 * c0 + sp4 * s0;
+    fp_type d = cp4 * s0 - sp4 * c0;
+
+    fp_type e = cp4 * c1 - sp4 * s1;
+    fp_type f = cp4 * s1 + sp4 * c1;
+    fp_type g = -(cp4 * c1 + sp4 * s1);
+    fp_type h = -(cp4 * s1 - sp4 * c1);
+
+    return schmidt_decomp_type<fp_type>{
+      {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}},
+      {{0, 0, st2, -sg2, st2, -sg2, 0, 0}, {0, 0, st2, -sg2, st2, -sg2, 0, 0}},
+      {{0, 0, -sg2, -st2, sg2, st2, 0, 0}, {0, 0, -sg2, -st2, sg2, st2, 0, 0}},
+      {{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}},
+    };
+  }
+};
+
+/**
+ * The controlled phase gate, diag(1, 1, 1, exp(-i * phi)); generalizes GateCZ.
+ */
+template <typename fp_type>
+struct GateCP {
+  static constexpr GateKind kind = kGateCP;
+  static constexpr char name[] = "cp";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(
+      unsigned time, unsigned q0, unsigned q1, fp_type phi) {
+    const fp_type re = std::cos(phi);
+    const fp_type im = std::sin(phi);
+
+    return CreateGate<GateQSim<fp_type>, GateCP>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, re, -im}, {phi});
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
+    const fp_type re = std::cos(phi);
+    const fp_type im = std::sin(phi);
+
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, re, -im}},
+    };
+  }
+};
+
+/**
+ * A two-qubit gate defined by its matrix; Create() passes qubits as {q1, q0}.
+ */
+template <typename fp_type>
+struct GateMatrix2 {
+  static constexpr GateKind kind = kGateMatrix2;
+  static constexpr char name[] = "mat2";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  template <typename M = Matrix<fp_type>>
+  static GateQSim<fp_type> Create(
+      unsigned time, unsigned q0, unsigned q1, M&& m) {
+    return CreateGate<GateQSim<fp_type>, GateMatrix2>(time, {q1, q0},
+                                                      std::forward<M>(m));
+  }
+
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
+    // Not implemented: phi is ignored and the result is always empty.
+    return schmidt_decomp_type<fp_type>{};
+  }
+};
+
+template <typename fp_type>
+inline schmidt_decomp_type<fp_type> GetSchmidtDecomp(
+    GateKind kind, const std::vector<fp_type>& params) {
+  switch (kind) {
+  case kGateId2:
+    return GateId2<fp_type>::SchmidtDecomp();
+  case kGateCZ:
+    return GateCZ<fp_type>::SchmidtDecomp();
+  case kGateCNot:
+    return GateCNot<fp_type>::SchmidtDecomp();
+  case kGateSwap:
+    return GateSwap<fp_type>::SchmidtDecomp();
+  case kGateIS:
+    return GateIS<fp_type>::SchmidtDecomp();
+  case kGateFS:
+    return GateFS<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case kGateCP:
+    return GateCP<fp_type>::SchmidtDecomp(params[0]);
+  default:
+    // Any other kind (single-qubit, mat2, ...): empty Schmidt decomposition.
+    return schmidt_decomp_type<fp_type>{};
+  }
+}
+
+}  // namespace qsim
+
+#endif  // GATES_QSIM_H_
diff --git a/tpls/qsim/hybrid.h b/tpls/qsim/hybrid.h
new file mode 100644
index 0000000..44fad5b
--- /dev/null
+++ b/tpls/qsim/hybrid.h
@@ -0,0 +1,612 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef HYBRID_H_
+#define HYBRID_H_
+
+#include <algorithm>
+#include <array>
+#include <complex>
+#include <vector>
+
+#include "gate.h"
+#include "gate_appl.h"
+
+namespace qsim {
+
+/**
+ * Hybrid Feynman-Schrodinger simulator.
+ */
+template <typename IO, typename GateT,
+          template <typename, typename> class FuserT, typename For>
+struct HybridSimulator final {
+ public:
+  using Gate = GateT;
+  using GateKind = typename Gate::GateKind;
+  using fp_type = typename Gate::fp_type;
+
+ private:
+  // Note that one can use "struct GateHybrid : public Gate {" in C++17.
+  struct GateHybrid {
+    using GateKind = HybridSimulator::GateKind;
+    using fp_type = HybridSimulator::fp_type;
+
+    GateKind kind;
+    unsigned time;
+    std::vector<unsigned> qubits;
+    std::vector<unsigned> controlled_by;
+    uint64_t cmask;
+    std::vector<fp_type> params;
+    Matrix<fp_type> matrix;
+    bool unfusible;
+    bool swapped;
+
+    const Gate* parent;
+    unsigned id;
+  };
+
+  struct GateX {
+    GateHybrid* decomposed0;
+    GateHybrid* decomposed1;
+    schmidt_decomp_type<fp_type> schmidt_decomp;
+    unsigned schmidt_bits;
+    unsigned swapped;
+  };
+
+ public:
+  using Fuser = FuserT<IO, GateHybrid>;
+  using GateFused = typename Fuser::GateFused;
+
+  /**
+   * Contextual data for hybrid simulation.
+   */
+  struct HybridData {
+    /**
+     * List of gates on the "0" side of the cut.
+     */
+    std::vector<GateHybrid> gates0;
+    /**
+     * List of gates on the "1" side of the cut.
+     */
+    std::vector<GateHybrid> gates1;
+    /**
+     * List of gates on the cut.
+     */
+    std::vector<GateX> gatexs;
+    /**
+     * Global qubit index to local qubit index map.
+     */
+    std::vector<unsigned> qubit_map;
+    /**
+     * Number of qubits on the "0" side of the cut.
+     */
+    unsigned num_qubits0;
+    /**
+     * Number of qubits on the "1" side of the cut.
+     */
+    unsigned num_qubits1;
+    /**
+     * Number of gates on the cut.
+     */
+    unsigned num_gatexs;
+  };
+
+  /**
+   * User-specified parameters for gate fusion and hybrid simulation.
+   */
+  struct Parameter : public Fuser::Parameter {
+    /**
+     * Fixed bitstring indicating values to assign to Schmidt decomposition
+     * indices of prefix gates.
+     */
+    uint64_t prefix;
+    /**
+     * Number of gates on the cut that are part of the prefix. Indices of these
+     * gates are assigned the value indicated by `prefix`.
+     */
+    unsigned num_prefix_gatexs;
+    /**
+     * Number of gates on the cut that are part of the root. All gates that are
+     * not part of the prefix or root are part of the suffix.
+     */
+    unsigned num_root_gatexs;
+    unsigned num_threads;
+  };
+
+  template <typename... Args>
+  explicit HybridSimulator(Args&&... args) : for_(args...) {}
+
+  /**
+   * Splits the lattice into two parts, using Schmidt decomposition for gates
+   * on the cut.
+   * @param parts Lattice sections to be simulated.
+   * @param gates List of all gates in the circuit.
+   * @param hd Output data with split parts.
+   * @return True if the splitting done successfully; false otherwise.
+   */
+  static bool SplitLattice(const std::vector<unsigned>& parts,
+                           const std::vector<Gate>& gates, HybridData& hd) {
+    hd.num_gatexs = 0;
+    hd.num_qubits0 = 0;
+    hd.num_qubits1 = 0;
+
+    hd.gates0.reserve(gates.size());
+    hd.gates1.reserve(gates.size());
+    hd.qubit_map.reserve(parts.size());
+
+    unsigned count0 = 0;
+    unsigned count1 = 0;
+
+    // Global qubit index to local qubit index map.
+    for (std::size_t i = 0; i < parts.size(); ++i) {
+      parts[i] == 0 ? ++hd.num_qubits0 : ++hd.num_qubits1;
+      hd.qubit_map.push_back(parts[i] == 0 ? count0++ : count1++);
+    }
+
+    // Split the lattice.
+    for (const auto& gate : gates) {
+      if (gate.kind == gate::kMeasurement) {
+        IO::errorf("measurement gates are not suported by qsimh.\n");
+        return false;
+      }
+
+      if (gate.controlled_by.size() > 0) {
+        IO::errorf("controlled gates are not suported by qsimh.\n");
+        return false;
+      }
+
+      switch (gate.qubits.size()) {
+      case 1:  // Single qubit gates.
+        switch (parts[gate.qubits[0]]) {
+        case 0:
+          hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time,
+            {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix,
+            false, false, nullptr, 0});
+          break;
+        case 1:
+          hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time,
+            {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix,
+            false, false, nullptr, 0});
+          break;
+        }
+        break;
+      case 2:  // Two qubit gates.
+        {
+          switch ((parts[gate.qubits[1]] << 1) | parts[gate.qubits[0]]) {
+          case 0:  // Both qubits in part 0.
+            hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time,
+              {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]},
+              {}, 0, gate.params, gate.matrix, false, gate.swapped,
+              nullptr, 0});
+            break;
+          case 1:  // Gate on the cut, qubit 0 in part 1, qubit 1 in part 0.
+            hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
+              {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {},
+              true, gate.swapped, &gate, hd.num_gatexs});
+            hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
+              {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {},
+              true, gate.swapped, &gate, hd.num_gatexs});
+
+            ++hd.num_gatexs;
+            break;
+          case 2:  // Gate on the cut, qubit 0 in part 0, qubit 1 in part 1.
+            hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
+              {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {},
+              true, gate.swapped, &gate, hd.num_gatexs});
+            hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
+              {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {},
+              true, gate.swapped, &gate, hd.num_gatexs});
+
+            ++hd.num_gatexs;
+            break;
+          case 3:  // Both qubits in part 1.
+            hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time,
+              {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]},
+              {}, 0, gate.params, gate.matrix, false, gate.swapped,
+              nullptr, 0});
+            break;
+          }
+        }
+        break;
+      default:
+        IO::errorf("multi-qubit gates are not suported by qsimh.\n");
+        return false;
+      }
+    }
+
+    auto compare = [](const GateHybrid& l, const GateHybrid& r) -> bool {
+      return l.time < r.time || (l.time == r.time &&
+          (l.parent < r.parent || (l.parent == r.parent && l.id < r.id)));
+    };
+
+    // Sort gates.
+    std::sort(hd.gates0.begin(), hd.gates0.end(), compare);
+    std::sort(hd.gates1.begin(), hd.gates1.end(), compare);
+
+    hd.gatexs.reserve(hd.num_gatexs);
+
+    // Get Schmidt matrices.
+    for (auto& gate0 : hd.gates0) {
+      if (gate0.parent != nullptr) {
+        auto d = GetSchmidtDecomp(gate0.parent->kind, gate0.parent->params);
+        if (d.size() == 0) {
+          IO::errorf("no Schmidt decomposition for gate kind %u.\n",
+                     gate0.parent->kind);
+          return false;
+        }
+
+        unsigned schmidt_bits = SchmidtBits(d.size());
+        if (schmidt_bits > 2) {
+          IO::errorf("Schmidt rank is too large for gate kind %u.\n",
+                     gate0.parent->kind);
+          return false;
+        }
+
+        unsigned swapped = parts[gate0.parent->qubits[0]];
+        if (gate0.parent->swapped) swapped = 1 - swapped;
+        hd.gatexs.emplace_back(GateX{&gate0, nullptr, std::move(d),
+                                     schmidt_bits, swapped});
+      }
+    }
+
+    unsigned count = 0;
+    for (auto& gate1 : hd.gates1) {
+      if (gate1.parent != nullptr) {
+        hd.gatexs[count++].decomposed1 = &gate1;
+      }
+    }
+
+    for (auto& gatex : hd.gatexs) {
+      if (gatex.schmidt_decomp.size() == 1) {
+        FillSchmidtMatrices(0, gatex);
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Runs the hybrid simulator on a sectioned lattice.
+   * @param param Options for parallelism and logging. Also specifies the size
+   *   of the 'prefix' and 'root' sections of the lattice.
+   * @param factory Object to create simulators and state spaces.
+   * @param hd Container object for gates on the boundary between lattice
+   *   sections.
+   * @param parts Lattice sections to be simulated.
+   * @param fgates0 List of gates from one section of the lattice.
+   * @param fgates1 List of gates from the other section of the lattice.
+   * @param bitstrings List of output states to simulate, as bitstrings.
+   * @param results Output vector of amplitudes. After a successful run, this
+   *   will be populated with amplitudes for each state in 'bitstrings'.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Factory, typename Results>
+  bool Run(const Parameter& param, const Factory& factory,
+           HybridData& hd, const std::vector<unsigned>& parts,
+           const std::vector<GateFused>& fgates0,
+           const std::vector<GateFused>& fgates1,
+           const std::vector<uint64_t>& bitstrings, Results& results) const {
+    using Simulator = typename Factory::Simulator;
+    using StateSpace = typename Simulator::StateSpace;
+    using State = typename StateSpace::State;
+
+    unsigned num_p_gates = param.num_prefix_gatexs;
+    unsigned num_pr_gates = num_p_gates + param.num_root_gatexs;
+
+    auto bits = CountSchmidtBits(param, hd.gatexs);
+
+    uint64_t rmax = uint64_t{1} << bits.num_r_bits;  // Number of root paths.
+    uint64_t smax = uint64_t{1} << bits.num_s_bits;  // Number of suffix paths.
+
+    auto loc0 = CheckpointLocations(param, fgates0);
+    auto loc1 = CheckpointLocations(param, fgates1);
+
+    struct Index {
+      unsigned i0;  // Amplitude index into the part 0 state.
+      unsigned i1;  // Amplitude index into the part 1 state.
+    };
+
+    std::vector<Index> indices;
+    indices.reserve(bitstrings.size());
+
+    // Bitstring indices for part 0 and part 1. TODO: optimize.
+    for (const auto& bitstring : bitstrings) {
+      Index index{0, 0};
+
+      for (uint64_t i = 0; i < hd.qubit_map.size(); ++i) {
+        unsigned m = ((bitstring >> i) & 1) << hd.qubit_map[i];
+        parts[i] ? index.i1 |= m : index.i0 |= m;
+      }
+
+      indices.push_back(index);
+    }
+
+    StateSpace state_space = factory.CreateStateSpace();
+
+    State* rstate0;  // Set by CreateStates to the last pair it created.
+    State* rstate1;
+
+    State state0p = state_space.Null();
+    State state1p = state_space.Null();
+    State state0r = state_space.Null();
+    State state1r = state_space.Null();
+    State state0s = state_space.Null();
+    State state1s = state_space.Null();
+
+    // Create states. The r and s pairs exist only if there are several paths.
+
+    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, true,
+                      state0p, state1p, rstate0, rstate1)) {
+      return false;
+    }
+
+    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, rmax > 1,
+                      state0r, state1r, rstate0, rstate1)) {
+      return false;
+    }
+
+    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, smax > 1,
+                      state0s, state1s, rstate0, rstate1)) {
+      return false;
+    }
+
+    state_space.SetStateZero(state0p);
+    state_space.SetStateZero(state1p);
+
+    Simulator simulator = factory.CreateSimulator();
+
+    std::vector<unsigned> prev(hd.num_gatexs, unsigned(-1));
+
+    // param.prefix encodes the prefix path.
+    unsigned gatex_index = SetSchmidtMatrices(
+        0, num_p_gates, param.prefix, prev, hd.gatexs);
+
+    if (gatex_index == 0) {
+      // Apply gates before the first checkpoint.
+      ApplyGates(fgates0, 0, loc0[0], simulator, state0p);
+      ApplyGates(fgates1, 0, loc1[0], simulator, state1p);
+    } else {  // Cast so the argument matches %llu on every platform.
+      IO::errorf("invalid prefix %llu for prefix gate index %u.\n",
+                 (unsigned long long) param.prefix, gatex_index - 1);
+      return false;
+    }
+
+    // Branch over root gates on the cut. r encodes the root path.
+    for (uint64_t r = 0; r < rmax; ++r) {
+      if (rmax > 1) {
+        state_space.Copy(state0p, state0r);
+        state_space.Copy(state1p, state1r);
+      }
+
+      if (SetSchmidtMatrices(num_p_gates, num_pr_gates,
+                             r, prev, hd.gatexs) == 0) {
+        // Apply gates before the second checkpoint.
+        ApplyGates(fgates0, loc0[0], loc0[1], simulator, state0r);
+        ApplyGates(fgates1, loc1[0], loc1[1], simulator, state1r);
+      } else {
+        continue;  // Invalid root path: skip it.
+      }
+
+      // Branch over suffix gates on the cut. s encodes the suffix path.
+      for (uint64_t s = 0; s < smax; ++s) {
+        if (smax > 1) {
+          state_space.Copy(rmax > 1 ? state0r : state0p, state0s);
+          state_space.Copy(rmax > 1 ? state1r : state1p, state1s);
+        }
+
+        if (SetSchmidtMatrices(num_pr_gates, hd.num_gatexs,
+                               s, prev, hd.gatexs) == 0) {
+          // Apply the rest of the gates.
+          ApplyGates(fgates0, loc0[1], fgates0.size(), simulator, state0s);
+          ApplyGates(fgates1, loc1[1], fgates1.size(), simulator, state1s);
+        } else {
+          continue;  // Invalid suffix path: skip it.
+        }
+
+        auto f = [](unsigned n, unsigned m, uint64_t i,
+                    const StateSpace& state_space,
+                    const State& state0, const State& state1,
+                    const std::vector<Index>& indices, Results& results) {
+          // TODO: make it faster for the CUDA state space.
+          auto a0 = state_space.GetAmpl(state0, indices[i].i0);
+          auto a1 = state_space.GetAmpl(state1, indices[i].i1);
+          results[i] += a0 * a1;
+        };
+
+        // Collect results: add this path's amplitude product per bitstring.
+        for_.Run(results.size(), f,
+                 state_space, *rstate0, *rstate1, indices, results);
+      }
+    }
+
+    return true;
+  }
+
+ private:
+  /**
+   * Identifies when to save "checkpoints" of the simulation state. These allow
+   * runs with different cut-index values to reuse parts of the simulation.
+   * @param param Options for parallelism and logging. Also specifies the size
+   *   of the 'prefix' and 'root' sections of the lattice.
+   * @param fgates Fused gates for which to find checkpoint locations.
+   * @return A pair of numbers specifying how many fused gates to apply before
+   *   the first and second checkpoints, respectively.
+   */
+  static std::array<unsigned, 2> CheckpointLocations(
+      const Parameter& param, const std::vector<GateFused>& fgates) {
+    std::array<unsigned, 2> loc{0, 0};
+
+    unsigned num_decomposed = 0;  // Fused gates holding a decomposed gate.
+    unsigned num_p_gates = param.num_prefix_gatexs;
+    unsigned num_pr_gates = num_p_gates + param.num_root_gatexs;
+
+    for (std::size_t i = 0; i < fgates.size(); ++i) {
+      for (auto gate: fgates[i].gates) {
+        if (gate->parent != nullptr) {  // Counted as a decomposed gate.
+          ++num_decomposed;
+          // There should be only one decomposed gate in fused gate.
+          break;
+        }
+      }
+
+      if (num_decomposed <= num_p_gates) {
+        loc[0] = i + 1;  // Fused gate i is still before the first checkpoint.
+      }
+
+      if (num_decomposed <= num_pr_gates) {
+        loc[1] = i + 1;  // Fused gate i is still before the second checkpoint.
+      }
+    }
+
+    return loc;
+  }
+
+  struct Bits {
+    unsigned num_p_bits;  // Total Schmidt bits of the prefix gatexs.
+    unsigned num_r_bits;  // Total Schmidt bits of the root gatexs.
+    unsigned num_s_bits;  // Total Schmidt bits of the suffix gatexs.
+  };
+
+  static Bits CountSchmidtBits(
+      const Parameter& param, const std::vector<GateX>& gatexs) {
+    Bits bits{0, 0, 0};  // Per-section sums of gatex.schmidt_bits.
+
+    unsigned num_p_gates = param.num_prefix_gatexs;
+    unsigned num_pr_gates = num_p_gates + param.num_root_gatexs;
+
+    for (std::size_t i = 0; i < gatexs.size(); ++i) {
+      const auto& gatex = gatexs[i];
+      if (i < num_p_gates) {  // The first num_p_gates gatexs are prefix.
+        bits.num_p_bits += gatex.schmidt_bits;
+      } else if (i < num_pr_gates) {  // The next ones are root.
+        bits.num_r_bits += gatex.schmidt_bits;
+      } else {  // Everything after that is suffix.
+        bits.num_s_bits += gatex.schmidt_bits;
+      }
+    }
+
+    return bits;
+  }
+
+  static unsigned SetSchmidtMatrices(std::size_t i0, std::size_t i1,
+                                     uint64_t path,
+                                     std::vector<unsigned>& prev_k,
+                                     std::vector<GateX>& gatexs) {
+    unsigned shift_length = 0;  // Bit offset of the current gatex in path.
+    // Select one Schmidt term per gatex in [i0, i1), as encoded in path.
+    for (std::size_t i = i0; i < i1; ++i) {
+      const auto& gatex = gatexs[i];
+
+      if (gatex.schmidt_bits == 0) {
+        // Continue if gatex has Schmidt rank 1.
+        continue;
+      }
+
+      unsigned k = (path >> shift_length) & ((1 << gatex.schmidt_bits) - 1);
+      shift_length += gatex.schmidt_bits;
+
+      if (k != prev_k[i]) {  // Refill only when the selected term changed.
+        if (k >= gatex.schmidt_decomp.size()) {
+          // Invalid path. Returns gatex index plus one to report error in case
+          // of invalid prefix.
+          return i + 1;
+        }
+
+        FillSchmidtMatrices(k, gatex);
+
+        prev_k[i] = k;
+      }
+    }
+
+    return 0;  // Zero means every gatex in the range was set.
+  }
+
+  static void FillSchmidtMatrices(unsigned k, const GateX& gatex) {
+    unsigned part0 = gatex.swapped;  // Index of the term copied to decomposed0.
+    unsigned part1 = 1 - part0;  // The other index, copied to decomposed1.
+    {  // Overwrite decomposed0's matrix with schmidt_decomp[k][part0].
+      gatex.decomposed0->matrix.resize(gatex.schmidt_decomp[k][part0].size());
+      auto begin = gatex.schmidt_decomp[k][part0].begin();
+      auto end = gatex.schmidt_decomp[k][part0].end();
+      std::copy(begin, end, gatex.decomposed0->matrix.begin());
+    }
+    {  // Overwrite decomposed1's matrix with schmidt_decomp[k][part1].
+      gatex.decomposed1->matrix.resize(gatex.schmidt_decomp[k][part1].size());
+      auto begin = gatex.schmidt_decomp[k][part1].begin();
+      auto end = gatex.schmidt_decomp[k][part1].end();
+      std::copy(begin, end, gatex.decomposed1->matrix.begin());
+    }
+  }
+
+  template <typename Simulator>
+  static void ApplyGates(const std::vector<GateFused>& gates,
+                         std::size_t i0, std::size_t i1,
+                         const Simulator& simulator,
+                         typename Simulator::State& state) {
+    for (std::size_t i = i0; i < i1; ++i) {  // Applies gates[i0, i1) in order.
+      if (gates[i].matrix.size() > 0) {  // Fused matrix is already present.
+        ApplyFusedGate(simulator, gates[i], state);
+      } else {  // Compute the fused matrix on a local copy of the gate.
+        auto gate = gates[i];
+        CalculateFusedMatrix(gate);
+        ApplyFusedGate(simulator, gate, state);
+      }
+    }
+  }
+
+  static unsigned SchmidtBits(unsigned size) {  // ceil(log2(size)) for 1..4.
+    switch (size) {
+    case 1:
+      return 0;
+    case 2:
+      return 1;
+    case 3:
+      return 2;
+    case 4:
+      return 2;
+    default:
+      // Not supported; 42 is an out-of-range marker (valid results are 0..2).
+      return 42;
+    }
+  }
+
+  template <typename StateSpace>
+  static bool CreateStates(unsigned num_qubits0,unsigned num_qubits1,
+                           const StateSpace& state_space, bool create,
+                           typename StateSpace::State& state0,
+                           typename StateSpace::State& state1,
+                           typename StateSpace::State* (&rstate0),
+                           typename StateSpace::State* (&rstate1)) {
+    if (create) {  // Otherwise leave every output untouched and succeed.
+      state0 = state_space.Create(num_qubits0);
+      state1 = state_space.Create(num_qubits1);
+
+      if (state_space.IsNull(state0) || state_space.IsNull(state1)) {
+        IO::errorf("not enough memory: is the number of qubits too large?\n");
+        return false;
+      }
+      // Point the result pointers at the newly created pair.
+      rstate0 = &state0;
+      rstate1 = &state1;
+    }
+
+    return true;
+  }
+
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // HYBRID_H_
diff --git a/tpls/qsim/io.h b/tpls/qsim/io.h
new file mode 100644
index 0000000..3b26c7c
--- /dev/null
+++ b/tpls/qsim/io.h
@@ -0,0 +1,44 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IO_H_
+#define IO_H_
+
+#include <cstdarg>
+#include <cstdio>
+
+namespace qsim {
+
+/**
+ * Controller for output logs: printf-style helpers for stderr and stdout.
+ */
+struct IO {
+  static void errorf(const char* format, ...) {  // printf-style, to stderr.
+    va_list args;
+    va_start(args, format);
+    vfprintf(stderr, format, args);
+    va_end(args);
+  }
+
+  static void messagef(const char* format, ...) {  // printf-style, to stdout.
+    va_list args;
+    va_start(args, format);
+    vprintf(format, args);
+    va_end(args);
+  }
+};
+
+}  // namespace qsim
+
+#endif  // IO_H_
diff --git a/tpls/qsim/io_file.h b/tpls/qsim/io_file.h
new file mode 100644
index 0000000..3cfac12
--- /dev/null
+++ b/tpls/qsim/io_file.h
@@ -0,0 +1,71 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef IO_FILE_H_
+#define IO_FILE_H_
+
+#include <cstdint>
+#include <fstream>
+#include <string>
+
+#include "io.h"
+
+namespace qsim {
+
+/**
+ * Controller for output logs with helpers for opening and writing files.
+ */
+struct IOFile : public IO {
+  static std::ifstream StreamFromFile(const std::string& file) {
+    std::ifstream fs;
+    fs.open(file);
+    if (!fs) {  // Only logs; the caller must still test the returned stream.
+      errorf("cannot open %s for reading.\n", file.c_str());
+    }
+    return fs;
+  }
+
+  static void CloseStream(std::ifstream& fs) {
+    fs.close();
+  }
+
+  static bool WriteToFile(  // Writes the bytes of content; see overload below.
+      const std::string& file, const std::string& content) {
+    return WriteToFile(file, content.data(), content.size());
+  }
+
+  static bool WriteToFile(  // Returns true only if all bytes reached the file.
+      const std::string& file, const void* data, uint64_t size) {
+    auto fs = std::fstream(file, std::ios::out | std::ios::binary);
+
+    if (!fs) {
+      errorf("cannot open %s for writing.\n", file.c_str());
+      return false;
+    } else {
+      fs.write((const char*) data, size);
+      // Close (and flush) before checking, so buffered-write failures count.
+      fs.close();
+
+      if (!fs) {
+        errorf("cannot write to %s.\n", file.c_str());
+        return false;
+      }
+    }
+
+    return true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // IO_FILE_H_
diff --git a/tpls/qsim/matrix.h b/tpls/qsim/matrix.h
new file mode 100644
index 0000000..a3c2640
--- /dev/null
+++ b/tpls/qsim/matrix.h
@@ -0,0 +1,296 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MATRIX_H_
+#define MATRIX_H_
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "bits.h"
+
+namespace qsim {
+
+/**
+ * Gate matrix type. A matrix is stored as a flat vector of interleaved
+ * (real, imaginary) pairs: real(m[i][j]) is vector[2 * (n * i + j)] and
+ * imag(m[i][j]) is vector[2 * (n * i + j) + 1], where n is the number of rows
+ * or columns (n = 2^q, where q is the number of gate qubits).
+ */
+template <typename fp_type>
+using Matrix = std::vector<fp_type>;
+
+/**
+ * Sets all matrix elements to zero.
+ * @param m Matrix to be cleared; its size is left unchanged.
+ */
+template <typename fp_type>
+inline void MatrixClear(Matrix<fp_type>& m) {
+  for (unsigned i = 0; i < m.size(); ++i) {
+    m[i] = 0;
+  }
+}
+
+/**
+ * Sets an identity matrix.
+ * @param n Number of matrix rows (columns).
+ * @param m Output identity matrix; resized to 2 * n * n elements.
+ */
+template <typename fp_type>
+inline void MatrixIdentity(unsigned n, Matrix<fp_type>& m) {
+  m.resize(2 * n * n);
+
+  MatrixClear(m);
+
+  for (unsigned i = 0; i < n; ++i) {
+    m[2 * (n * i + i)] = 1;  // Real part of the diagonal element (i, i).
+  }
+}
+
+/**
+ * Multiplies two gate matrices of equal size: m2 = m1 m2.
+ * @param q Number of gate qubits. The number of matrix rows (columns) is 2^q.
+ * @param m1 Matrix m1.
+ * @param m2 Input matrix m2. Output product of matrices m2 = m1 m2.
+ */
+template <typename fp_type1, typename fp_type2>
+inline void MatrixMultiply(
+    unsigned q, const Matrix<fp_type1>& m1, Matrix<fp_type2>& m2) {
+  Matrix<fp_type2> mt = m2;  // Copy of the input m2; m2 itself is overwritten.
+  unsigned n = unsigned{1} << q;
+
+  for (unsigned i = 0; i < n; ++i) {
+    for (unsigned j = 0; j < n; ++j) {
+      fp_type2 re = 0;
+      fp_type2 im = 0;
+
+      for (unsigned k = 0; k < n; ++k) {
+        fp_type2 r1 = m1[2 * (n * i + k)];
+        fp_type2 i1 = m1[2 * (n * i + k) + 1];
+        fp_type2 r2 = mt[2 * (n * k + j)];
+        fp_type2 i2 = mt[2 * (n * k + j) + 1];
+
+        re += r1 * r2 - i1 * i2;  // Real part of m1[i][k] * mt[k][j].
+        im += r1 * i2 + i1 * r2;  // Imaginary part of m1[i][k] * mt[k][j].
+      }
+
+      m2[2 * (n * i + j)] = re;
+      m2[2 * (n * i + j) + 1] = im;
+    }
+  }
+}
+
+/**
+ * Multiplies two gate matrices of equal size: m2 = m1^\dagger m2.
+ * @param q Number of gate qubits. The number of matrix rows (columns) is 2^q.
+ * @param m1 Matrix m1 (used as its conjugate transpose).
+ * @param m2 Input matrix m2. Output product of matrices m2 = m1^\dagger m2.
+ */
+template <typename fp_type1, typename fp_type2>
+inline void MatrixDaggerMultiply(
+    unsigned q, const Matrix<fp_type1>& m1, Matrix<fp_type2>& m2) {
+  Matrix<fp_type2> mt = m2;  // Copy of the input m2; m2 itself is overwritten.
+  unsigned n = unsigned{1} << q;
+
+  for (unsigned i = 0; i < n; ++i) {
+    for (unsigned j = 0; j < n; ++j) {
+      fp_type2 re = 0;
+      fp_type2 im = 0;
+
+      for (unsigned k = 0; k < n; ++k) {
+        fp_type2 r1 = m1[2 * (n * k + i)];  // Transposed access: m1[k][i].
+        fp_type2 i1 = m1[2 * (n * k + i) + 1];
+        fp_type2 r2 = mt[2 * (n * k + j)];
+        fp_type2 i2 = mt[2 * (n * k + j) + 1];
+
+        re += r1 * r2 + i1 * i2;  // Real part of conj(m1[k][i]) * mt[k][j].
+        im += r1 * i2 - i1 * r2;  // Imaginary part of the same product.
+      }
+
+      m2[2 * (n * i + j)] = re;
+      m2[2 * (n * i + j) + 1] = im;
+    }
+  }
+}
+
+/**
+ * Multiplies two gate matrices: m2 = m1 m2. The size of m1 should not exceed
+ *   the size of m2.
+ * @param mask1 Qubit mask that specifies the subset of qubits m1 acts on.
+ * @param q1 Number of qubits of m1. The number of rows (columns) is 2^q1.
+ * @param m1 Matrix m1.
+ * @param q2 Number of qubits of m2. The number of rows (columns) is 2^q2.
+ * @param m2 Input matrix m2. Output product of matrices m2 = m1 m2.
+ */
+template <typename fp_type1, typename fp_type2>
+inline void MatrixMultiply(unsigned mask1,
+                           unsigned q1, const Matrix<fp_type1>& m1,
+                           unsigned q2, Matrix<fp_type2>& m2) {
+  if (q1 == q2) {  // Equal sizes: defer to the plain product (mask1 unused).
+    MatrixMultiply(q1, m1, m2);
+  } else {
+    Matrix<fp_type2> mt = m2;  // Copy of the input m2; m2 is overwritten.
+    unsigned n1 = unsigned{1} << q1;
+    unsigned n2 = unsigned{1} << q2;
+
+    for (unsigned i = 0; i < n2; ++i) {
+      unsigned si = bits::CompressBits(i, q2, mask1);
+
+      for (unsigned j = 0; j < n2; ++j) {
+        fp_type2 re = 0;
+        fp_type2 im = 0;
+
+        for (unsigned k = 0; k < n1; ++k) {
+          unsigned ek = bits::ExpandBits(k, q2, mask1) + (i & ~mask1);
+
+          fp_type2 r1 = m1[2 * (n1 * si + k)];
+          fp_type2 i1 = m1[2 * (n1 * si + k) + 1];
+          fp_type2 r2 = mt[2 * (n2 * ek + j)];
+          fp_type2 i2 = mt[2 * (n2 * ek + j) + 1];
+
+          re += r1 * r2 - i1 * i2;
+          im += r1 * i2 + i1 * r2;
+        }
+
+        m2[2 * (n2 * i + j)] = re;
+        m2[2 * (n2 * i + j) + 1] = im;
+      }
+    }
+  }
+}
+
+/**
+ * Multiplies a matrix by a real scalar value, in place.
+ * @param c Scalar value.
+ * @param m Input matrix to be multiplied. Output matrix.
+ */
+template <typename fp_type1, typename fp_type2>
+inline void MatrixScalarMultiply(fp_type1 c, Matrix<fp_type2>& m) {
+  for (unsigned i = 0; i < m.size(); ++i) {
+    m[i] *= c;
+  }
+}
+
+/**
+ * Multiplies a matrix by a complex scalar value, in place.
+ * @param re Real part of scalar value.
+ * @param im Imaginary part of scalar value.
+ * @param m Input matrix to be multiplied. Output matrix.
+ */
+template <typename fp_type1, typename fp_type2>
+inline void MatrixScalarMultiply(
+    fp_type1 re, fp_type1 im, Matrix<fp_type2>& m) {
+  for (unsigned i = 0; i < m.size() / 2; ++i) {  // One (re, im) pair per step.
+    fp_type2 re0 = m[2 * i + 0];
+    fp_type2 im0 = m[2 * i + 1];
+    m[2 * i + 0] = re * re0 - im * im0;
+    m[2 * i + 1] = re * im0 + im * re0;
+  }
+}
+
+/**
+ * Replaces a matrix with its conjugate transpose (dagger), in place.
+ * @param n Number of matrix rows (columns).
+ * @param m Input matrix. Output matrix.
+ */
+template <typename fp_type>
+inline void MatrixDagger(unsigned n, Matrix<fp_type>& m) {
+  for (unsigned i = 0; i < n; ++i) {
+    m[2 * (n * i + i) + 1] = -m[2 * (n * i + i) + 1];  // Conjugate diagonal.
+
+    for (unsigned j = i + 1; j < n; ++j) {  // Swap and conjugate (i,j), (j,i).
+      std::swap(m[2 * (n * i + j)], m[2 * (n * j + i)]);
+      fp_type t = m[2 * (n * i + j) + 1];
+      m[2 * (n * i + j) + 1] = -m[2 * (n * j + i) + 1];
+      m[2 * (n * j + i) + 1] = -t;
+    }
+  }
+}
+
+/**
+ * Gets a permutation to rearrange qubits from "normal" order to "gate"
+ *   order. Qubits are ordered in increasing order for "normal" order.
+ *   Qubits are ordered arbitrarily for "gate" order. Returns an empty vector
+ *   if the qubits are in "normal" order.
+ * @param qubits Qubit indices in "gate" order.
+ * @return Permutation as a vector.
+ */
+inline std::vector<unsigned> NormalToGateOrderPermutation(
+    const std::vector<unsigned>& qubits) {
+  std::vector<unsigned> perm;
+
+  bool normal_order = true;
+
+  for (std::size_t i = 1; i < qubits.size(); ++i) {
+    if (qubits[i] < qubits[i - 1]) {  // Found a descending neighbouring pair.
+      normal_order = false;
+      break;
+    }
+  }
+
+  if (!normal_order) {
+    struct QI {
+      unsigned q;  // Qubit index.
+      unsigned index;  // Position of that qubit in 'qubits'.
+    };
+
+    std::vector<QI> qis;
+    qis.reserve(qubits.size());
+
+    for (std::size_t i = 0; i < qubits.size(); ++i) {
+      qis.push_back({qubits[i], unsigned(i)});
+    }
+
+    std::sort(qis.begin(), qis.end(), [](const QI& l, const QI& r) {
+                                        return l.q < r.q;
+                                      });
+
+    perm.reserve(qubits.size());
+    // perm[i] is the gate-order position of the i-th smallest qubit.
+    for (std::size_t i = 0; i < qubits.size(); ++i) {
+      perm.push_back(qis[i].index);
+    }
+  }
+
+  return perm;
+}
+
+/**
+ * Shuffles the gate matrix elements to get the matrix that acts on qubits
+ *   that are in "normal" order (in increasing order).
+ * @param perm Permutation to rearrange qubits from "normal" to "gate" order.
+ * @param q Number of gate qubits. The number of matrix rows (columns) is 2^q.
+ * @param m Input matrix. Output shuffled matrix.
+ */
+template <typename fp_type>
+inline void MatrixShuffle(const std::vector<unsigned>& perm,
+                          unsigned q, Matrix<fp_type>& m) {
+  Matrix<fp_type> mt = m;  // Copy of the input; m itself is overwritten.
+  unsigned n = unsigned{1} << q;
+
+  for (unsigned i = 0; i < n; ++i) {
+    unsigned pi = bits::PermuteBits(i, q, perm);
+    for (unsigned j = 0; j < n; ++j) {
+      unsigned pj = bits::PermuteBits(j, q, perm);
+
+      m[2 * (n * i + j)] = mt[2 * (n * pi + pj)];
+      m[2 * (n * i + j) + 1] = mt[2 * (n * pi + pj) + 1];
+    }
+  }
+}
+
+}  // namespace qsim
+
+#endif  // MATRIX_H_
diff --git a/tpls/qsim/mps_simulator.h b/tpls/qsim/mps_simulator.h
new file mode 100644
index 0000000..8fbcbae
--- /dev/null
+++ b/tpls/qsim/mps_simulator.h
@@ -0,0 +1,246 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MPS_SIMULATOR_H_
+#define MPS_SIMULATOR_H_
+
+// For templates will take care of parallelization.
+#define EIGEN_DONT_PARALLELIZE 1
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "../eigen/Eigen/Dense"
+#include "../eigen/Eigen/SVD"
+#include "mps_statespace.h"
+
+namespace qsim {
+
+namespace mps {
+
+/**
+ * Truncated Matrix Product State (MPS) circuit simulator built on Eigen.
+ */
+template <typename For, typename FP = float>
+class MPSSimulator final {
+ public:
+  using MPSStateSpace_ = MPSStateSpace<For, FP>;
+  using State = typename MPSStateSpace_::MPS;
+  using fp_type = typename MPSStateSpace_::fp_type;
+
+  using Complex = std::complex<fp_type>;
+  using Matrix =
+      Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using ConstMatrixMap = Eigen::Map<const Matrix>;
+  using MatrixMap = Eigen::Map<Matrix>;
+
+  using OneQubitMatrix = Eigen::Matrix<Complex, 2, 2, Eigen::RowMajor>;
+  using ConstOneQubitMap = Eigen::Map<const OneQubitMatrix>;
+
+  // Note: args only construct for_, which this class does not otherwise use.
+  template <typename... ForArgs>
+  explicit MPSSimulator(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a one- or two-qubit gate; any other gate size is a silent no-op.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs, const fp_type* matrix,
+                 State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    switch (qs.size()) {
+      case 1:
+        ApplyGate1(qs, matrix, state);
+        break;
+      case 2:
+        ApplyGate2(qs, matrix, state);
+        break;
+      // case 3:
+      //   ApplyGate3(qs, matrix, state);
+      //   break;
+      // case 4:
+      //   ApplyGate4(qs, matrix, state);
+      //   break;
+      // case 5:
+      //   ApplyGate5(qs, matrix, state);
+      //   break;
+      // case 6:
+      //   ApplyGate6(qs, matrix, state);
+      //   break;
+      default:
+        // Not implemented: the gate is ignored and the state is unchanged.
+        break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate. Not implemented yet: currently a no-op.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cmask Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cmask,
+                           const fp_type* matrix, State& state) const {
+    // TODO.
+  }
+
+  /**
+   * Computes the expectation value of an operator. Not implemented yet:
+   * currently always returns the placeholder value (-10, -10).
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return Placeholder value (-10, -10) until this method is implemented.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // TODO.
+    return std::complex<double>(-10., -10.);
+  }
+
+ private:
+  void ApplyGate1(const std::vector<unsigned>& qs, const fp_type* matrix,
+                  State& state) const {
+    if (qs[0] == state.num_qubits() - 1) {  // Gate acts on the last qubit.
+      Apply1Right(qs, matrix, state);
+    } else {
+      Apply1LeftOrInterior(qs, matrix, state);
+    }
+  }
+
+  void Apply1LeftOrInterior(const std::vector<unsigned>& qs,
+                            const fp_type* matrix, State& state) const {
+    fp_type* raw_state = state.get();
+    const auto bond_dim = state.bond_dim();
+    const auto l_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]);
+    const auto r_offset = MPSStateSpace_::GetBlockOffset(state, qs[0] + 1);
+    const auto end = MPSStateSpace_::Size(state);
+    ConstOneQubitMap gate_matrix((Complex*) matrix);
+    MatrixMap scratch_block((Complex*)(raw_state + end), 2, bond_dim);
+    // Apply the gate to each [2, bond_dim] sub-block, via the scratch area.
+    for (unsigned block_sep = l_offset; block_sep < r_offset;
+         block_sep += 4 * bond_dim) {
+      fp_type* cur_block = raw_state + block_sep;
+      ConstMatrixMap mps_block((Complex*) cur_block, 2, bond_dim);
+      scratch_block.noalias() = gate_matrix * mps_block;
+      memcpy(cur_block, raw_state + end, sizeof(fp_type) * bond_dim * 4);
+    }
+  }
+
+  void Apply1Right(const std::vector<unsigned>& qs, const fp_type* matrix,
+                   State& state) const {
+    fp_type* raw_state = state.get();
+    const auto bond_dim = state.bond_dim();
+    const auto offset = MPSStateSpace_::GetBlockOffset(state, qs[0]);
+    const auto end = MPSStateSpace_::Size(state);
+    ConstOneQubitMap gate_matrix((Complex*) matrix);
+    ConstMatrixMap mps_block((Complex*)(raw_state + offset), bond_dim, 2);
+    MatrixMap scratch_block((Complex*)(raw_state + end), bond_dim, 2);
+    scratch_block.noalias() = mps_block * gate_matrix.transpose();
+    memcpy(raw_state + offset, raw_state + end, sizeof(fp_type) * bond_dim * 4);
+  }
+
+  void ApplyGate2(const std::vector<unsigned>& qs, const fp_type* matrix,
+                  State& state) const {
+    // TODO: micro-benchmark this function and improve performance.
+    const auto bond_dim = state.bond_dim();
+    const auto num_qubits = state.num_qubits();
+    fp_type* raw_state = state.get();
+
+    const auto i_dim = (qs[0] == 0) ? 1 : bond_dim;
+    const auto j_dim = 2;
+    const auto k_dim = bond_dim;
+    const auto l_dim = 2;
+    const auto m_dim = (qs[1] == num_qubits - 1) ? 1 : bond_dim;
+
+    const auto b_0_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]);
+    const auto b_1_offset = MPSStateSpace_::GetBlockOffset(state, qs[1]);
+    const auto end = MPSStateSpace_::Size(state);
+
+    MatrixMap block_0((Complex*)(raw_state + b_0_offset), i_dim * j_dim, k_dim);
+    MatrixMap block_1((Complex*)(raw_state + b_1_offset), k_dim, l_dim * m_dim);
+
+    // Merge both blocks into scratch space.
+    MatrixMap scratch_c((Complex*)(raw_state + end), i_dim * j_dim, l_dim * m_dim);
+    scratch_c.noalias() = block_0 * block_1;
+
+    // Transpose inner dims in-place.
+    MatrixMap scratch_c_t((Complex*)(raw_state + end), i_dim * j_dim * l_dim, m_dim);
+    for (unsigned i = 0; i < i_dim * j_dim * l_dim; i += 4) {
+      scratch_c_t.row(i + 1).swap(scratch_c_t.row(i + 2));
+    }
+
+    // Transpose gate matrix and place in 3rd (last) scratch block.
+    const auto scratch3_offset = end + 8 * bond_dim * bond_dim;
+    ConstMatrixMap gate_matrix((Complex*) matrix, 4, 4);
+    MatrixMap gate_matrix_transpose((Complex*)(raw_state + scratch3_offset), 4, 4);
+    gate_matrix_transpose = gate_matrix.transpose();
+    gate_matrix_transpose.col(1).swap(gate_matrix_transpose.col(2));
+
+    // Contract gate and merged block tensors, placing result in B0B1.
+    for (unsigned i = 0; i < i_dim; ++i) {
+      fp_type* src_block = raw_state + end + i * 8 * m_dim;
+      fp_type* dest_block = raw_state + b_0_offset + i * 8 * m_dim;
+      MatrixMap block_b0b1((Complex*) dest_block, 4, m_dim);
+      ConstMatrixMap scratch_c_i((Complex*) src_block, 4, m_dim);
+      // [i, np, m] = [np, lj] * [i, lj, m]
+      block_b0b1.noalias() = gate_matrix_transpose * scratch_c_i;
+    }
+
+    // SVD B0B1.
+    MatrixMap full_b0b1((Complex*)(raw_state + b_0_offset), 2 * i_dim, 2 * m_dim);
+    Eigen::BDCSVD<Matrix> svd(full_b0b1, Eigen::ComputeThinU | Eigen::ComputeThinV);
+    const auto p = std::min(2 * i_dim, 2 * m_dim);
+
+    // Place U in scratch to truncate and then B0.
+    MatrixMap svd_u((Complex*)(raw_state + end), 2 * i_dim, p);
+    svd_u.noalias() = svd.matrixU();
+    block_0.fill(Complex(0, 0));
+    const auto keep_cols = (svd_u.cols() > bond_dim) ? bond_dim : svd_u.cols();
+    block_0.block(0, 0, svd_u.rows(), keep_cols).noalias() =
+        svd_u(Eigen::indexing::all, Eigen::seq(0, keep_cols - 1));
+
+    // Place row product of S V into scratch to truncate and then B1.
+    MatrixMap svd_v((Complex*)(raw_state + end), p, 2 * m_dim);
+    MatrixMap s_vector((Complex*)(raw_state + end + 8 * bond_dim * bond_dim), p, 1);
+    svd_v.noalias() = svd.matrixV().adjoint();
+    s_vector.noalias() = svd.singularValues();
+    block_1.fill(Complex(0, 0));
+    const auto keep_rows = (svd_v.rows() > bond_dim) ? bond_dim : svd_v.rows();
+    const auto row_seq = Eigen::seq(0, keep_rows - 1);
+    for (unsigned i = 0; i < keep_rows; ++i) {
+      svd_v.row(i) *= s_vector(i);
+    }
+    block_1.block(0, 0, keep_rows, svd_v.cols()).noalias() =
+        svd_v(row_seq, Eigen::indexing::all);
+  }
+
+  For for_;
+};
+
+}  // namespace mps
+}  // namespace qsim
+
+#endif  // MPS_SIMULATOR_H_
diff --git a/tpls/qsim/mps_statespace.h b/tpls/qsim/mps_statespace.h
new file mode 100644
index 0000000..9b3acf3
--- /dev/null
+++ b/tpls/qsim/mps_statespace.h
@@ -0,0 +1,597 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MPS_STATESPACE_H_
+#define MPS_STATESPACE_H_
+
+// For templates will take care of parallelization.
+#define EIGEN_DONT_PARALLELIZE 1
+
+#ifdef _WIN32
+#include <malloc.h>
+#endif
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <random>
+
+#include "../eigen/Eigen/Dense"
+#include "../eigen/unsupported/Eigen/CXX11/Tensor"
+
+namespace qsim {
+
+namespace mps {
+
+namespace detail {
+// Deleter that intentionally does nothing with the pointer it is given.
+inline void do_not_free(void*) {}
+// Releases ptr with _aligned_free on Windows and with ::free elsewhere.
+inline void free(void* ptr) {
+#ifdef _WIN32
+  _aligned_free(ptr);
+#else
+  ::free(ptr);
+#endif
+}
+
+}  // namespace detail
+
+/**
+ * Class containing context and routines for fixed bond dimension
+ * truncated Matrix Product State (MPS) simulation.
+ */
+template <typename For, typename FP = float>
+class MPSStateSpace {
+ private:
+ public:
+  using fp_type = FP;
+  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
+
+  using Complex = std::complex<fp_type>;
+  using Matrix =
+      Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using ConstMatrixMap = Eigen::Map<const Matrix>;
+  using MatrixMap = Eigen::Map<Matrix>;
+
+  // Holds the MPS buffer pointer with its qubit count and bond dimension.
+  // Tensor shapes: [2, bond_dim], [bond_dim, 2, bond_dim], ... , [bond_dim, 2].
+  class MPS {
+   public:
+    MPS() = delete;
+
+    MPS(Pointer&& ptr, unsigned num_qubits, unsigned bond_dim)
+        : ptr_(std::move(ptr)), num_qubits_(num_qubits), bond_dim_(bond_dim) {}
+
+    fp_type* get() { return ptr_.get(); }
+
+    const fp_type* get() const { return ptr_.get(); }
+    // Gives up the buffer pointer and resets num_qubits to 0.
+    fp_type* release() {
+      num_qubits_ = 0;
+      return ptr_.release();
+    }
+
+    unsigned num_qubits() const { return num_qubits_; }
+
+    unsigned bond_dim() const { return bond_dim_; }
+
+   private:
+    Pointer ptr_;
+    unsigned num_qubits_;
+    unsigned bond_dim_;
+  };
+
+  // Constructs for_ from args. Note: no method of this class uses for_.
+  template <typename... ForArgs>
+  MPSStateSpace(ForArgs&&... args) : for_(args...) {}
+
+  // Allocates a 64-byte aligned MPS. Requires num_qubits >= 2, bond_dim >= 2.
+  // On failure the result holds nullptr with num_qubits and bond_dim of 0.
+  static MPS Create(unsigned num_qubits, unsigned bond_dim) {
+    // Widen before multiplying so large sizes do not wrap in 32-bit math.
+    const uint64_t bd = bond_dim;
+    const uint64_t end_sizes = 2 * 4 * bd;
+    // (num_qubits - 2) internal blocks plus three extra "internal style"
+    // blocks past the end, used as scratch space for gate application.
+    const uint64_t internal_sizes = 4 * bd * bd * (uint64_t{num_qubits} + 1);
+    const auto size = sizeof(fp_type) * (end_sizes + internal_sizes);
+#ifdef _WIN32
+    Pointer ptr{(fp_type*)_aligned_malloc(size, 64), &detail::free};
+    const bool allocated = ptr.get() != nullptr;
+    return MPS{std::move(ptr), allocated ? num_qubits : 0,
+               allocated ? bond_dim : 0};
+#else
+    void* p = nullptr;
+    if (posix_memalign(&p, 64, size) == 0) {
+      return MPS{Pointer{(fp_type*)p, &detail::free}, num_qubits, bond_dim};
+    }
+    return MPS{Pointer{nullptr, &detail::free}, 0, 0};
+#endif
+  }
+
+  // Number of fp_type elements in the MPS, not counting scratch space.
+  static unsigned Size(const MPS& state) {
+    const auto bd = state.bond_dim();
+    return 2 * 4 * bd + 4 * bd * bd * (state.num_qubits() - 2);
+  }
+  // Size(state) expressed in bytes.
+  static unsigned RawSize(const MPS& state) {
+    return sizeof(fp_type) * Size(state);
+  }
+
+  // Returns the offset, in fp_type elements, from the start of the MPS
+  // buffer to the beginning of block i.
+  static unsigned GetBlockOffset(const MPS& state, unsigned i) {
+    if (i == 0) return 0;
+    const auto bd = state.bond_dim();
+    return 4 * bd * (1 + bd * (i - 1));
+  }
+
+  // Copies the non-scratch contents of src into dest. Returns false, and
+  // copies nothing, if the qubit counts or bond dimensions differ.
+  static bool Copy(const MPS& src, MPS& dest) {
+    const bool same_shape = src.num_qubits() == dest.num_qubits() &&
+                            src.bond_dim() == dest.bond_dim();
+    if (!same_shape) {
+      return false;
+    }
+    memcpy(dest.get(), src.get(), RawSize(src));
+    return true;
+  }
+
+  // Sets the MPS to the |0> state: zero-fills it, then writes 1 at offset 0
+  // and at the first element of every following block.
+  static void SetStateZero(MPS& state) {
+    const auto bd = state.bond_dim();
+    const auto count = Size(state);
+    fp_type* raw = state.get();
+    memset(raw, 0, sizeof(fp_type) * count);
+    raw[0] = 1.0;
+    for (unsigned i = 4 * bd; i < count; i += 4 * bd * bd) raw[i] = 1.0;
+  }
+
+  // Returns the real part of InnerProduct(state1, state2).
+  // Requires: state1.bond_dim() == state2.bond_dim() &&
+  //           state1.num_qubits() == state2.num_qubits()
+  static fp_type RealInnerProduct(MPS& state1, MPS& state2) {
+    return InnerProduct(state1, state2).real();
+  }
+
+  // Computes <state1 | state2 > for two equal sized MPS. Intermediate
+  // results are written to the scratch space past the end of state1.
+  // Requires: equal bond_dim() and num_qubits() for state1 and state2.
+  static std::complex<fp_type> InnerProduct(MPS& state1, MPS& state2) {
+    const auto num_qubits = state1.num_qubits();
+    const auto bond_dim = state1.bond_dim();
+    const auto end = Size(state1);
+    auto offset = 0;
+    fp_type* state1_raw = state1.get();
+    fp_type* state2_raw = state2.get();
+
+    // Contract leftmost blocks together, store result in state1 scratch.
+    ConstMatrixMap top((Complex*)state2_raw, 2, bond_dim);
+    ConstMatrixMap bot((Complex*)state1_raw, 2, bond_dim);
+    MatrixMap partial_contract((Complex*)(state1_raw + end), bond_dim,
+                               bond_dim);
+    MatrixMap partial_contract2(
+        (Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), bond_dim,
+        2 * bond_dim);
+    partial_contract.noalias() = top.adjoint() * bot;
+
+    // Contract all internal blocks together.
+    for (unsigned i = 1; i < num_qubits - 1; ++i) {
+      offset = GetBlockOffset(state1, i);
+
+      // Reshape: placement new rebinds the map onto the same scratch memory.
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim),
+                    bond_dim, 2 * bond_dim);
+
+      // Merge bot into left boundary merged tensor.
+      new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim,
+                                2 * bond_dim);
+      partial_contract2.noalias() = partial_contract * bot;
+
+      // Reshape the same memory to [2 * bond_dim, bond_dim].
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim),
+                    2 * bond_dim, bond_dim);
+
+      // Merge top into partial_contract2.
+      new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim,
+                                bond_dim);
+      partial_contract.noalias() = top.adjoint() * partial_contract2;
+    }
+
+    // Contract rightmost bottom block.
+    offset = GetBlockOffset(state1, num_qubits - 1);
+    new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, 2);
+    new (&partial_contract2) MatrixMap(
+        (Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), bond_dim, 2);
+    partial_contract2.noalias() = partial_contract * bot;
+
+    // Contract rightmost top block; the result is a single 1x1 entry.
+    new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, 1);
+    new (&partial_contract) MatrixMap((Complex*)(state1_raw + end), 1, 1);
+    new (&partial_contract2)
+        MatrixMap((Complex*)(state1_raw + end + 4 * bond_dim * bond_dim),
+                  2 * bond_dim, 1);
+    partial_contract.noalias() = top.adjoint() * partial_contract2;
+
+    return partial_contract(0, 0);
+  }
+
+  // Compute the 2x2 1-RDM of state on index and write it to rdm (4 Complex
+  // values). Overwrites scratch and the scratch space past the end of state.
+  static void ReduceDensityMatrix(MPS& state, MPS& scratch, int index,
+                                  fp_type* rdm) {
+    const auto num_qubits = state.num_qubits();
+    const auto bond_dim = state.bond_dim();
+    const auto end = Size(state);
+    const bool last_index = (index == num_qubits - 1);
+    const auto right_dim = (last_index ? 1 : bond_dim);
+    auto offset = 0;
+    fp_type* state_raw = state.get();
+    fp_type* scratch_raw = scratch.get();
+    fp_type* state_raw_workspace = state_raw + end + 2 * bond_dim * bond_dim;
+    fp_type* scratch_raw_workspace =
+        scratch_raw + end + 2 * bond_dim * bond_dim;
+
+    Copy(state, scratch);
+
+    // Contract leftmost blocks together, store result in state scratch.
+    ConstMatrixMap top((Complex*)scratch_raw, 2, bond_dim);
+    ConstMatrixMap bot((Complex*)state_raw, 2, bond_dim);
+    MatrixMap partial_contract((Complex*)(state_raw + end), bond_dim, bond_dim);
+    MatrixMap partial_contract2((Complex*)(state_raw_workspace), bond_dim,
+                                2 * bond_dim);
+
+    partial_contract.setZero();
+    partial_contract(0, 0) = 1;
+    if (index > 0) {
+      partial_contract.noalias() = top.adjoint() * bot;
+    }
+
+    // Contract all internal blocks to the left of index together.
+    for (unsigned i = 1; i < index; ++i) {
+      offset = GetBlockOffset(state, i);
+
+      // Reshape: placement new rebinds the map onto the same memory.
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state_raw_workspace), bond_dim, 2 * bond_dim);
+
+      // Merge bot into left boundary merged tensor.
+      new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
+                                2 * bond_dim);
+      partial_contract2.noalias() = partial_contract * bot;
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state_raw_workspace), 2 * bond_dim, bond_dim);
+
+      // Merge top into partial_contract2.
+      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
+                                bond_dim);
+      partial_contract.noalias() = top.adjoint() * partial_contract2;
+    }
+
+    // The [bond_dim, bond_dim] block in state_raw now contains the contraction
+    // up to, but not including index.
+    // Contract rightmost blocks.
+    offset = GetBlockOffset(state, num_qubits - 1);
+    new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, 2);
+    new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2);
+    new (&partial_contract)
+        MatrixMap((Complex*)(scratch_raw + end), bond_dim, bond_dim);
+    new (&partial_contract2)
+        MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim);
+
+    partial_contract.setZero();
+    partial_contract(0, 0) = 1;
+    if (index < num_qubits - 1) {
+      partial_contract.noalias() = top * bot.adjoint();
+    }
+
+    for (unsigned i = num_qubits - 2; i > index; --i) {
+      offset = GetBlockOffset(state, i);
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(scratch_raw_workspace), 2 * bond_dim, bond_dim);
+
+      // Merge bot into right boundary merged tensor.
+      new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), 2 * bond_dim,
+                                bond_dim);
+      partial_contract2.noalias() = bot * partial_contract.adjoint();
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim);
+
+      // Merge top into partial_contract2.
+      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim,
+                                2 * bond_dim);
+      // [bd, bd] = [bd, 2bd] @ [bd, 2bd]
+      partial_contract.noalias() = top * partial_contract2.adjoint();
+    }
+
+    // The [bond_dim, bond_dim] block in scratch_raw now contains the
+    // contraction down from the end, but not including the index. Begin final
+    // contraction steps.
+
+    // Get leftmost [bd, bd] contraction and contract with top.
+
+    offset = GetBlockOffset(state, index);
+    new (&partial_contract)
+        MatrixMap((Complex*)(state_raw + end), bond_dim, bond_dim);
+    new (&top)
+        ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2 * right_dim);
+    new (&partial_contract2)
+        MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * right_dim);
+    partial_contract2.noalias() = partial_contract * top.conjugate();
+    // copy the bottom contraction scratch_raw to state_raw to save space.
+    memcpy(state_raw + end, scratch_raw + end,
+           bond_dim * bond_dim * 2 * sizeof(fp_type));
+
+    // Contract top again for correct shape; goes straight to rdm if last.
+    fp_type* contract3_target = (last_index ? rdm : scratch_raw);
+    MatrixMap partial_contract3((Complex*)contract3_target, 2 * right_dim,
+                                2 * right_dim);
+    partial_contract3.noalias() = top.transpose() * partial_contract2;
+
+    // If we are contracting the last index, all the needed transforms are done.
+    if (last_index) {
+      return;
+    }
+
+    // Conduct final tensor contraction operations. Cannot be easily compiled to
+    // matmul.
+    const Eigen::TensorMap<const Eigen::Tensor<Complex, 4, Eigen::RowMajor>>
+        t_4d((Complex*)scratch_raw, 2, bond_dim, 2, bond_dim);
+    const Eigen::TensorMap<const Eigen::Tensor<Complex, 2, Eigen::RowMajor>>
+        t_2d((Complex*)(state_raw + end), bond_dim, bond_dim);
+
+    const Eigen::array<Eigen::IndexPair<int>, 2> product_dims = {
+        Eigen::IndexPair<int>(1, 0),
+        Eigen::IndexPair<int>(3, 1),
+    };
+    Eigen::TensorMap<Eigen::Tensor<Complex, 2, Eigen::RowMajor>> out(
+        (Complex*)rdm, 2, 2);
+    out = t_4d.contract(t_2d, product_dims);
+  }
+
+  // Draw a single bitstring sample from state, appending one bit per qubit
+  // to sample. Uses scratch and scratch2 as working space.
+  static void SampleOnce(MPS& state, MPS& scratch, MPS& scratch2,
+                         std::mt19937* random_gen, std::vector<bool>* sample) {
+    // TODO: carefully profile with perf and optimize temp storage
+    //  locations for cache friendliness.
+    const auto bond_dim = state.bond_dim();
+    const auto num_qubits = state.num_qubits();
+    const auto end = Size(state);
+    const auto left_frontier_offset = GetBlockOffset(state, num_qubits + 1);
+    fp_type* state_raw = state.get();
+    fp_type* scratch_raw = scratch.get();
+    fp_type* scratch2_raw = scratch2.get();
+    // 2x2 Complex RDM; rdm[0] and rdm[6] are the real parts of its diagonal.
+    fp_type rdm[8];
+
+    sample->reserve(num_qubits);
+    Copy(state, scratch);
+    Copy(state, scratch2);
+
+    // Store prefix contractions in scratch2.
+    auto offset = GetBlockOffset(state, num_qubits - 1);
+    ConstMatrixMap top((Complex*)(state_raw + offset), bond_dim, 2);
+    ConstMatrixMap bot((Complex*)(scratch_raw + offset), bond_dim, 2);
+    MatrixMap partial_contract((Complex*)(scratch2_raw + offset), bond_dim,
+                               bond_dim);
+    MatrixMap partial_contract2((Complex*)(scratch_raw + end), bond_dim,
+                                2 * bond_dim);
+    partial_contract.noalias() = top * bot.adjoint();
+
+    for (unsigned i = num_qubits - 2; i > 0; --i) {
+      offset = GetBlockOffset(state, i);
+      // Reshape: placement new rebinds the map onto the same memory.
+      new (&partial_contract2)
+          MatrixMap((Complex*)(scratch_raw + end), 2 * bond_dim, bond_dim);
+
+      // Merge bot into right boundary merged tensor.
+      new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
+                                bond_dim);
+      partial_contract2.noalias() = bot * partial_contract.adjoint();
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(scratch_raw + end), bond_dim, 2 * bond_dim);
+
+      // Merge top into partial_contract2.
+      new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
+                                2 * bond_dim);
+
+      // merge into partial_contract -> scratch2_raw.
+      new (&partial_contract)
+          MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
+      partial_contract.noalias() = top * partial_contract2.adjoint();
+    }
+
+    // Compute RDM-0 and draw first sample.
+    offset = GetBlockOffset(state, 1);
+    new (&top) ConstMatrixMap((Complex*)state_raw, 2, bond_dim);
+    new (&bot) ConstMatrixMap((Complex*)scratch_raw, 2, bond_dim);
+    new (&partial_contract)
+        MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
+    new (&partial_contract2)
+        MatrixMap((Complex*)(scratch_raw + end), 2, bond_dim);
+
+    partial_contract2.noalias() = bot * partial_contract.adjoint();
+
+    new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2);
+    partial_contract.noalias() = top * partial_contract2.adjoint();
+    auto p0 = rdm[0] / (rdm[0] + rdm[6]);
+    std::bernoulli_distribution distribution(1 - p0);
+    auto bit_val = distribution(*random_gen);
+    sample->push_back(bit_val);
+
+    // collapse state.
+    new (&partial_contract) MatrixMap((Complex*)scratch_raw, 2, bond_dim);
+    partial_contract.row(!bit_val).setZero();
+
+    // Prepare left contraction frontier.
+    new (&partial_contract2) MatrixMap(
+        (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
+    partial_contract2.noalias() =
+        partial_contract.transpose() * partial_contract.conjugate();
+
+    // Compute RDM-i and draw internal tensor samples.
+    for (unsigned i = 1; i < num_qubits - 1; i++) {
+      // Get leftmost [bd, bd] contraction and contract with top.
+      offset = GetBlockOffset(state, i);
+      new (&partial_contract) MatrixMap(
+          (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
+      new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
+                                2 * bond_dim);
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim);
+      partial_contract2.noalias() = partial_contract * top.conjugate();
+
+      // Contract top again for correct shape.
+      MatrixMap partial_contract3((Complex*)(scratch_raw + end), 2 * bond_dim,
+                                  2 * bond_dim);
+      partial_contract3.noalias() = top.transpose() * partial_contract2;
+
+      // Conduct final tensor contraction operations. Cannot be easily compiled
+      // to matmul. Perf reports shows only ~6% of runtime spent here on large
+      // systems.
+      offset = GetBlockOffset(state, i + 1);
+      const Eigen::TensorMap<const Eigen::Tensor<Complex, 4, Eigen::RowMajor>>
+          t_4d((Complex*)(scratch_raw + end), 2, bond_dim, 2, bond_dim);
+      const Eigen::TensorMap<const Eigen::Tensor<Complex, 2, Eigen::RowMajor>>
+          t_2d((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
+
+      const Eigen::array<Eigen::IndexPair<int>, 2> product_dims = {
+          Eigen::IndexPair<int>(1, 0),
+          Eigen::IndexPair<int>(3, 1),
+      };
+      Eigen::TensorMap<Eigen::Tensor<Complex, 2, Eigen::RowMajor>> out(
+          (Complex*)rdm, 2, 2);
+      out = t_4d.contract(t_2d, product_dims);
+
+      // Sample bit and collapse state.
+      p0 = rdm[0] / (rdm[0] + rdm[6]);
+      distribution = std::bernoulli_distribution(1 - p0);
+      bit_val = distribution(*random_gen);
+
+      sample->push_back(bit_val);
+      offset = GetBlockOffset(state, i);
+      new (&partial_contract)
+          MatrixMap((Complex*)(scratch_raw + offset), bond_dim * 2, bond_dim);
+      for (unsigned j = !bit_val; j < 2 * bond_dim; j += 2) {
+        partial_contract.row(j).setZero();
+      }
+
+      // Update left frontier.
+      new (&partial_contract) MatrixMap(
+          (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim);
+
+      // Merge bot into left boundary merged tensor.
+      new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim,
+                                2 * bond_dim);
+      partial_contract2.noalias() = partial_contract * bot.conjugate();
+
+      // reshape:
+      new (&partial_contract2)
+          MatrixMap((Complex*)(state_raw + end), 2 * bond_dim, bond_dim);
+
+      // Merge top into partial_contract2.
+      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
+                                bond_dim);
+      partial_contract.noalias() = top.transpose() * partial_contract2;
+    }
+
+    // Compute RDM-(n-1) and sample.
+    offset = GetBlockOffset(state, num_qubits - 1);
+    new (&partial_contract2)
+        MatrixMap((Complex*)(state_raw + end), bond_dim, 2);
+
+    new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2);
+    partial_contract2.noalias() = partial_contract * top.conjugate();
+    new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2);
+    partial_contract.noalias() = top.transpose() * partial_contract2;
+
+    p0 = rdm[0] / (rdm[0] + rdm[6]);
+    distribution = std::bernoulli_distribution(1 - p0);
+    bit_val = distribution(*random_gen);
+    sample->push_back(bit_val);
+  }
+
+  // Draw num_samples bitstring samples from state into (*results)[i], growing
+  // results if needed. Uses scratch and scratch2 as workspace.
+  static void Sample(MPS& state, MPS& scratch, MPS& scratch2,
+                     unsigned num_samples, unsigned seed,
+                     std::vector<std::vector<bool>>* results) {
+    std::mt19937 rand_source(seed);
+    if (results->size() < num_samples) results->resize(num_samples);
+    for (unsigned i = 0; i < num_samples; i++) {
+      SampleOnce(state, scratch, scratch2, &rand_source, &(*results)[i]);
+    }
+  }
+
+  // Testing only. Convert the MPS to a wavefunction under "normal" ordering.
+  // Requires: wf be allocated beforehand with bond_dim * 2 ^ num_qubits -1
+  // memory. wf also holds the intermediate products while they are built.
+  static void ToWaveFunction(MPS& state, fp_type* wf) {
+    const auto bond_dim = state.bond_dim();
+    const auto num_qubits = state.num_qubits();
+    fp_type* raw_state = state.get();
+
+    ConstMatrixMap accum = ConstMatrixMap((Complex*)(raw_state), 2, bond_dim);
+    ConstMatrixMap next_block = ConstMatrixMap(nullptr, 0, 0);
+    MatrixMap result2 = MatrixMap(nullptr, 0, 0);
+    auto offset = 0;
+    auto result2_size = 2;
+
+    for (unsigned i = 1; i < num_qubits - 1; i++) {
+      offset = GetBlockOffset(state, i);
+      // use of new does not trigger any expensive operations.
+      new (&next_block) ConstMatrixMap((Complex*)(raw_state + offset), bond_dim,
+                                       2 * bond_dim);
+      new (&result2) MatrixMap((Complex*)(wf), result2_size, 2 * bond_dim);
+
+      // No noalias(): result2 and accum can map the same memory (wf).
+      result2 = accum * next_block;
+      result2_size *= 2;
+      new (&accum) ConstMatrixMap((Complex*)(wf), result2_size, bond_dim);
+    }
+    offset = GetBlockOffset(state, num_qubits - 1);
+    new (&next_block)
+        ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, 2);
+    new (&result2) MatrixMap((Complex*)(wf), result2_size, 2);
+    result2 = accum * next_block;
+  }
+
+ protected:
+  For for_;
+};
+
+}  // namespace mps
+}  // namespace qsim
+
+#endif  // MPS_STATESPACE_H_
diff --git a/tpls/qsim/parfor.h b/tpls/qsim/parfor.h
new file mode 100644
index 0000000..8a3a4d6
--- /dev/null
+++ b/tpls/qsim/parfor.h
@@ -0,0 +1,123 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef PARFOR_H_
+#define PARFOR_H_
+
+#include <omp.h>
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+namespace qsim {
+
+/**
+ * Helper struct for executing for-loops in parallel across multiple threads.
+ */
+template <uint64_t MIN_SIZE>
+struct ParallelForT {
+  explicit ParallelForT(unsigned num_threads) : num_threads(num_threads) {}
+
+  // GetIndex0 and GetIndex1 return the bounds [i0, i1) of chunk thread_id
+  // when size items are split into num_threads chunks. They are useful for
+  // reusing partial sums obtained by RunReduceP.
+  uint64_t GetIndex0(uint64_t size, unsigned thread_id) const {
+    return size >= MIN_SIZE ? size * thread_id / num_threads : 0;
+  }
+
+  uint64_t GetIndex1(uint64_t size, unsigned thread_id) const {
+    return size >= MIN_SIZE ? size * (thread_id + 1) / num_threads : size;
+  }
+
+  // Calls func(n, m, i, args...) for every i in [0, size); n is the number
+  // of threads running and m is the id of the calling thread.
+  template <typename Function, typename... Args>
+  void Run(uint64_t size, Function&& func, Args&&... args) const {
+    if (num_threads > 1 && size >= MIN_SIZE) {
+      #pragma omp parallel num_threads(num_threads)
+      {
+        unsigned n = omp_get_num_threads();
+        unsigned m = omp_get_thread_num();
+        // OpenMP may start fewer than num_threads threads, so each thread
+        // strides over the chunks to leave none of them unprocessed.
+        for (unsigned c = m; c < num_threads; c += n) {
+          const uint64_t i1 = GetIndex1(size, c);
+          for (uint64_t i = GetIndex0(size, c); i < i1; ++i) {
+            func(n, m, i, args...);
+          }
+        }
+      }
+    } else {
+      for (uint64_t i = 0; i < size; ++i) {
+        func(1, 0, i, args...);
+      }
+    }
+  }
+
+  // Like Run, but folds the values returned by func with op, starting from
+  // 0. Element c of the result is the partial result of chunk c.
+  template <typename Function, typename Op, typename... Args>
+  std::vector<typename Op::result_type> RunReduceP(
+      uint64_t size, Function&& func, Op&& op, Args&&... args) const {
+    std::vector<typename Op::result_type> partial_results;
+
+    if (num_threads > 1 && size >= MIN_SIZE) {
+      partial_results.resize(num_threads, 0);
+
+      #pragma omp parallel num_threads(num_threads)
+      {
+        unsigned n = omp_get_num_threads();
+        unsigned m = omp_get_thread_num();
+        // Stride over chunks in case fewer than num_threads threads run.
+        for (unsigned c = m; c < num_threads; c += n) {
+          const uint64_t i1 = GetIndex1(size, c);
+          typename Op::result_type partial_result = 0;
+          for (uint64_t i = GetIndex0(size, c); i < i1; ++i) {
+            partial_result = op(partial_result, func(n, m, i, args...));
+          }
+          partial_results[c] = partial_result;
+        }
+      }
+    } else if (num_threads > 0) {
+      typename Op::result_type result = 0;
+      for (uint64_t i = 0; i < size; ++i) {
+        result = op(result, func(1, 0, i, args...));
+      }
+      partial_results.resize(1, result);
+    }
+
+    return partial_results;
+  }
+
+  // Folds all values returned by func with op, starting from 0.
+  template <typename Function, typename Op, typename... Args>
+  typename Op::result_type RunReduce(uint64_t size, Function&& func,
+                                     Op&& op, Args&&... args) const {
+    auto partial_results = RunReduceP(size, func, std::move(op), args...);
+    typename Op::result_type result = 0;
+    for (auto partial_result : partial_results) {
+      result = op(result, partial_result);
+    }
+    return result;
+  }
+
+  unsigned num_threads;
+};
+
+using ParallelFor = ParallelForT<1024>;
+
+}  // namespace qsim
+
+#endif  // PARFOR_H_
diff --git a/tpls/qsim/qtrajectory.h b/tpls/qsim/qtrajectory.h
new file mode 100644
index 0000000..1da6692
--- /dev/null
+++ b/tpls/qsim/qtrajectory.h
@@ -0,0 +1,435 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef QTRAJECTORY_H_
+#define QTRAJECTORY_H_
+
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <random>
+#include <vector>
+
+#include "circuit_noisy.h"
+#include "gate.h"
+#include "gate_appl.h"
+
+namespace qsim {
+
+/**
+ * Quantum trajectory simulator.
+ */
+template <typename IO, typename Gate,
+          template <typename, typename> class FuserT, typename Simulator,
+          typename RGen = std::mt19937>
+class QuantumTrajectorySimulator {
+ public:
+  using Fuser = FuserT<IO, const Gate*>;
+  using StateSpace = typename Simulator::StateSpace;
+  using State = typename Simulator::State;
+  using MeasurementResult = typename StateSpace::MeasurementResult;
+
+  /**
+   * User-specified parameters for the simulator.
+   */
+  struct Parameter : public Fuser::Parameter {
+    /**
+     * If true, collect statistics of sampled Kraus operator indices.
+     */
+    bool collect_kop_stat = false;
+    /**
+     * If true, collect statistics of measured bitstrings.
+     */
+    bool collect_mea_stat = false;
+    /**
+     * If true, normalize the state vector before performing measurements.
+     */
+    bool normalize_before_mea_gates = true;
+    /**
+     * If false, do not apply deferred operators after the main loop for
+     * the "primary" noise trajectory, that is the trajectory in which
+     * the primary (the first operators in their respective channels) Kraus
+     * operators are sampled for each channel and there are no measurements
+     * in the computational basis. This can be used to speed up simulations
+     * of circuits with weak noise and without measurements by reusing
+     * the primary trajectory results. There is an additional condition for
+     * RunBatch. In this case, the deferred operators after the main loop are
+     * still applied for the first occurrence of the primary trajectory.
+     * The primary Kraus operators should have the highest sampling
+     * probabilities to achieve the highest speedup.
+     *
+     * It is the client's responsibility to collect the primary trajectory
+     * results and to reuse them.
+     */
+    bool apply_last_deferred_ops = true;
+  };
+
+  /**
+   * Struct with statistics to populate by RunBatch and RunOnce methods.
+   */
+  struct Stat {
+    /**
+     * Sampled Kraus operator indices and/or measured bitstrings.
+     */
+    std::vector<uint64_t> samples;
+    /**
+     * True if the "primary" noise trajectory is sampled, false otherwise.
+     */
+    bool primary = false;
+  };
+
+  /**
+   * Runs the given noisy circuit performing repetitions. Each repetition is
+   * seeded by repetition ID.
+   * @param param Options for the quantum trajectory simulator.
+   * @param circuit The noisy circuit to be simulated.
+   * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions.
+   * @param state_space StateSpace object required to manipulate state vector.
+   * @param simulator Simulator object. Provides specific implementations for
+   *   applying gates.
+   * @param measure Function that performs measurements (in the sense of
+   *   computing expectation values, etc). This function should have three
+   *   required parameters [repetition ID (uint64_t), final state vector
+   *   (const State&), statistics of sampled Kraus operator indices and/or
+   *   measured bitstrings (const Stat&)] and any number of optional parameters.
+   * @param args Optional arguments for the 'measure' function.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename MeasurementFunc, typename... Args>
+  static bool RunBatch(const Parameter& param,
+                       const NoisyCircuit<Gate>& circuit,
+                       uint64_t r0, uint64_t r1, const StateSpace& state_space,
+                       const Simulator& simulator, MeasurementFunc&& measure,
+                       Args&&... args) {
+    return RunBatch(param, circuit.num_qubits, circuit.channels.begin(),
+                    circuit.channels.end(), r0, r1, state_space, simulator,
+                    measure, args...);
+  }
+
+  /**
+   * Runs the given noisy circuit performing repetitions. Each repetition is
+   * seeded by repetition ID.
+   * @param param Options for the quantum trajectory simulator.
+   * @param num_qubits The number of qubits acted on by the circuit.
+   * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit.
+   * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions.
+   * @param state_space StateSpace object required to manipulate state vector.
+   * @param simulator Simulator object. Provides specific implementations for
+   *   applying gates.
+   * @param measure Function that performs measurements (in the sense of
+   *   computing expectation values, etc). This function should have three
+   *   required parameters [repetition ID (uint64_t), final state vector
+   *   (const State&), statistics of sampled Kraus operator indices and/or
+   *   measured bitstrings (const Stat&)] and any number of optional parameters.
+   * @param args Optional arguments for the 'measure' function.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename MeasurementFunc, typename... Args>
+  static bool RunBatch(const Parameter& param, unsigned num_qubits,
+                       ncircuit_iterator<Gate> cbeg,
+                       ncircuit_iterator<Gate> cend,
+                       uint64_t r0, uint64_t r1, const StateSpace& state_space,
+                       const Simulator& simulator, MeasurementFunc&& measure,
+                       Args&&... args) {
+    std::vector<const Gate*> gates;
+    gates.reserve(4 * std::size_t(cend - cbeg));
+
+    State state = state_space.Null();
+
+    Stat stat;
+    bool had_primary_realization = false;
+
+    for (uint64_t r = r0; r < r1; ++r) {
+      if (!state_space.IsNull(state)) {
+        state_space.SetStateZero(state);
+      }
+
+      bool apply_last_deferred_ops =
+          param.apply_last_deferred_ops || !had_primary_realization;
+
+      if (!RunIteration(param, apply_last_deferred_ops, num_qubits, cbeg, cend,
+                        r, state_space, simulator, gates, state, stat)) {
+        return false;
+      }
+
+      if (stat.primary && !had_primary_realization) {
+        had_primary_realization = true;
+      }
+
+      measure(r, state, stat, args...);
+    }
+
+    return true;
+  }
+
+  /**
+   * Runs the given noisy circuit one time.
+   * @param param Options for the quantum trajectory simulator.
+   * @param circuit The noisy circuit to be simulated.
+   * @param r The repetition ID. The random number generator is seeded by 'r'.
+   * @param state_space StateSpace object required to manipulate state vector.
+   * @param simulator Simulator object. Provides specific implementations for
+   *   applying gates.
+   * @param state The state of the system, to be updated by this method.
+   * @param stat Statistics of sampled Kraus operator indices and/or measured
+   *   bitstrings, to be populated by this method.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  static bool RunOnce(const Parameter& param,
+                      const NoisyCircuit<Gate>& circuit, uint64_t r,
+                      const StateSpace& state_space, const Simulator& simulator,
+                      State& state, Stat& stat) {
+    return RunOnce(param, circuit.num_qubits, circuit.channels.begin(),
+                   circuit.channels.end(), r, state_space, simulator,
+                   state, stat);
+  }
+
+  /**
+   * Runs the given noisy circuit one time.
+   * @param param Options for the quantum trajectory simulator.
+   * @param num_qubits The number of qubits acted on by the circuit.
+   * @param cbeg, cend The range of channels [cbeg, cend) of the noisy circuit
+   *   to be simulated.
+   * @param r The repetition ID. The random number generator is seeded by 'r'.
+   * @param state_space StateSpace object required to manipulate state vector.
+   * @param simulator Simulator object. Provides specific implementations for
+   *   applying gates.
+   * @param state The state of the system, to be updated by this method.
+   * @param stat Statistics of sampled Kraus operator indices and/or measured
+   *   bitstrings, to be populated by this method.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  static bool RunOnce(const Parameter& param, unsigned num_qubits,
+                      ncircuit_iterator<Gate> cbeg,
+                      ncircuit_iterator<Gate> cend,
+                      uint64_t r, const StateSpace& state_space,
+                      const Simulator& simulator, State& state, Stat& stat) {
+    // Scratch list of deferred gates, sized at four entries per channel.
+    const std::size_t num_channels = std::size_t(cend - cbeg);
+
+    std::vector<const Gate*> deferred;
+    deferred.reserve(4 * num_channels);
+
+    // A single trajectory; its success flag is the result of this call.
+    return RunIteration(param, param.apply_last_deferred_ops, num_qubits, cbeg,
+                        cend, r, state_space, simulator, deferred, state, stat);
+  }
+
+ private:
+  static bool RunIteration(const Parameter& param,
+                           bool apply_last_deferred_ops, unsigned num_qubits,
+                           ncircuit_iterator<Gate> cbeg,
+                           ncircuit_iterator<Gate> cend,
+                           uint64_t rep, const StateSpace& state_space,
+                           const Simulator& simulator,
+                           std::vector<const Gate*>& gates,
+                           State& state, Stat& stat) {
+    if (param.collect_kop_stat || param.collect_mea_stat) {
+      stat.samples.reserve(std::size_t(cend - cbeg));
+      stat.samples.resize(0);
+    }
+
+    if (state_space.IsNull(state)) {
+      state = CreateState(num_qubits, state_space);
+      if (state_space.IsNull(state)) {
+        return false;
+      }
+
+      state_space.SetStateZero(state);
+    }
+
+    gates.resize(0);
+
+    RGen rgen(rep);
+    std::uniform_real_distribution<double> distr(0.0, 1.0);
+
+    bool unitary = true;
+    stat.primary = true;
+
+    for (auto it = cbeg; it != cend; ++it) {
+      const auto& channel = *it;
+
+      if (channel.size() == 0) continue;
+
+      if (channel[0].kind == gate::kMeasurement) {
+        // Measurement channel.
+
+        if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
+          return false;
+        }
+
+        bool normalize = !unitary && param.normalize_before_mea_gates;
+        NormalizeState(normalize, state_space, unitary, state);
+
+        auto mresult = ApplyMeasurementGate(state_space, channel[0].ops[0],
+                                            rgen, state);
+
+        if (!mresult.valid) {
+          return false;
+        }
+
+        CollectStat(param.collect_mea_stat, mresult.bits, stat);
+
+        stat.primary = false;
+
+        continue;
+      }
+
+      // "Normal" channel.
+
+      double r = distr(rgen);
+      double cp = 0;
+
+      // Perform sampling of Kraus operators using probability bounds.
+      for (std::size_t i = 0; i < channel.size(); ++i) {
+        const auto& kop = channel[i];
+
+        cp += kop.prob;
+
+        if (r < cp) {
+          DeferOps(kop.ops, gates);
+          CollectStat(param.collect_kop_stat, i, stat);
+
+          unitary = unitary && kop.unitary;
+
+          break;
+        }
+      }
+
+      if (r < cp) continue;
+
+      if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
+        return false;
+      }
+
+      NormalizeState(!unitary, state_space, unitary, state);
+
+      double max_prob = 0;
+      std::size_t max_prob_index = 0;
+
+      // Perform sampling of Kraus operators using norms of updated states.
+      for (std::size_t i = 0; i < channel.size(); ++i) {
+        const auto& kop = channel[i];
+
+        if (kop.unitary) continue;
+
+        double prob = std::real(
+            simulator.ExpectationValue(kop.qubits, kop.kd_k.data(), state));
+
+        if (prob > max_prob) {
+          max_prob = prob;
+          max_prob_index = i;
+        }
+
+        cp += prob - kop.prob;
+
+        if (r < cp || i == channel.size() - 1) {
+          // Sample ith Kraus operator if r < cp
+          // Sample the highest probability Kraus operator if r is greater
+          // than the sum of all probabilities due to round-off errors.
+          uint64_t k = r < cp ? i : max_prob_index;
+
+          DeferOps(channel[k].ops, gates);
+          CollectStat(param.collect_kop_stat, k, stat);
+
+          unitary = false;
+
+          break;
+        }
+      }
+    }
+
+    if (apply_last_deferred_ops || !stat.primary) {
+      if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
+        return false;
+      }
+
+      NormalizeState(!unitary, state_space, unitary, state);
+    }
+
+    return true;
+  }
+
+  static State CreateState(unsigned num_qubits, const StateSpace& state_space) {
+    auto fresh = state_space.Create(num_qubits);
+    if (!state_space.IsNull(fresh)) {
+      return fresh;
+    }
+    // Creation yielded a null state: report it and return an explicit null.
+    IO::errorf("not enough memory: is the number of qubits too large?\n");
+    return state_space.Null();
+  }
+
+  static bool ApplyDeferredOps(
+      const Parameter& param, unsigned num_qubits, const Simulator& simulator,
+      std::vector<const Gate*>& gates, State& state) {
+    if (gates.empty()) {
+      return true;
+    }
+
+    auto fused = Fuser::FuseGates(param, num_qubits, gates);
+    gates.clear();
+    // An empty fusion result for a non-empty gate list is treated as failure.
+    if (fused.size() == 0) {
+      return false;
+    }
+
+    for (const auto& fused_gate : fused) {
+      ApplyFusedGate(simulator, fused_gate, state);
+    }
+    return true;
+  }
+
+  static MeasurementResult ApplyMeasurementGate(
+      const StateSpace& state_space, const Gate& gate,
+      RGen& rgen, State& state) {
+    // An invalid result is logged here and still returned to the caller.
+    auto outcome = state_space.Measure(gate.qubits, rgen, state);
+
+    if (!outcome.valid) {
+      IO::errorf("measurement failed.\n");
+    }
+    return outcome;
+  }
+
+  static void DeferOps(
+      const std::vector<Gate>& ops, std::vector<const Gate*>& gates) {
+    // Append the address of every operator, in order; the gates themselves
+    // are not copied.
+    for (std::size_t k = 0; k < ops.size(); ++k) gates.push_back(&ops[k]);
+  }
+
+  static void CollectStat(bool collect_stat, uint64_t i, Stat& stat) {
+    // Record the sampled index only when statistics were requested.
+    if (collect_stat) {
+      stat.samples.push_back(i);
+    }
+
+    // A nonzero index always clears the primary-trajectory flag.
+    if (i != 0) stat.primary = false;
+  }
+
+  static void NormalizeState(bool normalize, const StateSpace& state_space,
+                             bool& flag, State& state) {
+    if (!normalize) return;
+    // Rescale the state by 1 / sqrt(Norm(state)), then set 'flag'.
+    double scale = 1.0 / std::sqrt(state_space.Norm(state));
+    state_space.Multiply(scale, state);
+    flag = true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // QTRAJECTORY_H_
diff --git a/tpls/qsim/run_qsim.h b/tpls/qsim/run_qsim.h
new file mode 100644
index 0000000..3752915
--- /dev/null
+++ b/tpls/qsim/run_qsim.h
@@ -0,0 +1,262 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef RUN_QSIM_H_
+#define RUN_QSIM_H_
+
+#include <random>
+#include <string>
+#include <vector>
+
+#include "gate.h"
+#include "gate_appl.h"
+#include "util.h"
+
+namespace qsim {
+
+/**
+ * Helper struct for running qsim.
+ */
+template <typename IO, typename Fuser, typename Factory,
+          typename RGen = std::mt19937>
+struct QSimRunner final {
+ public:
+  using Simulator = typename Factory::Simulator;
+  using StateSpace = typename Simulator::StateSpace;
+  using State = typename StateSpace::State;
+  using MeasurementResult = typename StateSpace::MeasurementResult;
+
+  /**
+   * User-specified parameters for gate fusion and simulation.
+   */
+  struct Parameter : public Fuser::Parameter {
+    /**
+     * Random number generator seed to apply measurement gates.
+     */
+    uint64_t seed;
+  };
+
+  /**
+   * Runs the given circuit, only measuring at the end.
+   * @param param Options for gate fusion, parallelism and logging.
+   * @param factory Object to create simulators and state spaces.
+   * @param circuit The circuit to be simulated.
+   * @param measure Function that performs measurements (in the sense of
+   *   computing expectation values, etc).
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit, typename MeasurementFunc>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const Circuit& circuit, MeasurementFunc measure) {
+    return Run(param, factory, {circuit.gates.back().time}, circuit, measure);
+  }
+
+  /**
+   * Runs the given circuit, measuring at user-specified times.
+   * @param param Options for gate fusion, parallelism and logging.
+   * @param factory Object to create simulators and state spaces.
+   * @param times_to_measure_at Time steps at which to perform measurements.
+   * @param circuit The circuit to be simulated.
+   * @param measure Function that performs measurements (in the sense of
+   *   computing expectation values, etc).
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit, typename MeasurementFunc>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const std::vector<unsigned>& times_to_measure_at,
+                  const Circuit& circuit, MeasurementFunc measure) {
+    double t0 = 0.0;
+    double t1 = 0.0;
+
+    if (param.verbosity > 1) {
+      t0 = GetTime();
+    }
+
+    RGen rgen(param.seed);
+
+    StateSpace state_space = factory.CreateStateSpace();
+
+    auto state = state_space.Create(circuit.num_qubits);
+    if (state_space.IsNull(state)) {
+      IO::errorf("not enough memory: is the number of qubits too large?\n");
+      return false;
+    }
+
+    state_space.SetStateZero(state);
+    Simulator simulator = factory.CreateSimulator();
+
+    if (param.verbosity > 1) {
+      t1 = GetTime();
+      IO::messagef("init time is %g seconds.\n", t1 - t0);
+      t0 = GetTime();
+    }
+
+    auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits,
+                                        circuit.gates, times_to_measure_at);
+
+    if (fused_gates.size() == 0 && circuit.gates.size() > 0) {
+      return false;
+    }
+
+    if (param.verbosity > 1) {
+      t1 = GetTime();
+      IO::messagef("fuse time is %g seconds.\n", t1 - t0);
+    }
+
+    if (param.verbosity > 0) {
+      t0 = GetTime();
+    }
+
+    unsigned cur_time_index = 0;
+
+    // Apply fused gates.
+    for (std::size_t i = 0; i < fused_gates.size(); ++i) {
+      if (param.verbosity > 3) {
+        t1 = GetTime();
+      }
+
+      if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen,
+                          state)) {
+        IO::errorf("measurement failed.\n");
+        return false;
+      }
+
+      if (param.verbosity > 3) {
+        state_space.DeviceSync();
+        double t2 = GetTime();
+        IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1);
+      }
+
+      unsigned t = times_to_measure_at[cur_time_index];
+
+      if (i == fused_gates.size() - 1 || t < fused_gates[i + 1].time) {
+        // Call back to perform measurements.
+        measure(cur_time_index, state_space, state);
+        ++cur_time_index;
+      }
+    }
+
+    if (param.verbosity > 0) {
+      state_space.DeviceSync();
+      double t2 = GetTime();
+      IO::messagef("time is %g seconds.\n", t2 - t0);
+    }
+
+    return true;
+  }
+
+  /**
+   * Runs the given circuit and make the final state available to the caller,
+   * recording the result of any intermediate measurements in the circuit.
+   * @param param Options for gate fusion, parallelism and logging.
+   * @param factory Object to create simulators and state spaces.
+   * @param circuit The circuit to be simulated.
+   * @param state As an input parameter, this should contain the initial state
+   *   of the system. After a successful run, it will be populated with the
+   *   final state of the system.
+   * @param measure_results As an input parameter, this should be empty.
+   *   After a successful run, this will contain all measurements results from
+   *   the run, ordered by time and qubit index.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const Circuit& circuit, State& state,
+                  std::vector<MeasurementResult>& measure_results) {
+    double t0 = 0.0;
+    double t1 = 0.0;
+
+    if (param.verbosity > 1) {
+      t0 = GetTime();
+    }
+
+    RGen rgen(param.seed);
+
+    StateSpace state_space = factory.CreateStateSpace();
+    Simulator simulator = factory.CreateSimulator();
+
+    if (param.verbosity > 1) {
+      t1 = GetTime();
+      IO::messagef("init time is %g seconds.\n", t1 - t0);
+      t0 = GetTime();
+    }
+
+    auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits,
+                                        circuit.gates);
+
+    if (fused_gates.size() == 0 && circuit.gates.size() > 0) {
+      return false;
+    }
+
+    measure_results.reserve(fused_gates.size());
+
+    if (param.verbosity > 1) {
+      t1 = GetTime();
+      IO::messagef("fuse time is %g seconds.\n", t1 - t0);
+    }
+
+    if (param.verbosity > 0) {
+      t0 = GetTime();
+    }
+
+    // Apply fused gates.
+    for (std::size_t i = 0; i < fused_gates.size(); ++i) {
+      if (param.verbosity > 3) {
+        t1 = GetTime();
+      }
+
+      if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, state,
+                          measure_results)) {
+        IO::errorf("measurement failed.\n");
+        return false;
+      }
+
+      if (param.verbosity > 3) {
+        state_space.DeviceSync();
+        double t2 = GetTime();
+        IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1);
+      }
+    }
+
+    if (param.verbosity > 0) {
+      state_space.DeviceSync();
+      double t2 = GetTime();
+      IO::messagef("simu time is %g seconds.\n", t2 - t0);
+    }
+
+    return true;
+  }
+
+  /**
+   * Runs the given circuit and make the final state available to the caller,
+   * discarding the result of any intermediate measurements in the circuit.
+   * @param param Options for gate fusion, parallelism and logging.
+   * @param factory Object to create simulators and state spaces.
+   * @param circuit The circuit to be simulated.
+   * @param state As an input parameter, this should contain the initial state
+   *   of the system. After a successful run, it will be populated with the
+   *   final state of the system.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Circuit>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const Circuit& circuit, State& state) {
+    std::vector<MeasurementResult> discarded_results;
+    return Run(param, factory, circuit, state, discarded_results);
+  }
+};
+
+}  // namespace qsim
+
+#endif  // RUN_QSIM_H_
diff --git a/tpls/qsim/run_qsimh.h b/tpls/qsim/run_qsimh.h
new file mode 100644
index 0000000..c1534d3
--- /dev/null
+++ b/tpls/qsim/run_qsimh.h
@@ -0,0 +1,120 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef RUN_QSIMH_H_
+#define RUN_QSIMH_H_
+
+#include <string>
+#include <vector>
+
+#include "hybrid.h"
+#include "util.h"
+
+namespace qsim {
+
+/**
+ * Helper struct for running qsimh.
+ */
+template <typename IO, typename HybridSimulator>
+struct QSimHRunner final {
+  using Gate = typename HybridSimulator::Gate;
+  using fp_type = typename HybridSimulator::fp_type;
+
+  using Parameter = typename HybridSimulator::Parameter;
+  using HybridData = typename HybridSimulator::HybridData;
+  using Fuser = typename HybridSimulator::Fuser;
+
+  /**
+   * Evaluates the amplitudes for a given circuit and set of output states.
+   * @param param Options for gate fusion, parallelism and logging. Also
+   *   specifies the size of the 'prefix' and 'root' sections of the lattice.
+   * @param factory Object to create simulators and state spaces.
+   * @param circuit The circuit to be simulated.
+   * @param parts Lattice sections to be simulated.
+   * @param bitstrings List of output states to simulate, as bitstrings.
+   * @param results Output vector of amplitudes. After a successful run, this
+   *   will be populated with amplitudes for each state in 'bitstrings'.
+   * @return True if the simulation completed successfully; false otherwise.
+   */
+  template <typename Factory, typename Circuit>
+  static bool Run(const Parameter& param, const Factory& factory,
+                  const Circuit& circuit, const std::vector<unsigned>& parts,
+                  const std::vector<uint64_t>& bitstrings,
+                  std::vector<std::complex<fp_type>>& results) {
+    if (circuit.num_qubits != parts.size()) {
+      IO::errorf("parts size is not equal to the number of qubits.\n");
+      return false;
+    }
+
+    double t0 = 0.0;
+
+    if (param.verbosity > 0) {
+      t0 = GetTime();
+    }
+
+    HybridData hd;
+    bool rc = HybridSimulator::SplitLattice(parts, circuit.gates, hd);
+
+    if (!rc) {
+      return false;
+    }
+
+    if (hd.num_gatexs < param.num_prefix_gatexs + param.num_root_gatexs) {
+      IO::errorf("error: num_prefix_gates (%u) plus num_root gates (%u) is "
+                 "greater than num_gates_on_the_cut (%u).\n",
+                 param.num_prefix_gatexs, param.num_root_gatexs,
+                 hd.num_gatexs);
+      return false;
+    }
+
+    if (param.verbosity > 0) {
+      PrintInfo(param, hd);
+    }
+
+    auto fgates0 = Fuser::FuseGates(param, hd.num_qubits0, hd.gates0);
+    if (fgates0.size() == 0 && hd.gates0.size() > 0) {
+      return false;
+    }
+
+    auto fgates1 = Fuser::FuseGates(param, hd.num_qubits1, hd.gates1);
+    if (fgates1.size() == 0 && hd.gates1.size() > 0) {
+      return false;
+    }
+
+    rc = HybridSimulator(param.num_threads).Run(
+        param, factory, hd, parts, fgates0, fgates1, bitstrings, results);
+
+    if (rc && param.verbosity > 0) {
+      double t1 = GetTime();
+      IO::messagef("time elapsed %g seconds.\n", t1 - t0);
+    }
+
+    return rc;
+  }
+
+ private:
+  static void PrintInfo(const Parameter& param, const HybridData& hd) {
+    const auto num_prefix = param.num_prefix_gatexs;
+    const auto num_root = param.num_root_gatexs;
+    unsigned num_suffix = hd.num_gatexs - num_prefix - num_root;
+
+    IO::messagef("part 0: %u, part 1: %u\n", hd.num_qubits0, hd.num_qubits1);
+    IO::messagef("%u gates on the cut\n", hd.num_gatexs);
+    IO::messagef("breakup: %up+%ur+%us\n", num_prefix, num_root, num_suffix);
+  }
+};
+
+}  // namespace qsim
+
+#endif  // RUN_QSIMH_H_
diff --git a/tpls/qsim/seqfor.h b/tpls/qsim/seqfor.h
new file mode 100644
index 0000000..3ebf07c
--- /dev/null
+++ b/tpls/qsim/seqfor.h
@@ -0,0 +1,68 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SEQFOR_H_
+#define SEQFOR_H_
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+namespace qsim {
+
+/**
+ * Helper struct for executing for loops in series.
+ */
+struct SequentialFor {
+  // The thread count argument is ignored: everything runs on the caller.
+  explicit SequentialFor(unsigned num_threads) {}
+
+  // There is no state here, so every method below is static.
+
+  // First index of the range handled by 'thread_id': always 0.
+  static uint64_t GetIndex0(uint64_t size, unsigned thread_id) { return 0; }
+
+  // One past the last index of that range: always 'size'.
+  static uint64_t GetIndex1(uint64_t size, unsigned thread_id) { return size; }
+
+  // Calls func(1, 0, k, args...) for k = 0, ..., size - 1, in order.
+  template <typename Function, typename... Args>
+  static void Run(uint64_t size, Function&& func, Args&&... args) {
+    uint64_t k = 0;
+    while (k < size) {
+      func(1, 0, k, args...);
+      ++k;
+    }
+  }
+
+  // Folds func(1, 0, k, args...) with 'op', starting from 0; returns a
+  // one-element vector holding the folded value.
+  template <typename Function, typename Op, typename... Args>
+  static std::vector<typename Op::result_type> RunReduceP(
+      uint64_t size, Function&& func, Op&& op, Args&&... args) {
+    typename Op::result_type acc = 0;
+    for (uint64_t k = 0; k < size; ++k) acc = op(acc, func(1, 0, k, args...));
+    return std::vector<typename Op::result_type>(1, acc);
+  }
+
+  template <typename Function, typename Op, typename... Args>
+  static typename Op::result_type RunReduce(uint64_t size, Function&& func,
+                                            Op&& op, Args&&... args) {
+    return RunReduceP(size, func, std::move(op), args...).front();
+  }
+};
+
+}  // namespace qsim
+
+#endif  // SEQFOR_H_
diff --git a/tpls/qsim/simmux.h b/tpls/qsim/simmux.h
new file mode 100644
index 0000000..d3c4074
--- /dev/null
+++ b/tpls/qsim/simmux.h
@@ -0,0 +1,44 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMMUX_H_
+#define SIMMUX_H_
+
+#if defined(__AVX512F__)
+# include "simulator_avx512.h"
+  namespace qsim {
+    template <typename For>
+    using Simulator = SimulatorAVX512<For>;
+  }
+#elif defined(__AVX2__)
+# include "simulator_avx.h"
+  namespace qsim {
+    template <typename For>
+    using Simulator = SimulatorAVX<For>;
+  }
+#elif defined(__SSE4_1__)
+# include "simulator_sse.h"
+  namespace qsim {
+    template <typename For>
+    using Simulator = SimulatorSSE<For>;
+  }
+#else  // None of the instruction-set macros above is defined.
+# include "simulator_basic.h"
+  namespace qsim {
+    template <typename For>
+    using Simulator = SimulatorBasic<For>;
+  }
+#endif
+
+#endif  // SIMMUX_H_
diff --git a/tpls/qsim/simmux_gpu.h b/tpls/qsim/simmux_gpu.h
new file mode 100644
index 0000000..1f0bb59
--- /dev/null
+++ b/tpls/qsim/simmux_gpu.h
@@ -0,0 +1,30 @@
+// Copyright 2023 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMMUX_GPU_H_
+#define SIMMUX_GPU_H_
+
+#if defined(__CUSTATEVEC__)  // cuStateVec-based GPU simulator requested.
+# include "simulator_custatevec.h"
+  namespace qsim {
+    using SimulatorGpu = SimulatorCuStateVec<>;
+  }
+#else  // Otherwise use the CUDA simulator.
+# include "simulator_cuda.h"
+  namespace qsim {
+    using SimulatorGpu = SimulatorCUDA<>;
+  }
+#endif
+
+#endif  // SIMMUX_GPU_H_
diff --git a/tpls/qsim/simulator.h b/tpls/qsim/simulator.h
new file mode 100644
index 0000000..eff5441
--- /dev/null
+++ b/tpls/qsim/simulator.h
@@ -0,0 +1,516 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_H_
+#define SIMULATOR_H_
+
+#include <cstdint>
+
+#include "bits.h"
+
+namespace qsim {
+
+/**
+ * Base class for simulator classes.
+ */
+class SimulatorBase {
+ protected:
+  // The following template parameters are used for functions below.
+  // H - the number of high (target) qubits.
+  // L - the number of low (target) qubits.
+  // R - SIMD register width in floats.
+
+  // Builds two lookup tables for a gate acting on H high qubits qs[L..L+H-1]:
+  // ms receives H + 1 bit masks, one per run of bits below, between and above
+  // the target qubits, and xss receives 2^H offsets, each the sum of the
+  // strides 2^(q + 1) of the targets selected by the bits of its index. Used in
+  // simulator_basic.h, simulator_sse.h and simulator_avx.h (no bmi2 version).
+  template <unsigned H, unsigned L = 0>
+  static void FillIndices(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          uint64_t* ms, uint64_t* xss) {
+    constexpr unsigned num_offsets = 1 << H;
+
+    if (H == 0) {
+      // No high qubits: a single all-ones mask and a single zero offset.
+      ms[0] = ~uint64_t{0};
+      xss[0] = 0;
+      return;
+    }
+
+    uint64_t stride[H + 1];
+
+    for (unsigned t = 0; t < H; ++t) {
+      uint64_t below = t == 0 ? 0 : stride[t - 1] - 1;
+      stride[t] = uint64_t{1} << (qs[L + t] + 1);
+      ms[t] = ((uint64_t{1} << qs[L + t]) - 1) ^ below;
+    }
+    ms[H] = ((uint64_t{1} << num_qubits) - 1) ^ (stride[H - 1] - 1);
+
+    for (unsigned i = 0; i < num_offsets; ++i) {
+      xss[i] = 0;
+      for (unsigned t = 0; t < H; ++t) {
+        if ((i >> t) & 1) xss[i] += stride[t];
+      }
+    }
+  }
+
+  // Fills w with lane-by-lane gate matrix entries for gates with low qubits.
+  template <unsigned H, unsigned L, unsigned R, typename fp_type>
+  static void FillMatrix(unsigned qmaskl, const fp_type* matrix, fp_type* w) {
+    constexpr unsigned gsize = 1 << (H + L);  // gate matrix dimension
+    constexpr unsigned hsize = 1 << H;  // index values of the high qubits
+    constexpr unsigned lsize = 1 << L;  // index values of the low qubits
+    constexpr unsigned rsize = 1 << R;  // lanes per group written to w
+
+    unsigned s = 0;  // write cursor into w
+
+    for (unsigned i = 0; i < hsize; ++i) {
+      for (unsigned j = 0; j < gsize; ++j) {
+        unsigned p0 = 2 * i * lsize * gsize + 2 * lsize * (j / lsize);
+
+        for (unsigned k = 0; k < rsize; ++k) {
+          unsigned l = bits::CompressBits(k, R, qmaskl);
+          unsigned p = p0 + 2 * (gsize * l + (j + l) % lsize);
+
+          w[s + 0] = matrix[p];  // first group of rsize lanes: matrix[p]
+          w[s + rsize] = matrix[p + 1];  // next group of rsize lanes
+
+          ++s;
+        }
+
+        s += rsize;  // step over the group filled through w[s + rsize]
+      }
+    }
+  }
+
+  // Expands the gate matrix into w for controlled gates with high target
+  // qubits and low control qubits: lanes whose control bits equal cvalsl get
+  // the matrix entry, all other lanes get the identity entry.
+  template <unsigned H, unsigned R, typename fp_type>
+  static void FillControlledMatrixH(uint64_t cvalsl, uint64_t cmaskl,
+                                    const fp_type* matrix, fp_type* w) {
+    constexpr unsigned dim = 1 << H;
+    constexpr unsigned lanes = 1 << R;
+
+    // Per matrix entry: `lanes` real parts, then `lanes` imaginary parts.
+    fp_type* out = w;
+
+    for (unsigned row = 0; row < dim; ++row) {
+      for (unsigned col = 0; col < dim; ++col) {
+        const fp_type* entry = matrix + 2 * (dim * row + col);
+        const fp_type diag = row == col ? 1 : 0;
+
+        for (unsigned lane = 0; lane < lanes; ++lane) {
+          const bool active = (lane & cmaskl) == cvalsl;
+          out[lane] = active ? entry[0] : diag;
+          out[lane + lanes] = active ? entry[1] : fp_type(0);
+        }
+        out += 2 * lanes;
+      }
+    }
+  }
+
+  // Fills gate matrix entries for controlled gates with low target qubits
+  // and low control qubits.
+  template <unsigned H, unsigned L, unsigned R, typename fp_type>
+  static void FillControlledMatrixL(uint64_t cvalsl, uint64_t cmaskl,
+                                    unsigned qmaskl, const fp_type* matrix,
+                                    fp_type* w) {
+    constexpr unsigned gsize = 1 << (H + L);  // gate matrix dimension
+    constexpr unsigned hsize = 1 << H;  // index values of the high qubits
+    constexpr unsigned lsize = 1 << L;  // index values of the low qubits
+    constexpr unsigned rsize = 1 << R;  // lanes per group written to w
+
+    unsigned s = 0;  // write cursor into w
+
+    for (unsigned i = 0; i < hsize; ++i) {
+      for (unsigned j = 0; j < gsize; ++j) {
+        unsigned p0 = i * lsize * gsize + lsize * (j / lsize);
+
+        for (unsigned k = 0; k < rsize; ++k) {
+          unsigned l = bits::CompressBits(k, R, qmaskl);
+          unsigned p = p0 + gsize * l + (j + l) % lsize;  // complex entry index
+
+          fp_type v = p / gsize == p % gsize ? 1 : 0;  // identity matrix entry
+          // Lane k takes the gate entry only if its control bits equal cvalsl.
+          w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v;
+          w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0;
+
+          ++s;
+        }
+
+        s += rsize;  // step over the group filled through w[s + rsize]
+      }
+    }
+  }
+
+/*
+  The GetMasks* functions below provide various masks and related values.
+  GetMasks1, GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6 are
+  used in simulator_avx.h (BMI2 version) and in simulator_avx512.h. GetMasks7,
+  GetMasks8, GetMasks9, GetMasks10 and GetMasks11 are used in simulator_avx.h
+  (no BMI2 version) and in simulator_sse.h.
+
+  imaskh - inverted mask of high qubits (high control and target qubits).
+  qmaskh - mask of high qubits (high target qubits).
+  cvalsh - control bit values of high control qubits placed in correct
+           positions.
+  cvalsl - control bit values of low control qubits placed in correct positions.
+  cmaskh - mask of high control qubits.
+  cmaskl - mask of low control qubits.
+  qmaskl - mask of low qubits (low target qubits).
+  cl - the number of low control qubits.
+
+  Note that imaskh, qmaskh and cvalsh are multiplied by two in GetMasks1,
+  GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6.
+*/
+
+  // Masks for a gate without controls whose H target qubits are all high.
+  struct Masks1 {
+    uint64_t imaskh;  // inverted mask of high qubits, times two
+    uint64_t qmaskh;  // mask of high target qubits, times two
+  };
+
+  template <unsigned H, unsigned R>
+  static Masks1 GetMasks1(const std::vector<unsigned>& qs) {
+    uint64_t targets = 0;
+    for (unsigned t = 0; t != H; ++t) targets |= uint64_t{1} << qs[t];
+    // ~targets with the R lowest bits toggled (cleared if no target is < R).
+    const uint64_t others = ~targets ^ ((1 << R) - 1);
+
+    return {2 * others, 2 * targets};
+  }
+
+  // Masks for a gate without controls with L low and H high target qubits.
+  struct Masks2 {
+    uint64_t imaskh;  // inverted mask of high qubits, times two
+    uint64_t qmaskh;  // mask of high target qubits, times two
+    unsigned qmaskl;  // mask of low target qubits
+  };
+
+  template <unsigned H, unsigned L, unsigned R>
+  static Masks2 GetMasks2(const std::vector<unsigned>& qs) {
+    // qs[0..L-1] are the low targets, qs[L..L+H-1] the high ones.
+    unsigned low = 0;
+    for (unsigned t = 0; t != L; ++t) low |= 1 << qs[t];
+
+    uint64_t high = 0;
+    for (unsigned t = L; t != H + L; ++t) high |= uint64_t{1} << qs[t];
+
+    // ~high with the R lowest bits toggled (cleared if no high bit is < R).
+    const uint64_t others = ~high ^ ((1 << R) - 1);
+
+    return {2 * others, 2 * high, low};
+  }
+
+  // Masks for a controlled gate with H high targets; controls are not split.
+  struct Masks3 {
+    uint64_t imaskh;  // inverted mask of high target and control qubits, x2
+    uint64_t qmaskh;  // mask of high target qubits, x2
+    uint64_t cvalsh;  // control values at the control bit positions, x2
+  };
+
+  template <unsigned H, unsigned R>
+  static Masks3 GetMasks3(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    // Collect the target and control bit masks.
+    uint64_t targets = 0;
+    for (unsigned t = 0; t != H; ++t) targets |= uint64_t{1} << qs[t];
+
+    uint64_t controls = 0;
+    for (unsigned c : cqs) controls |= uint64_t{1} << c;
+
+    // Place the packed control values at the control bit positions.
+    const uint64_t placed = bits::ExpandBits(cvals, num_qubits, controls);
+
+    // Bits that are neither target nor control, with the R lowest bits
+    // toggled (i.e. cleared when every target and control is >= R).
+    const uint64_t others = ~(targets | controls) ^ ((1 << R) - 1);
+
+    return {2 * others, 2 * targets, 2 * placed};
+  }
+
+  // Masks for a controlled gate with H high targets and low or high controls.
+  struct Masks4 {
+    uint64_t imaskh;  // inverted mask of high target and control qubits, x2
+    uint64_t qmaskh;  // mask of high target qubits, x2
+    uint64_t cvalsh;  // values of high controls at their bit positions, x2
+    uint64_t cvalsl;  // values of low controls at their bit positions
+    uint64_t cmaskl;  // mask of low control qubits
+    unsigned cl;      // number of low control qubits
+  };
+
+  template <unsigned H, unsigned R>
+  static Masks4 GetMasks4(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    uint64_t targets = 0;
+    for (unsigned t = 0; t != H; ++t) targets |= uint64_t{1} << qs[t];
+
+    // Split the controls into low (< R) and high (>= R) masks.
+    Masks4 out = {};
+    uint64_t ctrl_high = 0;
+    for (unsigned c : cqs) {
+      if (c < R) {
+        out.cmaskl |= uint64_t{1} << c;
+        ++out.cl;
+      } else {
+        ctrl_high |= uint64_t{1} << c;
+      }
+    }
+
+    // The lowest cl bits of cvals belong to the low controls.
+    const uint64_t low_vals = cvals & ((1 << out.cl) - 1);
+    out.cvalsh = bits::ExpandBits(cvals >> out.cl, num_qubits, ctrl_high);
+    out.cvalsh *= 2;
+    out.cvalsl = bits::ExpandBits(low_vals, R, out.cmaskl);
+    out.imaskh = 2 * (~(targets | ctrl_high) ^ ((1 << R) - 1));
+    out.qmaskh = 2 * targets;
+    return out;
+  }
+
+  // Masks for a controlled gate with L low and H high target qubits; the
+  // controls are not split into low and high.
+  struct Masks5 {
+    uint64_t imaskh;  // inverted mask of high target and control qubits, x2
+    uint64_t qmaskh;  // mask of high target qubits, x2
+    uint64_t cvalsh;  // control values at the control bit positions, x2
+    unsigned qmaskl;  // mask of low target qubits
+  };
+
+  template <unsigned H, unsigned L, unsigned R>
+  static Masks5 GetMasks5(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    // qs[0..L-1] are the low targets, qs[L..L+H-1] the high ones.
+    unsigned low = 0;
+    for (unsigned t = 0; t != L; ++t) low |= 1 << qs[t];
+
+    uint64_t high = 0;
+    for (unsigned t = L; t != H + L; ++t) high |= uint64_t{1} << qs[t];
+
+    // Every control goes into one mask; there is no low/high split here.
+    uint64_t controls = 0;
+    for (unsigned c : cqs) controls |= uint64_t{1} << c;
+
+    // Place the packed control values at the control bit positions.
+    const uint64_t placed = bits::ExpandBits(cvals, num_qubits, controls);
+
+    // Bits that are neither high target nor control, with the R lowest bits
+    // toggled (i.e. cleared when all of those qubits are >= R).
+    const uint64_t others = ~(high | controls) ^ ((1 << R) - 1);
+
+    return {2 * others, 2 * high, 2 * placed, low};
+  }
+
+  // Masks for a controlled gate with L low and H high target qubits and
+  // controls that may be low (index < R) as well as high.
+  struct Masks6 {
+    uint64_t imaskh;  // inverted mask of high target and control qubits, x2
+    uint64_t qmaskh;  // mask of high target qubits, x2
+    uint64_t cvalsh;  // values of high controls at their bit positions, x2
+    uint64_t cvalsl;  // values of low controls at their bit positions
+    uint64_t cmaskl;  // mask of low control qubits
+    unsigned qmaskl;  // mask of low target qubits
+    unsigned cl;      // number of low control qubits
+  };
+
+  template <unsigned H, unsigned L, unsigned R>
+  static Masks6 GetMasks6(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    Masks6 out = {};
+
+    // qs[0..L-1] are the low targets, qs[L..L+H-1] the high ones.
+    for (unsigned t = 0; t != L; ++t) out.qmaskl |= 1 << qs[t];
+
+    uint64_t high = 0;
+    for (unsigned t = L; t != H + L; ++t) high |= uint64_t{1} << qs[t];
+
+    // Split the controls into low (< R) and high (>= R) masks.
+    uint64_t ctrl_high = 0;
+    for (unsigned c : cqs) {
+      if (c < R) {
+        out.cmaskl |= uint64_t{1} << c;
+        ++out.cl;
+      } else {
+        ctrl_high |= uint64_t{1} << c;
+      }
+    }
+
+    // The lowest cl bits of cvals belong to the low controls.
+    const uint64_t low_vals = cvals & ((1 << out.cl) - 1);
+    out.cvalsh = bits::ExpandBits(cvals >> out.cl, num_qubits, ctrl_high);
+    out.cvalsh *= 2;
+    out.cvalsl = bits::ExpandBits(low_vals, R, out.cmaskl);
+    out.imaskh = 2 * (~(high | ctrl_high) ^ ((1 << R) - 1));
+    out.qmaskh = 2 * high;
+    return out;
+  }
+
+  // Control masks without a low/high split of the control qubits.
+  struct Masks7 {
+    uint64_t cvalsh;  // control values at the control bit positions
+    uint64_t cmaskh;  // mask of control qubits
+  };
+
+  // Note: the qs argument is not read by this function.
+  static Masks7 GetMasks7(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    uint64_t controls = 0;
+    for (unsigned c : cqs) controls |= uint64_t{1} << c;
+
+    // Place the packed control values at the control bit positions.
+    const uint64_t placed = bits::ExpandBits(cvals, num_qubits, controls);
+
+    return {placed, controls};
+  }
+
+  // Control masks split into low (index < R) and high control qubits.
+  struct Masks8 {
+    uint64_t cvalsh;  // values of high controls at their bit positions
+    uint64_t cmaskh;  // mask of high control qubits
+    uint64_t cvalsl;  // values of low controls at their bit positions
+    uint64_t cmaskl;  // mask of low control qubits
+  };
+
+  template <unsigned R>
+  static Masks8 GetMasks8(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    Masks8 out = {};
+    unsigned num_low = 0;
+    for (unsigned c : cqs) {
+      if (c < R) {
+        out.cmaskl |= uint64_t{1} << c;
+        ++num_low;
+      } else {
+        out.cmaskh |= uint64_t{1} << c;
+      }
+    }
+
+    // The lowest num_low bits of cvals belong to the low controls.
+    const uint64_t low_vals = cvals & ((1 << num_low) - 1);
+    out.cvalsh = bits::ExpandBits(cvals >> num_low, num_qubits, out.cmaskh);
+    out.cvalsl = bits::ExpandBits(low_vals, R, out.cmaskl);
+    return out;
+  }
+
+  // Control masks (no low/high split of the controls) together with the mask
+  // of the L low target qubits.
+  struct Masks9 {
+    uint64_t cvalsh;  // control values at the control bit positions
+    uint64_t cmaskh;  // mask of control qubits
+    unsigned qmaskl;  // mask of low target qubits
+  };
+
+  template <unsigned L>
+  static Masks9 GetMasks9(unsigned num_qubits, const std::vector<unsigned>& qs,
+                          const std::vector<unsigned>& cqs, uint64_t cvals) {
+    // The first L entries of qs are the low target qubits.
+    unsigned low = 0;
+    for (unsigned t = 0; t != L; ++t) low |= 1 << qs[t];
+
+    // Every control goes into one mask; there is no low/high split here.
+    uint64_t controls = 0;
+    for (unsigned c : cqs) controls |= uint64_t{1} << c;
+
+    // Place the packed control values at the control bit positions.
+    const uint64_t placed = bits::ExpandBits(cvals, num_qubits, controls);
+
+    return {placed, controls, low};
+  }
+
+  // Low/high control masks together with the mask of L low target qubits.
+  struct Masks10 {
+    uint64_t cvalsh;  // values of high controls at their bit positions
+    uint64_t cmaskh;  // mask of high control qubits
+    uint64_t cvalsl;  // values of low controls at their bit positions
+    uint64_t cmaskl;  // mask of low control qubits
+    unsigned qmaskl;  // mask of low target qubits
+  };
+
+  template <unsigned L, unsigned R>
+  static Masks10 GetMasks10(unsigned num_qubits,
+                            const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals) {
+    Masks10 out = {};
+
+    // The first L entries of qs are the low target qubits.
+    for (unsigned t = 0; t != L; ++t) out.qmaskl |= 1 << qs[t];
+
+    // Split the controls into low (< R) and high (>= R) masks.
+    unsigned num_low = 0;
+    for (unsigned c : cqs) {
+      if (c < R) {
+        out.cmaskl |= uint64_t{1} << c;
+        ++num_low;
+      } else {
+        out.cmaskh |= uint64_t{1} << c;
+      }
+    }
+
+    // The lowest num_low bits of cvals belong to the low controls.
+    const uint64_t low_vals = cvals & ((1 << num_low) - 1);
+    out.cvalsh = bits::ExpandBits(cvals >> num_low, num_qubits, out.cmaskh);
+    out.cvalsl = bits::ExpandBits(low_vals, R, out.cmaskl);
+    return out;
+  }
+
+  // Mask of the L low target qubits.
+  struct Masks11 {
+    unsigned qmaskl;  // one bit set per low target qubit
+  };
+
+  template <unsigned L>
+  static Masks11 GetMasks11(const std::vector<unsigned>& qs) {
+    Masks11 out = {0};
+
+    // The first L entries of qs are the low target qubits.
+    for (unsigned t = 0; t != L; ++t) out.qmaskl |= 1 << qs[t];
+
+    return out;
+  }
+
+  template <unsigned R>  // R is passed as the bit-count argument below.
+  static unsigned MaskedAdd(
+      unsigned a, unsigned b, unsigned mask, unsigned lsize) {
+    unsigned c = bits::CompressBits(a, R, mask);  // presumably a's mask bits
+    return bits::ExpandBits((c + b) % lsize, R, mask);  // add b modulo lsize
+  }
+};
+
+template <>  // No high qubits, one low qubit: all-ones mask, zero offset.
+inline void SimulatorBase::FillIndices<0, 1>(
+    unsigned num_qubits, const std::vector<unsigned>& qs, uint64_t* ms,
+    uint64_t* xss) {
+  ms[0] = ~uint64_t{0};
+  xss[0] = 0;
+}
+
+template <>  // No high qubits, two low qubits: all-ones mask, zero offset.
+inline void SimulatorBase::FillIndices<0, 2>(
+    unsigned num_qubits, const std::vector<unsigned>& qs, uint64_t* ms,
+    uint64_t* xss) {
+  ms[0] = ~uint64_t{0};
+  xss[0] = 0;
+}
+
+template <>  // No high qubits, three low qubits: all-ones mask, zero offset.
+inline void SimulatorBase::FillIndices<0, 3>(
+    unsigned num_qubits, const std::vector<unsigned>& qs, uint64_t* ms,
+    uint64_t* xss) {
+  ms[0] = ~uint64_t{0};
+  xss[0] = 0;
+}
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_H_
diff --git a/tpls/qsim/simulator_avx.h b/tpls/qsim/simulator_avx.h
new file mode 100644
index 0000000..9742849
--- /dev/null
+++ b/tpls/qsim/simulator_avx.h
@@ -0,0 +1,1363 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_AVX_H_
+#define SIMULATOR_AVX_H_
+
+#include <immintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "statespace_avx.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator with AVX vectorization.
+ */
+template <typename For>
+class SimulatorAVX final : public SimulatorBase {
+ public:
+  using StateSpace = StateSpaceAVX<For>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  template <typename... ForArgs>  // args initialize for_ (passed as lvalues).
+  explicit SimulatorAVX(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate using AVX instructions (gates on up to 6 qubits).
+   * @param qs Sorted indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Qubits with index <= 2 count as low (L); all others as high (H).
+    switch (qs.size()) {
+    case 0:
+      ApplyGateH<0>(qs, matrix, state);
+      break;
+    case 1:
+      if (qs[0] > 2) {
+        ApplyGateH<1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 2) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 2) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 2) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<1, 3>(qs, matrix, state);  // at most 3 qubits can be low
+      }
+      break;
+    case 5:
+      if (qs[0] > 2) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<2, 3>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 2) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<3, 3>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: gates on more than 6 qubits leave state untouched.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate using AVX instructions (up to 4 target qubits).
+   * @param qs Sorted indices of the qubits affected by this gate.
+   * @param cqs Sorted indices of control qubits; if empty, ApplyGate is used.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
+
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+    // ApplyControlledGateL<H, L, 1> is picked iff cqs[0] > 2 (no low controls).
+    switch (qs.size()) {
+    case 0:
+      if (cqs[0] > 2) {
+        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
+      } else {
+        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
+      }
+      break;
+    case 1:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: more than 4 target qubits leave state untouched.
+      break;
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using AVX instructions.
+   * Operators acting on 1 to 6 qubits are handled; for any other number of
+   * qubits nothing is computed and 0 is returned.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... , so the low qubits (index <= 2)
+    // come first; each case peels them off with guard clauses.
+
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 2) {
+        return ExpectationValueH<1>(qs, matrix, state);
+      }
+      return ExpectationValueL<0, 1>(qs, matrix, state);
+    case 2:
+      if (qs[0] > 2) {
+        return ExpectationValueH<2>(qs, matrix, state);
+      }
+      if (qs[1] > 2) {
+        return ExpectationValueL<1, 1>(qs, matrix, state);
+      }
+      return ExpectationValueL<0, 2>(qs, matrix, state);
+    case 3:
+      if (qs[0] > 2) {
+        return ExpectationValueH<3>(qs, matrix, state);
+      }
+      if (qs[1] > 2) {
+        return ExpectationValueL<2, 1>(qs, matrix, state);
+      }
+      if (qs[2] > 2) {
+        return ExpectationValueL<1, 2>(qs, matrix, state);
+      }
+      return ExpectationValueL<0, 3>(qs, matrix, state);
+    case 4:
+      if (qs[0] > 2) {
+        return ExpectationValueH<4>(qs, matrix, state);
+      }
+      if (qs[1] > 2) {
+        return ExpectationValueL<3, 1>(qs, matrix, state);
+      }
+      if (qs[2] > 2) {
+        return ExpectationValueL<2, 2>(qs, matrix, state);
+      }
+      return ExpectationValueL<1, 3>(qs, matrix, state);
+    case 5:
+      if (qs[0] > 2) {
+        return ExpectationValueH<5>(qs, matrix, state);
+      }
+      if (qs[1] > 2) {
+        return ExpectationValueL<4, 1>(qs, matrix, state);
+      }
+      if (qs[2] > 2) {
+        return ExpectationValueL<3, 2>(qs, matrix, state);
+      }
+      return ExpectationValueL<2, 3>(qs, matrix, state);
+    case 6:
+      if (qs[0] > 2) {
+        return ExpectationValueH<6>(qs, matrix, state);
+      }
+      if (qs[1] > 2) {
+        return ExpectationValueL<5, 1>(qs, matrix, state);
+      }
+      if (qs[2] > 2) {
+        return ExpectationValueL<4, 2>(qs, matrix, state);
+      }
+      return ExpectationValueL<3, 3>(qs, matrix, state);
+    default:
+      // Not implemented.
+      break;
+    }
+
+    return 0;
+  }
+
+  /**
+   * @return The SIMD register size; 8 here (presumably floats per __m256).
+   */
+  static unsigned SIMDRegisterSize() {
+    return 8;
+  }
+
+ private:
+#ifdef __BMI2__
+
+  template <unsigned H>  // BMI2 kernel for a gate on H high qubits only.
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // gate matrix dimension
+
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];  // real / imaginary parts of the inputs
+
+      auto p0 = rstate + _pdep_u64(i, imaskh);  // deposit i into imaskh bits
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);  // deposit k into target bits
+
+        rs[k] = _mm256_load_ps(p0 + p);
+        is[k] = _mm256_load_ps(p0 + p + 8);  // imaginary parts: 8 floats on
+      }
+
+      uint64_t j = 0;  // index into the interleaved (re, im) matrix v
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);  // re = rs * ru - is * iu
+        in = _mm256_mul_ps(rs[0], iu);  // im = rs * iu + is * ru
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);  // accumulate row k, column l
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);  // write row k back in place
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    auto m = GetMasks1<H, 3>(qs);
+
+    unsigned k = 3 + H;  // qubits covered per call of f
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of calls of f
+
+    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // BMI2 kernel: H high, L low qubits.
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // gate matrix dimension
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];  // real / imaginary parts of the inputs
+
+      auto p0 = rstate + _pdep_u64(i, imaskh);  // deposit i into imaskh bits
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);  // deposit k into target bits
+
+        rs[k2] = _mm256_load_ps(p0 + p);
+        is[k2] = _mm256_load_ps(p0 + p + 8);  // imaginary parts: 8 floats on
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+
+      uint64_t j = 0;  // index into w: (real, imaginary) register pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);  // re = rs * wr - is * wi
+        in = _mm256_mul_ps(rs[0], w[j + 1]);  // im = rs * wi + is * wr
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);  // write the result back in place
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    __m256i idx[1 << L];  // lane permutations applied to the loaded registers
+    __m256 w[1 << (1 + 2 * H + L)];  // per-lane gate matrix, see FillMatrix
+
+    auto m = GetMasks2<H, L, 3>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned r = 3 + H;  // qubits covered per call of f
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;  // number of calls of f
+
+    for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get());
+  }
+
+  template <unsigned H>  // BMI2 kernel: H high targets, called for cqs[0] > 2.
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // gate matrix dimension
+
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];  // real / imaginary parts of the inputs
+
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);  // fix control bits
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);  // deposit k into target bits
+
+        rs[k] = _mm256_load_ps(p0 + p);
+        is[k] = _mm256_load_ps(p0 + p + 8);  // imaginary parts: 8 floats on
+      }
+
+      uint64_t j = 0;  // index into the interleaved (re, im) matrix v
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);  // re = rs * ru - is * iu
+        in = _mm256_mul_ps(rs[0], iu);  // im = rs * iu + is * ru
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);  // write row k back in place
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    auto m = GetMasks3<H, 3>(state.num_qubits(), qs, cqs, cvals);
+
+    unsigned k = 3 + H + cqs.size();  // control bits are fixed, not iterated
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // number of calls of f
+
+    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get());
+  }
+
+  template <unsigned H>  // BMI2 (_pdep_u64) path; matrix pre-expanded into w.
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // rn/in: accumulated real/imaginary output; rs/is: loaded input.
+      __m256 rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Base address: bits of i deposited under imaskh, OR-ed with cvalsh.
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
+      // Load 8 real parts (rs) and the 8 imaginary parts after them (is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);
+        is[k] = _mm256_load_ps(p0 + p + 8);
+      }
+      // w[j] / w[j + 1]: per-lane real / imaginary coefficient vectors.
+      uint64_t j = 0;
+      // Row k of the output: complex sum over l of w-entry * (rs[l], is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // Safe to overwrite in place: every input is already held in rs/is.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+    // Two vectors (re, im) per matrix entry: 2 * 2^H * 2^H = 2^(1 + 2H).
+    __m256 w[1 << (1 + 2 * H)];
+    // w is filled from matrix together with the cvalsl/cmaskl mask fields.
+    auto m = GetMasks4<H, 3>(state.num_qubits(), qs, cqs, cvals);
+    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+    // m.cl is subtracted from the control count (presumably the low controls).
+    unsigned r = 3 + H + cqs.size() - m.cl;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // BMI2 (_pdep_u64) path.
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                const __m256i* idx, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // rn/in: accumulated output; rs/is: inputs plus lane-permuted copies.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // Base address: bits of i deposited under imaskh, OR-ed with cvalsh.
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
+      // Per k: load one register pair, then derive lsize - 1 permuted copies.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm256_load_ps(p0 + p);
+        is[k2] = _mm256_load_ps(p0 + p + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // w[j] / w[j + 1]: per-lane real / imaginary coefficient vectors.
+      uint64_t j = 0;
+      // hsize output rows, each a complex sum over all gsize inputs.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // Safe to overwrite in place: every input is already held in rs/is.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+    // f reads idx[0 .. lsize - 2]; w holds 2 * hsize * gsize vectors.
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+    // CH picks the mask/matrix builders (presumably: all controls are high).
+    if (CH) {
+      auto m = GetMasks5<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned r = 3 + H + cqs.size();
+      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
+    } else {
+      auto m = GetMasks6<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 3>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+      // m.cl is subtracted from the control count (presumably low controls).
+      unsigned r = 3 + H + cqs.size() - m.cl;
+      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
+    }
+  }
+
+  template <unsigned H>  // BMI2 (_pdep_u64) path; matrix is 2^H x 2^H.
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // ru/iu: broadcast matrix entry (re/im); rn/in: accumulated product.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Base address: bits of i deposited into the positions set in imaskh.
+      auto p0 = rstate + _pdep_u64(i, imaskh);
+      // Load 8 real parts (rs) and the 8 imaginary parts after them (is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);
+        is[k] = _mm256_load_ps(p0 + p + 8);
+      }
+      // Totals are kept in double; j indexes v as (re, im) float pairs.
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // (rn, in) = row k of the matrix applied to the loaded amplitudes.
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+        // Lane-wise conj(rs[k] + i * is[k]) * (rn + i * in).
+        __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn));
+        __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in));
+        // HorizontalSumAVX presumably adds up the 8 lanes of its argument.
+        re += detail::HorizontalSumAVX(v_re);
+        im += detail::HorizontalSumAVX(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    auto m = GetMasks1<H, 3>(qs);
+    // Work size handed to RunReduce: 2^n with n = max(num_qubits - k, 0).
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Results of f are combined with std::plus (as defined by the For policy).
+    using Op = std::plus<std::complex<double>>;
+    return
+        for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // BMI2 (_pdep_u64) path.
+  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
+                const fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // rn/in: accumulated product; rs/is: inputs plus lane-permuted copies.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // Base address: bits of i deposited into the positions set in imaskh.
+      auto p0 = rstate + _pdep_u64(i, imaskh);
+      // Per k: load one register pair, then derive lsize - 1 permuted copies.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm256_load_ps(p0 + p);
+        is[k2] = _mm256_load_ps(p0 + p + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // Totals are kept in double; w[j] / w[j + 1] are re / im coefficients.
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // (rn, in) = output row k, a complex sum over all gsize inputs.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // Index of the unpermuted pair loaded for k.
+        unsigned m = lsize * k;  // NOTE: shadows the unused lambda parameter m.
+        // Lane-wise conj(rs[m] + i * is[m]) * (rn + i * in).
+        __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn));
+        __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in));
+        // HorizontalSumAVX presumably adds up the 8 lanes of its argument.
+        re += detail::HorizontalSumAVX(v_re);
+        im += detail::HorizontalSumAVX(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+    // f reads idx[0 .. lsize - 2]; w holds 2 * hsize * gsize vectors.
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks2<H, L, 3>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+    // Work size handed to RunReduce: 2^n with n = max(num_qubits - r, 0).
+    unsigned r = 3 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Results of f are combined with std::plus (as defined by the For policy).
+    using Op = std::plus<std::complex<double>>;
+    return
+        for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get());
+  }
+
+#else  // __BMI2__
+
+  template <unsigned H>  // Non-BMI2 path: ms/xss tables instead of _pdep_u64.
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // ru/iu: broadcast matrix entry (re/im); rn/in: accumulated output.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Build ii from i: shift left by 3, then by one more bit per mask,
+      i *= 8;
+      // OR-ing together the pieces selected by ms[0..H].
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // ii is doubled to obtain the float offset into rstate.
+      auto p0 = rstate + 2 * ii;
+      // Load 8 real parts (rs) and the 8 imaginary parts after them (is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
+      }
+      // j indexes v, which is read row by row as (re, im) float pairs.
+      uint64_t j = 0;
+      // Row k of the output: complex sum over l of v[k][l] * (rs[l], is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+        // Safe to overwrite in place: every input is already held in rs/is.
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+    // ms: H + 1 bit masks; xss: 2^H offsets; both are filled by FillIndices.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // Work size handed to for_.Run: 2^n with n = max(num_qubits - k, 0).
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // Non-BMI2 path (ms/xss index tables).
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // rn/in: accumulated output; rs/is: inputs plus lane-permuted copies.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // Build ii from i: shift left by 3, then by one more bit per mask,
+      i *= 8;
+      // OR-ing together the pieces selected by ms[0..H].
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // ii is doubled to obtain the float offset into rstate.
+      auto p0 = rstate + 2 * ii;
+      // Per k: load one register pair, then derive lsize - 1 permuted copies.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        rs[k2] = _mm256_load_ps(p0 + xss[k]);
+        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // w[j] / w[j + 1]: per-lane real / imaginary coefficient vectors.
+      uint64_t j = 0;
+      // hsize output rows, each a complex sum over all gsize inputs.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // Safe to overwrite in place: every input is already held in rs/is.
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+    // f reads idx[0 .. lsize - 2]; w holds 2 * hsize * gsize vectors.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks11<L>(qs);
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+    // Work size handed to for_.Run: 2^n with n = max(num_qubits - r, 0).
+    unsigned r = 3 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss, idx, state.get());
+  }
+
+  template <unsigned H>  // Non-BMI2 path: ms/xss tables instead of _pdep_u64.
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // ru/iu: broadcast matrix entry (re/im); rn/in: accumulated output.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Build ii from i: shift left by 3, then by one more bit per mask,
+      i *= 8;
+      // OR-ing together the pieces selected by ms[0..H].
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Leave the state untouched unless the bits under cmaskh equal cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+      // ii is doubled to obtain the float offset into rstate.
+      auto p0 = rstate + 2 * ii;
+      // Load 8 real parts (rs) and the 8 imaginary parts after them (is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
+      }
+      // j indexes v, which is read row by row as (re, im) float pairs.
+      uint64_t j = 0;
+      // Row k of the output: complex sum over l of v[k][l] * (rs[l], is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+        // Safe to overwrite in place: every input is already held in rs/is.
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+    // ms: H + 1 bit masks; xss: 2^H offsets; both are filled by FillIndices.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // cqs.size() is not subtracted here: f filters on the control bits itself.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  template <unsigned H>  // Non-BMI2 path; matrix pre-expanded into w.
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // rn/in: accumulated real/imaginary output; rs/is: loaded input.
+      __m256 rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Build ii from i: shift left by 3, then by one more bit per mask,
+      i *= 8;
+      // OR-ing together the pieces selected by ms[0..H].
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Leave the state untouched unless the bits under cmaskh equal cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+      // ii is doubled to obtain the float offset into rstate.
+      auto p0 = rstate + 2 * ii;
+      // Load 8 real parts (rs) and the 8 imaginary parts after them (is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
+      }
+      // w[j] / w[j + 1]: per-lane real / imaginary coefficient vectors.
+      uint64_t j = 0;
+      // Row k of the output: complex sum over l of w-entry * (rs[l], is[l]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // Safe to overwrite in place: every input is already held in rs/is.
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+    // w: two vectors (re, im) per matrix entry, 2 * 2^H * 2^H in total.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256 w[1 << (1 + 2 * H)];
+    // w is filled from matrix together with the cvalsl/cmaskl mask fields.
+    auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+    // cqs.size() is not subtracted here: f filters on the control bits itself.
+    unsigned r = 3 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // Non-BMI2 path (ms/xss tables).
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, const __m256i* idx, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // rn/in: accumulated output; rs/is: inputs plus lane-permuted copies.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // Build ii from i: shift left by 3, then by one more bit per mask,
+      i *= 8;
+      // OR-ing together the pieces selected by ms[0..H].
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Leave the state untouched unless the bits under cmaskh equal cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+      // ii is doubled to obtain the float offset into rstate.
+      auto p0 = rstate + 2 * ii;
+      // Per k: load one register pair, then derive lsize - 1 permuted copies.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm256_load_ps(p0 + xss[k]);
+        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // w[j] / w[j + 1]: per-lane real / imaginary coefficient vectors.
+      uint64_t j = 0;
+      // hsize output rows, each a complex sum over all gsize inputs.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // Safe to overwrite in place: every input is already held in rs/is.
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+    // f reads idx[0 .. lsize - 2]; w holds 2 * hsize * gsize vectors.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    // cqs.size() is not subtracted here: f filters on the control bits itself.
+    unsigned r = 3 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+    // CH picks the mask/matrix builders (presumably: all controls are high).
+    if (CH) {
+      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get());
+    } else {
+      auto m = GetMasks10<L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 3>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get());
+    }
+  }
+
+  template <unsigned H>  // Non-BMI2 path: ms/xss tables instead of _pdep_u64.
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // ru/iu: broadcast matrix entry (re/im); rn/in: accumulated product.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+      // Build ii from i: shift left by 3, then by one more bit per mask,
+      i *= 8;
+      // OR-ing together the pieces selected by ms[0..H].
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // ii is doubled to obtain the float offset into rstate.
+      auto p0 = rstate + 2 * ii;
+      // Load 8 real parts (rs) and the 8 imaginary parts after them (is).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
+      }
+      // Totals are kept in double; j indexes v as (re, im) float pairs.
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // (rn, in) = row k of the matrix applied to the loaded amplitudes.
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+        // Lane-wise conj(rs[k] + i * is[k]) * (rn + i * in).
+        __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn));
+        __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in));
+        // HorizontalSumAVX presumably adds up the 8 lanes of its argument.
+        re += detail::HorizontalSumAVX(v_re);
+        im += detail::HorizontalSumAVX(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+    // ms: H + 1 bit masks; xss: 2^H offsets; both are filled by FillIndices.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // Work size handed to RunReduce: 2^n with n = max(num_qubits - k, 0).
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Results of f are combined with std::plus (as defined by the For policy).
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // Non-BMI2 path (ms/xss index tables).
+  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
+                const fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // rn/in: accumulated product; rs/is: inputs plus lane-permuted copies.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+      // Build ii from i: shift left by 3, then by one more bit per mask,
+      i *= 8;
+      // OR-ing together the pieces selected by ms[0..H].
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // ii is doubled to obtain the float offset into rstate.
+      auto p0 = rstate + 2 * ii;
+      // Per k: load one register pair, then derive lsize - 1 permuted copies.
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm256_load_ps(p0 + xss[k]);
+        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // Totals are kept in double; w[j] / w[j + 1] are re / im coefficients.
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // (rn, in) = output row k, a complex sum over all gsize inputs.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+        // Index of the unpermuted pair loaded for k.
+        unsigned m = lsize * k;  // NOTE: shadows the unused lambda parameter m.
+        // Lane-wise conj(rs[m] + i * is[m]) * (rn + i * in).
+        __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn));
+        __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in));
+        // HorizontalSumAVX presumably adds up the 8 lanes of its argument.
+        re += detail::HorizontalSumAVX(v_re);
+        im += detail::HorizontalSumAVX(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+    // f reads idx[0 .. lsize - 2]; w holds 2 * hsize * gsize vectors.
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks11<L>(qs);
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+    // Work size handed to RunReduce: 2^n with n = max(num_qubits - r, 0).
+    unsigned r = 3 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Results of f are combined with std::plus (as defined by the For policy).
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, state.get());
+  }
+
+#endif  // __BMI2__
+
+  template <unsigned L>  // Fills idx[0 .. 2^L - 2] with 8-lane permutations.
+  static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) {
+    constexpr unsigned lsize = 1 << L;
+    // Entry i is built from MaskedAdd with addend i + 1 (never the addend 0).
+    for (unsigned i = 0; i < lsize - 1; ++i) {
+      unsigned p[8];
+      // Bits of j outside qmaskl are kept; the rest come from MaskedAdd.
+      for (unsigned j = 0; j < 8; ++j) {
+        p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
+      }
+      // _mm256_set_epi32 lists elements high to low, so p[0] lands in lane 0.
+      idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]);
+    }
+  }
+
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_AVX_H_
diff --git a/tpls/qsim/simulator_avx512.h b/tpls/qsim/simulator_avx512.h
new file mode 100644
index 0000000..21a2e9d
--- /dev/null
+++ b/tpls/qsim/simulator_avx512.h
@@ -0,0 +1,846 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_AVX512_H_
+#define SIMULATOR_AVX512_H_
+
+#include <immintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "statespace_avx512.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator with AVX512 vectorization.
+ */
+template <typename For>
+class SimulatorAVX512 final : public SimulatorBase {
+ public:
+  using StateSpace = StateSpaceAVX512<For>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  template <typename... ForArgs>  // Passed as lvalues to For's constructor.
+  explicit SimulatorAVX512(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate using AVX512 instructions (no-op for more than 6 qubits).
+   * @param qs Indices of the qubits affected by this gate, in ascending order.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    switch (qs.size()) {  // <H, L>: H qubits with index > 3, L with index <= 3.
+    case 0:
+      ApplyGateH<0>(qs, matrix, state);
+      break;
+    case 1:
+      if (qs[0] > 3) {
+        ApplyGateH<1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<1, 3>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 4>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 3) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<2, 3>(qs, matrix, state);
+      } else {  // qs[3] <= 3: with sorted qs, qs[0..3] are 0, 1, 2, 3.
+        ApplyGateL<1, 4>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 3) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<3, 3>(qs, matrix, state);
+      } else {  // qs[3] <= 3: with sorted qs, qs[0..3] are 0, 1, 2, 3.
+        ApplyGateL<2, 4>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: gates on more than 6 qubits leave state unchanged.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate using AVX512 instructions (at most 4 targets).
+   * @param qs Indices of the qubits affected by this gate, in ascending order.
+   * @param cqs Indices of control qubits, in ascending order.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
+
+    if (cqs.size() == 0) {  // No controls: same as an uncontrolled gate.
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    switch (qs.size()) {  // cqs[0] > 3 means every control index is > 3.
+    case 0:
+      if (cqs[0] > 3) {
+        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
+      } else {
+        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
+      }
+      break;
+    case 1:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[3] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: more than 4 target qubits leaves state unchanged.
+      break;
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using AVX512 instructions.
+   * @param qs Indices of the qubits the operator acts on, in ascending order.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The expectation value, or 0 if qs has 0 or more than 6 entries.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    switch (qs.size()) {  // <H, L>: H qubits with index > 3, L with index <= 3.
+    case 1:
+      if (qs[0] > 3) {
+        return ExpectationValueH<1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        return ExpectationValueH<2>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValueL<1, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        return ExpectationValueH<3>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValueL<2, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValueL<1, 2>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        return ExpectationValueH<4>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValueL<3, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValueL<2, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        return ExpectationValueL<1, 3>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 4>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 3) {
+        return ExpectationValueH<5>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValueL<4, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValueL<3, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        return ExpectationValueL<2, 3>(qs, matrix, state);
+      } else {  // qs[3] <= 3: with sorted qs, qs[0..3] are 0, 1, 2, 3.
+        return ExpectationValueL<1, 4>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 3) {
+        return ExpectationValueH<6>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        return ExpectationValueL<5, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        return ExpectationValueL<4, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        return ExpectationValueL<3, 3>(qs, matrix, state);
+      } else {  // qs[3] <= 3: with sorted qs, qs[0..3] are 0, 1, 2, 3.
+        return ExpectationValueL<2, 4>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: falls through to the zero result below.
+      break;
+    }
+
+    return 0;
+  }
+
+  /**
+   * @return The SIMD register size; 16 matches the 16-float stride used below.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 16;
+  }
+
+ private:
+  template <unsigned H>  // H gate qubits, all with index > 3 (see ApplyGate).
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Matrix dimension, 2^H.
+
+      __m512 ru, iu, rn, in;
+      __m512 rs[hsize], is[hsize];  // Real and imaginary parts of the inputs.
+
+      auto p0 = rstate + _pdep_u64(i, imaskh);  // Bits of i placed per imaskh.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // Load every input before storing.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm512_load_ps(p0 + p);  // 16 real parts, ...
+        is[k] = _mm512_load_ps(p0 + p + 16);  // ... then 16 imaginary parts.
+      }
+
+      uint64_t j = 0;  // Walks v as interleaved (re, im) pairs, row by row.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // One matrix row per iteration.
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);  // rn = rs * ru - is * iu.
+        in = _mm512_fmadd_ps(is[0], ru, in);  // in = rs * iu + is * ru.
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // Add the remaining columns.
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[l], ru, rn);
+          in = _mm512_fmadd_ps(rs[l], iu, in);
+          rn = _mm512_fnmadd_ps(is[l], iu, rn);
+          in = _mm512_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);  // Written back to where rs[k] was read.
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    auto m = GetMasks1<H, 4>(qs);
+
+    unsigned k = 4 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // Iteration count passed to for_.Run.
+
+    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // H qubits with index > 3, L with <= 3.
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // Terms summed per output.
+      constexpr unsigned hsize = 1 << H;  // Registers loaded and stored.
+      constexpr unsigned lsize = 1 << L;  // Lane arrangements per register.
+
+      __m512 rn, in;
+      __m512 rs[gsize], is[gsize];
+
+      auto p0 = rstate + _pdep_u64(i, imaskh);  // Bits of i placed per imaskh.
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;  // Slot of the register as loaded.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm512_load_ps(p0 + p);  // 16 real parts, ...
+        is[k2] = _mm512_load_ps(p0 + p + 16);  // ... then 16 imaginary parts.
+
+        for (unsigned l = 1; l < lsize; ++l) {  // Reads idx[0 .. lsize - 2].
+          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
+          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
+        }
+      }
+
+      uint64_t j = 0;  // Walks w as (re, im) register pairs.
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // Lane-wise: w holds vectors.
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // rn = rs*wr - is*wi.
+        in = _mm512_fmadd_ps(is[0], w[j], in);  // in = rs*wi + is*wr.
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {  // Add the remaining terms.
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);  // Written back to where rs[k2] was read.
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512i idx[1 << L];
+    __m512 w[1 << (1 + 2 * H + L)];  // 2 * hsize * gsize registers, as read by f.
+
+    auto m = GetMasks2<H, L, 4>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned r = 4 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;  // Iteration count passed to for_.Run.
+
+    for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get());
+  }
+
+  template <unsigned H>  // H targets with index > 3; all controls > 3 too.
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Matrix dimension, 2^H.
+
+      __m512 ru, iu, rn, in;
+      __m512 rs[hsize], is[hsize];  // Real and imaginary parts of the inputs.
+
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);  // cvalsh from GetMasks3.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // Load every input before storing.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm512_load_ps(p0 + p);  // 16 real parts, ...
+        is[k] = _mm512_load_ps(p0 + p + 16);  // ... then 16 imaginary parts.
+      }
+
+      uint64_t j = 0;  // Walks v as interleaved (re, im) pairs, row by row.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // One matrix row per iteration.
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);  // rn = rs * ru - is * iu.
+        in = _mm512_fmadd_ps(is[0], ru, in);  // in = rs * iu + is * ru.
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // Add the remaining columns.
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[l], ru, rn);
+          in = _mm512_fmadd_ps(rs[l], iu, in);
+          rn = _mm512_fnmadd_ps(is[l], iu, rn);
+          in = _mm512_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);  // Written back to where rs[k] was read.
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    auto m = GetMasks3<H, 4>(state.num_qubits(), qs, cqs, cvals);
+
+    unsigned k = 4 + H + cqs.size();  // Control qubits are not iterated over.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // Iteration count passed to for_.Run.
+
+    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get());
+  }
+
+  template <unsigned H>  // H targets with index > 3; lowest control is <= 3.
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Matrix dimension, 2^H.
+
+      __m512 rn, in;
+      __m512 rs[hsize], is[hsize];  // Real and imaginary parts of the inputs.
+
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);  // cvalsh from GetMasks4.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // Load every input before storing.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm512_load_ps(p0 + p);  // 16 real parts, ...
+        is[k] = _mm512_load_ps(p0 + p + 16);  // ... then 16 imaginary parts.
+      }
+
+      uint64_t j = 0;  // Walks w as (re, im) register pairs.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // One matrix row per iteration.
+        rn = _mm512_mul_ps(rs[0], w[j]);  // Lane-wise: w holds vectors.
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // rn = rs*wr - is*wi.
+        in = _mm512_fmadd_ps(is[0], w[j], in);  // in = rs*wi + is*wr.
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // Add the remaining columns.
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);  // Written back to where rs[k] was read.
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512 w[1 << (1 + 2 * H)];  // 2 * hsize * hsize registers, as read by f.
+
+    auto m = GetMasks4<H, 4>(state.num_qubits(), qs, cqs, cvals);
+    FillControlledMatrixH<H, 4>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+
+    unsigned r = 4 + H + cqs.size() - m.cl;  // m.cl: presumably # of low controls.
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;  // Iteration count passed to for_.Run.
+
+    for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // CH: every control index > 3.
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                const __m512i* idx, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // Terms summed per output.
+      constexpr unsigned hsize = 1 << H;  // Registers loaded and stored.
+      constexpr unsigned lsize = 1 << L;  // Lane arrangements per register.
+
+      __m512 rn, in;
+      __m512 rs[gsize], is[gsize];
+
+      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;  // Slot of the register as loaded.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm512_load_ps(p0 + p);  // 16 real parts, ...
+        is[k2] = _mm512_load_ps(p0 + p + 16);  // ... then 16 imaginary parts.
+
+        for (unsigned l = 1; l < lsize; ++l) {  // Reads idx[0 .. lsize - 2].
+          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
+          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
+        }
+      }
+
+      uint64_t j = 0;  // Walks w as (re, im) register pairs.
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // Lane-wise: w holds vectors.
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // rn = rs*wr - is*wi.
+        in = _mm512_fmadd_ps(is[0], w[j], in);  // in = rs*wi + is*wr.
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {  // Add the remaining terms.
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm512_store_ps(p0 + p, rn);  // Written back to where rs[k2] was read.
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512i idx[1 << L];
+    __m512 w[1 << (1 + 2 * H + L)];  // 2 * hsize * gsize registers, as read by f.
+
+    if (CH) {  // Controls are all high: the plain gate matrix is used.
+      auto m = GetMasks5<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned r = 4 + H + cqs.size();
+      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
+    } else {  // Lowest control is <= 3: low control values go into w.
+      auto m = GetMasks6<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 4>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned r = 4 + H + cqs.size() - m.cl;  // m.cl: presumably # of low controls.
+      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
+    }
+  }
+
+  template <unsigned H>  // H operator qubits, all with index > 3.
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // Matrix dimension, 2^H.
+
+      __m512 ru, iu, rn, in;
+      __m512 rs[hsize], is[hsize];  // Real and imaginary parts of the inputs.
+
+      auto p0 = rstate + _pdep_u64(i, imaskh);  // Bits of i placed per imaskh.
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm512_load_ps(p0 + p);  // 16 real parts, ...
+        is[k] = _mm512_load_ps(p0 + p + 16);  // ... then 16 imaginary parts.
+      }
+
+      double re = 0;  // Partial sums are accumulated in double precision.
+      double im = 0;
+      uint64_t j = 0;  // Walks v as interleaved (re, im) pairs, row by row.
+
+      for (unsigned k = 0; k < hsize; ++k) {  // One matrix row per iteration.
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);  // rn = rs * ru - is * iu.
+        in = _mm512_fmadd_ps(is[0], ru, in);  // in = rs * iu + is * ru.
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // Add the remaining columns.
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[l], ru, rn);
+          in = _mm512_fmadd_ps(rs[l], iu, in);
+          rn = _mm512_fnmadd_ps(is[l], iu, rn);
+          in = _mm512_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        __m512 v_re = _mm512_fmadd_ps(is[k], in, _mm512_mul_ps(rs[k], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[k], rn, _mm512_mul_ps(rs[k], in));
+
+        re += detail::HorizontalSumAVX512(v_re);  // rs[k]*rn + is[k]*in.
+        im += detail::HorizontalSumAVX512(v_im);  // rs[k]*in - is[k]*rn.
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    auto m = GetMasks1<H, 4>(qs);
+
+    unsigned k = 4 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // Iteration count passed to RunReduce.
+
+    using Op = std::plus<std::complex<double>>;  // Combines the f results.
+    return
+        for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // H qubits with index > 3, L with <= 3.
+  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
+                const fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // Terms summed per output.
+      constexpr unsigned hsize = 1 << H;  // Registers loaded per call.
+      constexpr unsigned lsize = 1 << L;  // Lane arrangements per register.
+
+      __m512 rn, in;
+      __m512 rs[gsize], is[gsize];
+
+      auto p0 = rstate + _pdep_u64(i, imaskh);  // Bits of i placed per imaskh.
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;  // Slot of the register as loaded.
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm512_load_ps(p0 + p);  // 16 real parts, ...
+        is[k2] = _mm512_load_ps(p0 + p + 16);  // ... then 16 imaginary parts.
+
+        for (unsigned l = 1; l < lsize; ++l) {  // Reads idx[0 .. lsize - 2].
+          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
+          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
+        }
+      }
+
+      double re = 0;  // Partial sums are accumulated in double precision.
+      double im = 0;
+      uint64_t j = 0;  // Walks w as (re, im) register pairs.
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm512_mul_ps(rs[0], w[j]);  // Lane-wise: w holds vectors.
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // rn = rs*wr - is*wi.
+        in = _mm512_fmadd_ps(is[0], w[j], in);  // in = rs*wi + is*wr.
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {  // Add the remaining terms.
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        unsigned m = lsize * k;  // The unpermuted register for this k.
+
+        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
+        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
+
+        re += detail::HorizontalSumAVX512(v_re);  // rs[m]*rn + is[m]*in.
+        im += detail::HorizontalSumAVX512(v_im);  // rs[m]*in - is[m]*rn.
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    __m512i idx[1 << L];
+    __m512 w[1 << (1 + 2 * H + L)];  // 2 * hsize * gsize registers, as read by f.
+
+    auto m = GetMasks2<H, L, 4>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned r = 4 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;  // Iteration count passed to RunReduce.
+
+    using Op = std::plus<std::complex<double>>;  // Combines the f results.
+    return
+        for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get());
+  }
+
+  template <unsigned L>  // Fills idx[0 .. lsize - 2] with lane-index vectors.
+  static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) {
+    constexpr unsigned lsize = 1 << L;
+
+    for (unsigned i = 0; i < lsize - 1; ++i) {  // Kernels only read idx[l - 1].
+      unsigned p[16];  // One 32-bit index per lane of the 512-bit vector.
+
+      for (unsigned j = 0; j < 16; ++j) {  // Bits of j outside qmaskl are kept.
+        p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+  }
+
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_AVX512_H_
diff --git a/tpls/qsim/simulator_basic.h b/tpls/qsim/simulator_basic.h
new file mode 100644
index 0000000..752eeb5
--- /dev/null
+++ b/tpls/qsim/simulator_basic.h
@@ -0,0 +1,349 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_BASIC_H_
+#define SIMULATOR_BASIC_H_
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "statespace_basic.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator without vectorization.
+ */
+template <typename For, typename FP = float>
+class SimulatorBasic final : public SimulatorBase {
+ public:
+  using StateSpace = StateSpaceBasic<For, FP>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  template <typename... ForArgs>  // All arguments go to the For member for_.
+  explicit SimulatorBasic(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate using non-vectorized instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // The number of gate qubits selects the ApplyGateH<H> instantiation.
+    switch (qs.size()) {
+    case 0:
+      ApplyGateH<0>(qs, matrix, state);
+      break;
+    case 1:
+      ApplyGateH<1>(qs, matrix, state);
+      break;
+    case 2:
+      ApplyGateH<2>(qs, matrix, state);
+      break;
+    case 3:
+      ApplyGateH<3>(qs, matrix, state);
+      break;
+    case 4:
+      ApplyGateH<4>(qs, matrix, state);
+      break;
+    case 5:
+      ApplyGateH<5>(qs, matrix, state);
+      break;
+    case 6:
+      ApplyGateH<6>(qs, matrix, state);
+      break;
+    default:
+      // Not implemented: gates on more than 6 qubits leave the state unchanged.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate using non-vectorized instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Without control qubits this is an ordinary gate application.
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    switch (qs.size()) {
+    case 0:
+      ApplyControlledGateH<0>(qs, cqs, cvals, matrix, state);
+      break;
+    case 1:
+      ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state);
+      break;
+    case 2:
+      ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state);
+      break;
+    case 3:
+      ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state);
+      break;
+    case 4:
+      ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state);
+      break;
+    default:
+      // Not implemented: more than 4 target qubits leave the state unchanged.
+      break;
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using non-vectorized
+   * instructions.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // The number of qubits selects the ExpectationValueH<H> instantiation.
+    switch (qs.size()) {
+    case 1:
+      return ExpectationValueH<1>(qs, matrix, state);
+      break;
+    case 2:
+      return ExpectationValueH<2>(qs, matrix, state);
+      break;
+    case 3:
+      return ExpectationValueH<3>(qs, matrix, state);
+      break;
+    case 4:
+      return ExpectationValueH<4>(qs, matrix, state);
+      break;
+    case 5:
+      return ExpectationValueH<5>(qs, matrix, state);
+      break;
+    case 6:
+      return ExpectationValueH<6>(qs, matrix, state);
+      break;
+    default:
+      // Not implemented (0 or more than 6 qubits): falls through to return 0.
+      break;
+    }
+
+    return 0;
+  }
+
+  /**
+   * @return The size of SIMD register if applicable.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 1;  // This simulator is documented as not vectorized.
+  }
+
+ private:
+  template <unsigned H>  // H is the number of gate qubits (qs.size()).
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // n and m are not used in this body.
+      fp_type rn, in;
+      fp_type rs[hsize], is[hsize];
+      // Build the base offset ii: shift i left one bit per step, mask by ms[j].
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+      // Gather hsize amplitudes: real part at p0 + xss[k], imaginary part next.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = *(p0 + xss[k]);
+        is[k] = *(p0 + xss[k] + 1);
+      }
+
+      uint64_t j = 0;
+      // Complex matrix-vector product; v holds (re, im) pairs read in row order.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = rs[0] * v[j] - is[0] * v[j + 1];
+        in = rs[0] * v[j + 1] + is[0] * v[j];
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn += rs[l] * v[j] - is[l] * v[j + 1];
+          in += rs[l] * v[j + 1] + is[l] * v[j];
+
+          j += 2;
+        }
+        // Store output amplitude k in place; inputs were copied to rs/is above.
+        *(p0 + xss[k]) = rn;
+        *(p0 + xss[k] + 1) = in;
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // size = 2^(num_qubits - H), or 1 when the state has at most H qubits.
+    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, state.get());
+  }
+
+  template <unsigned H>  // H is the number of target qubits (qs.size()).
+  void ApplyControlledGateH(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs,
+                            uint64_t cvals, const fp_type* matrix,
+                            State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                uint64_t cvalsh, uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // n and m are not used in this body.
+      fp_type rn, in;
+      fp_type rs[hsize], is[hsize];
+      // Build the base offset ii: shift i left one bit per step, mask by ms[j].
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Skip work items whose bits under cmaskh differ from cvalsh.
+      if ((ii & cmaskh) == cvalsh) {
+        auto p0 = rstate + 2 * ii;
+        // Gather hsize amplitudes: real part at p0 + xss[k], imaginary next.
+        for (unsigned k = 0; k < hsize; ++k) {
+          rs[k] = *(p0 + xss[k]);
+          is[k] = *(p0 + xss[k] + 1);
+        }
+
+        uint64_t j = 0;
+        // Complex matrix-vector product; v holds (re, im) pairs in row order.
+        for (unsigned k = 0; k < hsize; ++k) {
+          rn = rs[0] * v[j] - is[0] * v[j + 1];
+          in = rs[0] * v[j + 1] + is[0] * v[j];
+
+          j += 2;
+
+          for (unsigned l = 1; l < hsize; ++l) {
+            rn += rs[l] * v[j] - is[l] * v[j + 1];
+            in += rs[l] * v[j + 1] + is[l] * v[j];
+
+            j += 2;
+          }
+          // Store output amplitude k in place.
+          *(p0 + xss[k]) = rn;
+          *(p0 + xss[k] + 1) = in;
+        }
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // Only m.cvalsh and m.cmaskh of the returned masks are used below.
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+    // size = 2^(num_qubits - H); control qubits are not removed from the range.
+    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  template <unsigned H>  // H is the number of qubits the operator acts on.
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // n and m are not used in this body.
+      fp_type rn, in;
+      fp_type rs[hsize], is[hsize];
+      // Build the base offset ii: shift i left one bit per step, mask by ms[j].
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+      // Gather hsize amplitudes: real part at p0 + xss[k], imaginary part next.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = *(p0 + xss[k]);
+        is[k] = *(p0 + xss[k] + 1);
+      }
+      // Partial sums are kept in double even though amplitudes are fp_type.
+      double re = 0;
+      double im = 0;
+
+      uint64_t j = 0;
+      // (rn, in) is row k of the matrix applied to the gathered amplitudes.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = rs[0] * v[j] - is[0] * v[j + 1];
+        in = rs[0] * v[j + 1] + is[0] * v[j];
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn += rs[l] * v[j] - is[l] * v[j + 1];
+          in += rs[l] * v[j + 1] + is[l] * v[j];
+
+          j += 2;
+        }
+        // Accumulate conj(amplitude k) * (rn + i * in).
+        re += rs[k] * rn + is[k] * in;
+        im += rs[k] * in - is[k] * rn;
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // size = 2^(num_qubits - H), or 1 when the state has at most H qubits.
+    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
+    uint64_t size = uint64_t{1} << n;
+    // The per-item results returned by f are combined with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
+  }
+
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_BASIC_H_
diff --git a/tpls/qsim/simulator_cuda.h b/tpls/qsim/simulator_cuda.h
new file mode 100644
index 0000000..5743bea
--- /dev/null
+++ b/tpls/qsim/simulator_cuda.h
@@ -0,0 +1,923 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUDA_H_
+#define SIMULATOR_CUDA_H_
+
+#include "simulator_cuda_kernels.h"
+
+#include <algorithm>
+#include <complex>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+#include "bits.h"
+#include "statespace_cuda.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator with GPU vectorization.
+ */
+template <typename FP = float>
+class SimulatorCUDA final {
+ private:
+  using idx_type = uint64_t;
+  using Complex = qsim::Complex<double>;
+
+  // The maximum buffer size for indices and gate matrices.
+  // The maximum gate matrix size (for 6-qubit gates) is
+  // 2 * 2^6 * 2^6 * sizeof(FP) = 8192 * sizeof(FP). The maximum index size is
+  // 128 * sizeof(idx_type) + 96 * sizeof(unsigned).
+  static constexpr unsigned max_buf_size = 8192 * sizeof(FP)
+      + 128 * sizeof(idx_type) + 96 * sizeof(unsigned);
+
+ public:
+  using StateSpace = StateSpaceCUDA<FP>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  SimulatorCUDA() : scratch_(nullptr), scratch_size_(0) {
+    ErrorCheck(cudaMalloc(&d_ws, max_buf_size));  // Device workspace buffer.
+  }
+
+  ~SimulatorCUDA() {
+    ErrorCheck(cudaFree(d_ws));
+    // scratch_ starts out as nullptr, so it is freed only if it was allocated.
+    if (scratch_ != nullptr) {
+      ErrorCheck(cudaFree(scratch_));
+    }
+  }
+
+  /**
+   * Applies a gate using CUDA instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // H variants when the lowest gate qubit exceeds 4, L variants otherwise.
+    if (qs.size() == 0) {
+      ApplyGateH<0>(qs, matrix, state);
+    } else if (qs[0] > 4) {
+      switch (qs.size()) {
+      case 1:
+        ApplyGateH<1>(qs, matrix, state);
+        break;
+      case 2:
+        ApplyGateH<2>(qs, matrix, state);
+        break;
+      case 3:
+        ApplyGateH<3>(qs, matrix, state);
+        break;
+      case 4:
+        ApplyGateH<4>(qs, matrix, state);
+        break;
+      case 5:
+        ApplyGateH<5>(qs, matrix, state);
+        break;
+      case 6:
+        ApplyGateH<6>(qs, matrix, state);
+        break;
+      default:
+        // Not implemented: more than 6 gate qubits leave the state unchanged.
+        break;
+      }
+    } else {
+      switch (qs.size()) {
+      case 1:
+        ApplyGateL<1>(qs, matrix, state);
+        break;
+      case 2:
+        ApplyGateL<2>(qs, matrix, state);
+        break;
+      case 3:
+        ApplyGateL<3>(qs, matrix, state);
+        break;
+      case 4:
+        ApplyGateL<4>(qs, matrix, state);
+        break;
+      case 5:
+        ApplyGateL<5>(qs, matrix, state);
+        break;
+      case 6:
+        ApplyGateL<6>(qs, matrix, state);
+        break;
+      default:
+        // Not implemented: more than 6 gate qubits leave the state unchanged.
+        break;
+      }
+    }
+  }
+
+  /**
+   * Applies a controlled gate using CUDA instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Without control qubits this is an ordinary gate application.
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+    // cqs[0] < 5 selects the L variant; otherwise HH or LH depending on qs[0].
+    if (cqs[0] < 5) {
+      switch (qs.size()) {
+      case 0:
+        ApplyControlledGateL<0>(qs, cqs, cvals, matrix, state);
+        break;
+      case 1:
+        ApplyControlledGateL<1>(qs, cqs, cvals, matrix, state);
+        break;
+      case 2:
+        ApplyControlledGateL<2>(qs, cqs, cvals, matrix, state);
+        break;
+      case 3:
+        ApplyControlledGateL<3>(qs, cqs, cvals, matrix, state);
+        break;
+      case 4:
+        ApplyControlledGateL<4>(qs, cqs, cvals, matrix, state);
+        break;
+      default:
+        // Not implemented: more than 4 target qubits leave the state unchanged.
+        break;
+      }
+    } else {
+      if (qs.size() == 0) {
+        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
+      } else if (qs[0] > 4) {
+        switch (qs.size()) {
+        case 1:
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+          break;
+        case 2:
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+          break;
+        case 3:
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+          break;
+        case 4:
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+          break;
+        default:
+          // Not implemented: more than 4 target qubits.
+          break;
+        }
+      } else {
+        switch (qs.size()) {
+        case 1:
+          ApplyControlledGateLH<1>(qs, cqs, cvals, matrix, state);
+          break;
+        case 2:
+          ApplyControlledGateLH<2>(qs, cqs, cvals, matrix, state);
+          break;
+        case 3:
+          ApplyControlledGateLH<3>(qs, cqs, cvals, matrix, state);
+          break;
+        case 4:
+          ApplyControlledGateLH<4>(qs, cqs, cvals, matrix, state);
+          break;
+        default:
+          // Not implemented: more than 4 target qubits.
+          break;
+        }
+      }
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using CUDA instructions.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // An empty qs must not read qs[0]; it takes the L branch and yields 0.
+    if (!qs.empty() && qs[0] > 4) {
+      switch (qs.size()) {
+      case 1:
+        return ExpectationValueH<1>(qs, matrix, state);
+      case 2:
+        return ExpectationValueH<2>(qs, matrix, state);
+      case 3:
+        return ExpectationValueH<3>(qs, matrix, state);
+      case 4:
+        return ExpectationValueH<4>(qs, matrix, state);
+      case 5:
+        return ExpectationValueH<5>(qs, matrix, state);
+      case 6:
+        return ExpectationValueH<6>(qs, matrix, state);
+      default:
+        // Not implemented: more than 6 qubits falls through to return 0.
+        break;
+      }
+    } else {
+      switch (qs.size()) {
+      case 1:
+        return ExpectationValueL<1>(qs, matrix, state);
+      case 2:
+        return ExpectationValueL<2>(qs, matrix, state);
+      case 3:
+        return ExpectationValueL<3>(qs, matrix, state);
+      case 4:
+        return ExpectationValueL<4>(qs, matrix, state);
+      case 5:
+        return ExpectationValueL<5>(qs, matrix, state);
+      case 6:
+        return ExpectationValueL<6>(qs, matrix, state);
+      default:
+        // Not implemented: 0 or more than 6 qubits falls through to return 0.
+        break;
+      }
+    }
+
+    return 0;
+  }
+
+  /**
+   * @return The size of SIMD register if applicable.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 32;  // NOTE(review): presumably the CUDA warp size - confirm.
+  }
+
+ private:
+  // The following indices are used in kernels.
+  // xss - indices to access the state vector entries in global memory.
+  // ms  - masks to access the state vector entries in global memory.
+  // tis - indices to access the state vector entries in shared memory
+  //       in the presence of low gate qubits (bits outside the gate mask).
+  // qis - indices to access the state vector entries in shared memory
+  //       in the presence of low gate qubits (bits inside the gate mask).
+  // cis - additional indices to access the state vector entries in global
+  //       memory in the presence of low control qubits.
+
+  template <unsigned G>  // G: number of gate qubits.
+  struct IndicesH {
+    static constexpr unsigned gsize = 1 << G;
+    static constexpr unsigned matrix_size = 2 * gsize * gsize * sizeof(fp_type);
+    static constexpr unsigned xss_size = 32 * sizeof(idx_type) * (1 + (G == 6));
+    static constexpr unsigned ms_size = 32 * sizeof(idx_type);
+    static constexpr unsigned xss_offs = matrix_size;
+    static constexpr unsigned ms_offs = xss_offs + xss_size;
+    static constexpr unsigned buf_size = ms_offs + ms_size;
+    // Byte layout of buffer p: matrix at offset 0, then xss, then ms.
+    IndicesH(char* p)
+        : xss((idx_type*) (p + xss_offs)), ms((idx_type*) (p + ms_offs)) {}
+
+    idx_type* xss;  // Points into p; room for 32 entries (64 when G == 6).
+    idx_type* ms;   // Points into p; room for 32 entries.
+  };
+
+  template <unsigned G>  // Extends the IndicesH layout with qis and tis.
+  struct IndicesL : public IndicesH<G> {
+    using Base = IndicesH<G>;
+    static constexpr unsigned qis_size = 32 * sizeof(unsigned) * (1 + (G == 6));
+    static constexpr unsigned tis_size = 32 * sizeof(unsigned);
+    static constexpr unsigned qis_offs = Base::buf_size;
+    static constexpr unsigned tis_offs = qis_offs + qis_size;
+    static constexpr unsigned buf_size = tis_offs + tis_size;
+    // qis and tis are placed directly after the IndicesH part of buffer p.
+    IndicesL(char* p)
+        : Base(p), qis((unsigned*) (p + qis_offs)),
+          tis((unsigned*) (p + tis_offs)) {}
+
+    unsigned* qis;  // Points into p; room for 32 entries (64 when G == 6).
+    unsigned* tis;  // Points into p; room for 32 entries.
+  };
+
+  template <unsigned G>  // Extends the IndicesL layout with cis.
+  struct IndicesLC : public IndicesL<G> {
+    using Base = IndicesL<G>;
+    static constexpr unsigned cis_size = 32 * sizeof(idx_type);
+    static constexpr unsigned cis_offs = Base::buf_size;
+    static constexpr unsigned buf_size = cis_offs + cis_size;
+    // cis is placed directly after the IndicesL part of buffer p.
+    IndicesLC(char* p) : Base(p), cis((idx_type*) (p + cis_offs)) {}
+
+    idx_type* cis;  // Points into p; room for 32 entries.
+  };
+
+  struct DataC {  // Returned by GetIndicesLC; presumably also GetIndicesLCL.
+    idx_type cvalsh;  // Handed to the controlled-gate kernels unchanged.
+    unsigned num_aqs;  // The kernels receive num_aqs + 1.
+    unsigned num_effective_qs;  // The kernels receive 1 << num_effective_qs.
+    unsigned remaining_low_cqs;  // ApplyControlledGateL passes 1 << (5 - it).
+  };
+
+  template <unsigned G>  // G gate qubits; used when qs is empty or qs[0] > 4.
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    unsigned num_qubits = state.num_qubits();
+    // h_i gives views of the xss and ms regions of the host buffer h_ws.
+    IndicesH<G> h_i(h_ws);
+    GetIndicesH(num_qubits, qs, qs.size(), h_i);
+    // Put the matrix at the start of h_ws, then upload matrix + indices at once.
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
+    // Launch geometry: 64 threads per block and max(1, 2^n / 2) blocks.
+    unsigned k = 5 + G;
+    unsigned n = num_qubits > k ? num_qubits - k : 0;
+    unsigned size = unsigned{1} << n;
+    unsigned threads = 64U;
+    unsigned blocks = std::max(1U, size / 2);
+    // d_i gives the same views, but into the device buffer d_ws.
+    IndicesH<G> d_i(d_ws);
+
+    ApplyGateH_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, state.get());
+  }
+
+  template <unsigned G>  // G gate qubits; used when qs[0] <= 4.
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    unsigned num_qubits = state.num_qubits();
+    // h_i gives views of the xss, ms, qis and tis regions of h_ws.
+    IndicesL<G> h_i(h_ws);
+    auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i);
+    // Put the matrix at the start of h_ws, then upload matrix + indices at once.
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
+    // Launch geometry: 32 threads per block and 2^n blocks.
+    unsigned k = 5 + num_effective_qs;
+    unsigned n = num_qubits > k ? num_qubits - k : 0;
+    unsigned size = unsigned{1} << n;
+    unsigned threads = 32;
+    unsigned blocks = size;
+    // d_i gives the same views, but into the device buffer d_ws.
+    IndicesL<G> d_i(d_ws);
+
+    ApplyGateL_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
+        1 << num_effective_qs, state.get());
+  }
+
+  template <unsigned G>  // Used when cqs[0] >= 5 and qs is empty or qs[0] > 4.
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, idx_type cvals,
+                             const fp_type* matrix, State& state) const {
+    unsigned aqs[64];
+    idx_type cmaskh = 0;
+    unsigned num_qubits = state.num_qubits();
+
+    IndicesH<G> h_i(h_ws);
+    // aqs: target and control qubits merged; cmaskh: one bit per control qubit.
+    unsigned num_aqs = GetHighQubits(qs, 0, cqs, 0, 0, cmaskh, aqs);
+    GetMs(num_qubits, aqs, num_aqs, h_i.ms);
+    GetXss(num_qubits, qs, qs.size(), h_i.xss);
+    // Presumably places the bits of cvals onto the positions set in cmaskh.
+    idx_type cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
+    // Put the matrix at the start of h_ws, then upload matrix + indices at once.
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
+    // Launch geometry: 64 threads per block and max(1, 2^n / 2) blocks.
+    unsigned k = 5 + G + cqs.size();
+    unsigned n = num_qubits > k ? num_qubits - k : 0;
+    unsigned size = unsigned{1} << n;
+    unsigned threads = 64U;
+    unsigned blocks = std::max(1U, size / 2);
+    // d_i gives the xss and ms views into the device buffer d_ws.
+    IndicesH<G> d_i(d_ws);
+
+    ApplyControlledGateH_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get());
+  }
+
+  template <unsigned G>  // Used when cqs[0] >= 5 and qs[0] <= 4.
+  void ApplyControlledGateLH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    unsigned num_qubits = state.num_qubits();
+    // h_i gives views of the xss, ms, qis and tis regions of h_ws.
+    IndicesL<G> h_i(h_ws);
+    auto d = GetIndicesLC(num_qubits, qs, cqs, cvals, h_i);
+    // Put the matrix at the start of h_ws, then upload matrix + indices at once.
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
+    // Launch geometry: 32 threads per block and 2^n blocks.
+    unsigned k = 5 + G + cqs.size();
+    unsigned n = num_qubits > k ? num_qubits - k : 0;
+    unsigned size = unsigned{1} << n;
+    unsigned threads = 32;
+    unsigned blocks = size;
+    // d_i gives the same views, but into the device buffer d_ws.
+    IndicesL<G> d_i(d_ws);
+
+    ApplyControlledGateLH_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
+        d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get());
+  }
+
+  template <unsigned G>  // Used when cqs[0] < 5.
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    unsigned num_qubits = state.num_qubits();
+    // h_i gives views of the xss, ms, qis, tis and cis regions of h_ws.
+    IndicesLC<G> h_i(h_ws);
+    auto d = GetIndicesLCL(num_qubits, qs, cqs, cvals, h_i);
+    // Put the matrix at the start of h_ws, then upload matrix + indices at once.
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
+    // Launch geometry: 32 threads per block and 2^n blocks.
+    unsigned k = 5 + G + cqs.size();
+    unsigned n = num_qubits > k ? num_qubits - k : 0;
+    unsigned size = unsigned{1} << n;
+    unsigned threads = 32;
+    unsigned blocks = size;
+    // d_i gives the same views, but into the device buffer d_ws.
+    IndicesLC<G> d_i(d_ws);
+
+    ApplyControlledGateL_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis,
+        d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs,
+        1 << (5 - d.remaining_low_cqs), state.get());
+  }
+
+  template <unsigned G>  // G qubits; used when qs[0] > 4.
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    unsigned num_qubits = state.num_qubits();
+    // h_i gives views of the xss and ms regions of the host buffer h_ws.
+    IndicesH<G> h_i(h_ws);
+    GetIndicesH(num_qubits, qs, qs.size(), h_i);
+    // Put the matrix at the start of h_ws, then upload matrix + indices at once.
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
+
+    unsigned k = 5 + G;
+    unsigned n = num_qubits > k ? num_qubits - k : 0;
+    unsigned size = unsigned{1} << n;
+    // For n >= 14 each block runs 2^s iterations (s <= 4), shrinking the grid.
+    unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U);
+    unsigned threads = 64U;
+    unsigned blocks = std::max(1U, (size / 2) >> s);
+    unsigned num_iterations_per_block = 1 << s;
+
+    constexpr unsigned m = 16;
+    // Scratch layout: `blocks` partial results (d_res1), then m more (d_res2).
+    Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex));
+    Complex* d_res2 = d_res1 + blocks;
+
+    IndicesH<G> d_i(d_ws);
+
+    ExpectationValueH_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block,
+        state.get(), Plus<double>(), d_res1);
+    // NOTE(review): the result is halved when size == 1 - confirm the reason.
+    double mul = size == 1 ? 0.5 : 1.0;
+
+    return ExpectationValueReduceFinal<m>(blocks, mul, d_res1, d_res2);
+  }
+
+  template <unsigned G>  // G qubits; used when qs[0] <= 4.
+  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    unsigned num_qubits = state.num_qubits();
+    // h_i gives views of the xss, ms, qis and tis regions of h_ws.
+    IndicesL<G> h_i(h_ws);
+    auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i);
+    // Put the matrix at the start of h_ws, then upload matrix + indices at once.
+    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
+    ErrorCheck(
+        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
+
+    unsigned k = 5 + num_effective_qs;
+    unsigned n = num_qubits > k ? num_qubits - k : 0;
+    unsigned size = unsigned{1} << n;
+    // For n >= 13 each block runs 2^s iterations (s <= 5), shrinking the grid.
+    unsigned s = std::min(n >= 13 ? n - 13 : 0, 5U);
+    unsigned threads = 32;
+    unsigned blocks = size >> s;
+    unsigned num_iterations_per_block = 1 << s;
+
+    constexpr unsigned m = 16;
+    // Scratch layout: `blocks` partial results (d_res1), then m more (d_res2).
+    Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex));
+    Complex* d_res2 = d_res1 + blocks;
+
+    IndicesL<G> d_i(d_ws);
+
+    ExpectationValueL_Kernel<G><<<blocks, threads>>>(
+        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
+        num_iterations_per_block, state.get(), Plus<double>(), d_res1);
+    // mul = 2^(5 + num_effective_qs - G) / 32, applied to the summed result.
+    double mul = double(1 << (5 + num_effective_qs - G)) / 32;
+
+    return ExpectationValueReduceFinal<m>(blocks, mul, d_res1, d_res2);
+  }
+
+  template <unsigned m>  // m: capacity of the host-side result buffer.
+  std::complex<double> ExpectationValueReduceFinal(
+      unsigned blocks, double mul,
+      const Complex* d_res1, Complex* d_res2) const {
+    Complex res2[m];
+    // Copy directly only when all partial results fit into res2 (m entries).
+    if (blocks <= m) {
+      ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex),
+                            cudaMemcpyDeviceToHost));
+    } else {
+      unsigned threads2 = std::min(1024U, blocks);
+      unsigned blocks2 = std::min(m, blocks / threads2);
+      // Reduce on the device to blocks2 (<= m) partial results first.
+      unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2));
+      unsigned bytes = threads2 * sizeof(Complex);
+
+      Reduce2Kernel<Complex><<<blocks2, threads2, bytes>>>(
+          dblocks, blocks, Plus<Complex>(), Plus<double>(), d_res1, d_res2);
+
+      ErrorCheck(cudaMemcpy(res2, d_res2, blocks2 * sizeof(Complex),
+                            cudaMemcpyDeviceToHost));
+
+      blocks = blocks2;
+    }
+    // Sum the remaining partial results on the host.
+    double re = 0;
+    double im = 0;
+
+    for (unsigned i = 0; i < blocks; ++i) {
+      re += res2[i].re;
+      im += res2[i].im;
+    }
+
+    return {mul * re, mul * im};
+  }
+
+  template <typename AQ>  // Merges qs[qi..] and cqs[ci..] into aqs[ai..].
+  unsigned GetHighQubits(const std::vector<unsigned>& qs, unsigned qi,
+                         const std::vector<unsigned>& cqs, unsigned ci,
+                         unsigned ai, idx_type& cmaskh, AQ& aqs) const {
+    while (qi < qs.size() || ci < cqs.size()) {
+      const bool take_gate_qubit =
+          qi < qs.size() && (ci == cqs.size() || qs[qi] < cqs[ci]);
+      if (take_gate_qubit) {
+        aqs[ai++] = qs[qi++];
+      } else {
+        const unsigned control = cqs[ci++];
+        cmaskh |= idx_type{1} << control;  // Record the control qubit's bit.
+        aqs[ai++] = control;
+      }
+    }
+    return ai;  // Index one past the last entry written to aqs.
+  }
+
+  template <typename QS>
+  void GetMs(unsigned num_qubits, const QS& qs, unsigned qs_size,
+             idx_type* ms) const {
+    if (qs_size == 0) {
+      ms[0] = idx_type(-1);  // No qubits: a single all-ones mask.
+      return;
+    }
+    // ms[i] selects the bit positions strictly between qs[i - 1] and qs[i].
+    idx_type covered = 0;  // Bits at or below the previously handled qubit.
+    for (unsigned i = 0; i < qs_size; ++i) {
+      ms[i] = ((idx_type{1} << qs[i]) - 1) ^ covered;
+      covered = (idx_type{1} << (qs[i] + 1)) - 1;
+    }
+    ms[qs_size] = ((idx_type{1} << num_qubits) - 1) ^ covered;
+  }
+
+  template <typename QS>
+  void GetXss(unsigned num_qubits, const QS& qs, unsigned qs_size,
+              idx_type* xss) const {
+    // xss[i] is the sum of 2^(qs[k] + 1) over every bit k that is set in i,
+    // for i in [0, 2^qs_size).
+    if (qs_size == 0) {
+      xss[0] = 0;
+      return;
+    }
+    idx_type stride[64];
+    for (unsigned k = 0; k < qs_size; ++k) {
+      stride[k] = idx_type{1} << (qs[k] + 1);
+    }
+
+    const unsigned count = 1 << qs_size;
+    for (unsigned i = 0; i < count; ++i) {
+      idx_type offset = 0;
+      for (unsigned k = 0; k < qs_size; ++k) {
+        if ((i >> k) & 1) {
+          offset += stride[k];
+        }
+      }
+      xss[i] = offset;
+    }
+  }
+
+  template <unsigned G, typename qs_type>  // Fills indices.ms and indices.xss.
+  void GetIndicesH(unsigned num_qubits, const qs_type& qs, unsigned qs_size,
+                   IndicesH<G>& indices) const {
+    if (qs_size == 0) {
+      indices.ms[0] = idx_type(-1);
+      indices.xss[0] = 0;
+    } else {
+      unsigned g = qs_size;
+      unsigned gsize = 1 << qs_size;
+      // xs[k] = 2^(qs[k] + 1).
+      idx_type xs[64];
+      // ms[i] selects the bit positions strictly between qs[i - 1] and qs[i].
+      xs[0] = idx_type{1} << (qs[0] + 1);
+      indices.ms[0] = (idx_type{1} << qs[0]) - 1;
+      for (unsigned i = 1; i < g; ++i) {
+        xs[i] = idx_type{1} << (qs[i] + 1);
+        indices.ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs[i - 1] - 1);
+      }
+      indices.ms[g] = ((idx_type{1} << num_qubits) - 1) ^ (xs[g - 1] - 1);
+      // xss[i] is the sum of xs[k] over every bit k that is set in i.
+      for (unsigned i = 0; i < gsize; ++i) {
+        idx_type a = 0;
+        for (unsigned k = 0; k < g; ++k) {
+          a += xs[k] * ((i >> k) & 1);
+        }
+        indices.xss[i] = a;
+      }
+    }
+  }
+
+  template <unsigned G>  // Zeroes unused ms/xss entries and fills qis/tis.
+  void GetIndicesL(unsigned num_effective_qs, unsigned qmask,
+                   IndicesL<G>& indices) const {
+    for (unsigned i = num_effective_qs + 1; i < (G + 1); ++i) {
+      indices.ms[i] = 0;
+    }
+    // Likewise zero the xss entries from index 2^num_effective_qs upwards.
+    for (unsigned i = (1 << num_effective_qs); i < indices.gsize; ++i) {
+      indices.xss[i] = 0;
+    }
+    // qis[i]: presumably the bits of i placed on the positions set in qmask.
+    for (unsigned i = 0; i < indices.gsize; ++i) {
+      indices.qis[i] = bits::ExpandBits(i, 5 + num_effective_qs, qmask);
+    }
+    // tmask is the complement of qmask within the low 5 + num_effective_qs bits.
+    unsigned tmask = ((1 << (5 + num_effective_qs)) - 1) ^ qmask;
+    for (unsigned i = 0; i < 32; ++i) {
+      indices.tis[i] = bits::ExpandBits(i, 5 + num_effective_qs, tmask);
+    }
+  }
+
+  template <unsigned G>  // Fills all of indices; returns num_effective_qs.
+  unsigned GetIndicesL(unsigned num_qubits, const std::vector<unsigned>& qs,
+                       IndicesL<G>& indices) const {
+    unsigned eqs[32];
+    // eqs receives num_effective_qs qubit positions (all >= 5) for GetIndicesH.
+    unsigned qmaskh = 0;
+    unsigned qmaskl = 0;
+
+    unsigned qi = 0;
+    // Consume the leading gate qubits below 5 and record them in qmaskl.
+    while (qi < qs.size() && qs[qi] < 5) {
+      qmaskl |= 1 << qs[qi++];
+    }
+    // num_effective_qs = min(max(num_qubits, 5) - 5, qs.size()).
+    unsigned nq = std::max(5U, num_qubits);
+    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
+
+    unsigned l = 0;
+    unsigned ei = 0;
+    unsigned num_low_qs = qi;
+    // First branch: every gate qubit is low, so eqs becomes 5, 6, ... in order.
+    if (qs.size() == num_low_qs) {
+      while (ei < num_effective_qs && l++ < num_low_qs) {
+        eqs[ei] = ei + 5;
+        ++ei;
+      }
+    } else {
+      while (ei < num_effective_qs && l < num_low_qs) {
+        unsigned ei5 = ei + 5;
+        eqs[ei] = ei5;
+        if (qi < qs.size() && qs[qi] == ei5) {
+          ++qi;
+          qmaskh |= 1 << ei5;
+        } else {
+          ++l;
+        }
+        ++ei;
+      }
+      // The rest of eqs is taken from the remaining entries of qs.
+      while (ei < num_effective_qs) {
+        eqs[ei] = qs[qi++];
+        qmaskh |= 1 << (ei + 5);
+        ++ei;
+      }
+    }
+
+    GetIndicesH(num_qubits, eqs, num_effective_qs, indices);
+    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);
+
+    return num_effective_qs;
+  }
+
+  template <unsigned G>
+  DataC GetIndicesLC(unsigned num_qubits, const std::vector<unsigned>& qs,
+                     const std::vector<unsigned>& cqs, uint64_t cvals,
+                     IndicesL<G>& indices) const {
+    unsigned aqs[64];  // Passed to GetHighQubits / GetMs below.
+    unsigned eqs[32];  // Passed to GetXss below.
+    // Builds `indices` for qs and cqs (presumably control qubits - confirm).
+    unsigned qmaskh = 0;
+    unsigned qmaskl = 0;
+    idx_type cmaskh = 0;
+
+    unsigned qi = 0;
+    // Leading entries of qs that are below 5 are recorded in qmaskl.
+    while (qi < qs.size() && qs[qi] < 5) {
+      qmaskl |= 1 << qs[qi++];
+    }
+    // Clamp so that nq - 5 cannot drop below zero.
+    unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size()));
+    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
+
+    unsigned l = 0;
+    unsigned ai = 5;
+    unsigned ci = 0;
+    unsigned ei = 0;
+    unsigned num_low_qs = qi;
+    // Walk positions 5, 6, ... until num_low_qs positions outside qs/cqs pass.
+    while (ai < num_qubits && l < num_low_qs) {
+      aqs[ai - 5] = ai;
+      if (qi < qs.size() && qs[qi] == ai) {
+        ++qi;
+        eqs[ei++] = ai;
+        qmaskh |= 1 << (ai - ci);  // Bit position skips the cqs seen so far.
+      } else if (ci < cqs.size() && cqs[ci] == ai) {
+        ++ci;
+        cmaskh |= idx_type{1} << ai;
+      } else {
+        ++l;
+        eqs[ei++] = ai;
+      }
+      ++ai;
+    }
+
+    unsigned i = ai;
+    unsigned j = qi;
+    // Remaining entries of qs are appended to eqs unchanged.
+    while (ei < num_effective_qs) {
+      eqs[ei++] = qs[j++];
+      qmaskh |= 1 << (i++ - ci);
+    }
+
+    unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs);
+    GetMs(num_qubits, aqs, num_aqs, indices.ms);
+    GetXss(num_qubits, eqs, num_effective_qs, indices.xss);
+    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);
+    // Scatter the bits of cvals onto the bit positions set in cmaskh.
+    idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh);
+
+    return {cvalsh, num_aqs, num_effective_qs};
+  }
+
+  template <unsigned G>
+  DataC GetIndicesLCL(unsigned num_qubits, const std::vector<unsigned>& qs,
+                      const std::vector<unsigned>& cqs, uint64_t cvals,
+                      IndicesLC<G>& indices) const {
+    unsigned aqs[64];  // Passed to GetHighQubits / GetMs below.
+    unsigned eqs[32];  // Passed to GetXss below.
+    // Like GetIndicesLC, but also handles entries of cqs below 5 (cis table).
+    unsigned qmaskh = 0;
+    unsigned qmaskl = 0;
+    idx_type cmaskh = 0;
+    idx_type cmaskl = 0;
+    idx_type cis_mask = 0;
+
+    unsigned qi = 0;
+    unsigned ci = 0;
+    // Classify positions 0..4 as an entry of qs, an entry of cqs, or neither.
+    for (unsigned k = 0; k < 5; ++k) {
+      if (qi < qs.size() && qs[qi] == k) {
+        qmaskl |= 1 << (k - ci);  // Bit position skips the cqs seen so far.
+        ++qi;
+      } else if (ci < cqs.size() && cqs[ci] == k) {
+        cmaskl |= idx_type{1} << k;
+        ++ci;
+      }
+    }
+
+    unsigned num_low_qs = qi;
+    unsigned num_low_cqs = ci;
+    // Clamp so that nq - 5 cannot drop below zero.
+    unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size()));
+    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
+
+    unsigned l = 0;
+    unsigned ai = 5;
+    unsigned ei = 0;
+    unsigned num_low = num_low_qs + num_low_cqs;
+    unsigned remaining_low_cqs = num_low_cqs;
+    unsigned effective_low_qs = num_low_qs;
+    unsigned highest_cis_bit = 0;
+    // Walk positions 5, 6, ... until num_low positions outside qs/cqs pass.
+    while (ai < num_qubits && l < num_low) {
+      aqs[ai - 5] = ai;
+      if (qi < qs.size() && qs[qi] == ai) {
+        ++qi;
+        if ((ai - ci) > 4) {
+          eqs[ei++] = ai;
+          qmaskh |= 1 << (ai - ci);
+        } else {
+          highest_cis_bit = ai;  // ai - ci <= 4: recorded in qmaskl instead.
+          cis_mask |= idx_type{1} << ai;
+          qmaskl |= 1 << (ai - ci);
+          --remaining_low_cqs;
+          ++effective_low_qs;
+        }
+      } else if (ci < cqs.size() && cqs[ci] == ai) {
+        ++ci;
+        cmaskh |= idx_type{1} << ai;
+      } else {
+        ++l;
+        if (remaining_low_cqs == 0) {
+          eqs[ei++] = ai;
+        } else {
+          highest_cis_bit = ai;  // Position ai is added to cis_mask.
+          cis_mask |= idx_type{1} << ai;
+          --remaining_low_cqs;
+        }
+      }
+      ++ai;
+    }
+
+    unsigned i = ai;
+    unsigned j = effective_low_qs;
+    // Remaining entries of qs are appended to eqs unchanged.
+    while (ei < num_effective_qs) {
+      eqs[ei++] = qs[j++];
+      qmaskh |= 1 << (i++ - ci);
+    }
+
+    unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs);
+    GetMs(num_qubits, aqs, num_aqs, indices.ms);
+    GetXss(num_qubits, eqs, num_effective_qs, indices.xss);
+    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);
+    // Scatter the bits of cvals onto the positions set in cmaskh and cmaskl.
+    idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh);
+    idx_type cvalsl = bits::ExpandBits(idx_type(cvals), 5, cmaskl);
+    // cis[i]: bits of c above the low five are doubled, then cvalsl is OR-ed.
+    cis_mask |= 31 ^ cmaskl;
+    highest_cis_bit = highest_cis_bit < 5 ? 5 : highest_cis_bit;
+    for (idx_type i = 0; i < 32; ++i) {
+      auto c = bits::ExpandBits(i, highest_cis_bit + 1, cis_mask);
+      indices.cis[i] = 2 * (c & 0xffffffe0) | (c & 0x1f) | cvalsl;
+    }
+
+    return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs};
+  }
+
+
+  // Returns the scratch buffer, first reallocating it with cudaMalloc when
+  // `size` exceeds the current capacity (old contents are not preserved).
+  void* AllocScratch(uint64_t size) const {
+    if (size <= scratch_size_) {
+      return scratch_;
+    }
+    if (scratch_ != nullptr) {
+      ErrorCheck(cudaFree(scratch_));
+    }
+    ErrorCheck(cudaMalloc(const_cast<void**>(&scratch_), size));
+    const_cast<uint64_t&>(scratch_size_) = size;
+    return scratch_;
+  }
+
+  char* d_ws;
+  char h_ws0[max_buf_size];
+  char* h_ws = (char*) h_ws0;
+
+  void* scratch_;
+  uint64_t scratch_size_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_CUDA_H_
diff --git a/tpls/qsim/simulator_cuda_kernels.h b/tpls/qsim/simulator_cuda_kernels.h
new file mode 100644
index 0000000..e21a9d6
--- /dev/null
+++ b/tpls/qsim/simulator_cuda_kernels.h
@@ -0,0 +1,683 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUDA_KERNELS_H_
+#define SIMULATOR_CUDA_KERNELS_H_
+
+#ifdef __NVCC__
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+
+  #include "util_cuda.h"
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
+
+namespace qsim {
+
+template <unsigned G, typename fp_type, typename idx_type>
+__global__ void ApplyGateH_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
+    const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) {
+  // blockDim.x must be equal to 64.
+  // Multiplies groups of 2^G complex amplitudes of rstate by the matrix v0.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;
+  constexpr unsigned rows =
+      G < 4 ? gsize : (sizeof(fp_type) == 4 ?
+                       (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16));
+  // rows: number of matrix rows staged in shared memory per pass.
+  fp_type rs[gsize], is[gsize];
+
+  __shared__ idx_type xss[64];
+  __shared__ fp_type v[2 * gsize * rows];
+
+  if (threadIdx.x < gsize) {
+    xss[threadIdx.x] = xss0[threadIdx.x];
+  }
+
+  if (G <= 2) {
+    if (threadIdx.x < 2 * gsize * gsize) {  // Matrix is smaller than a block.
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+
+  __syncthreads();
+  // Spread the bits of the (32-aligned) global index over the masks in mss.
+  idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
+  idx_type ii = i & mss[0];
+  for (unsigned j = 1; j <= G; ++j) {
+    i *= 2;
+    ii |= i & mss[j];
+  }
+
+  auto p0 = rstate + 2 * ii + threadIdx.x % 32;
+
+  for (unsigned k = 0; k < gsize; ++k) {
+    rs[k] = *(p0 + xss[k]);       // Real part.
+    is[k] = *(p0 + xss[k] + 32);  // Imaginary part, 32 elements further on.
+  }
+  // Process the matrix in gsize / rows slices of `rows` rows each.
+  for (unsigned s = 0; s < gsize / rows; ++s) {
+    if (s > 0) {
+      __syncthreads();
+
+      for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
+        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+      }
+
+      __syncthreads();
+    }
+
+    unsigned j = 0;
+
+    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+      fp_type rn = 0;
+      fp_type in = 0;
+      // Complex dot product of matrix row k with the loaded amplitudes.
+      for (unsigned l = 0; l < gsize; ++l) {
+        fp_type rm = v[j++];
+        fp_type im = v[j++];
+        rn += rs[l] * rm;
+        rn -= is[l] * im;
+        in += rs[l] * im;
+        in += is[l] * rm;
+      }
+
+      *(p0 + xss[k]) = rn;
+      *(p0 + xss[k] + 32) = in;
+    }
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type>
+__global__ void ApplyGateL_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
+    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
+    const unsigned* __restrict__ tis, unsigned esize,
+    fp_type* __restrict__ rstate) {
+  // blockDim.x must be equal to 32.
+  // Multiplies amplitudes by matrix v0, permuting them through shared memory.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;
+  constexpr unsigned
+      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
+                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
+  // rows: number of matrix rows staged in shared memory per pass.
+  fp_type rs[gsize], is[gsize];
+
+  __shared__ fp_type v[2 * gsize * rows];
+  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
+
+  if (G < 2) {
+    if (threadIdx.x < 2 * gsize * gsize) {  // Matrix is smaller than a block.
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+  // Spread the bits of the block's base index over the masks in mss.
+  idx_type i = 32 * idx_type{blockIdx.x};
+  idx_type ii = i & mss[0];
+  for (unsigned j = 1; j <= G; ++j) {
+    i *= 2;
+    ii |= i & mss[j];
+  }
+
+  auto p0 = rstate + 2 * ii + threadIdx.x;
+  // Stage amplitudes in shared memory: real parts in rs0, imaginary in is0.
+  for (unsigned k = 0; k < gsize; ++k) {
+    rs0[threadIdx.x][k] = *(p0 + xss[k]);
+    is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
+  }
+  // NOTE(review): no barrier before these reads - presumably one warp; confirm.
+  for (unsigned k = 0; k < gsize; ++k) {
+    unsigned i = tis[threadIdx.x] | qis[k];
+    unsigned m = i & 0x1f;  // Low five bits select the rs0/is0 row.
+    unsigned n = i / 32;    // Remaining bits select the column.
+
+    rs[k] = rs0[m][n];
+    is[k] = is0[m][n];
+  }
+  // Process the matrix in gsize / rows slices of `rows` rows each.
+  for (unsigned s = 0; s < gsize / rows; ++s) {
+    if (s > 0) {
+      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+      }
+    }
+
+    unsigned j = 0;
+
+    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+      fp_type rn = 0;
+      fp_type in = 0;
+      // Complex dot product of matrix row k with the gathered amplitudes.
+      for (unsigned l = 0; l < gsize; ++l) {
+        fp_type rm = v[j++];
+        fp_type im = v[j++];
+        rn += rs[l] * rm;
+        rn -= is[l] * im;
+        in += rs[l] * im;
+        in += is[l] * rm;
+      }
+
+      unsigned i = tis[threadIdx.x] | qis[k];
+      unsigned m = i & 0x1f;
+      unsigned n = i / 32;
+
+      rs0[m][n] = rn;
+      is0[m][n] = in;
+    }
+  }
+  // Write back the first esize staged entries per thread.
+  for (unsigned k = 0; k < esize; ++k) {
+    *(p0 + xss[k]) = rs0[threadIdx.x][k];
+    *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type>
+__global__ void ApplyControlledGateH_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
+    const idx_type* __restrict__ mss, unsigned num_mss, idx_type cvalsh,
+    fp_type* __restrict__ rstate) {
+  // blockDim.x must be equal to 64.
+  // Like ApplyGateH_Kernel, but uses num_mss masks and ORs cvalsh into ii.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;
+  constexpr unsigned rows =
+      G < 4 ? gsize : (sizeof(fp_type) == 4 ?
+                           (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16));
+  // rows: number of matrix rows staged in shared memory per pass.
+  fp_type rs[gsize], is[gsize];
+
+  __shared__ idx_type xss[64];
+  __shared__ fp_type v[2 * gsize * rows];
+
+  if (threadIdx.x < gsize) {
+    xss[threadIdx.x] = xss0[threadIdx.x];
+  }
+
+  if (G <= 2) {
+    if (threadIdx.x < 2 * gsize * gsize) {  // Matrix is smaller than a block.
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+
+  __syncthreads();
+  // Spread the bits of the (32-aligned) global index over the masks in mss.
+  idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
+  idx_type ii = i & mss[0];
+  for (unsigned j = 1; j < num_mss; ++j) {
+    i *= 2;
+    ii |= i & mss[j];
+  }
+
+  ii |= cvalsh;  // Presumably pins the control bits to their values; confirm.
+
+  auto p0 = rstate + 2 * ii + threadIdx.x % 32;
+
+  for (unsigned k = 0; k < gsize; ++k) {
+    rs[k] = *(p0 + xss[k]);       // Real part.
+    is[k] = *(p0 + xss[k] + 32);  // Imaginary part, 32 elements further on.
+  }
+  // Process the matrix in gsize / rows slices of `rows` rows each.
+  for (unsigned s = 0; s < gsize / rows; ++s) {
+    if (s > 0) {
+      __syncthreads();
+
+      for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
+        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+      }
+
+      __syncthreads();
+    }
+
+    unsigned j = 0;
+
+    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+      fp_type rn = 0;
+      fp_type in = 0;
+      // Complex dot product of matrix row k with the loaded amplitudes.
+      for (unsigned l = 0; l < gsize; ++l) {
+        fp_type rm = v[j++];
+        fp_type im = v[j++];
+        rn += rs[l] * rm;
+        rn -= is[l] * im;
+        in += rs[l] * im;
+        in += is[l] * rm;
+      }
+
+      *(p0 + xss[k]) = rn;
+      *(p0 + xss[k] + 32) = in;
+    }
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type>
+__global__ void ApplyControlledGateLH_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
+    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
+    const unsigned* __restrict__ tis, unsigned num_mss, idx_type cvalsh,
+    unsigned esize, fp_type* __restrict__ rstate) {
+  // blockDim.x must be equal to 32.
+  // Like ApplyGateL_Kernel, but uses num_mss masks and ORs cvalsh into ii.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;
+  constexpr unsigned
+      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
+                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
+  // rows: number of matrix rows staged in shared memory per pass.
+  fp_type rs[gsize], is[gsize];
+
+  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
+  __shared__ fp_type v[2 * gsize * rows];
+  // Spread the bits of the block's base index over the masks in mss.
+  idx_type i = 32 * idx_type{blockIdx.x};
+  idx_type ii = i & mss[0];
+  for (unsigned j = 1; j < num_mss; ++j) {
+    i *= 2;
+    ii |= i & mss[j];
+  }
+
+  ii |= cvalsh;  // Presumably pins the control bits to their values; confirm.
+
+  auto p0 = rstate + 2 * ii + threadIdx.x;
+  // Stage amplitudes in shared memory: real parts in rs0, imaginary in is0.
+  for (unsigned k = 0; k < gsize; ++k) {
+    rs0[threadIdx.x][k] = *(p0 + xss[k]);
+    is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
+  }
+
+  if (G < 2) {
+    if (threadIdx.x < 2 * gsize * gsize) {  // Matrix is smaller than a block.
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+  // NOTE(review): no barrier before these reads - presumably one warp; confirm.
+  for (unsigned k = 0; k < gsize; ++k) {
+    unsigned i = tis[threadIdx.x] | qis[k];
+    unsigned m = i & 0x1f;  // Low five bits select the rs0/is0 row.
+    unsigned n = i / 32;    // Remaining bits select the column.
+
+    rs[k] = rs0[m][n];
+    is[k] = is0[m][n];
+  }
+  // Process the matrix in gsize / rows slices of `rows` rows each.
+  for (unsigned s = 0; s < gsize / rows; ++s) {
+    if (s > 0) {
+      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+      }
+    }
+
+    unsigned j = 0;
+
+    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+      fp_type rn = 0;
+      fp_type in = 0;
+      // Complex dot product of matrix row k with the gathered amplitudes.
+      for (unsigned l = 0; l < gsize; ++l) {
+        fp_type rm = v[j++];
+        fp_type im = v[j++];
+        rn += rs[l] * rm;
+        rn -= is[l] * im;
+        in += rs[l] * im;
+        in += is[l] * rm;
+      }
+
+      unsigned i = tis[threadIdx.x] | qis[k];
+      unsigned m = i & 0x1f;
+      unsigned n = i / 32;
+
+      rs0[m][n] = rn;
+      is0[m][n] = in;
+    }
+  }
+  // Write back the first esize staged entries per thread.
+  for (unsigned k = 0; k < esize; ++k) {
+    *(p0 + xss[k]) = rs0[threadIdx.x][k];
+    *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type>
+__global__ void ApplyControlledGateL_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
+    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
+    const unsigned* __restrict__ tis, const idx_type* __restrict__ cis,
+    unsigned num_mss, idx_type cvalsh, unsigned esize, unsigned rwthreads,
+    fp_type* __restrict__ rstate) {
+  // blockDim.x must be equal to 32.
+  // As the LH variant, but offsets come from cis; rwthreads threads do I/O.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;
+  constexpr unsigned
+      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
+                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
+  // rows: number of matrix rows staged in shared memory per pass.
+  fp_type rs[gsize], is[gsize];
+
+  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
+  __shared__ fp_type v[2 * gsize * rows];
+  // Spread the bits of the block's base index over the masks in mss.
+  idx_type i = 32 * idx_type{blockIdx.x};
+  idx_type ii = i & mss[0];
+  for (unsigned j = 1; j < num_mss; ++j) {
+    i *= 2;
+    ii |= i & mss[j];
+  }
+
+  ii |= cvalsh;  // Presumably pins the control bits to their values; confirm.
+
+  auto p0 = rstate + 2 * ii + cis[threadIdx.x];
+  // Only the first rwthreads threads read from (and later write to) rstate.
+  if (threadIdx.x < rwthreads) {
+    for (unsigned k = 0; k < gsize; ++k) {
+      rs0[threadIdx.x][k] = *(p0 + xss[k]);
+      is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
+    }
+  }
+
+  if (G < 2) {
+    if (threadIdx.x < 2 * gsize * gsize) {  // Matrix is smaller than a block.
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+  // NOTE(review): no barrier before these reads - presumably one warp; confirm.
+  for (unsigned k = 0; k < gsize; ++k) {
+    unsigned i = tis[threadIdx.x] | qis[k];
+    unsigned m = i & 0x1f;  // Low five bits select the rs0/is0 row.
+    unsigned n = i / 32;    // Remaining bits select the column.
+
+    rs[k] = rs0[m][n];
+    is[k] = is0[m][n];
+  }
+  // Process the matrix in gsize / rows slices of `rows` rows each.
+  for (unsigned s = 0; s < gsize / rows; ++s) {
+    if (s > 0) {
+      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+      }
+    }
+
+    unsigned j = 0;
+
+    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+      fp_type rn = 0;
+      fp_type in = 0;
+      // Complex dot product of matrix row k with the gathered amplitudes.
+      for (unsigned l = 0; l < gsize; ++l) {
+        fp_type rm = v[j++];
+        fp_type im = v[j++];
+        rn += rs[l] * rm;
+        rn -= is[l] * im;
+        in += rs[l] * im;
+        in += is[l] * rm;
+      }
+
+      unsigned i = tis[threadIdx.x] | qis[k];
+      unsigned m = i & 0x1f;
+      unsigned n = i / 32;
+
+      rs0[m][n] = rn;
+      is0[m][n] = in;
+    }
+  }
+  // Write back the first esize staged entries of each read/write thread.
+  if (threadIdx.x < rwthreads) {
+    for (unsigned k = 0; k < esize; ++k) {
+      *(p0 + xss[k]) = rs0[threadIdx.x][k];
+      *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
+    }
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type, typename Op,
+          typename cfp_type>
+__global__ void ExpectationValueH_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
+    const idx_type* __restrict__ mss, unsigned num_iterations_per_block,
+    const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
+  // blockDim.x must be equal to 64.
+  // Accumulates conj(a) * (v0 * a) over amplitude groups; one sum per block.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;
+  constexpr unsigned rows =
+      G < 5 ? gsize : (sizeof(fp_type) == 4 ? (G < 6 ? 4 : 8) : 8);
+  // rows: number of matrix rows staged in shared memory per pass.
+  fp_type rs[gsize], is[gsize];
+
+  __shared__ idx_type xss[64];
+  __shared__ fp_type v[2 * gsize * rows];
+
+  if (threadIdx.x < gsize) {
+    xss[threadIdx.x] = xss0[threadIdx.x];
+  }
+
+  if (G <= 2) {
+    if (threadIdx.x < 2 * gsize * gsize) {  // Matrix is smaller than a block.
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+
+  __syncthreads();
+
+  double re = 0;
+  double im = 0;
+
+  for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) {
+    idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter;
+    // Spread the bits of the (32-aligned) global index over the masks in mss.
+    idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0;
+    idx_type ii = i & mss[0];
+    for (unsigned j = 1; j <= G; ++j) {
+      i *= 2;
+      ii |= i & mss[j];
+    }
+
+    auto p0 = rstate + 2 * ii + threadIdx.x % 32;
+
+    for (unsigned k = 0; k < gsize; ++k) {
+      rs[k] = *(p0 + xss[k]);       // Real part.
+      is[k] = *(p0 + xss[k] + 32);  // Imaginary part, 32 elements further on.
+    }
+
+    for (unsigned s = 0; s < gsize / rows; ++s) {
+      if (rows < gsize && (s > 0 || iter > 0)) {
+        __syncthreads();
+        // Reload only when v holds a slice: with rows == gsize the matrix is
+        // already resident, and for G <= 2 this loop would overrun v.
+        for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
+          v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+        }
+
+        __syncthreads();
+      }
+
+      unsigned j = 0;
+
+      for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+        fp_type rn = 0;
+        fp_type in = 0;
+        // Complex dot product of matrix row k with the loaded amplitudes.
+        for (unsigned l = 0; l < gsize; ++l) {
+          fp_type rm = v[j++];
+          fp_type im = v[j++];
+          rn += rs[l] * rm;
+          rn -= is[l] * im;
+          in += rs[l] * im;
+          in += is[l] * rm;
+        }
+        // Add conj(rs[k] + i * is[k]) * (rn + i * in) to the running sum.
+        re += rs[k] * rn;
+        re += is[k] * in;
+        im += rs[k] * in;
+        im -= is[k] * rn;
+      }
+    }
+  }
+
+  __shared__ cfp_type partial1[64];
+  __shared__ cfp_type partial2[2];
+
+  partial1[threadIdx.x].re = re;
+  partial1[threadIdx.x].im = im;
+
+  auto val = WarpReduce(partial1[threadIdx.x], op);
+  // One partial result per group of 32 threads; thread 0 adds the two.
+  if (threadIdx.x % 32 == 0) {
+    partial2[threadIdx.x / 32] = val;
+  }
+
+  __syncthreads();
+
+  if (threadIdx.x == 0) {
+    result[blockIdx.x].re = partial2[0].re + partial2[1].re;
+    result[blockIdx.x].im = partial2[0].im + partial2[1].im;
+  }
+}
+
+template <unsigned G, typename fp_type, typename idx_type,
+          typename Op, typename cfp_type>
+__global__ void ExpectationValueL_Kernel(
+    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
+    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
+    const unsigned* __restrict__ tis, unsigned num_iterations_per_block,
+    const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
+  // blockDim.x must be equal to 32.
+  // Accumulates conj(a) * (v0 * a) over amplitude groups; one sum per block.
+  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
+
+  constexpr unsigned gsize = 1 << G;
+  constexpr unsigned rows = G < 5 ? gsize : (sizeof(fp_type) == 4 ?
+                                             (G < 6 ? 4 : 2) : (G < 6 ? 2 : 1));
+  // rows: number of matrix rows staged in shared memory per pass.
+  fp_type rs[gsize], is[gsize];
+
+  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
+  __shared__ fp_type v[2 * gsize * rows];
+
+  if (G < 2) {
+    if (threadIdx.x < 2 * gsize * gsize) {  // Matrix is smaller than a block.
+      v[threadIdx.x] = v0[threadIdx.x];
+    }
+  } else {
+    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+      v[m + threadIdx.x] = v0[m + threadIdx.x];
+    }
+  }
+
+  double re = 0;
+  double im = 0;
+
+  for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) {
+    idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter);
+    idx_type ii = i & mss[0];
+    for (unsigned j = 1; j <= G; ++j) {
+      i *= 2;
+      ii |= i & mss[j];
+    }
+
+    auto p0 = rstate + 2 * ii + threadIdx.x;
+    // Stage amplitudes in shared memory: real parts in rs0, imaginary in is0.
+    for (unsigned k = 0; k < gsize; ++k) {
+      rs0[threadIdx.x][k] = *(p0 + xss[k]);
+      is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
+    }
+    // NOTE(review): no barrier before these reads - presumably one warp.
+    for (unsigned k = 0; k < gsize; ++k) {
+      unsigned i = tis[threadIdx.x] | qis[k];
+      unsigned m = i & 0x1f;  // Low five bits select the rs0/is0 row.
+      unsigned n = i / 32;    // Remaining bits select the column.
+
+      rs[k] = rs0[m][n];
+      is[k] = is0[m][n];
+    }
+
+    for (unsigned s = 0; s < gsize / rows; ++s) {
+      // Reload only when v holds a slice: with rows == gsize the matrix is
+      // already resident, and for G < 2 this loop would overrun v.
+      if (rows < gsize && (s > 0 || iter > 0)) {
+        for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
+          v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
+        }
+      }
+      unsigned j = 0;
+      for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
+        fp_type rn = 0;
+        fp_type in = 0;
+        // Complex dot product of matrix row k with the gathered amplitudes.
+        for (unsigned l = 0; l < gsize; ++l) {
+          fp_type rm = v[j++];
+          fp_type im = v[j++];
+          rn += rs[l] * rm;
+          rn -= is[l] * im;
+          in += rs[l] * im;
+          in += is[l] * rm;
+        }
+        // Add conj(rs[k] + i * is[k]) * (rn + i * in) to the running sum.
+        re += rs[k] * rn;
+        re += is[k] * in;
+        im += rs[k] * in;
+        im -= is[k] * rn;
+      }
+    }
+  }
+
+  __shared__ cfp_type partial[32];
+
+  partial[threadIdx.x].re = re;
+  partial[threadIdx.x].im = im;
+
+  auto val = WarpReduce(partial[threadIdx.x], op);
+  // Thread 0 stores the value WarpReduce returned for this block.
+  if (threadIdx.x == 0) {
+    result[blockIdx.x].re = val.re;
+    result[blockIdx.x].im = val.im;
+  }
+}
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_CUDA_KERNELS_H_
diff --git a/tpls/qsim/simulator_custatevec.h b/tpls/qsim/simulator_custatevec.h
new file mode 100644
index 0000000..40d1902
--- /dev/null
+++ b/tpls/qsim/simulator_custatevec.h
@@ -0,0 +1,209 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_CUSTATEVEC_H_
+#define SIMULATOR_CUSTATEVEC_H_
+
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+
+#include <cublas_v2.h>
+#include <cuComplex.h>
+#include <custatevec.h>
+
+#include "io.h"
+#include "statespace_custatevec.h"
+#include "util_custatevec.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator using the NVIDIA cuStateVec library.
+ */
+template <typename FP = float>
+class SimulatorCuStateVec final {
+ public:
+  using StateSpace = StateSpaceCuStateVec<FP>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  static constexpr auto kStateType = StateSpace::kStateType;
+  static constexpr auto kMatrixType = StateSpace::kMatrixType;
+  static constexpr auto kExpectType = StateSpace::kExpectType;
+  static constexpr auto kComputeType = StateSpace::kComputeType;
+  static constexpr auto kMatrixLayout = StateSpace::kMatrixLayout;
+
+  explicit SimulatorCuStateVec(const cublasHandle_t& cublas_handle,
+                               const custatevecHandle_t& custatevec_handle)
+      : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle),
+      workspace_(nullptr), workspace_size_(0) {}
+
+  ~SimulatorCuStateVec() {
+    ErrorCheck(cudaFree(workspace_));
+  }
+
+  /**
+   * Applies a gate using the NVIDIA cuStateVec library.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    if (qs.size() == 0) {
+      uint64_t size = uint64_t{1} << state.num_qubits();
+      // No target qubits: scale the whole state by matrix[0] + i * matrix[1].
+      if (StateSpace::is_float) {
+        cuComplex a = {matrix[0], matrix[1]};
+        auto p = (cuComplex*) state.get();
+        ErrorCheck(cublasCscal(cublas_handle_, size, &a, p, 1));
+      } else {
+        cuDoubleComplex a = {matrix[0], matrix[1]};
+        auto p = (cuDoubleComplex*) state.get();
+        ErrorCheck(cublasZscal(cublas_handle_, size, &a, p, 1));
+      }
+    } else {
+      auto workspace_size = ApplyGateWorkSpaceSize(
+          state.num_qubits(), qs.size(), 0, matrix);
+      AllocWorkSpace(workspace_size);
+
+      ErrorCheck(custatevecApplyMatrix(
+                     custatevec_handle_, state.get(), kStateType,
+                     state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0,
+                     (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0,
+                     kComputeType, workspace_, workspace_size));
+    }
+  }
+
+  /**
+   * Applies a controlled gate using the NVIDIA cuStateVec library.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cmask Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cmask,
+                           const fp_type* matrix, State& state) const {
+    if (qs.size() == 0) {
+      IO::errorf(
+          "error: controlled global phase gate is not implemented %s %d\n",
+          __FILE__, __LINE__);
+      exit(1);
+    } else {
+      std::vector<int32_t> control_bits;
+      control_bits.reserve(cqs.size());
+      // Bit i of cmask is the required value of control qubit cqs[i].
+      for (std::size_t i = 0; i < cqs.size(); ++i) {
+        control_bits.push_back((cmask >> i) & 1);
+      }
+
+      auto workspace_size = ApplyGateWorkSpaceSize(
+          state.num_qubits(), qs.size(), cqs.size(), matrix);
+      AllocWorkSpace(workspace_size);
+
+      ErrorCheck(custatevecApplyMatrix(
+                     custatevec_handle_, state.get(), kStateType,
+                     state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0,
+                     (int32_t*) qs.data(), qs.size(),
+                     (int32_t*) cqs.data(), control_bits.data(), cqs.size(),
+                     kComputeType, workspace_, workspace_size));
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using the NVIDIA cuStateVec
+   * library.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    auto workspace_size = ExpectationValueWorkSpaceSize(
+        state.num_qubits(), qs.size(), matrix);
+    AllocWorkSpace(workspace_size);
+
+    cuDoubleComplex eval;
+
+    ErrorCheck(custatevecComputeExpectation(
+                   custatevec_handle_, state.get(), kStateType,
+                   state.num_qubits(), &eval, kExpectType, nullptr, matrix,
+                   kMatrixType, kMatrixLayout, (int32_t*) qs.data(), qs.size(),
+                   kComputeType, workspace_, workspace_size));
+
+    return {cuCreal(eval), cuCimag(eval)};
+  }
+
+  /**
+   * @return The size of SIMD register if applicable.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 32;
+  }
+
+ private:
+  size_t ApplyGateWorkSpaceSize(
+      unsigned num_qubits, unsigned num_targets, unsigned num_controls,
+      const fp_type* matrix) const {
+    size_t size;
+
+    ErrorCheck(custatevecApplyMatrixGetWorkspaceSize(
+                   custatevec_handle_, kStateType, num_qubits, matrix,
+                   kMatrixType, kMatrixLayout, 0, num_targets, num_controls,
+                   kComputeType, &size));
+
+    return size;
+  }
+
+  size_t ExpectationValueWorkSpaceSize(
+      unsigned num_qubits, unsigned num_targets, const fp_type* matrix) const {
+    size_t size;
+
+    ErrorCheck(custatevecComputeExpectationGetWorkspaceSize(
+                   custatevec_handle_, kStateType, num_qubits, matrix,
+                   kMatrixType, kMatrixLayout, num_targets, kComputeType,
+                   &size));
+
+    return size;
+  }
+  // Grows the workspace to at least `size` bytes if needed and returns it.
+  void* AllocWorkSpace(size_t size) const {
+    if (size > workspace_size_) {
+      if (workspace_ != nullptr) {
+        ErrorCheck(cudaFree(workspace_));
+      }
+
+      ErrorCheck(cudaMalloc(&workspace_, size));
+
+      workspace_size_ = size;
+    }
+
+    return workspace_;
+  }
+
+  const cublasHandle_t cublas_handle_;
+  const custatevecHandle_t custatevec_handle_;
+  // Mutable: the const gate methods grow this cache through AllocWorkSpace.
+  mutable void* workspace_;
+  mutable size_t workspace_size_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_CUSTATEVEC_H_
diff --git a/tpls/qsim/simulator_sse.h b/tpls/qsim/simulator_sse.h
new file mode 100644
index 0000000..5256c53
--- /dev/null
+++ b/tpls/qsim/simulator_sse.h
@@ -0,0 +1,864 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_SSE_H_
+#define SIMULATOR_SSE_H_
+
+#include <smmintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "statespace_sse.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator with SSE vectorization.
+ */
+template <typename For>
+class SimulatorSSE final : public SimulatorBase {
+ public:
+  using StateSpace = StateSpaceSSE<For>;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  template <typename... ForArgs>  // arguments used to construct for_
+  explicit SimulatorSSE(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate acting on at most 6 qubits using SSE instructions.
+   * @param qs Indices of the qubits affected by this gate, in ascending order.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Pick a kernel by gate size and by how many gate qubits have index <= 1.
+    switch (qs.size()) {
+    case 0:
+      ApplyGateH<0>(qs, matrix, state);
+      break;
+    case 1:
+      if (qs[0] > 1) {
+        ApplyGateH<1>(qs, matrix, state);  // every gate qubit has index > 1
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);  // <#index > 1, #index <= 1>
+      }
+      break;
+    case 2:
+      if (qs[0] > 1) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 1) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 1) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 1) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 1) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: gates on more than 6 qubits are silently ignored.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate (at most 4 target qubits) using SSE instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
+    // Lowest control > 1 selects HH or CH = 1; otherwise HL or CH = 0.
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+    // HH/HL kernels: all targets > 1; L<H, L, CH> kernels: some target <= 1.
+    switch (qs.size()) {
+    case 0:
+      if (cqs[0] > 1) {
+        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
+      } else {
+        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
+      }
+      break;
+    case 1:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: more than 4 target qubits; the call is a no-op.
+      break;
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using SSE instructions.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value; 0 if qs.size() is not in 1..6.
+   */
+  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
+                                        const fp_type* matrix,
+                                        const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Pick a kernel by operator size and by how many qubits have index <= 1.
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 1) {
+        return ExpectationValueH<1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 1) {
+        return ExpectationValueH<2>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<1, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 1) {
+        return ExpectationValueH<3>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<2, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<1, 2>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 1) {
+        return ExpectationValueH<4>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<3, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<2, 2>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 1) {
+        return ExpectationValueH<5>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<4, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<3, 2>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 1) {
+        return ExpectationValueH<6>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<5, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<4, 2>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: falls through to the "return 0" below.
+      break;
+    }
+
+    return 0;
+  }
+
+  /**
+   * @return The size of SIMD register if applicable; this backend reports 4.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 4;
+  }
+
+ private:
+  template <unsigned H>  // H: number of gate qubits (all with index > 1)
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // Kernel for one loop index i: applies the hsize x hsize matrix v in place.
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];
+      // Build ii by masking successively doubled copies of i with ms[0..H].
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+      // Each group is loaded as 4 floats into rs and the next 4 floats into is.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      uint64_t j = 0;
+      // Row k of v times the loaded groups: (rs + i*is) * (v[j] + i*v[j + 1]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm_set1_ps(v[j]);
+        iu = _mm_set1_ps(v[j + 1]);
+        rn = _mm_mul_ps(rs[0], ru);
+        in = _mm_mul_ps(rs[0], iu);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    // ms and xss are filled in by FillIndices (defined elsewhere).
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // Run 2^(num_qubits - 2 - H) iterations, or a single one for small states.
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // H qubits with index > 1, L with <= 1
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned q0, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // Kernel for one loop index i; w holds per-lane matrix coefficients.
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];
+      // Build ii by masking successively doubled copies of i with ms[0..H].
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+      // Lane orders: 177=(1,0,3,2), 78=(2,3,0,1), 57=(1,2,3,0), 147=(3,0,1,2).
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
+
+        if (L == 1) {
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      uint64_t j = 0;
+      // Each output group sums all gsize inputs times (w[j] + i*w[j + 1]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H + L)];
+    // FillMatrix (defined elsewhere) writes the per-lane coefficients into w.
+    auto m = GetMasks11<L>(qs);
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
+    // Run 2^(num_qubits - 2 - H) iterations, or a single one for small states.
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss, qs[0], state.get());
+  }
+
+  template <unsigned H>  // H target qubits; targets and controls all > 1
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // Kernel for one loop index i: applies the hsize x hsize matrix v in place.
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];
+      // Build ii by masking successively doubled copies of i with ms[0..H].
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Leave the group untouched unless its control bits equal cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + 2 * ii;
+      // Each group is loaded as 4 floats into rs and the next 4 floats into is.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      uint64_t j = 0;
+      // Row k of v times the loaded groups: (rs + i*is) * (v[j] + i*v[j + 1]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm_set1_ps(v[j]);
+        iu = _mm_set1_ps(v[j + 1]);
+        rn = _mm_mul_ps(rs[0], ru);
+        in = _mm_mul_ps(rs[0], iu);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    // GetMasks7 (defined elsewhere) supplies the control mask and values.
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // Run 2^(num_qubits - 2 - H) iterations, or a single one for small states.
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  template <unsigned H>  // H target qubits, all > 1; lowest control <= 1
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // Kernel for one loop index i; w holds per-lane matrix coefficients.
+      __m128 rn, in;
+      __m128 rs[hsize], is[hsize];
+      // Build ii by masking successively doubled copies of i with ms[0..H].
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Leave the group untouched unless its control bits equal cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + 2 * ii;
+      // Each group is loaded as 4 floats into rs and the next 4 floats into is.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      uint64_t j = 0;
+      // Row k times the loaded groups, lane-wise: (rs + i*is) * (w[j] + i*w[j + 1]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H)];
+    // NOTE(review): presumably w encodes the low controls per lane - confirm.
+    auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    FillControlledMatrixH<H, 2>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+    // Run 2^(num_qubits - 2 - H) iterations, or a single one for small states.
+    unsigned r = 2 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // CH: lowest control qubit > 1
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, unsigned q0, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // Kernel for one loop index i; w holds per-lane matrix coefficients.
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];
+      // Build ii by masking successively doubled copies of i with ms[0..H].
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+      // Leave the group untouched unless its control bits equal cvalsh.
+      if ((ii & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + 2 * ii;
+      // Lane orders: 177=(1,0,3,2), 78=(2,3,0,1), 57=(1,2,3,0), 147=(3,0,1,2).
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
+
+        if (L == 1) {
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      uint64_t j = 0;
+      // Each output group sums all gsize inputs times (w[j] + i*w[j + 1]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H + L)];
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    // Run 2^(num_qubits - 2 - H) iterations, or a single one for small states.
+    unsigned r = 2 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+    // The two branches differ only in how the masks and w are prepared.
+    if (CH) {
+      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
+      FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get());
+    } else {
+      auto m = GetMasks10<L, 2>(state.num_qubits(), qs, cqs, cvals);
+      FillControlledMatrixL<H, L, 2>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get());
+    }
+  }
+
+  template <unsigned H>  // H: number of operator qubits (all with index > 1)
+  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // Kernel for one loop index i: returns this group's contribution.
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+      // Each group is loaded as 4 floats into rs and the next 4 floats into is.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // (rn, in) becomes row k of v applied to the loaded groups.
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm_set1_ps(v[j]);
+        iu = _mm_set1_ps(v[j + 1]);
+        rn = _mm_mul_ps(rs[0], ru);
+        in = _mm_mul_ps(rs[0], iu);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+        // (rs[k] - i*is[k]) * (rn + i*in): conjugated input times the product.
+        __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[k], rn), _mm_mul_ps(is[k], in));
+        __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[k], in), _mm_mul_ps(is[k], rn));
+
+        re += detail::HorizontalSumSSE(v_re);
+        im += detail::HorizontalSumSSE(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    // ms and xss are filled in by FillIndices (defined elsewhere).
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Combine the per-iteration complex results with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // H qubits with index > 1, L with <= 1
+  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
+                                         const fp_type* matrix,
+                                         const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, unsigned q0,
+                const fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // Kernel for one loop index i: returns this group's contribution.
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];
+
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+      // Lane orders: 177=(1,0,3,2), 78=(2,3,0,1), 57=(1,2,3,0), 147=(3,0,1,2).
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
+
+        if (L == 1) {
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+      // (rn, in) sums all gsize inputs times (w[j] + i*w[j + 1]).
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+        // rs[m], is[m] is the unshuffled group loaded for this k.
+        unsigned m = lsize * k;
+        // (rs[m] - i*is[m]) * (rn + i*in): conjugated input times the product.
+        __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in));
+        __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn));
+
+        re += detail::HorizontalSumSSE(v_re);
+        im += detail::HorizontalSumSSE(v_im);
+      }
+
+      return std::complex<double>{re, im};
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H + L)];
+    // FillMatrix (defined elsewhere) writes the per-lane coefficients into w.
+    auto m = GetMasks11<L>(qs);
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    // Combine the per-iteration complex results with std::plus.
+    using Op = std::plus<std::complex<double>>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], state.get());
+  }
+
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_SSE_H_
diff --git a/tpls/qsim/statespace.h b/tpls/qsim/statespace.h
new file mode 100644
index 0000000..2b0c9af
--- /dev/null
+++ b/tpls/qsim/statespace.h
@@ -0,0 +1,145 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_H_
+#define STATESPACE_H_
+
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <vector>
+
+#include "util.h"
+
+namespace qsim {
+
+/**
+ * Abstract class containing context and routines for general state-vector
+ * manipulations. "AVX", "AVX512", "Basic", and "SSE" implementations are
+ * provided. Backend routines are reached via static_cast<const Impl&>(*this).
+ */
+template <typename Impl,
+          template<typename...> class VectorSpace, typename... VSTypeParams>
+class StateSpace : public VectorSpace<Impl, VSTypeParams...> {
+ private:
+  using Base = VectorSpace<Impl, VSTypeParams...>;
+
+ public:
+  using fp_type = typename Base::fp_type;
+  using State = typename Base::Vector;
+
+  /**
+   * The observed state from a Measurement gate.
+   */
+  struct MeasurementResult {
+    /**
+     * A bitmask of all qubits measured in this result. In this format, if the
+     * qubit at index `i` is measured, the `i`th bit of `mask` is a one.
+     */
+    uint64_t mask;
+    /**
+     * A bitwise representation of the measured states. In this format, the
+     * qubit at index `i` is represented by the `i`th bit of `bits`.
+     * If `valid` is true, `mask` has already been applied to this field
+     * (i.e. `bits == bits & mask`).
+     */
+    uint64_t bits;
+    /**
+     * Observed states of the measured qubits. This vector only includes qubits
+     * specified by the associated Measurement gate.
+     */
+    std::vector<unsigned> bitstring;
+    /**
+     * Validation bit. If this is false, the measurement failed and all other
+     * fields of the result are invalid.
+     */
+    bool valid;
+  };
+  // Passes all constructor arguments on to the VectorSpace base.
+  template <typename... Args>
+  StateSpace(Args&&... args) : Base(args...) {}
+  // Returns the sum of the values produced by Impl::PartialNorms(state).
+  double Norm(const State& state) const {
+    auto partial_norms = static_cast<const Impl&>(*this).PartialNorms(state);
+    // NOTE(review): assumes PartialNorms() never returns an empty container.
+    double norm = partial_norms[0];
+    for (std::size_t i = 1; i < partial_norms.size(); ++i) {
+      norm += partial_norms[i];
+    }
+
+    return norm;
+  }
+  // Samples a measurement and, if it is valid, calls Impl::Collapse on state.
+  template <typename RGen>
+  MeasurementResult Measure(const std::vector<unsigned>& qubits,
+                            RGen& rgen, State& state) const {
+    auto result =
+        static_cast<const Impl&>(*this).VirtualMeasure(qubits, rgen, state);
+
+    if (result.valid) {
+      static_cast<const Impl&>(*this).Collapse(result, state);
+    }
+
+    return result;
+  }
+  // Samples measurement outcomes for `qubits` without modifying `state`.
+  template <typename RGen>
+  MeasurementResult VirtualMeasure(const std::vector<unsigned>& qubits,
+                                   RGen& rgen, const State& state) const {
+    MeasurementResult result;
+
+    result.valid = true;
+    result.mask = 0;
+    // Reject out-of-range qubits; otherwise set each qubit's bit in the mask.
+    for (auto q : qubits) {
+      if (q >= state.num_qubits()) {
+        result.valid = false;
+        return result;  // result.bits has not been assigned on this path
+      }
+
+      result.mask |= uint64_t{1} << q;
+    }
+
+    auto partial_norms = static_cast<const Impl&>(*this).PartialNorms(state);
+    // Turn the partial norms into running totals (prefix sums).
+    for (std::size_t i = 1; i < partial_norms.size(); ++i) {
+      partial_norms[i] += partial_norms[i - 1];
+    }
+    // Draw r via RandomValue(rgen, norm); find the first total not below r.
+    auto norm = partial_norms.back();
+    auto r = RandomValue(rgen, norm);
+
+    unsigned m = 0;
+    while (r > partial_norms[m]) ++m;
+    if (m > 0) {
+      r -= partial_norms[m - 1];
+    }
+    // m and the reduced r are handed to the backend's FindMeasuredBits.
+    result.bits = static_cast<const Impl&>(*this).FindMeasuredBits(
+        m, r, result.mask, state);
+
+    result.bitstring.reserve(qubits.size());
+    result.bitstring.resize(0);
+    // Extract each measured qubit's bit, in the order given by `qubits`.
+    for (auto q : qubits) {
+      result.bitstring.push_back((result.bits >> q) & 1);
+    }
+
+    return result;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_H_
diff --git a/tpls/qsim/statespace_avx.h b/tpls/qsim/statespace_avx.h
new file mode 100644
index 0000000..876058b
--- /dev/null
+++ b/tpls/qsim/statespace_avx.h
@@ -0,0 +1,497 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_AVX_H_
+#define STATESPACE_AVX_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <functional>
+
+#include "statespace.h"
+#include "util.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace detail {
+
+inline __m256i GetZeroMaskAVX(uint64_t i, uint64_t mask, uint64_t bits) {
+  __m256i s1 = _mm256_setr_epi64x(i + 0, i + 2, i + 4, i + 6);  // even offsets
+  __m256i s2 = _mm256_setr_epi64x(i + 1, i + 3, i + 5, i + 7);  // odd offsets
+  __m256i ma = _mm256_set1_epi64x(mask);
+  __m256i bi = _mm256_set1_epi64x(bits);
+  // Builds a 32-bit lane mask: lane j is all ones iff ((i + j) & mask) == bits.
+  s1 = _mm256_and_si256(s1, ma);
+  s2 = _mm256_and_si256(s2, ma);
+
+  s1 = _mm256_cmpeq_epi64(s1, bi);
+  s2 = _mm256_cmpeq_epi64(s2, bi);
+  // Odd 32-bit lanes are taken from s2 and even ones from s1.
+  return _mm256_blend_epi32(s1, s2, 170);  // 10101010
+}
+
+inline double HorizontalSumAVX(__m256 s) {
+  __m128 l = _mm256_castps256_ps128(s);  // lower four floats
+  __m128 h = _mm256_extractf128_ps(s, 1);  // upper four floats
+  __m128 s1  = _mm_add_ps(h, l);  // (a0, a1, a2, a3): sums of the two halves
+  __m128 s1s = _mm_movehdup_ps(s1);  // (a1, a1, a3, a3)
+  __m128 s2 = _mm_add_ps(s1, s1s);  // lane 0: a0 + a1, lane 2: a2 + a3
+  // Add lane 2 into lane 0 and return the sum of all eight floats of s.
+  return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2)));
+}
+
+}  // namespace detail
+
+/**
+ * Object containing context and routines for AVX state-vector manipulations.
+ * State is a vectorized sequence of eight real components followed by eight
+ * imaginary components. Eight single-precision floating-point numbers can be
+ * loaded into an AVX register.
+ */
+template <typename For>
+class StateSpaceAVX :
+    public StateSpace<StateSpaceAVX<For>, VectorSpace, For, float> {
+ private:
+  using Base = StateSpace<StateSpaceAVX<For>, qsim::VectorSpace, For, float>;
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit StateSpaceAVX(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits));
+  };
+
+  void InternalToNormalOrder(State& state) const {
+    if (state.num_qubits() == 1) {
+      fp_type* s = state.get();
+
+      s[2] = s[1];
+      s[1] = s[8];
+      s[3] = s[9];
+
+      for (uint64_t i = 4; i < 16; ++i) {
+        s[i] = 0;
+      }
+    } else if (state.num_qubits() == 2) {
+      fp_type* s = state.get();
+
+      s[6] = s[3];
+      s[4] = s[2];
+      s[2] = s[1];
+      s[1] = s[8];
+      s[3] = s[9];
+      s[5] = s[10];
+      s[7] = s[11];
+
+      for (uint64_t i = 8; i < 16; ++i) {
+        s[i] = 0;
+      }
+    } else {
+      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
+        fp_type* s = p + 16 * i;
+
+        fp_type re[7];
+        fp_type im[7];
+
+        for (uint64_t i = 0; i < 7; ++i) {
+          re[i] = s[i + 1];
+          im[i] = s[i + 8];
+        }
+
+        for (uint64_t i = 0; i < 7; ++i) {
+          s[2 * i + 1] = im[i];
+          s[2 * i + 2] = re[i];
+        }
+      };
+
+      Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get());
+    }
+  }
+
+  void NormalToInternalOrder(State& state) const {
+    if (state.num_qubits() == 1) {
+      fp_type* s = state.get();
+
+      s[8] = s[1];
+      s[1] = s[2];
+      s[9] = s[3];
+
+      for (uint64_t i = 2; i < 8; ++i) {
+        s[i] = 0;
+        s[i + 8] = 0;
+      }
+    } else if (state.num_qubits() == 2) {
+      fp_type* s = state.get();
+
+      s[8] = s[1];
+      s[9] = s[3];
+      s[10] = s[5];
+      s[11] = s[7];
+      s[1] = s[2];
+      s[2] = s[4];
+      s[3] = s[6];
+
+      for (uint64_t i = 4; i < 8; ++i) {
+        s[i] = 0;
+        s[i + 8] = 0;
+      }
+    } else {
+      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
+        fp_type* s = p + 16 * i;
+
+        fp_type re[7];
+        fp_type im[7];
+
+        for (uint64_t i = 0; i < 7; ++i) {
+          im[i] = s[2 * i + 1];
+          re[i] = s[2 * i + 2];
+        }
+
+        for (uint64_t i = 0; i < 7; ++i) {
+          s[i + 1] = re[i];
+          s[i + 8] = im[i];
+        }
+      };
+
+      Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get());
+    }
+  }
+
+  void SetAllZeros(State& state) const {
+    __m256 val0 = _mm256_setzero_ps();
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) {
+      _mm256_store_ps(p + 16 * i, val);
+      _mm256_store_ps(p + 16 * i + 8, val);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get());
+  }
+
+  // Uniform superposition.
+  void SetStateUniform(State& state) const {
+    __m256 val0 = _mm256_setzero_ps();
+    __m256 valu;
+
+    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
+
+    switch (state.num_qubits()) {
+    case 1:
+      valu = _mm256_set_ps(0, 0, 0, 0, 0, 0, v, v);
+      break;
+    case 2:
+      valu = _mm256_set_ps(0, 0, 0, 0, v, v, v, v);
+      break;
+    default:
+      valu = _mm256_set1_ps(v);
+      break;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                __m256& val0, __m256 valu, fp_type* p) {
+      _mm256_store_ps(p + 16 * i, valu);
+      _mm256_store_ps(p + 16 * i + 8, val0);
+    };
+
+    Base::for_.Run(
+        MinSize(state.num_qubits()) / 16, f, val0, valu, state.get());
+  }
+
+  // |0> state.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    state.get()[0] = 1;
+  }
+
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    uint64_t k = (16 * (i / 8)) + (i % 8);
+    return std::complex<fp_type>(state.get()[k], state.get()[k + 8]);
+  }
+
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    uint64_t k = (16 * (i / 8)) + (i % 8);
+    state.get()[k] = std::real(ampl);
+    state.get()[k + 8] = std::imag(ampl);
+  }
+
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    uint64_t k = (16 * (i / 8)) + (i % 8);
+    state.get()[k] = re;
+    state.get()[k + 8] = im;
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    __m256 re_reg = _mm256_set1_ps(re);
+    __m256 im_reg = _mm256_set1_ps(im);
+
+    __m256i exclude_reg = _mm256_setzero_si256();
+    if (exclude) {
+      exclude_reg = _mm256_cmpeq_epi32(exclude_reg, exclude_reg);
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
+                uint64_t bitsv, __m256 re_n, __m256 im_n, __m256i exclude_n,
+                fp_type* p) {
+      __m256 ml = _mm256_castsi256_ps(_mm256_xor_si256(
+          detail::GetZeroMaskAVX(8 * i, maskv, bitsv), exclude_n));
+
+      __m256 re = _mm256_load_ps(p + 16 * i);
+      __m256 im = _mm256_load_ps(p + 16 * i + 8);
+
+      re = _mm256_blendv_ps(re, re_n, ml);
+      im = _mm256_blendv_ps(im, im_n, ml);
+
+      _mm256_store_ps(p + 16 * i, re);
+      _mm256_store_ps(p + 16 * i + 8, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, mask, bits, re_reg,
+                   im_reg, exclude_reg, state.get());
+  }
+
+  // Does the equivalent of dest += src elementwise.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, fp_type* p2) {
+      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
+      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
+      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
+      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
+
+      _mm256_store_ps(p2 + 16 * i, _mm256_add_ps(re1, re2));
+      _mm256_store_ps(p2 + 16 * i + 8, _mm256_add_ps(im1, im2));
+    };
+
+    Base::for_.Run(MinSize(src.num_qubits()) / 16, f, src.get(), dest.get());
+
+    return true;
+  }
+
+  // Does the equivalent of state *= a elementwise.
+  void Multiply(fp_type a, State& state) const {
+    __m256 r = _mm256_set1_ps(a);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m256 r, fp_type* p) {
+      __m256 re = _mm256_load_ps(p + 16 * i);
+      __m256 im = _mm256_load_ps(p + 16 * i + 8);
+
+      re = _mm256_mul_ps(re, r);
+      im = _mm256_mul_ps(im, r);
+
+      _mm256_store_ps(p + 16 * i, re);
+      _mm256_store_ps(p + 16 * i + 8, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, r, state.get());
+  }
+
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
+      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
+      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
+      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
+      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
+
+      __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2));
+      __m256 ip_im = _mm256_fnmadd_ps(im1, re2, _mm256_mul_ps(re1, im2));
+
+      double re = detail::HorizontalSumAVX(ip_re);
+      double im = detail::HorizontalSumAVX(ip_im);
+
+      return std::complex<double>{re, im};
+    };
+
+    using Op = std::plus<std::complex<double>>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> double {
+      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
+      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
+      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
+      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
+
+      __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2));
+
+      return detail::HorizontalSumAVX(ip_re);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+    // Draws num_samples basis-state indices; returns empty if num_samples == 0.
+    if (num_samples > 0) {
+      double norm = 0;
+      uint64_t size = MinSize(state.num_qubits()) / 16;
+      const fp_type* p = state.get();
+      // First pass: total squared magnitude, passed on as the random range.
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 8; ++j) {
+          double re = p[16 * k + j];
+          double im = p[16 * k + 8 + j];
+          norm += re * re + im * im;
+        }
+      }
+
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+
+      uint64_t m = 0;
+      double csum = 0;
+      bitstrings.reserve(num_samples);
+      // Second pass: single forward walk; presumably rs is sorted (confirm).
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 8; ++j) {
+          double re = p[16 * k + j];
+          double im = p[16 * k + 8 + j];
+          csum += re * re + im * im;
+          while (m < num_samples && rs[m] < csum) {  // bound check before read
+            bitstrings.emplace_back(8 * k + j);
+            ++m;
+          }
+        }
+      }
+      // Samples not matched above are assigned the last bitstring.
+      for (; m < num_samples; ++m) {
+        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
+      }
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    auto f1 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
+      __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits);
+      // Lanes whose index does not match the outcome are loaded as zero.
+      __m256 re = _mm256_maskload_ps(p + 16 * i, ml);
+      __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml);
+      __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX(s1);
+    };
+    // Pass 1: squared norm of the amplitudes with (index & mask) == bits.
+    using Op = std::plus<double>;
+    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 16, f1,
+                                       Op(), mr.mask, mr.bits, state.get());
+
+    __m256 renorm = _mm256_set1_ps(1.0 / std::sqrt(norm));
+    // Pass 2: rescale matching amplitudes and overwrite the rest with zeros.
+    auto f2 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, __m256 renorm, fp_type* p) {
+      __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits);
+
+      __m256 re = _mm256_maskload_ps(p + 16 * i, ml);
+      __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml);
+
+      re = _mm256_mul_ps(re, renorm);
+      im = _mm256_mul_ps(im, renorm);
+      // Full-width stores: the zeroed non-matching lanes are written back too.
+      _mm256_store_ps(p + 16 * i, re);
+      _mm256_store_ps(p + 16 * i + 8, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 16, f2,
+                   mr.mask, mr.bits, renorm, state.get());
+  }
+
+  std::vector<double> PartialNorms(const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p) -> double {
+      __m256 re = _mm256_load_ps(p + 16 * i);
+      __m256 im = _mm256_load_ps(p + 16 * i + 8);
+      __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX(s1);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduceP(
+        MinSize(state.num_qubits()) / 16, f, Op(), state.get());
+  }
+
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    double csum = 0;
+    // [k0, k1) comes from for_ for index m; presumably that worker's range.
+    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 16, m);
+    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 16, m);
+
+    const fp_type* p = state.get();
+    // Return the first index in the range whose running sum exceeds r.
+    for (uint64_t k = k0; k < k1; ++k) {
+      for (uint64_t j = 0; j < 8; ++j) {
+        auto re = p[16 * k + j];  // real part of amplitude 8 * k + j
+        auto im = p[16 * k + j + 8];  // imaginary part of the same amplitude
+        csum += re * re + im * im;
+        if (r < csum) {
+          return (8 * k + j) & mask;  // keep only the bits selected by mask
+        }
+      }
+    }
+
+    // Return the last bitstring in the unlikely case of underflow.
+    return (8 * k1 - 1) & mask;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_AVX_H_
diff --git a/tpls/qsim/statespace_avx512.h b/tpls/qsim/statespace_avx512.h
new file mode 100644
index 0000000..879fd89
--- /dev/null
+++ b/tpls/qsim/statespace_avx512.h
@@ -0,0 +1,448 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_AVX512_H_
+#define STATESPACE_AVX512_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <functional>
+
+#include "statespace.h"
+#include "util.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace detail {
+
+inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) {
+  __m512i s1 = _mm512_setr_epi64(
+      i + 0, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7);
+  __m512i s2 = _mm512_setr_epi64(
+      i + 8, i + 9, i + 10, i + 11, i + 12, i + 13, i + 14, i + 15);
+  __m512i ma = _mm512_set1_epi64(mask);
+  __m512i bi = _mm512_set1_epi64(bits);
+  // Builds a 16-bit mask: bit j is set iff ((i + j) & mask) == bits.
+  s1 = _mm512_and_si512(s1, ma);
+  s2 = _mm512_and_si512(s2, ma);
+
+  unsigned m1 = _mm512_cmpeq_epu64_mask(s1, bi);  // results for i .. i + 7
+  unsigned m2 = _mm512_cmpeq_epu64_mask(s2, bi);  // results for i + 8 .. i + 15
+
+  return (m2 << 8) | m1;
+}
+
+inline double HorizontalSumAVX(__m256 s) {
+  __m128 l = _mm256_castps256_ps128(s);  // lower four floats
+  __m128 h = _mm256_extractf128_ps(s, 1);  // upper four floats
+  __m128 s1  = _mm_add_ps(h, l);  // (a0, a1, a2, a3): sums of the two halves
+  __m128 s1s = _mm_movehdup_ps(s1);  // (a1, a1, a3, a3)
+  __m128 s2 = _mm_add_ps(s1, s1s);  // lane 0: a0 + a1, lane 2: a2 + a3
+  // Add lane 2 into lane 0 and return the sum of all eight floats of s.
+  return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2)));
+}
+
+inline double HorizontalSumAVX512(__m512 s) {
+  __m256 l = _mm512_castps512_ps256(s);  // lower eight floats
+  __m512d sd = _mm512_castps_pd(s);  // reinterpret so the f64x4 extract applies
+  __m256d hd = _mm512_extractf64x4_pd(sd, 1);  // upper 256 bits
+  __m256 h = _mm256_castpd_ps(hd);  // back to eight floats
+  __m256 p = _mm256_add_ps(h, l);  // lane-wise sum of the two halves
+  // Reduce the remaining eight partial sums to a single value.
+  return HorizontalSumAVX(p);
+}
+
+}  // namespace detail
+
+/**
+ * Object containing context and routines for AVX512 state-vector
+ * manipulations. State is a vectorized sequence of sixteen real components
+ * followed by sixteen imaginary components. Sixteen single-precision
+ * floating-point numbers can be loaded into an AVX512 register.
+ */
+template <typename For>
+class StateSpaceAVX512 :
+    public StateSpace<StateSpaceAVX512<For>, VectorSpace, For, float> {
+ private:
+  using Base = StateSpace<StateSpaceAVX512<For>, qsim::VectorSpace, For, float>;
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit StateSpaceAVX512(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits));
+  };
+
+  void InternalToNormalOrder(State& state) const {
+    __m512i idx1 = _mm512_setr_epi32(
+        0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+    __m512i idx2 = _mm512_setr_epi32(
+        8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                __m512i idx1, __m512i idx2, fp_type* p) {
+      __m512 v1 = _mm512_load_ps(p + 32 * i);
+      __m512 v2 = _mm512_load_ps(p + 32 * i + 16);
+
+      _mm512_store_ps(p + 32 * i,  _mm512_permutex2var_ps(v1, idx1, v2));
+      _mm512_store_ps(p + 32 * i + 16,  _mm512_permutex2var_ps(v1, idx2, v2));
+    };
+
+    Base::for_.Run(
+        MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get());
+  }
+
+  void NormalToInternalOrder(State& state) const {
+    __m512i idx1 = _mm512_setr_epi32(
+        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+    __m512i idx2 = _mm512_setr_epi32(
+        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                __m512i idx1, __m512i idx2, fp_type* p) {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+
+      _mm512_store_ps(p + 32 * i,  _mm512_permutex2var_ps(re, idx1, im));
+      _mm512_store_ps(p + 32 * i + 16,  _mm512_permutex2var_ps(re, idx2, im));
+    };
+
+    Base::for_.Run(
+        MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get());
+  }
+
+  void SetAllZeros(State& state) const {
+    __m512 val0 = _mm512_setzero_ps();
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) {
+      _mm512_store_ps(p + 32 * i, val0);
+      _mm512_store_ps(p + 32 * i + 16, val0);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get());
+  }
+
+  // Uniform superposition.
+  void SetStateUniform(State& state) const {
+    __m512 val0 = _mm512_setzero_ps();
+    __m512 valu;
+
+    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
+
+    switch (state.num_qubits()) {
+    case 1:
+      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v);
+      break;
+    case 2:
+      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v);
+      break;
+    case 3:
+      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v, v, v, v, v);
+      break;
+    default:
+      valu = _mm512_set1_ps(v);
+      break;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const __m512& val0, const __m512& valu, fp_type* p) {
+      _mm512_store_ps(p + 32 * i, valu);
+      _mm512_store_ps(p + 32 * i + 16, val0);
+    };
+
+    Base::for_.Run(
+        MinSize(state.num_qubits()) / 32, f, val0, valu, state.get());
+  }
+
+  // |0> state.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    state.get()[0] = 1;
+  }
+
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    return std::complex<fp_type>(state.get()[p], state.get()[p + 16]);
+  }
+
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    state.get()[p] = std::real(ampl);
+    state.get()[p + 16] = std::imag(ampl);
+  }
+
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    uint64_t p = (32 * (i / 16)) + (i % 16);
+    state.get()[p] = re;
+    state.get()[p + 16] = im;
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    __m512 re_reg = _mm512_set1_ps(re);
+    __m512 im_reg = _mm512_set1_ps(im);
+
+    __mmask16 exclude_n = exclude ? 0xffff : 0;
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
+                uint64_t bitsv, __m512 re_n, __m512 im_n, __mmask16 exclude_n,
+                fp_type* p) {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+
+      __mmask16 ml =
+          detail::GetZeroMaskAVX512(16 * i, maskv, bitsv) ^ exclude_n;
+
+      re = _mm512_mask_blend_ps(ml, re, re_n);
+      im = _mm512_mask_blend_ps(ml, im, im_n);
+
+      _mm512_store_ps(p + 32 * i, re);
+      _mm512_store_ps(p + 32 * i + 16, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, mask, bits,
+                   re_reg, im_reg, exclude_n, state.get());
+  }
+
+  // Does the equivalent of dest += src elementwise.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, fp_type* p2) {
+      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
+      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
+      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
+      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
+
+      _mm512_store_ps(p2 + 32 * i, _mm512_add_ps(re1, re2));
+      _mm512_store_ps(p2 + 32 * i + 16, _mm512_add_ps(im1, im2));
+    };
+
+    Base::for_.Run(MinSize(src.num_qubits()) / 32, f, src.get(), dest.get());
+
+    return true;
+  }
+
+  // Does the equivalent of state *= a elementwise.
+  void Multiply(fp_type a, State& state) const {
+    __m512 r = _mm512_set1_ps(a);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 r, fp_type* p) {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+
+      _mm512_store_ps(p + 32 * i, _mm512_mul_ps(re, r));
+      _mm512_store_ps(p + 32 * i + 16, _mm512_mul_ps(im, r));
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, r, state.get());
+  }
+
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
+      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
+      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
+      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
+      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
+
+      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
+      __m512 ip_im = _mm512_fnmadd_ps(im1, re2, _mm512_mul_ps(re1, im2));
+
+      double re = detail::HorizontalSumAVX512(ip_re);
+      double im = detail::HorizontalSumAVX512(ip_im);
+
+      return std::complex<double>{re, im};
+    };
+
+    using Op = std::plus<std::complex<double>>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> double {
+      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
+      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
+      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
+      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
+
+      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
+
+      return detail::HorizontalSumAVX512(ip_re);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
+                                Op(), state1.get(), state2.get());
+  }
+
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+    // Draws num_samples basis-state indices; returns empty if num_samples == 0.
+    if (num_samples > 0) {
+      double norm = 0;
+      uint64_t size = MinSize(state.num_qubits()) / 32;
+      const fp_type* p = state.get();
+      // First pass: total squared magnitude, passed on as the random range.
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 16; ++j) {
+          double re = p[32 * k + j];
+          double im = p[32 * k + 16 + j];
+          norm += re * re + im * im;
+        }
+      }
+
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+
+      uint64_t m = 0;
+      double csum = 0;
+      bitstrings.reserve(num_samples);
+      // Second pass: single forward walk; presumably rs is sorted (confirm).
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 16; ++j) {
+          double re = p[32 * k + j];
+          double im = p[32 * k + 16 + j];
+          csum += re * re + im * im;
+          while (m < num_samples && rs[m] < csum) {  // bound check before read
+            bitstrings.emplace_back(16 * k + j);
+            ++m;
+          }
+        }
+      }
+      // Samples not matched above are assigned the last bitstring.
+      for (; m < num_samples; ++m) {
+        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
+      }
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    auto f1 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
+      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
+      // Zero-masked loads: lanes not matching the outcome are read as zero.
+      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
+      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
+      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX512(s1);
+    };
+    // Pass 1: squared norm of the amplitudes with (index & mask) == bits.
+    using Op = std::plus<double>;
+    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 32, f1,
+                                       Op(), mr.mask, mr.bits, state.get());
+
+    __m512 renorm = _mm512_set1_ps(1.0 / std::sqrt(norm));
+    // Pass 2: rescale matching amplitudes and overwrite the rest with zeros.
+    auto f2 = [](unsigned n, unsigned m, uint64_t i,
+                 uint64_t mask, uint64_t bits, __m512 renorm, fp_type* p) {
+      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
+
+      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
+      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
+
+      re = _mm512_mul_ps(re, renorm);
+      im = _mm512_mul_ps(im, renorm);
+      // Full-width stores: the zeroed non-matching lanes are written back too.
+      _mm512_store_ps(p + 32 * i, re);
+      _mm512_store_ps(p + 32 * i + 16, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f2,
+                   mr.mask, mr.bits, renorm, state.get());
+  }
+
+  std::vector<double> PartialNorms(const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p) -> double {
+      __m512 re = _mm512_load_ps(p + 32 * i);
+      __m512 im = _mm512_load_ps(p + 32 * i + 16);
+      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
+
+      return detail::HorizontalSumAVX512(s1);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduceP(
+        MinSize(state.num_qubits()) / 32, f, Op(), state.get());
+  }
+
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    double csum = 0;
+    // [k0, k1) comes from for_ for index m; presumably that worker's range.
+    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 32, m);
+    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 32, m);
+
+    const fp_type* p = state.get();
+    // Return the first index in the range whose running sum exceeds r.
+    for (uint64_t k = k0; k < k1; ++k) {
+      for (uint64_t j = 0; j < 16; ++j) {
+        auto re = p[32 * k + j];  // real part of amplitude 16 * k + j
+        auto im = p[32 * k + j + 16];  // imaginary part of the same amplitude
+        csum += re * re + im * im;
+        if (r < csum) {
+          return (16 * k + j) & mask;  // keep only the bits selected by mask
+        }
+      }
+    }
+
+    // Return the last bitstring in the unlikely case of underflow.
+    return (16 * k1 - 1) & mask;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_AVX512_H_
diff --git a/tpls/qsim/statespace_basic.h b/tpls/qsim/statespace_basic.h
new file mode 100644
index 0000000..6468483
--- /dev/null
+++ b/tpls/qsim/statespace_basic.h
@@ -0,0 +1,300 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_BASIC_H_
+#define STATESPACE_BASIC_H_
+
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <functional>
+
+#include "statespace.h"
+#include "util.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+/**
+ * Object containing context and routines for unoptimized state-vector
+ * manipulations. State is a non-vectorized sequence of one real amplitude
+ * followed by one imaginary amplitude.
+ */
+template <typename For, typename FP>
+class StateSpaceBasic :
+    public StateSpace<StateSpaceBasic<For, FP>, VectorSpace, For, FP> {
+ private:
+  using Base = StateSpace<StateSpaceBasic<For, FP>, qsim::VectorSpace, For, FP>;
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit StateSpaceBasic(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return uint64_t{2} << num_qubits;  // Two values (re, im) per amplitude.
+  }
+
+  void InternalToNormalOrder(State& state) const {}
+
+  void NormalToInternalOrder(State& state) const {}
+
+  void SetAllZeros(State& state) const {
+    // Clears the real and imaginary parts of amplitude i.
+    auto clear = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
+      fp_type* ampl = p + 2 * i;
+      ampl[0] = ampl[1] = 0;
+    };
+    Base::for_.Run(MinSize(state.num_qubits()) / 2, clear, state.get());
+  }
+
+  // Uniform superposition: every amplitude is set to 1 / sqrt(2^num_qubits).
+  void SetStateUniform(State& state) const {
+    const uint64_t dim = uint64_t{1} << state.num_qubits();
+    fp_type val = fp_type{1} / std::sqrt(dim);
+
+    auto fill = [](unsigned n, unsigned m, uint64_t i,
+                   fp_type ampl_re, fp_type* p) {
+      p[2 * i] = ampl_re;
+      p[2 * i + 1] = 0;  // Imaginary part.
+    };
+    Base::for_.Run(MinSize(state.num_qubits()) / 2, fill, val, state.get());
+  }
+
+  // |0> state.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    state.get()[0] = 1;
+  }
+
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    uint64_t p = 2 * i;
+    return std::complex<fp_type>(state.get()[p], state.get()[p + 1]);
+  }
+
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    uint64_t p = 2 * i;
+    state.get()[p] = std::real(ampl);
+    state.get()[p + 1] = std::imag(ampl);
+  }
+
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    uint64_t p = 2 * i;
+    state.get()[p] = re;
+    state.get()[p + 1] = im;
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // If `exclude` is true then the criterion becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) for every i with (i & mask) == bits.
+  // If `exclude` is true, the criterion becomes (i & mask) != bits instead.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    auto assign = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
+                     uint64_t bitsv, fp_type re_n, fp_type im_n, bool excludev,
+                     fp_type* p) {
+      fp_type* ampl = p + 2 * i;
+      // Comparing against `excludev` inverts the selection when it is set.
+      const bool selected = ((i & maskv) == bitsv) != excludev;
+      ampl[0] = selected ? re_n : ampl[0];
+      ampl[1] = selected ? im_n : ampl[1];
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 2, assign, mask, bits, re, im,
+                   exclude, state.get());
+  }
+
+  // Does the equivalent of dest += src elementwise.
+  // Returns false, leaving dest untouched, if the qubit counts differ.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) return false;
+
+    auto accumulate = [](unsigned n, unsigned m, uint64_t i,
+                         const fp_type* from, fp_type* to) {
+      const uint64_t re = 2 * i;
+      const uint64_t im = re + 1;
+      to[re] += from[re];
+      to[im] += from[im];
+    };
+    Base::for_.Run(
+        MinSize(src.num_qubits()) / 2, accumulate, src.get(), dest.get());
+    return true;
+  }
+
+  // Does the equivalent of state *= a elementwise.
+  void Multiply(fp_type a, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, fp_type a, fp_type* p) {
+      p[2 * i] *= a;
+      p[2 * i + 1] *= a;
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, a, state.get());
+  }
+
+  // Returns the sum over i of conj(state1[i]) * state2[i], reduced as
+  // std::complex<double>. Yields NaN if the states differ in qubit count.
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) return std::nan("");
+
+    auto term = [](unsigned n, unsigned m, uint64_t i, const fp_type* p1,
+                   const fp_type* p2) -> std::complex<double> {
+      const fp_type* a = p1 + 2 * i;
+      const fp_type* b = p2 + 2 * i;
+
+      double re = a[0] * b[0] + a[1] * b[1];
+      double im = a[0] * b[1] - a[1] * b[0];
+
+      return std::complex<double>{re, im};
+    };
+
+    return Base::for_.RunReduce(
+        MinSize(state1.num_qubits()) / 2, term,
+        std::plus<std::complex<double>>(), state1.get(), state2.get());
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> double {
+      auto s1 = p1 + 2 * i;
+      auto s2 = p2 + 2 * i;
+
+      return s1[0] * s2[0] + s1[1] * s2[1];
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduce(
+        MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get());
+  }
+
+  // Draws `num_samples` bitstrings with probability proportional to |ampl|^2.
+  // NOTE(review): presumably `rs` is ascending in [0, norm) -- confirm.
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+
+    if (num_samples > 0) {
+      double norm = 0;
+      uint64_t size = MinSize(state.num_qubits()) / 2;
+      const fp_type* p = state.get();
+      for (uint64_t k = 0; k < size; ++k) {
+        double re = p[2 * k];
+        double im = p[2 * k + 1];
+        norm += re * re + im * im;
+      }
+
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+      uint64_t m = 0;
+      double csum = 0;
+      bitstrings.reserve(num_samples);
+
+      for (uint64_t k = 0; k < size && m < num_samples; ++k) {
+        double re = p[2 * k];
+        double im = p[2 * k + 1];
+        csum += re * re + im * im;
+        // Check the bound first so that rs[num_samples] is never read.
+        while (m < num_samples && rs[m] < csum) {
+          bitstrings.emplace_back(k);
+          ++m;
+        }
+      }
+      // Rounding can leave samples unassigned: use the last bitstring.
+      for (; m < num_samples; ++m) {
+        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
+      }
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  // Projects `state` onto the amplitudes i with (i & mr.mask) == mr.bits and
+  // rescales them by 1 / sqrt(their total probability); others become zero.
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    auto prob = [](unsigned n, unsigned m, uint64_t i,
+                   uint64_t mask, uint64_t bits, const fp_type* p) -> double {
+      const fp_type* ampl = p + 2 * i;
+      return (i & mask) == bits ? ampl[0] * ampl[0] + ampl[1] * ampl[1] : 0;
+    };
+
+    const uint64_t size = MinSize(state.num_qubits()) / 2;
+    double norm = Base::for_.RunReduce(
+        size, prob, std::plus<double>(), mr.mask, mr.bits, state.get());
+
+    double renorm = 1.0 / std::sqrt(norm);
+
+    auto scale = [](unsigned n, unsigned m, uint64_t i,
+                    uint64_t mask, uint64_t bits, fp_type renorm, fp_type* p) {
+      fp_type* ampl = p + 2 * i;
+      const bool keep = (i & mask) == bits;
+      ampl[0] = keep ? ampl[0] * renorm : 0;
+      ampl[1] = keep ? ampl[1] * renorm : 0;
+    };
+
+    Base::for_.Run(size, scale, mr.mask, mr.bits, renorm, state.get());
+  }
+
+  std::vector<double> PartialNorms(const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p) -> double {
+      auto s = p + 2 * i;
+      return s[0] * s[0] + s[1] * s[1];
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduceP(
+        MinSize(state.num_qubits()) / 2, f, Op(), state.get());
+  }
+
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    // Scans amplitudes [GetIndex0(size, m), GetIndex1(size, m)).
+    const uint64_t size = MinSize(state.num_qubits()) / 2;
+    const uint64_t first = Base::for_.GetIndex0(size, m);
+    const uint64_t last = Base::for_.GetIndex1(size, m);
+
+    const fp_type* ampl = state.get() + 2 * first;
+    double running = 0;
+
+    // Accumulate probabilities until the running sum exceeds `r`.
+    for (uint64_t k = first; k < last; ++k, ampl += 2) {
+      auto re = ampl[0];
+      auto im = ampl[1];
+      running += re * re + im * im;
+      if (running > r) return k & mask;
+    }
+
+    // Return the last bitstring in the unlikely case of underflow.
+    return (last - 1) & mask;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_BASIC_H_
diff --git a/tpls/qsim/statespace_cuda.h b/tpls/qsim/statespace_cuda.h
new file mode 100644
index 0000000..660db07
--- /dev/null
+++ b/tpls/qsim/statespace_cuda.h
@@ -0,0 +1,470 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_CUDA_H_
+#define STATESPACE_CUDA_H_
+
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
+
+#include <algorithm>
+#include <complex>
+#include <cstdint>
+
+#include "statespace.h"
+#include "statespace_cuda_kernels.h"
+#include "vectorspace_cuda.h"
+#include "util_cuda.h"
+
+namespace qsim {
+
+/**
+ * Object containing context and routines for CUDA state-vector manipulations.
+ * State is a vectorized sequence of 32 real components followed by 32
+ * imaginary components. 32 floating numbers can be processed in parallel by
+ * a single warp. It is not recommended to use `GetAmpl` and `SetAmpl`.
+ */
+template <typename FP = float>
+class StateSpaceCUDA :
+    public StateSpace<StateSpaceCUDA<FP>, VectorSpaceCUDA, FP> {
+ private:
+  using Base = StateSpace<StateSpaceCUDA<FP>, qsim::VectorSpaceCUDA, FP>;
+
+ protected:
+  struct Grid {
+    unsigned threads;
+    unsigned dblocks;
+    unsigned blocks;
+  };
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  struct Parameter {
+    /**
+     * The number of threads per block.
+     * Should be 2 to the power of k, where k is in the range [5,10].
+     */
+    unsigned num_threads = 512;
+    /**
+     * The number of data blocks. Each thread processes num_dblocks data
+     * blocks in reductions (norms, inner products, etc).
+     */
+    unsigned num_dblocks = 16;
+  };
+
+  explicit StateSpaceCUDA(const Parameter& param)
+      : param_(param), scratch_(nullptr), scratch_size_(0) {}
+
+  virtual ~StateSpaceCUDA() {
+    if (scratch_ != nullptr) {
+      ErrorCheck(cudaFree(scratch_));
+    }
+  }
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits));
+  };
+
+  void InternalToNormalOrder(State& state) const {
+    uint64_t size = MinSize(state.num_qubits()) / 2;
+    // One thread per amplitude; each stages two values in shared memory.
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+    unsigned bytes = 2 * threads * sizeof(fp_type);
+    // Launch the reordering kernel and wait for it, checking for errors.
+    InternalToNormalOrderKernel<<<blocks, threads, bytes>>>(state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  void NormalToInternalOrder(State& state) const {
+    uint64_t size = MinSize(state.num_qubits()) / 2;
+
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+    unsigned bytes = 2 * threads * sizeof(fp_type);
+
+    NormalToInternalOrderKernel<<<blocks, threads, bytes>>>(state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  void SetAllZeros(State& state) const {
+    ErrorCheck(cudaMemset(state.get(), 0,
+               MinSize(state.num_qubits()) * sizeof(fp_type)));
+  }
+
+  // Uniform superposition.
+  void SetStateUniform(State& state) const {
+    uint64_t size = MinSize(state.num_qubits()) / 2;
+    uint64_t hsize = uint64_t{1} << state.num_qubits();
+
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+
+    fp_type v = double{1} / std::sqrt(hsize);
+
+    SetStateUniformKernel<<<blocks, threads>>>(v, hsize, state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  // |0> state.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    fp_type one[1] = {1};
+    ErrorCheck(
+        cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice));
+  }
+
+  // It is not recommended to use this function.
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    fp_type re, im;
+    auto p = state.get() + 64 * (i / 32) + i % 32;
+    ErrorCheck(cudaMemcpy(&re, p, sizeof(fp_type), cudaMemcpyDeviceToHost));
+    ErrorCheck(
+        cudaMemcpy(&im, p + 32, sizeof(fp_type), cudaMemcpyDeviceToHost));
+    return std::complex<fp_type>(re, im);
+  }
+
+  // It is not recommended to use this function.
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    fp_type re = std::real(ampl);
+    fp_type im = std::imag(ampl);
+    auto p = state.get() + 64 * (i / 32) + i % 32;
+    ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice));
+    ErrorCheck(
+        cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice));
+  }
+
+  // It is not recommended to use this function.
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    auto p = state.get() + 64 * (i / 32) + i % 32;
+    ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice));
+    ErrorCheck(
+        cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice));
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // If `exclude` is true then the criterion becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // If `exclude` is true then the criterion becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    uint64_t size = MinSize(state.num_qubits()) / 2;
+
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+
+    BulkSetAmplKernel<<<blocks, threads>>>(
+        mask, bits, re, im, exclude, state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  // Does the equivalent of dest += src elementwise.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+    // `size` counts stored floats: real and imaginary parts are added alike.
+    uint64_t size = MinSize(src.num_qubits());
+
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+    // One thread per float; wait for completion so errors surface here.
+    AddKernel<<<blocks, threads>>>(src.get(), dest.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+
+    return true;
+  }
+
+  // Does the equivalent of state *= a elementwise.
+  void Multiply(fp_type a, State& state) const {
+    uint64_t size = MinSize(state.num_qubits());
+
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+
+    MultiplyKernel<<<blocks, threads>>>(a, state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    using C = Complex<double>;
+    auto r = Reduce<C, C, Product<fp_type>>(state1, state2);
+
+    return {r.re, r.im};
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    return Reduce<double, double, RealProduct<fp_type>>(state1, state2);
+  }
+
+  double Norm(const State& state) const {
+    return Reduce<double, double, RealProduct<fp_type>>(state, state);
+  }
+
+  // Draws `num_samples` bitstrings from `state`. Norms are reduced on the
+  // device; the random thresholds are generated on the host and copied over.
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+
+    if (num_samples > 0) {
+      Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 2);
+      unsigned bytes = g1.threads * sizeof(double);
+      // 64-bit byte count: `num_samples` is 64-bit, `unsigned` could wrap.
+      uint64_t scratch_size = (g1.blocks + 1) * sizeof(double)
+          + num_samples * (sizeof(uint64_t) + sizeof(DistrRealType));
+
+      void* scratch = AllocScratch(scratch_size);
+      // Layout: final norm | per-block norms | bitstrings | random values.
+      double* d_res2 = (double*) scratch;
+      double* d_res1 = d_res2 + 1;
+      uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks);
+      DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples);
+
+      auto op1 = RealProduct<fp_type>();
+      auto op2 = Plus<double>();
+      // First pass: one partial norm per thread block.
+      Reduce1Kernel<double><<<g1.blocks, g1.threads, bytes>>>(
+          g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1);
+      ErrorCheck(cudaPeekAtLastError());
+      ErrorCheck(cudaDeviceSynchronize());
+
+      double norm;
+
+      if (g1.blocks == 1) {
+        ErrorCheck(
+            cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost));
+      } else {
+        Grid g2 = GetGrid2(g1.blocks);
+        unsigned bytes2 = g2.threads * sizeof(double);
+        // Second pass: fold the per-block norms into d_res2[0].
+        Reduce2Kernel<double><<<g2.blocks, g2.threads, bytes2>>>(
+            g2.dblocks, g1.blocks, op2, op2, d_res1, d_res2);
+        ErrorCheck(cudaPeekAtLastError());
+        ErrorCheck(cudaDeviceSynchronize());
+
+        ErrorCheck(
+            cudaMemcpy(&norm, d_res2, sizeof(double), cudaMemcpyDeviceToHost));
+      }
+
+      // TODO: generate random values on the device.
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+
+      ErrorCheck(cudaMemcpy(d_rs, rs.data(),
+                            num_samples * sizeof(DistrRealType),
+                            cudaMemcpyHostToDevice));
+
+      SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples,
+                                      d_rs, d_res1, state.get(), d_bitstrings);
+      ErrorCheck(cudaPeekAtLastError());
+      ErrorCheck(cudaDeviceSynchronize());
+
+      bitstrings.resize(num_samples, 0);
+      // NOTE(review): entries SampleKernel never writes stay uninitialized.
+      ErrorCheck(cudaMemcpy(bitstrings.data(), d_bitstrings,
+                            num_samples * sizeof(uint64_t),
+                            cudaMemcpyDeviceToHost));
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    using Op = RealProduct<fp_type>;
+    double r = Reduce<double, double, Op>(mr.mask, mr.bits, state, state);
+    fp_type renorm = 1 / std::sqrt(r);
+    // NOTE(review): r == 0 makes renorm infinite; confirm callers avoid it.
+    uint64_t size = MinSize(state.num_qubits()) / 2;
+    // One thread per amplitude: scale the matching ones, zero the rest.
+    unsigned threads = std::min(size, uint64_t{param_.num_threads});
+    unsigned blocks = size / threads;
+
+    CollapseKernel<<<blocks, threads>>>(mr.mask, mr.bits, renorm, state.get());
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+
+  std::vector<double> PartialNorms(const State& state) const {
+    Grid g = GetGrid1(MinSize(state.num_qubits()) / 2);
+
+    unsigned scratch_size = g.blocks * sizeof(double);
+    unsigned bytes = g.threads * sizeof(double);
+
+    double* d_res = (double*) AllocScratch(scratch_size);
+
+    auto op1 = RealProduct<fp_type>();
+    auto op2 = Plus<double>();
+
+    Reduce1Kernel<double><<<g.blocks, g.threads, bytes>>>(
+        g.dblocks, op1, op2, op2, state.get(), state.get(), d_res);
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+
+    std::vector<double> norms(g.blocks);
+
+    ErrorCheck(
+        cudaMemcpy(norms.data(), d_res, scratch_size, cudaMemcpyDeviceToHost));
+
+    return norms;
+  }
+
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    Grid g = GetGrid1(MinSize(state.num_qubits()) / 2);
+    // `m` selects the run of g.dblocks * g.threads amplitudes to scan.
+    uint64_t res;
+    uint64_t* d_res = (uint64_t*) AllocScratch(sizeof(uint64_t));
+    // A single block is launched; the kernel scans serially in thread 0.
+    FindMeasuredBitsKernel<<<1, g.threads>>>(
+        m, g.dblocks, r, state.get(), d_res);
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+
+    ErrorCheck(
+        cudaMemcpy(&res, d_res, sizeof(uint64_t), cudaMemcpyDeviceToHost));
+    // Keep only the bits of the measured qubits.
+    return res & mask;
+  }
+
+ protected:
+  Parameter param_;
+
+  void* AllocScratch(uint64_t size) const {
+    if (size > scratch_size_) {
+      if (scratch_ != nullptr) {
+        ErrorCheck(cudaFree(scratch_));
+      }
+      // The old contents are not preserved across a reallocation.
+      ErrorCheck(cudaMalloc(const_cast<void**>(&scratch_), size));
+      // const_cast: this const method updates the cached buffer members.
+      const_cast<uint64_t&>(scratch_size_) = size;
+    }
+    // Grow-only: a request that already fits reuses the existing buffer.
+    return scratch_;
+  }
+
+  Grid GetGrid1(uint64_t size) const {
+    Grid grid;
+    // NOTE(review): presumably `size` and both parameters are powers of two.
+    grid.threads = std::min(size, uint64_t{param_.num_threads});
+    grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks});
+    grid.blocks = size / (grid.threads * grid.dblocks);
+    // blocks * threads * dblocks == size whenever the divisions are exact.
+    return grid;
+  }
+
+  Grid GetGrid2(unsigned size) const {
+    Grid grid;
+
+    grid.threads = std::min(param_.num_threads, std::max(32U, size));
+    grid.dblocks = std::max(1U, size / grid.threads);
+    grid.blocks = 1;
+
+    return grid;
+  }
+
+  template <typename FP1, typename FP2, typename Op>
+  FP2 Reduce(const State& state1, const State& state2) const {
+    return Reduce<FP1, FP2, Op>(0, 0, state1, state2);
+  }
+
+  template <typename FP1, typename FP2, typename Op>
+  FP2 Reduce(uint64_t mask, uint64_t bits,
+             const State& state1, const State& state2) const {
+    uint64_t size = MinSize(state1.num_qubits()) / 2;
+    // Two-stage reduction: per-block partial results, then a final fold.
+    Grid g1 = GetGrid1(size);
+    unsigned bytes = g1.threads * sizeof(FP1);
+    // d_res2[0] receives the final value; d_res1 holds one value per block.
+    FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2));
+    FP2* d_res1 = d_res2 + 1;
+
+    auto op1 = Op();
+    auto op2 = Plus<FP1>();
+    auto op3 = Plus<typename Scalar<FP1>::type>();
+    // mask == 0 means no filtering (the two-state overload passes 0, 0).
+    if (mask == 0) {
+      Reduce1Kernel<FP1><<<g1.blocks, g1.threads, bytes>>>(
+          g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1);
+    } else {
+      Reduce1MaskedKernel<FP1><<<g1.blocks, g1.threads, bytes>>>(
+          g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(),
+          d_res1);
+    }
+    ErrorCheck(cudaPeekAtLastError());
+    ErrorCheck(cudaDeviceSynchronize());
+
+    FP2 result;
+    // With a single block the first stage already produced the answer.
+    if (g1.blocks == 1) {
+      ErrorCheck(
+          cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost));
+    } else {
+      Grid g2 = GetGrid2(g1.blocks);
+      unsigned bytes = g2.threads * sizeof(FP2);
+      // These deliberately shadow the first-stage `bytes`, `op2` and `op3`.
+      auto op2 = Plus<FP2>();
+      auto op3 = Plus<typename Scalar<FP2>::type>();
+      // Second stage: one block folds the g1.blocks partial results.
+      Reduce2Kernel<FP2><<<g2.blocks, g2.threads, bytes>>>(
+          g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2);
+      ErrorCheck(cudaPeekAtLastError());
+      ErrorCheck(cudaDeviceSynchronize());
+
+      ErrorCheck(
+          cudaMemcpy(&result, d_res2, sizeof(FP2), cudaMemcpyDeviceToHost));
+    }
+
+    return result;
+  }
+
+ private:
+  mutable void* scratch_;  // Device scratch buffer, grown by AllocScratch.
+  mutable uint64_t scratch_size_;  // Current capacity of scratch_ in bytes.
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_CUDA_H_
diff --git a/tpls/qsim/statespace_cuda_kernels.h b/tpls/qsim/statespace_cuda_kernels.h
new file mode 100644
index 0000000..b54ebca
--- /dev/null
+++ b/tpls/qsim/statespace_cuda_kernels.h
@@ -0,0 +1,355 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_CUDA_KERNELS_H_
+#define STATESPACE_CUDA_KERNELS_H_
+
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
+
+#include "util_cuda.h"
+
+namespace qsim {
+
+namespace detail {
+
+template <typename FP1, typename FP2,
+          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
+__device__ __forceinline__ FP1 BlockReduce1(
+    uint64_t n, Op1 op1, Op2 op2, Op3 op3, const FP2* s1, const FP2* s2) {
+  extern __shared__ float shared[];
+  FP1* partial1 = (FP1*) shared;  // NOTE(review): confirm alignment for FP1.
+  // Block-wide reduction: op1 maps element pairs, op2 / op3 combine them.
+  unsigned tid = threadIdx.x;
+  unsigned warp = threadIdx.x / warp_size;
+  unsigned lane = threadIdx.x % warp_size;
+  // k0: this thread's first real part; its imaginary part is at +warp_size.
+  uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane;
+  uint64_t k1 = k0 + 2 * n * blockDim.x;
+  // Each thread folds n elements, advancing by 2 * blockDim.x per step.
+  FP1 r;
+
+  r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]);
+  while ((k0 += 2 * blockDim.x) < k1) {
+    r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]));
+  }
+
+  partial1[tid] = r;
+  // partial2 holds one value per warp; slots of absent warps stay zero.
+  __shared__ FP1 partial2[warp_size];
+
+  if (tid < warp_size) {
+    partial2[tid] = 0;
+  }
+
+  __syncthreads();
+  // Stage 1: WarpReduce presumably combines one warp's values (util_cuda.h).
+  FP1 val = WarpReduce(partial1[tid], op3);
+
+  if (lane == 0) {
+    partial2[warp] = val;
+  }
+
+  __syncthreads();
+  // Stage 2: only the first warp_size threads fold the per-warp values.
+  FP1 result = 0;
+
+  if (tid < warp_size) {
+    result = WarpReduce(partial2[tid], op3);
+  }
+  // Every other thread returns 0.
+  return result;
+}
+
+template <typename FP1, typename FP2,
+          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
+__device__ __forceinline__ FP1 BlockReduce1Masked(
+    uint64_t n, uint64_t mask, uint64_t bits, Op1 op1, Op2 op2, Op3 op3,
+    const FP2* s1, const FP2* s2) {
+  extern __shared__ float shared[];
+  FP1* partial1 = (FP1*) shared;  // NOTE(review): confirm alignment for FP1.
+  // Block-wide reduction restricted to amplitudes matching (mask, bits).
+  unsigned tid = threadIdx.x;
+  unsigned warp = threadIdx.x / warp_size;
+  unsigned lane = threadIdx.x % warp_size;
+  // k0: this thread's first real part; its imaginary part is at +warp_size.
+  uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane;
+  uint64_t k1 = k0 + 2 * n * blockDim.x;
+
+  FP1 r = 0;
+  // (k0 + lane) / 2 is the amplitude index tested against (mask, bits).
+  if (((k0 + lane) / 2 & mask) == bits) {
+    r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]);
+  }
+  while ((k0 += 2 * blockDim.x) < k1) {
+    if (((k0 + lane) / 2 & mask) == bits) {
+      r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]));
+    }
+  }
+
+  partial1[tid] = r;
+  // partial2 holds one value per warp; slots of absent warps stay zero.
+  __shared__ FP1 partial2[warp_size];
+
+  if (tid < warp_size) {
+    partial2[tid] = 0;
+  }
+
+  __syncthreads();
+  // Stage 1: WarpReduce presumably combines one warp's values (util_cuda.h).
+  FP1 val = WarpReduce(partial1[tid], op3);
+
+  if (lane == 0) {
+    partial2[warp] = val;
+  }
+
+  __syncthreads();
+  // Stage 2: only the first warp_size threads fold the per-warp values.
+  FP1 result = 0;
+
+  if (tid < warp_size) {
+    result = WarpReduce(partial2[tid], op3);
+  }
+  // Every other thread returns 0.
+  return result;
+}
+
+template <typename FP1, typename FP2,
+          typename Op2, typename Op3, unsigned warp_size = 32>
+__device__ __forceinline__ FP1 BlockReduce2(
+    uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s) {
+  extern __shared__ float shared[];
+  FP1* partial1 = (FP1*) shared;  // NOTE(review): confirm alignment for FP1.
+  // Block-wide fold of the values in `s`, n per thread, combined with op2.
+  unsigned tid = threadIdx.x;
+  uint64_t k0 = n * blockIdx.x * blockDim.x + tid;
+  uint64_t k1 = k0 + n * blockDim.x;
+
+  FP1 r = 0;
+  // NOTE(review): the guard uses tid, not k0; equivalent only for one block.
+  if (tid < size) {
+    r = s[k0];
+    while ((k0 += blockDim.x) < k1) {
+      r = op2(r, s[k0]);
+    }
+  }
+
+  partial1[tid] = r;
+  // partial2 holds one value per warp; slots of absent warps stay zero.
+  __shared__ FP1 partial2[warp_size];
+
+  if (tid < warp_size) {
+    partial2[tid] = 0;
+  }
+
+  __syncthreads();
+  // Stage 1: WarpReduce presumably combines one warp's values (util_cuda.h).
+  FP1 val = WarpReduce(partial1[tid], op3);
+
+  if (threadIdx.x % warp_size == 0) {
+    partial2[threadIdx.x / warp_size] = val;
+  }
+
+  __syncthreads();
+  // Stage 2: only the first warp_size threads fold the per-warp values.
+  FP1 result = 0;
+
+  if (tid < warp_size) {
+    result = WarpReduce(partial2[tid], op3);
+  }
+  // Every other thread returns 0.
+  return result;
+}
+
+}  // namespace detail
+
+template <typename FP1, typename FP2, typename FP3,
+          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
+__global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3,
+                              const FP2* s1, const FP2* s2, FP3* result) {
+  FP1 sum = detail::BlockReduce1<FP1>(n, op1, op2, op3, s1, s2);
+
+  if (threadIdx.x == 0) {
+    result[blockIdx.x] = sum;
+  }
+}
+
+template <typename FP1, typename FP2, typename FP3,
+          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
+__global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits,
+                                    Op1 op1, Op2 op2, Op3 op3,
+                                    const FP2* s1, const FP2* s2, FP3* result) {
+  FP1 sum =
+      detail::BlockReduce1Masked<FP1>(n, mask, bits, op1, op2, op3, s1, s2);
+
+  if (threadIdx.x == 0) {
+    result[blockIdx.x] = sum;
+  }
+}
+
+template <typename FP1, typename FP2, typename FP3,
+          typename Op2, typename Op3, unsigned warp_size = 32>
+__global__ void Reduce2Kernel(
+    uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s, FP3* result) {
+  FP1 sum = detail::BlockReduce2<FP1>(n, size, op2, op3, s);
+
+  if (threadIdx.x == 0) {
+    result[blockIdx.x] = sum;
+  }
+}
+
+template <typename FP, unsigned warp_size = 32>
+__global__ void InternalToNormalOrderKernel(FP* state) {
+  unsigned lane = threadIdx.x % warp_size;
+  unsigned l = 2 * threadIdx.x - lane;
+  uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l;
+  // l / k: offset of this thread's real part within the block / the state.
+  extern __shared__ float shared[];
+  FP* buf = (FP*) shared;
+  // Stage the block in shared memory, since it is rewritten in place.
+  buf[l] = state[k];
+  buf[l + warp_size] = state[k + warp_size];
+
+  __syncthreads();
+  // Write back interleaved: (re, im) of one amplitude become adjacent.
+  state[k + lane] = buf[l];
+  state[k + lane + 1] = buf[l + warp_size];
+}
+
+template <typename FP, unsigned warp_size = 32>
+__global__ void NormalToInternalOrderKernel(FP* state) {
+  unsigned lane = threadIdx.x % warp_size;
+  unsigned l = 2 * threadIdx.x - lane;
+  uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l;
+
+  extern __shared__ float shared[];
+  FP* buf = (FP*) shared;
+
+  buf[l] = state[k];
+  buf[l + warp_size] = state[k + warp_size];
+
+  __syncthreads();
+
+  state[k] = buf[l + lane];
+  state[k + warp_size] = buf[l + lane + 1];
+}
+
+template <typename FP, unsigned warp_size = 32>
+__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) {
+  unsigned lane = threadIdx.x % warp_size;
+  uint64_t k = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane;
+  // Lanes >= size get 0: this only matters when size < warp_size.
+  state[k] = lane < size ? v : 0;
+  state[k + warp_size] = 0;  // Imaginary part.
+}
+
+template <typename FP, unsigned warp_size = 32>
+__global__ void AddKernel(const FP* state1, FP* state2) {
+  uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+  state2[k] += state1[k];
+}
+
+template <typename FP, unsigned warp_size = 32>
+__global__ void MultiplyKernel(FP a, FP* state) {
+  uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+  state[k] *= a;
+}
+
+// Keeps the amplitudes whose index k satisfies (k & mask) == bits, scaling
+// them by r, and sets every other amplitude to zero. One amplitude per thread.
+template <typename FP, unsigned warp_size = 32>
+__global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) {
+  const uint64_t ampl = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+  // Real part of `ampl`; the imaginary part sits warp_size entries later.
+  FP* re = state + (2 * ampl - threadIdx.x % warp_size);
+  FP* im = re + warp_size;
+
+  const bool keep = (ampl & mask) == bits;
+  *re = keep ? *re * r : 0;
+  *im = keep ? *im * r : 0;
+}
+
+template <typename FP, unsigned warp_size = 32>
+__global__ void BulkSetAmplKernel(
+    uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) {
+  uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+  uint64_t k2 = 2 * k1 - threadIdx.x % warp_size;
+
+  bool set = ((k1 & mask) == bits) ^ exclude;
+
+  if (set) {
+    state[k2] = re;
+    state[k2 + warp_size] = im;
+  }
+}
+
+template <typename FP1, typename FP2, typename FP3, unsigned warp_size = 32>
+__global__ void SampleKernel(unsigned num_blocks,
+                             uint64_t n, uint64_t num_samples,
+                             const FP1* rs, const FP2* ps, const FP3* state,
+                             uint64_t *bitstrings) {
+  // Use just one thread. This can be somewhat slow.
+  if (threadIdx.x == 0) {
+    uint64_t m = 0;
+    double csum = 0;
+    // Serial scan of cumulative probabilities (`ps` is not used here).
+    for (unsigned block_id = 0; block_id < num_blocks; ++block_id) {
+      uint64_t km = n * blockDim.x;
+      uint64_t k0 = block_id * km;
+      // `m` is checked before rs[m] so that rs[num_samples] is never read.
+      for (uint64_t k = 0; k < km; ++k) {
+        uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32;
+        FP3 re = state[l];
+        FP3 im = state[l + warp_size];
+        csum += re * re + im * im;
+        while (m < num_samples && rs[m] < csum) {
+          bitstrings[m++] = k0 + k;
+        }
+      }
+    }
+  }
+}
+
+template <typename FP, unsigned warp_size = 32>
+__global__ void FindMeasuredBitsKernel(
+    uint64_t block_id, uint64_t n, double r, const FP* state, uint64_t* res) {
+  // Only thread 0 works: a serial scan, which can be somewhat slow but is
+  // more or less consistent with the CPU implementations.
+  if (threadIdx.x != 0) return;
+  const uint64_t count = n * blockDim.x;
+  const uint64_t first = block_id * count;
+  // Fallback: the last index of the block, if `r` is never exceeded.
+  uint64_t found = first + count - 1;
+  double running = 0;
+
+  for (uint64_t k = 0; k < count; ++k) {
+    const uint64_t pos = 2 * first + 64 * (k / 32) + k % 32;
+    FP re = state[pos];
+    FP im = state[pos + warp_size];
+    running += re * re + im * im;
+    if (running > r) {
+      found = first + k;
+      break;
+    }
+  }
+  *res = found;
+}
+
+}  // namespace qsim
+
+#endif  // STATESPACE_CUDA_KERNELS_H_
diff --git a/tpls/qsim/statespace_custatevec.h b/tpls/qsim/statespace_custatevec.h
new file mode 100644
index 0000000..f2f5de1
--- /dev/null
+++ b/tpls/qsim/statespace_custatevec.h
@@ -0,0 +1,376 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_CUSTATEVEC_H_
+#define STATESPACE_CUSTATEVEC_H_
+
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <type_traits>
+#include <vector>
+
+#include <cublas_v2.h>
+#include <cuComplex.h>
+#include <custatevec.h>
+
+#include "statespace.h"
+#include "util_custatevec.h"
+#include "vectorspace_cuda.h"
+
+namespace qsim {
+
+namespace detail {
+
+template <typename FP>
+__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) {
+  uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+
+  if (k < size) {
+    state[2 * k] = v;
+    state[2 * k + 1] = 0;
+  }
+}
+
+}  // namespace detail
+
+/**
+ * Object containing context and routines for cuStateVec state-vector
+ * manipulations. It is not recommended to use `GetAmpl` and `SetAmpl`.
+ */
+template <typename FP = float>
+class StateSpaceCuStateVec :
+    public StateSpace<StateSpaceCuStateVec<FP>, VectorSpaceCUDA, FP> {
+ private:
+  using Base = StateSpace<StateSpaceCuStateVec<FP>, qsim::VectorSpaceCUDA, FP>;
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  static constexpr auto is_float = std::is_same<fp_type, float>::value;
+
+  static constexpr auto kStateType = is_float ? CUDA_C_32F : CUDA_C_64F;
+  static constexpr auto kMatrixType = kStateType;
+  static constexpr auto kExpectType = CUDA_C_64F;
+  static constexpr auto kComputeType =
+      is_float ? CUSTATEVEC_COMPUTE_32F : CUSTATEVEC_COMPUTE_64F;
+  static constexpr auto kMatrixLayout = CUSTATEVEC_MATRIX_LAYOUT_ROW;
+
+  explicit StateSpaceCuStateVec(const cublasHandle_t& cublas_handle,
+                                const custatevecHandle_t& custatevec_handle)
+      : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle),
+        workspace_(nullptr), workspace_size_(0) {}
+
+  virtual ~StateSpaceCuStateVec() {
+    if (workspace_ != nullptr) {
+      ErrorCheck(cudaFree(workspace_));
+    }
+  }
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return 2 * (uint64_t{1} << num_qubits);
+  };
+
+  void InternalToNormalOrder(State& state) const {
+  }
+
+  void NormalToInternalOrder(State& state) const {
+  }
+
+  void SetAllZeros(State& state) const {
+    ErrorCheck(cudaMemset(state.get(), 0,
+                          MinSize(state.num_qubits()) * sizeof(fp_type)));
+  }
+
+  // Uniform superposition.
+  void SetStateUniform(State& state) const {
+    uint64_t size = uint64_t{1} << state.num_qubits();
+
+    unsigned threads = size < 256 ? size : 256;
+    unsigned blocks = size / threads;
+
+    fp_type v = double{1} / std::sqrt(size);
+
+    detail::SetStateUniformKernel<<<blocks, threads>>>(v, size, state.get());
+    ErrorCheck(cudaPeekAtLastError());
+  }
+
+  // |0> state.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    fp_type one[1] = {1};
+    ErrorCheck(
+        cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice));
+  }
+
+  // It is not recommended to use this function.
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    fp_type a[2];
+    auto p = state.get() + 2 * i;
+    ErrorCheck(cudaMemcpy(a, p, 2 * sizeof(fp_type), cudaMemcpyDeviceToHost));
+    return std::complex<fp_type>(a[0], a[1]);
+  }
+
+  // It is not recommended to use this function.
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    fp_type a[2] = {std::real(ampl), std::imag(ampl)};
+    auto p = state.get() + 2 * i;
+    ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice));
+  }
+
+  // It is not recommended to use this function.
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    fp_type a[2] = {re, im};
+    auto p = state.get() + 2 * i;
+    ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice));
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    // Not implemented.
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    // Not implemented.
+  }
+
+  // Does the equivalent of dest += src elementwise.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    uint64_t size = uint64_t{1} << src.num_qubits();
+
+    if (is_float) {
+      cuComplex a = {1.0, 0.0};
+      auto p1 = (const cuComplex*) src.get();
+      auto p2 = (cuComplex*) dest.get();
+      ErrorCheck(cublasCaxpy(cublas_handle_, size, &a, p1, 1, p2, 1));
+    } else {
+      cuDoubleComplex a = {1.0, 0.0};
+      auto p1 = (const cuDoubleComplex*) src.get();
+      auto p2 = (cuDoubleComplex*) dest.get();
+      ErrorCheck(cublasZaxpy(cublas_handle_, size, &a, p1, 1, p2, 1));
+    }
+
+    return true;
+  }
+
+  // Does the equivalent of state *= a elementwise.
+  void Multiply(fp_type a, State& state) const {
+    uint64_t size = uint64_t{1} << state.num_qubits();
+
+    if (is_float) {
+      float a1 = a;
+      auto p = (cuComplex*) state.get();
+      ErrorCheck(cublasCsscal(cublas_handle_, size, &a1, p, 1));
+    } else {
+      double a1 = a;
+      auto p = (cuDoubleComplex*) state.get();
+      ErrorCheck(cublasZdscal(cublas_handle_, size, &a1, p, 1));
+    }
+  }
+
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    uint64_t size = uint64_t{1} << state1.num_qubits();
+
+    if (is_float) {
+      cuComplex result;
+      auto p1 = (const cuComplex*) state1.get();
+      auto p2 = (const cuComplex*) state2.get();
+      ErrorCheck(cublasCdotc(cublas_handle_, size, p1, 1, p2, 1, &result));
+      return {cuCrealf(result), cuCimagf(result)};
+    } else {
+      cuDoubleComplex result;
+      auto p1 = (const cuDoubleComplex*) state1.get();
+      auto p2 = (const cuDoubleComplex*) state2.get();
+      ErrorCheck(cublasZdotc(cublas_handle_, size, p1, 1, p2, 1, &result));
+      return {cuCreal(result), cuCimag(result)};
+    }
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    return std::real(InnerProduct(state1, state2));
+  }
+
+  double Norm(const State& state) const {
+    uint64_t size = uint64_t{1} << state.num_qubits();
+
+    if (is_float) {
+      float result;
+      auto p = (const cuComplex*) state.get();
+      ErrorCheck(cublasScnrm2(cublas_handle_, size, p, 1, &result));
+      return result * result;
+    } else {
+      double result;
+      auto p = (const cuDoubleComplex*) state.get();
+      ErrorCheck(cublasDznrm2(cublas_handle_, size, p, 1, &result));
+      return result * result;
+    }
+  }
+
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+
+    if (num_samples > 0) {
+      auto rs = GenerateRandomValues<double>(num_samples, seed, 1.0);
+
+      size_t workspace_size;
+      custatevecSamplerDescriptor_t sampler;
+
+      ErrorCheck(custatevecSamplerCreate(
+                     custatevec_handle_, state.get(), kStateType,
+                     state.num_qubits(), &sampler, num_samples,
+                     &workspace_size));
+
+      AllocWorkSpace(workspace_size);
+
+      ErrorCheck(custatevecSamplerPreprocess(
+                     custatevec_handle_, sampler, workspace_, workspace_size));
+
+      std::vector<custatevecIndex_t> bitstrings0(num_samples);
+      std::vector<int32_t> bitordering;
+
+      bitordering.reserve(state.num_qubits());
+      for (unsigned i = 0; i < state.num_qubits(); ++i) {
+        bitordering.push_back(i);
+      }
+
+      ErrorCheck(custatevecSamplerSample(
+                     custatevec_handle_, sampler, bitstrings0.data(),
+                     bitordering.data(), state.num_qubits(), rs.data(),
+                     num_samples, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER));
+
+      // Release the sampler descriptor now that sampling is done.
+      ErrorCheck(custatevecSamplerDestroy(sampler));
+      // Element-wise copy converts custatevecIndex_t to uint64_t.
+      bitstrings.assign(bitstrings0.begin(), bitstrings0.end());
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  template <typename RGen>
+  MeasurementResult Measure(const std::vector<unsigned>& qubits,
+                            RGen& rgen, State& state,
+                            bool no_collapse = false) const {
+    auto r = RandomValue(rgen, 1.0);
+
+    MeasurementResult result;
+
+    result.valid = true;
+    result.mask = 0;
+    result.bits = 0;
+    result.bitstring.resize(qubits.size(), 0);
+
+    for (auto q : qubits) {
+      if (q >= state.num_qubits()) {
+        result.valid = false;
+        return result;
+      }
+
+      result.mask |= uint64_t{1} << q;
+    }
+
+    auto collapse = no_collapse ?
+        CUSTATEVEC_COLLAPSE_NONE : CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO;
+
+    ErrorCheck(custatevecBatchMeasure(
+                   custatevec_handle_, state.get(), kStateType,
+                   state.num_qubits(), (int*) result.bitstring.data(),
+                   (int*) qubits.data(), qubits.size(), r, collapse));
+    // Widen to 64 bits before shifting so qubit indices >= 32 are handled.
+    for (std::size_t i = 0; i < result.bitstring.size(); ++i) {
+      result.bits |= static_cast<uint64_t>(result.bitstring[i]) << qubits[i];
+    }
+
+    return result;
+  }
+
+  template <typename RGen>
+  MeasurementResult VirtualMeasure(const std::vector<unsigned>& qubits,
+                                   RGen& rgen, const State& state) const {
+    return Measure(qubits, rgen, const_cast<State&>(state), true);
+  }
+
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    unsigned count = 0;
+
+    std::vector<int> bitstring;
+    std::vector<int> bitordering;
+
+    bitstring.reserve(state.num_qubits());
+    bitordering.reserve(state.num_qubits());
+
+    for (unsigned i = 0; i < state.num_qubits(); ++i) {
+      if (((mr.mask >> i) & 1) != 0) {
+        bitstring.push_back((mr.bits >> i) & 1);
+        bitordering.push_back(i);
+        ++count;
+      }
+    }
+
+    ErrorCheck(custatevecCollapseByBitString(
+                   custatevec_handle_, state.get(), kStateType,
+                   state.num_qubits(), bitstring.data(), bitordering.data(),
+                   count, 1.0));
+
+    // TODO: do we need the following?
+    double norm = Norm(state);
+    Multiply(1.0 / std::sqrt(norm), state);
+  }
+
+ private:
+  void* AllocWorkSpace(size_t size) const {
+    if (size > workspace_size_) {
+      if (workspace_ != nullptr) {
+        ErrorCheck(cudaFree(workspace_));
+      }
+
+      ErrorCheck(cudaMalloc(const_cast<void**>(&workspace_), size));
+
+      const_cast<size_t&>(workspace_size_) = size;
+    }
+
+    return workspace_;
+  }
+
+  const cublasHandle_t cublas_handle_;
+  const custatevecHandle_t custatevec_handle_;
+
+  void* workspace_;
+  size_t workspace_size_;
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_CUSTATEVEC_H_
diff --git a/tpls/qsim/statespace_sse.h b/tpls/qsim/statespace_sse.h
new file mode 100644
index 0000000..cf41a09
--- /dev/null
+++ b/tpls/qsim/statespace_sse.h
@@ -0,0 +1,462 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STATESPACE_SSE_H_
+#define STATESPACE_SSE_H_
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <functional>
+
+#include "statespace.h"
+#include "util.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace detail {
+
+inline __m128i GetZeroMaskSSE(uint64_t i, uint64_t mask, uint64_t bits) {
+  __m128i s1 = _mm_set_epi64x(i + 2, i + 0);
+  __m128i s2 = _mm_set_epi64x(i + 3, i + 1);
+  __m128i ma = _mm_set1_epi64x(mask);
+  __m128i bi = _mm_set1_epi64x(bits);
+
+  s1 = _mm_and_si128(s1, ma);
+  s2 = _mm_and_si128(s2, ma);
+
+  s1 = _mm_cmpeq_epi64(s1, bi);
+  s2 = _mm_cmpeq_epi64(s2, bi);
+
+  return _mm_blend_epi16(s1, s2, 204);  // 11001100
+}
+
+inline double HorizontalSumSSE(__m128 s) {
+  __m128 ss = _mm_movehdup_ps(s);
+  __m128 s1 = _mm_add_ps(s, ss);
+
+  return _mm_cvtss_f32(_mm_add_ss(s1, _mm_movehl_ps(ss, s1)));
+}
+
+}  // namespace detail
+
+/**
+ * Object containing context and routines for SSE state-vector manipulations.
+ * State is a vectorized sequence of four real components followed by four
+ * imaginary components. Four single-precision floating numbers can be loaded
+ * into an SSE register.
+ */
+template <typename For>
+class StateSpaceSSE :
+    public StateSpace<StateSpaceSSE<For>, VectorSpace, For, float> {
+ private:
+  using Base = StateSpace<StateSpaceSSE<For>, qsim::VectorSpace, For, float>;
+
+ public:
+  using State = typename Base::State;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit StateSpaceSSE(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinSize(unsigned num_qubits) {
+    return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits));
+  };
+
+  void InternalToNormalOrder(State& state) const {
+    if (state.num_qubits() == 1) {
+      auto s = state.get();
+
+      s[2] = s[1];
+      s[1] = s[4];
+      s[3] = s[5];
+
+      for (uint64_t i = 4; i < 8; ++i) {
+        s[i] = 0;
+      }
+    } else {
+      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
+        auto s = p + 8 * i;
+
+        fp_type re[3];
+        fp_type im[3];
+
+        for (uint64_t i = 0; i < 3; ++i) {
+          re[i] = s[i + 1];
+          im[i] = s[i + 4];
+        }
+
+        for (uint64_t i = 0; i < 3; ++i) {
+          s[2 * i + 1] = im[i];
+          s[2 * i + 2] = re[i];
+        }
+      };
+
+      Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get());
+    }
+  }
+
+  void NormalToInternalOrder(State& state) const {
+    if (state.num_qubits() == 1) {
+      auto s = state.get();
+
+      s[4] = s[1];
+      s[1] = s[2];
+      s[5] = s[3];
+
+      s[2] = 0;
+      s[3] = 0;
+      s[6] = 0;
+      s[7] = 0;
+    } else {
+      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
+        auto s = p + 8 * i;
+
+        fp_type re[3];
+        fp_type im[3];
+
+        for (uint64_t i = 0; i < 3; ++i) {
+          im[i] = s[2 * i + 1];
+          re[i] = s[2 * i + 2];
+        }
+
+        for (uint64_t i = 0; i < 3; ++i) {
+          s[i + 1] = re[i];
+          s[i + 4] = im[i];
+        }
+      };
+
+      Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get());
+    }
+  }
+
+  void SetAllZeros(State& state) const {
+    __m128 val0 = _mm_setzero_ps();
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) {
+      _mm_store_ps(p + 8 * i, val0);
+      _mm_store_ps(p + 8 * i + 4, val0);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get());
+  }
+
+  // Uniform superposition.
+  void SetStateUniform(State& state) const {
+    __m128 val0 = _mm_setzero_ps();
+    __m128 valu;
+
+    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
+
+    if (state.num_qubits() == 1) {
+      valu = _mm_set_ps(0, 0, v, v);
+    } else {
+      valu = _mm_set1_ps(v);
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                __m128 val0, __m128 valu, fp_type* p) {
+      _mm_store_ps(p + 8 * i, valu);
+      _mm_store_ps(p + 8 * i + 4, val0);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, valu, state.get());
+  }
+
+  // |0> state.
+  void SetStateZero(State& state) const {
+    SetAllZeros(state);
+    state.get()[0] = 1;
+  }
+
+  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
+    uint64_t p = (8 * (i / 4)) + (i % 4);
+    return std::complex<fp_type>(state.get()[p], state.get()[p + 4]);
+  }
+
+  static void SetAmpl(
+      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
+    uint64_t p = (8 * (i / 4)) + (i % 4);
+    state.get()[p] = std::real(ampl);
+    state.get()[p + 4] = std::imag(ampl);
+  }
+
+  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
+    uint64_t p = (8 * (i / 4)) + (i % 4);
+    state.get()[p] = re;
+    state.get()[p + 4] = im;
+  }
+
+  // Sets state[i] = val where (i & mask) == bits.
+  // If `exclude` is true then the criterion becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
+                   const std::complex<fp_type>& val,
+                   bool exclude = false) const {
+    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
+  }
+
+  // Sets state[i] = complex(re, im) where (i & mask) == bits.
+  // if `exclude` is true then the criteria becomes (i & mask) != bits.
+  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
+                   fp_type im, bool exclude = false) const {
+    __m128 re_reg = _mm_set1_ps(re);
+    __m128 im_reg = _mm_set1_ps(im);
+    __m128i exclude_reg = _mm_setzero_si128();
+    if (exclude) {
+      exclude_reg = _mm_cmpeq_epi32(exclude_reg, exclude_reg);
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
+                uint64_t bitsv, __m128 re_n, __m128 im_n, __m128i exclude_n,
+                fp_type* p) {
+      __m128 ml = _mm_castsi128_ps(_mm_xor_si128(
+          detail::GetZeroMaskSSE(4 * i, maskv, bitsv), exclude_n));
+
+      __m128 re = _mm_load_ps(p + 8 * i);
+      __m128 im = _mm_load_ps(p + 8 * i + 4);
+
+      re = _mm_blendv_ps(re, re_n, ml);
+      im = _mm_blendv_ps(im, im_n, ml);
+
+      _mm_store_ps(p + 8 * i, re);
+      _mm_store_ps(p + 8 * i + 4, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, mask, bits, re_reg,
+                   im_reg, exclude_reg, state.get());
+  }
+
+  // Does the equivalent of dest += src elementwise.
+  bool Add(const State& src, State& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, fp_type* p2) {
+      __m128 re1 = _mm_load_ps(p1 + 8 * i);
+      __m128 im1 = _mm_load_ps(p1 + 8 * i + 4);
+      __m128 re2 = _mm_load_ps(p2 + 8 * i);
+      __m128 im2 = _mm_load_ps(p2 + 8 * i + 4);
+
+      _mm_store_ps(p2 + 8 * i, _mm_add_ps(re1, re2));
+      _mm_store_ps(p2 + 8 * i + 4, _mm_add_ps(im1, im2));
+    };
+
+    Base::for_.Run(MinSize(src.num_qubits()) / 8, f, src.get(), dest.get());
+
+    return true;
+  }
+
+  // Does the equivalent of state *= a elementwise.
+  void Multiply(fp_type a, State& state) const {
+    __m128 r = _mm_set1_ps(a);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m128 r, fp_type* p) {
+      __m128 re = _mm_load_ps(p + 8 * i);
+      __m128 im = _mm_load_ps(p + 8 * i + 4);
+
+      re = _mm_mul_ps(re, r);
+      im = _mm_mul_ps(im, r);
+
+      _mm_store_ps(p + 8 * i, re);
+      _mm_store_ps(p + 8 * i + 4, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, r, state.get());
+  }
+
+  std::complex<double> InnerProduct(
+      const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
+      __m128 re1 = _mm_load_ps(p1 + 8 * i);
+      __m128 im1 = _mm_load_ps(p1 + 8 * i + 4);
+      __m128 re2 = _mm_load_ps(p2 + 8 * i);
+      __m128 im2 = _mm_load_ps(p2 + 8 * i + 4);
+
+      __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2));
+      __m128 ip_im = _mm_sub_ps(_mm_mul_ps(re1, im2), _mm_mul_ps(im1, re2));
+
+      double re = detail::HorizontalSumSSE(ip_re);
+      double im = detail::HorizontalSumSSE(ip_im);
+
+      return std::complex<double>{re, im};
+    };
+
+    using Op = std::plus<std::complex<double>>;
+    return Base::for_.RunReduce(
+        MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get());
+  }
+
+  double RealInnerProduct(const State& state1, const State& state2) const {
+    if (state1.num_qubits() != state2.num_qubits()) {
+      return std::nan("");
+    }
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p1, const fp_type* p2) -> double {
+      __m128 re1 = _mm_load_ps(p1 + 8 * i);
+      __m128 im1 = _mm_load_ps(p1 + 8 * i + 4);
+      __m128 re2 = _mm_load_ps(p2 + 8 * i);
+      __m128 im2 = _mm_load_ps(p2 + 8 * i + 4);
+
+      __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2));
+
+      return detail::HorizontalSumSSE(ip_re);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduce(
+        MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get());
+  }
+
+  template <typename DistrRealType = double>
+  std::vector<uint64_t> Sample(
+      const State& state, uint64_t num_samples, unsigned seed) const {
+    std::vector<uint64_t> bitstrings;
+
+    if (num_samples > 0) {
+      double norm = 0;
+      uint64_t size = MinSize(state.num_qubits()) / 8;
+      const fp_type* p = state.get();
+
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 4; ++j) {
+          double re = p[8 * k + j];
+          double im = p[8 * k + 4 + j];
+          norm += re * re + im * im;
+        }
+      }
+
+      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
+
+      uint64_t m = 0;
+      double csum = 0;
+      bitstrings.reserve(num_samples);
+
+      for (uint64_t k = 0; k < size; ++k) {
+        for (unsigned j = 0; j < 4; ++j) {
+          double re = p[8 * k + j];
+          double im = p[8 * k + 4 + j];
+          csum += re * re + im * im;
+          while (m < num_samples && rs[m] < csum) {  // bound check first
+            bitstrings.emplace_back(4 * k + j);
+            ++m;
+          }
+        }
+      }
+
+      for (; m < num_samples; ++m) {
+        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
+      }
+    }
+
+    return bitstrings;
+  }
+
+  using MeasurementResult = typename Base::MeasurementResult;
+
+  void Collapse(const MeasurementResult& mr, State& state) const {
+    __m128 zero = _mm_set1_ps(0);
+
+    auto f1 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask,
+                 uint64_t bits, __m128 zero, const fp_type* p) -> double {
+      __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits));
+
+      __m128 re = _mm_load_ps(p + 8 * i);
+      __m128 im = _mm_load_ps(p + 8 * i + 4);
+      __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im));
+
+      s1 = _mm_blendv_ps(zero, s1, ml);
+
+      return detail::HorizontalSumSSE(s1);
+    };
+
+    using Op = std::plus<double>;
+    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 8, f1,
+                                       Op(), mr.mask, mr.bits, zero,
+                                       state.get());
+
+    __m128 renorm = _mm_set1_ps(1.0 / std::sqrt(norm));
+
+    auto f2 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask,
+                 uint64_t bits, __m128 renorm, __m128 zero, fp_type* p) {
+      __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits));
+
+      __m128 re = _mm_load_ps(p + 8 * i);
+      __m128 im = _mm_load_ps(p + 8 * i + 4);
+
+      re = _mm_blendv_ps(zero, _mm_mul_ps(re, renorm), ml);
+      im = _mm_blendv_ps(zero, _mm_mul_ps(im, renorm), ml);
+
+      _mm_store_ps(p + 8 * i, re);
+      _mm_store_ps(p + 8 * i + 4, im);
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 8, f2,
+                   mr.mask, mr.bits, renorm, zero, state.get());
+  }
+
+  std::vector<double> PartialNorms(const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* p) -> double {
+      __m128 re = _mm_load_ps(p + 8 * i);
+      __m128 im = _mm_load_ps(p + 8 * i + 4);
+      __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im));
+
+      return detail::HorizontalSumSSE(s1);
+    };
+
+    using Op = std::plus<double>;
+    return Base::for_.RunReduceP(
+        MinSize(state.num_qubits()) / 8, f, Op(), state.get());
+  }
+
+  uint64_t FindMeasuredBits(
+      unsigned m, double r, uint64_t mask, const State& state) const {
+    double csum = 0;
+
+    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 8, m);
+    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 8, m);
+
+    const fp_type* p = state.get();
+
+    for (uint64_t k = k0; k < k1; ++k) {
+      for (uint64_t j = 0; j < 4; ++j) {
+        auto re = p[8 * k + j];
+        auto im = p[8 * k + 4 + j];
+        csum += re * re + im * im;
+        if (r < csum) {
+          return (4 * k + j) & mask;
+        }
+      }
+    }
+
+    // Return the last bitstring in the unlikely case of underflow.
+    return (4 * k1 - 1) & mask;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // STATESPACE_SSE_H_
diff --git a/tpls/qsim/umux.h b/tpls/qsim/umux.h
new file mode 100644
index 0000000..83b951b
--- /dev/null
+++ b/tpls/qsim/umux.h
@@ -0,0 +1,52 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UMUX_H_
+#define UMUX_H_
+
+#ifdef __AVX512F__
+# include "unitary_calculator_avx512.h"
+  namespace qsim {
+  namespace unitary {
+    template <typename For>
+    using UnitaryCalculator = UnitaryCalculatorAVX512<For>;
+  }
+  }
+#elif __AVX2__
+# include "unitary_calculator_avx.h"
+  namespace qsim {
+  namespace unitary {
+    template <typename For>
+    using UnitaryCalculator = UnitaryCalculatorAVX<For>;
+  }
+  }
+#elif __SSE4_1__
+# include "unitary_calculator_sse.h"
+  namespace qsim {
+  namespace unitary {
+    template <typename For>
+    using UnitaryCalculator = UnitaryCalculatorSSE<For>;
+  }
+  }
+#else
+# include "unitary_calculator_basic.h"
+  namespace qsim {
+  namespace unitary {
+    template <typename For>
+    using UnitaryCalculator = UnitaryCalculatorBasic<For>;
+  }
+  }
+#endif
+
+#endif  // UMUX_H_
diff --git a/tpls/qsim/unitary_calculator_avx.h b/tpls/qsim/unitary_calculator_avx.h
new file mode 100644
index 0000000..5e566ca
--- /dev/null
+++ b/tpls/qsim/unitary_calculator_avx.h
@@ -0,0 +1,1028 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARY_CALCULATOR_AVX_H_
+#define UNITARY_CALCULATOR_AVX_H_
+
+#include <immintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "unitaryspace_avx.h"
+
+namespace qsim {
+namespace unitary {
+
+/**
+ * Quantum circuit unitary calculator with AVX vectorization.
+ */
+template <typename For>
+class UnitaryCalculatorAVX final : public SimulatorBase {
+ public:
+  using UnitarySpace = UnitarySpaceAVX<For>;
+  using Unitary = typename UnitarySpace::Unitary;
+  using fp_type = typename UnitarySpace::fp_type;
+
+  using StateSpace = UnitarySpace;
+  using State = Unitary;
+
+  template <typename... ForArgs>  // Constructs the For member (for_) from all given arguments.
+  explicit UnitaryCalculatorAVX(ForArgs&&... args) : for_(args...) {}  // NOTE(review): args are passed on as lvalues (no std::forward).
+
+  /**
+   * Applies a gate using AVX instructions; gates on 1 to 6 qubits are handled.
+   * @param qs Indices of the qubits affected by this gate (ascending order).
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Qubits > 2 are "high", qubits <= 2 are "low": ApplyGateH<H> if all are high, else ApplyGateL<H, L>.
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 2) {
+        ApplyGateH<1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 2) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 2) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 2) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      } else {  // qs[0..2] are all <= 2; with ascending qubits at most three can be low, so L = 3.
+        ApplyGateL<1, 3>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 2) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<2, 3>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 2) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 2) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else if (qs[2] > 2) {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<3, 3>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: any other number of qubits leaves state untouched, without an error.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate using AVX instructions (1 to 4 target qubits).
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
+    // With no control qubits this is an ordinary gate.
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+    // Pick by how many targets are <= 2 (as in ApplyGate), then by whether the lowest control cqs[0] is > 2.
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 2) {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 2) {
+          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: any other number of target qubits leaves state untouched, without an error.
+      break;
+    }
+  }
+
+  /**
+   * @return The size of SIMD register if applicable: 8, the number of floats in one __m256.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 8;
+  }
+
+ private:
+
+#ifdef __BMI2__
+
+  template <unsigned H>  // BMI2 version. H target qubits; ApplyGate calls this when all of them are > 2.
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // One work item i; n and m come from for_.Run and are not used here.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+
+      uint64_t r = i % size;
+      uint64_t s = i / size;
+      // s selects a row (row_size floats apart); the bits of r are deposited at the bit positions set in imaskh.
+      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);  // 8 real parts
+        is[k] = _mm256_load_ps(p0 + p + 8);  // the 8 imaginary parts stored right after them
+      }
+      // Output k: (rn + i*in) = sum over l of (v[j] + i*v[j + 1]) * (rs[l] + i*is[l]); v is read row by row.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    auto m = GetMasks1<H, 3>(qs);
+    // size = 2^(num_qubits - 3 - H) work items per row (at least 1), size2 = 2^num_qubits rows.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             matrix, m.imaskh, m.qmaskh, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // BMI2 version. ApplyGate passes H = targets > 2, L = targets <= 2.
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // One work item i; n and m come from for_.Run and are not used here.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+
+      uint64_t r = i % size;
+      uint64_t s = i / size;
+      // s selects a row (row_size floats apart); the bits of r are deposited at the bit positions set in imaskh.
+      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm256_load_ps(p0 + p);  // 8 real parts
+        is[k2] = _mm256_load_ps(p0 + p + 8);  // the 8 imaginary parts stored right after them
+        // The other lsize - 1 slots are lane permutations of the loaded pair, one per idx entry.
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // Output k: complex sum over l < gsize of (w[j] + i*w[j + 1]) * (rs[l] + i*is[l]), lane by lane.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+    // idx gets the lane permutations; w is filled by FillMatrix (defined elsewhere) from matrix.
+    auto m = GetMasks2<H, L, 3>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // BMI2 version. Called when all H targets and the lowest control are > 2.
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // One work item i; n and m come from for_.Run and are not used here.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+
+      uint64_t r = i % size;
+      uint64_t s = i / size;
+      // The control bits are forced to cvalsh, so only offsets with matching controls are touched.
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);  // 8 real parts
+        is[k] = _mm256_load_ps(p0 + p + 8);  // the 8 imaginary parts stored right after them
+      }
+      // Output k: (rn + i*in) = sum over l of (v[j] + i*v[j + 1]) * (rs[l] + i*is[l]); v is read row by row.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    auto m = GetMasks3<H, 3>(state.num_qubits(), qs, cqs, cvals);
+    // Control bits are fixed rather than iterated, so each control qubit halves the work items per row.
+    unsigned k = 3 + H + cqs.size();
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // BMI2 version. Called when all H targets are > 2 and the lowest control is <= 2.
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // One work item i; n and m come from for_.Run and are not used here.
+      __m256 rn, in;
+      __m256 rs[hsize], is[hsize];
+
+      uint64_t r = i % size;
+      uint64_t s = i / size;
+      // The high control bits are forced to cvalsh, so only offsets with matching high controls are touched.
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k] = _mm256_load_ps(p0 + p);  // 8 real parts
+        is[k] = _mm256_load_ps(p0 + p + 8);  // the 8 imaginary parts stored right after them
+      }
+      // Output k: complex sum over l of (w[j] + i*w[j + 1]) * (rs[l] + i*is[l]), lane by lane.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    __m256 w[1 << (1 + 2 * H)];
+    // w is filled by FillControlledMatrixH (defined elsewhere) from matrix and the low-control masks.
+    auto m = GetMasks4<H, 3>(state.num_qubits(), qs, cqs, cvals);
+    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 3 + H + cqs.size() - m.cl;  // m.cl: presumably the number of controls <= 2 - confirm in GetMasks4.
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // BMI2 version. CH is true when the lowest control is > 2.
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                const __m256i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // One work item i; n and m come from for_.Run and are not used here.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+
+      uint64_t r = i % size;
+      uint64_t s = i / size;
+      // The high control bits are forced to cvalsh, so only offsets with matching high controls are touched.
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        rs[k2] = _mm256_load_ps(p0 + p);  // 8 real parts
+        is[k2] = _mm256_load_ps(p0 + p + 8);  // the 8 imaginary parts stored right after them
+        // The other lsize - 1 slots are lane permutations of the loaded pair, one per idx entry.
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // Output k: complex sum over l < gsize of (w[j] + i*w[j + 1]) * (rs[l] + i*is[l]), lane by lane.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);
+
+        _mm256_store_ps(p0 + p, rn);
+        _mm256_store_ps(p0 + p + 8, in);
+      }
+    };
+
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // The two branches differ in the mask helper, in how w is filled and in the work-item count.
+    if (CH) {
+      auto m = GetMasks5<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned k = 3 + H + cqs.size();
+      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
+               m.cvalsh, idx, size, raw_size, state.get());
+    } else {
+      auto m = GetMasks6<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 3>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned k = 3 + H + cqs.size() - m.cl;  // m.cl: presumably the number of controls <= 2 - confirm in GetMasks6.
+      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+      uint64_t size = uint64_t{1} << n;
+
+      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
+               m.cvalsh, idx, size, raw_size, state.get());
+    }
+  }
+
+#else  // __BMI2__
+
+  template <unsigned H>  // Version without BMI2. H target qubits; ApplyGate calls this when all of them are > 2.
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // One work item i; n and m come from for_.Run and are not used here.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+
+      uint64_t r = 8 * (i % size);
+      uint64_t s = i / size;
+      // t collects the bits of r: mask ms[j] is applied after j doublings (ms and xss are filled by FillIndices).
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      auto p0 = rstate + row_size * s + 2 * t;  // s selects a row (row_size floats apart)
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);  // 8 real parts
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);  // the 8 imaginary parts stored right after them
+      }
+      // Output k: (rn + i*in) = sum over l of (v[j] + i*v[j + 1]) * (rs[l] + i*is[l]); v is read row by row.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // size = 2^(num_qubits - 3 - H) work items per row (at least 1), size2 = 2^num_qubits rows.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // Version without BMI2. ApplyGate passes H = targets > 2, L = targets <= 2.
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // One work item i; n and m come from for_.Run and are not used here.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+
+      uint64_t r = 8 * (i % size);
+      uint64_t s = i / size;
+      // t collects the bits of r: mask ms[j] is applied after j doublings (ms and xss are filled by FillIndices).
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      auto p0 = rstate + row_size * s + 2 * t;  // s selects a row (row_size floats apart)
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        rs[k2] = _mm256_load_ps(p0 + xss[k]);  // 8 real parts
+        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);  // the 8 imaginary parts stored right after them
+        // The other lsize - 1 slots are lane permutations of the loaded pair, one per idx entry.
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // Output k: complex sum over l < gsize of (w[j] + i*w[j + 1]) * (rs[l] + i*is[l]), lane by lane.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks11<L>(qs);
+    // idx gets the lane permutations; w is filled by FillMatrix (defined elsewhere) from matrix.
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // Version without BMI2. Called when all H targets and the lowest control are > 2.
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // One work item i; n and m come from for_.Run and are not used here.
+      __m256 ru, iu, rn, in;
+      __m256 rs[hsize], is[hsize];
+
+      uint64_t r = 8 * (i % size);
+      uint64_t s = i / size;
+      // t collects the bits of r: mask ms[j] is applied after j doublings (ms and xss are filled by FillIndices).
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      if ((t & cmaskh) != cvalsh) return;  // skip work items whose control bits differ from cvalsh
+
+      auto p0 = rstate + row_size * s + 2 * t;  // s selects a row (row_size floats apart)
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);  // 8 real parts
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);  // the 8 imaginary parts stored right after them
+      }
+      // Output k: (rn + i*in) = sum over l of (v[j] + i*v[j + 1]) * (rs[l] + i*is[l]); v is read row by row.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm256_set1_ps(v[j]);
+        iu = _mm256_set1_ps(v[j + 1]);
+        rn = _mm256_mul_ps(rs[0], ru);
+        in = _mm256_mul_ps(rs[0], iu);
+        rn = _mm256_fnmadd_ps(is[0], iu, rn);
+        in = _mm256_fmadd_ps(is[0], ru, in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm256_set1_ps(v[j]);
+          iu = _mm256_set1_ps(v[j + 1]);
+          rn = _mm256_fmadd_ps(rs[l], ru, rn);
+          in = _mm256_fmadd_ps(rs[l], iu, in);
+          rn = _mm256_fnmadd_ps(is[l], iu, rn);
+          in = _mm256_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    // Every work item is visited (the count ignores cqs); non-matching ones return early in f.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // Version without BMI2. Called when all H targets are > 2 and the lowest control is <= 2.
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+      // One work item i; n and m come from for_.Run and are not used here.
+      __m256 rn, in;
+      __m256 rs[hsize], is[hsize];
+
+      uint64_t r = 8 * (i % size);
+      uint64_t s = i / size;
+      // t collects the bits of r: mask ms[j] is applied after j doublings (ms and xss are filled by FillIndices).
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      if ((t & cmaskh) != cvalsh) return;  // skip work items whose high control bits differ from cvalsh
+
+      auto p0 = rstate + row_size * s + 2 * t;  // s selects a row (row_size floats apart)
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm256_load_ps(p0 + xss[k]);  // 8 real parts
+        is[k] = _mm256_load_ps(p0 + xss[k] + 8);  // the 8 imaginary parts stored right after them
+      }
+      // Output k: complex sum over l of (w[j] + i*w[j + 1]) * (rs[l] + i*is[l]), lane by lane.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256 w[1 << (1 + 2 * H)];
+    // w is filled by FillControlledMatrixH (defined elsewhere) from matrix and the low-control masks.
+    auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // Version without BMI2. CH is true when the lowest control is > 2.
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, const __m256i* idx, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+      // One work item i; n and m come from for_.Run and are not used here.
+      __m256 rn, in;
+      __m256 rs[gsize], is[gsize];
+
+      uint64_t r = 8 * (i % size);
+      uint64_t s = i / size;
+      // t collects the bits of r: mask ms[j] is applied after j doublings (ms and xss are filled by FillIndices).
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      if ((t & cmaskh) != cvalsh) return;  // skip work items whose high control bits differ from cvalsh
+
+      auto p0 = rstate + row_size * s + 2 * t;  // s selects a row (row_size floats apart)
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+        // Load 8 real parts and the 8 imaginary parts after them; the other slots are lane permutations.
+        rs[k2] = _mm256_load_ps(p0 + xss[k]);
+        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
+
+        for (unsigned l = 1; l < lsize; ++l) {
+          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
+          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
+        }
+      }
+      // Output k: complex sum over l < gsize of (w[j] + i*w[j + 1]) * (rs[l] + i*is[l]), lane by lane.
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm256_mul_ps(rs[0], w[j]);
+        in = _mm256_mul_ps(rs[0], w[j + 1]);
+        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
+        in = _mm256_fmadd_ps(is[0], w[j], in);
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
+          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm256_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        _mm256_store_ps(p0 + xss[k], rn);
+        _mm256_store_ps(p0 + xss[k] + 8, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m256i idx[1 << L];
+    __m256 w[1 << (1 + 2 * H + L)];
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    // Every work item is visited (the count ignores cqs); non-matching ones return early in f.
+    unsigned k = 3 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+    uint64_t size2 = uint64_t{1} << state.num_qubits();
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+    // The two branches differ only in the mask helper and in how w is filled.
+    if (CH) {
+      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size * size2, f, w, ms, xss, m.cvalsh,
+               m.cmaskh, idx, size, raw_size, state.get());
+    } else {
+      auto m = GetMasks10<L, 3>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 3>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size * size2, f, w, ms, xss, m.cvalsh,
+               m.cmaskh, idx, size, raw_size, state.get());
+    }
+  }
+
+#endif  // __BMI2__
+
+  template <unsigned L>  // Fills idx[0 .. 2^L - 2] with the lane permutations used with _mm256_permutevar8x32_ps.
+  static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) {
+    constexpr unsigned lsize = 1 << L;
+    // Entry i is built with MaskedAdd (defined elsewhere) from lane j, the value i + 1 and qmaskl.
+    for (unsigned i = 0; i < lsize - 1; ++i) {
+      unsigned p[8];
+
+      for (unsigned j = 0; j < 8; ++j) {
+        p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));  // -1 ^ qmaskl == ~qmaskl: keeps the bits of j outside qmaskl
+      }
+      // _mm256_set_epi32 takes the highest element first, so p[0] ends up in element 0.
+      idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]);
+    }
+  }
+
+  For for_;
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARY_CALCULATOR_AVX_H_
diff --git a/tpls/qsim/unitary_calculator_avx512.h b/tpls/qsim/unitary_calculator_avx512.h
new file mode 100644
index 0000000..8105367
--- /dev/null
+++ b/tpls/qsim/unitary_calculator_avx512.h
@@ -0,0 +1,644 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARY_CALCULATOR_AVX512_H_
+#define UNITARY_CALCULATOR_AVX512_H_
+
+#include <immintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "unitaryspace_avx512.h"
+
+namespace qsim {
+namespace unitary {
+
+/**
+ * Quantum circuit unitary calculator with AVX512 vectorization.
+ */
+template <typename For>
+class UnitaryCalculatorAVX512 final : public SimulatorBase {
+ public:
+  using UnitarySpace = UnitarySpaceAVX512<For>;
+  using Unitary = typename UnitarySpace::Unitary;
+  using fp_type = typename UnitarySpace::fp_type;
+
+  using StateSpace = UnitarySpace;
+  using State = Unitary;
+
+  template <typename... ForArgs>
+  explicit UnitaryCalculatorAVX512(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate acting on 1 to 6 qubits using AVX512 instructions.
+   * @param qs Indices of the qubits affected by this gate, in ascending order.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < ... . Dispatch on how many targets are <= 3.
+
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 3) {
+        ApplyGateH<1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 3>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<1, 3>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 4>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 3) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<2, 3>(qs, matrix, state);
+      } else {
+        ApplyGateL<1, 4>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 3) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 3) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else if (qs[2] > 3) {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      } else if (qs[3] > 3) {
+        ApplyGateL<3, 3>(qs, matrix, state);
+      } else {
+        ApplyGateL<2, 4>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: any other number of qubits is silently ignored.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate (1 to 4 targets) using AVX512 instructions.
+   * @param qs Indices of the qubits affected by this gate, in ascending order.
+   * @param cqs Indices of control qubits, in ascending order.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < ... , so cqs[0] > 3 means all controls > 3.
+
+    if (cqs.size() == 0) {  // no controls: same as an uncontrolled gate
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[2] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[3] > 3) {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 3) {
+          ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: any other number of targets is silently ignored.
+      break;
+    }
+  }
+
+  /**
+   * @return The number of floats held by one SIMD register (16 for __m512).
+   */
+  static unsigned SIMDRegisterSize() {
+    return 16;
+  }
+
+ private:
+  template <unsigned H>  // H target qubits, all with index > 3 (see ApplyGate)
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // v holds hsize x hsize entries
+
+      __m512 ru, iu, rn, in;
+      __m512 rs[hsize], is[hsize];  // real / imaginary parts of the inputs
+
+      uint64_t r = i % size;  // position within one row
+      uint64_t s = i / size;  // row number (row stride is row_size)
+
+      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
+
+      for (unsigned k = 0; k < hsize; ++k) {  // load every input first
+        uint64_t p = _pdep_u64(k, qmaskh);  // bits of k placed per qmaskh
+
+        rs[k] = _mm512_load_ps(p0 + p);
+        is[k] = _mm512_load_ps(p0 + p + 16);  // imaginary parts follow reals
+      }
+
+      uint64_t j = 0;  // index into v; each entry is a (re, im) pair
+
+      for (unsigned k = 0; k < hsize; ++k) {  // output k = row k of v * input
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);  // rn -= is[0] * iu
+        in = _mm512_fmadd_ps(is[0], ru, in);  // in += is[0] * ru
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // accumulate remaining columns
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[l], ru, rn);
+          in = _mm512_fmadd_ps(rs[l], iu, in);
+          rn = _mm512_fnmadd_ps(is[l], iu, rn);
+          in = _mm512_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);  // same offset input k came from
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    auto m = GetMasks1<H, 4>(qs);
+
+    unsigned k = 4 + H;  // bits excluded from the loop index r
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // values of r per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // values of s
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,  // presumably one call of f per i < size * size2
+             matrix, m.imaskh, m.qmaskh, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L>  // H targets > 3, L targets <= 3
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // inputs combined per output
+      constexpr unsigned hsize = 1 << H;  // register pairs loaded and stored
+      constexpr unsigned lsize = 1 << L;  // copies kept per loaded register
+
+      __m512 rn, in;
+      __m512 rs[gsize], is[gsize];  // real / imaginary parts of the inputs
+
+      uint64_t r = i % size;  // position within one row
+      uint64_t s = i / size;  // row number (row stride is row_size)
+
+      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;  // first slot of the group for register k
+        uint64_t p = _pdep_u64(k, qmaskh);  // bits of k placed per qmaskh
+
+        rs[k2] = _mm512_load_ps(p0 + p);
+        is[k2] = _mm512_load_ps(p0 + p + 16);  // imaginary parts follow reals
+
+        for (unsigned l = 1; l < lsize; ++l) {  // permuted copies via idx[l - 1]
+          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
+          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
+        }
+      }
+
+      uint64_t j = 0;  // index into w; each entry is a (re, im) vector pair
+
+      for (unsigned k = 0; k < hsize; ++k) {  // one stored register pair per k
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // rn -= is[0] * w[j + 1]
+        in = _mm512_fmadd_ps(is[0], w[j], in);  // in += is[0] * w[j]
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {  // accumulate remaining inputs
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);  // same offset register k came from
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512i idx[1 << L];  // permutation indices; the kernel reads idx[l - 1]
+    __m512 w[1 << (1 + 2 * H + L)];  // hsize * gsize (re, im) vector pairs
+
+    auto m = GetMasks2<H, L, 4>(qs);
+    FillPermutationIndices<L>(m.qmaskl, idx);
+    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 4 + H;  // bits excluded from the loop index r
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // values of r per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // values of s
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,  // presumably one call of f per i < size * size2
+             w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // H targets > 3 and all controls > 3
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // v holds hsize x hsize entries
+
+      __m512 ru, iu, rn, in;
+      __m512 rs[hsize], is[hsize];  // real / imaginary parts of the inputs
+
+      uint64_t r = i % size;  // position within one row
+      uint64_t s = i / size;  // row number (row stride is row_size)
+
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {  // load every input first
+        uint64_t p = _pdep_u64(k, qmaskh);  // bits of k placed per qmaskh
+
+        rs[k] = _mm512_load_ps(p0 + p);
+        is[k] = _mm512_load_ps(p0 + p + 16);  // imaginary parts follow reals
+      }
+
+      uint64_t j = 0;  // index into v; each entry is a (re, im) pair
+
+      for (unsigned k = 0; k < hsize; ++k) {  // output k = row k of v * input
+        ru = _mm512_set1_ps(v[j]);
+        iu = _mm512_set1_ps(v[j + 1]);
+        rn = _mm512_mul_ps(rs[0], ru);
+        in = _mm512_mul_ps(rs[0], iu);
+        rn = _mm512_fnmadd_ps(is[0], iu, rn);  // rn -= is[0] * iu
+        in = _mm512_fmadd_ps(is[0], ru, in);  // in += is[0] * ru
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // accumulate remaining columns
+          ru = _mm512_set1_ps(v[j]);
+          iu = _mm512_set1_ps(v[j + 1]);
+          rn = _mm512_fmadd_ps(rs[l], ru, rn);
+          in = _mm512_fmadd_ps(rs[l], iu, in);
+          rn = _mm512_fnmadd_ps(is[l], iu, rn);
+          in = _mm512_fmadd_ps(is[l], ru, in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);  // same offset input k came from
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    auto m = GetMasks3<H, 4>(state.num_qubits(), qs, cqs, cvals);
+
+    unsigned k = 4 + H + cqs.size();  // bits excluded from the loop index r
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // values of r per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // values of s
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,  // presumably one call of f per i < size * size2
+             matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // H targets > 3, lowest control <= 3
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // w holds hsize x hsize entries
+
+      __m512 rn, in;
+      __m512 rs[hsize], is[hsize];  // real / imaginary parts of the inputs
+
+      uint64_t r = i % size;  // position within one row
+      uint64_t s = i / size;  // row number (row stride is row_size)
+
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {  // load every input first
+        uint64_t p = _pdep_u64(k, qmaskh);  // bits of k placed per qmaskh
+
+        rs[k] = _mm512_load_ps(p0 + p);
+        is[k] = _mm512_load_ps(p0 + p + 16);  // imaginary parts follow reals
+      }
+
+      uint64_t j = 0;  // index into w; each entry is a (re, im) vector pair
+
+      for (unsigned k = 0; k < hsize; ++k) {  // output k = row k of w * input
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // rn -= is[0] * w[j + 1]
+        in = _mm512_fmadd_ps(is[0], w[j], in);  // in += is[0] * w[j]
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // accumulate remaining columns
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);  // same offset input k came from
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512 w[1 << (1 + 2 * H)];  // hsize * hsize (re, im) vector pairs
+
+    auto m = GetMasks4<H, 4>(state.num_qubits(), qs, cqs, cvals);
+    FillControlledMatrixH<H, 4>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 4 + H + cqs.size() - m.cl;  // m.cl: presumably # controls <= 3
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // values of r per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // values of s
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,  // presumably one call of f per i < size * size2
+             w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>  // CH: all controls > 3
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
+                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
+                const __m512i* idx, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // inputs combined per output
+      constexpr unsigned hsize = 1 << H;  // register pairs loaded and stored
+      constexpr unsigned lsize = 1 << L;  // copies kept per loaded register
+
+      __m512 rn, in;
+      __m512 rs[gsize], is[gsize];  // real / imaginary parts of the inputs
+
+      uint64_t r = i % size;  // position within one row
+      uint64_t s = i / size;  // row number (row stride is row_size)
+
+      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;  // first slot of the group for register k
+        uint64_t p = _pdep_u64(k, qmaskh);  // bits of k placed per qmaskh
+
+        rs[k2] = _mm512_load_ps(p0 + p);
+        is[k2] = _mm512_load_ps(p0 + p + 16);  // imaginary parts follow reals
+
+        for (unsigned l = 1; l < lsize; ++l) {  // permuted copies via idx[l - 1]
+          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
+          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
+        }
+      }
+
+      uint64_t j = 0;  // index into w; each entry is a (re, im) vector pair
+
+      for (unsigned k = 0; k < hsize; ++k) {  // one stored register pair per k
+        rn = _mm512_mul_ps(rs[0], w[j]);
+        in = _mm512_mul_ps(rs[0], w[j + 1]);
+        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);  // rn -= is[0] * w[j + 1]
+        in = _mm512_fmadd_ps(is[0], w[j], in);  // in += is[0] * w[j]
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {  // accumulate remaining inputs
+          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
+          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
+          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
+          in = _mm512_fmadd_ps(is[l], w[j], in);
+
+          j += 2;
+        }
+
+        uint64_t p = _pdep_u64(k, qmaskh);  // same offset register k came from
+
+        _mm512_store_ps(p0 + p, rn);
+        _mm512_store_ps(p0 + p + 16, in);
+      }
+    };
+
+    __m512i idx[1 << L];  // permutation indices; the kernel reads idx[l - 1]
+    __m512 w[1 << (1 + 2 * H + L)];  // hsize * gsize (re, im) vector pairs
+
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // values of s
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    if (CH) {  // all controls > 3: w is filled without control information
+      auto m = GetMasks5<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned k = 4 + H + cqs.size();  // bits excluded from the loop index r
+      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+      uint64_t size = uint64_t{1} << n;  // values of r per row
+
+      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
+               m.cvalsh, idx, size, raw_size, state.get());
+    } else {  // lowest control <= 3: cvalsl / cmaskl are used to fill w
+      auto m = GetMasks6<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
+      FillPermutationIndices<L>(m.qmaskl, idx);
+      FillControlledMatrixL<H, L, 4>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      unsigned k = 4 + H + cqs.size() - m.cl;  // m.cl: presumably # ctrls <= 3
+      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+      uint64_t size = uint64_t{1} << n;  // values of r per row
+
+      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
+               m.cvalsh, idx, size, raw_size, state.get());
+    }
+  }
+
+  template <unsigned L>  // builds 2^L - 1 permutation index vectors in idx
+  static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) {
+    constexpr unsigned lsize = 1 << L;
+
+    for (unsigned i = 0; i < lsize - 1; ++i) {  // kernels read idx[l - 1] only
+      unsigned p[16];  // one 32-bit index per element of the __m512i
+
+      for (unsigned j = 0; j < 16; ++j) {  // bits of j outside qmaskl are kept
+        p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
+      }
+
+      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
+                                p[9], p[8], p[7], p[6], p[5], p[4],
+                                p[3], p[2], p[1], p[0]);
+    }
+  }
+
+  For for_;
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARY_CALCULATOR_AVX512_H_
diff --git a/tpls/qsim/unitary_calculator_basic.h b/tpls/qsim/unitary_calculator_basic.h
new file mode 100644
index 0000000..6b1821a
--- /dev/null
+++ b/tpls/qsim/unitary_calculator_basic.h
@@ -0,0 +1,259 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARY_CALCULATOR_BASIC_H_
+#define UNITARY_CALCULATOR_BASIC_H_
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "unitaryspace_basic.h"
+
+namespace qsim {
+namespace unitary {
+
+/**
+ * Quantum circuit unitary calculator without vectorization.
+ */
+template <typename For, typename FP = float>
+class UnitaryCalculatorBasic final : public SimulatorBase {
+ public:
+  using UnitarySpace = UnitarySpaceBasic<For, FP>;
+  using Unitary = typename UnitarySpace::Unitary;
+  using fp_type = typename UnitarySpace::fp_type;
+
+  using StateSpace = UnitarySpace;
+  using State = Unitary;
+
+  template <typename... ForArgs>
+  explicit UnitaryCalculatorBasic(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate acting on 1 to 6 qubits using non-vectorized instructions.
+   * @param qs Indices of the qubits affected by this gate, in ascending order.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... . Dispatch on the number of qubits.
+
+    switch (qs.size()) {
+    case 1:
+      ApplyGateH<1>(qs, matrix, state);
+      break;
+    case 2:
+      ApplyGateH<2>(qs, matrix, state);
+      break;
+    case 3:
+      ApplyGateH<3>(qs, matrix, state);
+      break;
+    case 4:
+      ApplyGateH<4>(qs, matrix, state);
+      break;
+    case 5:
+      ApplyGateH<5>(qs, matrix, state);
+      break;
+    case 6:
+      ApplyGateH<6>(qs, matrix, state);
+      break;
+    default:
+      // Not implemented: any other number of qubits is silently ignored.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate (1 to 4 targets) without vectorization.
+   * @param qs Indices of the qubits affected by this gate, in ascending order.
+   * @param cqs Indices of control qubits.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    if (cqs.size() == 0) {  // no controls: same as an uncontrolled gate
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    switch (qs.size()) {
+    case 1:
+      ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state);
+      break;
+    case 2:
+      ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state);
+      break;
+    case 3:
+      ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state);
+      break;
+    case 4:
+      ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state);
+      break;
+    default:
+      // Not implemented: any other number of targets is silently ignored.
+      break;
+    }
+  }
+
+  /**
+   * @return Always 1: this calculator processes one value at a time (no SIMD).
+   */
+  static unsigned SIMDRegisterSize() {
+    return 1;
+  }
+
+ private:
+  template <unsigned H>  // H = number of target qubits (see ApplyGate)
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // v holds hsize x hsize entries
+
+      fp_type rn, in;
+      fp_type rs[hsize], is[hsize];  // real / imaginary parts of the inputs
+
+      uint64_t r = i % size;  // position within one row
+      uint64_t s = i / size;  // row number (row stride is row_size)
+
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {  // shift r up one bit per mask ms[j]
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      auto p0 = rstate + row_size * s + 2 * t;  // 2 values (re, im) per entry
+
+      for (unsigned k = 0; k < hsize; ++k) {  // load every input first
+        rs[k] = *(p0 + xss[k]);
+        is[k] = *(p0 + xss[k] + 1);
+      }
+
+      uint64_t j = 0;  // index into v; each entry is a (re, im) pair
+
+      for (unsigned k = 0; k < hsize; ++k) {  // output k = row k of v * input
+        rn = rs[0] * v[j] - is[0] * v[j + 1];
+        in = rs[0] * v[j + 1] + is[0] * v[j];
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // accumulate remaining columns
+          rn += rs[l] * v[j] - is[l] * v[j + 1];
+          in += rs[l] * v[j + 1] + is[l] * v[j];
+
+          j += 2;
+        }
+
+        *(p0 + xss[k]) = rn;  // written back where input k was read
+        *(p0 + xss[k] + 1) = in;
+      }
+    };
+
+    uint64_t ms[H + 1];  // filled by FillIndices below
+    uint64_t xss[1 << H];  // filled by FillIndices; offsets used by the kernel
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+
+    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
+    uint64_t size = uint64_t{1} << n;  // values of r per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // values of s
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
+  }
+
+  template <unsigned H>  // H = number of target qubits
+  void ApplyControlledGateH(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs,
+                            uint64_t cvals, const fp_type* matrix,
+                            State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // v holds hsize x hsize entries
+
+      fp_type rn, in;
+      fp_type rs[hsize], is[hsize];  // real / imaginary parts of the inputs
+
+      uint64_t r = i % size;  // position within one row
+      uint64_t s = i / size;  // row number (row stride is row_size)
+
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {  // shift r up one bit per mask ms[j]
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      if ((t & cmaskh) == cvalsh) {  // entries failing the mask test are kept
+        auto p0 = rstate + row_size * s + 2 * t;  // 2 values (re, im) per entry
+
+        for (unsigned k = 0; k < hsize; ++k) {  // load every input first
+          rs[k] = *(p0 + xss[k]);
+          is[k] = *(p0 + xss[k] + 1);
+        }
+
+        uint64_t j = 0;  // index into v; each entry is a (re, im) pair
+
+        for (unsigned k = 0; k < hsize; ++k) {  // output k = row k of v * input
+          rn = rs[0] * v[j] - is[0] * v[j + 1];
+          in = rs[0] * v[j + 1] + is[0] * v[j];
+
+          j += 2;
+
+          for (unsigned l = 1; l < hsize; ++l) {  // accumulate remaining columns
+            rn += rs[l] * v[j] - is[l] * v[j + 1];
+            in += rs[l] * v[j + 1] + is[l] * v[j];
+
+            j += 2;
+          }
+
+          *(p0 + xss[k]) = rn;  // written back where input k was read
+          *(p0 + xss[k] + 1) = in;
+        }
+      }
+    };
+
+    uint64_t ms[H + 1];  // filled by FillIndices below
+    uint64_t xss[1 << H];  // filled by FillIndices; offsets used by the kernel
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+
+    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
+    uint64_t size = uint64_t{1} << n;  // values of r per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // values of s
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,  // controls are tested per index inside f
+             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
+  }
+
+  For for_;
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARY_CALCULATOR_BASIC_H_
diff --git a/tpls/qsim/unitary_calculator_sse.h b/tpls/qsim/unitary_calculator_sse.h
new file mode 100644
index 0000000..a3c3f2e
--- /dev/null
+++ b/tpls/qsim/unitary_calculator_sse.h
@@ -0,0 +1,639 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARY_CALCULATOR_SSE_H_
+#define UNITARY_CALCULATOR_SSE_H_
+
+#include <smmintrin.h>
+
+#include <complex>
+#include <cstdint>
+#include <functional>
+#include <vector>
+
+#include "simulator.h"
+#include "unitaryspace_sse.h"
+
+namespace qsim {
+namespace unitary {
+
+/**
+ * Quantum circuit unitary calculator with SSE vectorization.
+ */
+template <typename For>
+class UnitaryCalculatorSSE final : public SimulatorBase {
+ public:
+  using UnitarySpace = UnitarySpaceSSE<For>;
+  using Unitary = typename UnitarySpace::Unitary;
+  using fp_type = typename UnitarySpace::fp_type;
+
+  using StateSpace = UnitarySpace;
+  using State = Unitary;
+
+  template <typename... ForArgs>  // arguments are passed on to the For member
+  explicit UnitaryCalculatorSSE(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a 1- to 6-qubit gate using SSE; other sizes are silently ignored.
+   * @param qs Indices of the qubits affected by this gate, sorted ascending.
+   * @param matrix Gate matrix as row-major, interleaved (re, im) values.
+   * @param state The unitary to be updated in place by this method.
+   */
+  void ApplyGate(const std::vector<unsigned>& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < ...; qubits 0 and 1 select the ApplyGateL path.
+
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 1) {
+        ApplyGateH<1>(qs, matrix, state);  // every target qubit index > 1
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);  // <#qubits > 1, #qubits <= 1>
+      }
+      break;
+    case 2:
+      if (qs[0] > 1) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 1) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 1) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 1) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 1) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented: any other number of qubits leaves state untouched.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate on 1 to 4 target qubits using SSE instructions.
+   * @param qs Indices of the target qubits, sorted ascending.
+   * @param cqs Indices of control qubits, sorted ascending; may be empty.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The unitary to be updated in place by this method.
+   */
+  void ApplyControlledGate(const std::vector<unsigned>& qs,
+                           const std::vector<unsigned>& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
+
+    if (cqs.size() == 0) {  // no controls: same as an uncontrolled gate
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {  // lowest control index > 1, hence all of them
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented: other target-qubit counts leave state untouched.
+      break;
+    }
+  }
+
+  /**
+   * @return The number of float lanes in one SSE register (__m128), i.e. 4.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 4;
+  }
+
+ private:
+  template <unsigned H>
+  void ApplyGateH(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t size,
+                uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // gate dimension, 2^H
+
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      uint64_t r = 4 * (i % size);  // within-row offset, 4 lanes per item
+      uint64_t s = i / size;  // row index
+
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {  // spread r over the masks ms[0..H]
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      auto p0 = rstate + row_size * s + 2 * t;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);  // 4 real parts
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);  // 4 imaginary parts
+      }
+
+      uint64_t j = 0;  // index into v: row-major (re, im) pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {  // k selects the gate-matrix row
+        ru = _mm_set1_ps(v[j]);
+        iu = _mm_set1_ps(v[j + 1]);
+        rn = _mm_mul_ps(rs[0], ru);  // complex product, real part
+        in = _mm_mul_ps(rs[0], iu);  // complex product, imaginary part
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // accumulate remaining columns
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);  // write back in place
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+
+    unsigned k = 2 + H;  // log2 of the amplitudes covered by one work item
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L>
+  void ApplyGateL(const std::vector<unsigned>& qs,
+                  const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, unsigned q0,
+                uint64_t size, uint64_t row_size, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // full gate dimension
+      constexpr unsigned hsize = 1 << H;  // registers loaded per work item
+      constexpr unsigned lsize = 1 << L;  // lane arrangements per register
+
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];
+
+      uint64_t r = 4 * (i % size);  // within-row offset, 4 lanes per item
+      uint64_t s = i / size;  // row index
+
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {  // spread r over the masks ms[0..H]
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      auto p0 = rstate + row_size * s + 2 * t;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);  // 4 real parts
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);  // 4 imaginary parts
+
+        if (L == 1) {  // 177 swaps adjacent lanes; 78 swaps the two halves
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {  // 57, 78, 147 rotate the lanes by 1, 2, 3
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      uint64_t j = 0;  // index into w: (re, im) register pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {  // one output register pair per k
+        rn = _mm_mul_ps(rs[0], w[j]);  // lane-wise complex product, real part
+        in = _mm_mul_ps(rs[0], w[j + 1]);  // and imaginary part
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {  // accumulate remaining terms
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);  // write back in place
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H + L)];  // hsize * gsize (re, im) register pairs
+
+    auto m = GetMasks11<L>(qs);
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 2 + H;  // log2 of the amplitudes covered by one work item
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,  w, ms, xss, qs[0], size, raw_size, state.get());
+  }
+
+  template <unsigned H>
+  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // gate dimension, 2^H
+
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      uint64_t r = 4 * (i % size);  // within-row offset, 4 lanes per item
+      uint64_t s = i / size;  // row index
+
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {  // spread r over the masks ms[0..H]
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      if ((t & cmaskh) != cvalsh) return;  // control condition not met: skip
+
+      auto p0 = rstate + row_size * s + 2 * t;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);  // 4 real parts
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);  // 4 imaginary parts
+      }
+
+      uint64_t j = 0;  // index into v: row-major (re, im) pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {  // k selects the gate-matrix row
+        ru = _mm_set1_ps(v[j]);
+        iu = _mm_set1_ps(v[j + 1]);
+        rn = _mm_mul_ps(rs[0], ru);  // complex product, real part
+        in = _mm_mul_ps(rs[0], iu);  // complex product, imaginary part
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // accumulate remaining columns
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);  // write back in place
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+
+    unsigned k = 2 + H;  // log2 of the amplitudes covered by one work item
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
+  }
+
+  template <unsigned H>
+  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
+                             const std::vector<unsigned>& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;  // gate dimension, 2^H
+
+      __m128 rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      uint64_t r = 4 * (i % size);  // within-row offset, 4 lanes per item
+      uint64_t s = i / size;  // row index
+
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {  // spread r over the masks ms[0..H]
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      if ((t & cmaskh) != cvalsh) return;  // control condition not met: skip
+
+      auto p0 = rstate + row_size * s + 2 * t;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);  // 4 real parts
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);  // 4 imaginary parts
+      }
+
+      uint64_t j = 0;  // index into w: (re, im) register pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {  // one output register pair per k
+        rn = _mm_mul_ps(rs[0], w[j]);  // lane-wise complex product, real part
+        in = _mm_mul_ps(rs[0], w[j + 1]);  // and imaginary part
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {  // accumulate remaining terms
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);  // write back in place
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H)];  // filled by FillControlledMatrixH below
+
+    auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals);
+    FillIndices<H>(state.num_qubits(), qs, ms, xss);
+    FillControlledMatrixH<H, 2>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 2 + H;  // log2 of the amplitudes covered by one work item
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    for_.Run(size * size2, f,
+             w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
+  }
+
+  template <unsigned H, unsigned L, bool CH>
+  void ApplyControlledGateL(const std::vector<unsigned>& qs,
+                            const std::vector<unsigned>& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, unsigned q0, uint64_t size, uint64_t row_size,
+                fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);  // full gate dimension
+      constexpr unsigned hsize = 1 << H;  // registers loaded per work item
+      constexpr unsigned lsize = 1 << L;  // lane arrangements per register
+
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];
+
+      uint64_t r = 4 * (i % size);  // within-row offset, 4 lanes per item
+      uint64_t s = i / size;  // row index
+
+      uint64_t t = r & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {  // spread r over the masks ms[0..H]
+        r *= 2;
+        t |= r & ms[j];
+      }
+
+      if ((t & cmaskh) != cvalsh) return;  // control condition not met: skip
+
+      auto p0 = rstate + row_size * s + 2 * t;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);  // 4 real parts
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);  // 4 imaginary parts
+
+        if (L == 1) {  // 177 swaps adjacent lanes; 78 swaps the two halves
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {  // 57, 78, 147 rotate the lanes by 1, 2, 3
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      uint64_t j = 0;  // index into w: (re, im) register pairs
+
+      for (unsigned k = 0; k < hsize; ++k) {  // one output register pair per k
+        rn = _mm_mul_ps(rs[0], w[j]);  // lane-wise complex product, real part
+        in = _mm_mul_ps(rs[0], w[j + 1]);  // and imaginary part
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {  // accumulate remaining terms
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);  // write back in place
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H + L)];  // hsize * gsize (re, im) register pairs
+
+    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
+
+    unsigned k = 2 + H;  // log2 of the amplitudes covered by one work item
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;  // work items per row
+    uint64_t size2 = uint64_t{1} << state.num_qubits();  // number of rows
+    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
+
+    if (CH) {  // ApplyControlledGate passes CH = 1 when cqs[0] > 1
+      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
+      FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size * size2, f, w, ms, xss,
+               m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get());
+    } else {  // cqs[0] <= 1: cvalsl/cmaskl also go into building w
+      auto m = GetMasks10<L, 2>(state.num_qubits(), qs, cqs, cvals);
+      FillControlledMatrixL<H, L, 2>(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size * size2, f, w, ms, xss,
+               m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get());
+    }
+  }
+
+  For for_;
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARY_CALCULATOR_SSE_H_
diff --git a/tpls/qsim/unitaryspace.h b/tpls/qsim/unitaryspace.h
new file mode 100644
index 0000000..b5e2691
--- /dev/null
+++ b/tpls/qsim/unitaryspace.h
@@ -0,0 +1,65 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARYSPACE_H_
+#define UNITARYSPACE_H_
+
+#include <cstdint>
+
+namespace qsim {
+
+namespace unitary {
+
+/**
+ * Abstract class containing routines for general unitary matrix manipulations.
+ * "AVX", "AVX512", "Basic", and "SSE" implementations are provided.
+ */
+template <typename Impl,
+          template<typename...> class VectorSpace, typename... VSTypeParams>
+class UnitarySpace : public VectorSpace<Impl, VSTypeParams...> {
+ private:
+  using Base = VectorSpace<Impl, VSTypeParams...>;
+
+ public:
+  using fp_type = typename Base::fp_type;
+  using Unitary = typename Base::Vector;  // stored as the base Vector type
+
+  template <typename... ForArgs>
+  UnitarySpace(ForArgs&&... args) : Base(args...) {}  // passed on as lvalues
+
+  static Unitary CreateUnitary(unsigned num_qubits) {
+    return Base::Create(num_qubits);
+  }
+
+  static Unitary CreateUnitary(fp_type* p, unsigned num_qubits) {
+    return Base::Create(p, num_qubits);
+  }
+
+  static Unitary NullUnitary() {
+    return Base::Null();
+  }
+
+  static uint64_t Size(unsigned num_qubits) {
+    return uint64_t{1} << num_qubits;  // 2^num_qubits
+  }
+
+  void CopyUnitary(const Unitary& src, Unitary& dest) const {
+    Base::Copy(src, dest);
+  }
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARYSPACE_H_
diff --git a/tpls/qsim/unitaryspace_avx.h b/tpls/qsim/unitaryspace_avx.h
new file mode 100644
index 0000000..c1ec59d
--- /dev/null
+++ b/tpls/qsim/unitaryspace_avx.h
@@ -0,0 +1,112 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARYSPACE_AVX_H_
+#define UNITARYSPACE_AVX_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+
+#include "unitaryspace.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace unitary {
+
+/**
+ * Object containing context and routines for unitary manipulations.
+ * Unitary is a vectorized sequence of eight real components followed by eight
+ * imaginary components. Eight single-precision floating numbers can be loaded
+ * into an AVX register.
+ */
+template <typename For>
+struct UnitarySpaceAVX :
+    public UnitarySpace<UnitarySpaceAVX<For>, VectorSpace, For, float> {
+ private:
+  using Base = UnitarySpace<UnitarySpaceAVX<For>,
+                            qsim::VectorSpace, For, float>;
+
+ public:
+  using Unitary = typename Base::Unitary;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit UnitarySpaceAVX(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinRowSize(unsigned num_qubits) {  // floats per row, >= 16
+    return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits));
+  }
+
+  static uint64_t MinSize(unsigned num_qubits) {  // floats for all rows
+    return Base::Size(num_qubits) * MinRowSize(num_qubits);
+  }
+
+  void SetAllZeros(Unitary& state) const {
+    __m256 val0 = _mm256_setzero_ps();
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) {
+      _mm256_store_ps(p + 16 * i, val);  // 8 real parts
+      _mm256_store_ps(p + 16 * i + 8, val);  // 8 imaginary parts
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get());
+  }
+
+  void SetIdentity(Unitary& state) {
+    SetAllZeros(state);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                uint64_t row_size, fp_type* p) {
+      p[row_size * i + (16 * (i / 8)) + (i % 8)] = 1;  // real part of (i, i)
+    };
+
+    uint64_t size = Base::Size(state.num_qubits());
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    Base::for_.Run(size, f, row_size, state.get());
+  }
+
+  static std::complex<fp_type> GetEntry(const Unitary& state,
+                                        uint64_t i, uint64_t j) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (16 * (j / 8)) + (j % 8);  // real part of column j
+    return std::complex<fp_type>(state.get()[row_size * i + k],
+                                 state.get()[row_size * i + k + 8]);
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
+                       const std::complex<fp_type>& ampl) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (16 * (j / 8)) + (j % 8);  // real part of column j
+    state.get()[row_size * i + k] = std::real(ampl);
+    state.get()[row_size * i + k + 8] = std::imag(ampl);
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
+                       fp_type im) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (16 * (j / 8)) + (j % 8);  // real part of column j
+    state.get()[row_size * i + k] = re;
+    state.get()[row_size * i + k + 8] = im;
+  }
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARYSPACE_AVX_H_
diff --git a/tpls/qsim/unitaryspace_avx512.h b/tpls/qsim/unitaryspace_avx512.h
new file mode 100644
index 0000000..4c23dc9
--- /dev/null
+++ b/tpls/qsim/unitaryspace_avx512.h
@@ -0,0 +1,112 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARYSPACE_AVX512_H_
+#define UNITARYSPACE_AVX512_H_
+
+#include <immintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+
+#include "unitaryspace.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace unitary {
+
+/**
+ * Object containing context and routines for unitary manipulations.
+ * Unitary is a vectorized sequence of sixteen real components followed by
+ * sixteen imaginary components. Sixteen single-precision floating numbers can
+ * be loaded into an AVX512 register.
+ */
+template <typename For>
+struct UnitarySpaceAVX512 :
+    public UnitarySpace<UnitarySpaceAVX512<For>, VectorSpace, For, float> {
+ private:
+  using Base = UnitarySpace<UnitarySpaceAVX512<For>,
+                            qsim::VectorSpace, For, float>;
+
+ public:
+  using Unitary = typename Base::Unitary;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit UnitarySpaceAVX512(ForArgs&&... args) : Base(args...) {}
+
+  static uint64_t MinRowSize(unsigned num_qubits) {  // floats per row, >= 32
+    return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits));
+  }
+
+  static uint64_t MinSize(unsigned num_qubits) {  // floats for all rows
+    return Base::Size(num_qubits) * MinRowSize(num_qubits);
+  }
+
+  void SetAllZeros(Unitary& state) const {
+    __m512 val0 = _mm512_setzero_ps();
+
+    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) {
+      _mm512_store_ps(p + 32 * i, val0);  // 16 real parts
+      _mm512_store_ps(p + 32 * i + 16, val0);  // 16 imaginary parts
+    };
+
+    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get());
+  }
+
+  void SetIdentity(Unitary& state) {
+    SetAllZeros(state);
+
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                uint64_t row_size, fp_type* p) {
+      p[row_size * i + (32 * (i / 16)) + (i % 16)] = 1;  // real part of (i, i)
+    };
+
+    uint64_t size = Base::Size(state.num_qubits());
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    Base::for_.Run(size, f, row_size, state.get());
+  }
+
+  static std::complex<fp_type> GetEntry(const Unitary& state,
+                                        uint64_t i, uint64_t j) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (32 * (j / 16)) + (j % 16);  // real part of column j
+    return std::complex<fp_type>(state.get()[row_size * i + k],
+                                 state.get()[row_size * i + k + 16]);
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
+                       const std::complex<fp_type>& ampl) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (32 * (j / 16)) + (j % 16);  // real part of column j
+    state.get()[row_size * i + k] = std::real(ampl);
+    state.get()[row_size * i + k + 16] = std::imag(ampl);
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
+                       fp_type im) {
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    uint64_t k = (32 * (j / 16)) + (j % 16);  // real part of column j
+    state.get()[row_size * i + k] = re;
+    state.get()[row_size * i + k + 16] = im;
+  }
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARYSPACE_AVX512_H_
diff --git a/tpls/qsim/unitaryspace_basic.h b/tpls/qsim/unitaryspace_basic.h
new file mode 100644
index 0000000..2db14b6
--- /dev/null
+++ b/tpls/qsim/unitaryspace_basic.h
@@ -0,0 +1,103 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARYSPACE_BASIC_H_
+#define UNITARYSPACE_BASIC_H_
+
+#include <cmath>
+#include <complex>
+#include <cstdint>
+
+#include "unitaryspace.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace unitary {
+
+/**
+ * Object containing context and routines for unitary manipulations.
+ * Unitary is a non-vectorized sequence of one real amplitude followed by
+ * one imaginary amplitude.
+ */
+template <typename For, typename FP>
+struct UnitarySpaceBasic
+    : public UnitarySpace<UnitarySpaceBasic<For, FP>, VectorSpace, For, FP> {
+ private:
+  using Base = UnitarySpace<UnitarySpaceBasic<For, FP>,
+                            qsim::VectorSpace, For, FP>;
+
+ public:
+  using Unitary = typename Base::Unitary;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit UnitarySpaceBasic(ForArgs&&... args) : Base(args...) {}
+
+  // Number of fp_type values in one row: one (re, im) pair per column.
+  static uint64_t MinRowSize(unsigned num_qubits) {
+    return (uint64_t{1} << num_qubits) * 2;
+  }
+
+  // Total number of fp_type values: Base::Size(num_qubits) rows.
+  static uint64_t MinSize(unsigned num_qubits) {
+    return MinRowSize(num_qubits) * Base::Size(num_qubits);
+  }
+
+  void SetAllZeros(Unitary& state) const {
+    auto clear_pair = [](unsigned, unsigned, uint64_t idx, fp_type* data) {
+      data[2 * idx] = 0;
+      data[2 * idx + 1] = 0;
+    };
+    Base::for_.Run(MinSize(state.num_qubits()) / 2, clear_pair, state.get());
+  }
+
+  void SetIdentity(Unitary& state) {
+    SetAllZeros(state);
+
+    uint64_t size = Base::Size(state.num_qubits());
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    // Row r keeps the real part of its diagonal entry at offset 2 * r.
+    auto set_diag = [](unsigned, unsigned, uint64_t r, uint64_t row_size,
+                       fp_type* data) {
+      data[row_size * r + 2 * r] = 1;
+    };
+    Base::for_.Run(size, set_diag, row_size, state.get());
+  }
+
+  static std::complex<fp_type> GetEntry(const Unitary& state,
+                                        uint64_t i, uint64_t j) {
+    const fp_type* entry =
+        state.get() + MinRowSize(state.num_qubits()) * i + 2 * j;
+    return std::complex<fp_type>(entry[0], entry[1]);
+  }
+
+  // Convenience overload: splits ampl and defers to the (re, im) overload.
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
+                       const std::complex<fp_type>& ampl) {
+    SetEntry(state, i, j, std::real(ampl), std::imag(ampl));
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
+                       fp_type re, fp_type im) {
+    fp_type* entry = state.get() + MinRowSize(state.num_qubits()) * i + 2 * j;
+    entry[0] = re;
+    entry[1] = im;
+  }
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARYSPACE_BASIC_H_
diff --git a/tpls/qsim/unitaryspace_sse.h b/tpls/qsim/unitaryspace_sse.h
new file mode 100644
index 0000000..f3762fb
--- /dev/null
+++ b/tpls/qsim/unitaryspace_sse.h
@@ -0,0 +1,112 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UNITARYSPACE_SSE_H_
+#define UNITARYSPACE_SSE_H_
+
+#include <smmintrin.h>
+
+#include <algorithm>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+
+#include "unitaryspace.h"
+#include "vectorspace.h"
+
+namespace qsim {
+
+namespace unitary {
+
+/**
+ * Object containing context and routines for unitary manipulations.
+ * Unitary is a vectorized sequence of four real components followed by four
+ * imaginary components. Four single-precision floating numbers can be loaded
+ * into an SSE register.
+ */
+template <typename For>
+struct UnitarySpaceSSE :
+    public UnitarySpace<UnitarySpaceSSE<For>, VectorSpace, For, float> {
+ private:
+  using Base = UnitarySpace<UnitarySpaceSSE<For>,
+                            qsim::VectorSpace, For, float>;
+
+ public:
+  using Unitary = typename Base::Unitary;
+  using fp_type = typename Base::fp_type;
+
+  template <typename... ForArgs>
+  explicit UnitarySpaceSSE(ForArgs&&... args) : Base(args...) {}
+
+  // Floats per row: one (re, im) pair per column, but never fewer than 8.
+  static uint64_t MinRowSize(unsigned num_qubits) {
+    return std::max(uint64_t{8}, (uint64_t{1} << num_qubits) * 2);
+  }
+
+  // Total number of floats: Base::Size(num_qubits) rows.
+  static uint64_t MinSize(unsigned num_qubits) {
+    return MinRowSize(num_qubits) * Base::Size(num_qubits);
+  }
+
+  void SetAllZeros(Unitary& state) const {
+    __m128 zeros = _mm_setzero_ps();
+    // Each work item clears one 8-float group with two 4-float stores.
+    auto clear = [](unsigned, unsigned, uint64_t g, __m128 zeros, fp_type* p) {
+      _mm_store_ps(p + 8 * g, zeros);
+      _mm_store_ps(p + 8 * g + 4, zeros);
+    };
+    Base::for_.Run(MinSize(state.num_qubits()) / 8, clear, zeros, state.get());
+  }
+
+  void SetIdentity(Unitary& state) {
+    SetAllZeros(state);
+
+    uint64_t size = Base::Size(state.num_qubits());
+    uint64_t row_size = MinRowSize(state.num_qubits());
+    // Diagonal entry of row r: 8-float group r / 4, real lane r % 4.
+    auto set_diag = [](unsigned, unsigned, uint64_t r, uint64_t row_size,
+                       fp_type* data) {
+      data[row_size * r + 8 * (r / 4) + r % 4] = 1;
+    };
+    Base::for_.Run(size, set_diag, row_size, state.get());
+  }
+
+  static std::complex<fp_type> GetEntry(const Unitary& state,
+                                        uint64_t i, uint64_t j) {
+    // Real part of entry (i, j); the imaginary part is 4 floats further on.
+    const fp_type* re = state.get() + MinRowSize(state.num_qubits()) * i
+                        + 8 * (j / 4) + j % 4;
+    return std::complex<fp_type>(re[0], re[4]);
+  }
+
+  // Convenience overload: splits ampl and defers to the (re, im) overload.
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
+                       const std::complex<fp_type>& ampl) {
+    SetEntry(state, i, j, std::real(ampl), std::imag(ampl));
+  }
+
+  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
+                       fp_type im) {
+    // Real and imaginary parts of one entry are stored 4 floats apart.
+    fp_type* slot = state.get() + MinRowSize(state.num_qubits()) * i
+                    + 8 * (j / 4) + j % 4;
+    slot[0] = re;
+    slot[4] = im;
+  }
+};
+
+}  // namespace unitary
+}  // namespace qsim
+
+#endif  // UNITARYSPACE_SSE_H_
diff --git a/tpls/qsim/util.h b/tpls/qsim/util.h
new file mode 100644
index 0000000..726a019
--- /dev/null
+++ b/tpls/qsim/util.h
@@ -0,0 +1,89 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_H_
+#define UTIL_H_
+
+#include <algorithm>
+#include <chrono>
+#include <random>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace qsim {
+
+template <typename Container>
+inline void SplitString(
+    const std::string& str, char delim, Container& words) {
+  // Start from an empty container; previous contents are discarded.
+  words.resize(0);
+
+  std::stringstream stream(str);
+  // Every piece between delimiters is moved into words without copying.
+  for (std::string piece; std::getline(stream, piece, delim);) {
+    words.push_back(std::move(piece));
+  }
+}
+
+template <typename Op, typename Container>
+inline void SplitString(
+    const std::string& str, char delim, Op op, Container& words) {
+  // Start from an empty container; previous contents are discarded.
+  words.resize(0);
+
+  std::stringstream stream(str);
+  // Store op(piece) for every piece found between delimiters.
+  for (std::string piece; std::getline(stream, piece, delim);) {
+    words.push_back(op(piece));
+  }
+}
+
+inline double GetTime() {
+  using Clock = std::chrono::steady_clock;
+  // Tick count scaled by the clock period (num / den seconds per tick).
+  const Clock::duration ticks = Clock::now().time_since_epoch();
+  return double(ticks.count() * Clock::period::num) / Clock::period::den;
+}
+
+template <typename DistrRealType, typename RGen>
+inline DistrRealType RandomValue(RGen& rgen, DistrRealType max_value) {
+  // A fresh distribution over [0, max_value) is built and sampled once.
+  return std::uniform_real_distribution<DistrRealType>(0.0, max_value)(rgen);
+}
+
+template <typename DistrRealType>
+inline std::vector<DistrRealType> GenerateRandomValues(
+    uint64_t num_samples, unsigned seed, DistrRealType max_value) {
+  std::mt19937 engine(seed);
+  std::uniform_real_distribution<DistrRealType> uniform(0.0, max_value);
+
+  // Room for every sample plus the extra element appended at the end.
+  std::vector<DistrRealType> samples;
+  samples.reserve(num_samples + 1);
+  for (uint64_t left = num_samples; left != 0; --left) {
+    samples.push_back(uniform(engine));
+  }
+
+  // Samples end up in ascending order, followed by max_value itself.
+  std::sort(samples.begin(), samples.end());
+  // Populate the final element to prevent sanitizer errors.
+  samples.emplace_back(max_value);
+  return samples;
+}
+
+}  // namespace qsim
+
+#endif  // UTIL_H_
diff --git a/tpls/qsim/util_cpu.h b/tpls/qsim/util_cpu.h
new file mode 100644
index 0000000..8e02425
--- /dev/null
+++ b/tpls/qsim/util_cpu.h
@@ -0,0 +1,43 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_CPU_H_
+#define UTIL_CPU_H_
+
+#ifdef __SSE2__
+# include <immintrin.h>
+#endif
+
+namespace qsim {
+
+// Turns on the flush-to-zero and denormals-are-zeros MXCSR control flags
+// (mask 0x8040). This prevents rare cases of performance slowdown
+// potentially at the cost of a tiny precision loss. No-op without SSE2.
+inline void SetFlushToZeroAndDenormalsAreZeros() {
+#ifdef __SSE2__
+  _mm_setcsr(unsigned{0x8040} | _mm_getcsr());
+#endif
+}
+
+// Turns the flush-to-zero and denormals-are-zeros MXCSR control flags
+// (mask 0x8040) back off. No-op without SSE2.
+inline void ClearFlushToZeroAndDenormalsAreZeros() {
+#ifdef __SSE2__
+  _mm_setcsr(~unsigned{0x8040} & _mm_getcsr());
+#endif
+}
+
+}  // namespace qsim
+
+#endif  // UTIL_CPU_H_
diff --git a/tpls/qsim/util_cuda.h b/tpls/qsim/util_cuda.h
new file mode 100644
index 0000000..5d8cb5d
--- /dev/null
+++ b/tpls/qsim/util_cuda.h
@@ -0,0 +1,128 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_CUDA_H_
+#define UTIL_CUDA_H_
+
+#ifdef __NVCC__
+  #include <cuda.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+#endif
+
+#include <cstdlib>
+
+#include "io.h"
+
+namespace qsim {
+
+// Forwards a status code to ErrorAssert with the call site's file and line.
+#define ErrorCheck(code) { ErrorAssert((code), __FILE__, __LINE__); }
+
+inline void ErrorAssert(cudaError_t code, const char* file, unsigned line) {
+  if (code == cudaSuccess) return;
+  IO::errorf("CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
+  exit(code);  // the error code doubles as the process exit status
+}
+
+template <typename T>
+struct Complex {
+  // Default construction does not assign re or im.
+  __host__ __device__ __forceinline__ Complex() {}
+  // Real number: the imaginary part is set to zero.
+  __host__ __device__ __forceinline__ Complex(const T& re) : re(re), im(0) {}
+  __host__ __device__ __forceinline__ Complex(const T& re, const T& im)
+      : re(re), im(im) {}
+
+  // Assignment from a Complex<U>; each part is converted by assignment.
+  template <typename U>
+  __host__ __device__ __forceinline__ Complex& operator=(
+      const Complex<U>& r) {
+    re = r.re;
+    im = r.im;
+    return *this;
+  }
+
+  T re;
+  T im;
+};
+
+template <typename T>
+__host__ __device__ __forceinline__ Complex<T> operator+(
+    const Complex<T>& lhs, const Complex<T>& rhs) {
+  return Complex<T>(lhs.re + rhs.re, lhs.im + rhs.im);  // component-wise sum
+}
+
+template <typename T, typename U>
+__host__ __device__ __forceinline__ Complex<T> operator+(
+    const Complex<T>& lhs, const Complex<U>& rhs) {
+  return Complex<T>(lhs.re + rhs.re, lhs.im + rhs.im);  // result has type T
+}
+
+template <typename T>
+struct Scalar {  // Primary template: a non-Complex type is its own scalar.
+  using type = T;
+};
+
+template <typename T>
+struct Scalar<Complex<T>> {  // Complex<T> maps to its component type T.
+  using type = T;
+};
+
+template <typename T>
+struct Plus {  // Binary functor returning v1 + v2, converted to T.
+  template <typename U>
+  __device__ __forceinline__ T operator()(const T& v1, const U& v2) const {
+    return v1 + v2;
+  }
+};
+
+template <typename T>
+struct Product {  // Functor computing conj(z1) * z2 from (re, im) parts.
+  __device__ __forceinline__ Complex<T> operator()(
+      const T& re1, const T& im1, const T& re2, const T& im2) const {
+    return Complex<T>(re1 * re2 + im1 * im2, re1 * im2 - im1 * re2);
+  }
+};
+
+template <typename T>
+struct RealProduct {  // Functor computing the real part of conj(z1) * z2.
+  __device__ __forceinline__ T operator()(
+      const T& re1, const T& im1, const T& re2, const T& im2) const {
+    return re1 * re2 + im1 * im2;
+  }
+};
+
+template <typename FP1, typename Op, unsigned warp_size = 32>
+__device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) {
+  // Tree reduction: the shuffle distance halves until it reaches zero.
+  for (unsigned offset = warp_size / 2; offset != 0; offset >>= 1) {
+    val = op(val, __shfl_down_sync(0xffffffff, val, offset));
+  }
+  return val;
+}
+
+template <typename FP1, typename Op, unsigned warp_size = 32>
+__device__ __forceinline__ Complex<FP1> WarpReduce(Complex<FP1> val, Op op) {
+  // Tree reduction applied to the real and imaginary parts separately.
+  for (unsigned offset = warp_size / 2; offset != 0; offset >>= 1) {
+    val.re = op(val.re, __shfl_down_sync(0xffffffff, val.re, offset));
+    val.im = op(val.im, __shfl_down_sync(0xffffffff, val.im, offset));
+  }
+  return val;
+}
+
+}  // namespace qsim
+
+#endif  // UTIL_CUDA_H_
diff --git a/tpls/qsim/util_custatevec.h b/tpls/qsim/util_custatevec.h
new file mode 100644
index 0000000..36f29ef
--- /dev/null
+++ b/tpls/qsim/util_custatevec.h
@@ -0,0 +1,44 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef UTIL_CUSTATEVEC_H_
+#define UTIL_CUSTATEVEC_H_
+
+#include <cublas_v2.h>
+#include <custatevec.h>
+
+#include "io.h"
+#include "util_cuda.h"
+
+namespace qsim {
+
+inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) {
+  if (code == CUBLAS_STATUS_SUCCESS) return;
+  // Report the numeric status with the call site, then exit with that code.
+  IO::errorf("cuBLAS error %i: %s %d\n", code, file, line);
+  exit(code);
+}
+
+inline void ErrorAssert(
+    custatevecStatus_t code, const char* file, unsigned line) {
+  if (code == CUSTATEVEC_STATUS_SUCCESS) return;
+  // Report the library's error text with the call site, then exit.
+  IO::errorf("custatevec error: %s %s %d\n",
+              custatevecGetErrorString(code), file, line);
+  exit(code);
+}
+
+}  // namespace qsim
+
+#endif  // UTIL_CUSTATEVEC_H_
diff --git a/tpls/qsim/vectorspace.h b/tpls/qsim/vectorspace.h
new file mode 100644
index 0000000..7b33a53
--- /dev/null
+++ b/tpls/qsim/vectorspace.h
@@ -0,0 +1,185 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef VECTORSPACE_H_
+#define VECTORSPACE_H_
+
+#ifdef _WIN32
+  #include <malloc.h>
+#endif
+
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <utility>
+
+namespace qsim {
+
+namespace detail {
+// No-op deleter: leaves the pointer untouched.
+inline void do_not_free(void*) {}
+// Releases a buffer: _aligned_free on Windows, ::free everywhere else.
+inline void free(void* ptr) {
+#ifndef _WIN32
+  ::free(ptr);
+#else
+  _aligned_free(ptr);
+#endif
+}
+
+}  // namespace detail
+
+// Routines for vector manipulations.
+template <typename Impl, typename For, typename FP>
+class VectorSpace {
+ public:
+  using fp_type = FP;
+
+ private:
+  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
+
+ public:
+  // Buffer of fp_type values paired with the qubit count it was created
+  // for. The deleter stored in the pointer decides whether destroying the
+  // Vector releases the buffer.
+  class Vector {
+   public:
+    Vector() = delete;
+
+    Vector(Pointer&& ptr, unsigned num_qubits)
+        : ptr_(std::move(ptr)), num_qubits_(num_qubits) {}
+
+    fp_type* get() {
+      return ptr_.get();
+    }
+
+    const fp_type* get() const {
+      return ptr_.get();
+    }
+
+    // Gives up the buffer without running the deleter; the qubit count is
+    // reset to zero.
+    fp_type* release() {
+      num_qubits_ = 0;
+      return ptr_.release();
+    }
+
+    unsigned num_qubits() const {
+      return num_qubits_;
+    }
+
+    // Always false for this vector type.
+    bool requires_copy_to_host() const {
+      return false;
+    }
+
+   private:
+    Pointer ptr_;
+    unsigned num_qubits_;
+  };
+
+  // All constructor arguments are passed on to the For object.
+  template <typename... ForArgs>
+  VectorSpace(ForArgs&&... args) : for_(args...) {}
+
+  // Allocates a 64-byte-aligned buffer of Impl::MinSize(num_qubits) values.
+  // Returns a null Vector (see IsNull) if the allocation fails.
+  static Vector Create(unsigned num_qubits) {
+    auto size = sizeof(fp_type) * Impl::MinSize(num_qubits);
+    #ifdef _WIN32
+      Pointer ptr{(fp_type*) _aligned_malloc(size, 64), &detail::free};
+      return Vector{std::move(ptr), ptr.get() != nullptr ? num_qubits : 0};
+    #else
+      void* p = nullptr;
+      if (posix_memalign(&p, 64, size) == 0) {
+        return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits};
+      } else {
+        return Null();
+      }
+    #endif
+  }
+
+  // Wraps caller-owned memory; the returned Vector never frees p.
+  // It is the client's responsibility to make sure that p has at least
+  // Impl::MinSize(num_qubits) elements.
+  static Vector Create(fp_type* p, unsigned num_qubits) {
+    return Vector{Pointer{p, &detail::do_not_free}, num_qubits};
+  }
+
+  static Vector Null() {
+    return Vector{Pointer{nullptr, &detail::free}, 0};
+  }
+
+  static bool IsNull(const Vector& vec) {
+    return vec.get() == nullptr;
+  }
+
+  static void Free(fp_type* ptr) {
+    detail::free(ptr);
+  }
+
+  // Copies all elements; refuses (returns false) if qubit counts differ.
+  bool Copy(const Vector& src, Vector& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    return RunCopy(Impl::MinSize(src.num_qubits()), src.get(), dest.get());
+  }
+
+  // It is the client's responsibility to make sure that dest has at least
+  // Impl::MinSize(src.num_qubits()) elements.
+  bool Copy(const Vector& src, fp_type* dest) const {
+    return RunCopy(Impl::MinSize(src.num_qubits()), src.get(), dest);
+  }
+
+  // It is the client's responsibility to make sure that src has at least
+  // Impl::MinSize(dest.num_qubits()) elements.
+  bool Copy(const fp_type* src, Vector& dest) const {
+    return RunCopy(Impl::MinSize(dest.num_qubits()), src, dest.get());
+  }
+
+  // Copies at most `size` elements, clamped to the capacity of dest.
+  // It is the client's responsibility to make sure that src has at least
+  // min(size, Impl::MinSize(dest.num_qubits())) elements.
+  bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
+    // Clamp by hand: std::min would need <algorithm>, which this header
+    // does not include.
+    const uint64_t capacity = Impl::MinSize(dest.num_qubits());
+    return RunCopy(size < capacity ? size : capacity, src, dest.get());
+  }
+
+  void DeviceSync() {}
+
+ protected:
+  For for_;
+
+ private:
+  // Runs the element-wise copy src[i] -> dest[i] for i in [0, size) through
+  // for_; shared by every Copy overload. Always returns true.
+  bool RunCopy(uint64_t size, const fp_type* src, fp_type* dest) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i,
+                const fp_type* src, fp_type* dest) {
+      dest[i] = src[i];
+    };
+
+    for_.Run(size, f, src, dest);
+
+    return true;
+  }
+};
+
+}  // namespace qsim
+
+#endif  // VECTORSPACE_H_
diff --git a/tpls/qsim/vectorspace_cuda.h b/tpls/qsim/vectorspace_cuda.h
new file mode 100644
index 0000000..fd91553
--- /dev/null
+++ b/tpls/qsim/vectorspace_cuda.h
@@ -0,0 +1,172 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef VECTORSPACE_CUDA_H_
+#define VECTORSPACE_CUDA_H_
+
+#ifdef __NVCC__
+  #include <cuda.h>
+  #include <cuda_runtime.h>
+#elif __HIP__
+  #include <hip/hip_runtime.h>
+  #include "cuda2hip.h"
+#endif
+
+#include <memory>
+#include <utility>
+
+namespace qsim {
+
+namespace detail {
+
+inline void do_not_free(void*) {}  // No-op deleter: leaves the pointer alone.
+
+inline void free(void* ptr) {
+  ErrorCheck(cudaFree(ptr));  // a cudaFree failure is routed via ErrorCheck
+}
+
+}  // namespace detail
+
+// Routines for vector manipulations.
+template <typename Impl, typename FP>
+class VectorSpaceCUDA {
+ public:
+  using fp_type = FP;
+
+ private:
+  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
+
+ public:
+  class Vector {
+   public:
+    Vector() = delete;
+
+    Vector(Pointer&& ptr, unsigned num_qubits)
+        : ptr_(std::move(ptr)), num_qubits_(num_qubits) {}
+
+    fp_type* get() {
+      return ptr_.get();
+    }
+
+    const fp_type* get() const {
+      return ptr_.get();
+    }
+
+    fp_type* release() {
+      num_qubits_ = 0;
+      return ptr_.release();
+    }
+
+    unsigned num_qubits() const {
+      return num_qubits_;
+    }
+
+    bool requires_copy_to_host() const {
+      return true;
+    }
+
+   private:
+    Pointer ptr_;
+    unsigned num_qubits_;
+  };
+
+  template <typename... Args>
+  VectorSpaceCUDA(Args&&... args) {}
+
+  // Allocates Impl::MinSize(num_qubits) values with cudaMalloc. Returns a
+  // null Vector (see IsNull) if the allocation fails.
+  static Vector Create(unsigned num_qubits) {
+    fp_type* p;
+    auto size = sizeof(fp_type) * Impl::MinSize(num_qubits);
+    auto rc = cudaMalloc(&p, size);
+
+    if (rc == cudaSuccess) {
+      return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits};
+    } else {
+      return Null();
+    }
+  }
+
+  // It is the client's responsibility to make sure that p has at least
+  // Impl::MinSize(num_qubits) elements.
+  static Vector Create(fp_type* p, unsigned num_qubits) {
+    return Vector{Pointer{p, &detail::do_not_free}, num_qubits};
+  }
+
+  static Vector Null() {
+    return Vector{Pointer{nullptr, &detail::free}, 0};
+  }
+
+  static bool IsNull(const Vector& vector) {
+    return vector.get() == nullptr;
+  }
+
+  static void Free(fp_type* ptr) {
+    detail::free(ptr);
+  }
+
+  // Device-to-device copy; returns false if the qubit counts differ.
+  bool Copy(const Vector& src, Vector& dest) const {
+    if (src.num_qubits() != dest.num_qubits()) {
+      return false;
+    }
+
+    ErrorCheck(
+        cudaMemcpy(dest.get(), src.get(),
+                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                   cudaMemcpyDeviceToDevice));
+    return true;
+  }
+
+  // It is the client's responsibility to make sure that dest has at least
+  // Impl::MinSize(src.num_qubits()) elements.
+  bool Copy(const Vector& src, fp_type* dest) const {
+    ErrorCheck(
+        cudaMemcpy(dest, src.get(),
+                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
+                   cudaMemcpyDeviceToHost));
+    return true;
+  }
+
+  // It is the client's responsibility to make sure that src has at least
+  // Impl::MinSize(dest.num_qubits()) elements.
+  bool Copy(const fp_type* src, Vector& dest) const {
+    ErrorCheck(
+        cudaMemcpy(dest.get(), src,
+                   sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
+                   cudaMemcpyHostToDevice));
+    return true;
+  }
+
+  // It is the client's responsibility to make sure that src has at least
+  // min(size, Impl::MinSize(dest.num_qubits())) elements.
+  bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
+    // Clamp by hand: this header does not include <algorithm> for std::min.
+    const uint64_t capacity = Impl::MinSize(dest.num_qubits());
+    if (size > capacity) size = capacity;
+    ErrorCheck(
+        cudaMemcpy(dest.get(), src,
+                   sizeof(fp_type) * size,
+                   cudaMemcpyHostToDevice));
+    return true;
+  }
+
+  void DeviceSync() {
+    ErrorCheck(cudaDeviceSynchronize());
+  }
+};
+
+}  // namespace qsim
+
+#endif  // VECTORSPACE_CUDA_H_

From 177aaa4b5516bedc7942c91480d293d330397460 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Tue, 5 Nov 2024 22:05:03 -0500
Subject: [PATCH 02/64] Remove redundant qsim folder

which was accidentally copied into the top level
---
 qsim/bits.h                      |  106 --
 qsim/bitstring.h                 |   97 --
 qsim/channel.h                   |  149 ---
 qsim/channels_cirq.h             |  471 ---------
 qsim/channels_qsim.h             |  117 ---
 qsim/circuit.h                   |   36 -
 qsim/circuit_noisy.h             |  108 --
 qsim/circuit_qsim_parser.h       |  442 --------
 qsim/cuda2hip.h                  |   61 --
 qsim/expect.h                    |  148 ---
 qsim/formux.h                    |   30 -
 qsim/fuser.h                     |  225 ----
 qsim/fuser_basic.h               |  411 --------
 qsim/fuser_mqubit.h              | 1095 --------------------
 qsim/gate.h                      |  216 ----
 qsim/gate_appl.h                 |  231 -----
 qsim/gates_cirq.h                | 1640 ------------------------------
 qsim/gates_qsim.h                |  661 ------------
 qsim/hybrid.h                    |  612 -----------
 qsim/io.h                        |   44 -
 qsim/io_file.h                   |   71 --
 qsim/matrix.h                    |  296 ------
 qsim/mps_simulator.h             |  246 -----
 qsim/mps_statespace.h            |  597 -----------
 qsim/parfor.h                    |  123 ---
 qsim/qtrajectory.h               |  435 --------
 qsim/run_qsim.h                  |  262 -----
 qsim/run_qsimh.h                 |  120 ---
 qsim/seqfor.h                    |   68 --
 qsim/simmux.h                    |   44 -
 qsim/simmux_gpu.h                |   30 -
 qsim/simulator.h                 |  516 ----------
 qsim/simulator_avx.h             | 1363 -------------------------
 qsim/simulator_avx512.h          |  846 ---------------
 qsim/simulator_basic.h           |  349 -------
 qsim/simulator_cuda.h            |  923 -----------------
 qsim/simulator_cuda_kernels.h    |  683 -------------
 qsim/simulator_custatevec.h      |  209 ----
 qsim/simulator_sse.h             |  864 ----------------
 qsim/statespace.h                |  145 ---
 qsim/statespace_avx.h            |  497 ---------
 qsim/statespace_avx512.h         |  448 --------
 qsim/statespace_basic.h          |  300 ------
 qsim/statespace_cuda.h           |  470 ---------
 qsim/statespace_cuda_kernels.h   |  355 -------
 qsim/statespace_custatevec.h     |  376 -------
 qsim/statespace_sse.h            |  462 ---------
 qsim/umux.h                      |   52 -
 qsim/unitary_calculator_avx.h    | 1028 -------------------
 qsim/unitary_calculator_avx512.h |  644 ------------
 qsim/unitary_calculator_basic.h  |  259 -----
 qsim/unitary_calculator_sse.h    |  639 ------------
 qsim/unitaryspace.h              |   65 --
 qsim/unitaryspace_avx.h          |  112 --
 qsim/unitaryspace_avx512.h       |  112 --
 qsim/unitaryspace_basic.h        |  103 --
 qsim/unitaryspace_sse.h          |  112 --
 qsim/util.h                      |   89 --
 qsim/util_cpu.h                  |   43 -
 qsim/util_cuda.h                 |  128 ---
 qsim/util_custatevec.h           |   44 -
 qsim/vectorspace.h               |  185 ----
 qsim/vectorspace_cuda.h          |  172 ----
 63 files changed, 21785 deletions(-)
 delete mode 100644 qsim/bits.h
 delete mode 100644 qsim/bitstring.h
 delete mode 100644 qsim/channel.h
 delete mode 100644 qsim/channels_cirq.h
 delete mode 100644 qsim/channels_qsim.h
 delete mode 100644 qsim/circuit.h
 delete mode 100644 qsim/circuit_noisy.h
 delete mode 100644 qsim/circuit_qsim_parser.h
 delete mode 100644 qsim/cuda2hip.h
 delete mode 100644 qsim/expect.h
 delete mode 100644 qsim/formux.h
 delete mode 100644 qsim/fuser.h
 delete mode 100644 qsim/fuser_basic.h
 delete mode 100644 qsim/fuser_mqubit.h
 delete mode 100644 qsim/gate.h
 delete mode 100644 qsim/gate_appl.h
 delete mode 100644 qsim/gates_cirq.h
 delete mode 100644 qsim/gates_qsim.h
 delete mode 100644 qsim/hybrid.h
 delete mode 100644 qsim/io.h
 delete mode 100644 qsim/io_file.h
 delete mode 100644 qsim/matrix.h
 delete mode 100644 qsim/mps_simulator.h
 delete mode 100644 qsim/mps_statespace.h
 delete mode 100644 qsim/parfor.h
 delete mode 100644 qsim/qtrajectory.h
 delete mode 100644 qsim/run_qsim.h
 delete mode 100644 qsim/run_qsimh.h
 delete mode 100644 qsim/seqfor.h
 delete mode 100644 qsim/simmux.h
 delete mode 100644 qsim/simmux_gpu.h
 delete mode 100644 qsim/simulator.h
 delete mode 100644 qsim/simulator_avx.h
 delete mode 100644 qsim/simulator_avx512.h
 delete mode 100644 qsim/simulator_basic.h
 delete mode 100644 qsim/simulator_cuda.h
 delete mode 100644 qsim/simulator_cuda_kernels.h
 delete mode 100644 qsim/simulator_custatevec.h
 delete mode 100644 qsim/simulator_sse.h
 delete mode 100644 qsim/statespace.h
 delete mode 100644 qsim/statespace_avx.h
 delete mode 100644 qsim/statespace_avx512.h
 delete mode 100644 qsim/statespace_basic.h
 delete mode 100644 qsim/statespace_cuda.h
 delete mode 100644 qsim/statespace_cuda_kernels.h
 delete mode 100644 qsim/statespace_custatevec.h
 delete mode 100644 qsim/statespace_sse.h
 delete mode 100644 qsim/umux.h
 delete mode 100644 qsim/unitary_calculator_avx.h
 delete mode 100644 qsim/unitary_calculator_avx512.h
 delete mode 100644 qsim/unitary_calculator_basic.h
 delete mode 100644 qsim/unitary_calculator_sse.h
 delete mode 100644 qsim/unitaryspace.h
 delete mode 100644 qsim/unitaryspace_avx.h
 delete mode 100644 qsim/unitaryspace_avx512.h
 delete mode 100644 qsim/unitaryspace_basic.h
 delete mode 100644 qsim/unitaryspace_sse.h
 delete mode 100644 qsim/util.h
 delete mode 100644 qsim/util_cpu.h
 delete mode 100644 qsim/util_cuda.h
 delete mode 100644 qsim/util_custatevec.h
 delete mode 100644 qsim/vectorspace.h
 delete mode 100644 qsim/vectorspace_cuda.h

diff --git a/qsim/bits.h b/qsim/bits.h
deleted file mode 100644
index 080c866..0000000
--- a/qsim/bits.h
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef BITS_H_
-#define BITS_H_
-
-#include <vector>
-
-#ifdef __BMI2__
-
-#include <immintrin.h>
-
-#include <cstdint>
-
-namespace qsim {
-namespace bits {
-
-inline uint32_t ExpandBits(uint32_t bits, unsigned n, uint32_t mask) {
-  return _pdep_u32(bits, mask);
-}
-
-inline uint64_t ExpandBits(uint64_t bits, unsigned n, uint64_t mask) {
-  return _pdep_u64(bits, mask);
-}
-
-inline uint32_t CompressBits(uint32_t bits, unsigned n, uint32_t mask) {
-  return _pext_u32(bits, mask);
-}
-
-inline uint64_t CompressBits(uint64_t bits, unsigned n, uint64_t mask) {
-  return _pext_u64(bits, mask);
-}
-
-}  // namespace bits
-}  // namespace qsim
-
-#else  // __BMI2__
-
-namespace qsim {
-namespace bits {
-
-template <typename Integer>
-inline Integer ExpandBits(Integer bits, unsigned n, Integer mask) {
-  Integer ebits = 0;
-  unsigned k = 0;
-
-  for (unsigned i = 0; i < n; ++i) {
-    if ((mask >> i) & 1) {
-      ebits |= ((bits >> k) & 1) << i;
-      ++k;
-    }
-  }
-
-  return ebits;
-}
-
-template <typename Integer>
-inline Integer CompressBits(Integer bits, unsigned n, Integer mask) {
-  Integer sbits = 0;
-  unsigned k = 0;
-
-  for (unsigned i = 0; i < n; ++i) {
-    if ((mask >> i) & 1) {
-      sbits |= ((bits >> i) & 1) << k;
-      ++k;
-    }
-  }
-
-  return sbits;
-}
-
-}  // namespace bits
-}  // namespace qsim
-
-#endif  // __BMI2__
-
-namespace qsim {
-namespace bits {
-
-template <typename Integer>
-inline Integer PermuteBits(
-    Integer bits, unsigned n, const std::vector<unsigned>& perm) {
-  Integer pbits = 0;
-
-  for (unsigned i = 0; i < n; ++i) {
-    pbits |= ((bits >> i) & 1) << perm[i];
-  }
-
-  return pbits;
-}
-
-}  // namespace bits
-}  // namespace qsim
-
-#endif  // BITS_H_
diff --git a/qsim/bitstring.h b/qsim/bitstring.h
deleted file mode 100644
index b95584b..0000000
--- a/qsim/bitstring.h
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef BITSTRING_H_
-#define BITSTRING_H_
-
-#include <cstdint>
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace qsim {
-
-using Bitstring = uint64_t;
-
-/**
- * Reads bitstrings (representing initialized or measured states of qubits)
- * from a provided stream object and stores them in a vector.
- * @param num_qubits Number of qubits represented in each bitstring.
- * @param provider Source of bitstrings; only used for error reporting.
- * @param fs The stream to read bitstrings from.
- * @param bitstrings Output vector of bitstrings. On success, this will contain
- *   all bitstrings read in from 'fs'.
- * @return True if reading succeeded; false otherwise.
- */
-template <typename IO, typename Stream>
-bool BitstringsFromStream(unsigned num_qubits, const std::string& provider,
-                          Stream& fs, std::vector<Bitstring>& bitstrings) {
-  bitstrings.resize(0);
-  bitstrings.reserve(100000);
-
-  // Bitstrings are in text format. One bitstring per line.
-
-  do {
-    char buf[128];
-    fs.getline(buf, 128);
-
-    if (fs) {
-      Bitstring b{0};
-
-      unsigned p = 0;
-      while (p < 128 && (buf[p] == '0' || buf[p] == '1')) {
-        b |= uint64_t(buf[p] - '0') << p;
-        ++p;
-      }
-
-      if (p != num_qubits) {
-        IO::errorf("wrong bitstring length in %s: "
-                   "got %u; should be %u.\n", provider.c_str(), p, num_qubits);
-        bitstrings.resize(0);
-        return false;
-      }
-
-      bitstrings.push_back(b);
-    }
-  } while (fs);
-
-  return true;
-}
-
-/**
- * Reads bitstrings (representing initialized or measured states of qubits)
- * from the given file and stores them in a vector.
- * @param num_qubits Number of qubits represented in each bitstring.
- * @param file The name of the file to read bitstrings from.
- * @param bitstrings Output vector of bitstrings. On success, this will contain
- *   all bitstrings read in from 'file'.
- * @return True if reading succeeded; false otherwise.
- */
-template <typename IO>
-inline bool BitstringsFromFile(unsigned num_qubits, const std::string& file,
-                               std::vector<Bitstring>& bitstrings) {
-  auto fs = IO::StreamFromFile(file);
-
-  if (!fs) {
-    return false;
-  } else {
-    bool rc = BitstringsFromStream<IO>(num_qubits, file, fs, bitstrings);
-    IO::CloseStream(fs);
-    return rc;
-  }
-}
-
-}  // namespace qsim
-
-#endif  // BITSTRING_H_
diff --git a/qsim/channel.h b/qsim/channel.h
deleted file mode 100644
index 372a174..0000000
--- a/qsim/channel.h
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CHANNEL_H_
-#define CHANNEL_H_
-
-#include <set>
-#include <vector>
-
-#include "gate.h"
-#include "matrix.h"
-
-namespace qsim {
-
-/**
- * Kraus operator.
- */
-template <typename Gate>
-struct KrausOperator {
-  using fp_type = typename Gate::fp_type;
-
-  enum Kind {
-    kNormal = 0,
-    kMeasurement = gate::kMeasurement,
-  };
-
-  /**
-   * Kraus operator type;
-   */
-  Kind kind;
-
-  /**
-   * If true, the Kraus operator is a unitary operator times a constant.
-   */
-  bool unitary;
-
-  /**
-   * Lower bound on Kraus operator probability.
-   */
-  double prob;
-
-  /**
-   * Sequence of operations that represent the Kraus operator. This can be just
-   * one operation.
-   */
-  std::vector<Gate> ops;
-
-  /**
-   * Product of K^\dagger and K. This can be empty if unitary = true.
-   */
-  Matrix<fp_type> kd_k;
-
-  /**
-   * Qubits kd_k acts on. This can be empty if unitary = true.
-   */
-  std::vector<unsigned> qubits;
-
-  /**
-   * Calculates the product of "K^\dagger K". Sets qubits "K^\dagger K" acts on.
-   */
-  void CalculateKdKMatrix() {
-    if (ops.size() == 1) {
-      kd_k = ops[0].matrix;
-      MatrixDaggerMultiply(ops[0].qubits.size(), ops[0].matrix, kd_k);
-      qubits = ops[0].qubits;
-    } else if (ops.size() > 1) {
-      std::set<unsigned> qubit_map;
-
-      for (const auto& op : ops) {
-        for (unsigned q : op.qubits) {
-          qubit_map.insert(q);
-        }
-      }
-
-      unsigned num_qubits = qubit_map.size();
-
-      qubits.resize(0);
-      qubits.reserve(num_qubits);
-
-      for (auto it = qubit_map.begin(); it != qubit_map.end(); ++it) {
-        qubits.push_back(*it);
-      }
-
-      MatrixIdentity(unsigned{1} << num_qubits, kd_k);
-
-      for (const auto& op : ops) {
-        if (op.qubits.size() == num_qubits) {
-          MatrixMultiply(num_qubits, op.matrix, kd_k);
-        } else {
-          unsigned mask = 0;
-
-          for (auto q : op.qubits) {
-            for (unsigned i = 0; i < num_qubits; ++i) {
-              if (q == qubits[i]) {
-                mask |= unsigned{1} << i;
-                break;
-              }
-            }
-          }
-
-          MatrixMultiply(mask, op.qubits.size(), op.matrix, num_qubits, kd_k);
-        }
-      }
-
-      auto m = kd_k;
-      MatrixDaggerMultiply(num_qubits, m, kd_k);
-    }
-  }
-};
-
-/**
- * Quantum channel.
- */
-template <typename Gate>
-using Channel = std::vector<KrausOperator<Gate>>;
-
-/**
- * Makes a channel from the gate.
- * @param time The time to place the channel at.
- * @param gate The input gate.
- * @return The output channel.
- */
-template <typename Gate>
-Channel<Gate> MakeChannelFromGate(unsigned time, const Gate& gate) {
-  auto normal = KrausOperator<Gate>::kNormal;
-  auto measurement = KrausOperator<Gate>::kMeasurement;
-
-  auto kind = gate.kind == gate::kMeasurement ? measurement : normal;
-
-  Channel<Gate> channel = {{kind, true, 1, {gate}}};
-  channel[0].ops[0].time = time;
-
-  return channel;
-}
-
-}  // namespace qsim
-
-#endif  // CHANNEL_H_
diff --git a/qsim/channels_cirq.h b/qsim/channels_cirq.h
deleted file mode 100644
index 69f1df9..0000000
--- a/qsim/channels_cirq.h
+++ /dev/null
@@ -1,471 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CHANNELS_CIRQ_H_
-#define CHANNELS_CIRQ_H_
-
-#include <cmath>
-#include <cstdint>
-#include <vector>
-
-#include "channel.h"
-#include "gates_cirq.h"
-
-namespace qsim {
-
-namespace Cirq {
-
-template <typename fp_type>
-using Channel = qsim::Channel<GateCirq<fp_type>>;
-
-/**
- * Asymmetric depolarizing channel factory.
- */
-template <typename fp_type>
-struct AsymmetricDepolarizingChannel {
-  static constexpr char name[] = "asymmetric_depolarize";
-
-  AsymmetricDepolarizingChannel(double p_x, double p_y, double p_z)
-      : p_x(p_x), p_y(p_y), p_z(p_z) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q,
-                                 double p_x, double p_y, double p_z) {
-    double p1 = 1 - p_x - p_y - p_z;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 1, p1, {}},
-            {normal, 1, p_x, {X<fp_type>::Create(time, q)}},
-            {normal, 1, p_y, {Y<fp_type>::Create(time, q)}},
-            {normal, 1, p_z, {Z<fp_type>::Create(time, q)}}};
-  }
-
-  static Channel<fp_type> Create(unsigned time,
-                                 const std::vector<unsigned>& qubits,
-                                 double p_x, double p_y, double p_z) {
-    double p1 = 1 - p_x - p_y - p_z;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    uint64_t size = uint64_t{1} << (2 * qubits.size());
-
-    Channel<fp_type> channel;
-    channel.reserve(size);
-
-    for (uint64_t i = 0; i < size; ++i) {
-      channel.push_back({normal, 1, 0, {}});
-      auto& kop = channel.back();
-
-      kop.ops.reserve(qubits.size());
-
-      double prob = 1;
-
-      for (unsigned q = 0; q < qubits.size(); ++q) {
-        unsigned pauli_index = (i >> (2 * q)) & 3;
-
-        switch (pauli_index) {
-        case 0:
-          prob *= p1;
-          break;
-        case 1:
-          prob *= p_x;
-          kop.ops.push_back(X<fp_type>::Create(time, q));
-          break;
-        case 2:
-          prob *= p_y;
-          kop.ops.push_back(Y<fp_type>::Create(time, q));
-          break;
-        case 3:
-          prob *= p_z;
-          kop.ops.push_back(Z<fp_type>::Create(time, q));
-          break;
-        }
-      }
-
-      kop.prob = prob;
-    }
-
-    return channel;
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, p_x, p_y, p_z);
-  }
-
-  Channel<fp_type> Create(
-      unsigned time, const std::vector<unsigned>& qubits) const {
-    return Create(time, qubits, p_x, p_y, p_z);
-  }
-
-  double p_x = 0;
-  double p_y = 0;
-  double p_z = 0;
-};
-
-/**
- * Returns an asymmetric depolarizing channel factory object.
- */
-template <typename fp_type>
-inline AsymmetricDepolarizingChannel<fp_type> asymmetric_depolarize(
-    double p_x, double p_y, double p_z) {
-  return AsymmetricDepolarizingChannel<fp_type>(p_x, p_y, p_z);
-}
-
-/**
- * Depolarizing channel factory.
- */
-template <typename fp_type>
-struct DepolarizingChannel {
-  static constexpr char name[] = "depolarize";
-
-  DepolarizingChannel(double p) : p(p) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
-    double p1 = 1 - p;
-    double p2 = p / 3;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 1, p1, {}},
-            {normal, 1, p2, {X<fp_type>::Create(time, q)}},
-            {normal, 1, p2, {Y<fp_type>::Create(time, q)}},
-            {normal, 1, p2, {Z<fp_type>::Create(time, q)}}};
-  }
-
-  static Channel<fp_type> Create(
-      unsigned time, const std::vector<unsigned>& qubits, double p) {
-    double p1 = 1 - p;
-    double p2 = p / 3;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    uint64_t size = uint64_t{1} << (2 * qubits.size());
-
-    Channel<fp_type> channel;
-    channel.reserve(size);
-
-    for (uint64_t i = 0; i < size; ++i) {
-      channel.push_back({normal, 1, 0, {}});
-      auto& kop = channel.back();
-
-      kop.ops.reserve(qubits.size());
-
-      double prob = 1;
-
-      for (unsigned q = 0; q < qubits.size(); ++q) {
-        unsigned pauli_index = (i >> (2 * q)) & 3;
-
-        switch (pauli_index) {
-        case 0:
-          prob *= p1;
-          break;
-        case 1:
-          prob *= p2;
-          kop.ops.push_back(X<fp_type>::Create(time, q));
-          break;
-        case 2:
-          prob *= p2;
-          kop.ops.push_back(Y<fp_type>::Create(time, q));
-          break;
-        case 3:
-          prob *= p2;
-          kop.ops.push_back(Z<fp_type>::Create(time, q));
-          break;
-        }
-      }
-
-      kop.prob = prob;
-    }
-
-    return channel;
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, p);
-  }
-
-  Channel<fp_type> Create(
-      unsigned time, const std::vector<unsigned>& qubits) const {
-    return Create(time, qubits, p);
-  }
-
-  double p = 0;
-};
-
-/**
- * Returns a depolarizing channel factory object.
- */
-template <typename fp_type>
-inline DepolarizingChannel<fp_type> depolarize(double p) {
-  return DepolarizingChannel<fp_type>(p);
-}
-
-/**
- * Generalized amplitude damping channel factory.
- */
-template <typename fp_type>
-struct GeneralizedAmplitudeDampingChannel {
-  static constexpr char name[] = "generalized_amplitude_damp";
-
-  GeneralizedAmplitudeDampingChannel(double p, double gamma)
-      : p(p), gamma(gamma) {}
-
-  static Channel<fp_type> Create(
-      unsigned time, unsigned q, double p, double gamma) {
-    double p1 = p * (1 - gamma);
-    double p2 = (1 - p) * (1 - gamma);
-    double p3 = 0;
-
-    fp_type t1 = std::sqrt(p);
-    fp_type r1 = std::sqrt(p * (1 - gamma));
-    fp_type s1 = std::sqrt(p * gamma);
-    fp_type t2 = std::sqrt(1 - p);
-    fp_type r2 = std::sqrt((1 - p) * (1 - gamma));
-    fp_type s2 = std::sqrt((1 - p) * gamma);
-
-    using M = Cirq::MatrixGate1<fp_type>;
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 0, p1,
-             {M::Create(time, q, {t1, 0, 0, 0, 0, 0, r1, 0})},
-             {t1 * t1, 0, 0, 0, 0, 0, r1 * r1, 0}, {q},
-            },
-            {normal, 0, p2,
-             {M::Create(time, q, {r2, 0, 0, 0, 0, 0, t2, 0})},
-             {r2 * r2, 0, 0, 0, 0, 0, t2 * t2, 0}, {q},
-            },
-            {normal, 0, p3,
-             {M::Create(time, q, {0, 0, s1, 0, 0, 0, 0, 0})},
-             {0, 0, 0, 0, 0, 0, s1 * s1, 0}, {q},
-            },
-            {normal, 0, p3,
-             {M::Create(time, q, {0, 0, 0, 0, s2, 0, 0, 0})},
-             {s2 * s2, 0, 0, 0, 0, 0, 0, 0}, {q},
-            },
-           };
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, p, gamma);
-  }
-
-  double p = 1;
-  double gamma = 0;
-};
-
-/**
- * Returns a generalized amplitude damping channel factory object.
- */
-template <typename fp_type>
-inline GeneralizedAmplitudeDampingChannel<fp_type> generalized_amplitude_damp(
-    double p, double gamma) {
-  return GeneralizedAmplitudeDampingChannel<fp_type>(p, gamma);
-}
-
-/**
- * Amplitude damping channel factory.
- */
-template <typename fp_type>
-struct AmplitudeDampingChannel {
-  static constexpr char name[] = "amplitude_damp";
-
-  AmplitudeDampingChannel(double gamma) : gamma(gamma) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q, double gamma) {
-    double p1 = 1 - gamma;
-    double p2 = 0;
-
-    fp_type r = std::sqrt(p1);
-    fp_type s = std::sqrt(gamma);
-
-    using M = Cirq::MatrixGate1<fp_type>;
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 0, p1,
-             {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})},
-             {1, 0, 0, 0, 0, 0, r * r, 0}, {q},
-            },
-            {normal, 0, p2,
-             {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})},
-             {0, 0, 0, 0, 0, 0, s * s, 0}, {q},
-            },
-           };
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, gamma);
-  }
-
-  double gamma = 0;
-};
-
-/**
- * Returns an amplitude damping channel factory object.
- */
-template <typename fp_type>
-inline AmplitudeDampingChannel<fp_type> amplitude_damp(double gamma) {
-  return AmplitudeDampingChannel<fp_type>(gamma);
-}
-
-/**
- *  Phase damping channel factory.
- */
-template <typename fp_type>
-struct PhaseDampingChannel {
-  static constexpr char name[] = "phase_dump";
-
-  PhaseDampingChannel(double gamma) : gamma(gamma) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q, double gamma) {
-    double p1 = 1 - gamma;
-    double p2 = 0;
-
-    fp_type r = std::sqrt(p1);
-    fp_type s = std::sqrt(gamma);
-
-    using M = Cirq::MatrixGate1<fp_type>;
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 0, p1,
-             {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})},
-             {1, 0, 0, 0, 0, 0, r * r, 0}, {q},
-            },
-            {normal, 0, p2,
-             {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})},
-             {0, 0, 0, 0, 0, 0, s * s, 0}, {q},
-            },
-           };
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, gamma);
-  }
-
-  double gamma = 0;
-};
-
-/**
- * Returns a phase damping channel factory object.
- */
-template <typename fp_type>
-inline PhaseDampingChannel<fp_type> phase_damp(double gamma) {
-  return PhaseDampingChannel<fp_type>(gamma);
-}
-
-/**
- *  Reset channel factory.
- */
-template <typename fp_type>
-struct ResetChannel {
-  static constexpr char name[] = "reset";
-
-  static Channel<fp_type> Create(unsigned time, unsigned q) {
-    using M = Cirq::MatrixGate1<fp_type>;
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 0, 0,
-             {M::Create(time, q, {1, 0, 0, 0, 0, 0, 0, 0})},
-             {1, 0, 0, 0, 0, 0, 0, 0}, {q},
-            },
-            {normal, 0, 0,
-             {M::Create(time, q, {0, 0, 1, 0, 0, 0, 0, 0})},
-             {0, 0, 0, 0, 0, 0, 1, 0}, {q},
-            },
-           };
-  }
-};
-
-/**
- * Returns a reset channel factory object.
- */
-template <typename fp_type>
-inline ResetChannel<fp_type> reset() {
-  return ResetChannel<fp_type>();
-}
-
-/**
- *  Phase flip channel factory.
- */
-template <typename fp_type>
-struct PhaseFlipChannel {
-  static constexpr char name[] = "phase_flip";
-
-  PhaseFlipChannel(double p) : p(p) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
-    double p1 = 1 - p;
-    double p2 = p;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 1, p1, {}},
-            {normal, 1, p2, {Z<fp_type>::Create(time, q)}}
-           };
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, p);
-  }
-
-  double p = 0;
-};
-
-/**
- * Returns a phase flip channel factory object.
- */
-template <typename fp_type>
-inline PhaseFlipChannel<fp_type> phase_flip(double p) {
-  return PhaseFlipChannel<fp_type>(p);
-}
-
-/**
- *  Bit flip channel factory.
- */
-template <typename fp_type>
-struct BitFlipChannel {
-  static constexpr char name[] = "bit_flip";
-
-  BitFlipChannel(double p) : p(p) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
-    double p1 = 1 - p;
-    double p2 = p;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 1, p1, {}},
-            {normal, 1, p2, {X<fp_type>::Create(time, q)}}
-           };
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, p);
-  }
-
-  double p = 0;
-};
-
-/**
- * Returns a bit flip channel factory object.
- */
-template <typename fp_type>
-inline BitFlipChannel<fp_type> bit_flip(double p) {
-  return BitFlipChannel<fp_type>(p);
-}
-
-}  // namesapce Cirq
-
-}  // namespace qsim
-
-#endif  // CHANNELS_CIRQ_H_
diff --git a/qsim/channels_qsim.h b/qsim/channels_qsim.h
deleted file mode 100644
index 5c07bcc..0000000
--- a/qsim/channels_qsim.h
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CHANNELS_QSIM_H_
-#define CHANNELS_QSIM_H_
-
-#include <cmath>
-#include <cstdint>
-#include <vector>
-
-#include "channel.h"
-#include "gates_qsim.h"
-
-namespace qsim {
-
-/**
- * Amplitude damping channel factory.
- */
-template <typename fp_type>
-struct AmplitudeDampingChannel {
-  AmplitudeDampingChannel(double gamma) : gamma(gamma) {}
-
-  static Channel<GateQSim<fp_type>> Create(
-      unsigned time, unsigned q, double gamma) {
-    double p1 = 1 - gamma;
-    double p2 = 0;
-
-    fp_type r = std::sqrt(p1);
-    fp_type s = std::sqrt(gamma);
-
-    using M = GateMatrix1<fp_type>;
-    auto normal = KrausOperator<GateQSim<fp_type>>::kNormal;
-
-    return {{normal, 0, p1,
-             {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})},
-             {1, 0, 0, 0, 0, 0, r * r, 0}, {q},
-            },
-            {normal, 0, p2,
-             {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})},
-             {0, 0, 0, 0, 0, 0, s * s, 0}, {q},
-            },
-           };
-  }
-
-  Channel<GateQSim<fp_type>> Create(unsigned time, unsigned q) const {
-    return Create(time, q, gamma);
-  }
-
-  double gamma = 0;
-};
-
-/**
- * Returns an amplitude damping channel factory object.
- */
-template <typename fp_type>
-inline AmplitudeDampingChannel<fp_type> amplitude_damp(double gamma) {
-  return AmplitudeDampingChannel<fp_type>(gamma);
-}
-
-/**
- *  Phase damping channel factory.
- */
-template <typename fp_type>
-struct PhaseDampingChannel {
-  PhaseDampingChannel(double gamma) : gamma(gamma) {}
-
-  static Channel<GateQSim<fp_type>> Create(
-      unsigned time, unsigned q, double gamma) {
-    double p1 = 1 - gamma;
-    double p2 = 0;
-
-    fp_type r = std::sqrt(p1);
-    fp_type s = std::sqrt(gamma);
-
-    using M = GateMatrix1<fp_type>;
-    auto normal = KrausOperator<GateQSim<fp_type>>::kNormal;
-
-    return {{normal, 0, p1,
-             {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})},
-             {1, 0, 0, 0, 0, 0, r * r, 0}, {q},
-            },
-            {normal, 0, p2,
-             {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})},
-             {0, 0, 0, 0, 0, 0, s * s, 0}, {q},
-            },
-           };
-  }
-
-  Channel<GateQSim<fp_type>> Create(unsigned time, unsigned q) const {
-    return Create(time, q, gamma);
-  }
-
-  double gamma = 0;
-};
-
-/**
- * Returns a phase damping channel factory object.
- */
-template <typename fp_type>
-inline PhaseDampingChannel<fp_type> phase_damp(double gamma) {
-  return PhaseDampingChannel<fp_type>(gamma);
-}
-
-}  // namespace qsim
-
-#endif  // CHANNELS_QSIM_H_
diff --git a/qsim/circuit.h b/qsim/circuit.h
deleted file mode 100644
index 59018ee..0000000
--- a/qsim/circuit.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CIRCUIT_H_
-#define CIRCUIT_H_
-
-#include <vector>
-
-namespace qsim {
-
-/**
- * A collection of gates. This object is consumed by `QSim[h]Runner.Run()`.
- */
-template <typename Gate>
-struct Circuit {
-  unsigned num_qubits;
-  /**
-   * The set of gates to be run. Gate times should be ordered.
-   */
-  std::vector<Gate> gates;
-};
-
-}  // namespace qsim
-
-#endif  // CIRCUIT_H_
diff --git a/qsim/circuit_noisy.h b/qsim/circuit_noisy.h
deleted file mode 100644
index 40a228d..0000000
--- a/qsim/circuit_noisy.h
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CIRCUIT_NOISY_H_
-#define CIRCUIT_NOISY_H_
-
-#include <vector>
-
-#include "circuit.h"
-#include "channel.h"
-
-namespace qsim {
-
-/**
- * Noisy circuit.
- */
-template <typename Gate>
-struct NoisyCircuit {
-  unsigned num_qubits;
-  std::vector<Channel<Gate>> channels;
-};
-
-template <typename Gate>
-using ncircuit_iterator = typename std::vector<Channel<Gate>>::const_iterator;
-
-/**
- * Makes a noisy circuit from the clean circuit.
- * Channels are added after each qubit of each gate of the clean cicuit.
- * Roughly equivalent to cirq.Circuit.with_noise.
- * @param num_qubits The number of circuit qubits.
- * @param gbeg, gend The iterator range [gbeg, gend) of circuit gates.
- * @param A channel factory to construct channels.
- * @return The output noisy circuit.
- */
-template <typename Gate, typename ChannelFactory>
-inline NoisyCircuit<Gate> MakeNoisy(
-    unsigned num_qubits,
-    typename std::vector<Gate>::const_iterator gbeg,
-    typename std::vector<Gate>::const_iterator gend,
-    const ChannelFactory& channel_factory) {
-  NoisyCircuit<Gate> ncircuit;
-
-  ncircuit.num_qubits = num_qubits;
-  ncircuit.channels.reserve(4 * std::size_t(gend - gbeg));
-
-  for (auto it = gbeg; it != gend; ++it) {
-    const auto& gate = *it;
-
-    ncircuit.channels.push_back(MakeChannelFromGate(2 * gate.time, gate));
-
-    for (auto q : gate.qubits) {
-      ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q));
-    }
-
-    for (auto q : gate.controlled_by) {
-      ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q));
-    }
-  }
-
-  return ncircuit;
-}
-
-/**
- * Makes a noisy circuit from the clean circuit.
- * Channels are added after each qubit of each gate of the clean cicuit.
- * Roughly equivalent to cirq.Circuit.with_noise.
- * @param num_qubits The number of circuit qubits.
- * @param gates The circuit gates.
- * @param A channel factory to construct channels.
- * @return The output noisy circuit.
- */
-template <typename Gate, typename ChannelFactory>
-inline NoisyCircuit<Gate> MakeNoisy(unsigned num_qubits,
-                                    const std::vector<Gate>& gates,
-                                    const ChannelFactory& channel_factory) {
-  return
-      MakeNoisy<Gate>(num_qubits, gates.begin(), gates.end(), channel_factory);
-}
-
-/**
- * Makes a noisy circuit from the clean circuit.
- * Channels are added after each qubit of each gate of the clean cicuit.
- * Roughly equivalent to cirq.Circuit.with_noise.
- * @param circuit The input cicuit.
- * @param A channel factory to construct channels.
- * @return The output noisy circuit.
- */
-template <typename Gate, typename ChannelFactory>
-inline NoisyCircuit<Gate> MakeNoisy(const Circuit<Gate>& circuit,
-                                    const ChannelFactory& channel_factory) {
-  return MakeNoisy<Gate>(circuit.num_qubits, circuit.gates.begin(),
-                         circuit.gates.end(), channel_factory);
-}
-
-}  // namespace qsim
-
-#endif  // CIRCUIT_NOISY_H_
diff --git a/qsim/circuit_qsim_parser.h b/qsim/circuit_qsim_parser.h
deleted file mode 100644
index de7bd89..0000000
--- a/qsim/circuit_qsim_parser.h
+++ /dev/null
@@ -1,442 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CIRCUIT_QSIM_PARSER_H_
-#define CIRCUIT_QSIM_PARSER_H_
-
-#include <algorithm>
-#include <cctype>
-#include <string>
-#include <sstream>
-#include <vector>
-
-#include "circuit.h"
-#include "gates_qsim.h"
-
-namespace qsim {
-
-/**
- * Parser for the (deprecated) qsim <a href="https://github.com/quantumlib/qsim/blob/master/docs/input_format.md">file input format</a>.
- * The primary supported interface for designing circuits to simulate with qsim
- * is <a href="https://github.com/quantumlib/Cirq">Cirq</a>, which relies on
- * the Python-based qsimcirq interface. For C++ applications, Cirq gates can be
- * explicitly constructed in code.
- */
-template <typename IO>
-class CircuitQsimParser final {
- public:
-  /**
-   * Parses the given input stream into a Circuit object, following the rules
-   * defined in "docs/input_format.md".
-   * @param maxtime Maximum gate "time" to read operations for (inclusive).
-   * @param provider Circuit source; only used for error reporting.
-   * @param fs The stream to read the circuit from.
-   * @param circuit Output circuit object. If parsing is successful, this will
-   *   contain the circuit defined in 'fs'.
-   * @return True if parsing succeeds; false otherwise.
-   */
-  template <typename Stream, typename fp_type>
-  static bool FromStream(unsigned maxtime, const std::string& provider,
-                         Stream& fs, Circuit<GateQSim<fp_type>>& circuit) {
-    circuit.num_qubits = 0;
-
-    circuit.gates.resize(0);
-    circuit.gates.reserve(1024);
-
-    unsigned k = 0;
-
-    std::string line;
-    line.reserve(128);
-
-    unsigned time;
-    std::string gate_name;
-    gate_name.reserve(16);
-
-    unsigned max_time = 0;
-    unsigned prev_mea_time = 0;
-
-    std::vector<unsigned> last_times;
-
-    while (std::getline(fs, line)) {
-      ++k;
-
-      if (line.size() == 0 || line[0] == '#') continue;
-
-      std::stringstream ss(line);
-
-      if (circuit.num_qubits == 0) {
-        ss >> circuit.num_qubits;
-        if (circuit.num_qubits == 0) {
-          IO::errorf("invalid number of qubits in %s in line %u.\n",
-                     provider.c_str(), k);
-          return false;
-        }
-
-        last_times.resize(circuit.num_qubits, unsigned(-1));
-
-        continue;
-      }
-
-      ss >> time >> gate_name;
-
-      if (!ss) {
-        InvalidGateError(provider, k);
-        return false;
-      }
-
-      if (time > maxtime) {
-        break;
-      }
-
-      if (gate_name == "c") {
-        if (!ParseControlledGate<fp_type>(ss, time,
-                                          circuit.num_qubits, circuit.gates)) {
-          InvalidGateError(provider, k);
-          return false;
-        }
-      } else if (!ParseGate<fp_type>(ss, time, circuit.num_qubits,
-                                     gate_name, circuit.gates)) {
-        InvalidGateError(provider, k);
-        return false;
-      }
-
-      const auto& gate = circuit.gates.back();
-
-      if (time < prev_mea_time
-          || (gate.kind == gate::kMeasurement && time < max_time)) {
-        IO::errorf("gate crosses the time boundary set by measurement "
-                   "gates in line %u in %s.\n", k, provider.c_str());
-        return false;
-      }
-
-      if (gate.kind == gate::kMeasurement) {
-        prev_mea_time = time;
-      }
-
-      if (GateIsOutOfOrder(time, gate.qubits, last_times)
-          || GateIsOutOfOrder(time, gate.controlled_by, last_times)) {
-        IO::errorf("gate is out of time order in line %u in %s.\n",
-                   k, provider.c_str());
-        return false;
-      }
-
-      if (time > max_time) {
-        max_time = time;
-      }
-    }
-
-    return true;
-  }
-
-  /**
-   * Parses the given file into a Circuit object, following the rules defined
-   * in "docs/input_format.md".
-   * @param maxtime Maximum gate "time" to read operations for (inclusive).
-   * @param file The name of the file to read the circuit from.
-   * @param circuit Output circuit object. If parsing is successful, this will
-   *   contain the circuit defined in 'file'.
-   * @return True if parsing succeeds; false otherwise.
-   */
-  template <typename fp_type>
-  static bool FromFile(unsigned maxtime, const std::string& file,
-                       Circuit<GateQSim<fp_type>>& circuit) {
-    auto fs = IO::StreamFromFile(file);
-
-    if (!fs) {
-      return false;
-    } else {
-      bool rc = FromStream(maxtime, file, fs, circuit);
-      IO::CloseStream(fs);
-      return rc;
-    }
-  }
-
- private:
-  static void InvalidGateError(const std::string& provider, unsigned line) {
-    IO::errorf("invalid gate in %s in line %u.\n", provider.c_str(), line);
-  }
-
-  /**
-   * Checks formatting for a zero-qubit gate parsed from 'ss'.
-   * @param ss Input stream containing the gate specification.
-   */
-  static bool ValidateGate(std::stringstream& ss) {
-    return ss && ss.peek() == std::stringstream::traits_type::eof();
-  }
-
-  /**
-   * Checks formatting for a single-qubit gate parsed from 'ss'.
-   * @param ss Input stream containing the gate specification.
-   * @param num_qubits Number of qubits, as defined at the start of the file.
-   * @param q0 Index of the affected qubit.
-   */
-  static bool ValidateGate(std::stringstream& ss,
-                           unsigned num_qubits, unsigned q0) {
-    return ss && ss.peek() == std::stringstream::traits_type::eof()
-        && q0 < num_qubits;
-  }
-
-  /**
-   * Checks formatting for a two-qubit gate parsed from 'ss'.
-   * @param ss Input stream containing the gate specification.
-   * @param num_qubits Number of qubits, as defined at the start of the file.
-   * @param q0 Index of the first affected qubit.
-   * @param q1 Index of the second affected qubit.
-   */
-  static bool ValidateGate(std::stringstream& ss,
-                           unsigned num_qubits, unsigned q0, unsigned q1) {
-    return ss && ss.peek() == std::stringstream::traits_type::eof()
-        && q0 < num_qubits && q1 < num_qubits && q0 != q1;
-  }
-
-  /**
-   * Checks formatting for a multiqubit gate parsed from 'ss'.
-   * @param ss Input stream containing the gate specification.
-   * @param num_qubits Number of qubits, as defined at the start of the file.
-   * @param qubits Indices of affected qubits.
-   */
-  static bool ValidateGate(std::stringstream& ss, unsigned num_qubits,
-                           const std::vector<unsigned>& qubits) {
-    return ss && ValidateQubits(num_qubits, qubits);
-  }
-
-  static bool ValidateControlledGate(
-      unsigned num_qubits, const std::vector<unsigned>& qubits,
-      const std::vector<unsigned>& controlled_by) {
-    if (!ValidateQubits(num_qubits, controlled_by)) return false;
-
-    std::size_t i = 0, j = 0;
-
-    while (i < qubits.size() && j < controlled_by.size()) {
-      if (qubits[i] == controlled_by[j]) {
-        return false;
-      } else if (qubits[i] < controlled_by[j]) {
-        ++i;
-      } else {
-        ++j;
-      }
-    }
-
-    return true;
-  }
-
-  static bool ValidateQubits(unsigned num_qubits,
-                             const std::vector<unsigned>& qubits) {
-    if (qubits.size() == 0 || qubits[0] >= num_qubits) return false;
-
-    // qubits should be sorted.
-
-    for (std::size_t i = 1; i < qubits.size(); ++i) {
-      if (qubits[i] >= num_qubits || qubits[i] == qubits[i - 1]) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-
-  static bool GateIsOutOfOrder(unsigned time,
-                               const std::vector<unsigned>& qubits,
-                               std::vector<unsigned>& last_times) {
-    for (auto q : qubits) {
-      if (last_times[q] != unsigned(-1) && time <= last_times[q]) {
-        return true;
-      }
-
-      last_times[q] = time;
-    }
-
-    return false;
-  }
-
-  template <typename fp_type, typename Stream, typename Gate>
-  static bool ParseGate(Stream& ss, unsigned time, unsigned num_qubits,
-                        const std::string& gate_name,
-                        std::vector<Gate>& gates) {
-    unsigned q0, q1;
-    fp_type phi, theta;
-
-    if (gate_name == "p") {
-      ss >> phi;
-      if (!ValidateGate(ss)) return false;
-      gates.push_back(GateGPh<fp_type>::Create(time, phi));
-    } else if (gate_name == "id1") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateId1<fp_type>::Create(time, q0));
-    } else if (gate_name == "h") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateHd<fp_type>::Create(time, q0));
-    } else if (gate_name == "t") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateT<fp_type>::Create(time, q0));
-    } else if (gate_name == "x") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateX<fp_type>::Create(time, q0));
-    } else if (gate_name == "y") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateY<fp_type>::Create(time, q0));
-    } else if (gate_name == "z") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateZ<fp_type>::Create(time, q0));
-    } else if (gate_name == "x_1_2") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateX2<fp_type>::Create(time, q0));
-    } else if (gate_name == "y_1_2") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateY2<fp_type>::Create(time, q0));
-    } else if (gate_name == "rx") {
-      ss >> q0 >> phi;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateRX<fp_type>::Create(time, q0, phi));
-    } else if (gate_name == "ry") {
-      ss >> q0 >> phi;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateRY<fp_type>::Create(time, q0, phi));
-    } else if (gate_name == "rz") {
-      ss >> q0 >> phi;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateRZ<fp_type>::Create(time, q0, phi));
-    } else if (gate_name == "rxy") {
-      ss >> q0 >> theta >> phi;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateRXY<fp_type>::Create(time, q0, theta, phi));
-    } else if (gate_name == "hz_1_2") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateHZ2<fp_type>::Create(time, q0));
-    } else if (gate_name == "s") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateS<fp_type>::Create(time, q0));
-    } else if (gate_name == "id2") {
-      ss >> q0 >> q1;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateId2<fp_type>::Create(time, q0, q1));
-    } else if (gate_name == "cz") {
-      ss >> q0 >> q1;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateCZ<fp_type>::Create(time, q0, q1));
-    } else if (gate_name == "cnot" || gate_name == "cx") {
-      ss >> q0 >> q1;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateCNot<fp_type>::Create(time, q0, q1));
-    } else if (gate_name == "sw") {
-      ss >> q0 >> q1;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateSwap<fp_type>::Create(time, q0, q1));
-    } else if (gate_name == "is") {
-      ss >> q0 >> q1;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateIS<fp_type>::Create(time, q0, q1));
-    } else if (gate_name == "fs") {
-      ss >> q0 >> q1 >> theta >> phi;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateFS<fp_type>::Create(time, q0, q1, theta, phi));
-    } else if (gate_name == "cp") {
-      ss >> q0 >> q1 >> phi;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateCP<fp_type>::Create(time, q0, q1, phi));
-    } else if (gate_name == "m") {
-      std::vector<unsigned> qubits;
-      qubits.reserve(num_qubits);
-
-      while (ss.good()) {
-        ss >> q0;
-        if (ss) {
-          qubits.push_back(q0);
-        } else {
-          return false;
-        }
-      }
-
-      gates.push_back(gate::Measurement<GateQSim<fp_type>>::Create(
-          time, std::move(qubits)));
-
-      if (!ValidateQubits(num_qubits, gates.back().qubits)) return false;
-    } else {
-      return false;
-    }
-
-    return true;
-  }
-
-  template <typename fp_type, typename Stream, typename Gate>
-  static bool ParseControlledGate(Stream& ss, unsigned time,
-                                  unsigned num_qubits,
-                                  std::vector<Gate>& gates) {
-    std::vector<unsigned> controlled_by;
-    controlled_by.reserve(64);
-
-    std::string gate_name;
-    gate_name.reserve(16);
-
-    while (1) {
-      while (ss.good()) {
-        if (!std::isblank(ss.get())) {
-          ss.unget();
-          break;
-        }
-      }
-
-      if (!ss.good()) {
-        return false;
-      }
-
-      if (!std::isdigit(ss.peek())) {
-        break;
-      } else {
-        unsigned q;
-        ss >> q;
-
-        if (!ss.good() || !std::isblank(ss.get())) {
-          return false;
-        }
-
-        controlled_by.push_back(q);
-      }
-    }
-
-    if (controlled_by.size() == 0) {
-      return false;
-    }
-
-    ss >> gate_name;
-
-    if (!ss.good() || !ParseGate<fp_type>(ss, time,
-                                          num_qubits, gate_name, gates)) {
-      return false;
-    }
-
-    gates.back().ControlledBy(std::move(controlled_by));
-
-    if (!ValidateControlledGate(num_qubits, gates.back().qubits,
-                                gates.back().controlled_by)) {
-      return false;
-    }
-
-    return true;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // CIRCUIT_QSIM_PARSER_H_
diff --git a/qsim/cuda2hip.h b/qsim/cuda2hip.h
deleted file mode 100644
index da2d074..0000000
--- a/qsim/cuda2hip.h
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2023 Advanced Micro Devices, Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_CUDA2HIP_H_
-#define SIMULATOR_CUDA2HIP_H_
-
-#define cublasCaxpy              hipblasCaxpy
-#define cublasCdotc              hipblasCdotc
-#define cublasCreate             hipblasCreate
-#define cublasCscal              hipblasCscal
-#define cublasCsscal             hipblasCsscal
-#define cublasDestroy            hipblasDestroy
-#define cublasDznrm2             hipblasDznrm2
-#define cublasHandle_t           hipblasHandle_t
-#define cublasScnrm2             hipblasScnrm2
-#define CUBLAS_STATUS_SUCCESS    HIPBLAS_STATUS_SUCCESS
-#define cublasStatus_t           hipblasStatus_t
-#define cublasZaxpy              hipblasZaxpy
-#define cublasZdotc              hipblasZdotc
-#define cublasZdscal             hipblasZdscal
-#define cublasZscal              hipblasZscal
-#define cuCimagf                 hipCimagf
-#define cuCimag                  hipCimag
-#define cuComplex                hipComplex
-#define cuCrealf                 hipCrealf
-#define cuCreal                  hipCreal
-#define CUDA_C_32F               HIPBLAS_C_32F
-#define CUDA_C_64F               HIPBLAS_C_64F
-#define cudaDeviceSynchronize    hipDeviceSynchronize
-#define cudaError_t              hipError_t
-#define cudaFree                 hipFree
-#define cudaGetErrorString       hipGetErrorString
-#define cudaMalloc               hipMalloc
-#define cudaMemcpyAsync          hipMemcpyAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost   hipMemcpyDeviceToHost
-#define cudaMemcpy               hipMemcpy
-#define cudaMemcpyHostToDevice   hipMemcpyHostToDevice
-#define cudaMemset               hipMemset
-#define cudaPeekAtLastError      hipPeekAtLastError
-#define cudaSuccess              hipSuccess
-#define cuDoubleComplex          hipDoubleComplex
-
-template <typename T>
-__device__ __forceinline__ T __shfl_down_sync(
-    unsigned mask, T var, unsigned int delta, int width = warpSize) {
-  return __shfl_down(var, delta, width);
-}
-
-#endif  // SIMULATOR_CUDA2HIP_H_
diff --git a/qsim/expect.h b/qsim/expect.h
deleted file mode 100644
index 518d516..0000000
--- a/qsim/expect.h
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef EXPECT_H_
-#define EXPECT_H_
-
-#include <complex>
-
-#include "fuser.h"
-#include "gate_appl.h"
-
-namespace qsim {
-
-template <typename Gate>
-struct OpString {
-  std::complex<double> weight;
-  std::vector<Gate> ops;
-};
-
-/**
- * Computes the expectation value of the sum of operator strings (operator
- * sequences). Operators can act on any qubits and they can be any supported
- * gates. This function uses a temporary state vector.
- * @param param Options for gate fusion.
- * @param strings Operator strings.
- * @param ss StateSpace object required to copy the state vector and compute
- *   inner products.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param state The state vector of the system.
- * @param ket Temporary state vector.
- * @return The computed expectation value.
- */
-template <typename IO, typename Fuser, typename Gate, typename Simulator>
-std::complex<double> ExpectationValue(
-    const typename Fuser::Parameter& param,
-    const std::vector<OpString<Gate>>& strings,
-    const typename Simulator::StateSpace& state_space,
-    const Simulator& simulator, const typename Simulator::State& state,
-    typename Simulator::State& ket) {
-  std::complex<double> eval = 0;
-
-  if (state_space.IsNull(ket) || ket.num_qubits() < state.num_qubits()) {
-    ket = state_space.Create(state.num_qubits());
-    if (state_space.IsNull(ket)) {
-      IO::errorf("not enough memory: is the number of qubits too large?\n");
-      return eval;
-    }
-  }
-
-  for (const auto& str : strings) {
-    if (str.ops.size() == 0) {
-      eval += str.weight;
-      continue;
-    }
-
-    state_space.Copy(state, ket);
-
-    if (str.ops.size() == 1) {
-      const auto& op = str.ops[0];
-      simulator.ApplyGate(op.qubits, op.matrix.data(), ket);
-    } else {
-      auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops);
-      if (fused_gates.size() == 0) {
-        eval = 0;
-        break;
-      }
-
-      for (const auto& fgate : fused_gates) {
-        ApplyFusedGate(simulator, fgate, ket);
-      }
-    }
-
-    eval += str.weight * state_space.InnerProduct(state, ket);
-  }
-
-  return eval;
-}
-
-/**
- * Computes the expectation value of the sum of operator strings (operator
- * sequences). Operators can act on any qubits and they can be any supported
- * gates except for user-defined controlled gates. Computation is performed
- * in place. No additional memory is allocated. The operator strings should
- * act on no more than six qubits and they should be fusible into one gate.
- * @param strings Operator strings.
- * @param simulator Simulator object. Provides specific implementations for
- *   computing expectation values.
- * @param state The state of the system.
- * @return The computed expectation value.
- */
-template <typename IO, typename Fuser, typename Gate, typename Simulator>
-std::complex<double> ExpectationValue(
-    const std::vector<OpString<Gate>>& strings,
-    const Simulator& simulator, const typename Simulator::State& state) {
-  std::complex<double> eval = 0;
-
-  typename Fuser::Parameter param;
-  param.max_fused_size = 6;
-  for (const auto& str : strings) {
-    if (str.ops.size() == 0) {
-      eval += str.weight;
-    } else if (str.ops.size() == 1) {
-      const auto& op = str.ops[0];
-      auto r = simulator.ExpectationValue(op.qubits, op.matrix.data(), state);
-      eval += str.weight * r;
-    } else {
-      auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops);
-
-      if (fused_gates.size() != 1) {
-        IO::errorf("too many fused gates; "
-                   "cannot compute the expectation value.\n");
-        eval = 0;
-        break;
-      }
-
-      const auto& fgate = fused_gates[0];
-
-      if (fgate.qubits.size() > 6) {
-        IO::errorf("operator string acts on too many qubits; "
-                   "cannot compute the expectation value.\n");
-        eval = 0;
-        break;
-      }
-
-      auto r = simulator.ExpectationValue(
-          fgate.qubits, fgate.matrix.data(), state);
-      eval += str.weight * r;
-    }
-  }
-
-  return eval;
-}
-
-}  // namespace qsim
-
-#endif  // EXPECT_H_
diff --git a/qsim/formux.h b/qsim/formux.h
deleted file mode 100644
index 4401e9b..0000000
--- a/qsim/formux.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FORMUX_H_
-#define FORMUX_H_
-
-#ifdef _OPENMP
-# include "parfor.h"
-  namespace qsim {
-    using For = ParallelFor;
-  }
-#else
-# include "seqfor.h"
-  namespace qsim {
-    using For = SequentialFor;
-  }
-#endif
-
-#endif  // FORMUX_H_
diff --git a/qsim/fuser.h b/qsim/fuser.h
deleted file mode 100644
index e4f3c3b..0000000
--- a/qsim/fuser.h
+++ /dev/null
@@ -1,225 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSER_H_
-#define FUSER_H_
-
-#include <cstdint>
-#include <vector>
-
-#include "gate.h"
-#include "matrix.h"
-
-namespace qsim {
-
-/**
- * A collection of "fused" gates which can be multiplied together before being
- * applied to the state vector.
- */
-template <typename Gate>
-struct GateFused {
-  /**
-   * Kind of the first ("parent") gate.
-   */
-  typename Gate::GateKind kind;
-  /**
-   * The time index of the first ("parent") gate.
-   */
-  unsigned time;
-  /**
-   * A list of qubits these gates act upon. Control qubits for
-   * explicitly-controlled gates are excluded from this list.
-   */
-  std::vector<unsigned> qubits;
-  /**
-   * Pointer to the first ("parent") gate.
-   */
-  const Gate* parent;
-  /**
-   * Ordered list of component gates.
-   */
-  std::vector<const Gate*> gates;
-  /**
-   * Fused gate matrix.
-   */
-  Matrix<typename Gate::fp_type> matrix;
-};
-
-/**
- * A base class for fuser classes with some common functions.
- */
-template <typename IO, typename Gate>
-class Fuser {
- protected:
-  using RGate = typename std::remove_pointer<Gate>::type;
-
-  static const RGate& GateToConstRef(const RGate& gate) {
-    return gate;
-  }
-
-  static const RGate& GateToConstRef(const RGate* gate) {
-    return *gate;
-  }
-
-  static std::vector<unsigned> MergeWithMeasurementTimes(
-      typename std::vector<Gate>::const_iterator gfirst,
-      typename std::vector<Gate>::const_iterator glast,
-      const std::vector<unsigned>& times) {
-    std::vector<unsigned> epochs;
-    epochs.reserve(glast - gfirst + times.size());
-
-    std::size_t last = 0;
-    unsigned max_time = 0;
-
-    for (auto gate_it = gfirst; gate_it < glast; ++gate_it) {
-      const auto& gate = GateToConstRef(*gate_it);
-
-      if (gate.time > max_time) {
-        max_time = gate.time;
-      }
-
-      if (epochs.size() > 0 && gate.time < epochs.back()) {
-        IO::errorf("gate crosses the time boundary.\n");
-        epochs.resize(0);
-        return epochs;
-      }
-
-      if (gate.kind == gate::kMeasurement) {
-        if (epochs.size() == 0 || epochs.back() < gate.time) {
-          if (!AddBoundary(gate.time, max_time, epochs)) {
-            epochs.resize(0);
-            return epochs;
-          }
-        }
-      }
-
-      while (last < times.size() && times[last] <= gate.time) {
-        unsigned prev = times[last++];
-        epochs.push_back(prev);
-        if (!AddBoundary(prev, max_time, epochs)) {
-          epochs.resize(0);
-          return epochs;
-        }
-        while (last < times.size() && times[last] <= prev) ++last;
-      }
-    }
-
-    if (epochs.size() == 0 || epochs.back() < max_time) {
-      epochs.push_back(max_time);
-    }
-
-    return epochs;
-  }
-
-  template <typename GateSeq0, typename Parent, typename GateFused>
-  static void FuseZeroQubitGates(const GateSeq0& gate_seq0,
-                                 Parent parent, std::size_t first,
-                                 std::vector<GateFused>& fused_gates) {
-    GateFused* fuse_to = nullptr;
-
-    for (std::size_t i = first; i < fused_gates.size(); ++i) {
-      auto& fgate = fused_gates[i];
-
-      if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp
-          && fgate.parent->controlled_by.size() == 0
-          && !fgate.parent->unfusible) {
-        fuse_to = &fgate;
-        break;
-      }
-    }
-
-    if (fuse_to != nullptr) {
-      // Fuse zero-qubit gates with the first available fused gate.
-      for (const auto& g : gate_seq0) {
-        fuse_to->gates.push_back(parent(g));
-      }
-    } else {
-      auto g0 = parent(gate_seq0[0]);
-      fused_gates.push_back({g0->kind, g0->time, {}, g0, {g0}, {}});
-
-      for (std::size_t i = 1; i < gate_seq0.size(); ++i) {
-        fused_gates.back().gates.push_back(parent(gate_seq0[i]));
-      }
-    }
-  }
-
- private:
-  static bool AddBoundary(unsigned time, unsigned max_time,
-                          std::vector<unsigned>& boundaries) {
-    if (max_time > time) {
-      IO::errorf("gate crosses the time boundary.\n");
-      return false;
-    }
-
-    boundaries.push_back(time);
-    return true;
-  }
-};
-
-/**
- * Multiplies component gate matrices of a fused gate.
- * @param gate Fused gate.
- */
-template <typename FusedGate>
-inline void CalculateFusedMatrix(FusedGate& gate) {
-  MatrixIdentity(unsigned{1} << gate.qubits.size(), gate.matrix);
-
-  for (auto pgate : gate.gates) {
-    if (pgate->qubits.size() == 0) {
-      MatrixScalarMultiply(pgate->matrix[0], pgate->matrix[1], gate.matrix);
-    } else if (gate.qubits.size() == pgate->qubits.size()) {
-      MatrixMultiply(gate.qubits.size(), pgate->matrix, gate.matrix);
-    } else {
-      unsigned mask = 0;
-
-      for (auto q : pgate->qubits) {
-        for (std::size_t i = 0; i < gate.qubits.size(); ++i) {
-          if (q == gate.qubits[i]) {
-            mask |= unsigned{1} << i;
-            break;
-          }
-        }
-      }
-
-      MatrixMultiply(mask, pgate->qubits.size(), pgate->matrix,
-                     gate.qubits.size(), gate.matrix);
-    }
-  }
-}
-
-/**
- * Multiplies component gate matrices for a range of fused gates.
- * @param gbeg, gend The iterator range [gbeg, gend) of fused gates.
- */
-template <typename Iterator>
-inline void CalculateFusedMatrices(Iterator gbeg, Iterator gend) {
-  for (auto g = gbeg; g != gend; ++g) {
-    if (g->kind != gate::kMeasurement) {
-      CalculateFusedMatrix(*g);
-    }
-  }
-}
-
-/**
- * Multiplies component gate matrices for a vector of fused gates.
- * @param gates The vector of fused gates.
- */
-template <typename FusedGate>
-inline void CalculateFusedMatrices(std::vector<FusedGate>& gates) {
-  CalculateFusedMatrices(gates.begin(), gates.end());
-}
-
-}  // namespace qsim
-
-#endif  // FUSER_H_
diff --git a/qsim/fuser_basic.h b/qsim/fuser_basic.h
deleted file mode 100644
index 3191bd2..0000000
--- a/qsim/fuser_basic.h
+++ /dev/null
@@ -1,411 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSER_BASIC_H_
-#define FUSER_BASIC_H_
-
-#include <map>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "gate.h"
-#include "fuser.h"
-
-namespace qsim {
-
-/**
- * Stateless object with methods for aggregating `Gate`s into `GateFused`.
- * Measurement gates with equal times are fused together.
- * User-defined controlled gates (controlled_by.size() > 0) and gates acting on
- * more than two qubits are not fused.
- * The template parameter Gate can be Gate type or a pointer to Gate type.
- * This class is deprecated. It is recommended to use MultiQubitGateFuser
- * from fuser_mqubit.h.
- */
-template <typename IO, typename Gate>
-class BasicGateFuser final : public Fuser<IO, Gate> {
- private:
-  using Base = Fuser<IO, Gate>;
-  using RGate = typename Base::RGate;
-
- public:
-  using GateFused = qsim::GateFused<RGate>;
-
-  /**
-   * User-specified parameters for gate fusion.
-   * BasicGateFuser does not use any parameters.
-   */
-  struct Parameter {
-    unsigned verbosity = 0;
-  };
-
-  /**
-   * Stores sets of gates that can be applied together. Only one- and
-   * two-qubit gates will get fused. To respect specific time boundaries while
-   * fusing gates, use the other version of this method below.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gates The gates (or pointers to the gates) to be fused.
-   *   Gate times of the gates that act on the same qubits should be ordered.
-   *   Gates that are out of time order should not cross the time boundaries
-   *   set by measurement gates.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(const Parameter& param,
-                                          unsigned max_qubit1,
-                                          const std::vector<Gate>& gates,
-                                          bool fuse_matrix = true) {
-    return FuseGates(
-        param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together. Only one- and
-   * two-qubit gates will get fused.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gates The gates (or pointers to the gates) to be fused.
-   *   Gate times of the gates that act on the same qubits should be ordered.
-   *   Gates that are out of time order should not cross the time boundaries
-   *   set by `times_to_split_at` or by measurement gates.
-   * @param times_to_split_at Ordered list of time steps (boundaries) at which
-   *   to separate fused gates. Each element of the output will contain gates
-   *   from a single 'window' in this list.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param,
-      unsigned max_qubit1, const std::vector<Gate>& gates,
-      const std::vector<unsigned>& times_to_split_at,
-      bool fuse_matrix = true) {
-    return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(),
-                     times_to_split_at, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together. Only one- and
-   * two-qubit gates will get fused. To respect specific time boundaries while
-   * fusing gates, use the other version of this method below.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
-   *   (or pointers to gates) in. Gate times of the gates that act on the same
-   *   qubits should be ordered. Gates that are out of time order should not
-   *   cross the time boundaries set by measurement gates.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param, unsigned max_qubit1,
-      typename std::vector<Gate>::const_iterator gfirst,
-      typename std::vector<Gate>::const_iterator glast,
-      bool fuse_matrix = true) {
-    return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together. Only one- and
-   * two-qubit gates will get fused.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
-   *   (or pointers to gates) in. Gate times of the gates that act on the same
-   *   qubits should be ordered. Gates that are out of time order should not
-   *   cross the time boundaries set by `times_to_split_at` or by measurement
-   *   gates.
-   * @param times_to_split_at Ordered list of time steps (boundaries) at which
-   *   to separate fused gates. Each element of the output will contain gates
-   *   from a single 'window' in this list.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param, unsigned max_qubit1,
-      typename std::vector<Gate>::const_iterator gfirst,
-      typename std::vector<Gate>::const_iterator glast,
-      const std::vector<unsigned>& times_to_split_at,
-      bool fuse_matrix = true) {
-    std::vector<GateFused> gates_fused;
-
-    if (gfirst >= glast) return gates_fused;
-
-    std::size_t num_gates = glast - gfirst;
-
-    gates_fused.reserve(num_gates);
-
-    // Merge with measurement gate times to separate fused gates at.
-    auto times =
-        Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at);
-
-    // Map to keep track of measurement gates with equal times.
-    std::map<unsigned, std::vector<const RGate*>> measurement_gates;
-
-    // Sequence of top level gates the other gates get fused to.
-    std::vector<const RGate*> gates_seq;
-
-    // Sequence of zero-qubit gates.
-    std::vector<const RGate*> gates_seq0;
-
-    // Lattice of gates: qubits "hyperplane" and time direction.
-    std::vector<std::vector<const RGate*>> gates_lat(max_qubit1);
-
-    // Current unfused gate.
-    auto gate_it = gfirst;
-
-    std::size_t last_fused_gate_index = 0;
-
-    for (std::size_t l = 0; l < times.size(); ++l) {
-      gates_seq.resize(0);
-      gates_seq.reserve(num_gates);
-
-      gates_seq0.resize(0);
-      gates_seq0.reserve(num_gates);
-
-      for (unsigned k = 0; k < max_qubit1; ++k) {
-        gates_lat[k].resize(0);
-        gates_lat[k].reserve(128);
-      }
-
-      // Fill gates_seq and gates_lat in.
-      for (; gate_it < glast; ++gate_it) {
-        const auto& gate = Base::GateToConstRef(*gate_it);
-
-        if (gate.time > times[l]) break;
-
-        if (!ValidateGate(gate, max_qubit1, gates_lat)) {
-          gates_fused.resize(0);
-          return gates_fused;
-        }
-
-        if (gate.kind == gate::kMeasurement) {
-          auto& mea_gates_at_time = measurement_gates[gate.time];
-          if (mea_gates_at_time.size() == 0) {
-            gates_seq.push_back(&gate);
-            mea_gates_at_time.reserve(max_qubit1);
-          }
-
-          mea_gates_at_time.push_back(&gate);
-        } else if (gate.controlled_by.size() > 0 || gate.qubits.size() > 2) {
-          for (auto q : gate.qubits) {
-            gates_lat[q].push_back(&gate);
-          }
-          for (auto q : gate.controlled_by) {
-            gates_lat[q].push_back(&gate);
-          }
-          gates_seq.push_back(&gate);
-        } else if (gate.qubits.size() == 1) {
-          gates_lat[gate.qubits[0]].push_back(&gate);
-          if (gate.unfusible) {
-            gates_seq.push_back(&gate);
-          }
-        } else if (gate.qubits.size() == 2) {
-          gates_lat[gate.qubits[0]].push_back(&gate);
-          gates_lat[gate.qubits[1]].push_back(&gate);
-          gates_seq.push_back(&gate);
-        } else {
-          gates_seq0.push_back(&gate);
-        }
-      }
-
-      std::vector<unsigned> last(max_qubit1, 0);
-
-      const RGate* delayed_measurement_gate = nullptr;
-
-      // Fuse gates.
-      for (auto pgate : gates_seq) {
-        if (pgate->kind == gate::kMeasurement) {
-          delayed_measurement_gate = pgate;
-        } else if (pgate->qubits.size() > 2
-                   || pgate->controlled_by.size() > 0) {
-          // Multi-qubit or controlled gate.
-
-          for (auto q : pgate->qubits) {
-            unsigned l = last[q];
-            if (gates_lat[q][l] != pgate) {
-              last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused);
-            }
-            ++last[q];
-          }
-
-          for (auto q : pgate->controlled_by) {
-            unsigned l = last[q];
-            if (gates_lat[q][l] != pgate) {
-              last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused);
-            }
-            ++last[q];
-          }
-
-          gates_fused.push_back({pgate->kind, pgate->time, pgate->qubits,
-                                 pgate, {pgate}, {}});
-        } else if (pgate->qubits.size() == 1) {
-          unsigned q0 = pgate->qubits[0];
-
-          GateFused gate_f = {pgate->kind, pgate->time, {q0}, pgate, {}, {}};
-
-          last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates);
-          gate_f.gates.push_back(gates_lat[q0][last[q0]]);
-          last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates);
-
-          gates_fused.push_back(std::move(gate_f));
-        } else if (pgate->qubits.size() == 2) {
-          unsigned q0 = pgate->qubits[0];
-          unsigned q1 = pgate->qubits[1];
-
-          if (Done(last[q0], pgate->time, gates_lat[q0])) continue;
-
-          GateFused gate_f =
-              {pgate->kind, pgate->time, {q0, q1}, pgate, {}, {}};
-
-          do {
-            last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates);
-            last[q1] = Advance(last[q1], gates_lat[q1], gate_f.gates);
-            // Here gates_lat[q0][last[q0]] == gates_lat[q1][last[q1]].
-
-            gate_f.gates.push_back(gates_lat[q0][last[q0]]);
-
-            last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates);
-            last[q1] = Advance(last[q1] + 1, gates_lat[q1], gate_f.gates);
-          } while (NextGate(last[q0], gates_lat[q0], last[q1], gates_lat[q1]));
-
-          gates_fused.push_back(std::move(gate_f));
-        }
-      }
-
-      for (unsigned q = 0; q < max_qubit1; ++q) {
-        auto l = last[q];
-        if (l == gates_lat[q].size()) continue;
-
-        // Orphaned qubit.
-        AddOrphanedQubit(q, l, gates_lat, gates_fused);
-      }
-
-      if (delayed_measurement_gate != nullptr) {
-        auto pgate = delayed_measurement_gate;
-
-        const auto& mea_gates_at_time = measurement_gates[pgate->time];
-
-        GateFused gate_f = {pgate->kind, pgate->time, {}, pgate, {}, {}};
-        gate_f.gates.reserve(mea_gates_at_time.size());
-
-        // Fuse measurement gates with equal times.
-
-        for (const auto* pgate : mea_gates_at_time) {
-          gate_f.qubits.insert(gate_f.qubits.end(),
-                               pgate->qubits.begin(), pgate->qubits.end());
-          gate_f.gates.push_back(pgate);
-        }
-
-        gates_fused.push_back(std::move(gate_f));
-      }
-
-      if (gates_seq0.size() != 0) {
-        Base::FuseZeroQubitGates(gates_seq0, [](const RGate* g) { return g; },
-                                 last_fused_gate_index, gates_fused);
-      }
-
-      if (gate_it == glast) break;
-
-      last_fused_gate_index = gates_fused.size();
-    }
-
-    if (fuse_matrix) {
-      for (auto& gate_f : gates_fused) {
-        if (gate_f.kind != gate::kMeasurement && gate_f.kind != gate::kDecomp) {
-          CalculateFusedMatrix(gate_f);
-        }
-      }
-    }
-
-    return gates_fused;
-  }
-
- private:
-  static unsigned Advance(unsigned k, const std::vector<const RGate*>& wl,
-                          std::vector<const RGate*>& gates) {
-    while (k < wl.size() && wl[k]->qubits.size() == 1
-           && wl[k]->controlled_by.size() == 0 && !wl[k]->unfusible) {
-      gates.push_back(wl[k++]);
-    }
-
-    return k;
-  }
-
-  static bool Done(
-      unsigned k, unsigned t, const std::vector<const RGate*>& wl) {
-    return k >= wl.size() || wl[k]->time > t;
-  }
-
-  static bool NextGate(unsigned k1, const std::vector<const RGate*>& wl1,
-                       unsigned k2, const std::vector<const RGate*>& wl2) {
-    return k1 < wl1.size() && k2 < wl2.size() && wl1[k1] == wl2[k2]
-        && wl1[k1]->qubits.size() < 3 && wl1[k1]->controlled_by.size() == 0;
-  }
-
-  template <typename GatesLat>
-  static unsigned AddOrphanedQubit(unsigned q, unsigned k,
-                                   const GatesLat& gates_lat,
-                                   std::vector<GateFused>& gates_fused) {
-    auto pgate = gates_lat[q][k];
-
-    GateFused gate_f = {pgate->kind, pgate->time, {q}, pgate, {}, {}};
-    gate_f.gates.push_back(pgate);
-
-    k = Advance(k + 1, gates_lat[q], gate_f.gates);
-
-    gates_fused.push_back(std::move(gate_f));
-
-    return k;
-  }
-
-  template <typename Gate2, typename GatesLat>
-  static bool ValidateGate(const Gate2& gate, unsigned max_qubit1,
-                           const GatesLat& gates_lat) {
-    for (unsigned q : gate.qubits) {
-      if (q >= max_qubit1) {
-        IO::errorf("fuser: gate qubit %u is out of range "
-                   "(should be smaller than %u).\n", q, max_qubit1);
-        return false;
-      }
-      if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) {
-        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
-        return false;
-      }
-    }
-
-    for (unsigned q : gate.controlled_by) {
-      if (q >= max_qubit1) {
-        IO::errorf("fuser: gate qubit %u is out of range "
-                   "(should be smaller than %u).\n", q, max_qubit1);
-        return false;
-      }
-      if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) {
-        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
-        return false;
-      }
-    }
-
-    return true;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // FUSER_BASIC_H_
diff --git a/qsim/fuser_mqubit.h b/qsim/fuser_mqubit.h
deleted file mode 100644
index c75b1a0..0000000
--- a/qsim/fuser_mqubit.h
+++ /dev/null
@@ -1,1095 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSER_MQUBIT_H_
-#define FUSER_MQUBIT_H_
-
-#include <algorithm>
-#include <cstdint>
-#include <limits>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "gate.h"
-#include "fuser.h"
-
-namespace qsim {
-
-/**
- * Multi-qubit gate fuser.
- * Measurement gates with equal times are fused together.
- * User-defined controlled gates (controlled_by.size() > 0) are not fused.
- * The template parameter Gate can be Gate type or a pointer to Gate type.
- */
-template <typename IO, typename Gate>
-class MultiQubitGateFuser final : public Fuser<IO, Gate> {
- private:
-  using Base = Fuser<IO, Gate>;
-  using RGate = typename Base::RGate;
-
-  // Auxillary classes and structs.
-
-  // Manages doubly-linked lists.
-  template <typename T>
-  class LinkManagerT {
-   public:
-    struct Link {
-      T val;
-      Link* next;
-      Link* prev;
-    };
-
-    explicit LinkManagerT(uint64_t size) {
-      links_.reserve(size);
-    }
-
-    Link* AddBack(const T& t, Link* link) {
-      if (link == nullptr) {
-        links_.push_back({t, nullptr, nullptr});
-      } else {
-        links_.push_back({t, link->next, link});
-        link->next = &links_.back();
-      }
-
-      return &links_.back();
-    }
-
-    static void Delete(const Link* link) {
-      if (link->prev != nullptr) {
-        link->prev->next = link->next;
-      }
-      if (link->next != nullptr) {
-        link->next->prev = link->prev;
-      }
-    }
-
-   private:
-    std::vector<Link> links_;
-  };
-
-  struct GateF;
-
-  using LinkManager = LinkManagerT<GateF*>;
-  using Link = typename LinkManager::Link;
-
-  // Intermediate representation of a fused gate.
-  struct GateF {
-    const RGate* parent;
-    std::vector<unsigned> qubits;
-    std::vector<const RGate*> gates;  // Gates that get fused to this gate.
-    std::vector<Link*> links;         // Gate "lattice" links.
-    uint64_t mask;                    // Qubit mask.
-    unsigned visited;
-  };
-
-  // Possible values for visited in GateF.
-  // Note that MakeGateSequence assignes values from kSecond to the number of
-  // gates in the sequence plus one, see below.
-  enum Visited {
-    kZero = 0,             // Start value for "normal" gates.
-    kFirst = 1,            // Value after the first pass for partially fused
-                           // "normal" gates.
-    kSecond = 2,           // Start value to assign values in MakeGateSequence.
-    kCompress = 99999997,  // Used to compress links.
-    kMeaCnt = 99999998,    // Start value for controlled or measurement gates.
-    kFinal = 99999999,     // Value after the second pass for fused "normal"
-                           // gates or for controlled and measurement gates.
-  };
-
-  struct Stat {
-    unsigned num_mea_gates = 0;
-    unsigned num_fused_mea_gates = 0;
-    unsigned num_fused_gates = 0;
-    unsigned num_controlled_gates = 0;
-    std::vector<unsigned> num_gates;
-  };
-
-  // Gate that is added to a sequence of gates to fuse together.
-  struct GateA {
-    GateF* gate;
-    std::vector<unsigned> qubits;  // Added qubits.
-    std::vector<Link*> links;      // Added lattice links.
-  };
-
-  struct Scratch {
-    std::vector<GateA> data;
-    std::vector<GateA*> prev1;
-    std::vector<GateA*> prev2;
-    std::vector<GateA*> next1;
-    std::vector<GateA*> next2;
-    std::vector<GateA*> longest_seq;
-    std::vector<GateA*> stack;
-    std::vector<GateF*> gates;
-    unsigned count = 0;
-  };
-
- public:
-  using GateFused = qsim::GateFused<RGate>;
-
-  /**
-   * User-specified parameters for gate fusion.
-   */
-  struct Parameter {
-    /**
-     * Maximum number of qubits in a fused gate. It can take values from 2 to
-     * 6 (0 and 1 are equivalent to 2). It is not recommended to use 5 or 6 as
-     * that might degrade performance for not very fast machines.
-     */
-    unsigned max_fused_size = 2;
-    unsigned verbosity = 0;
-  };
-
-  /**
-   * Stores sets of gates that can be applied together. To respect specific
-   * time boundaries while fusing gates, use the other version of this method
-   * below.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gates The gates (or pointers to the gates) to be fused.
-   *   Gate times of the gates that act on the same qubits should be ordered.
-   *   Gates that are out of time order should not cross the time boundaries
-   *   set by measurement gates.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(const Parameter& param,
-                                          unsigned max_qubit1,
-                                          const std::vector<Gate>& gates,
-                                          bool fuse_matrix = true) {
-    return FuseGates(
-        param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gates The gates (or pointers to the gates) to be fused.
-   *   Gate times of the gates that act on the same qubits should be ordered.
-   *   Gates that are out of time order should not cross the time boundaries
-   *   set by `times_to_split_at` or by measurement gates.
-   * @param times_to_split_at Ordered list of time steps (boundaries) at which
-   *   to separate fused gates. Each element of the output will contain gates
-   *   from a single 'window' in this list.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param,
-      unsigned max_qubit1, const std::vector<Gate>& gates,
-      const std::vector<unsigned>& times_to_split_at,
-      bool fuse_matrix = true) {
-    return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(),
-                     times_to_split_at, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together. To respect specific
-   * time boundaries while fusing gates, use the other version of this method
-   * below.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
-   *   (or pointers to gates) in. Gate times of the gates that act on the same
-   *   qubits should be ordered. Gates that are out of time order should not
-   *   cross the time boundaries set by measurement gates.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param, unsigned max_qubit1,
-      typename std::vector<Gate>::const_iterator gfirst,
-      typename std::vector<Gate>::const_iterator glast,
-      bool fuse_matrix = true) {
-    return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
-   *   (or pointers to gates) in. Gate times of the gates that act on the same
-   *   qubits should be ordered. Gates that are out of time order should not
-   *   cross the time boundaries set by `times_to_split_at` or by measurement
-   *   gates.
-   * @param times_to_split_at Ordered list of time steps (boundaries) at which
-   *   to separate fused gates. Each element of the output will contain gates
-   *   from a single 'window' in this list.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param, unsigned max_qubit1,
-      typename std::vector<Gate>::const_iterator gfirst,
-      typename std::vector<Gate>::const_iterator glast,
-      const std::vector<unsigned>& times_to_split_at,
-      bool fuse_matrix = true) {
-    std::vector<GateFused> fused_gates;
-
-    if (gfirst >= glast) return fused_gates;
-
-    std::size_t num_gates = glast - gfirst;
-
-    fused_gates.reserve(num_gates);
-
-    // Merge with measurement gate times to separate fused gates at.
-    auto epochs =
-        Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at);
-
-    LinkManager link_manager(max_qubit1 * num_gates);
-
-    // Auxillary data structures.
-    // Sequence of intermediate fused gates.
-    std::vector<GateF> gates_seq;
-    // Gate "lattice".
-    std::vector<Link*> gates_lat;
-    // Sequences of intermediate fused gates ordered by gate size.
-    std::vector<std::vector<GateF*>> fgates(max_qubit1 + 1);
-
-    gates_seq.reserve(num_gates);
-    gates_lat.reserve(max_qubit1);
-
-    Scratch scratch;
-
-    scratch.data.reserve(1024);
-    scratch.prev1.reserve(32);
-    scratch.prev2.reserve(32);
-    scratch.next1.reserve(32);
-    scratch.next2.reserve(32);
-    scratch.longest_seq.reserve(8);
-    scratch.stack.reserve(8);
-
-    Stat stat;
-    stat.num_gates.resize(max_qubit1 + 1, 0);
-
-    unsigned max_fused_size = std::min(unsigned{6}, param.max_fused_size);
-    max_fused_size = std::min(max_fused_size, max_qubit1);
-
-    std::size_t last_fused_gate_index = 0;
-    auto gate_it = gfirst;
-
-    // Iterate over epochs.
-    for (std::size_t l = 0; l < epochs.size(); ++l) {
-      gates_seq.resize(0);
-      gates_lat.resize(0);
-      gates_lat.resize(max_qubit1, nullptr);
-
-      for (unsigned i = 0; i <= max_qubit1; ++i) {
-        fgates[i].resize(0);
-      }
-
-      uint64_t max_gate_size = 0;
-      GateF* last_mea_gate = nullptr;
-
-      // Iterate over input gates.
-      for (; gate_it < glast; ++gate_it) {
-        const auto& gate = Base::GateToConstRef(*gate_it);
-
-        if (gate.time > epochs[l]) break;
-
-        if (!ValidateGate(gate, max_qubit1, gates_lat)) {
-          fused_gates.resize(0);
-          return fused_gates;
-        }
-
-        // Fill in auxillary data structures.
-
-        if (gate.kind == gate::kMeasurement) {
-          // Measurement gate.
-
-          if (last_mea_gate == nullptr
-              || last_mea_gate->parent->time != gate.time) {
-            gates_seq.push_back({&gate, {}, {}, {}, 0, kMeaCnt});
-            last_mea_gate = &gates_seq.back();
-
-            last_mea_gate->qubits.reserve(max_qubit1);
-            last_mea_gate->links.reserve(max_qubit1);
-
-            ++stat.num_fused_mea_gates;
-          }
-
-          for (auto q : gate.qubits) {
-            last_mea_gate->qubits.push_back(q);
-            last_mea_gate->mask |= uint64_t{1} << q;
-            gates_lat[q] = link_manager.AddBack(last_mea_gate, gates_lat[q]);
-            last_mea_gate->links.push_back(gates_lat[q]);
-          }
-
-          last_mea_gate->gates.push_back(&gate);
-
-          ++stat.num_mea_gates;
-        } else {
-          gates_seq.push_back({&gate, {}, {}, {}, 0, kZero});
-          auto& fgate = gates_seq.back();
-
-          if (gate.controlled_by.size() == 0) {
-            if (max_gate_size < gate.qubits.size()) {
-              max_gate_size = gate.qubits.size();
-            }
-
-            unsigned num_gate_qubits = gate.qubits.size();
-            unsigned size = std::max(max_fused_size, num_gate_qubits);
-
-            fgate.qubits.reserve(size);
-            fgate.links.reserve(size);
-            fgate.gates.reserve(4 * size);
-            fgate.links.reserve(size);
-
-            if (fgates[num_gate_qubits].empty()) {
-              fgates[num_gate_qubits].reserve(num_gates);
-            }
-            fgates[num_gate_qubits].push_back(&fgate);
-
-            ++stat.num_gates[num_gate_qubits];
-          } else {
-            // Controlled gate.
-            // Controlled gates are not fused with other gates.
-
-            uint64_t size = gate.qubits.size() + gate.controlled_by.size();
-
-            fgate.qubits.reserve(gate.qubits.size());
-            fgate.links.reserve(size);
-
-            fgate.visited = kMeaCnt;
-            fgate.gates.push_back(&gate);
-
-            ++stat.num_controlled_gates;
-          }
-
-          for (auto q : gate.qubits) {
-            fgate.qubits.push_back(q);
-            fgate.mask |= uint64_t{1} << q;
-            gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]);
-            fgate.links.push_back(gates_lat[q]);
-          }
-
-          for (auto q : gate.controlled_by) {
-            fgate.mask |= uint64_t{1} << q;
-            gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]);
-            fgate.links.push_back(gates_lat[q]);
-          }
-        }
-      }
-
-      // Fuse large gates with smaller gates.
-      FuseGates(max_gate_size, fgates);
-
-      if (max_fused_size > 2) {
-        FuseGateSequences(
-            max_fused_size, max_qubit1, scratch, gates_seq, stat, fused_gates);
-      } else {
-        unsigned prev_time = 0;
-
-        std::vector<GateF*> orphaned_gates;
-        orphaned_gates.reserve(max_qubit1);
-
-        for (auto& fgate : gates_seq) {
-          if (fgate.gates.size() == 0) continue;
-
-          if (prev_time != fgate.parent->time) {
-            if (orphaned_gates.size() > 0) {
-              FuseOrphanedGates(
-                  max_fused_size, stat, orphaned_gates, fused_gates);
-              orphaned_gates.resize(0);
-            }
-
-            prev_time = fgate.parent->time;
-          }
-
-          if (fgate.qubits.size() == 1 && max_fused_size > 1
-              && fgate.visited != kMeaCnt && !fgate.parent->unfusible) {
-            orphaned_gates.push_back(&fgate);
-            continue;
-          }
-
-          // Assume fgate.qubits (gate.qubits) are sorted.
-          fused_gates.push_back({fgate.parent->kind, fgate.parent->time,
-                                 std::move(fgate.qubits), fgate.parent,
-                                 std::move(fgate.gates), {}});
-
-          if (fgate.visited != kMeaCnt) {
-            ++stat.num_fused_gates;
-          }
-        }
-
-        if (orphaned_gates.size() > 0) {
-          FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
-        }
-      }
-
-      if (fgates[0].size() != 0) {
-        Base::FuseZeroQubitGates(fgates[0],
-                                 [](const GateF* g) { return g->parent; },
-                                 last_fused_gate_index, fused_gates);
-      }
-
-      last_fused_gate_index = fused_gates.size();
-    }
-
-    if (fuse_matrix) {
-      for (auto& fgate : fused_gates) {
-        if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp) {
-          CalculateFusedMatrix(fgate);
-        }
-      }
-    }
-
-    PrintStat(param.verbosity, stat, fused_gates);
-
-    return fused_gates;
-  }
-
- private:
-  // Fuse large gates with smaller gates.
-  static void FuseGates(uint64_t max_gate_size,
-                        std::vector<std::vector<GateF*>>& fgates) {
-    // Traverse gates in order of decreasing size.
-    for (uint64_t i = 0; i < max_gate_size; ++i) {
-      std::size_t pos = 0;
-
-      for (auto fgate : fgates[max_gate_size - i]) {
-        if (fgate->visited > kZero) continue;
-
-        fgates[max_gate_size - i][pos++] = fgate;
-
-        fgate->visited = kFirst;
-
-        FusePrev(0, *fgate);
-        fgate->gates.push_back(fgate->parent);
-        FuseNext(0, *fgate);
-      }
-
-      fgates[max_gate_size - i].resize(pos);
-    }
-  }
-
-  // Try to fuse gate sequences as follows. Gate time goes from bottom to top.
-  // Gates are fused either from left to right or from right to left.
-  //
-  // max_fused_size = 3: _-  or  -_
-  //
-  // max_fused_size = 4: _-_
-  //
-  // max_fused_size = 5: _-_-  or  -_-_
-  //
-  // max_fused_size = 6: _-_-_
-  static void FuseGateSequences(unsigned max_fused_size,
-                                unsigned max_qubit1, Scratch& scratch,
-                                std::vector<GateF>& gates_seq, Stat& stat,
-                                std::vector<GateFused>& fused_gates) {
-    unsigned prev_time = 0;
-
-    std::vector<GateF*> orphaned_gates;
-    orphaned_gates.reserve(max_qubit1);
-
-    for (auto& fgate : gates_seq) {
-      if (prev_time != fgate.parent->time) {
-        if (orphaned_gates.size() > 0) {
-          FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
-          orphaned_gates.resize(0);
-        }
-
-        prev_time = fgate.parent->time;
-      }
-
-      if (fgate.visited == kFinal || fgate.gates.size() == 0) continue;
-
-      if (fgate.visited == kMeaCnt || fgate.qubits.size() >= max_fused_size
-          || fgate.parent->unfusible) {
-        if (fgate.visited != kMeaCnt) {
-          ++stat.num_fused_gates;
-        }
-
-        fgate.visited = kFinal;
-
-        fused_gates.push_back({fgate.parent->kind, fgate.parent->time,
-                               std::move(fgate.qubits), fgate.parent,
-                               std::move(fgate.gates), {}});
-
-        continue;
-      }
-
-
-      if (fgate.qubits.size() == 1 && max_fused_size > 1) {
-        orphaned_gates.push_back(&fgate);
-        continue;
-      }
-
-      scratch.data.resize(0);
-      scratch.gates.resize(0);
-      scratch.count = 0;
-
-      MakeGateSequence(max_fused_size, scratch, fgate);
-
-      if (scratch.gates.size() == 0) {
-        orphaned_gates.push_back(&fgate);
-      } else {
-        for (auto fgate : scratch.gates) {
-          std::sort(fgate->qubits.begin(), fgate->qubits.end());
-
-          fused_gates.push_back({fgate->parent->kind, fgate->parent->time,
-                                 std::move(fgate->qubits), fgate->parent,
-                                 std::move(fgate->gates), {}});
-
-          ++stat.num_fused_gates;
-        }
-      }
-    }
-
-    if (orphaned_gates.size() > 0) {
-      FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
-    }
-  }
-
-  static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat,
-                                std::vector<GateF*>& orphaned_gates,
-                                std::vector<GateFused>& fused_gates) {
-    for (std::size_t i = 0; i < orphaned_gates.size(); ++i) {
-      auto ogate1 = orphaned_gates[i];
-
-      if (ogate1->visited == kFinal) continue;
-
-      ogate1->visited = kFinal;
-
-      for (std::size_t j = i + 1; j < orphaned_gates.size(); ++j) {
-        auto ogate2 = orphaned_gates[j];
-
-        if (ogate2->visited == kFinal) continue;
-
-        unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size();
-
-        if (cur_size <= max_fused_size) {
-          ogate2->visited = kFinal;
-
-          for (auto q : ogate2->qubits) {
-            ogate1->qubits.push_back(q);
-            ogate1->mask |= uint64_t{1} << q;
-          }
-
-          for (auto l : ogate2->links) {
-            ogate1->links.push_back(l);
-          }
-
-          for (auto gate : ogate2->gates) {
-            ogate1->gates.push_back(gate);
-          }
-        }
-
-        if (cur_size == max_fused_size) {
-          break;
-        }
-      }
-
-      FuseNext(1, *ogate1);
-
-      std::sort(ogate1->qubits.begin(), ogate1->qubits.end());
-
-      fused_gates.push_back({ogate1->parent->kind, ogate1->parent->time,
-                             std::move(ogate1->qubits), ogate1->parent,
-                             std::move(ogate1->gates), {}});
-
-      ++stat.num_fused_gates;
-    }
-  }
-
-  static void MakeGateSequence(
-      unsigned max_fused_size, Scratch& scratch, GateF& fgate) {
-    unsigned level = kSecond + scratch.count;
-
-    FindLongestGateSequence(max_fused_size, level, scratch, fgate);
-
-    auto longest_seq = scratch.longest_seq;
-
-    if (longest_seq.size() == 1 && scratch.count == 0) {
-      fgate.visited = kFirst;
-      return;
-    }
-
-    ++scratch.count;
-
-    for (auto p : longest_seq) {
-      p->gate->visited = kCompress;
-
-      for (auto q : p->qubits) {
-        fgate.qubits.push_back(q);
-        fgate.mask |= uint64_t{1} << q;
-      }
-
-      for (auto l : p->links) {
-        fgate.links.push_back(l);
-      }
-    }
-
-    // Compress links.
-    for (auto& link : fgate.links) {
-      while (link->prev != nullptr && link->prev->val->visited == kCompress) {
-        link = link->prev;
-      }
-
-      while (link->next != nullptr && link->next->val->visited == kCompress) {
-        LinkManager::Delete(link->next);
-      }
-    }
-
-    for (auto p : longest_seq) {
-      p->gate->visited = level;
-    }
-
-    if (longest_seq.size() >= 3) {
-      AddGatesFromNext(longest_seq[2]->gate->gates, fgate);
-    }
-
-    if (longest_seq.size() >= 5) {
-      AddGatesFromNext(longest_seq[4]->gate->gates, fgate);
-    }
-
-    if (longest_seq.size() >= 2) {
-      // May call MakeGateSequence recursively.
-      AddGatesFromPrev(max_fused_size, *longest_seq[1]->gate, scratch, fgate);
-    }
-
-    if (longest_seq.size() >= 4) {
-      // May call MakeGateSequence recursively.
-      AddGatesFromPrev(max_fused_size, *longest_seq[3]->gate, scratch, fgate);
-    }
-
-    for (auto p : longest_seq) {
-      p->gate->visited = kFinal;
-    }
-
-    FuseNext(1, fgate);
-
-    scratch.gates.push_back(&fgate);
-  }
-
-  static void AddGatesFromNext(std::vector<const RGate*>& gates, GateF& fgate) {
-    for (auto gate : gates) {
-      fgate.gates.push_back(gate);
-    }
-  }
-
-  static void AddGatesFromPrev(unsigned max_fused_size, const GateF& pfgate,
-                               Scratch& scratch, GateF& fgate) {
-    for (auto gate : pfgate.gates) {
-        fgate.gates.push_back(gate);
-    }
-
-    for (auto link : pfgate.links) {
-      if (link->prev == nullptr) continue;
-
-      auto pgate = link->prev->val;
-
-      if (pgate->visited == kFirst) {
-        MakeGateSequence(max_fused_size, scratch, *pgate);
-      }
-    }
-  }
-
-  static void FindLongestGateSequence(
-      unsigned max_fused_size, unsigned level, Scratch& scratch, GateF& fgate) {
-    scratch.data.push_back({&fgate, {}, {}});
-
-    scratch.longest_seq.resize(0);
-    scratch.longest_seq.push_back(&scratch.data.back());
-
-    scratch.stack.resize(0);
-    scratch.stack.push_back(&scratch.data.back());
-
-    unsigned cur_size = fgate.qubits.size();
-    fgate.visited = level;
-
-    unsigned max_size = cur_size;
-
-    GetNextAvailableGates(max_fused_size, cur_size, fgate, nullptr,
-                          scratch.data, scratch.next1);
-
-    for (auto n1 : scratch.next1) {
-      unsigned cur_size2 = cur_size + n1->qubits.size();
-      if (cur_size2 > max_fused_size) continue;
-
-      bool feasible = GetPrevAvailableGates(max_fused_size, cur_size,
-                                            level, *n1->gate, nullptr,
-                                            scratch.data, scratch.prev1);
-
-      if (!feasible) continue;
-
-      if (scratch.prev1.size() == 0 && max_fused_size > 3) continue;
-
-      if (cur_size2 == max_fused_size) {
-        std::swap(scratch.longest_seq, scratch.stack);
-        scratch.longest_seq.push_back(n1);
-        return;
-      }
-
-      Push(level, cur_size2, cur_size, max_size, scratch, n1);
-
-      for (auto p1 : scratch.prev1) {
-        unsigned cur_size2 = cur_size + p1->qubits.size();
-
-        if (cur_size2 > max_fused_size) {
-          continue;
-        } else if (cur_size2 == max_fused_size) {
-          std::swap(scratch.longest_seq, scratch.stack);
-          scratch.longest_seq.push_back(p1);
-          return;
-        }
-
-        Push(level, cur_size2, cur_size, max_size, scratch, p1);
-
-        GetNextAvailableGates(max_fused_size, cur_size, *p1->gate, &fgate,
-                              scratch.data, scratch.next2);
-
-        for (auto n2 : scratch.next2) {
-          unsigned cur_size2 = cur_size + n2->qubits.size();
-          if (cur_size2 > max_fused_size) continue;
-
-          bool feasible = GetPrevAvailableGates(max_fused_size, cur_size,
-                                                level, *n2->gate, n1->gate,
-                                                scratch.data, scratch.prev2);
-
-          if (!feasible) continue;
-
-          if (cur_size2 == max_fused_size) {
-            std::swap(scratch.longest_seq, scratch.stack);
-            scratch.longest_seq.push_back(n2);
-            return;
-          }
-
-          Push(level, cur_size2, cur_size, max_size, scratch, n2);
-
-          for (auto p2 : scratch.prev2) {
-            unsigned cur_size2 = cur_size + p2->qubits.size();
-
-            if (cur_size2 > max_fused_size) {
-              continue;
-            } else if (cur_size2 == max_fused_size) {
-              std::swap(scratch.longest_seq, scratch.stack);
-              scratch.longest_seq.push_back(p2);
-              return;
-            }
-
-            if (cur_size2 > max_size) {
-              scratch.stack.push_back(p2);
-              scratch.longest_seq = scratch.stack;
-              scratch.stack.pop_back();
-              max_size = cur_size2;
-            }
-          }
-
-          Pop(cur_size, scratch, n2);
-        }
-
-        Pop(cur_size, scratch, p1);
-      }
-
-      Pop(cur_size, scratch, n1);
-    }
-  }
-
-  static void Push(unsigned level, unsigned cur_size2, unsigned& cur_size,
-                   unsigned& max_size, Scratch& scratch, GateA* agate) {
-    agate->gate->visited = level;
-    cur_size = cur_size2;
-    scratch.stack.push_back(agate);
-
-    if (cur_size > max_size) {
-      scratch.longest_seq = scratch.stack;
-      max_size = cur_size;
-    }
-  }
-
-  static void Pop(unsigned& cur_size, Scratch& scratch, GateA* agate) {
-    agate->gate->visited = kFirst;
-    cur_size -= agate->qubits.size();
-    scratch.stack.pop_back();
-  }
-
-  static void GetNextAvailableGates(unsigned max_fused_size, unsigned cur_size,
-                                    const GateF& pgate1, const GateF* pgate2,
-                                    std::vector<GateA>& scratch,
-                                    std::vector<GateA*>& next_gates) {
-    next_gates.resize(0);
-
-    for (auto link : pgate1.links) {
-      if (link->next == nullptr) continue;
-
-      auto ngate = link->next->val;
-
-      if (ngate->visited > kFirst || ngate->parent->unfusible) continue;
-
-      GateA next = {ngate, {}, {}};
-      next.qubits.reserve(8);
-      next.links.reserve(8);
-
-      GetAddedQubits(pgate1, pgate2, *ngate, next);
-
-      if (cur_size + next.qubits.size() > max_fused_size) continue;
-
-      scratch.push_back(std::move(next));
-      next_gates.push_back(&scratch.back());
-    }
-  }
-
-  static bool GetPrevAvailableGates(unsigned max_fused_size,
-                                    unsigned cur_size, unsigned level,
-                                    const GateF& ngate1, const GateF* ngate2,
-                                    std::vector<GateA>& scratch,
-                                    std::vector<GateA*>& prev_gates) {
-    prev_gates.resize(0);
-
-    for (auto link : ngate1.links) {
-      if (link->prev == nullptr) continue;
-
-      auto pgate = link->prev->val;
-
-      if (pgate->visited == kFinal || pgate->visited == level) continue;
-
-      if (pgate->visited > kFirst || pgate->parent->unfusible) {
-        prev_gates.resize(0);
-        return false;
-      }
-
-      GateA prev = {pgate, {}, {}};
-      prev.qubits.reserve(8);
-      prev.links.reserve(8);
-
-      GetAddedQubits(ngate1, ngate2, *pgate, prev);
-
-      bool all_prev_visited = true;
-
-      for (auto link : pgate->links) {
-        if (link->prev == nullptr) continue;
-
-        if (link->prev->val->visited <= kMeaCnt) {
-          all_prev_visited = false;
-          break;
-        }
-      }
-
-      if (!all_prev_visited) {
-        prev_gates.resize(0);
-        return false;
-      }
-
-      if (cur_size + prev.qubits.size() > max_fused_size) continue;
-
-      if (all_prev_visited) {
-        scratch.push_back(std::move(prev));
-        prev_gates.push_back(&scratch.back());
-      }
-    }
-
-    return true;
-  }
-
-  static void GetAddedQubits(const GateF& fgate0, const GateF* fgate1,
-                             const GateF& fgate2, GateA& added) {
-    for (std::size_t i = 0; i < fgate2.qubits.size(); ++i) {
-      unsigned q2 = fgate2.qubits[i];
-
-      if (std::find(fgate0.qubits.begin(), fgate0.qubits.end(), q2)
-          != fgate0.qubits.end()) continue;
-
-      if (fgate1 != nullptr
-          && std::find(fgate1->qubits.begin(), fgate1->qubits.end(), q2)
-            != fgate1->qubits.end()) continue;
-
-      added.qubits.push_back(q2);
-      added.links.push_back(fgate2.links[i]);
-    }
-  }
-
-  // Fuse smaller gates with fgate back in gate time.
-  static void FusePrev(unsigned pass, GateF& fgate) {
-    std::vector<const RGate*> gates;
-    gates.reserve(fgate.gates.capacity());
-
-    auto neighbor = [](const Link* link) -> const Link* {
-      return link->prev;
-    };
-
-    FusePrevOrNext<std::greater<unsigned>>(pass, neighbor, fgate, gates);
-
-    for (auto it = gates.rbegin(); it != gates.rend(); ++it) {
-      fgate.gates.push_back(*it);
-    }
-  }
-
-  // Fuse smaller gates with fgate forward in gate time.
-  static void FuseNext(unsigned pass, GateF& fgate) {
-    auto neighbor = [](const Link* link) -> const Link* {
-      return link->next;
-    };
-
-    FusePrevOrNext<std::less<unsigned>>(pass, neighbor, fgate, fgate.gates);
-  }
-
-  template <typename R, typename Neighbor>
-  static void FusePrevOrNext(unsigned pass, Neighbor neighb, GateF& fgate,
-                             std::vector<const RGate*>& gates) {
-    uint64_t bad_mask = 0;
-    auto links = fgate.links;
-
-    bool may_have_gates_to_fuse = true;
-
-    while (may_have_gates_to_fuse) {
-      may_have_gates_to_fuse = false;
-
-      std::sort(links.begin(), links.end(),
-                [&neighb](const Link* l, const Link* r) -> bool {
-                  auto ln = neighb(l);
-                  auto rn = neighb(r);
-
-                  if (ln != nullptr && rn != nullptr) {
-                    return R()(ln->val->parent->time, rn->val->parent->time);
-                  } else {
-                    // nullptrs are larger than everything else and
-                    // equivalent among each other.
-                    return ln != nullptr;
-                  }
-                });
-
-      for (auto link : links) {
-        auto n = neighb(link);
-
-        if (n == nullptr) continue;
-
-        auto g = n->val;
-
-        if (!QubitsAreIn(fgate.mask, g->mask) || (g->mask & bad_mask) != 0
-            || g->visited > pass || g->parent->unfusible) {
-          bad_mask |= g->mask;
-        } else {
-          g->visited = pass == 0 ? kFirst : kFinal;
-
-          if (pass == 0) {
-            gates.push_back(g->parent);
-          } else {
-            for (auto gate : g->gates) {
-              gates.push_back(gate);
-            }
-          }
-
-          for (auto link : g->links) {
-            LinkManager::Delete(link);
-          }
-
-          may_have_gates_to_fuse = true;
-          break;
-        }
-      }
-    }
-  }
-
-  static bool QubitsAreIn(uint64_t mask0, uint64_t mask) {
-    return ((mask0 | mask) ^ mask0) == 0;
-  }
-
-  static void PrintStat(unsigned verbosity, const Stat& stat,
-                        const std::vector<GateFused>& fused_gates) {
-    if (verbosity < 3) return;
-
-    if (stat.num_controlled_gates > 0) {
-      IO::messagef("%lu controlled gates\n", stat.num_controlled_gates);
-    }
-
-    if (stat.num_mea_gates > 0) {
-      IO::messagef("%lu measurement gates", stat.num_mea_gates);
-      if (stat.num_fused_mea_gates == stat.num_mea_gates) {
-        IO::messagef("\n");
-      } else {
-        IO::messagef(" are fused into %lu gates\n", stat.num_fused_mea_gates);
-      }
-    }
-
-    bool first = true;
-    for (unsigned i = 1; i < stat.num_gates.size(); ++i) {
-      if (stat.num_gates[i] > 0) {
-        if (first) {
-          first = false;
-        } else {
-          IO::messagef(", ");
-        }
-        IO::messagef("%u %u-qubit", stat.num_gates[i], i);
-      }
-    }
-
-    IO::messagef(" gates are fused into %lu gates\n", stat.num_fused_gates);
-
-    if (verbosity < 5) return;
-
-    IO::messagef("fused gate qubits:\n");
-    for (const auto& g : fused_gates) {
-      IO::messagef("%6u  ", g.parent->time);
-      if (g.parent->kind == gate::kMeasurement) {
-        IO::messagef("m");
-      } else if (g.parent->controlled_by.size() > 0) {
-        IO::messagef("c");
-        for (auto q : g.parent->controlled_by) {
-          IO::messagef("%3u", q);
-        }
-        IO::messagef("  t");
-      } else {
-        IO::messagef(" ");
-      }
-
-      for (auto q : g.qubits) {
-        IO::messagef("%3u", q);
-      }
-      IO::messagef("\n");
-    }
-  }
-
-  template <typename Gate2, typename GatesLat>
-  static bool ValidateGate(const Gate2& gate, unsigned max_qubit1,
-                           const GatesLat& gates_lat) {
-    for (unsigned q : gate.qubits) {
-      if (q >= max_qubit1) {
-        IO::errorf("fuser: gate qubit %u is out of range "
-                   "(should be smaller than %u).\n", q, max_qubit1);
-        return false;
-      }
-      if (gates_lat[q] != nullptr
-          && gate.time <= gates_lat[q]->val->parent->time) {
-        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
-        return false;
-      }
-    }
-
-    for (unsigned q : gate.controlled_by) {
-      if (q >= max_qubit1) {
-        IO::errorf("fuser: gate qubit %u is out of range "
-                   "(should be smaller than %u).\n", q, max_qubit1);
-        return false;
-      }
-      if (gates_lat[q] != nullptr
-          && gate.time <= gates_lat[q]->val->parent->time) {
-        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
-        return false;
-      }
-    }
-
-    return true;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // FUSER_MQUBIT_H_
diff --git a/qsim/gate.h b/qsim/gate.h
deleted file mode 100644
index a457acb..0000000
--- a/qsim/gate.h
+++ /dev/null
@@ -1,216 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GATE_H_
-#define GATE_H_
-
-#include <algorithm>
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-#include "matrix.h"
-
-namespace qsim {
-
-namespace detail {
-
-template <typename Gate, typename GateDef>
-inline void SortQubits(Gate& gate) {
-  for (std::size_t i = 1; i < gate.qubits.size(); ++i) {
-    if (gate.qubits[i - 1] > gate.qubits[i]) {
-      if (!GateDef::symmetric) {
-        auto perm = NormalToGateOrderPermutation(gate.qubits);
-        MatrixShuffle(perm, gate.qubits.size(), gate.matrix);
-      }
-
-      gate.swapped = true;
-      std::sort(gate.qubits.begin(), gate.qubits.end());
-      break;
-    }
-  }
-}
-
-}  // namespace detail
-
-template <typename Qubits = std::vector<unsigned>, typename Gate>
-inline Gate& MakeControlledGate(Qubits&& controlled_by, Gate& gate) {
-  gate.controlled_by = std::forward<Qubits>(controlled_by);
-  gate.cmask = (uint64_t{1} << gate.controlled_by.size()) - 1;
-
-  std::sort(gate.controlled_by.begin(), gate.controlled_by.end());
-
-  return gate;
-}
-
-template <typename Qubits = std::vector<unsigned>, typename Gate>
-inline Gate& MakeControlledGate(Qubits&& controlled_by,
-                               const std::vector<unsigned>& control_values,
-                               Gate& gate) {
-  // Assume controlled_by.size() == control_values.size().
-
-  bool sorted = true;
-
-  for (std::size_t i = 1; i < controlled_by.size(); ++i) {
-    if (controlled_by[i - 1] > controlled_by[i]) {
-      sorted = false;
-      break;
-    }
-  }
-
-  if (sorted) {
-    gate.controlled_by = std::forward<Qubits>(controlled_by);
-    gate.cmask = 0;
-
-    for (std::size_t i = 0; i < control_values.size(); ++i) {
-      gate.cmask |= (control_values[i] & 1) << i;
-    }
-  } else {
-    struct ControlPair {
-      unsigned q;
-      unsigned v;
-    };
-
-    std::vector<ControlPair> cpairs;
-    cpairs.reserve(controlled_by.size());
-
-    for (std::size_t i = 0; i < controlled_by.size(); ++i) {
-      cpairs.push_back({controlled_by[i], control_values[i]});
-    }
-
-    // Sort control qubits and control values.
-    std::sort(cpairs.begin(), cpairs.end(),
-              [](const ControlPair& l, const ControlPair& r) -> bool {
-                return l.q < r.q;
-              });
-
-    gate.cmask = 0;
-    gate.controlled_by.reserve(controlled_by.size());
-
-    for (std::size_t i = 0; i < cpairs.size(); ++i) {
-      gate.cmask |= (cpairs[i].v & 1) << i;
-      gate.controlled_by.push_back(cpairs[i].q);
-    }
-  }
-
-  return gate;
-}
-
-namespace gate {
-
-constexpr int kDecomp = 100001;       // gate from Schmidt decomposition
-constexpr int kMeasurement = 100002;  // measurement gate
-
-}  // namespace gate
-
-enum GateAnyKind {
-  kGateAny = -1,
-};
-
-/**
- * A generic gate to make it easier to use qsim with external gate sets.
- */
-template <typename FP, typename GK = GateAnyKind>
-struct Gate {
-  using fp_type = FP;
-  using GateKind = GK;
-
-  GateKind kind;
-  unsigned time;
-  std::vector<unsigned> qubits;
-  std::vector<unsigned> controlled_by;
-  uint64_t cmask;
-  std::vector<fp_type> params;
-  Matrix<fp_type> matrix;
-  bool unfusible;      // If true, the gate is fused as a parent.
-  bool swapped;        // If true, the gate qubits are swapped to make qubits
-                       // ordered in ascending order. This does not apply to
-                       // control qubits of explicitly-controlled gates.
-
-  template <typename Qubits = std::vector<unsigned>>
-  Gate&& ControlledBy(Qubits&& controlled_by) {
-    MakeControlledGate(std::forward<Qubits>(controlled_by), *this);
-    return std::move(*this);
-  }
-
-  template <typename Qubits = std::vector<unsigned>>
-  Gate&& ControlledBy(Qubits&& controlled_by,
-                      const std::vector<unsigned>& control_values) {
-    MakeControlledGate(
-        std::forward<Qubits>(controlled_by), control_values, *this);
-    return std::move(*this);
-  }
-};
-
-template <typename Gate, typename GateDef,
-          typename Qubits = std::vector<unsigned>,
-          typename M = Matrix<typename Gate::fp_type>>
-inline Gate CreateGate(unsigned time, Qubits&& qubits, M&& matrix = {},
-                       std::vector<typename Gate::fp_type>&& params = {}) {
-  Gate gate = {GateDef::kind, time, std::forward<Qubits>(qubits), {}, 0,
-               std::move(params), std::forward<M>(matrix), false, false};
-
-  if (GateDef::kind != gate::kMeasurement) {
-    switch (gate.qubits.size()) {
-    case 1:
-      break;
-    case 2:
-      if (gate.qubits[0] > gate.qubits[1]) {
-        gate.swapped = true;
-        std::swap(gate.qubits[0], gate.qubits[1]);
-        if (!GateDef::symmetric) {
-          MatrixShuffle({1, 0}, 2, gate.matrix);
-        }
-      }
-      break;
-    default:
-      detail::SortQubits<Gate, GateDef>(gate);
-    }
-  }
-
-  return gate;
-}
-
-namespace gate {
-
-/**
- * A gate that simulates measurement of one or more qubits, collapsing the
- * state vector and storing the measured results.
- */
-template <typename Gate>
-struct Measurement {
-  using GateKind = typename Gate::GateKind;
-
-  static constexpr GateKind kind = GateKind::kMeasurement;
-  static constexpr char name[] = "m";
-  static constexpr bool symmetric = false;
-
-  template <typename Qubits = std::vector<unsigned>>
-  static Gate Create(unsigned time, Qubits&& qubits) {
-    return CreateGate<Gate, Measurement>(time, std::forward<Qubits>(qubits));
-  }
-};
-
-}  // namespace gate
-
-template <typename fp_type>
-using schmidt_decomp_type = std::vector<std::vector<std::vector<fp_type>>>;
-
-template <typename fp_type, typename GateKind>
-schmidt_decomp_type<fp_type> GetSchmidtDecomp(
-    GateKind kind, const std::vector<fp_type>& params);
-
-}  // namespace qsim
-
-#endif  // GATE_H_
diff --git a/qsim/gate_appl.h b/qsim/gate_appl.h
deleted file mode 100644
index 8601e6f..0000000
--- a/qsim/gate_appl.h
+++ /dev/null
@@ -1,231 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GATE_APPL_H_
-#define GATE_APPL_H_
-
-#include <utility>
-#include <vector>
-
-#include "fuser.h"
-#include "gate.h"
-#include "matrix.h"
-
-namespace qsim {
-
-/**
- * Applies the given gate to the simulator state. Ignores measurement gates.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param state The state of the system, to be updated by this method.
- */
-template <typename Simulator, typename Gate>
-inline void ApplyGate(const Simulator& simulator, const Gate& gate,
-                      typename Simulator::State& state) {
-  if (gate.kind != gate::kMeasurement) {
-    if (gate.controlled_by.size() == 0) {
-      simulator.ApplyGate(gate.qubits, gate.matrix.data(), state);
-    } else {
-      simulator.ApplyControlledGate(gate.qubits, gate.controlled_by,
-                                    gate.cmask, gate.matrix.data(), state);
-    }
-  }
-}
-
-/**
- * Applies the given gate dagger to the simulator state. If the gate matrix is
- *   unitary then this is equivalent to applying the inverse gate. Ignores
- *   measurement gates.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param state The state of the system, to be updated by this method.
- */
-template <typename Simulator, typename Gate>
-inline void ApplyGateDagger(const Simulator& simulator, const Gate& gate,
-                            typename Simulator::State& state) {
-  if (gate.kind != gate::kMeasurement) {
-    auto matrix = gate.matrix;
-    MatrixDagger(unsigned{1} << gate.qubits.size(), matrix);
-
-    if (gate.controlled_by.size() == 0) {
-      simulator.ApplyGate(gate.qubits, matrix.data(), state);
-    } else {
-      simulator.ApplyControlledGate(gate.qubits, gate.controlled_by,
-                                    gate.cmask, matrix.data(), state);
-    }
-  }
-}
-
-/**
- * Applies the given gate to the simulator state.
- * @param state_space StateSpace object required to perform measurements.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param rgen Random number generator to perform measurements.
- * @param state The state of the system, to be updated by this method.
- * @param mresults As an input parameter, this can be empty or this can
- *   contain the results of the previous measurements. If gate is a measurement
- *   gate then after a successful run, the measurement result will be added to
- *   this.
- * @return True if the measurement performed successfully; false otherwise.
- */
-template <typename Simulator, typename Gate, typename Rgen>
-inline bool ApplyGate(
-    const typename Simulator::StateSpace& state_space,
-    const Simulator& simulator, const Gate& gate, Rgen& rgen,
-    typename Simulator::State& state,
-    std::vector<typename Simulator::StateSpace::MeasurementResult>& mresults) {
-  if (gate.kind == gate::kMeasurement) {
-    auto measure_result = state_space.Measure(gate.qubits, rgen, state);
-    if (measure_result.valid) {
-      mresults.push_back(std::move(measure_result));
-    } else {
-      return false;
-    }
-  } else {
-    ApplyGate(simulator, gate, state);
-  }
-
-  return true;
-}
-
-/**
- * Applies the given gate to the simulator state, discarding measurement
- *   results.
- * @param state_space StateSpace object required to perform measurements.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param rgen Random number generator to perform measurements.
- * @param state The state of the system, to be updated by this method.
- * @return True if the measurement performed successfully; false otherwise.
- */
-template <typename Simulator, typename Gate, typename Rgen>
-inline bool ApplyGate(const typename Simulator::StateSpace& state_space,
-                      const Simulator& simulator, const Gate& gate, Rgen& rgen,
-                      typename Simulator::State& state) {
-  using MeasurementResult = typename Simulator::StateSpace::MeasurementResult;
-  std::vector<MeasurementResult> discarded_results;
-  return
-      ApplyGate(state_space, simulator, gate, rgen, state, discarded_results);
-}
-
-/**
- * Applies the given fused gate to the simulator state. Ignores measurement
- *   gates.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param state The state of the system, to be updated by this method.
- */
-template <typename Simulator, typename Gate>
-inline void ApplyFusedGate(const Simulator& simulator, const Gate& gate,
-                           typename Simulator::State& state) {
-  if (gate.kind != gate::kMeasurement) {
-    if (gate.parent->controlled_by.size() == 0) {
-      simulator.ApplyGate(gate.qubits, gate.matrix.data(), state);
-    } else {
-      simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by,
-                                    gate.parent->cmask, gate.matrix.data(),
-                                    state);
-    }
-  }
-}
-
-/**
- * Applies the given fused gate dagger to the simulator state. If the gate
- *   matrix is unitary then this is equivalent to applying the inverse gate.
- *   Ignores measurement gates.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param state The state of the system, to be updated by this method.
- */
-template <typename Simulator, typename Gate>
-inline void ApplyFusedGateDagger(const Simulator& simulator, const Gate& gate,
-                                 typename Simulator::State& state) {
-  if (gate.kind != gate::kMeasurement) {
-    auto matrix = gate.matrix;
-    MatrixDagger(unsigned{1} << gate.qubits.size(), matrix);
-
-    if (gate.parent->controlled_by.size() == 0) {
-      simulator.ApplyGate(gate.qubits, matrix.data(), state);
-    } else {
-      simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by,
-                                    gate.parent->cmask, matrix.data(), state);
-    }
-  }
-}
-
-/**
- * Applies the given fused gate to the simulator state.
- * @param state_space StateSpace object required to perform measurements.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param rgen Random number generator to perform measurements.
- * @param state The state of the system, to be updated by this method.
- * @param mresults As an input parameter, this can be empty or this can
- *   contain the results of the previous measurements. If gate is a measurement
- *   gate then after a successful run, the measurement result will be added to
- *   this.
- * @return True if the measurement performed successfully; false otherwise.
- */
-template <typename Simulator, typename Gate, typename Rgen>
-inline bool ApplyFusedGate(
-    const typename Simulator::StateSpace& state_space,
-    const Simulator& simulator, const Gate& gate, Rgen& rgen,
-    typename Simulator::State& state,
-    std::vector<typename Simulator::StateSpace::MeasurementResult>& mresults) {
-  if (gate.kind == gate::kMeasurement) {
-    auto measure_result = state_space.Measure(gate.qubits, rgen, state);
-    if (measure_result.valid) {
-      mresults.push_back(std::move(measure_result));
-    } else {
-      return false;
-    }
-  } else {
-    ApplyFusedGate(simulator, gate, state);
-  }
-
-  return true;
-}
-
-/**
- * Applies the given fused gate to the simulator state, discarding measurement
- *   results.
- * @param state_space StateSpace object required to perform measurements.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param rgen Random number generator to perform measurements.
- * @param state The state of the system, to be updated by this method.
- * @return True if the measurement performed successfully; false otherwise.
- */
-template <typename Simulator, typename Gate, typename Rgen>
-inline bool ApplyFusedGate(const typename Simulator::StateSpace& state_space,
-                           const Simulator& simulator, const Gate& gate,
-                           Rgen& rgen, typename Simulator::State& state) {
-  using MeasurementResult = typename Simulator::StateSpace::MeasurementResult;
-  std::vector<MeasurementResult> discarded_results;
-  return ApplyFusedGate(
-      state_space, simulator, gate, rgen, state, discarded_results);
-}
-
-}  // namespace qsim
-
-#endif  // GATE_APPL_H_
diff --git a/qsim/gates_cirq.h b/qsim/gates_cirq.h
deleted file mode 100644
index d767959..0000000
--- a/qsim/gates_cirq.h
+++ /dev/null
@@ -1,1640 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GATES_CIRQ_H_
-#define GATES_CIRQ_H_
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <vector>
-
-#include "gate.h"
-#include "matrix.h"
-
-namespace qsim {
-
-namespace Cirq {
-
-enum GateKind {
-  kI1 = 0,     // One-qubit identity gate.
-  kI2,         // Two-qubit identity gate.
-  kI,          // Multi-qubit identity gate.
-  kXPowGate,
-  kYPowGate,
-  kZPowGate,
-  kHPowGate,
-  kCZPowGate,
-  kCXPowGate,
-  krx,
-  kry,
-  krz,
-  kH,
-  kS,
-  kCZ,
-  kCX,
-  kT,
-  kX,
-  kY,
-  kZ,
-  kPhasedXPowGate,
-  kPhasedXZGate,
-  kXXPowGate,
-  kYYPowGate,
-  kZZPowGate,
-  kXX,
-  kYY,
-  kZZ,
-  kSwapPowGate,
-  kISwapPowGate,
-  kriswap,
-  kSWAP,
-  kISWAP,
-  kPhasedISwapPowGate,
-  kgivens,
-  kFSimGate,
-  kTwoQubitDiagonalGate,
-  kThreeQubitDiagonalGate,
-  kCCZPowGate,
-  kCCXPowGate,
-  kCSwapGate,
-  kCCZ,
-  kCCX,
-  kMatrixGate1,  // One-qubit matrix gate.
-  kMatrixGate2,  // Two-qubit matrix gate.
-  kMatrixGate,   // Multi-qubit matrix gate.
-  kGlobalPhaseGate,
-  kDecomp = gate::kDecomp,
-  kMeasurement = gate::kMeasurement,
-};
-
-template <typename fp_type>
-using GateCirq = Gate<fp_type, GateKind>;
-
-constexpr double h_double = 0.5;
-constexpr double pi_double = 3.14159265358979323846264338327950288;
-constexpr double is2_double = 0.7071067811865475;
-
-// Gates from cirq/ops/global_phase_op.py:
-
-/**
- * The global phase gate.
- */
-template <typename fp_type>
-struct GlobalPhaseGate {
-  static constexpr GateKind kind = kGlobalPhaseGate;
-  static constexpr char name[] = "GlobalPhaseGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, fp_type phi) {
-    return Create(time, std::cos(phi), std::sin(phi));
-  }
-
-  static GateCirq<fp_type> Create(unsigned time, fp_type cp, fp_type sp) {
-    return CreateGate<GateCirq<fp_type>, GlobalPhaseGate>(
-        time, {}, {cp, sp}, {cp, sp});
-  }
-};
-
-template <typename fp_type>
-using global_phase_operation = GlobalPhaseGate<fp_type>;
-
-// Gates from cirq/ops/identity.py:
-
-/**
- * A one-qubit identity gate.
- */
-template <typename fp_type>
-struct I1 {
-  static constexpr GateKind kind = kI1;
-  static constexpr char name[] = "I1";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, I1>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0});
-  }
-};
-
-/**
- * A two-qubit identity gate.
- */
-template <typename fp_type>
-struct I2 {
-  static constexpr GateKind kind = kI2;
-  static constexpr char name[] = "I2";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, I2>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-    };
-  }
-};
-
-/**
- * A multi-qubit identity gate.
- */
-template <typename fp_type>
-struct I {
-  static constexpr GateKind kind = kI;
-  static constexpr char name[] = "I";
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  const std::vector<unsigned>& qubits) {
-    Matrix<fp_type> matrix;
-    MatrixIdentity(1 << qubits.size(), matrix);
-    return CreateGate<GateCirq<fp_type>, I>(time, qubits, std::move(matrix));
-  }
-};
-
-// Gates form cirq/ops/common_gates.py:
-
-/**
- * A gate that rotates around the X axis of the Bloch sphere.
- * This is a generalization of the X gate.
- */
-template <typename fp_type>
-struct XPowGate {
-  static constexpr GateKind kind = kXPowGate;
-  static constexpr char name[] = "XPowGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, XPowGate>(
-        time, {q0}, {c * gc, c * gs, s * gs, -s * gc,
-                     s * gs, -s * gc, c * gc, c * gs},
-        {exponent, global_shift});
-  }
-};
-
-/**
- * A gate that rotates around the Y axis of the Bloch sphere.
- * This is a generalization of the Y gate.
- */
-template <typename fp_type>
-struct YPowGate {
-  static constexpr GateKind kind = kYPowGate;
-  static constexpr char name[] = "YPowGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, YPowGate>(
-        time, {q0}, {c * gc, c * gs, -s * gc, -s * gs,
-                     s * gc, s * gs, c * gc, c * gs}, {exponent, global_shift});
-  }
-};
-
-/**
- * A gate that rotates around the Z axis of the Bloch sphere.
- * This is a generalization of the Z gate.
- */
-template <typename fp_type>
-struct ZPowGate {
-  static constexpr GateKind kind = kZPowGate;
-  static constexpr char name[] = "ZPowGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-
-    return CreateGate<GateCirq<fp_type>, ZPowGate>(
-        time, {q0}, {gc, gs, 0, 0, 0, 0, c * gc - s * gs, c * gs + s * gc},
-        {exponent, global_shift});
-  }
-};
-
-/**
- * A gate that rotates around the X+Z axis of the Bloch sphere.
- * This is a generalization of the Hadamard gate.
- */
-template <typename fp_type>
-struct HPowGate {
-  static constexpr GateKind kind = kHPowGate;
-  static constexpr char name[] = "HPowGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));
-
-    fp_type a = s * gs * is2;
-    fp_type b = s * gc * is2;
-
-    return CreateGate<GateCirq<fp_type>, HPowGate>(
-        time, {q0}, {c * gc + a, c * gs - b, a, -b,
-                     a, -b, c * gc - a, c * gs + b}, {exponent, global_shift});
-  }
-};
-
-/**
- * A gate that applies a phase to the |11⟩ state of two qubits.
- * This is a generalization of the CZ gate.
- */
-template <typename fp_type>
-struct CZPowGate {
-  static constexpr GateKind kind = kCZPowGate;
-  static constexpr char name[] = "CZPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (1 + global_shift));
-    fp_type es = std::sin(pi * exponent * (1 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, CZPowGate>(
-        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
-                         0, 0, gc, gs, 0, 0, 0, 0,
-                         0, 0, 0, 0, gc, gs, 0, 0,
-                         0, 0, 0, 0, 0, 0, ec, es}, {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (1 + global_shift));
-    fp_type es = std::sin(pi * exponent * (1 + global_shift));
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {gc, gs, 0, 0, 0, 0, ec, es}},
-    };
-  }
-};
-
-/**
- * A gate that applies a controlled power of an X gate.
- * This is a generalization of the CX (or CNOT) gate.
- */
-template <typename fp_type>
-struct CXPowGate {
-  static constexpr GateKind kind = kCXPowGate;
-  static constexpr char name[] = "CXPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, CXPowGate>(
-        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
-                         0, 0, c * ec, c * es, 0, 0, s * es, -s * ec,
-                         0, 0, 0, 0, gc, gs, 0, 0,
-                         0, 0, s * es, -s * ec, 0, 0, c * ec, c * es},
-        {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {c * ec, c * es, s * es, -s * ec,
-                                  s * es, -s * ec, c * ec, c * es}},
-    };
-  }
-};
-
-/**
- * The `(exponent = phi/pi, global_shift = -0.5)` instance of XPowGate.
- * This is a generalization of the X gate with a fixed global phase.
- * This is a function in Cirq.
- */
-template <typename fp_type>
-struct rx {
-  static constexpr GateKind kind = krx;
-  static constexpr char name[] = "rx";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type c = std::cos(-0.5 * phi);
-    fp_type s = std::sin(-0.5 * phi);
-
-    return CreateGate<GateCirq<fp_type>, rx>(
-        time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi});
-  }
-};
-
-/**
- * The `(exponent = phi/pi, global_shift = -0.5)` instance of YPowGate.
- * This is a generalization of the Y gate with a fixed global phase.
- * This is a function in Cirq.
- */
-template <typename fp_type>
-struct ry {
-  static constexpr GateKind kind = kry;
-  static constexpr char name[] = "ry";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type c = std::cos(-0.5 * phi);
-    fp_type s = std::sin(-0.5 * phi);
-
-    return CreateGate<GateCirq<fp_type>, ry>(
-        time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi});
-  }
-};
-
-/**
- * The `(exponent = phi/pi, global_shift = -0.5)` instance of ZPowGate.
- * This is a generalization of the Z gate with a fixed global phase.
- * This is a function in Cirq.
- */
-template <typename fp_type>
-struct rz {
-  static constexpr GateKind kind = krz;
-  static constexpr char name[] = "rz";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type c = std::cos(-0.5 * phi);
-    fp_type s = std::sin(-0.5 * phi);
-
-    return CreateGate<GateCirq<fp_type>, rz>(
-        time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of HPowGate.
- * This is the canonical Hadamard (or H) gate.
- */
-template <typename fp_type>
-struct H {
-  static constexpr GateKind kind = kH;
-  static constexpr char name[] = "H";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, H>(
-        time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0});
-  }
-};
-
-/**
- * The `(exponent = 0.5, global_shift = 0)` instance of ZPowGate.
- * This is the canonical S gate.
- */
-template <typename fp_type>
-struct S {
-  static constexpr GateKind kind = kS;
-  static constexpr char name[] = "S";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, S>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1});
-  }
-};
-
-/**
- * The `(exponent = 0.25, global_shift = 0)` instance of ZPowGate.
- * This is the canonical T gate.
- */
-template <typename fp_type>
-struct T {
-  static constexpr GateKind kind = kT;
-  static constexpr char name[] = "T";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, T>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of CZPowGate.
- * This is the canonical CZ gate.
- */
-template <typename fp_type>
-struct CZ {
-  static constexpr GateKind kind = kCZ;
-  static constexpr char name[] = "CZ";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, CZ>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, -1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}},
-    };
-  }
-};
-
-template <typename fp_type>
-using CNotPowGate = CXPowGate<fp_type>;
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of CXPowGate.
- * This is the canonical CX (or CNOT) gate.
- */
-template <typename fp_type>
-struct CX {
-  static constexpr GateKind kind = kCX;
-  static constexpr char name[] = "kCX";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, CX>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},
-    };
-  }
-};
-
-template <typename fp_type>
-using CNOT = CX<fp_type>;
-
-// Gates from cirq/ops/pauli_gates.py:
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of XPowGate.
- * This is the canonical Pauli X gate.
- */
-template <typename fp_type>
-struct X : public XPowGate<fp_type> {
-  static constexpr GateKind kind = kX;
-  static constexpr char name[] = "X";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, X>(
-        time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of YPowGate.
- * This is the canonical Pauli Y gate.
- */
-template <typename fp_type>
-struct Y : public YPowGate<fp_type> {
-  static constexpr GateKind kind = kY;
-  static constexpr char name[] = "Y";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, Y>(
-        time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of ZPowGate.
- * This is the canonical Pauli Z gate.
- */
-template <typename fp_type>
-struct Z : public ZPowGate<fp_type> {
-  static constexpr GateKind kind = kZ;
-  static constexpr char name[] = "Z";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, Z>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0});
-  }
-};
-
-// Gates from cirq/ops/phased_x_gate.py:
-
-/**
- * An XPowGate conjugated by ZPowGate%s.
- * Equivalent to the circuit `───Z^-p───X^t───Z^p───`.
- */
-template <typename fp_type>
-struct PhasedXPowGate {
-  static constexpr GateKind kind = kPhasedXPowGate;
-  static constexpr char name[] = "PhasedXPowGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type phase_exponent, fp_type exponent = 1,
-                                  fp_type global_shift = 0) {
-    fp_type pc = std::cos(pi * phase_exponent);
-    fp_type ps = std::sin(pi * phase_exponent);
-    fp_type ec = std::cos(pi * exponent);
-    fp_type es = std::sin(pi * exponent);
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-
-    fp_type ar = 0.5 * ((1 + ec) * gc - es * gs);
-    fp_type ai = 0.5 * ((1 + ec) * gs + es * gc);
-    fp_type br = -0.5 * ((-1 + ec) * gc - es * gs);
-    fp_type bi = -0.5 * ((-1 + ec) * gs + es * gc);
-
-    return CreateGate<GateCirq<fp_type>, PhasedXPowGate>(
-        time, {q0}, {ar, ai, pc * br + ps * bi, pc * bi - ps * br,
-                     pc * br - ps * bi, pc * bi + ps * br, ar, ai},
-        {phase_exponent, exponent, global_shift});
-  }
-};
-
-// Gates from cirq/ops/phased_x_z_gate.py:
-
-/**
- * A PhasedXPowGate followed by a ZPowGate.
- * Equivalent to the circuit `───Z^(-a)──X^x──Z^a───Z^z───`.
- */
-template <typename fp_type>
-struct PhasedXZGate {
-  static constexpr GateKind kind = kPhasedXZGate;
-  static constexpr char name[] = "PhasedXZGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type x_exponent, fp_type z_exponent,
-                                  fp_type axis_phase_exponent) {
-    fp_type xc = std::cos(pi * x_exponent);
-    fp_type xs = std::sin(pi * x_exponent);
-    fp_type zc = std::cos(pi * z_exponent);
-    fp_type zs = std::sin(pi * z_exponent);
-    fp_type ac = std::cos(pi * axis_phase_exponent);
-    fp_type as = std::sin(pi * axis_phase_exponent);
-
-    fp_type br = 0.5 * (1 + xc);
-    fp_type bi = 0.5 * xs;
-    fp_type cr = -0.5 * (-1 + xc);
-    fp_type ci = -0.5 * xs;
-    fp_type dr = ac * zc - as * zs;
-    fp_type di = ac * zs + as * zc;
-
-    return CreateGate<GateCirq<fp_type>, PhasedXZGate>(
-        time, {q0}, {br, bi, ac * cr + as * ci, ac * ci - as * cr,
-                     dr * cr - di * ci, dr * ci + di * cr,
-                     zc * br - zs * bi, zc * bi + zs * br},
-        {x_exponent, z_exponent, axis_phase_exponent});
-  }
-};
-
-// Gates from cirq/ops/parity_gates.py:
-
-/**
- * The tensor product of two X gates, possibly raised to an exponent.
- */
-template <typename fp_type>
-struct XXPowGate {
-  static constexpr GateKind kind = kXXPowGate;
-  static constexpr char name[] = "XXPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
-    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
-    fp_type xc = 0.5 * ((1 - c) * gc + s * gs);
-    fp_type xs = 0.5 * ((1 - c) * gs - s * gc);
-
-    return CreateGate<GateCirq<fp_type>, XXPowGate>(
-        time, {q0, q1}, {ic, is, 0, 0, 0, 0, xc, xs,
-                         0, 0, ic, is, xc, xs, 0, 0,
-                         0, 0, xc, xs, ic, is, 0, 0,
-                         xc, xs, 0, 0, 0, 0, ic, is}, {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
-    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
-    fp_type xc = 0.5 * ((1 - c) * gc + s * gs);
-    fp_type xs = 0.5 * ((1 - c) * gs - s * gc);
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},
-      {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, xc, xs, xc, xs, 0, 0}},
-    };
-  }
-};
-
-/**
- * The tensor product of two Y gates, possibly raised to an exponent.
- */
-template <typename fp_type>
-struct YYPowGate {
-  static constexpr GateKind kind = kYYPowGate;
-  static constexpr char name[] = "YYPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
-    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
-    fp_type yc = 0.5 * ((1 - c) * gc + s * gs);
-    fp_type ys = 0.5 * ((1 - c) * gs - s * gc);
-
-    return CreateGate<GateCirq<fp_type>, YYPowGate>(
-        time, {q0, q1}, {ic, is, 0, 0, 0, 0, -yc, -ys,
-                         0, 0, ic, is, yc, ys, 0, 0,
-                         0, 0, yc, ys, ic, is, 0, 0,
-                         -yc, -ys, 0, 0, 0, 0, ic, is},
-        {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
-    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
-    fp_type yc = 0.5 * ((1 - c) * gc + s * gs);
-    fp_type ys = 0.5 * ((1 - c) * gs - s * gc);
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},
-      {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, ys, -yc, -ys, yc, 0, 0}},
-    };
-  }
-};
-
-/**
- * The tensor product of two Z gates, possibly raised to an exponent.
- */
-template <typename fp_type>
-struct ZZPowGate {
-  static constexpr GateKind kind = kZZPowGate;
-  static constexpr char name[] = "ZZPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type zc = std::cos(pi * exponent * (1 + global_shift));
-    fp_type zs = std::sin(pi * exponent * (1 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, ZZPowGate>(
-        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
-                         0, 0, zc, zs, 0, 0, 0, 0,
-                         0, 0, 0, 0, zc, zs, 0, 0,
-                         0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
-    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
-    fp_type zc = 0.5 * ((1 - c) * gc + s * gs);
-    fp_type zs = 0.5 * ((1 - c) * gs - s * gc);
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},
-      {{1, 0, 0, 0, 0, 0, -1, 0}, {zc, zs, 0, 0, 0, 0, -zc, -zs}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of XXPowGate.
- * This is the tensor product of two X gates.
- */
-template <typename fp_type>
-struct XX {
-  static constexpr GateKind kind = kXX;
-  static constexpr char name[] = "XX";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, XX>(
-        time, {q0, q1}, {0, 0, 0, 0, 0, 0, 1, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         1, 0, 0, 0, 0, 0, 0, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of YYPowGate.
- * This is the tensor product of two Y gates.
- */
-template <typename fp_type>
-struct YY {
-  static constexpr GateKind kind = kYY;
-  static constexpr char name[] = "YY";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, YY>(
-        time, {q0, q1}, {0, 0, 0, 0, 0, 0, -1, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         -1, 0, 0, 0, 0, 0, 0, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, 0, -1, 0, 1, 0, 0}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of ZZPowGate.
- * This is the tensor product of two Z gates.
- */
-template <typename fp_type>
-struct ZZ {
-  static constexpr GateKind kind = kZZ;
-  static constexpr char name[] = "ZZ";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, ZZ>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, -1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, -1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, -1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}},
-    };
-  }
-};
-
-// Gates from cirq/ops/swap_gates.py:
-
-/**
- * The SWAP gate, possibly raised to a power. Exchanges qubits.
- */
-template <typename fp_type>
-struct SwapPowGate {
-  static constexpr GateKind kind = kSwapPowGate;
-  static constexpr char name[] = "SwapPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, SwapPowGate>(
-        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
-                         0, 0, c * ec, c * es, s * es, -s * ec, 0, 0,
-                         0, 0, s * es, -s * ec, c * ec, c * es, 0, 0,
-                         0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
-
-    return schmidt_decomp_type<fp_type>{
-      {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * ec, gs + c * es, 0, 0,
-                                  0, 0, gc + c * ec, gs + c * es}},
-      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * es, -s * ec,
-                                  s * es, -s * ec, 0, 0}},
-      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, -s * ec, -s * es,
-                                   s * ec, s * es, 0, 0}},
-      {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * ec, gs - c * es, 0, 0,
-                                   0, 0, -gc + c * ec, -gs + c * es}},
-    };
-  }
-};
-
-/**
- * Rotates the |01⟩ vs |10⟩ subspace of two qubits around its Bloch X-axis.
- * This is a generalization of the ISWAP gate.
- */
-template <typename fp_type>
-struct ISwapPowGate {
-  static constexpr GateKind kind = kISwapPowGate;
-  static constexpr char name[] = "ISwapPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-
-    return CreateGate<GateCirq<fp_type>, ISwapPowGate>(
-        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
-                         0, 0, c * gc, c * gs, -s * gs, s * gc, 0, 0,
-                         0, 0, -s * gs, s * gc, c * gc, c * gs, 0, 0,
-                         0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-
-    return schmidt_decomp_type<fp_type>{
-      {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * gc, gs + c * gs, 0, 0,
-                                  0, 0, gc + c * gc, gs + c * gs}},
-      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, -s * gs, s * gc,
-                                  -s * gs, s * gc, 0, 0}},
-      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * gc, s * gs,
-                                   -s * gc, -s * gs, 0, 0}},
-      {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * gc, gs - c * gs, 0, 0,
-                                   0, 0, -gc + c * gc, -gs + c * gs}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 2*phi/pi, global_shift = 0)` instance of ISwapPowGate.
- * This is a generalization of the ISWAP gate with a fixed global phase of zero.
- * This is a function in Cirq.
- */
-template <typename fp_type>
-struct riswap {
-  static constexpr GateKind kind = kriswap;
-  static constexpr char name[] = "riswap";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type phi) {
-    fp_type c = std::cos(phi);
-    fp_type s = std::sin(phi);
-
-    return CreateGate<GateCirq<fp_type>, riswap>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, c, 0, 0, s, 0, 0,
-                         0, 0, 0, s, c, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0}, {phi});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
-    fp_type c = std::cos(phi);
-    fp_type s = std::sin(phi);
-
-    return schmidt_decomp_type<fp_type>{
-      {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}},
-      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, 0, s, 0, s, 0, 0}},
-      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}},
-      {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of SwapPowGate.
- * This is the canonical SWAP gate.
- */
-template <typename fp_type>
-struct SWAP {
-  static constexpr GateKind kind = kSWAP;
-  static constexpr char name[] = "SWAP";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, SWAP>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
-      {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}},
-      {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}},
-      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of ISwapPowGate.
- * This is the canonical ISWAP gate.
- */
-template <typename fp_type>
-struct ISWAP {
-  static constexpr GateKind kind = kISWAP;
-  static constexpr char name[] = "ISWAP";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, ISWAP>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 1, 0, 0,
-                         0, 0, 0, 1, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
-      {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}},
-      {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}},
-      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
-    };
-  }
-};
-
-// Gates from cirq/ops/phased_iswap_gate.py:
-
-/**
- * An ISwapPowGate conjugated by ZPowGate%s.
- * Equivalent to the composition `(Z^-p ⊗ Z^p) ISWAP^t (Z^p ⊗ Z^-p)`.
- */
-template <typename fp_type>
-struct PhasedISwapPowGate {
-  static constexpr GateKind kind = kPhasedISwapPowGate;
-  static constexpr char name[] = "PhasedISwapPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type phase_exponent = 0.25,
-                                  fp_type exponent = 1.0) {
-    fp_type fc = std::cos(2 * pi * phase_exponent);
-    fp_type fs = std::sin(2 * pi * phase_exponent);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, PhasedISwapPowGate>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, c, 0, s * fs, s * fc, 0, 0,
-                         0, 0, -s * fs, s * fc, c, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0}, {phase_exponent, exponent});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type phase_exponent, fp_type exponent) {
-    fp_type fc = std::cos(2 * pi * phase_exponent);
-    fp_type fs = std::sin(2 * pi * phase_exponent);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-
-    return schmidt_decomp_type<fp_type>{
-      {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}},
-      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * fs, s * fc, -s * fs, s * fc, 0, 0}},
-      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * fc, -s * fs,
-                                   -s * fc, -s * fs, 0, 0}},
-      {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}},
-    };
-  }
-};
-
-/**
- * The `(phase_exponent = 0.25, exponent = 2*phi/pi)` instance of
- * PhasedISwapPowGate.
- * This is the "Givens rotation" from numerical linear algebra.
- * This is a function in Cirq.
- */
-template <typename fp_type>
-struct givens {
-  static constexpr GateKind kind = kgivens;
-  static constexpr char name[] = "givens";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type phi) {
-    fp_type c = std::cos(phi);
-    fp_type s = std::sin(phi);
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, givens>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, c, 0, s, 0, 0, 0,
-                         0, 0, -s, 0, c, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0}, {phi});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
-    fp_type c = std::cos(phi);
-    fp_type s = std::sin(phi);
-
-    return schmidt_decomp_type<fp_type>{
-      {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}},
-      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}},
-      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, 0, -s, 0, -s, 0, 0}},
-      {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}},
-    };
-  }
-};
-
-// Gates from cirq/ops/fsim_gate.py:
-
-/**
- * The fermionic simulation gate family. Contains all two-qubit interactions
- * that preserve excitations, up to single-qubit rotations and global phase.
- */
-template <typename fp_type>
-struct FSimGate {
-  static constexpr GateKind kind = kFSimGate;
-  static constexpr char name[] = "FSimGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(
-      unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) {
-    if (phi < 0) {
-      phi += 2 * 3.141592653589793;
-    }
-
-    fp_type ct = std::cos(theta);
-    fp_type st = std::sin(theta);
-    fp_type cp = std::cos(phi);
-    fp_type sp = std::sin(phi);
-
-    return CreateGate<GateCirq<fp_type>, FSimGate>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, ct, 0, 0, -st, 0, 0,
-                         0, 0, 0, -st, ct, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type theta, fp_type phi) {
-    fp_type ct = std::cos(theta);
-    fp_type st = std::sin(theta);
-
-    fp_type cp2 = std::cos(0.5 * phi);
-    fp_type sp2 = std::sin(0.5 * phi);
-    fp_type cp4 = std::cos(0.25 * phi);
-    fp_type sp4 = std::sin(0.25 * phi);
-
-    fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct));
-    fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct));
-
-    fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct);
-    fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct);
-
-    fp_type c0 = is2 * a0 * std::cos(p0);
-    fp_type s0 = is2 * a0 * std::sin(p0);
-
-    fp_type c1 = is2 * a1 * std::cos(p1);
-    fp_type s1 = is2 * a1 * std::sin(p1);
-
-    fp_type st2 = 0.5 * std::sqrt(st);
-
-    fp_type a = cp4 * c0 - sp4 * s0;
-    fp_type b = cp4 * s0 + sp4 * c0;
-    fp_type c = cp4 * c0 + sp4 * s0;
-    fp_type d = cp4 * s0 - sp4 * c0;
-
-    fp_type e = cp4 * c1 - sp4 * s1;
-    fp_type f = cp4 * s1 + sp4 * c1;
-    fp_type g = -(cp4 * c1 + sp4 * s1);
-    fp_type h = -(cp4 * s1 - sp4 * c1);
-
-    return schmidt_decomp_type<fp_type>{
-      {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}},
-      {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}},
-      {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}},
-      {{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}},
-    };
-  }
-};
-
-// Gates from cirq/ops/two_qubit_diagonal_gate.py:
-
-/**
- * A two-qubit diagonal gate.
- */
-template <typename fp_type>
-struct TwoQubitDiagonalGate {
-  static constexpr GateKind kind = kTwoQubitDiagonalGate;
-  static constexpr char name[] = "TwoQubitDiagonalGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1,
-                                  const std::vector<fp_type>& angles) {
-    std::vector<fp_type> cs;
-    std::vector<fp_type> ss;
-    cs.reserve(4);
-    ss.reserve(4);
-
-    for (std::size_t i = 0; i < angles.size(); ++i) {
-      cs.push_back(std::cos(angles[i]));
-      ss.push_back(std::sin(angles[i]));
-    }
-
-    for (std::size_t i = angles.size(); i < 4; ++i) {
-      cs.push_back(1);
-      ss.push_back(0);
-    }
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, TwoQubitDiagonalGate>(
-        time, {q0, q1}, {cs[0], ss[0], 0, 0, 0, 0, 0, 0,
-                         0, 0, cs[2], ss[2], 0, 0, 0, 0,
-                         0, 0, 0, 0, cs[1], ss[1], 0, 0,
-                         0, 0, 0, 0, 0, 0, cs[3], ss[3]});
-  }
-};
-
-// Gates from cirq/ops/three_qubit_gates.py:
-
-/**
- * A three-qubit diagonal gate.
- */
-template <typename fp_type>
-struct ThreeQubitDiagonalGate {
-  static constexpr GateKind kind = kThreeQubitDiagonalGate;
-  static constexpr char name[] = "ThreeQubitDiagonalGate";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2,
-                                  const std::vector<fp_type>& angles) {
-    std::vector<fp_type> cs;
-    std::vector<fp_type> ss;
-    cs.reserve(8);
-    ss.reserve(8);
-
-    for (std::size_t i = 0; i < angles.size(); ++i) {
-      cs.push_back(std::cos(angles[i]));
-      ss.push_back(std::sin(angles[i]));
-    }
-
-    for (std::size_t i = angles.size(); i < 8; ++i) {
-      cs.push_back(1);
-      ss.push_back(0);
-    }
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, ThreeQubitDiagonalGate>(
-        time, {q0, q1, q2},
-        {cs[0], ss[0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, cs[4], ss[4], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, cs[2], ss[2], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, cs[6], ss[6], 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, cs[1], ss[1], 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[5], ss[5], 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[3], ss[3], 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[7], ss[7]});
-  }
-};
-
-/**
- * A gate that applies a phase to the |111⟩ state of three qubits.
- * This is a generalization of the CCZ gate.
- */
-template <typename fp_type>
-struct CCZPowGate {
-  static constexpr GateKind kind = kCCZPowGate;
-  static constexpr char name[] = "CCZPowGate";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (1 + global_shift));
-    fp_type es = std::sin(pi * exponent * (1 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, CCZPowGate>(
-        time, {q0, q1, q2}, {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ec, es},
-                            {exponent, global_shift});
-  }
-};
-
-/**
- * A gate that applies a doubly-controlled power of an X gate.
- * This is a generalization of the CCX (or CCNOT) gate.
- */
-template <typename fp_type>
-struct CCXPowGate {
-  static constexpr GateKind kind = kCCXPowGate;
-  static constexpr char name[] = "CCXPowGate";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, CCXPowGate>(
-        time, {q0, q1, q2},
-        {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, c * ec, c * es, 0, 0, 0, 0, 0, 0, s * es, -s * ec,
-         0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0,
-         0, 0, 0, 0, 0, 0, s * es, -s * ec, 0, 0, 0, 0, 0, 0, c * ec, c * es},
-        {exponent, global_shift});
-  }
-};
-
-/**
- * A controlled swap gate (the Fredkin gate).
- */
-template <typename fp_type>
-struct CSwapGate {
-  static constexpr GateKind kind = kCSwapGate;
-  static constexpr char name[] = "CSwapGate";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2) {
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, CSwapGate>(
-        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of CCZPowGate.
- * This is the canonical doubly-controlled Z gate.
- */
-template <typename fp_type>
-struct CCZ {
-  static constexpr GateKind kind = kCCZ;
-  static constexpr char name[] = "CCZ";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2) {
-    return CreateGate<GateCirq<fp_type>, CCZ>(
-        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of CCXPowGate.
- * This is the canonical doubly-controlled X gate (the TOFFOLI gate).
- */
-template <typename fp_type>
-struct CCX {
-  static constexpr GateKind kind = kCCX;
-  static constexpr char name[] = "CCX";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2) {
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, CCX>(
-        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0});
-  }
-};
-
-template <typename fp_type>
-using CCNotPowGate = CCXPowGate<fp_type>;
-
-template <typename fp_type>
-using TOFFOLI = CCX<fp_type>;
-
-template <typename fp_type>
-using CCNOT = CCX<fp_type>;
-
-template <typename fp_type>
-using CSWAP = CSwapGate<fp_type>;
-
-template <typename fp_type>
-using FREDKIN = CSwapGate<fp_type>;
-
-// Gates from cirq/ops/matrix_gates.py:
-
-/**
- * A one-qubit gate defined entirely by its matrix.
- */
-template <typename fp_type>
-struct MatrixGate1 {
-  static constexpr GateKind kind = kMatrixGate1;
-  static constexpr char name[] = "MatrixGate1";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  const Matrix<fp_type>& m) {
-    auto m2 = m;
-    return
-        CreateGate<GateCirq<fp_type>, MatrixGate1>(time, {q0}, std::move(m2));
-  }
-};
-
-/**
- * A two-qubit gate defined entirely by its matrix.
- */
-template <typename fp_type>
-struct MatrixGate2 {
-  static constexpr GateKind kind = kMatrixGate2;
-  static constexpr char name[] = "MatrixGate2";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  template <typename M = Matrix<fp_type>>
-  static GateCirq<fp_type> Create(
-      unsigned time, unsigned q0, unsigned q1, M&& m) {
-    return CreateGate<GateCirq<fp_type>, MatrixGate2>(time, {q1, q0},
-                                                      std::forward<M>(m));
-  }
-};
-
-/**
- * A multi-qubit gate defined entirely by its matrix.
- */
-template <typename fp_type>
-struct MatrixGate {
-  static constexpr GateKind kind = kMatrixGate;
-  static constexpr char name[] = "MatrixGate";
-  static constexpr bool symmetric = false;
-
-  template <typename M = Matrix<fp_type>>
-  static GateCirq<fp_type> Create(unsigned time,
-                                  std::vector<unsigned> qubits, M&& m) {
-    std::reverse(qubits.begin(), qubits.end());
-    return CreateGate<GateCirq<fp_type>, MatrixGate>(time, std::move(qubits),
-                                                     std::forward<M>(m));
-  }
-};
-
-}  // namesapce Cirq
-
-template <typename fp_type>
-inline schmidt_decomp_type<fp_type> GetSchmidtDecomp(
-    Cirq::GateKind kind, const std::vector<fp_type>& params) {
-  switch (kind) {
-  case Cirq::kI2:
-    return Cirq::I2<fp_type>::SchmidtDecomp();
-  case Cirq::kCZPowGate:
-    return Cirq::CZPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kCXPowGate:
-    return Cirq::CXPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kCZ:
-    return Cirq::CZ<fp_type>::SchmidtDecomp();
-  case Cirq::kCX:
-    return Cirq::CX<fp_type>::SchmidtDecomp();
-  case Cirq::kXXPowGate:
-    return Cirq::XXPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kYYPowGate:
-    return Cirq::YYPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kZZPowGate:
-    return Cirq::ZZPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kXX:
-    return Cirq::XX<fp_type>::SchmidtDecomp();
-  case Cirq::kYY:
-    return Cirq::YY<fp_type>::SchmidtDecomp();
-  case Cirq::kZZ:
-    return Cirq::ZZ<fp_type>::SchmidtDecomp();
-  case Cirq::kSwapPowGate:
-    return Cirq::SwapPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kISwapPowGate:
-    return Cirq::ISwapPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kriswap:
-    return Cirq::riswap<fp_type>::SchmidtDecomp(params[0]);
-  case Cirq::kSWAP:
-    return Cirq::SWAP<fp_type>::SchmidtDecomp();
-  case Cirq::kISWAP:
-    return Cirq::ISWAP<fp_type>::SchmidtDecomp();
-  case Cirq::kPhasedISwapPowGate:
-    return Cirq::PhasedISwapPowGate<fp_type>::SchmidtDecomp(
-        params[0], params[1]);
-  case Cirq::kgivens:
-    return Cirq::givens<fp_type>::SchmidtDecomp(params[0]);
-  case Cirq::kFSimGate:
-    return Cirq::FSimGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  default:
-    // Single qubit gates of gates with unimplemented Schmidt decomposition.
-    return schmidt_decomp_type<fp_type>{};
-  }
-}
-
-}  // namespace qsim
-
-#endif  // GATES_CIRQ_H_
diff --git a/qsim/gates_qsim.h b/qsim/gates_qsim.h
deleted file mode 100644
index 366c4f1..0000000
--- a/qsim/gates_qsim.h
+++ /dev/null
@@ -1,661 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GATES_QSIM_H_
-#define GATES_QSIM_H_
-
-#include <array>
-#include <cmath>
-#include <vector>
-
-#include "gate.h"
-
-namespace qsim {
-
-// Gate set implemented in qsim contains the following gates.
-enum GateKind {
-  kGateId1 = 0, // one-qubit Id
-  kGateHd,      // Hadamard
-  kGateT,       // T
-  kGateX,       // X
-  kGateY,       // Y
-  kGateZ,       // Z
-  kGateX2,      // sqrt(X)
-  kGateY2,      // sqrt(Y)
-  kGateRX,      // X-rotation
-  kGateRY,      // Y-rotation
-  kGateRZ,      // Z-rotation
-  kGateRXY,     // XY-rotation (rotation around arbitrary axis in the XY plane)
-  kGateHZ2,     // pi / 2 rotation around the X + Y axis
-  kGateS,       // S
-  kGateId2,     // two-qubit Id
-  kGateCZ,      // CZ
-  kGateCNot,    // CNOT (CX)
-  kGateSwap,    // swap
-  kGateIS,      // iSwap
-  kGateFS,      // fSim
-  kGateCP,      // control phase
-  kGateMatrix1, // one-qubit matrix gate
-  kGateMatrix2, // two-qubit matrix gate
-  kGateGPh,     // global phase gate
-  kDecomp = gate::kDecomp,
-  kMeasurement = gate::kMeasurement,
-};
-
-// Specialization of Gate (defined in gate.h) for the qsim gate set.
-template <typename fp_type>
-using GateQSim = Gate<fp_type, GateKind>;
-
-constexpr double h_double = 0.5;
-constexpr double is2_double = 0.7071067811865475;
-
-// Zero-qubit gates:
-
-/**
- * The global phase gate.
- */
-template <typename fp_type>
-struct GateGPh {
-  static constexpr GateKind kind = kGateGPh;
-  static constexpr char name[] = "p";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, fp_type phi) {
-    return Create(time, std::cos(phi), std::sin(phi));
-  }
-
-  static GateQSim<fp_type> Create(unsigned time, fp_type cp, fp_type sp) {
-    return CreateGate<GateQSim<fp_type>, GateGPh>(
-        time, {}, {cp, sp}, {cp, sp});
-  }
-};
-
-// One-qubit gates:
-
-/**
- * The one-qubit identity gate.
- */
-template <typename fp_type>
-struct GateId1 {
-  static constexpr GateKind kind = kGateId1;
-  static constexpr char name[] = "id1";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateId1>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0});
-  }
-};
-
-/**
- * The Hadamard gate.
- */
-template <typename fp_type>
-struct GateHd {
-  static constexpr GateKind kind = kGateHd;
-  static constexpr char name[] = "h";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateHd>(
-        time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0});
-  }
-};
-
-/**
- * The T gate, equivalent to `Z ^ 0.25`.
- */
-template <typename fp_type>
-struct GateT {
-  static constexpr GateKind kind = kGateT;
-  static constexpr char name[] = "t";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateT>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2});
-  }
-};
-
-/**
- * The Pauli X (or "NOT") gate.
- */
-template <typename fp_type>
-struct GateX {
-  static constexpr GateKind kind = kGateX;
-  static constexpr char name[] = "x";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateX>(
-        time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0});
-  }
-};
-
-/**
- * The Pauli Y gate.
- */
-template <typename fp_type>
-struct GateY {
-  static constexpr GateKind kind = kGateY;
-  static constexpr char name[] = "y";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateY>(
-        time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0});
-  }
-};
-
-/**
- * The Pauli Z gate.
- */
-template <typename fp_type>
-struct GateZ {
-  static constexpr GateKind kind = kGateZ;
-  static constexpr char name[] = "z";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateZ>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0});
-  }
-};
-
-/**
- * The "square root of X" gate.
- */
-template <typename fp_type>
-struct GateX2 {
-  static constexpr GateKind kind = kGateX2;
-  static constexpr char name[] = "x_1_2";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateX2>(
-        time, {q0}, {h, h, h, -h, h, -h, h, h});
-  }
-};
-
-/**
- * The "square root of Y" gate.
- */
-template <typename fp_type>
-struct GateY2 {
-  static constexpr GateKind kind = kGateY2;
-  static constexpr char name[] = "y_1_2";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateY2>(
-        time, {q0}, {h, h, -h, -h, h, h, h, h});
-  }
-};
-
-/**
- * A gate that rotates around the X axis of the Bloch sphere.
- * This is a generalization of the X gate.
- */
-template <typename fp_type>
-struct GateRX {
-  static constexpr GateKind kind = kGateRX;
-  static constexpr char name[] = "rx";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type phi2 = -0.5 * phi;
-    fp_type c = std::cos(phi2);
-    fp_type s = std::sin(phi2);
-
-    return CreateGate<GateQSim<fp_type>, GateRX>(
-        time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi});
-  }
-};
-
-/**
- * A gate that rotates around the Y axis of the Bloch sphere.
- * This is a generalization of the Y gate.
- */
-template <typename fp_type>
-struct GateRY {
-  static constexpr GateKind kind = kGateRY;
-  static constexpr char name[] = "ry";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type phi2 = -0.5 * phi;
-    fp_type c = std::cos(phi2);
-    fp_type s = std::sin(phi2);
-
-    return CreateGate<GateQSim<fp_type>, GateRY>(
-        time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi});
-  }
-};
-
-/**
- * A gate that rotates around the Z axis of the Bloch sphere.
- * This is a generalization of the Z gate.
- */
-template <typename fp_type>
-struct GateRZ {
-  static constexpr GateKind kind = kGateRZ;
-  static constexpr char name[] = "rz";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type phi2 = -0.5 * phi;
-    fp_type c = std::cos(phi2);
-    fp_type s = std::sin(phi2);
-
-    return CreateGate<GateQSim<fp_type>, GateRZ>(
-        time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi});
-  }
-};
-
-/**
- * A gate that rotates around an arbitrary axis in the XY-plane.
- */
-template <typename fp_type>
-struct GateRXY {
-  static constexpr GateKind kind = kGateRXY;
-  static constexpr char name[] = "rxy";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(
-      unsigned time, unsigned q0, fp_type theta, fp_type phi) {
-    fp_type phi2 = -0.5 * phi;
-    fp_type cp = std::cos(phi2);
-    fp_type sp = std::sin(phi2);
-    fp_type ct = std::cos(theta) * sp;
-    fp_type st = std::sin(theta) * sp;
-
-    return CreateGate<GateQSim<fp_type>, GateRXY>(
-        time, {q0}, {cp, 0, st, ct, -st, ct, cp, 0}, {theta, phi});
-  }
-};
-
-/**
- * A pi / 2 rotation around the X + Y axis.
- */
-template <typename fp_type>
-struct GateHZ2 {
-  static constexpr GateKind kind = kGateHZ2;
-  static constexpr char name[] = "hz_1_2";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateHZ2>(
-        time, {q0}, {h, h, 0, -is2, is2, 0, h, h});
-  }
-};
-
-/**
- * The S gate, equivalent to "square root of Z".
- */
-template <typename fp_type>
-struct GateS {
-  static constexpr GateKind kind = kGateS;
-  static constexpr char name[] = "s";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateS>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1});
-  }
-};
-
-/**
- * A one-qubit gate defined entirely by its matrix.
- */
-template <typename fp_type>
-struct GateMatrix1 {
-  static constexpr GateKind kind = kGateMatrix1;
-  static constexpr char name[] = "mat1";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0,
-                                  const Matrix<fp_type>& m) {
-    auto m2 = m;
-    return
-        CreateGate<GateQSim<fp_type>, GateMatrix1>(time, {q0}, std::move(m2));
-  }
-};
-
-// Two-qubit gates:
-
-/**
- * The two-qubit identity gate.
- */
-template <typename fp_type>
-struct GateId2 {
-  static constexpr GateKind kind = kGateId2;
-  static constexpr char name[] = "id2";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateQSim<fp_type>, GateId2>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-    };
-  }
-};
-
-/**
- * The controlled-Z (CZ) gate.
- */
-template <typename fp_type>
-struct GateCZ {
-  static constexpr GateKind kind = kGateCZ;
-  static constexpr char name[] = "cz";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateQSim<fp_type>, GateCZ>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, -1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}},
-    };
-  }
-};
-
-/**
- * The controlled-X (CX or CNOT) gate.
- */
-template <typename fp_type>
-struct GateCNot {
-  static constexpr GateKind kind = kGateCNot;
-  static constexpr char name[] = "cnot";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateQSim<fp_type>, GateCNot>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},
-    };
-  }
-};
-
-/**
- * The SWAP gate. Exchanges two qubits.
- */
-template <typename fp_type>
-struct GateSwap {
-  static constexpr GateKind kind = kGateSwap;
-  static constexpr char name[] = "sw";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateQSim<fp_type>, GateSwap>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
-      {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}},
-      {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}},
-      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
-    };
-  }
-};
-
-/**
- * The ISWAP gate.
- */
-template <typename fp_type>
-struct GateIS {
-  static constexpr GateKind kind = kGateIS;
-  static constexpr char name[] = "is";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateQSim<fp_type>, GateIS>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 1, 0, 0,
-                         0, 0, 0, 1, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
-      {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}},
-      {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}},
-      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
-    };
-  }
-};
-
-/**
- * The fermionic simulation (FSim) gate family. Contains all two-qubit
- * interactions that preserve excitations, up to single-qubit rotations and
- * global phase.
- */
-template <typename fp_type>
-struct GateFS {
-  static constexpr GateKind kind = kGateFS;
-  static constexpr char name[] = "fs";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(
-      unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) {
-    if (phi < 0) {
-      phi += 2 * 3.141592653589793;
-    }
-
-    fp_type ct = std::cos(theta);
-    fp_type st = std::sin(theta);
-    fp_type cp = std::cos(phi);
-    fp_type sp = std::sin(phi);
-
-    return CreateGate<GateQSim<fp_type>, GateFS>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, ct, 0, 0, -st, 0, 0,
-                         0, 0, 0, -st, ct, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type theta, fp_type phi) {
-    fp_type ct = std::cos(theta);
-    fp_type st = std::sin(theta);
-
-    fp_type cp2 = std::cos(0.5 * phi);
-    fp_type sp2 = std::sin(0.5 * phi);
-    fp_type cp4 = std::cos(0.25 * phi);
-    fp_type sp4 = std::sin(0.25 * phi);
-
-    fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct));
-    fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct));
-
-    fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct);
-    fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct);
-
-    fp_type c0 = is2 * a0 * std::cos(p0);
-    fp_type s0 = is2 * a0 * std::sin(p0);
-
-    fp_type c1 = is2 * a1 * std::cos(p1);
-    fp_type s1 = is2 * a1 * std::sin(p1);
-
-    fp_type st2 = 0.5 * std::sqrt(st);
-
-    fp_type a = cp4 * c0 - sp4 * s0;
-    fp_type b = cp4 * s0 + sp4 * c0;
-    fp_type c = cp4 * c0 + sp4 * s0;
-    fp_type d = cp4 * s0 - sp4 * c0;
-
-    fp_type e = cp4 * c1 - sp4 * s1;
-    fp_type f = cp4 * s1 + sp4 * c1;
-    fp_type g = -(cp4 * c1 + sp4 * s1);
-    fp_type h = -(cp4 * s1 - sp4 * c1);
-
-    return schmidt_decomp_type<fp_type>{
-      {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}},
-      {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}},
-      {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}},
-      {{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}},
-    };
-  }
-};
-
-/**
- * The controlled phase gate. A generalized version of GateCZ.
- */
-template <typename fp_type>
-struct GateCP {
-  static constexpr GateKind kind = kGateCP;
-  static constexpr char name[] = "cp";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(
-      unsigned time, unsigned q0, unsigned q1, fp_type phi) {
-    fp_type cp = std::cos(phi);
-    fp_type sp = std::sin(phi);
-
-    return CreateGate<GateQSim<fp_type>, GateCP>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, cp, -sp}, {phi});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
-    fp_type cp = std::cos(phi);
-    fp_type sp = std::sin(phi);
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, cp, -sp}},
-    };
-  }
-};
-
-/**
- * A two-qubit gate defined entirely by its matrix.
- */
-template <typename fp_type>
-struct GateMatrix2 {
-  static constexpr GateKind kind = kGateMatrix2;
-  static constexpr char name[] = "mat2";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  template <typename M = Matrix<fp_type>>
-  static GateQSim<fp_type> Create(
-      unsigned time, unsigned q0, unsigned q1, M&& m) {
-    return CreateGate<GateQSim<fp_type>, GateMatrix2>(time, {q1, q0},
-                                                      std::forward<M>(m));
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
-    // Not implemented.
-    return schmidt_decomp_type<fp_type>{};
-  }
-};
-
-template <typename fp_type>
-inline schmidt_decomp_type<fp_type> GetSchmidtDecomp(
-    GateKind kind, const std::vector<fp_type>& params) {
-  switch (kind) {
-  case kGateId2:
-    return GateId2<fp_type>::SchmidtDecomp();
-  case kGateCZ:
-    return GateCZ<fp_type>::SchmidtDecomp();
-  case kGateCNot:
-    return GateCNot<fp_type>::SchmidtDecomp();
-  case kGateSwap:
-    return GateSwap<fp_type>::SchmidtDecomp();
-  case kGateIS:
-    return GateIS<fp_type>::SchmidtDecomp();
-  case kGateFS:
-    return GateFS<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case kGateCP:
-    return GateCP<fp_type>::SchmidtDecomp(params[0]);
-  default:
-    // Single qubit gates: empty Schmidt decomposition.
-    return schmidt_decomp_type<fp_type>{};
-  }
-}
-
-}  // namespace qsim
-
-#endif  // GATES_QSIM_H_
diff --git a/qsim/hybrid.h b/qsim/hybrid.h
deleted file mode 100644
index 44fad5b..0000000
--- a/qsim/hybrid.h
+++ /dev/null
@@ -1,612 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HYBRID_H_
-#define HYBRID_H_
-
-#include <algorithm>
-#include <array>
-#include <complex>
-#include <vector>
-
-#include "gate.h"
-#include "gate_appl.h"
-
-namespace qsim {
-
-/**
- * Hybrid Feynman-Schrodinger simulator.
- */
-template <typename IO, typename GateT,
-          template <typename, typename> class FuserT, typename For>
-struct HybridSimulator final {
- public:
-  using Gate = GateT;
-  using GateKind = typename Gate::GateKind;
-  using fp_type = typename Gate::fp_type;
-
- private:
-  // Note that one can use "struct GateHybrid : public Gate {" in C++17.
-  struct GateHybrid {
-    using GateKind = HybridSimulator::GateKind;
-    using fp_type = HybridSimulator::fp_type;
-
-    GateKind kind;
-    unsigned time;
-    std::vector<unsigned> qubits;
-    std::vector<unsigned> controlled_by;
-    uint64_t cmask;
-    std::vector<fp_type> params;
-    Matrix<fp_type> matrix;
-    bool unfusible;
-    bool swapped;
-
-    const Gate* parent;
-    unsigned id;
-  };
-
-  struct GateX {
-    GateHybrid* decomposed0;
-    GateHybrid* decomposed1;
-    schmidt_decomp_type<fp_type> schmidt_decomp;
-    unsigned schmidt_bits;
-    unsigned swapped;
-  };
-
- public:
-  using Fuser = FuserT<IO, GateHybrid>;
-  using GateFused = typename Fuser::GateFused;
-
-  /**
-   * Contextual data for hybrid simulation.
-   */
-  struct HybridData {
-    /**
-     * List of gates on the "0" side of the cut.
-     */
-    std::vector<GateHybrid> gates0;
-    /**
-     * List of gates on the "1" side of the cut.
-     */
-    std::vector<GateHybrid> gates1;
-    /**
-     * List of gates on the cut.
-     */
-    std::vector<GateX> gatexs;
-    /**
-     * Global qubit index to local qubit index map.
-     */
-    std::vector<unsigned> qubit_map;
-    /**
-     * Number of qubits on the "0" side of the cut.
-     */
-    unsigned num_qubits0;
-    /**
-     * Number of qubits on the "1" side of the cut.
-     */
-    unsigned num_qubits1;
-    /**
-     * Number of gates on the cut.
-     */
-    unsigned num_gatexs;
-  };
-
-  /**
-   * User-specified parameters for gate fusion and hybrid simulation.
-   */
-  struct Parameter : public Fuser::Parameter {
-    /**
-     * Fixed bitstring indicating values to assign to Schmidt decomposition
-     * indices of prefix gates.
-     */
-    uint64_t prefix;
-    /**
-     * Number of gates on the cut that are part of the prefix. Indices of these
-     * gates are assigned the value indicated by `prefix`.
-     */
-    unsigned num_prefix_gatexs;
-    /**
-     * Number of gates on the cut that are part of the root. All gates that are
-     * not part of the prefix or root are part of the suffix.
-     */
-    unsigned num_root_gatexs;
-    unsigned num_threads;
-  };
-
-  template <typename... Args>
-  explicit HybridSimulator(Args&&... args) : for_(args...) {}
-
-  /**
-   * Splits the lattice into two parts, using Schmidt decomposition for gates
-   * on the cut.
-   * @param parts Lattice sections to be simulated.
-   * @param gates List of all gates in the circuit.
-   * @param hd Output data with split parts.
-   * @return True if the splitting done successfully; false otherwise.
-   */
-  static bool SplitLattice(const std::vector<unsigned>& parts,
-                           const std::vector<Gate>& gates, HybridData& hd) {
-    hd.num_gatexs = 0;
-    hd.num_qubits0 = 0;
-    hd.num_qubits1 = 0;
-
-    hd.gates0.reserve(gates.size());
-    hd.gates1.reserve(gates.size());
-    hd.qubit_map.reserve(parts.size());
-
-    unsigned count0 = 0;
-    unsigned count1 = 0;
-
-    // Global qubit index to local qubit index map.
-    for (std::size_t i = 0; i < parts.size(); ++i) {
-      parts[i] == 0 ? ++hd.num_qubits0 : ++hd.num_qubits1;
-      hd.qubit_map.push_back(parts[i] == 0 ? count0++ : count1++);
-    }
-
-    // Split the lattice.
-    for (const auto& gate : gates) {
-      if (gate.kind == gate::kMeasurement) {
-        IO::errorf("measurement gates are not suported by qsimh.\n");
-        return false;
-      }
-
-      if (gate.controlled_by.size() > 0) {
-        IO::errorf("controlled gates are not suported by qsimh.\n");
-        return false;
-      }
-
-      switch (gate.qubits.size()) {
-      case 1:  // Single qubit gates.
-        switch (parts[gate.qubits[0]]) {
-        case 0:
-          hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time,
-            {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix,
-            false, false, nullptr, 0});
-          break;
-        case 1:
-          hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time,
-            {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix,
-            false, false, nullptr, 0});
-          break;
-        }
-        break;
-      case 2:  // Two qubit gates.
-        {
-          switch ((parts[gate.qubits[1]] << 1) | parts[gate.qubits[0]]) {
-          case 0:  // Both qubits in part 0.
-            hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time,
-              {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]},
-              {}, 0, gate.params, gate.matrix, false, gate.swapped,
-              nullptr, 0});
-            break;
-          case 1:  // Gate on the cut, qubit 0 in part 1, qubit 1 in part 0.
-            hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
-              {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {},
-              true, gate.swapped, &gate, hd.num_gatexs});
-            hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
-              {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {},
-              true, gate.swapped, &gate, hd.num_gatexs});
-
-            ++hd.num_gatexs;
-            break;
-          case 2:  // Gate on the cut, qubit 0 in part 0, qubit 1 in part 1.
-            hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
-              {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {},
-              true, gate.swapped, &gate, hd.num_gatexs});
-            hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
-              {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {},
-              true, gate.swapped, &gate, hd.num_gatexs});
-
-            ++hd.num_gatexs;
-            break;
-          case 3:  // Both qubits in part 1.
-            hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time,
-              {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]},
-              {}, 0, gate.params, gate.matrix, false, gate.swapped,
-              nullptr, 0});
-            break;
-          }
-        }
-        break;
-      default:
-        IO::errorf("multi-qubit gates are not suported by qsimh.\n");
-        return false;
-      }
-    }
-
-    auto compare = [](const GateHybrid& l, const GateHybrid& r) -> bool {
-      return l.time < r.time || (l.time == r.time &&
-          (l.parent < r.parent || (l.parent == r.parent && l.id < r.id)));
-    };
-
-    // Sort gates.
-    std::sort(hd.gates0.begin(), hd.gates0.end(), compare);
-    std::sort(hd.gates1.begin(), hd.gates1.end(), compare);
-
-    hd.gatexs.reserve(hd.num_gatexs);
-
-    // Get Schmidt matrices.
-    for (auto& gate0 : hd.gates0) {
-      if (gate0.parent != nullptr) {
-        auto d = GetSchmidtDecomp(gate0.parent->kind, gate0.parent->params);
-        if (d.size() == 0) {
-          IO::errorf("no Schmidt decomposition for gate kind %u.\n",
-                     gate0.parent->kind);
-          return false;
-        }
-
-        unsigned schmidt_bits = SchmidtBits(d.size());
-        if (schmidt_bits > 2) {
-          IO::errorf("Schmidt rank is too large for gate kind %u.\n",
-                     gate0.parent->kind);
-          return false;
-        }
-
-        unsigned swapped = parts[gate0.parent->qubits[0]];
-        if (gate0.parent->swapped) swapped = 1 - swapped;
-        hd.gatexs.emplace_back(GateX{&gate0, nullptr, std::move(d),
-                                     schmidt_bits, swapped});
-      }
-    }
-
-    unsigned count = 0;
-    for (auto& gate1 : hd.gates1) {
-      if (gate1.parent != nullptr) {
-        hd.gatexs[count++].decomposed1 = &gate1;
-      }
-    }
-
-    for (auto& gatex : hd.gatexs) {
-      if (gatex.schmidt_decomp.size() == 1) {
-        FillSchmidtMatrices(0, gatex);
-      }
-    }
-
-    return true;
-  }
-
-  /**
-   * Runs the hybrid simulator on a sectioned lattice.
-   * @param param Options for parallelism and logging. Also specifies the size
-   *   of the 'prefix' and 'root' sections of the lattice.
-   * @param factory Object to create simulators and state spaces.
-   * @param hd Container object for gates on the boundary between lattice
-   *   sections.
-   * @param parts Lattice sections to be simulated.
-   * @param fgates0 List of gates from one section of the lattice.
-   * @param fgates1 List of gates from the other section of the lattice.
-   * @param bitstrings List of output states to simulate, as bitstrings.
-   * @param results Output vector of amplitudes. After a successful run, this
-   *   will be populated with amplitudes for each state in 'bitstrings'.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Factory, typename Results>
-  bool Run(const Parameter& param, const Factory& factory,
-           HybridData& hd, const std::vector<unsigned>& parts,
-           const std::vector<GateFused>& fgates0,
-           const std::vector<GateFused>& fgates1,
-           const std::vector<uint64_t>& bitstrings, Results& results) const {
-    using Simulator = typename Factory::Simulator;
-    using StateSpace = typename Simulator::StateSpace;
-    using State = typename StateSpace::State;
-
-    unsigned num_p_gates = param.num_prefix_gatexs;
-    unsigned num_pr_gates = num_p_gates + param.num_root_gatexs;
-
-    auto bits = CountSchmidtBits(param, hd.gatexs);
-
-    uint64_t rmax = uint64_t{1} << bits.num_r_bits;
-    uint64_t smax = uint64_t{1} << bits.num_s_bits;
-
-    auto loc0 = CheckpointLocations(param, fgates0);
-    auto loc1 = CheckpointLocations(param, fgates1);
-
-    struct Index {
-      unsigned i0;
-      unsigned i1;
-    };
-
-    std::vector<Index> indices;
-    indices.reserve(bitstrings.size());
-
-    // Bitstring indices for part 0 and part 1. TODO: optimize.
-    for (const auto& bitstring : bitstrings) {
-      Index index{0, 0};
-
-      for (uint64_t i = 0; i < hd.qubit_map.size(); ++i) {
-        unsigned m = ((bitstring >> i) & 1) << hd.qubit_map[i];
-        parts[i] ? index.i1 |= m : index.i0 |= m;
-      }
-
-      indices.push_back(index);
-    }
-
-    StateSpace state_space = factory.CreateStateSpace();
-
-    State* rstate0;
-    State* rstate1;
-
-    State state0p = state_space.Null();
-    State state1p = state_space.Null();
-    State state0r = state_space.Null();
-    State state1r = state_space.Null();
-    State state0s = state_space.Null();
-    State state1s = state_space.Null();
-
-    // Create states.
-
-    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, true,
-                      state0p, state1p, rstate0, rstate1)) {
-      return false;
-    }
-
-    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, rmax > 1,
-                      state0r, state1r, rstate0, rstate1)) {
-      return false;
-    }
-
-    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, smax > 1,
-                      state0s, state1s, rstate0, rstate1)) {
-      return false;
-    }
-
-    state_space.SetStateZero(state0p);
-    state_space.SetStateZero(state1p);
-
-    Simulator simulator = factory.CreateSimulator();
-
-    std::vector<unsigned> prev(hd.num_gatexs, unsigned(-1));
-
-    // param.prefix encodes the prefix path.
-    unsigned gatex_index = SetSchmidtMatrices(
-        0, num_p_gates, param.prefix, prev, hd.gatexs);
-
-    if (gatex_index == 0) {
-      // Apply gates before the first checkpoint.
-      ApplyGates(fgates0, 0, loc0[0], simulator, state0p);
-      ApplyGates(fgates1, 0, loc1[0], simulator, state1p);
-    } else {
-      IO::errorf("invalid prefix %lu for prefix gate index %u.\n",
-                 param.prefix, gatex_index - 1);
-      return false;
-    }
-
-    // Branch over root gates on the cut. r encodes the root path.
-    for (uint64_t r = 0; r < rmax; ++r) {
-      if (rmax > 1) {
-        state_space.Copy(state0p, state0r);
-        state_space.Copy(state1p, state1r);
-      }
-
-      if (SetSchmidtMatrices(num_p_gates, num_pr_gates,
-                             r, prev, hd.gatexs) == 0) {
-        // Apply gates before the second checkpoint.
-        ApplyGates(fgates0, loc0[0], loc0[1], simulator, state0r);
-        ApplyGates(fgates1, loc1[0], loc1[1], simulator, state1r);
-      } else {
-        continue;
-      }
-
-      // Branch over suffix gates on the cut. s encodes the suffix path.
-      for (uint64_t s = 0; s < smax; ++s) {
-        if (smax > 1) {
-          state_space.Copy(rmax > 1 ? state0r : state0p, state0s);
-          state_space.Copy(rmax > 1 ? state1r : state1p, state1s);
-        }
-
-        if (SetSchmidtMatrices(num_pr_gates, hd.num_gatexs,
-                               s, prev, hd.gatexs) == 0) {
-          // Apply the rest of the gates.
-          ApplyGates(fgates0, loc0[1], fgates0.size(), simulator, state0s);
-          ApplyGates(fgates1, loc1[1], fgates1.size(), simulator, state1s);
-        } else {
-          continue;
-        }
-
-        auto f = [](unsigned n, unsigned m, uint64_t i,
-                    const StateSpace& state_space,
-                    const State& state0, const State& state1,
-                    const std::vector<Index>& indices, Results& results) {
-          // TODO: make it faster for the CUDA state space.
-          auto a0 = state_space.GetAmpl(state0, indices[i].i0);
-          auto a1 = state_space.GetAmpl(state1, indices[i].i1);
-          results[i] += a0 * a1;
-        };
-
-        // Collect results.
-        for_.Run(results.size(), f,
-                 state_space, *rstate0, *rstate1, indices, results);
-      }
-    }
-
-    return true;
-  }
-
- private:
-  /**
-   * Identifies when to save "checkpoints" of the simulation state. These allow
-   * runs with different cut-index values to reuse parts of the simulation.
-   * @param param Options for parallelism and logging. Also specifies the size
-   *   of the 'prefix' and 'root' sections of the lattice.
-   * @param fgates Set of gates for which to find checkpoint locations.
-   * @return A pair of numbers specifying how many gates to apply before the
-   *   first and second checkpoints, respectively.
-   */
-  static std::array<unsigned, 2> CheckpointLocations(
-      const Parameter& param, const std::vector<GateFused>& fgates) {
-    std::array<unsigned, 2> loc{0, 0};
-
-    unsigned num_decomposed = 0;
-    unsigned num_p_gates = param.num_prefix_gatexs;
-    unsigned num_pr_gates = num_p_gates + param.num_root_gatexs;
-
-    for (std::size_t i = 0; i < fgates.size(); ++i) {
-      for (auto gate: fgates[i].gates) {
-        if (gate->parent != nullptr) {
-          ++num_decomposed;
-          // There should be only one decomposed gate in fused gate.
-          break;
-        }
-      }
-
-      if (num_decomposed <= num_p_gates) {
-        loc[0] = i + 1;
-      }
-
-      if (num_decomposed <= num_pr_gates) {
-        loc[1] = i + 1;
-      }
-    }
-
-    return loc;
-  }
-
-  struct Bits {
-    unsigned num_p_bits;
-    unsigned num_r_bits;
-    unsigned num_s_bits;
-  };
-
-  static Bits CountSchmidtBits(
-      const Parameter& param, const std::vector<GateX>& gatexs) {
-    Bits bits{0, 0, 0};
-
-    unsigned num_p_gates = param.num_prefix_gatexs;
-    unsigned num_pr_gates = num_p_gates + param.num_root_gatexs;
-
-    for (std::size_t i = 0; i < gatexs.size(); ++i) {
-      const auto& gatex = gatexs[i];
-      if (i < num_p_gates) {
-        bits.num_p_bits += gatex.schmidt_bits;
-      } else if (i < num_pr_gates) {
-        bits.num_r_bits += gatex.schmidt_bits;
-      } else {
-        bits.num_s_bits += gatex.schmidt_bits;
-      }
-    }
-
-    return bits;
-  }
-
-  static unsigned SetSchmidtMatrices(std::size_t i0, std::size_t i1,
-                                     uint64_t path,
-                                     std::vector<unsigned>& prev_k,
-                                     std::vector<GateX>& gatexs) {
-    unsigned shift_length = 0;
-
-    for (std::size_t i = i0; i < i1; ++i) {
-      const auto& gatex = gatexs[i];
-
-      if (gatex.schmidt_bits == 0) {
-        // Continue if gatex has Schmidt rank 1.
-        continue;
-      }
-
-      unsigned k = (path >> shift_length) & ((1 << gatex.schmidt_bits) - 1);
-      shift_length += gatex.schmidt_bits;
-
-      if (k != prev_k[i]) {
-        if (k >= gatex.schmidt_decomp.size()) {
-          // Invalid path. Returns gatex index plus one to report error in case
-          // of invalid prefix.
-          return i + 1;
-        }
-
-        FillSchmidtMatrices(k, gatex);
-
-        prev_k[i] = k;
-      }
-    }
-
-    return 0;
-  }
-
-  static void FillSchmidtMatrices(unsigned k, const GateX& gatex) {
-    unsigned part0 = gatex.swapped;
-    unsigned part1 = 1 - part0;
-    {
-      gatex.decomposed0->matrix.resize(gatex.schmidt_decomp[k][part0].size());
-      auto begin = gatex.schmidt_decomp[k][part0].begin();
-      auto end = gatex.schmidt_decomp[k][part0].end();
-      std::copy(begin, end, gatex.decomposed0->matrix.begin());
-    }
-    {
-      gatex.decomposed1->matrix.resize(gatex.schmidt_decomp[k][part1].size());
-      auto begin = gatex.schmidt_decomp[k][part1].begin();
-      auto end = gatex.schmidt_decomp[k][part1].end();
-      std::copy(begin, end, gatex.decomposed1->matrix.begin());
-    }
-  }
-
-  template <typename Simulator>
-  static void ApplyGates(const std::vector<GateFused>& gates,
-                         std::size_t i0, std::size_t i1,
-                         const Simulator& simulator,
-                         typename Simulator::State& state) {
-    for (std::size_t i = i0; i < i1; ++i) {
-      if (gates[i].matrix.size() > 0) {
-        ApplyFusedGate(simulator, gates[i], state);
-      } else {
-        auto gate = gates[i];
-        CalculateFusedMatrix(gate);
-        ApplyFusedGate(simulator, gate, state);
-      }
-    }
-  }
-
-  static unsigned SchmidtBits(unsigned size) {
-    switch (size) {
-    case 1:
-      return 0;
-    case 2:
-      return 1;
-    case 3:
-      return 2;
-    case 4:
-      return 2;
-    default:
-      // Not supported.
-      return 42;
-    }
-  }
-
-  template <typename StateSpace>
-  static bool CreateStates(unsigned num_qubits0,unsigned num_qubits1,
-                           const StateSpace& state_space, bool create,
-                           typename StateSpace::State& state0,
-                           typename StateSpace::State& state1,
-                           typename StateSpace::State* (&rstate0),
-                           typename StateSpace::State* (&rstate1)) {
-    if (create) {
-      state0 = state_space.Create(num_qubits0);
-      state1 = state_space.Create(num_qubits1);
-
-      if (state_space.IsNull(state0) || state_space.IsNull(state1)) {
-        IO::errorf("not enough memory: is the number of qubits too large?\n");
-        return false;
-      }
-
-      rstate0 = &state0;
-      rstate1 = &state1;
-    }
-
-    return true;
-  }
-
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // HYBRID_H_
diff --git a/qsim/io.h b/qsim/io.h
deleted file mode 100644
index 3b26c7c..0000000
--- a/qsim/io.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef IO_H_
-#define IO_H_
-
-#include <cstdarg>
-#include <cstdio>
-
-namespace qsim {
-
-/**
- * Controller for output logs.
- */
-struct IO {
-  static void errorf(const char* format, ...) {
-    va_list args;
-    va_start(args, format);
-    vfprintf(stderr, format, args);
-    va_end(args);
-  }
-
-  static void messagef(const char* format, ...) {
-    va_list args;
-    va_start(args, format);
-    vprintf(format, args);
-    va_end(args);
-  }
-};
-
-}  // namespace qsim
-
-#endif  // IO_H_
diff --git a/qsim/io_file.h b/qsim/io_file.h
deleted file mode 100644
index 3cfac12..0000000
--- a/qsim/io_file.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef IO_FILE_H_
-#define IO_FILE_H_
-
-#include <cstdint>
-#include <fstream>
-#include <string>
-
-#include "io.h"
-
-namespace qsim {
-
-/**
- * Controller for output logs with methods for writing to file.
- */
-struct IOFile : public IO {
-  static std::ifstream StreamFromFile(const std::string& file) {
-    std::ifstream fs;
-    fs.open(file);
-    if (!fs) {
-      errorf("cannot open %s for reading.\n", file.c_str());
-    }
-    return fs;
-  }
-
-  static void CloseStream(std::ifstream& fs) {
-    fs.close();
-  }
-
-  static bool WriteToFile(
-      const std::string& file, const std::string& content) {
-    return WriteToFile(file, content.data(), content.size());
-  }
-
-  static bool WriteToFile(
-      const std::string& file, const void* data, uint64_t size) {
-    auto fs = std::fstream(file, std::ios::out | std::ios::binary);
-
-    if (!fs) {
-      errorf("cannot open %s for writing.\n", file.c_str());
-      return false;
-    } else {
-      fs.write((const char*) data, size);
-      if (!fs) {
-        errorf("cannot write to %s.\n", file.c_str());
-        return false;
-      }
-
-      fs.close();
-    }
-
-    return true;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // IO_FILE_H_
diff --git a/qsim/matrix.h b/qsim/matrix.h
deleted file mode 100644
index a3c2640..0000000
--- a/qsim/matrix.h
+++ /dev/null
@@ -1,296 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MATRIX_H_
-#define MATRIX_H_
-
-#include <algorithm>
-#include <utility>
-#include <vector>
-
-#include "bits.h"
-
-namespace qsim {
-
-/**
- * Gate matrix type. Matrices are stored as vectors. The matrix elements are
- * accessed as real(m[i][j]) <- vector[2 * (n * i + j)] and
- * imag(m[i][j]) <- vector[2 * (n * i + j) + 1], where n is the number of rows
- * or columns (n = 2^q, where q is the number of gate qubits).
- */
-template <typename fp_type>
-using Matrix = std::vector<fp_type>;
-
-/**
- * Sets all matrix elements to zero.
- * @m Matrix to be cleared.
- */
-template <typename fp_type>
-inline void MatrixClear(Matrix<fp_type>& m) {
-  for (unsigned i = 0; i < m.size(); ++i) {
-    m[i] = 0;
-  }
-}
-
-/**
- * Sets an identity matrix.
- * @n Number of matrix rows (columns).
- * @m Output identity matrix.
- */
-template <typename fp_type>
-inline void MatrixIdentity(unsigned n, Matrix<fp_type>& m) {
-  m.resize(2 * n * n);
-
-  MatrixClear(m);
-
-  for (unsigned i = 0; i < n; ++i) {
-    m[2 * (n * i + i)] = 1;
-  }
-}
-
-/**
- * Multiplies two gate matrices of equal size: m2 = m1 m2.
- * @q Number of gate qubits. The number of matrix rows (columns) is 2^q.
- * @m1 Matrix m1.
- * @m2 Input matrix m2. Output product of matrices m2 = m1 m2.
- */
-template <typename fp_type1, typename fp_type2>
-inline void MatrixMultiply(
-    unsigned q, const Matrix<fp_type1>& m1, Matrix<fp_type2>& m2) {
-  Matrix<fp_type2> mt = m2;
-  unsigned n = unsigned{1} << q;
-
-  for (unsigned i = 0; i < n; ++i) {
-    for (unsigned j = 0; j < n; ++j) {
-      fp_type2 re = 0;
-      fp_type2 im = 0;
-
-      for (unsigned k = 0; k < n; ++k) {
-        fp_type2 r1 = m1[2 * (n * i + k)];
-        fp_type2 i1 = m1[2 * (n * i + k) + 1];
-        fp_type2 r2 = mt[2 * (n * k + j)];
-        fp_type2 i2 = mt[2 * (n * k + j) + 1];
-
-        re += r1 * r2 - i1 * i2;
-        im += r1 * i2 + i1 * r2;
-      }
-
-      m2[2 * (n * i + j)] = re;
-      m2[2 * (n * i + j) + 1] = im;
-    }
-  }
-}
-
-/**
- * Multiplies two gate matrices of equal size: m2 = m1^\dagger m2.
- * @q Number of gate qubits. The number of matrix rows (columns) is 2^q.
- * @m1 Matrix m1.
- * @m2 Input matrix m2. Output product of matrices m2 = m1 m2.
- */
-template <typename fp_type1, typename fp_type2>
-inline void MatrixDaggerMultiply(
-    unsigned q, const Matrix<fp_type1>& m1, Matrix<fp_type2>& m2) {
-  Matrix<fp_type2> mt = m2;
-  unsigned n = unsigned{1} << q;
-
-  for (unsigned i = 0; i < n; ++i) {
-    for (unsigned j = 0; j < n; ++j) {
-      fp_type2 re = 0;
-      fp_type2 im = 0;
-
-      for (unsigned k = 0; k < n; ++k) {
-        fp_type2 r1 = m1[2 * (n * k + i)];
-        fp_type2 i1 = m1[2 * (n * k + i) + 1];
-        fp_type2 r2 = mt[2 * (n * k + j)];
-        fp_type2 i2 = mt[2 * (n * k + j) + 1];
-
-        re += r1 * r2 + i1 * i2;
-        im += r1 * i2 - i1 * r2;
-      }
-
-      m2[2 * (n * i + j)] = re;
-      m2[2 * (n * i + j) + 1] = im;
-    }
-  }
-}
-
-/**
- * Multiplies two gate matrices: m2 = m1 m2. The size of m1 should not exceed
- *   the size of m2.
- * @mask1 Qubit mask that specifies the subset of qubits m1 acts on.
- * @q1 Number of gate qubits. The number of matrix rows (columns) is 2^q1.
- * @m1 Matrix m1.
- * @q2 Number of gate qubits. The number of matrix rows (columns) is 2^q2.
- * @m2 Input matrix m2. Output product of matrices m2 = m1 m2.
- */
-template <typename fp_type1, typename fp_type2>
-inline void MatrixMultiply(unsigned mask1,
-                           unsigned q1, const Matrix<fp_type1>& m1,
-                           unsigned q2, Matrix<fp_type2>& m2) {
-  if (q1 == q2) {
-    MatrixMultiply(q1, m1, m2);
-  } else {
-    Matrix<fp_type2> mt = m2;
-    unsigned n1 = unsigned{1} << q1;
-    unsigned n2 = unsigned{1} << q2;
-
-    for (unsigned i = 0; i < n2; ++i) {
-      unsigned si = bits::CompressBits(i, q2, mask1);
-
-      for (unsigned j = 0; j < n2; ++j) {
-        fp_type2 re = 0;
-        fp_type2 im = 0;
-
-        for (unsigned k = 0; k < n1; ++k) {
-          unsigned ek = bits::ExpandBits(k, q2, mask1) + (i & ~mask1);
-
-          fp_type2 r1 = m1[2 * (n1 * si + k)];
-          fp_type2 i1 = m1[2 * (n1 * si + k) + 1];
-          fp_type2 r2 = mt[2 * (n2 * ek + j)];
-          fp_type2 i2 = mt[2 * (n2 * ek + j) + 1];
-
-          re += r1 * r2 - i1 * i2;
-          im += r1 * i2 + i1 * r2;
-        }
-
-        m2[2 * (n2 * i + j)] = re;
-        m2[2 * (n2 * i + j) + 1] = im;
-      }
-    }
-  }
-}
-
-/**
- * Multiply a matrix by a real scalar value.
- * @c Scalar value.
- * @m Input matrix to be multiplied. Output matrix.
- */
-template <typename fp_type1, typename fp_type2>
-inline void MatrixScalarMultiply(fp_type1 c, Matrix<fp_type2>& m) {
-  for (unsigned i = 0; i < m.size(); ++i) {
-    m[i] *= c;
-  }
-}
-
-/**
- * Multiply a matrix by a complex scalar value.
- * @re Real part of scalar value.
- * @im Imaginary part of scalar value.
- * @m Input matrix to be multiplied. Output matrix.
- */
-template <typename fp_type1, typename fp_type2>
-inline void MatrixScalarMultiply(
-    fp_type1 re, fp_type1 im, Matrix<fp_type2>& m) {
-  for (unsigned i = 0; i < m.size() / 2; ++i) {
-    fp_type2 re0 = m[2 * i + 0];
-    fp_type2 im0 = m[2 * i + 1];
-    m[2 * i + 0] = re * re0 - im * im0;
-    m[2 * i + 1] = re * im0 + im * re0;
-  }
-}
-
-/**
- * Daggers a matrix.
- * @n Number of matrix rows (columns).
- * @m Input matrix. Output matrix.
- */
-template <typename fp_type>
-inline void MatrixDagger(unsigned n, Matrix<fp_type>& m) {
-  for (unsigned i = 0; i < n; ++i) {
-    m[2 * (n * i + i) + 1] = -m[2 * (n * i + i) + 1];
-
-    for (unsigned j = i + 1; j < n; ++j) {
-      std::swap(m[2 * (n * i + j)], m[2 * (n * j + i)]);
-      fp_type t = m[2 * (n * i + j) + 1];
-      m[2 * (n * i + j) + 1] = -m[2 * (n * j + i) + 1];
-      m[2 * (n * j + i) + 1] = -t;
-    }
-  }
-}
-
-/**
- * Gets a permutation to rearrange qubits from "normal" order to "gate"
- *   order. Qubits are ordered in increasing order for "normal" order.
- *   Qubits are ordered arbitrarily for "gate" order. Returns an empty vector
- *   if the qubits are in "normal" order.
- * @qubits Qubit indices in "gate" order.
- * @return Permutation as a vector.
- */
-inline std::vector<unsigned> NormalToGateOrderPermutation(
-    const std::vector<unsigned>& qubits) {
-  std::vector<unsigned> perm;
-
-  bool normal_order = true;
-
-  for (std::size_t i = 1; i < qubits.size(); ++i) {
-    if (qubits[i] < qubits[i - 1]) {
-      normal_order = false;
-      break;
-    }
-  }
-
-  if (!normal_order) {
-    struct QI {
-      unsigned q;
-      unsigned index;
-    };
-
-    std::vector<QI> qis;
-    qis.reserve(qubits.size());
-
-    for (std::size_t i = 0; i < qubits.size(); ++i) {
-      qis.push_back({qubits[i], unsigned(i)});
-    }
-
-    std::sort(qis.begin(), qis.end(), [](const QI& l, const QI& r) {
-                                        return l.q < r.q;
-                                      });
-
-    perm.reserve(qubits.size());
-
-    for (std::size_t i = 0; i < qubits.size(); ++i) {
-      perm.push_back(qis[i].index);
-    }
-  }
-
-  return perm;
-}
-
-/**
- * Shuffles the gate matrix elements to get the matrix that acts on qubits
- *   that are in "normal" order (in increasing orger).
- * @perm Permutation to rearrange qubits from "normal" order to "gate" order.
- * @q Number of gate qubits. The number of matrix rows (columns) is 2^q.
- * @m Input matrix. Output shuffled matrix.
- */
-template <typename fp_type>
-inline void MatrixShuffle(const std::vector<unsigned>& perm,
-                          unsigned q, Matrix<fp_type>& m) {
-  Matrix<fp_type> mt = m;
-  unsigned n = unsigned{1} << q;
-
-  for (unsigned i = 0; i < n; ++i) {
-    unsigned pi = bits::PermuteBits(i, q, perm);
-    for (unsigned j = 0; j < n; ++j) {
-      unsigned pj = bits::PermuteBits(j, q, perm);
-
-      m[2 * (n * i + j)] = mt[2 * (n * pi + pj)];
-      m[2 * (n * i + j) + 1] = mt[2 * (n * pi + pj) + 1];
-    }
-  }
-}
-
-}  // namespace qsim
-
-#endif  // MATRIX_H_
diff --git a/qsim/mps_simulator.h b/qsim/mps_simulator.h
deleted file mode 100644
index 8fbcbae..0000000
--- a/qsim/mps_simulator.h
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MPS_SIMULATOR_H_
-#define MPS_SIMULATOR_H_
-
-// For templates will take care of parallelization.
-#define EIGEN_DONT_PARALLELIZE 1
-
-#include <cstdint>
-#include <cstdlib>
-#include <cstring>
-#include <memory>
-#include <vector>
-
-#include "../eigen/Eigen/Dense"
-#include "../eigen/Eigen/SVD"
-#include "mps_statespace.h"
-
-namespace qsim {
-
-namespace mps {
-
-/**
- *  Truncated Matrix Product State (MPS) circuit simulator w/ vectorization.
- */
-template <typename For, typename FP = float>
-class MPSSimulator final {
- public:
-  using MPSStateSpace_ = MPSStateSpace<For, FP>;
-  using State = typename MPSStateSpace_::MPS;
-  using fp_type = typename MPSStateSpace_::fp_type;
-
-  using Complex = std::complex<fp_type>;
-  using Matrix =
-      Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-  using ConstMatrixMap = Eigen::Map<const Matrix>;
-  using MatrixMap = Eigen::Map<Matrix>;
-
-  using OneQubitMatrix = Eigen::Matrix<Complex, 2, 2, Eigen::RowMajor>;
-  using ConstOneQubitMap = Eigen::Map<const OneQubitMatrix>;
-
-  // Note: ForArgs are currently unused.
-  template <typename... ForArgs>
-  explicit MPSSimulator(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using non-vectorized instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs, const fp_type* matrix,
-                 State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-      case 1:
-        ApplyGate1(qs, matrix, state);
-        break;
-      case 2:
-        ApplyGate2(qs, matrix, state);
-        break;
-      // case 3:
-      //   ApplyGate3(qs, matrix, state);
-      //   break;
-      // case 4:
-      //   ApplyGate4(qs, matrix, state);
-      //   break;
-      // case 5:
-      //   ApplyGate5(qs, matrix, state);
-      //   break;
-      // case 6:
-      //   ApplyGate6(qs, matrix, state);
-      //   break;
-      default:
-        // Not implemented.
-        break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using eigen3 operations w/ instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cmask Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cmask,
-                           const fp_type* matrix, State& state) const {
-    // TODO.
-  }
-
-  /**
-   * Computes the expectation value of an operator using eigen3 operations
-   * w/ vectorized instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // TODO.
-    return std::complex<double>(-10., -10.);
-  }
-
- private:
-  void ApplyGate1(const std::vector<unsigned>& qs, const fp_type* matrix,
-                  State& state) const {
-    if (qs[0] == state.num_qubits() - 1) {
-      Apply1Right(qs, matrix, state);
-    } else {
-      Apply1LeftOrInterior(qs, matrix, state);
-    }
-  }
-
-  void Apply1LeftOrInterior(const std::vector<unsigned>& qs,
-                            const fp_type* matrix, State& state) const {
-    fp_type* raw_state = state.get();
-    const auto bond_dim = state.bond_dim();
-    const auto l_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]);
-    const auto r_offset = MPSStateSpace_::GetBlockOffset(state, qs[0] + 1);
-    const auto end = MPSStateSpace_::Size(state);
-    ConstOneQubitMap gate_matrix((Complex*) matrix);
-    MatrixMap scratch_block((Complex*)(raw_state + end), 2, bond_dim);
-
-    for (unsigned block_sep = l_offset; block_sep < r_offset;
-         block_sep += 4 * bond_dim) {
-      fp_type* cur_block = raw_state + block_sep;
-      ConstMatrixMap mps_block((Complex*) cur_block, 2, bond_dim);
-      scratch_block.noalias() = gate_matrix * mps_block;
-      memcpy(cur_block, raw_state + end, sizeof(fp_type) * bond_dim * 4);
-    }
-  }
-
-  void Apply1Right(const std::vector<unsigned>& qs, const fp_type* matrix,
-                   State& state) const {
-    fp_type* raw_state = state.get();
-    const auto bond_dim = state.bond_dim();
-    const auto offset = MPSStateSpace_::GetBlockOffset(state, qs[0]);
-    const auto end = MPSStateSpace_::Size(state);
-    ConstOneQubitMap gate_matrix((Complex*) matrix);
-    ConstMatrixMap mps_block((Complex*)(raw_state + offset), bond_dim, 2);
-    MatrixMap scratch_block((Complex*)(raw_state + end), bond_dim, 2);
-    scratch_block.noalias() = mps_block * gate_matrix.transpose();
-    memcpy(raw_state + offset, raw_state + end, sizeof(fp_type) * bond_dim * 4);
-  }
-
-  void ApplyGate2(const std::vector<unsigned>& qs, const fp_type* matrix,
-                  State& state) const {
-    // TODO: micro-benchmark this function and improve performance.
-    const auto bond_dim = state.bond_dim();
-    const auto num_qubits = state.num_qubits();
-    fp_type* raw_state = state.get();
-
-    const auto i_dim = (qs[0] == 0) ? 1 : bond_dim;
-    const auto j_dim = 2;
-    const auto k_dim = bond_dim;
-    const auto l_dim = 2;
-    const auto m_dim = (qs[1] == num_qubits - 1) ? 1 : bond_dim;
-
-    const auto b_0_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]);
-    const auto b_1_offset = MPSStateSpace_::GetBlockOffset(state, qs[1]);
-    const auto end = MPSStateSpace_::Size(state);
-
-    MatrixMap block_0((Complex*)(raw_state + b_0_offset), i_dim * j_dim, k_dim);
-    MatrixMap block_1((Complex*)(raw_state + b_1_offset), k_dim, l_dim * m_dim);
-
-    // Merge both blocks into scratch space.
-    MatrixMap scratch_c((Complex*)(raw_state + end), i_dim * j_dim, l_dim * m_dim);
-    scratch_c.noalias() = block_0 * block_1;
-
-    // Transpose inner dims in-place.
-    MatrixMap scratch_c_t((Complex*)(raw_state + end), i_dim * j_dim * l_dim, m_dim);
-    for (unsigned i = 0; i < i_dim * j_dim * l_dim; i += 4) {
-      scratch_c_t.row(i + 1).swap(scratch_c_t.row(i + 2));
-    }
-
-    // Transpose gate matrix and place in 3rd (last) scratch block.
-    const auto scratch3_offset = end + 8 * bond_dim * bond_dim;
-    ConstMatrixMap gate_matrix((Complex*) matrix, 4, 4);
-    MatrixMap gate_matrix_transpose((Complex*)(raw_state + scratch3_offset), 4, 4);
-    gate_matrix_transpose = gate_matrix.transpose();
-    gate_matrix_transpose.col(1).swap(gate_matrix_transpose.col(2));
-
-    // Contract gate and merged block tensors, placing result in B0B1.
-    for (unsigned i = 0; i < i_dim; ++i) {
-      fp_type* src_block = raw_state + end + i * 8 * m_dim;
-      fp_type* dest_block = raw_state + b_0_offset + i * 8 * m_dim;
-      MatrixMap block_b0b1((Complex*) dest_block, 4, m_dim);
-      ConstMatrixMap scratch_c_i((Complex*) src_block, 4, m_dim);
-      // [i, np, m] = [np, lj] * [i, lj, m]
-      block_b0b1.noalias() = gate_matrix_transpose * scratch_c_i;
-    }
-
-    // SVD B0B1.
-    MatrixMap full_b0b1((Complex*)(raw_state + b_0_offset), 2 * i_dim, 2 * m_dim);
-    Eigen::BDCSVD<Matrix> svd(full_b0b1, Eigen::ComputeThinU | Eigen::ComputeThinV);
-    const auto p = std::min(2 * i_dim, 2 * m_dim);
-
-    // Place U in scratch to truncate and then B0.
-    MatrixMap svd_u((Complex*)(raw_state + end), 2 * i_dim, p);
-    svd_u.noalias() = svd.matrixU();
-    block_0.fill(Complex(0, 0));
-    const auto keep_cols = (svd_u.cols() > bond_dim) ? bond_dim : svd_u.cols();
-    block_0.block(0, 0, svd_u.rows(), keep_cols).noalias() =
-        svd_u(Eigen::indexing::all, Eigen::seq(0, keep_cols - 1));
-
-    // Place row product of S V into scratch to truncate and then B1.
-    MatrixMap svd_v((Complex*)(raw_state + end), p, 2 * m_dim);
-    MatrixMap s_vector((Complex*)(raw_state + end + 8 * bond_dim * bond_dim), p, 1);
-    svd_v.noalias() = svd.matrixV().adjoint();
-    s_vector.noalias() = svd.singularValues();
-    block_1.fill(Complex(0, 0));
-    const auto keep_rows = (svd_v.rows() > bond_dim) ? bond_dim : svd_v.rows();
-    const auto row_seq = Eigen::seq(0, keep_rows - 1);
-    for (unsigned i = 0; i < keep_rows; ++i) {
-      svd_v.row(i) *= s_vector(i);
-    }
-    block_1.block(0, 0, keep_rows, svd_v.cols()).noalias() =
-        svd_v(row_seq, Eigen::indexing::all);
-  }
-
-  For for_;
-};
-
-}  // namespace mps
-}  // namespace qsim
-
-#endif  // MPS_SIMULATOR_H_
diff --git a/qsim/mps_statespace.h b/qsim/mps_statespace.h
deleted file mode 100644
index 9b3acf3..0000000
--- a/qsim/mps_statespace.h
+++ /dev/null
@@ -1,597 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MPS_STATESPACE_H_
-#define MPS_STATESPACE_H_
-
-// For templates will take care of parallelization.
-#define EIGEN_DONT_PARALLELIZE 1
-
-#ifdef _WIN32
-#include <malloc.h>
-#endif
-
-#include <cstdint>
-#include <cstdlib>
-#include <cstring>
-#include <memory>
-#include <random>
-
-#include "../eigen/Eigen/Dense"
-#include "../eigen/unsupported/Eigen/CXX11/Tensor"
-
-namespace qsim {
-
-namespace mps {
-
-namespace detail {
-
-inline void do_not_free(void*) {}
-
-inline void free(void* ptr) {
-#ifdef _WIN32
-  _aligned_free(ptr);
-#else
-  ::free(ptr);
-#endif
-}
-
-}  // namespace detail
-
-/**
- * Class containing context and routines for fixed bond dimension
- * truncated Matrix Product State (MPS) simulation.
- */
-template <typename For, typename FP = float>
-class MPSStateSpace {
- private:
- public:
-  using fp_type = FP;
-  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
-
-  using Complex = std::complex<fp_type>;
-  using Matrix =
-      Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-  using ConstMatrixMap = Eigen::Map<const Matrix>;
-  using MatrixMap = Eigen::Map<Matrix>;
-
-  // Store MPS tensors with the following shape:
-  // [2, bond_dim], [bond_dim, 2, bond_dim], ... , [bond_dim, 2].
-  class MPS {
-   public:
-    MPS() = delete;
-
-    MPS(Pointer&& ptr, unsigned num_qubits, unsigned bond_dim)
-        : ptr_(std::move(ptr)), num_qubits_(num_qubits), bond_dim_(bond_dim) {}
-
-    fp_type* get() { return ptr_.get(); }
-
-    const fp_type* get() const { return ptr_.get(); }
-
-    fp_type* release() {
-      num_qubits_ = 0;
-      return ptr_.release();
-    }
-
-    unsigned num_qubits() const { return num_qubits_; }
-
-    unsigned bond_dim() const { return bond_dim_; }
-
-   private:
-    Pointer ptr_;
-    unsigned num_qubits_;
-    unsigned bond_dim_;
-  };
-
-  // Note: ForArgs are currently unused.
-  template <typename... ForArgs>
-  MPSStateSpace(ForArgs&&... args) : for_(args...) {}
-
-  // Requires num_qubits >= 2 and bond_dim >= 2.
-  static MPS Create(unsigned num_qubits, unsigned bond_dim) {
-    auto end_sizes = 2 * 4 * bond_dim;
-    auto internal_sizes = 4 * bond_dim * bond_dim * (num_qubits + 1);
-    // Use three extra "internal style" blocks past the end of the
-    //   working allocation for scratch space. Needed for gate
-    //   application.
-    auto size = sizeof(fp_type) * (end_sizes + internal_sizes);
-
-#ifdef _WIN32
-    Pointer ptr{(fp_type*)_aligned_malloc(size, 64), &detail::free};
-    bool is_null = ptr.get() != nullptr;
-    return MPS{std::move(ptr), is_null ? num_qubits : 0,
-               is_null ? bond_dim : 0};
-#else
-    void* p = nullptr;
-    if (posix_memalign(&p, 64, size) == 0) {
-      return MPS{Pointer{(fp_type*)p, &detail::free}, num_qubits, bond_dim};
-    } else {
-      return MPS{Pointer{nullptr, &detail::free}, 0, 0};
-    }
-#endif
-  }
-
-  static unsigned Size(const MPS& state) {
-    auto end_sizes = 2 * 4 * state.bond_dim();
-    auto internal_sizes = 4 * state.bond_dim() * state.bond_dim();
-    return end_sizes + internal_sizes * (state.num_qubits() - 2);
-  }
-
-  static unsigned RawSize(const MPS& state) {
-    return sizeof(fp_type) * Size(state);
-  }
-
-  // Get the pointer offset to the beginning of an MPS block.
-  static unsigned GetBlockOffset(const MPS& state, unsigned i) {
-    if (i == 0) {
-      return 0;
-    }
-    return 4 * state.bond_dim() * (1 + state.bond_dim() * (i - 1));
-  }
-
-  // Copies the state contents of one MPS to another.
-  // Ignores scratch data.
-  static bool Copy(const MPS& src, MPS& dest) {
-    if ((src.num_qubits() != dest.num_qubits()) ||
-        src.bond_dim() != dest.bond_dim()) {
-      return false;
-    }
-    auto size = RawSize(src);
-    memcpy(dest.get(), src.get(), size);
-    return true;
-  }
-
-  // Set the MPS to the |0> state.
-  static void SetStateZero(MPS& state) {
-    auto size = Size(state);
-    memset(state.get(), 0, sizeof(fp_type) * size);
-    auto block_size = 4 * state.bond_dim() * state.bond_dim();
-    state.get()[0] = 1.0;
-    for (unsigned i = 4 * state.bond_dim(); i < size; i += block_size) {
-      state.get()[i] = 1.0;
-    }
-  }
-
-  // Computes Re{<state1 | state2 >} for two equal sized MPS.
-  // Requires: state1.bond_dim() == state2.bond_dim() &&
-  //           state1.num_qubits() == state2.num_qubits()
-  static fp_type RealInnerProduct(MPS& state1, MPS& state2) {
-    return InnerProduct(state1, state2).real();
-  }
-
-  // Computes <state1 | state2 > for two equal sized MPS.
-  // Requires: state1.bond_dim() == state2.bond_dim() &&
-  //           state1.num_qubits() == state2.num_qubits()
-  static std::complex<fp_type> InnerProduct(MPS& state1, MPS& state2) {
-    const auto num_qubits = state1.num_qubits();
-    const auto bond_dim = state1.bond_dim();
-    const auto end = Size(state1);
-    auto offset = 0;
-    fp_type* state1_raw = state1.get();
-    fp_type* state2_raw = state2.get();
-
-    // Contract leftmost blocks together, store result in state1 scratch.
-    ConstMatrixMap top((Complex*)state2_raw, 2, bond_dim);
-    ConstMatrixMap bot((Complex*)state1_raw, 2, bond_dim);
-    MatrixMap partial_contract((Complex*)(state1_raw + end), bond_dim,
-                               bond_dim);
-    MatrixMap partial_contract2(
-        (Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), bond_dim,
-        2 * bond_dim);
-    partial_contract.noalias() = top.adjoint() * bot;
-
-    // Contract all internal blocks together.
-    for (unsigned i = 1; i < num_qubits - 1; ++i) {
-      offset = GetBlockOffset(state1, i);
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim),
-                    bond_dim, 2 * bond_dim);
-
-      // Merge bot into left boundary merged tensor.
-      new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim,
-                                2 * bond_dim);
-      partial_contract2.noalias() = partial_contract * bot;
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim),
-                    2 * bond_dim, bond_dim);
-
-      // Merge top into partial_contract2.
-      new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim,
-                                bond_dim);
-      partial_contract.noalias() = top.adjoint() * partial_contract2;
-    }
-
-    // Contract rightmost bottom block.
-    offset = GetBlockOffset(state1, num_qubits - 1);
-    new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, 2);
-    new (&partial_contract2) MatrixMap(
-        (Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), bond_dim, 2);
-    partial_contract2.noalias() = partial_contract * bot;
-
-    // Contract rightmost top block.
-    new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, 1);
-    new (&partial_contract) MatrixMap((Complex*)(state1_raw + end), 1, 1);
-    new (&partial_contract2)
-        MatrixMap((Complex*)(state1_raw + end + 4 * bond_dim * bond_dim),
-                  2 * bond_dim, 1);
-    partial_contract.noalias() = top.adjoint() * partial_contract2;
-
-    return partial_contract(0, 0);
-  }
-
-  // Compute the 2x2 1-RDM of state on index. Result written to rdm.
-  // Requires: scratch and rdm to be allocated.
-  static void ReduceDensityMatrix(MPS& state, MPS& scratch, int index,
-                                  fp_type* rdm) {
-    const auto num_qubits = state.num_qubits();
-    const auto bond_dim = state.bond_dim();
-    const auto end = Size(state);
-    const bool last_index = (index == num_qubits - 1);
-    const auto right_dim = (last_index ? 1 : bond_dim);
-    auto offset = 0;
-    fp_type* state_raw = state.get();
-    fp_type* scratch_raw = scratch.get();
-    fp_type* state_raw_workspace = state_raw + end + 2 * bond_dim * bond_dim;
-    fp_type* scratch_raw_workspace =
-        scratch_raw + end + 2 * bond_dim * bond_dim;
-
-    Copy(state, scratch);
-
-    // Contract leftmost blocks together, store result in state scratch.
-    ConstMatrixMap top((Complex*)scratch_raw, 2, bond_dim);
-    ConstMatrixMap bot((Complex*)state_raw, 2, bond_dim);
-    MatrixMap partial_contract((Complex*)(state_raw + end), bond_dim, bond_dim);
-    MatrixMap partial_contract2((Complex*)(state_raw_workspace), bond_dim,
-                                2 * bond_dim);
-
-    partial_contract.setZero();
-    partial_contract(0, 0) = 1;
-    if (index > 0) {
-      partial_contract.noalias() = top.adjoint() * bot;
-    }
-
-    // Contract all internal blocks together.
-    for (unsigned i = 1; i < index; ++i) {
-      offset = GetBlockOffset(state, i);
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state_raw_workspace), bond_dim, 2 * bond_dim);
-
-      // Merge bot into left boundary merged tensor.
-      new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
-                                2 * bond_dim);
-      partial_contract2.noalias() = partial_contract * bot;
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state_raw_workspace), 2 * bond_dim, bond_dim);
-
-      // Merge top into partial_contract2.
-      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
-                                bond_dim);
-      partial_contract.noalias() = top.adjoint() * partial_contract2;
-    }
-
-    // The [bond_dim, bond_dim] block in state_raw now contains the contraction
-    // up to, but not including index.
-    // Contract rightmost blocks.
-    offset = GetBlockOffset(state, num_qubits - 1);
-    new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, 2);
-    new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2);
-    new (&partial_contract)
-        MatrixMap((Complex*)(scratch_raw + end), bond_dim, bond_dim);
-    new (&partial_contract2)
-        MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim);
-
-    partial_contract.setZero();
-    partial_contract(0, 0) = 1;
-    if (index < num_qubits - 1) {
-      partial_contract.noalias() = top * bot.adjoint();
-    }
-
-    for (unsigned i = num_qubits - 2; i > index; --i) {
-      offset = GetBlockOffset(state, i);
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(scratch_raw_workspace), 2 * bond_dim, bond_dim);
-
-      // Merge bot into left boundary merged tensor.
-      new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), 2 * bond_dim,
-                                bond_dim);
-      partial_contract2.noalias() = bot * partial_contract.adjoint();
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim);
-
-      // Merge top into partial_contract2.
-      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim,
-                                2 * bond_dim);
-      // [bd, bd] = [bd, 2bd] @ [bd, 2bd]
-      partial_contract.noalias() = top * partial_contract2.adjoint();
-    }
-
-    // The [bond_dim, bond_dim] block in scratch_raw now contains the
-    // contraction down from the end, but not including the index. Begin final
-    // contraction steps.
-
-    // Get leftmost [bd, bd] contraction and contract with top.
-
-    offset = GetBlockOffset(state, index);
-    new (&partial_contract)
-        MatrixMap((Complex*)(state_raw + end), bond_dim, bond_dim);
-    new (&top)
-        ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2 * right_dim);
-    new (&partial_contract2)
-        MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * right_dim);
-    partial_contract2.noalias() = partial_contract * top.conjugate();
-    // copy the bottom contraction scratch_raw to state_raw to save space.
-    memcpy(state_raw + end, scratch_raw + end,
-           bond_dim * bond_dim * 2 * sizeof(fp_type));
-
-    // Contract top again for correct shape.
-    fp_type* contract3_target = (last_index ? rdm : scratch_raw);
-    MatrixMap partial_contract3((Complex*)contract3_target, 2 * right_dim,
-                                2 * right_dim);
-    partial_contract3.noalias() = top.transpose() * partial_contract2;
-
-    // If we are contracting the last index, all the needed transforms are done.
-    if (last_index) {
-      return;
-    }
-
-    // Conduct final tensor contraction operations. Cannot be easily compiled to
-    // matmul.
-    const Eigen::TensorMap<const Eigen::Tensor<Complex, 4, Eigen::RowMajor>>
-        t_4d((Complex*)scratch_raw, 2, bond_dim, 2, bond_dim);
-    const Eigen::TensorMap<const Eigen::Tensor<Complex, 2, Eigen::RowMajor>>
-        t_2d((Complex*)(state_raw + end), bond_dim, bond_dim);
-
-    const Eigen::array<Eigen::IndexPair<int>, 2> product_dims = {
-        Eigen::IndexPair<int>(1, 0),
-        Eigen::IndexPair<int>(3, 1),
-    };
-    Eigen::TensorMap<Eigen::Tensor<Complex, 2, Eigen::RowMajor>> out(
-        (Complex*)rdm, 2, 2);
-    out = t_4d.contract(t_2d, product_dims);
-  }
-
-  // Draw a single bitstring sample from state using scratch and scratch2
-  // as working space.
-  static void SampleOnce(MPS& state, MPS& scratch, MPS& scratch2,
-                         std::mt19937* random_gen, std::vector<bool>* sample) {
-    // TODO: carefully profile with perf and optimize temp storage
-    //  locations for cache friendliness.
-    const auto bond_dim = state.bond_dim();
-    const auto num_qubits = state.num_qubits();
-    const auto end = Size(state);
-    const auto left_frontier_offset = GetBlockOffset(state, num_qubits + 1);
-    std::default_random_engine generator;
-    fp_type* state_raw = state.get();
-    fp_type* scratch_raw = scratch.get();
-    fp_type* scratch2_raw = scratch2.get();
-    fp_type rdm[8];
-
-    sample->reserve(num_qubits);
-    Copy(state, scratch);
-    Copy(state, scratch2);
-
-    // Store prefix contractions in scratch2.
-    auto offset = GetBlockOffset(state, num_qubits - 1);
-    ConstMatrixMap top((Complex*)(state_raw + offset), bond_dim, 2);
-    ConstMatrixMap bot((Complex*)(scratch_raw + offset), bond_dim, 2);
-    MatrixMap partial_contract((Complex*)(scratch2_raw + offset), bond_dim,
-                               bond_dim);
-    MatrixMap partial_contract2((Complex*)(scratch_raw + end), bond_dim,
-                                2 * bond_dim);
-    partial_contract.noalias() = top * bot.adjoint();
-
-    for (unsigned i = num_qubits - 2; i > 0; --i) {
-      offset = GetBlockOffset(state, i);
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(scratch_raw + end), 2 * bond_dim, bond_dim);
-
-      // Merge bot into left boundary merged tensor.
-      new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
-                                bond_dim);
-      partial_contract2.noalias() = bot * partial_contract.adjoint();
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(scratch_raw + end), bond_dim, 2 * bond_dim);
-
-      // Merge top into partial_contract2.
-      new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
-                                2 * bond_dim);
-
-      // merge into partial_contract -> scracth2_raw.
-      new (&partial_contract)
-          MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
-      partial_contract.noalias() = top * partial_contract2.adjoint();
-    }
-
-    // Compute RDM-0 and draw first sample.
-    offset = GetBlockOffset(state, 1);
-    new (&top) ConstMatrixMap((Complex*)state_raw, 2, bond_dim);
-    new (&bot) ConstMatrixMap((Complex*)scratch_raw, 2, bond_dim);
-    new (&partial_contract)
-        MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
-    new (&partial_contract2)
-        MatrixMap((Complex*)(scratch_raw + end), 2, bond_dim);
-
-    partial_contract2.noalias() = bot * partial_contract.adjoint();
-
-    new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2);
-    partial_contract.noalias() = top * partial_contract2.adjoint();
-    auto p0 = rdm[0] / (rdm[0] + rdm[6]);
-    std::bernoulli_distribution distribution(1 - p0);
-    auto bit_val = distribution(*random_gen);
-    sample->push_back(bit_val);
-
-    // collapse state.
-    new (&partial_contract) MatrixMap((Complex*)scratch_raw, 2, bond_dim);
-    partial_contract.row(!bit_val).setZero();
-
-    // Prepare left contraction frontier.
-    new (&partial_contract2) MatrixMap(
-        (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
-    partial_contract2.noalias() =
-        partial_contract.transpose() * partial_contract.conjugate();
-
-    // Compute RDM-i and draw internal tensor samples.
-    for (unsigned i = 1; i < num_qubits - 1; i++) {
-      // Get leftmost [bd, bd] contraction and contract with top.
-      offset = GetBlockOffset(state, i);
-      new (&partial_contract) MatrixMap(
-          (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
-      new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
-                                2 * bond_dim);
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim);
-      partial_contract2.noalias() = partial_contract * top.conjugate();
-
-      // Contract top again for correct shape.
-      MatrixMap partial_contract3((Complex*)(scratch_raw + end), 2 * bond_dim,
-                                  2 * bond_dim);
-      partial_contract3.noalias() = top.transpose() * partial_contract2;
-
-      // Conduct final tensor contraction operations. Cannot be easily compiled
-      // to matmul. Perf reports shows only ~6% of runtime spent here on large
-      // systems.
-      offset = GetBlockOffset(state, i + 1);
-      const Eigen::TensorMap<const Eigen::Tensor<Complex, 4, Eigen::RowMajor>>
-          t_4d((Complex*)(scratch_raw + end), 2, bond_dim, 2, bond_dim);
-      const Eigen::TensorMap<const Eigen::Tensor<Complex, 2, Eigen::RowMajor>>
-          t_2d((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
-
-      const Eigen::array<Eigen::IndexPair<int>, 2> product_dims = {
-          Eigen::IndexPair<int>(1, 0),
-          Eigen::IndexPair<int>(3, 1),
-      };
-      Eigen::TensorMap<Eigen::Tensor<Complex, 2, Eigen::RowMajor>> out(
-          (Complex*)rdm, 2, 2);
-      out = t_4d.contract(t_2d, product_dims);
-
-      // Sample bit and collapse state.
-      p0 = rdm[0] / (rdm[0] + rdm[6]);
-      distribution = std::bernoulli_distribution(1 - p0);
-      bit_val = distribution(*random_gen);
-
-      sample->push_back(bit_val);
-      offset = GetBlockOffset(state, i);
-      new (&partial_contract)
-          MatrixMap((Complex*)(scratch_raw + offset), bond_dim * 2, bond_dim);
-      for (unsigned j = !bit_val; j < 2 * bond_dim; j += 2) {
-        partial_contract.row(j).setZero();
-      }
-
-      // Update left frontier.
-      new (&partial_contract) MatrixMap(
-          (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim);
-
-      // Merge bot into left boundary merged tensor.
-      new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim,
-                                2 * bond_dim);
-      partial_contract2.noalias() = partial_contract * bot.conjugate();
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state_raw + end), 2 * bond_dim, bond_dim);
-
-      // Merge top into partial_contract2.
-      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
-                                bond_dim);
-      partial_contract.noalias() = top.transpose() * partial_contract2;
-    }
-
-    // Compute RDM-(n-1) and sample.
-    offset = GetBlockOffset(state, num_qubits - 1);
-    new (&partial_contract2)
-        MatrixMap((Complex*)(state_raw + end), bond_dim, 2);
-
-    new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2);
-    partial_contract2.noalias() = partial_contract * top.conjugate();
-    new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2);
-    partial_contract.noalias() = top.transpose() * partial_contract2;
-
-    p0 = rdm[0] / (rdm[0] + rdm[6]);
-    distribution = std::bernoulli_distribution(1 - p0);
-    bit_val = distribution(*random_gen);
-    sample->push_back(bit_val);
-  }
-
-  // Draw num_samples bitstring samples from state and store the result
-  // bit vectors in results. Uses scratch and scratch2 as workspace.
-  static void Sample(MPS& state, MPS& scratch, MPS& scratch2,
-                     unsigned num_samples, unsigned seed,
-                     std::vector<std::vector<bool>>* results) {
-    std::mt19937 rand_source(seed);
-    results->reserve(num_samples);
-    for (unsigned i = 0; i < num_samples; i++) {
-      SampleOnce(state, scratch, scratch2, &rand_source, &(*results)[i]);
-    }
-  }
-
-  // Testing only. Convert the MPS to a wavefunction under "normal" ordering.
-  // Requires: wf be allocated beforehand with bond_dim * 2 ^ num_qubits -1
-  // memory.
-  static void ToWaveFunction(MPS& state, fp_type* wf) {
-    const auto bond_dim = state.bond_dim();
-    const auto num_qubits = state.num_qubits();
-    fp_type* raw_state = state.get();
-
-    ConstMatrixMap accum = ConstMatrixMap((Complex*)(raw_state), 2, bond_dim);
-    ConstMatrixMap next_block = ConstMatrixMap(nullptr, 0, 0);
-    MatrixMap result2 = MatrixMap(nullptr, 0, 0);
-    auto offset = 0;
-    auto result2_size = 2;
-
-    for (unsigned i = 1; i < num_qubits - 1; i++) {
-      offset = GetBlockOffset(state, i);
-      // use of new does not trigger any expensive operations.
-      new (&next_block) ConstMatrixMap((Complex*)(raw_state + offset), bond_dim,
-                                       2 * bond_dim);
-      new (&result2) MatrixMap((Complex*)(wf), result2_size, 2 * bond_dim);
-
-      // temp variable used since result2 and accum point to same memory.
-      result2 = accum * next_block;
-      result2_size *= 2;
-      new (&accum) ConstMatrixMap((Complex*)(wf), result2_size, bond_dim);
-    }
-    offset = GetBlockOffset(state, num_qubits - 1);
-    new (&next_block)
-        ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, 2);
-    new (&result2) MatrixMap((Complex*)(wf), result2_size, 2);
-    result2 = accum * next_block;
-  }
-
- protected:
-  For for_;
-};
-
-}  // namespace mps
-}  // namespace qsim
-
-#endif  // MPS_STATESPACE_H_
diff --git a/qsim/parfor.h b/qsim/parfor.h
deleted file mode 100644
index 8a3a4d6..0000000
--- a/qsim/parfor.h
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef PARFOR_H_
-#define PARFOR_H_
-
-#include <omp.h>
-
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-namespace qsim {
-
-/**
- * Helper struct for executing for-loops in parallel across multiple threads.
- */
-template <uint64_t MIN_SIZE>
-struct ParallelForT {
-  explicit ParallelForT(unsigned num_threads) : num_threads(num_threads) {}
-
-  // GetIndex0 and GetIndex1 are useful when we need to know how work was
-  // divided between threads, for instance, for reusing partial sums obtained
-  // by RunReduceP.
-  uint64_t GetIndex0(uint64_t size, unsigned thread_id) const {
-    return size >= MIN_SIZE ? size * thread_id / num_threads : 0;
-  }
-
-  uint64_t GetIndex1(uint64_t size, unsigned thread_id) const {
-    return size >= MIN_SIZE ? size * (thread_id + 1) / num_threads : size;
-  }
-
-  template <typename Function, typename... Args>
-  void Run(uint64_t size, Function&& func, Args&&... args) const {
-    if (num_threads > 1 && size >= MIN_SIZE) {
-      #pragma omp parallel num_threads(num_threads)
-      {
-        unsigned n = omp_get_num_threads();
-        unsigned m = omp_get_thread_num();
-
-        uint64_t i0 = GetIndex0(size, m);
-        uint64_t i1 = GetIndex1(size, m);
-
-        for (uint64_t i = i0; i < i1; ++i) {
-          func(n, m, i, args...);
-        }
-      }
-    } else {
-      for (uint64_t i = 0; i < size; ++i) {
-        func(1, 0, i, args...);
-      }
-    }
-  }
-
-  template <typename Function, typename Op, typename... Args>
-  std::vector<typename Op::result_type> RunReduceP(
-      uint64_t size, Function&& func, Op&& op, Args&&... args) const {
-    std::vector<typename Op::result_type> partial_results;
-
-    if (num_threads > 1 && size >= MIN_SIZE) {
-      partial_results.resize(num_threads, 0);
-
-      #pragma omp parallel num_threads(num_threads)
-      {
-        unsigned n = omp_get_num_threads();
-        unsigned m = omp_get_thread_num();
-
-        uint64_t i0 = GetIndex0(size, m);
-        uint64_t i1 = GetIndex1(size, m);
-
-        typename Op::result_type partial_result = 0;
-
-        for (uint64_t i = i0; i < i1; ++i) {
-          partial_result = op(partial_result, func(n, m, i, args...));
-        }
-
-        partial_results[m] = partial_result;
-      }
-    } else if (num_threads > 0) {
-      typename Op::result_type result = 0;
-      for (uint64_t i = 0; i < size; ++i) {
-        result = op(result, func(1, 0, i, args...));
-      }
-
-      partial_results.resize(1, result);
-    }
-
-    return partial_results;
-  }
-
-  template <typename Function, typename Op, typename... Args>
-  typename Op::result_type RunReduce(uint64_t size, Function&& func,
-                                     Op&& op, Args&&... args) const {
-    auto partial_results = RunReduceP(size, func, std::move(op), args...);
-
-    typename Op::result_type result = 0;
-
-    for (auto partial_result : partial_results) {
-      result = op(result, partial_result);
-    }
-
-    return result;
-  }
-
-  unsigned num_threads;
-};
-
-using ParallelFor = ParallelForT<1024>;
-
-}  // namespace qsim
-
-#endif  // PARFOR_H_
diff --git a/qsim/qtrajectory.h b/qsim/qtrajectory.h
deleted file mode 100644
index 1da6692..0000000
--- a/qsim/qtrajectory.h
+++ /dev/null
@@ -1,435 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef QTRAJECTORY_H_
-#define QTRAJECTORY_H_
-
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <random>
-#include <vector>
-
-#include "circuit_noisy.h"
-#include "gate.h"
-#include "gate_appl.h"
-
-namespace qsim {
-
-/**
- * Quantum trajectory simulator.
- */
-template <typename IO, typename Gate,
-          template <typename, typename> class FuserT, typename Simulator,
-          typename RGen = std::mt19937>
-class QuantumTrajectorySimulator {
- public:
-  using Fuser = FuserT<IO, const Gate*>;
-  using StateSpace = typename Simulator::StateSpace;
-  using State = typename Simulator::State;
-  using MeasurementResult = typename StateSpace::MeasurementResult;
-
-  /**
-   * User-specified parameters for the simulator.
-   */
-  struct Parameter : public Fuser::Parameter {
-    /**
-     * If true, collect statistics of sampled Kraus operator indices.
-     */
-    bool collect_kop_stat = false;
-    /**
-     * If true, collect statistics of measured bitstrings.
-     */
-    bool collect_mea_stat = false;
-    /**
-     * If true, normalize the state vector before performing measurements.
-     */
-    bool normalize_before_mea_gates = true;
-    /**
-     * If false, do not apply deferred operators after the main loop for
-     * the "primary" noise trajectory, that is the trajectory in which
-     * the primary (the first operators in their respective channels) Kraus
-     * operators are sampled for each channel and there are no measurements
-     * in the computational basis. This can be used to speed up simulations
-     * of circuits with weak noise and without measurements by reusing
-     * the primary trajectory results. There is an additional condition for
-     * RunBatch. In this case, the deferred operators after the main loop are
-     * still applied for the first occurence of the primary trajectory.
-     * The primary Kraus operators should have the highest sampling
-     * probabilities to achieve the highest speedup.
-     *
-     * It is the client's responsibility to collect the primary trajectory
-     * results and to reuse them.
-     */
-    bool apply_last_deferred_ops = true;
-  };
-
-  /**
-   * Struct with statistics to populate by RunBatch and RunOnce methods.
-   */
-  struct Stat {
-    /**
-     * Indices of sampled Kraus operator indices and/or measured bitstrings.
-     */
-    std::vector<uint64_t> samples;
-    /**
-     * True if the "primary" noise trajectory is sampled, false otherwise.
-     */
-    bool primary;
-  };
-
-  /**
-   * Runs the given noisy circuit performing repetitions. Each repetition is
-   * seeded by repetition ID.
-   * @param param Options for the quantum trajectory simulator.
-   * @param circuit The noisy circuit to be simulated.
-   * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions.
-   * @param state_space StateSpace object required to manipulate state vector.
-   * @param simulator Simulator object. Provides specific implementations for
-   *   applying gates.
-   * @param measure Function that performs measurements (in the sense of
-   *   computing expectation values, etc). This function should have three
-   *   required parameters [repetition ID (uint64_t), final state vector
-   *   (const State&), statistics of sampled Kraus operator indices and/or
-   *   measured bitstrings (const Stat&)] and any number of optional parameters.
-   * @param args Optional arguments for the 'measure' function.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename MeasurementFunc, typename... Args>
-  static bool RunBatch(const Parameter& param,
-                       const NoisyCircuit<Gate>& circuit,
-                       uint64_t r0, uint64_t r1, const StateSpace& state_space,
-                       const Simulator& simulator, MeasurementFunc&& measure,
-                       Args&&... args) {
-    return RunBatch(param, circuit.num_qubits, circuit.channels.begin(),
-                    circuit.channels.end(), r0, r1, state_space, simulator,
-                    measure, args...);
-  }
-
-  /**
-   * Runs the given noisy circuit performing repetitions. Each repetition is
-   * seeded by repetition ID.
-   * @param param Options for the quantum trajectory simulator.
-   * @param num_qubits The number of qubits acted on by the circuit.
-   * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit.
-   * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions.
-   * @param state_space StateSpace object required to manipulate state vector.
-   * @param simulator Simulator object. Provides specific implementations for
-   *   applying gates.
-   * @param measure Function that performs measurements (in the sense of
-   *   computing expectation values, etc). This function should have three
-   *   required parameters [repetition ID (uint64_t), final state vector
-   *   (const State&), statistics of sampled Kraus operator indices and/or
-   *   measured bitstrings (const Stat&)] and any number of optional parameters.
-   * @param args Optional arguments for the 'measure' function.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename MeasurementFunc, typename... Args>
-  static bool RunBatch(const Parameter& param, unsigned num_qubits,
-                       ncircuit_iterator<Gate> cbeg,
-                       ncircuit_iterator<Gate> cend,
-                       uint64_t r0, uint64_t r1, const StateSpace& state_space,
-                       const Simulator& simulator, MeasurementFunc&& measure,
-                       Args&&... args) {
-    std::vector<const Gate*> gates;
-    gates.reserve(4 * std::size_t(cend - cbeg));
-
-    State state = state_space.Null();
-
-    Stat stat;
-    bool had_primary_realization = false;
-
-    for (uint64_t r = r0; r < r1; ++r) {
-      if (!state_space.IsNull(state)) {
-        state_space.SetStateZero(state);
-      }
-
-      bool apply_last_deferred_ops =
-          param.apply_last_deferred_ops || !had_primary_realization;
-
-      if (!RunIteration(param, apply_last_deferred_ops, num_qubits, cbeg, cend,
-                        r, state_space, simulator, gates, state, stat)) {
-        return false;
-      }
-
-      if (stat.primary && !had_primary_realization) {
-        had_primary_realization = true;
-      }
-
-      measure(r, state, stat, args...);
-    }
-
-    return true;
-  }
-
-  /**
-   * Runs the given noisy circuit one time.
-   * @param param Options for the quantum trajectory simulator.
-   * @param circuit The noisy circuit to be simulated.
-   * @param r The repetition ID. The random number generator is seeded by 'r'.
-   * @param state_space StateSpace object required to manipulate state vector.
-   * @param simulator Simulator object. Provides specific implementations for
-   *   applying gates.
-   * @param state The state of the system, to be updated by this method.
-   * @param stat Statistics of sampled Kraus operator indices and/or measured
-   *   bitstrings, to be populated by this method.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  static bool RunOnce(const Parameter& param,
-                      const NoisyCircuit<Gate>& circuit, uint64_t r,
-                      const StateSpace& state_space, const Simulator& simulator,
-                      State& state, Stat& stat) {
-    return RunOnce(param, circuit.num_qubits, circuit.channels.begin(),
-                   circuit.channels.end(), r, state_space, simulator,
-                   state, stat);
-  }
-
-  /**
-   * Runs the given noisy circuit one time.
-   * @param param Options for the quantum trajectory simulator.
-   * @param num_qubits The number of qubits acted on by the circuit.
-   * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit.
-   * @param circuit The noisy circuit to be simulated.
-   * @param r The repetition ID. The random number generator is seeded by 'r'.
-   * @param state_space StateSpace object required to manipulate state vector.
-   * @param simulator Simulator object. Provides specific implementations for
-   *   applying gates.
-   * @param state The state of the system, to be updated by this method.
-   * @param stat Statistics of sampled Kraus operator indices and/or measured
-   *   bitstrings, to be populated by this method.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  static bool RunOnce(const Parameter& param, unsigned num_qubits,
-                      ncircuit_iterator<Gate> cbeg,
-                      ncircuit_iterator<Gate> cend,
-                      uint64_t r, const StateSpace& state_space,
-                      const Simulator& simulator, State& state, Stat& stat) {
-    std::vector<const Gate*> gates;
-    gates.reserve(4 * std::size_t(cend - cbeg));
-
-    if (!RunIteration(param, param.apply_last_deferred_ops, num_qubits, cbeg,
-                      cend, r, state_space, simulator, gates, state, stat)) {
-      return false;
-    }
-
-    return true;
-  }
-
- private:
-  static bool RunIteration(const Parameter& param,
-                           bool apply_last_deferred_ops, unsigned num_qubits,
-                           ncircuit_iterator<Gate> cbeg,
-                           ncircuit_iterator<Gate> cend,
-                           uint64_t rep, const StateSpace& state_space,
-                           const Simulator& simulator,
-                           std::vector<const Gate*>& gates,
-                           State& state, Stat& stat) {
-    if (param.collect_kop_stat || param.collect_mea_stat) {
-      stat.samples.reserve(std::size_t(cend - cbeg));
-      stat.samples.resize(0);
-    }
-
-    if (state_space.IsNull(state)) {
-      state = CreateState(num_qubits, state_space);
-      if (state_space.IsNull(state)) {
-        return false;
-      }
-
-      state_space.SetStateZero(state);
-    }
-
-    gates.resize(0);
-
-    RGen rgen(rep);
-    std::uniform_real_distribution<double> distr(0.0, 1.0);
-
-    bool unitary = true;
-    stat.primary = true;
-
-    for (auto it = cbeg; it != cend; ++it) {
-      const auto& channel = *it;
-
-      if (channel.size() == 0) continue;
-
-      if (channel[0].kind == gate::kMeasurement) {
-        // Measurement channel.
-
-        if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
-          return false;
-        }
-
-        bool normalize = !unitary && param.normalize_before_mea_gates;
-        NormalizeState(normalize, state_space, unitary, state);
-
-        auto mresult = ApplyMeasurementGate(state_space, channel[0].ops[0],
-                                            rgen, state);
-
-        if (!mresult.valid) {
-          return false;
-        }
-
-        CollectStat(param.collect_mea_stat, mresult.bits, stat);
-
-        stat.primary = false;
-
-        continue;
-      }
-
-      // "Normal" channel.
-
-      double r = distr(rgen);
-      double cp = 0;
-
-      // Perform sampling of Kraus operators using probability bounds.
-      for (std::size_t i = 0; i < channel.size(); ++i) {
-        const auto& kop = channel[i];
-
-        cp += kop.prob;
-
-        if (r < cp) {
-          DeferOps(kop.ops, gates);
-          CollectStat(param.collect_kop_stat, i, stat);
-
-          unitary = unitary && kop.unitary;
-
-          break;
-        }
-      }
-
-      if (r < cp) continue;
-
-      if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
-        return false;
-      }
-
-      NormalizeState(!unitary, state_space, unitary, state);
-
-      double max_prob = 0;
-      std::size_t max_prob_index = 0;
-
-      // Perform sampling of Kraus operators using norms of updated states.
-      for (std::size_t i = 0; i < channel.size(); ++i) {
-        const auto& kop = channel[i];
-
-        if (kop.unitary) continue;
-
-        double prob = std::real(
-            simulator.ExpectationValue(kop.qubits, kop.kd_k.data(), state));
-
-        if (prob > max_prob) {
-          max_prob = prob;
-          max_prob_index = i;
-        }
-
-        cp += prob - kop.prob;
-
-        if (r < cp || i == channel.size() - 1) {
-          // Sample ith Kraus operator if r < cp
-          // Sample the highest probability Kraus operator if r is greater
-          // than the sum of all probablities due to round-off errors.
-          uint64_t k = r < cp ? i : max_prob_index;
-
-          DeferOps(channel[k].ops, gates);
-          CollectStat(param.collect_kop_stat, k, stat);
-
-          unitary = false;
-
-          break;
-        }
-      }
-    }
-
-    if (apply_last_deferred_ops || !stat.primary) {
-      if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
-        return false;
-      }
-
-      NormalizeState(!unitary, state_space, unitary, state);
-    }
-
-    return true;
-  }
-
-  static State CreateState(unsigned num_qubits, const StateSpace& state_space) {
-    auto state = state_space.Create(num_qubits);
-    if (state_space.IsNull(state)) {
-      IO::errorf("not enough memory: is the number of qubits too large?\n");
-      return state_space.Null();
-    }
-
-    return state;
-  }
-
-  static bool ApplyDeferredOps(
-      const Parameter& param, unsigned num_qubits, const Simulator& simulator,
-      std::vector<const Gate*>& gates, State& state) {
-    if (gates.size() > 0) {
-      auto fgates = Fuser::FuseGates(param, num_qubits, gates);
-
-      gates.resize(0);
-
-      if (fgates.size() == 0) {
-        return false;
-      }
-
-      for (const auto& fgate : fgates) {
-        ApplyFusedGate(simulator, fgate, state);
-      }
-    }
-
-    return true;
-  }
-
-  static MeasurementResult ApplyMeasurementGate(
-      const StateSpace& state_space, const Gate& gate,
-      RGen& rgen, State& state) {
-    auto result = state_space.Measure(gate.qubits, rgen, state);
-
-    if (!result.valid) {
-      IO::errorf("measurement failed.\n");
-    }
-
-    return result;
-  }
-
-  static void DeferOps(
-      const std::vector<Gate>& ops, std::vector<const Gate*>& gates) {
-    for (const auto& op : ops) {
-      gates.push_back(&op);
-    }
-  }
-
-  static void CollectStat(bool collect_stat, uint64_t i, Stat& stat) {
-    if (collect_stat) {
-      stat.samples.push_back(i);
-    }
-
-    if (i != 0) {
-      stat.primary = false;
-    }
-  }
-
-  static void NormalizeState(bool normalize, const StateSpace& state_space,
-                             bool& flag, State& state) {
-    if (normalize) {
-      double a = 1.0 / std::sqrt(state_space.Norm(state));
-      state_space.Multiply(a, state);
-      flag = true;
-    }
-  }
-};
-
-}  // namespace qsim
-
-#endif  // QTRAJECTORY_H_
diff --git a/qsim/run_qsim.h b/qsim/run_qsim.h
deleted file mode 100644
index 3752915..0000000
--- a/qsim/run_qsim.h
+++ /dev/null
@@ -1,262 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef RUN_QSIM_H_
-#define RUN_QSIM_H_
-
-#include <random>
-#include <string>
-#include <vector>
-
-#include "gate.h"
-#include "gate_appl.h"
-#include "util.h"
-
-namespace qsim {
-
-/**
- * Helper struct for running qsim.
- */
-template <typename IO, typename Fuser, typename Factory,
-          typename RGen = std::mt19937>
-struct QSimRunner final {
- public:
-  using Simulator = typename Factory::Simulator;
-  using StateSpace = typename Simulator::StateSpace;
-  using State = typename StateSpace::State;
-  using MeasurementResult = typename StateSpace::MeasurementResult;
-
-  /**
-   * User-specified parameters for gate fusion and simulation.
-   */
-  struct Parameter : public Fuser::Parameter {
-    /**
-     * Random number generator seed to apply measurement gates.
-     */
-    uint64_t seed;
-  };
-
-  /**
-   * Runs the given circuit, only measuring at the end.
-   * @param param Options for gate fusion, parallelism and logging.
-   * @param factory Object to create simulators and state spaces.
-   * @param circuit The circuit to be simulated.
-   * @param measure Function that performs measurements (in the sense of
-   *   computing expectation values, etc).
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Circuit, typename MeasurementFunc>
-  static bool Run(const Parameter& param, const Factory& factory,
-                  const Circuit& circuit, MeasurementFunc measure) {
-    return Run(param, factory, {circuit.gates.back().time}, circuit, measure);
-  }
-
-  /**
-   * Runs the given circuit, measuring at user-specified times.
-   * @param param Options for gate fusion, parallelism and logging.
-   * @param factory Object to create simulators and state spaces.
-   * @param times_to_measure_at Time steps at which to perform measurements.
-   * @param circuit The circuit to be simulated.
-   * @param measure Function that performs measurements (in the sense of
-   *   computing expectation values, etc).
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Circuit, typename MeasurementFunc>
-  static bool Run(const Parameter& param, const Factory& factory,
-                  const std::vector<unsigned>& times_to_measure_at,
-                  const Circuit& circuit, MeasurementFunc measure) {
-    double t0 = 0.0;
-    double t1 = 0.0;
-
-    if (param.verbosity > 1) {
-      t0 = GetTime();
-    }
-
-    RGen rgen(param.seed);
-
-    StateSpace state_space = factory.CreateStateSpace();
-
-    auto state = state_space.Create(circuit.num_qubits);
-    if (state_space.IsNull(state)) {
-      IO::errorf("not enough memory: is the number of qubits too large?\n");
-      return false;
-    }
-
-    state_space.SetStateZero(state);
-    Simulator simulator = factory.CreateSimulator();
-
-    if (param.verbosity > 1) {
-      t1 = GetTime();
-      IO::messagef("init time is %g seconds.\n", t1 - t0);
-      t0 = GetTime();
-    }
-
-    auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits,
-                                        circuit.gates, times_to_measure_at);
-
-    if (fused_gates.size() == 0 && circuit.gates.size() > 0) {
-      return false;
-    }
-
-    if (param.verbosity > 1) {
-      t1 = GetTime();
-      IO::messagef("fuse time is %g seconds.\n", t1 - t0);
-    }
-
-    if (param.verbosity > 0) {
-      t0 = GetTime();
-    }
-
-    unsigned cur_time_index = 0;
-
-    // Apply fused gates.
-    for (std::size_t i = 0; i < fused_gates.size(); ++i) {
-      if (param.verbosity > 3) {
-        t1 = GetTime();
-      }
-
-      if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen,
-                          state)) {
-        IO::errorf("measurement failed.\n");
-        return false;
-      }
-
-      if (param.verbosity > 3) {
-        state_space.DeviceSync();
-        double t2 = GetTime();
-        IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1);
-      }
-
-      unsigned t = times_to_measure_at[cur_time_index];
-
-      if (i == fused_gates.size() - 1 || t < fused_gates[i + 1].time) {
-        // Call back to perform measurements.
-        measure(cur_time_index, state_space, state);
-        ++cur_time_index;
-      }
-    }
-
-    if (param.verbosity > 0) {
-      state_space.DeviceSync();
-      double t2 = GetTime();
-      IO::messagef("time is %g seconds.\n", t2 - t0);
-    }
-
-    return true;
-  }
-
-  /**
-   * Runs the given circuit and make the final state available to the caller,
-   * recording the result of any intermediate measurements in the circuit.
-   * @param param Options for gate fusion, parallelism and logging.
-   * @param factory Object to create simulators and state spaces.
-   * @param circuit The circuit to be simulated.
-   * @param state As an input parameter, this should contain the initial state
-   *   of the system. After a successful run, it will be populated with the
-   *   final state of the system.
-   * @param measure_results As an input parameter, this should be empty.
-   *   After a successful run, this will contain all measurements results from
-   *   the run, ordered by time and qubit index.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Circuit>
-  static bool Run(const Parameter& param, const Factory& factory,
-                  const Circuit& circuit, State& state,
-                  std::vector<MeasurementResult>& measure_results) {
-    double t0 = 0.0;
-    double t1 = 0.0;
-
-    if (param.verbosity > 1) {
-      t0 = GetTime();
-    }
-
-    RGen rgen(param.seed);
-
-    StateSpace state_space = factory.CreateStateSpace();
-    Simulator simulator = factory.CreateSimulator();
-
-    if (param.verbosity > 1) {
-      t1 = GetTime();
-      IO::messagef("init time is %g seconds.\n", t1 - t0);
-      t0 = GetTime();
-    }
-
-    auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits,
-                                        circuit.gates);
-
-    if (fused_gates.size() == 0 && circuit.gates.size() > 0) {
-      return false;
-    }
-
-    measure_results.reserve(fused_gates.size());
-
-    if (param.verbosity > 1) {
-      t1 = GetTime();
-      IO::messagef("fuse time is %g seconds.\n", t1 - t0);
-    }
-
-    if (param.verbosity > 0) {
-      t0 = GetTime();
-    }
-
-    // Apply fused gates.
-    for (std::size_t i = 0; i < fused_gates.size(); ++i) {
-      if (param.verbosity > 3) {
-        t1 = GetTime();
-      }
-
-      if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, state,
-                          measure_results)) {
-        IO::errorf("measurement failed.\n");
-        return false;
-      }
-
-      if (param.verbosity > 3) {
-        state_space.DeviceSync();
-        double t2 = GetTime();
-        IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1);
-      }
-    }
-
-    if (param.verbosity > 0) {
-      state_space.DeviceSync();
-      double t2 = GetTime();
-      IO::messagef("simu time is %g seconds.\n", t2 - t0);
-    }
-
-    return true;
-  }
-
-  /**
-   * Runs the given circuit and make the final state available to the caller,
-   * discarding the result of any intermediate measurements in the circuit.
-   * @param param Options for gate fusion, parallelism and logging.
-   * @param factory Object to create simulators and state spaces.
-   * @param circuit The circuit to be simulated.
-   * @param state As an input parameter, this should contain the initial state
-   *   of the system. After a successful run, it will be populated with the
-   *   final state of the system.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Circuit>
-  static bool Run(const Parameter& param, const Factory& factory,
-                  const Circuit& circuit, State& state) {
-    std::vector<MeasurementResult> discarded_results;
-    return Run(param, factory, circuit, state, discarded_results);
-  }
-};
-
-}  // namespace qsim
-
-#endif  // RUN_QSIM_H_
diff --git a/qsim/run_qsimh.h b/qsim/run_qsimh.h
deleted file mode 100644
index c1534d3..0000000
--- a/qsim/run_qsimh.h
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef RUN_QSIMH_H_
-#define RUN_QSIMH_H_
-
-#include <string>
-#include <vector>
-
-#include "hybrid.h"
-#include "util.h"
-
-namespace qsim {
-
-/**
- * Helper struct for running qsimh.
- */
-template <typename IO, typename HybridSimulator>
-struct QSimHRunner final {
-  using Gate = typename HybridSimulator::Gate;
-  using fp_type = typename HybridSimulator::fp_type;
-
-  using Parameter = typename HybridSimulator::Parameter;
-  using HybridData = typename HybridSimulator::HybridData;
-  using Fuser = typename HybridSimulator::Fuser;
-
-  /**
-   * Evaluates the amplitudes for a given circuit and set of output states.
-   * @param param Options for gate fusion, parallelism and logging. Also
-   *   specifies the size of the 'prefix' and 'root' sections of the lattice.
-   * @param factory Object to create simulators and state spaces.
-   * @param circuit The circuit to be simulated.
-   * @param parts Lattice sections to be simulated.
-   * @param bitstrings List of output states to simulate, as bitstrings.
-   * @param results Output vector of amplitudes. After a successful run, this
-   *   will be populated with amplitudes for each state in 'bitstrings'.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Factory, typename Circuit>
-  static bool Run(const Parameter& param, const Factory& factory,
-                  const Circuit& circuit, const std::vector<unsigned>& parts,
-                  const std::vector<uint64_t>& bitstrings,
-                  std::vector<std::complex<fp_type>>& results) {
-    if (circuit.num_qubits != parts.size()) {
-      IO::errorf("parts size is not equal to the number of qubits.");
-      return false;
-    }
-
-    double t0 = 0.0;
-
-    if (param.verbosity > 0) {
-      t0 = GetTime();
-    }
-
-    HybridData hd;
-    bool rc = HybridSimulator::SplitLattice(parts, circuit.gates, hd);
-
-    if (!rc) {
-      return false;
-    }
-
-    if (hd.num_gatexs < param.num_prefix_gatexs + param.num_root_gatexs) {
-      IO::errorf("error: num_prefix_gates (%u) plus num_root gates (%u) is "
-                 "greater than num_gates_on_the_cut (%u).\n",
-                 param.num_prefix_gatexs, param.num_root_gatexs,
-                 hd.num_gatexs);
-      return false;
-    }
-
-    if (param.verbosity > 0) {
-      PrintInfo(param, hd);
-    }
-
-    auto fgates0 = Fuser::FuseGates(param, hd.num_qubits0, hd.gates0);
-    if (fgates0.size() == 0 && hd.gates0.size() > 0) {
-      return false;
-    }
-
-    auto fgates1 = Fuser::FuseGates(param, hd.num_qubits1, hd.gates1);
-    if (fgates1.size() == 0 && hd.gates1.size() > 0) {
-      return false;
-    }
-
-    rc = HybridSimulator(param.num_threads).Run(
-        param, factory, hd, parts, fgates0, fgates1, bitstrings, results);
-
-    if (rc && param.verbosity > 0) {
-      double t1 = GetTime();
-      IO::messagef("time elapsed %g seconds.\n", t1 - t0);
-    }
-
-    return rc;
-  }
-
- private:
-  static void PrintInfo(const Parameter& param, const HybridData& hd) {
-    unsigned num_suffix_gates =
-        hd.num_gatexs - param.num_prefix_gatexs - param.num_root_gatexs;
-
-    IO::messagef("part 0: %u, part 1: %u\n", hd.num_qubits0, hd.num_qubits1);
-    IO::messagef("%u gates on the cut\n", hd.num_gatexs);
-    IO::messagef("breakup: %up+%ur+%us\n", param.num_prefix_gatexs,
-                 param.num_root_gatexs, num_suffix_gates);
-  }
-};
-
-}  // namespace qsim
-
-#endif  // RUN_QSIM_H_
diff --git a/qsim/seqfor.h b/qsim/seqfor.h
deleted file mode 100644
index 3ebf07c..0000000
--- a/qsim/seqfor.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SEQFOR_H_
-#define SEQFOR_H_
-
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-namespace qsim {
-
-/**
- * Helper struct for executing for loops in series.
- */
-struct SequentialFor {
-  explicit SequentialFor(unsigned num_threads) {}
-
-  // SequentialFor does not have any state. So all its methods can be static.
-
-  static uint64_t GetIndex0(uint64_t size, unsigned thread_id) {
-    return 0;
-  }
-
-  static uint64_t GetIndex1(uint64_t size, unsigned thread_id) {
-    return size;
-  }
-
-  template <typename Function, typename... Args>
-  static void Run(uint64_t size, Function&& func, Args&&... args) {
-    for (uint64_t i = 0; i < size; ++i) {
-      func(1, 0, i, args...);
-    }
-  }
-
-  template <typename Function, typename Op, typename... Args>
-  static std::vector<typename Op::result_type> RunReduceP(
-      uint64_t size, Function&& func, Op&& op, Args&&... args) {
-    typename Op::result_type result = 0;
-
-    for (uint64_t i = 0; i < size; ++i) {
-      result = op(result, func(1, 0, i, args...));
-    }
-
-    return std::vector<typename Op::result_type>(1, result);
-  }
-
-  template <typename Function, typename Op, typename... Args>
-  static typename Op::result_type RunReduce(uint64_t size, Function&& func,
-                                            Op&& op, Args&&... args) {
-    return RunReduceP(size, func, std::move(op), args...)[0];
-  }
-};
-
-}  // namespace qsim
-
-#endif  // SEQFOR_H_
diff --git a/qsim/simmux.h b/qsim/simmux.h
deleted file mode 100644
index d3c4074..0000000
--- a/qsim/simmux.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMMUX_H_
-#define SIMMUX_H_
-
-#ifdef __AVX512F__
-# include "simulator_avx512.h"
-  namespace qsim {
-    template <typename For>
-    using Simulator = SimulatorAVX512<For>;
-  }
-#elif __AVX2__
-# include "simulator_avx.h"
-  namespace qsim {
-    template <typename For>
-    using Simulator = SimulatorAVX<For>;
-  }
-#elif __SSE4_1__
-# include "simulator_sse.h"
-  namespace qsim {
-    template <typename For>
-    using Simulator = SimulatorSSE<For>;
-  }
-#else
-# include "simulator_basic.h"
-  namespace qsim {
-    template <typename For>
-    using Simulator = SimulatorBasic<For>;
-  }
-#endif
-
-#endif  // SIMMUX_H_
diff --git a/qsim/simmux_gpu.h b/qsim/simmux_gpu.h
deleted file mode 100644
index 1f0bb59..0000000
--- a/qsim/simmux_gpu.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2023 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMMUX_GPU_H_
-#define SIMMUX_GPU_H_
-
-#ifdef __CUSTATEVEC__
-# include "simulator_custatevec.h"
-  namespace qsim {
-    using SimulatorGpu = SimulatorCuStateVec<>;
-  }
-#else
-# include "simulator_cuda.h"
-  namespace qsim {
-    using SimulatorGpu = SimulatorCUDA<>;
-  }
-#endif
-
-#endif  // SIMMUX_GPU_H_
diff --git a/qsim/simulator.h b/qsim/simulator.h
deleted file mode 100644
index eff5441..0000000
--- a/qsim/simulator.h
+++ /dev/null
@@ -1,516 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_H_
-#define SIMULATOR_H_
-
-#include <cstdint>
-
-#include "bits.h"
-
-namespace qsim {
-
-/**
- * Base class for simulator classes.
- */
-class SimulatorBase {
- protected:
-  // The follwoing template parameters are used for functions below.
-  // H - the number of high (target) qubits.
-  // L - the number of low (target) qubits.
-  // R - SIMD register width in floats.
-
-  // Fills the table of masks (ms) that is used to calculate base state indices
-  // and the table of offset indices (xss) that is used to access the state
-  // vector entries in matrix-vector multiplication functions. This function is
-  // used in simulator_basic.h, simulator_sse.h and simulator_avx.h (no bmi2
-  // version).
-  template <unsigned H, unsigned L = 0>
-  static void FillIndices(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          uint64_t* ms, uint64_t* xss) {
-    constexpr unsigned hsize = 1 << H;
-
-    if (H == 0) {
-      ms[0] = uint64_t(-1);
-      xss[0] = 0;
-    } else {
-      uint64_t xs[H + 1];
-
-      xs[0] = uint64_t{1} << (qs[L] + 1);
-      ms[0] = (uint64_t{1} << qs[L]) - 1;
-      for (unsigned i = 1; i < H; ++i) {
-        xs[i] = uint64_t{1} << (qs[L + i] + 1);
-        ms[i] = ((uint64_t{1} << qs[L + i]) - 1) ^ (xs[i - 1] - 1);
-      }
-      ms[H] = ((uint64_t{1} << num_qubits) - 1) ^ (xs[H - 1] - 1);
-
-      for (unsigned i = 0; i < hsize; ++i) {
-        uint64_t a = 0;
-        for (uint64_t k = 0; k < H; ++k) {
-          a += xs[k] * ((i >> k) & 1);
-        }
-        xss[i] = a;
-      }
-    }
-  }
-
-  // Fills gate matrix entries for gates with low qubits.
-  template <unsigned H, unsigned L, unsigned R, typename fp_type>
-  static void FillMatrix(unsigned qmaskl, const fp_type* matrix, fp_type* w) {
-    constexpr unsigned gsize = 1 << (H + L);
-    constexpr unsigned hsize = 1 << H;
-    constexpr unsigned lsize = 1 << L;
-    constexpr unsigned rsize = 1 << R;
-
-    unsigned s = 0;
-
-    for (unsigned i = 0; i < hsize; ++i) {
-      for (unsigned j = 0; j < gsize; ++j) {
-        unsigned p0 = 2 * i * lsize * gsize + 2 * lsize * (j / lsize);
-
-        for (unsigned k = 0; k < rsize; ++k) {
-          unsigned l = bits::CompressBits(k, R, qmaskl);
-          unsigned p = p0 + 2 * (gsize * l + (j + l) % lsize);
-
-          w[s + 0] = matrix[p];
-          w[s + rsize] = matrix[p + 1];
-
-          ++s;
-        }
-
-        s += rsize;
-      }
-    }
-  }
-
-  // Fills gate matrix entries for controlled gates with high target qubits
-  // and low control qubits.
-  template <unsigned H, unsigned R, typename fp_type>
-  static void FillControlledMatrixH(uint64_t cvalsl, uint64_t cmaskl,
-                                    const fp_type* matrix, fp_type* w) {
-    constexpr unsigned hsize = 1 << H;
-    constexpr unsigned rsize = 1 << R;
-
-    unsigned s = 0;
-
-    for (unsigned i = 0; i < hsize; ++i) {
-      for (unsigned j = 0; j < hsize; ++j) {
-        unsigned p = hsize * i + j;
-        fp_type v = i == j ? 1 : 0;
-
-        for (unsigned k = 0; k < rsize; ++k) {
-          w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v;
-          w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0;
-
-          ++s;
-        }
-
-        s += rsize;
-      }
-    }
-  }
-
-  // Fills gate matrix entries for controlled gates with low target qubits
-  // and low control qubits.
-  template <unsigned H, unsigned L, unsigned R, typename fp_type>
-  static void FillControlledMatrixL(uint64_t cvalsl, uint64_t cmaskl,
-                                    unsigned qmaskl, const fp_type* matrix,
-                                    fp_type* w) {
-    constexpr unsigned gsize = 1 << (H + L);
-    constexpr unsigned hsize = 1 << H;
-    constexpr unsigned lsize = 1 << L;
-    constexpr unsigned rsize = 1 << R;
-
-    unsigned s = 0;
-
-    for (unsigned i = 0; i < hsize; ++i) {
-      for (unsigned j = 0; j < gsize; ++j) {
-        unsigned p0 = i * lsize * gsize + lsize * (j / lsize);
-
-        for (unsigned k = 0; k < rsize; ++k) {
-          unsigned l = bits::CompressBits(k, R, qmaskl);
-          unsigned p = p0 + gsize * l + (j + l) % lsize;
-
-          fp_type v = p / gsize == p % gsize ? 1 : 0;
-
-          w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v;
-          w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0;
-
-          ++s;
-        }
-
-        s += rsize;
-      }
-    }
-  }
-
-/*
-  The GetMasks* functions below provide various masks and related values.
-  GetMasks1, GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6 are
-  used in simulator_avx.h (BMI2 version) and in simulator_avx512.h. GetMasks7,
-  GetMasks8, GetMasks9, GetMasks10 and GetMasks11 are used in simulator_avx.h
-  (no BMI2 version) and in simulator_sse.h.
-
-  imaskh - inverted mask of high qubits (high control and target qubits).
-  qmaskh - mask of high qubits (high target qubits).
-  cvalsh - control bit values of high control qubits placed in correct
-           positions.
-  cvalsl - control bit values of low control qubits placed in correct positions.
-  cmaskh - mask of high control qubits.
-  cmaskl - mask of low control qubits.
-  qmaskl - mask of low qubits (low target qubits).
-  cl - the number of low control qubits.
-
-  Note that imaskh, qmaskh and cvalsh are multiplied by two in GetMasks1,
-  GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6.
-*/
-
-  struct Masks1 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-  };
-
-  template <unsigned H, unsigned R>
-  static Masks1 GetMasks1(const std::vector<unsigned>& qs) {
-    uint64_t qmaskh = 0;
-
-    for (unsigned i = 0; i < H; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh};
-  }
-
-  struct Masks2 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-    unsigned qmaskl;
-  };
-
-  template <unsigned H, unsigned L, unsigned R>
-  static Masks2 GetMasks2(const std::vector<unsigned>& qs) {
-    uint64_t qmaskh = 0;
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    for (unsigned i = L; i < H + L; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh, qmaskl};
-  }
-
-  struct Masks3 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-    uint64_t cvalsh;
-  };
-
-  template <unsigned H, unsigned R>
-  static Masks3 GetMasks3(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    uint64_t qmaskh = 0;
-    uint64_t cmaskh = 0;
-
-    for (unsigned i = 0; i < H; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    for (auto q : cqs) {
-      cmaskh |= uint64_t{1} << q;
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
-
-    uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1);
-
-    return {2 * maskh, 2 * qmaskh, 2 * cvalsh};
-  }
-
-  struct Masks4 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-    uint64_t cvalsh;
-    uint64_t cvalsl;
-    uint64_t cmaskl;
-    unsigned cl;
-  };
-
-  template <unsigned H, unsigned R>
-  static Masks4 GetMasks4(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    unsigned cl = 0;
-    uint64_t qmaskh = 0;
-    uint64_t cmaskh = 0;
-    uint64_t cmaskl = 0;
-
-    for (unsigned i = 0; i < H; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    for (auto q : cqs) {
-      if (q >= R) {
-        cmaskh |= uint64_t{1} << q;
-      } else {
-        ++cl;
-        cmaskl |= uint64_t{1} << q;
-      }
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh);
-    uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl);
-
-    uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1);
-
-    return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, cl};
-  }
-
-  struct Masks5 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-    uint64_t cvalsh;
-    unsigned qmaskl;
-  };
-
-  template <unsigned H, unsigned L, unsigned R>
-  static Masks5 GetMasks5(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    uint64_t qmaskh = 0;
-    uint64_t cmaskh = 0;
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    for (unsigned i = L; i < H + L; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    for (auto q : cqs) {
-      cmaskh |= uint64_t{1} << q;
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
-
-    uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1);
-
-    return {2 * maskh, 2 * qmaskh, 2 * cvalsh, qmaskl};
-  }
-
-  struct Masks6 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-    uint64_t cvalsh;
-    uint64_t cvalsl;
-    uint64_t cmaskl;
-    unsigned qmaskl;
-    unsigned cl;
-  };
-
-  template <unsigned H, unsigned L, unsigned R>
-  static Masks6 GetMasks6(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    unsigned cl = 0;
-    uint64_t qmaskh = 0;
-    uint64_t cmaskh = 0;
-    uint64_t cmaskl = 0;
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    for (unsigned i = L; i < H + L; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    for (auto q : cqs) {
-      if (q >= R) {
-        cmaskh |= uint64_t{1} << q;
-      } else {
-        ++cl;
-        cmaskl |= uint64_t{1} << q;
-      }
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh);
-    uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl);
-
-    uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1);
-
-    return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, qmaskl, cl};
-  }
-
-  struct Masks7 {
-    uint64_t cvalsh;
-    uint64_t cmaskh;
-  };
-
-  static Masks7 GetMasks7(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    uint64_t cmaskh = 0;
-
-    for (auto q : cqs) {
-      cmaskh |= uint64_t{1} << q;
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
-
-    return {cvalsh, cmaskh};
-  }
-
-  struct Masks8 {
-    uint64_t cvalsh;
-    uint64_t cmaskh;
-    uint64_t cvalsl;
-    uint64_t cmaskl;
-  };
-
-  template <unsigned R>
-  static Masks8 GetMasks8(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    unsigned cl = 0;
-    uint64_t cmaskh = 0;
-    uint64_t cmaskl = 0;
-
-    for (auto q : cqs) {
-      if (q >= R) {
-        cmaskh |= uint64_t{1} << q;
-      } else {
-        ++cl;
-        cmaskl |= uint64_t{1} << q;
-      }
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh);
-    uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl);
-
-    return {cvalsh, cmaskh, cvalsl, cmaskl};
-  }
-
-  struct Masks9 {
-    uint64_t cvalsh;
-    uint64_t cmaskh;
-    unsigned qmaskl;
-  };
-
-  template <unsigned L>
-  static Masks9 GetMasks9(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    uint64_t cmaskh = 0;
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    for (auto q : cqs) {
-      cmaskh |= uint64_t{1} << q;
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
-
-    return {cvalsh, cmaskh, qmaskl};
-  }
-
-  struct Masks10 {
-    uint64_t cvalsh;
-    uint64_t cmaskh;
-    uint64_t cvalsl;
-    uint64_t cmaskl;
-    unsigned qmaskl;
-  };
-
-  template <unsigned L, unsigned R>
-  static Masks10 GetMasks10(unsigned num_qubits,
-                            const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals) {
-    unsigned cl = 0;
-    uint64_t cmaskh = 0;
-    uint64_t cmaskl = 0;
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    for (auto q : cqs) {
-      if (q >= R) {
-        cmaskh |= uint64_t{1} << q;
-      } else {
-        ++cl;
-        cmaskl |= uint64_t{1} << q;
-      }
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh);
-    uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl);
-
-    return {cvalsh, cmaskh, cvalsl, cmaskl, qmaskl};
-  }
-
-  struct Masks11 {
-    unsigned qmaskl;
-  };
-
-  template <unsigned L>
-  static Masks11 GetMasks11(const std::vector<unsigned>& qs) {
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    return {qmaskl};
-  }
-
-  template <unsigned R>
-  static unsigned MaskedAdd(
-      unsigned a, unsigned b, unsigned mask, unsigned lsize) {
-    unsigned c = bits::CompressBits(a, R, mask);
-    return bits::ExpandBits((c + b) % lsize, R, mask);
-  }
-};
-
-template <>
-inline void SimulatorBase::FillIndices<0, 1>(unsigned num_qubits,
-                                             const std::vector<unsigned>& qs,
-                                             uint64_t* ms, uint64_t* xss) {
-  ms[0] = -1;
-  xss[0] = 0;
-}
-
-template <>
-inline void SimulatorBase::FillIndices<0, 2>(unsigned num_qubits,
-                                             const std::vector<unsigned>& qs,
-                                             uint64_t* ms, uint64_t* xss) {
-  ms[0] = -1;
-  xss[0] = 0;
-}
-
-template <>
-inline void SimulatorBase::FillIndices<0, 3>(unsigned num_qubits,
-                                             const std::vector<unsigned>& qs,
-                                             uint64_t* ms, uint64_t* xss) {
-  ms[0] = -1;
-  xss[0] = 0;
-}
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_H_
diff --git a/qsim/simulator_avx.h b/qsim/simulator_avx.h
deleted file mode 100644
index 9742849..0000000
--- a/qsim/simulator_avx.h
+++ /dev/null
@@ -1,1363 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_AVX_H_
-#define SIMULATOR_AVX_H_
-
-#include <immintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "statespace_avx.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator with AVX vectorization.
- */
-template <typename For>
-class SimulatorAVX final : public SimulatorBase {
- public:
-  using StateSpace = StateSpaceAVX<For>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  template <typename... ForArgs>
-  explicit SimulatorAVX(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using AVX instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 0:
-      ApplyGateH<0>(qs, matrix, state);
-      break;
-    case 1:
-      if (qs[0] > 2) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 2) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 2) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 2) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 3>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 2) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 3>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 2) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<3, 3>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using AVX instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 0:
-      if (cqs[0] > 2) {
-        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
-      } else {
-        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
-      }
-      break;
-    case 1:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using AVX instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 2) {
-        return ExpectationValueH<1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 2) {
-        return ExpectationValueH<2>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        return ExpectationValueL<1, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 2) {
-        return ExpectationValueH<3>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        return ExpectationValueL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        return ExpectationValueL<1, 2>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 2) {
-        return ExpectationValueH<4>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        return ExpectationValueL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        return ExpectationValueL<2, 2>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<1, 3>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 2) {
-        return ExpectationValueH<5>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        return ExpectationValueL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        return ExpectationValueL<3, 2>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<2, 3>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 2) {
-        return ExpectationValueH<6>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        return ExpectationValueL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        return ExpectationValueL<4, 2>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<3, 3>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-
-    return 0;
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 8;
-  }
-
- private:
-#ifdef __BMI2__
-
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    auto m = GetMasks1<H, 3>(qs);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm256_load_ps(p0 + p);
-        is[k2] = _mm256_load_ps(p0 + p + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 3>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    auto m = GetMasks3<H, 3>(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned k = 3 + H + cqs.size();
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks4<H, 3>(state.num_qubits(), qs, cqs, cvals);
-    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H + cqs.size() - m.cl;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                const __m256i* idx, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm256_load_ps(p0 + p);
-        is[k2] = _mm256_load_ps(p0 + p + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    if (CH) {
-      auto m = GetMasks5<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned r = 3 + H + cqs.size();
-      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
-    } else {
-      auto m = GetMasks6<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 3>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned r = 3 + H + cqs.size() - m.cl;
-      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
-    }
-  }
-
-  template <unsigned H>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn));
-        __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in));
-
-        re += detail::HorizontalSumAVX(v_re);
-        im += detail::HorizontalSumAVX(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    auto m = GetMasks1<H, 3>(qs);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return
-        for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
-                const fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm256_load_ps(p0 + p);
-        is[k2] = _mm256_load_ps(p0 + p + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        unsigned m = lsize * k;
-
-        __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn));
-        __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in));
-
-        re += detail::HorizontalSumAVX(v_re);
-        im += detail::HorizontalSumAVX(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 3>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return
-        for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get());
-  }
-
-#else  // __BMI2__
-
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        rs[k2] = _mm256_load_ps(p0 + xss[k]);
-        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, ms, xss, idx, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, const __m256i* idx, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm256_load_ps(p0 + xss[k]);
-        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    if (CH) {
-      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get());
-    } else {
-      auto m = GetMasks10<L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 3>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get());
-    }
-  }
-
-  template <unsigned H>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss,
-                const fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn));
-        __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in));
-
-        re += detail::HorizontalSumAVX(v_re);
-        im += detail::HorizontalSumAVX(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
-                const fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm256_load_ps(p0 + xss[k]);
-        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        unsigned m = lsize * k;
-
-        __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn));
-        __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in));
-
-        re += detail::HorizontalSumAVX(v_re);
-        im += detail::HorizontalSumAVX(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, state.get());
-  }
-
-#endif  // __BMI2__
-
-  template <unsigned L>
-  static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) {
-    constexpr unsigned lsize = 1 << L;
-
-    for (unsigned i = 0; i < lsize - 1; ++i) {
-      unsigned p[8];
-
-      for (unsigned j = 0; j < 8; ++j) {
-        p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
-      }
-
-      idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]);
-    }
-  }
-
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_AVX_H_
diff --git a/qsim/simulator_avx512.h b/qsim/simulator_avx512.h
deleted file mode 100644
index 21a2e9d..0000000
--- a/qsim/simulator_avx512.h
+++ /dev/null
@@ -1,846 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_AVX512_H_
-#define SIMULATOR_AVX512_H_
-
-#include <immintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "statespace_avx512.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator with AVX512 vectorization.
- */
-template <typename For>
-class SimulatorAVX512 final : public SimulatorBase {
- public:
-  using StateSpace = StateSpaceAVX512<For>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  template <typename... ForArgs>
-  explicit SimulatorAVX512(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using AVX512 instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 0:
-      ApplyGateH<0>(qs, matrix, state);
-      break;
-    case 1:
-      if (qs[0] > 3) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 3) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 3) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 3) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<1, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 4>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 3) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<2, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 4>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 3) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<3, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 4>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using AVX512 instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 0:
-      if (cqs[0] > 3) {
-        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
-      } else {
-        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
-      }
-      break;
-    case 1:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[3] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using AVX512 instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 3) {
-        return ExpectationValueH<1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 3) {
-        return ExpectationValueH<2>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        return ExpectationValueL<1, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 3) {
-        return ExpectationValueH<3>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        return ExpectationValueL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        return ExpectationValueL<1, 2>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 3) {
-        return ExpectationValueH<4>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        return ExpectationValueL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        return ExpectationValueL<2, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        return ExpectationValueL<1, 3>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 4>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 3) {
-        return ExpectationValueH<5>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        return ExpectationValueL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        return ExpectationValueL<3, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        return ExpectationValueL<2, 3>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<1, 4>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 3) {
-        return ExpectationValueH<6>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        return ExpectationValueL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        return ExpectationValueL<4, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        return ExpectationValueL<3, 3>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<2, 4>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-
-    return 0;
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 16;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 ru, iu, rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm512_set1_ps(v[j]);
-        iu = _mm512_set1_ps(v[j + 1]);
-        rn = _mm512_mul_ps(rs[0], ru);
-        in = _mm512_mul_ps(rs[0], iu);
-        rn = _mm512_fnmadd_ps(is[0], iu, rn);
-        in = _mm512_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm512_set1_ps(v[j]);
-          iu = _mm512_set1_ps(v[j + 1]);
-          rn = _mm512_fmadd_ps(rs[l], ru, rn);
-          in = _mm512_fmadd_ps(rs[l], iu, in);
-          rn = _mm512_fnmadd_ps(is[l], iu, rn);
-          in = _mm512_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    auto m = GetMasks1<H, 4>(qs);
-
-    unsigned k = 4 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m512 rn, in;
-      __m512 rs[gsize], is[gsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm512_load_ps(p0 + p);
-        is[k2] = _mm512_load_ps(p0 + p + 16);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
-          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512i idx[1 << L];
-    __m512 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 4>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 4 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 ru, iu, rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm512_set1_ps(v[j]);
-        iu = _mm512_set1_ps(v[j + 1]);
-        rn = _mm512_mul_ps(rs[0], ru);
-        in = _mm512_mul_ps(rs[0], iu);
-        rn = _mm512_fnmadd_ps(is[0], iu, rn);
-        in = _mm512_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm512_set1_ps(v[j]);
-          iu = _mm512_set1_ps(v[j + 1]);
-          rn = _mm512_fmadd_ps(rs[l], ru, rn);
-          in = _mm512_fmadd_ps(rs[l], iu, in);
-          rn = _mm512_fnmadd_ps(is[l], iu, rn);
-          in = _mm512_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    auto m = GetMasks3<H, 4>(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned k = 4 + H + cqs.size();
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks4<H, 4>(state.num_qubits(), qs, cqs, cvals);
-    FillControlledMatrixH<H, 4>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 4 + H + cqs.size() - m.cl;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                const __m512i* idx, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m512 rn, in;
-      __m512 rs[gsize], is[gsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm512_load_ps(p0 + p);
-        is[k2] = _mm512_load_ps(p0 + p + 16);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
-          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512i idx[1 << L];
-    __m512 w[1 << (1 + 2 * H + L)];
-
-    if (CH) {
-      auto m = GetMasks5<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned r = 4 + H + cqs.size();
-      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
-    } else {
-      auto m = GetMasks6<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 4>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned r = 4 + H + cqs.size() - m.cl;
-      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
-    }
-  }
-
-  template <unsigned H>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 ru, iu, rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm512_set1_ps(v[j]);
-        iu = _mm512_set1_ps(v[j + 1]);
-        rn = _mm512_mul_ps(rs[0], ru);
-        in = _mm512_mul_ps(rs[0], iu);
-        rn = _mm512_fnmadd_ps(is[0], iu, rn);
-        in = _mm512_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm512_set1_ps(v[j]);
-          iu = _mm512_set1_ps(v[j + 1]);
-          rn = _mm512_fmadd_ps(rs[l], ru, rn);
-          in = _mm512_fmadd_ps(rs[l], iu, in);
-          rn = _mm512_fnmadd_ps(is[l], iu, rn);
-          in = _mm512_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        __m512 v_re = _mm512_fmadd_ps(is[k], in, _mm512_mul_ps(rs[k], rn));
-        __m512 v_im = _mm512_fnmadd_ps(is[k], rn, _mm512_mul_ps(rs[k], in));
-
-        re += detail::HorizontalSumAVX512(v_re);
-        im += detail::HorizontalSumAVX512(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    auto m = GetMasks1<H, 4>(qs);
-
-    unsigned k = 4 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return
-        for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
-                const fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m512 rn, in;
-      __m512 rs[gsize], is[gsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm512_load_ps(p0 + p);
-        is[k2] = _mm512_load_ps(p0 + p + 16);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
-          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
-        }
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        unsigned m = lsize * k;
-
-        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
-        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
-
-        re += detail::HorizontalSumAVX512(v_re);
-        im += detail::HorizontalSumAVX512(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    __m512i idx[1 << L];
-    __m512 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 4>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 4 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return
-        for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get());
-  }
-
-  template <unsigned L>
-  static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) {
-    constexpr unsigned lsize = 1 << L;
-
-    for (unsigned i = 0; i < lsize; ++i) {
-      unsigned p[16];
-
-      for (unsigned j = 0; j < 16; ++j) {
-        p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
-      }
-
-      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
-                                p[9], p[8], p[7], p[6], p[5], p[4],
-                                p[3], p[2], p[1], p[0]);
-    }
-  }
-
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_AVX512_H_
diff --git a/qsim/simulator_basic.h b/qsim/simulator_basic.h
deleted file mode 100644
index 752eeb5..0000000
--- a/qsim/simulator_basic.h
+++ /dev/null
@@ -1,349 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_BASIC_H_
-#define SIMULATOR_BASIC_H_
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "statespace_basic.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator without vectorization.
- */
-template <typename For, typename FP = float>
-class SimulatorBasic final : public SimulatorBase {
- public:
-  using StateSpace = StateSpaceBasic<For, FP>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  template <typename... ForArgs>
-  explicit SimulatorBasic(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using non-vectorized instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 0:
-      ApplyGateH<0>(qs, matrix, state);
-      break;
-    case 1:
-      ApplyGateH<1>(qs, matrix, state);
-      break;
-    case 2:
-      ApplyGateH<2>(qs, matrix, state);
-      break;
-    case 3:
-      ApplyGateH<3>(qs, matrix, state);
-      break;
-    case 4:
-      ApplyGateH<4>(qs, matrix, state);
-      break;
-    case 5:
-      ApplyGateH<5>(qs, matrix, state);
-      break;
-    case 6:
-      ApplyGateH<6>(qs, matrix, state);
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using non-vectorized instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 0:
-      ApplyControlledGateH<0>(qs, cqs, cvals, matrix, state);
-      break;
-    case 1:
-      ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state);
-      break;
-    case 2:
-      ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state);
-      break;
-    case 3:
-      ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state);
-      break;
-    case 4:
-      ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state);
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using non-vectorized
-   * instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      return ExpectationValueH<1>(qs, matrix, state);
-      break;
-    case 2:
-      return ExpectationValueH<2>(qs, matrix, state);
-      break;
-    case 3:
-      return ExpectationValueH<3>(qs, matrix, state);
-      break;
-    case 4:
-      return ExpectationValueH<4>(qs, matrix, state);
-      break;
-    case 5:
-      return ExpectationValueH<5>(qs, matrix, state);
-      break;
-    case 6:
-      return ExpectationValueH<6>(qs, matrix, state);
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-
-    return 0;
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 1;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      fp_type rn, in;
-      fp_type rs[hsize], is[hsize];
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = *(p0 + xss[k]);
-        is[k] = *(p0 + xss[k] + 1);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = rs[0] * v[j] - is[0] * v[j + 1];
-        in = rs[0] * v[j + 1] + is[0] * v[j];
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn += rs[l] * v[j] - is[l] * v[j + 1];
-          in += rs[l] * v[j + 1] + is[l] * v[j];
-
-          j += 2;
-        }
-
-        *(p0 + xss[k]) = rn;
-        *(p0 + xss[k] + 1) = in;
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateH(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs,
-                            uint64_t cvals, const fp_type* matrix,
-                            State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss,
-                uint64_t cvalsh, uint64_t cmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      fp_type rn, in;
-      fp_type rs[hsize], is[hsize];
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) == cvalsh) {
-        auto p0 = rstate + 2 * ii;
-
-        for (unsigned k = 0; k < hsize; ++k) {
-          rs[k] = *(p0 + xss[k]);
-          is[k] = *(p0 + xss[k] + 1);
-        }
-
-        uint64_t j = 0;
-
-        for (unsigned k = 0; k < hsize; ++k) {
-          rn = rs[0] * v[j] - is[0] * v[j + 1];
-          in = rs[0] * v[j + 1] + is[0] * v[j];
-
-          j += 2;
-
-          for (unsigned l = 1; l < hsize; ++l) {
-            rn += rs[l] * v[j] - is[l] * v[j + 1];
-            in += rs[l] * v[j + 1] + is[l] * v[j];
-
-            j += 2;
-          }
-
-          *(p0 + xss[k]) = rn;
-          *(p0 + xss[k] + 1) = in;
-        }
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
-  }
-
-  template <unsigned H>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss,
-                const fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      fp_type rn, in;
-      fp_type rs[hsize], is[hsize];
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = *(p0 + xss[k]);
-        is[k] = *(p0 + xss[k] + 1);
-      }
-
-      double re = 0;
-      double im = 0;
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = rs[0] * v[j] - is[0] * v[j + 1];
-        in = rs[0] * v[j + 1] + is[0] * v[j];
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn += rs[l] * v[j] - is[l] * v[j + 1];
-          in += rs[l] * v[j + 1] + is[l] * v[j];
-
-          j += 2;
-        }
-
-        re += rs[k] * rn + is[k] * in;
-        im += rs[k] * in - is[k] * rn;
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
-  }
-
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_BASIC_H_
diff --git a/qsim/simulator_cuda.h b/qsim/simulator_cuda.h
deleted file mode 100644
index 5743bea..0000000
--- a/qsim/simulator_cuda.h
+++ /dev/null
@@ -1,923 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_CUDA_H_
-#define SIMULATOR_CUDA_H_
-
-#include "simulator_cuda_kernels.h"
-
-#include <algorithm>
-#include <complex>
-#include <cstdint>
-#include <cstring>
-#include <vector>
-
-#include "bits.h"
-#include "statespace_cuda.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator with GPU vectorization.
- */
-template <typename FP = float>
-class SimulatorCUDA final {
- private:
-  using idx_type = uint64_t;
-  using Complex = qsim::Complex<double>;
-
-  // The maximum buffer size for indices and gate matrices.
-  // The maximum gate matrix size (for 6-qubit gates) is
-  // 2 * 2^6 * 2^6 * sizeof(FP) = 8192 * sizeof(FP). The maximum index size is
-  // 128 * sizeof(idx_type) + 96 * sizeof(unsigned).
-  static constexpr unsigned max_buf_size = 8192 * sizeof(FP)
-      + 128 * sizeof(idx_type) + 96 * sizeof(unsigned);
-
- public:
-  using StateSpace = StateSpaceCUDA<FP>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  SimulatorCUDA() : scratch_(nullptr), scratch_size_(0) {
-    ErrorCheck(cudaMalloc(&d_ws, max_buf_size));
-  }
-
-  ~SimulatorCUDA() {
-    ErrorCheck(cudaFree(d_ws));
-
-    if (scratch_ != nullptr) {
-      ErrorCheck(cudaFree(scratch_));
-    }
-  }
-
-  /**
-   * Applies a gate using CUDA instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    if (qs.size() == 0) {
-      ApplyGateH<0>(qs, matrix, state);
-    } else if (qs[0] > 4) {
-      switch (qs.size()) {
-      case 1:
-        ApplyGateH<1>(qs, matrix, state);
-        break;
-      case 2:
-        ApplyGateH<2>(qs, matrix, state);
-        break;
-      case 3:
-        ApplyGateH<3>(qs, matrix, state);
-        break;
-      case 4:
-        ApplyGateH<4>(qs, matrix, state);
-        break;
-      case 5:
-        ApplyGateH<5>(qs, matrix, state);
-        break;
-      case 6:
-        ApplyGateH<6>(qs, matrix, state);
-        break;
-      default:
-        // Not implemented.
-        break;
-      }
-    } else {
-      switch (qs.size()) {
-      case 1:
-        ApplyGateL<1>(qs, matrix, state);
-        break;
-      case 2:
-        ApplyGateL<2>(qs, matrix, state);
-        break;
-      case 3:
-        ApplyGateL<3>(qs, matrix, state);
-        break;
-      case 4:
-        ApplyGateL<4>(qs, matrix, state);
-        break;
-      case 5:
-        ApplyGateL<5>(qs, matrix, state);
-        break;
-      case 6:
-        ApplyGateL<6>(qs, matrix, state);
-        break;
-      default:
-        // Not implemented.
-        break;
-      }
-    }
-  }
-
-  /**
-   * Applies a controlled gate using CUDA instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    if (cqs[0] < 5) {
-      switch (qs.size()) {
-      case 0:
-        ApplyControlledGateL<0>(qs, cqs, cvals, matrix, state);
-        break;
-      case 1:
-        ApplyControlledGateL<1>(qs, cqs, cvals, matrix, state);
-        break;
-      case 2:
-        ApplyControlledGateL<2>(qs, cqs, cvals, matrix, state);
-        break;
-      case 3:
-        ApplyControlledGateL<3>(qs, cqs, cvals, matrix, state);
-        break;
-      case 4:
-        ApplyControlledGateL<4>(qs, cqs, cvals, matrix, state);
-        break;
-      default:
-        // Not implemented.
-        break;
-      }
-    } else {
-      if (qs.size() == 0) {
-        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
-      } else if (qs[0] > 4) {
-        switch (qs.size()) {
-        case 1:
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-          break;
-        case 2:
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-          break;
-        case 3:
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-          break;
-        case 4:
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-          break;
-        default:
-          // Not implemented.
-          break;
-        }
-      } else {
-        switch (qs.size()) {
-        case 1:
-          ApplyControlledGateLH<1>(qs, cqs, cvals, matrix, state);
-          break;
-        case 2:
-          ApplyControlledGateLH<2>(qs, cqs, cvals, matrix, state);
-          break;
-        case 3:
-          ApplyControlledGateLH<3>(qs, cqs, cvals, matrix, state);
-          break;
-        case 4:
-          ApplyControlledGateLH<4>(qs, cqs, cvals, matrix, state);
-          break;
-        default:
-          // Not implemented.
-          break;
-        }
-      }
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using CUDA instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    if (qs[0] > 4) {
-      switch (qs.size()) {
-      case 1:
-        return ExpectationValueH<1>(qs, matrix, state);
-      case 2:
-        return ExpectationValueH<2>(qs, matrix, state);
-      case 3:
-        return ExpectationValueH<3>(qs, matrix, state);
-      case 4:
-        return ExpectationValueH<4>(qs, matrix, state);
-      case 5:
-        return ExpectationValueH<5>(qs, matrix, state);
-      case 6:
-        return ExpectationValueH<6>(qs, matrix, state);
-      default:
-        // Not implemented.
-        break;
-      }
-    } else {
-      switch (qs.size()) {
-      case 1:
-        return ExpectationValueL<1>(qs, matrix, state);
-      case 2:
-        return ExpectationValueL<2>(qs, matrix, state);
-      case 3:
-        return ExpectationValueL<3>(qs, matrix, state);
-      case 4:
-        return ExpectationValueL<4>(qs, matrix, state);
-      case 5:
-        return ExpectationValueL<5>(qs, matrix, state);
-      case 6:
-        return ExpectationValueL<6>(qs, matrix, state);
-      default:
-        // Not implemented.
-        break;
-      }
-    }
-
-    return 0;
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 32;
-  }
-
- private:
-  // The following indices are used in kernels.
-  // xss - indices to access the state vector entries in global memory.
-  // ms  - masks to access the state vector entries in global memory.
-  // tis - indices to access the state vector entries in shared memory
-  //       in the presence of low gate qubits.
-  // qis - indices to access the state vector entries in shared memory
-  //       in the presence of low gate qubits.
-  // cis - additional indices to access the state vector entries in global
-  //       memory in the presence of low control qubits.
-
-  template <unsigned G>
-  struct IndicesH {
-    static constexpr unsigned gsize = 1 << G;
-    static constexpr unsigned matrix_size = 2 * gsize * gsize * sizeof(fp_type);
-    static constexpr unsigned xss_size = 32 * sizeof(idx_type) * (1 + (G == 6));
-    static constexpr unsigned ms_size = 32 * sizeof(idx_type);
-    static constexpr unsigned xss_offs = matrix_size;
-    static constexpr unsigned ms_offs = xss_offs + xss_size;
-    static constexpr unsigned buf_size = ms_offs + ms_size;
-
-    IndicesH(char* p)
-        : xss((idx_type*) (p + xss_offs)), ms((idx_type*) (p + ms_offs)) {}
-
-    idx_type* xss;
-    idx_type* ms;
-  };
-
-  template <unsigned G>
-  struct IndicesL : public IndicesH<G> {
-    using Base = IndicesH<G>;
-    static constexpr unsigned qis_size = 32 * sizeof(unsigned) * (1 + (G == 6));
-    static constexpr unsigned tis_size = 32 * sizeof(unsigned);
-    static constexpr unsigned qis_offs = Base::buf_size;
-    static constexpr unsigned tis_offs = qis_offs + qis_size;
-    static constexpr unsigned buf_size = tis_offs + tis_size;
-
-    IndicesL(char* p)
-        : Base(p), qis((unsigned*) (p + qis_offs)),
-          tis((unsigned*) (p + tis_offs)) {}
-
-    unsigned* qis;
-    unsigned* tis;
-  };
-
-  template <unsigned G>
-  struct IndicesLC : public IndicesL<G> {
-    using Base = IndicesL<G>;
-    static constexpr unsigned cis_size = 32 * sizeof(idx_type);
-    static constexpr unsigned cis_offs = Base::buf_size;
-    static constexpr unsigned buf_size = cis_offs + cis_size;
-
-    IndicesLC(char* p) : Base(p), cis((idx_type*) (p + cis_offs)) {}
-
-    idx_type* cis;
-  };
-
-  struct DataC {
-    idx_type cvalsh;
-    unsigned num_aqs;
-    unsigned num_effective_qs;
-    unsigned remaining_low_cqs;
-  };
-
-  template <unsigned G>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesH<G> h_i(h_ws);
-    GetIndicesH(num_qubits, qs, qs.size(), h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + G;
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-    unsigned threads = 64U;
-    unsigned blocks = std::max(1U, size / 2);
-
-    IndicesH<G> d_i(d_ws);
-
-    ApplyGateH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, state.get());
-  }
-
-  template <unsigned G>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesL<G> h_i(h_ws);
-    auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + num_effective_qs;
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-    unsigned threads = 32;
-    unsigned blocks = size;
-
-    IndicesL<G> d_i(d_ws);
-
-    ApplyGateL_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
-        1 << num_effective_qs, state.get());
-  }
-
-  template <unsigned G>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, idx_type cvals,
-                             const fp_type* matrix, State& state) const {
-    unsigned aqs[64];
-    idx_type cmaskh = 0;
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesH<G> h_i(h_ws);
-
-    unsigned num_aqs = GetHighQubits(qs, 0, cqs, 0, 0, cmaskh, aqs);
-    GetMs(num_qubits, aqs, num_aqs, h_i.ms);
-    GetXss(num_qubits, qs, qs.size(), h_i.xss);
-
-    idx_type cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + G + cqs.size();
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-    unsigned threads = 64U;
-    unsigned blocks = std::max(1U, size / 2);
-
-    IndicesH<G> d_i(d_ws);
-
-    ApplyControlledGateH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get());
-  }
-
-  template <unsigned G>
-  void ApplyControlledGateLH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesL<G> h_i(h_ws);
-    auto d = GetIndicesLC(num_qubits, qs, cqs, cvals, h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + G + cqs.size();
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-    unsigned threads = 32;
-    unsigned blocks = size;
-
-    IndicesL<G> d_i(d_ws);
-
-    ApplyControlledGateLH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
-        d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get());
-  }
-
-  template <unsigned G>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesLC<G> h_i(h_ws);
-    auto d = GetIndicesLCL(num_qubits, qs, cqs, cvals, h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + G + cqs.size();
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-    unsigned threads = 32;
-    unsigned blocks = size;
-
-    IndicesLC<G> d_i(d_ws);
-
-    ApplyControlledGateL_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis,
-        d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs,
-        1 << (5 - d.remaining_low_cqs), state.get());
-  }
-
-  template <unsigned G>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesH<G> h_i(h_ws);
-    GetIndicesH(num_qubits, qs, qs.size(), h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + G;
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-
-    unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U);
-    unsigned threads = 64U;
-    unsigned blocks = std::max(1U, (size / 2) >> s);
-    unsigned num_iterations_per_block = 1 << s;
-
-    constexpr unsigned m = 16;
-
-    Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex));
-    Complex* d_res2 = d_res1 + blocks;
-
-    IndicesH<G> d_i(d_ws);
-
-    ExpectationValueH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block,
-        state.get(), Plus<double>(), d_res1);
-
-    double mul = size == 1 ? 0.5 : 1.0;
-
-    return ExpectationValueReduceFinal<m>(blocks, mul, d_res1, d_res2);
-  }
-
-  template <unsigned G>
-  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesL<G> h_i(h_ws);
-    auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + num_effective_qs;
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-
-    unsigned s = std::min(n >= 13 ? n - 13 : 0, 5U);
-    unsigned threads = 32;
-    unsigned blocks = size >> s;
-    unsigned num_iterations_per_block = 1 << s;
-
-    constexpr unsigned m = 16;
-
-    Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex));
-    Complex* d_res2 = d_res1 + blocks;
-
-    IndicesL<G> d_i(d_ws);
-
-    ExpectationValueL_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
-        num_iterations_per_block, state.get(), Plus<double>(), d_res1);
-
-    double mul = double(1 << (5 + num_effective_qs - G)) / 32;
-
-    return ExpectationValueReduceFinal<m>(blocks, mul, d_res1, d_res2);
-  }
-
-  template <unsigned m>
-  std::complex<double> ExpectationValueReduceFinal(
-      unsigned blocks, double mul,
-      const Complex* d_res1, Complex* d_res2) const {
-    Complex res2[m];
-
-    if (blocks <= 16) {
-      ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex),
-                            cudaMemcpyDeviceToHost));
-    } else {
-      unsigned threads2 = std::min(1024U, blocks);
-      unsigned blocks2 = std::min(m, blocks / threads2);
-
-      unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2));
-      unsigned bytes = threads2 * sizeof(Complex);
-
-      Reduce2Kernel<Complex><<<blocks2, threads2, bytes>>>(
-          dblocks, blocks, Plus<Complex>(), Plus<double>(), d_res1, d_res2);
-
-      ErrorCheck(cudaMemcpy(res2, d_res2, blocks2 * sizeof(Complex),
-                            cudaMemcpyDeviceToHost));
-
-      blocks = blocks2;
-    }
-
-    double re = 0;
-    double im = 0;
-
-    for (unsigned i = 0; i < blocks; ++i) {
-      re += res2[i].re;
-      im += res2[i].im;
-    }
-
-    return {mul * re, mul * im};
-  }
-
-  template <typename AQ>
-  unsigned GetHighQubits(const std::vector<unsigned>& qs, unsigned qi,
-                         const std::vector<unsigned>& cqs, unsigned ci,
-                         unsigned ai, idx_type& cmaskh, AQ& aqs) const {
-    while (1) {
-      if (qi < qs.size() && (ci == cqs.size() || qs[qi] < cqs[ci])) {
-        aqs[ai++] = qs[qi++];
-      } else if (ci < cqs.size()) {
-        cmaskh |= idx_type{1} << cqs[ci];
-        aqs[ai++] = cqs[ci++];
-      } else {
-        break;
-      }
-    }
-
-    return ai;
-  }
-
-  template <typename QS>
-  void GetMs(unsigned num_qubits, const QS& qs, unsigned qs_size,
-             idx_type* ms) const {
-    if (qs_size == 0) {
-      ms[0] = idx_type(-1);
-    } else {
-      idx_type xs = idx_type{1} << (qs[0] + 1);
-      ms[0] = (idx_type{1} << qs[0]) - 1;
-      for (unsigned i = 1; i < qs_size; ++i) {
-        ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs - 1);
-        xs = idx_type{1} << (qs[i] + 1);
-      }
-      ms[qs_size] = ((idx_type{1} << num_qubits) - 1) ^ (xs - 1);
-    }
-  }
-
-  template <typename QS>
-  void GetXss(unsigned num_qubits, const QS& qs, unsigned qs_size,
-              idx_type* xss) const {
-    if (qs_size == 0) {
-      xss[0] = 0;
-    } else {
-      unsigned g = qs_size;
-      unsigned gsize = 1 << qs_size;
-
-      idx_type xs[64];
-
-      xs[0] = idx_type{1} << (qs[0] + 1);
-      for (unsigned i = 1; i < g; ++i) {
-        xs[i] = idx_type{1} << (qs[i] + 1);
-      }
-
-      for (unsigned i = 0; i < gsize; ++i) {
-        idx_type a = 0;
-        for (unsigned k = 0; k < g; ++k) {
-          a += xs[k] * ((i >> k) & 1);
-        }
-        xss[i] = a;
-      }
-    }
-  }
-
-  template <unsigned G, typename qs_type>
-  void GetIndicesH(unsigned num_qubits, const qs_type& qs, unsigned qs_size,
-                   IndicesH<G>& indices) const {
-    if (qs_size == 0) {
-      indices.ms[0] = idx_type(-1);
-      indices.xss[0] = 0;
-    } else {
-      unsigned g = qs_size;
-      unsigned gsize = 1 << qs_size;
-
-      idx_type xs[64];
-
-      xs[0] = idx_type{1} << (qs[0] + 1);
-      indices.ms[0] = (idx_type{1} << qs[0]) - 1;
-      for (unsigned i = 1; i < g; ++i) {
-        xs[i] = idx_type{1} << (qs[i] + 1);
-        indices.ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs[i - 1] - 1);
-      }
-      indices.ms[g] = ((idx_type{1} << num_qubits) - 1) ^ (xs[g - 1] - 1);
-
-      for (unsigned i = 0; i < gsize; ++i) {
-        idx_type a = 0;
-        for (unsigned k = 0; k < g; ++k) {
-          a += xs[k] * ((i >> k) & 1);
-        }
-        indices.xss[i] = a;
-      }
-    }
-  }
-
-  template <unsigned G>
-  void GetIndicesL(unsigned num_effective_qs, unsigned qmask,
-                   IndicesL<G>& indices) const {
-    for (unsigned i = num_effective_qs + 1; i < (G + 1); ++i) {
-      indices.ms[i] = 0;
-    }
-
-    for (unsigned i = (1 << num_effective_qs); i < indices.gsize; ++i) {
-      indices.xss[i] = 0;
-    }
-
-    for (unsigned i = 0; i < indices.gsize; ++i) {
-      indices.qis[i] = bits::ExpandBits(i, 5 + num_effective_qs, qmask);
-    }
-
-    unsigned tmask = ((1 << (5 + num_effective_qs)) - 1) ^ qmask;
-    for (unsigned i = 0; i < 32; ++i) {
-      indices.tis[i] = bits::ExpandBits(i, 5 + num_effective_qs, tmask);
-    }
-  }
-
-  template <unsigned G>
-  unsigned GetIndicesL(unsigned num_qubits, const std::vector<unsigned>& qs,
-                       IndicesL<G>& indices) const {
-    unsigned eqs[32];
-
-    unsigned qmaskh = 0;
-    unsigned qmaskl = 0;
-
-    unsigned qi = 0;
-
-    while (qi < qs.size() && qs[qi] < 5) {
-      qmaskl |= 1 << qs[qi++];
-    }
-
-    unsigned nq = std::max(5U, num_qubits);
-    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
-
-    unsigned l = 0;
-    unsigned ei = 0;
-    unsigned num_low_qs = qi;
-
-    if (qs.size() == num_low_qs) {
-      while (ei < num_effective_qs && l++ < num_low_qs) {
-        eqs[ei] = ei + 5;
-        ++ei;
-      }
-    } else {
-      while (ei < num_effective_qs && l < num_low_qs) {
-        unsigned ei5 = ei + 5;
-        eqs[ei] = ei5;
-        if (qi < qs.size() && qs[qi] == ei5) {
-          ++qi;
-          qmaskh |= 1 << ei5;
-        } else {
-          ++l;
-        }
-        ++ei;
-      }
-
-      while (ei < num_effective_qs) {
-        eqs[ei] = qs[qi++];
-        qmaskh |= 1 << (ei + 5);
-        ++ei;
-      }
-    }
-
-    GetIndicesH(num_qubits, eqs, num_effective_qs, indices);
-    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);
-
-    return num_effective_qs;
-  }
-
-  template <unsigned G>
-  DataC GetIndicesLC(unsigned num_qubits, const std::vector<unsigned>& qs,
-                     const std::vector<unsigned>& cqs, uint64_t cvals,
-                     IndicesL<G>& indices) const {
-    unsigned aqs[64];
-    unsigned eqs[32];
-
-    unsigned qmaskh = 0;
-    unsigned qmaskl = 0;
-    idx_type cmaskh = 0;
-
-    unsigned qi = 0;
-
-    while (qi < qs.size() && qs[qi] < 5) {
-      qmaskl |= 1 << qs[qi++];
-    }
-
-    unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size()));
-    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
-
-    unsigned l = 0;
-    unsigned ai = 5;
-    unsigned ci = 0;
-    unsigned ei = 0;
-    unsigned num_low_qs = qi;
-
-    while (ai < num_qubits && l < num_low_qs) {
-      aqs[ai - 5] = ai;
-      if (qi < qs.size() && qs[qi] == ai) {
-        ++qi;
-        eqs[ei++] = ai;
-        qmaskh |= 1 << (ai - ci);
-      } else if (ci < cqs.size() && cqs[ci] == ai) {
-        ++ci;
-        cmaskh |= idx_type{1} << ai;
-      } else {
-        ++l;
-        eqs[ei++] = ai;
-      }
-      ++ai;
-    }
-
-    unsigned i = ai;
-    unsigned j = qi;
-
-    while (ei < num_effective_qs) {
-      eqs[ei++] = qs[j++];
-      qmaskh |= 1 << (i++ - ci);
-    }
-
-    unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs);
-    GetMs(num_qubits, aqs, num_aqs, indices.ms);
-    GetXss(num_qubits, eqs, num_effective_qs, indices.xss);
-    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);
-
-    idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh);
-
-    return {cvalsh, num_aqs, num_effective_qs};
-  }
-
-  template <unsigned G>
-  DataC GetIndicesLCL(unsigned num_qubits, const std::vector<unsigned>& qs,
-                      const std::vector<unsigned>& cqs, uint64_t cvals,
-                      IndicesLC<G>& indices) const {
-    unsigned aqs[64];
-    unsigned eqs[32];
-
-    unsigned qmaskh = 0;
-    unsigned qmaskl = 0;
-    idx_type cmaskh = 0;
-    idx_type cmaskl = 0;
-    idx_type cis_mask = 0;
-
-    unsigned qi = 0;
-    unsigned ci = 0;
-
-    for (unsigned k = 0; k < 5; ++k) {
-      if (qi < qs.size() && qs[qi] == k) {
-        qmaskl |= 1 << (k - ci);
-        ++qi;
-      } else if (ci < cqs.size() && cqs[ci] == k) {
-        cmaskl |= idx_type{1} << k;
-        ++ci;
-      }
-    }
-
-    unsigned num_low_qs = qi;
-    unsigned num_low_cqs = ci;
-
-    unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size()));
-    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
-
-    unsigned l = 0;
-    unsigned ai = 5;
-    unsigned ei = 0;
-    unsigned num_low = num_low_qs + num_low_cqs;
-    unsigned remaining_low_cqs = num_low_cqs;
-    unsigned effective_low_qs = num_low_qs;
-    unsigned highest_cis_bit = 0;
-
-    while (ai < num_qubits && l < num_low) {
-      aqs[ai - 5] = ai;
-      if (qi < qs.size() && qs[qi] == ai) {
-        ++qi;
-        if ((ai - ci) > 4) {
-          eqs[ei++] = ai;
-          qmaskh |= 1 << (ai - ci);
-        } else {
-          highest_cis_bit = ai;
-          cis_mask |= idx_type{1} << ai;
-          qmaskl |= 1 << (ai - ci);
-          --remaining_low_cqs;
-          ++effective_low_qs;
-        }
-      } else if (ci < cqs.size() && cqs[ci] == ai) {
-        ++ci;
-        cmaskh |= idx_type{1} << ai;
-      } else {
-        ++l;
-        if (remaining_low_cqs == 0) {
-          eqs[ei++] = ai;
-        } else {
-          highest_cis_bit = ai;
-          cis_mask |= idx_type{1} << ai;
-          --remaining_low_cqs;
-        }
-      }
-      ++ai;
-    }
-
-    unsigned i = ai;
-    unsigned j = effective_low_qs;
-
-    while (ei < num_effective_qs) {
-      eqs[ei++] = qs[j++];
-      qmaskh |= 1 << (i++ - ci);
-    }
-
-    unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs);
-    GetMs(num_qubits, aqs, num_aqs, indices.ms);
-    GetXss(num_qubits, eqs, num_effective_qs, indices.xss);
-    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);
-
-    idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh);
-    idx_type cvalsl = bits::ExpandBits(idx_type(cvals), 5, cmaskl);
-
-    cis_mask |= 31 ^ cmaskl;
-    highest_cis_bit = highest_cis_bit < 5 ? 5 : highest_cis_bit;
-    for (idx_type i = 0; i < 32; ++i) {
-      auto c = bits::ExpandBits(i, highest_cis_bit + 1, cis_mask);
-      indices.cis[i] = 2 * (c & 0xffffffe0) | (c & 0x1f) | cvalsl;
-    }
-
-    return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs};
-  }
-
-
-  void* AllocScratch(uint64_t size) const {
-    if (size > scratch_size_) {
-      if (scratch_ != nullptr) {
-        ErrorCheck(cudaFree(scratch_));
-      }
-
-      ErrorCheck(cudaMalloc(const_cast<void**>(&scratch_), size));
-
-      const_cast<uint64_t&>(scratch_size_) = size;
-    }
-
-    return scratch_;
-  }
-
-  char* d_ws;
-  char h_ws0[max_buf_size];
-  char* h_ws = (char*) h_ws0;
-
-  void* scratch_;
-  uint64_t scratch_size_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_CUDA_H_
diff --git a/qsim/simulator_cuda_kernels.h b/qsim/simulator_cuda_kernels.h
deleted file mode 100644
index e21a9d6..0000000
--- a/qsim/simulator_cuda_kernels.h
+++ /dev/null
@@ -1,683 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_CUDA_KERNELS_H_
-#define SIMULATOR_CUDA_KERNELS_H_
-
-#ifdef __NVCC__
-  #include <cuda.h>
-  #include <cuda_runtime.h>
-
-  #include "util_cuda.h"
-#elif __HIP__
-  #include <hip/hip_runtime.h>
-  #include "cuda2hip.h"
-#endif
-
-namespace qsim {
-
-template <unsigned G, typename fp_type, typename idx_type>
-__global__ void ApplyGateH_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
-    const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) {
-  // blockDim.x must be equal to 64.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned rows =
-      G < 4 ? gsize : (sizeof(fp_type) == 4 ?
-                       (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ idx_type xss[64];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  if (threadIdx.x < gsize) {
-    xss[threadIdx.x] = xss0[threadIdx.x];
-  }
-
-  if (G <= 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  __syncthreads();
-
-  idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
-  idx_type ii = i & mss[0];
-  for (unsigned j = 1; j <= G; ++j) {
-    i *= 2;
-    ii |= i & mss[j];
-  }
-
-  auto p0 = rstate + 2 * ii + threadIdx.x % 32;
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    rs[k] = *(p0 + xss[k]);
-    is[k] = *(p0 + xss[k] + 32);
-  }
-
-  for (unsigned s = 0; s < gsize / rows; ++s) {
-    if (s > 0) {
-      __syncthreads();
-
-      for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-      }
-
-      __syncthreads();
-    }
-
-    unsigned j = 0;
-
-    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-      fp_type rn = 0;
-      fp_type in = 0;
-
-      for (unsigned l = 0; l < gsize; ++l) {
-        fp_type rm = v[j++];
-        fp_type im = v[j++];
-        rn += rs[l] * rm;
-        rn -= is[l] * im;
-        in += rs[l] * im;
-        in += is[l] * rm;
-      }
-
-      *(p0 + xss[k]) = rn;
-      *(p0 + xss[k] + 32) = in;
-    }
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type>
-__global__ void ApplyGateL_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
-    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
-    const unsigned* __restrict__ tis, unsigned esize,
-    fp_type* __restrict__ rstate) {
-  // blockDim.x must be equal to 32.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned
-      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
-                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ fp_type v[2 * gsize * rows];
-  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
-
-  if (G < 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  idx_type i = 32 * idx_type{blockIdx.x};
-  idx_type ii = i & mss[0];
-  for (unsigned j = 1; j <= G; ++j) {
-    i *= 2;
-    ii |= i & mss[j];
-  }
-
-  auto p0 = rstate + 2 * ii + threadIdx.x;
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    rs0[threadIdx.x][k] = *(p0 + xss[k]);
-    is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
-  }
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    unsigned i = tis[threadIdx.x] | qis[k];
-    unsigned m = i & 0x1f;
-    unsigned n = i / 32;
-
-    rs[k] = rs0[m][n];
-    is[k] = is0[m][n];
-  }
-
-  for (unsigned s = 0; s < gsize / rows; ++s) {
-    if (s > 0) {
-      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-      }
-    }
-
-    unsigned j = 0;
-
-    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-      fp_type rn = 0;
-      fp_type in = 0;
-
-      for (unsigned l = 0; l < gsize; ++l) {
-        fp_type rm = v[j++];
-        fp_type im = v[j++];
-        rn += rs[l] * rm;
-        rn -= is[l] * im;
-        in += rs[l] * im;
-        in += is[l] * rm;
-      }
-
-      unsigned i = tis[threadIdx.x] | qis[k];
-      unsigned m = i & 0x1f;
-      unsigned n = i / 32;
-
-      rs0[m][n] = rn;
-      is0[m][n] = in;
-    }
-  }
-
-  for (unsigned k = 0; k < esize; ++k) {
-    *(p0 + xss[k]) = rs0[threadIdx.x][k];
-    *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type>
-__global__ void ApplyControlledGateH_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
-    const idx_type* __restrict__ mss, unsigned num_mss, idx_type cvalsh,
-    fp_type* __restrict__ rstate) {
-  // blockDim.x must be equal to 64.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned rows =
-      G < 4 ? gsize : (sizeof(fp_type) == 4 ?
-                           (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ idx_type xss[64];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  if (threadIdx.x < gsize) {
-    xss[threadIdx.x] = xss0[threadIdx.x];
-  }
-
-  if (G <= 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  __syncthreads();
-
-  idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
-  idx_type ii = i & mss[0];
-  for (unsigned j = 1; j < num_mss; ++j) {
-    i *= 2;
-    ii |= i & mss[j];
-  }
-
-  ii |= cvalsh;
-
-  auto p0 = rstate + 2 * ii + threadIdx.x % 32;
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    rs[k] = *(p0 + xss[k]);
-    is[k] = *(p0 + xss[k] + 32);
-  }
-
-  for (unsigned s = 0; s < gsize / rows; ++s) {
-    if (s > 0) {
-      __syncthreads();
-
-      for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-      }
-
-      __syncthreads();
-    }
-
-    unsigned j = 0;
-
-    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-      fp_type rn = 0;
-      fp_type in = 0;
-
-      for (unsigned l = 0; l < gsize; ++l) {
-        fp_type rm = v[j++];
-        fp_type im = v[j++];
-        rn += rs[l] * rm;
-        rn -= is[l] * im;
-        in += rs[l] * im;
-        in += is[l] * rm;
-      }
-
-      *(p0 + xss[k]) = rn;
-      *(p0 + xss[k] + 32) = in;
-    }
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type>
-__global__ void ApplyControlledGateLH_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
-    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
-    const unsigned* __restrict__ tis, unsigned num_mss, idx_type cvalsh,
-    unsigned esize, fp_type* __restrict__ rstate) {
-  // blockDim.x must be equal to 32.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned
-      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
-                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  idx_type i = 32 * idx_type{blockIdx.x};
-  idx_type ii = i & mss[0];
-  for (unsigned j = 1; j < num_mss; ++j) {
-    i *= 2;
-    ii |= i & mss[j];
-  }
-
-  ii |= cvalsh;
-
-  auto p0 = rstate + 2 * ii + threadIdx.x;
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    rs0[threadIdx.x][k] = *(p0 + xss[k]);
-    is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
-  }
-
-  if (G < 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    unsigned i = tis[threadIdx.x] | qis[k];
-    unsigned m = i & 0x1f;
-    unsigned n = i / 32;
-
-    rs[k] = rs0[m][n];
-    is[k] = is0[m][n];
-  }
-
-  for (unsigned s = 0; s < gsize / rows; ++s) {
-    if (s > 0) {
-      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-      }
-    }
-
-    unsigned j = 0;
-
-    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-      fp_type rn = 0;
-      fp_type in = 0;
-
-      for (unsigned l = 0; l < gsize; ++l) {
-        fp_type rm = v[j++];
-        fp_type im = v[j++];
-        rn += rs[l] * rm;
-        rn -= is[l] * im;
-        in += rs[l] * im;
-        in += is[l] * rm;
-      }
-
-      unsigned i = tis[threadIdx.x] | qis[k];
-      unsigned m = i & 0x1f;
-      unsigned n = i / 32;
-
-      rs0[m][n] = rn;
-      is0[m][n] = in;
-    }
-  }
-
-  for (unsigned k = 0; k < esize; ++k) {
-    *(p0 + xss[k]) = rs0[threadIdx.x][k];
-    *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type>
-__global__ void ApplyControlledGateL_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
-    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
-    const unsigned* __restrict__ tis, const idx_type* __restrict__ cis,
-    unsigned num_mss, idx_type cvalsh, unsigned esize, unsigned rwthreads,
-    fp_type* __restrict__ rstate) {
-  // blockDim.x must be equal to 32.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned
-      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
-                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  idx_type i = 32 * idx_type{blockIdx.x};
-  idx_type ii = i & mss[0];
-  for (unsigned j = 1; j < num_mss; ++j) {
-    i *= 2;
-    ii |= i & mss[j];
-  }
-
-  ii |= cvalsh;
-
-  auto p0 = rstate + 2 * ii + cis[threadIdx.x];
-
-  if (threadIdx.x < rwthreads) {
-    for (unsigned k = 0; k < gsize; ++k) {
-      rs0[threadIdx.x][k] = *(p0 + xss[k]);
-      is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
-    }
-  }
-
-  if (G < 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    unsigned i = tis[threadIdx.x] | qis[k];
-    unsigned m = i & 0x1f;
-    unsigned n = i / 32;
-
-    rs[k] = rs0[m][n];
-    is[k] = is0[m][n];
-  }
-
-  for (unsigned s = 0; s < gsize / rows; ++s) {
-    if (s > 0) {
-      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-      }
-    }
-
-    unsigned j = 0;
-
-    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-      fp_type rn = 0;
-      fp_type in = 0;
-
-      for (unsigned l = 0; l < gsize; ++l) {
-        fp_type rm = v[j++];
-        fp_type im = v[j++];
-        rn += rs[l] * rm;
-        rn -= is[l] * im;
-        in += rs[l] * im;
-        in += is[l] * rm;
-      }
-
-      unsigned i = tis[threadIdx.x] | qis[k];
-      unsigned m = i & 0x1f;
-      unsigned n = i / 32;
-
-      rs0[m][n] = rn;
-      is0[m][n] = in;
-    }
-  }
-
-  if (threadIdx.x < rwthreads) {
-    for (unsigned k = 0; k < esize; ++k) {
-      *(p0 + xss[k]) = rs0[threadIdx.x][k];
-      *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
-    }
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type, typename Op,
-          typename cfp_type>
-__global__ void ExpectationValueH_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
-    const idx_type* __restrict__ mss, unsigned num_iterations_per_block,
-    const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
-  // blockDim.x must be equal to 64.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned rows =
-      G < 5 ? gsize : (sizeof(fp_type) == 4 ? (G < 6 ? 4 : 8) : 8);
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ idx_type xss[64];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  if (threadIdx.x < gsize) {
-    xss[threadIdx.x] = xss0[threadIdx.x];
-  }
-
-  if (G <= 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  __syncthreads();
-
-  double re = 0;
-  double im = 0;
-
-  for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) {
-    idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter;
-
-    idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0;
-    idx_type ii = i & mss[0];
-    for (unsigned j = 1; j <= G; ++j) {
-      i *= 2;
-      ii |= i & mss[j];
-    }
-
-    auto p0 = rstate + 2 * ii + threadIdx.x % 32;
-
-    for (unsigned k = 0; k < gsize; ++k) {
-      rs[k] = *(p0 + xss[k]);
-      is[k] = *(p0 + xss[k] + 32);
-    }
-
-    for (unsigned s = 0; s < gsize / rows; ++s) {
-      if (s > 0 || iter > 0) {
-        __syncthreads();
-
-        for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-          v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-        }
-
-        __syncthreads();
-      }
-
-      unsigned j = 0;
-
-      for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-        fp_type rn = 0;
-        fp_type in = 0;
-
-        for (unsigned l = 0; l < gsize; ++l) {
-          fp_type rm = v[j++];
-          fp_type im = v[j++];
-          rn += rs[l] * rm;
-          rn -= is[l] * im;
-          in += rs[l] * im;
-          in += is[l] * rm;
-        }
-
-        re += rs[k] * rn;
-        re += is[k] * in;
-        im += rs[k] * in;
-        im -= is[k] * rn;
-      }
-    }
-  }
-
-  __shared__ cfp_type partial1[64];
-  __shared__ cfp_type partial2[2];
-
-  partial1[threadIdx.x].re = re;
-  partial1[threadIdx.x].im = im;
-
-  auto val = WarpReduce(partial1[threadIdx.x], op);
-
-  if (threadIdx.x % 32 == 0) {
-    partial2[threadIdx.x / 32] = val;
-  }
-
-  __syncthreads();
-
-  if (threadIdx.x == 0) {
-    result[blockIdx.x].re = partial2[0].re + partial2[1].re;
-    result[blockIdx.x].im = partial2[0].im + partial2[1].im;
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type,
-          typename Op, typename cfp_type>
-__global__ void ExpectationValueL_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
-    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
-    const unsigned* __restrict__ tis, unsigned num_iterations_per_block,
-    const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
-  // blockDim.x must be equal to 32.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned rows = G < 5 ? gsize : (sizeof(fp_type) == 4 ?
-                                             (G < 6 ? 4 : 2) : (G < 6 ? 2 : 1));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  if (G < 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  double re = 0;
-  double im = 0;
-
-  for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) {
-    idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter);
-    idx_type ii = i & mss[0];
-    for (unsigned j = 1; j <= G; ++j) {
-      i *= 2;
-      ii |= i & mss[j];
-    }
-
-    auto p0 = rstate + 2 * ii + threadIdx.x;
-
-    for (unsigned k = 0; k < gsize; ++k) {
-      rs0[threadIdx.x][k] = *(p0 + xss[k]);
-      is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
-    }
-
-    for (unsigned k = 0; k < gsize; ++k) {
-      unsigned i = tis[threadIdx.x] | qis[k];
-      unsigned m = i & 0x1f;
-      unsigned n = i / 32;
-
-      rs[k] = rs0[m][n];
-      is[k] = is0[m][n];
-    }
-
-    for (unsigned s = 0; s < gsize / rows; ++s) {
-      if (s > 0 || iter > 0) {
-        for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-          v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-        }
-      }
-
-      unsigned j = 0;
-
-      for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-        fp_type rn = 0;
-        fp_type in = 0;
-
-        for (unsigned l = 0; l < gsize; ++l) {
-          fp_type rm = v[j++];
-          fp_type im = v[j++];
-          rn += rs[l] * rm;
-          rn -= is[l] * im;
-          in += rs[l] * im;
-          in += is[l] * rm;
-        }
-
-        re += rs[k] * rn;
-        re += is[k] * in;
-        im += rs[k] * in;
-        im -= is[k] * rn;
-      }
-    }
-  }
-
-  __shared__ cfp_type partial[32];
-
-  partial[threadIdx.x].re = re;
-  partial[threadIdx.x].im = im;
-
-  auto val = WarpReduce(partial[threadIdx.x], op);
-
-  if (threadIdx.x == 0) {
-    result[blockIdx.x].re = val.re;
-    result[blockIdx.x].im = val.im;
-  }
-}
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_CUDA_KERNELS_H_
diff --git a/qsim/simulator_custatevec.h b/qsim/simulator_custatevec.h
deleted file mode 100644
index 40d1902..0000000
--- a/qsim/simulator_custatevec.h
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_CUSTATEVEC_H_
-#define SIMULATOR_CUSTATEVEC_H_
-
-#include <complex>
-#include <cstdint>
-#include <type_traits>
-
-#include <cublas_v2.h>
-#include <cuComplex.h>
-#include <custatevec.h>
-
-#include "io.h"
-#include "statespace_custatevec.h"
-#include "util_custatevec.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator using the NVIDIA cuStateVec library.
- */
-template <typename FP = float>
-class SimulatorCuStateVec final {
- public:
-  using StateSpace = StateSpaceCuStateVec<FP>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  static constexpr auto kStateType = StateSpace::kStateType;
-  static constexpr auto kMatrixType = StateSpace::kMatrixType;
-  static constexpr auto kExpectType = StateSpace::kExpectType;
-  static constexpr auto kComputeType = StateSpace::kComputeType;
-  static constexpr auto kMatrixLayout = StateSpace::kMatrixLayout;
-
-  explicit SimulatorCuStateVec(const cublasHandle_t& cublas_handle,
-                               const custatevecHandle_t& custatevec_handle)
-      : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle),
-      workspace_(nullptr), workspace_size_(0) {}
-
-  ~SimulatorCuStateVec() {
-    ErrorCheck(cudaFree(workspace_));
-  }
-
-  /**
-   * Applies a gate using the NVIDIA cuStateVec library.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    if (qs.size() == 0) {
-      uint64_t size = uint64_t{1} << state.num_qubits();
-
-      if (StateSpace::is_float) {
-        cuComplex a = {matrix[0], matrix[1]};
-        auto p = (cuComplex*) state.get();
-        ErrorCheck(cublasCscal(cublas_handle_, size, &a, p, 1));
-      } else {
-        cuDoubleComplex a = {matrix[0], matrix[1]};
-        auto p = (cuDoubleComplex*) state.get();
-        ErrorCheck(cublasZscal(cublas_handle_, size, &a, p, 1));
-      }
-    } else {
-      auto workspace_size = ApplyGateWorkSpaceSize(
-          state.num_qubits(), qs.size(), 0, matrix);
-      AllocWorkSpace(workspace_size);
-
-      ErrorCheck(custatevecApplyMatrix(
-                     custatevec_handle_, state.get(), kStateType,
-                     state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0,
-                     (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0,
-                     kComputeType, workspace_, workspace_size));
-    }
-  }
-
-  /**
-   * Applies a controlled gate using the NVIDIA cuStateVec library.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cmask Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cmask,
-                           const fp_type* matrix, State& state) const {
-    if (qs.size() == 0) {
-      IO::errorf(
-          "error: controlled global phase gate is not implemented %s %d\n",
-          __FILE__, __LINE__);
-      exit(1);
-    } else {
-      std::vector<int32_t> control_bits;
-      control_bits.reserve(cqs.size());
-
-      for (std::size_t i = 0; i < cqs.size(); ++i) {
-        control_bits.push_back((cmask >> i) & 1);
-      }
-
-      auto workspace_size = ApplyGateWorkSpaceSize(
-          state.num_qubits(), qs.size(), cqs.size(), matrix);
-      AllocWorkSpace(workspace_size);
-
-      ErrorCheck(custatevecApplyMatrix(
-                     custatevec_handle_, state.get(), kStateType,
-                     state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0,
-                     (int32_t*) qs.data(), qs.size(),
-                     (int32_t*) cqs.data(), control_bits.data(), cqs.size(),
-                     kComputeType, workspace_, workspace_size));
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using the NVIDIA cuStateVec
-   * library.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    auto workspace_size = ExpectationValueWorkSpaceSize(
-        state.num_qubits(), qs.size(), matrix);
-    AllocWorkSpace(workspace_size);
-
-    cuDoubleComplex eval;
-
-    ErrorCheck(custatevecComputeExpectation(
-                   custatevec_handle_, state.get(), kStateType,
-                   state.num_qubits(), &eval, kExpectType, nullptr, matrix,
-                   kMatrixType, kMatrixLayout, (int32_t*) qs.data(), qs.size(),
-                   kComputeType, workspace_, workspace_size));
-
-    return {cuCreal(eval), cuCimag(eval)};
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 32;
-  }
-
- private:
-  size_t ApplyGateWorkSpaceSize(
-      unsigned num_qubits, unsigned num_targets, unsigned num_controls,
-      const fp_type* matrix) const {
-    size_t size;
-
-    ErrorCheck(custatevecApplyMatrixGetWorkspaceSize(
-                   custatevec_handle_, kStateType, num_qubits, matrix,
-                   kMatrixType, kMatrixLayout, 0, num_targets, num_controls,
-                   kComputeType, &size));
-
-    return size;
-  }
-
-  size_t ExpectationValueWorkSpaceSize(
-      unsigned num_qubits, unsigned num_targets, const fp_type* matrix) const {
-    size_t size;
-
-    ErrorCheck(custatevecComputeExpectationGetWorkspaceSize(
-                   custatevec_handle_, kStateType, num_qubits, matrix,
-                   kMatrixType, kMatrixLayout, num_targets, kComputeType,
-                   &size));
-
-    return size;
-  }
-
-  void* AllocWorkSpace(size_t size) const {
-    if (size > workspace_size_) {
-      if (workspace_ != nullptr) {
-        ErrorCheck(cudaFree(workspace_));
-      }
-
-      ErrorCheck(cudaMalloc(const_cast<void**>(&workspace_), size));
-
-      const_cast<uint64_t&>(workspace_size_) = size;
-    }
-
-    return workspace_;
-  }
-
-  const cublasHandle_t cublas_handle_;
-  const custatevecHandle_t custatevec_handle_;
-
-  void* workspace_;
-  size_t workspace_size_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_CUSTATEVEC_H_
diff --git a/qsim/simulator_sse.h b/qsim/simulator_sse.h
deleted file mode 100644
index 5256c53..0000000
--- a/qsim/simulator_sse.h
+++ /dev/null
@@ -1,864 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_SSE_H_
-#define SIMULATOR_SSE_H_
-
-#include <smmintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "statespace_sse.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator with SSE vectorization.
- */
-template <typename For>
-class SimulatorSSE final : public SimulatorBase {
- public:
-  using StateSpace = StateSpaceSSE<For>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  template <typename... ForArgs>
-  explicit SimulatorSSE(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using SSE instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 0:
-      ApplyGateH<0>(qs, matrix, state);
-      break;
-    case 1:
-      if (qs[0] > 1) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 1) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 1) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 1) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 1) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 1) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using SSE instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 0:
-      if (cqs[0] > 1) {
-        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
-      } else {
-        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
-      }
-      break;
-    case 1:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using SSE instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 1) {
-        return ExpectationValueH<1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 1) {
-        return ExpectationValueH<2>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        return ExpectationValueL<1, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 1) {
-        return ExpectationValueH<3>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        return ExpectationValueL<2, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<1, 2>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 1) {
-        return ExpectationValueH<4>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        return ExpectationValueL<3, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<2, 2>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 1) {
-        return ExpectationValueH<5>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        return ExpectationValueL<4, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<3, 2>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 1) {
-        return ExpectationValueH<6>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        return ExpectationValueL<5, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<4, 2>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-
-    return 0;
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 4;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 ru, iu, rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm_set1_ps(v[j]);
-        iu = _mm_set1_ps(v[j + 1]);
-        rn = _mm_mul_ps(rs[0], ru);
-        in = _mm_mul_ps(rs[0], iu);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm_set1_ps(v[j]);
-          iu = _mm_set1_ps(v[j + 1]);
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss,
-                unsigned q0, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m128 rn, in;
-      __m128 rs[gsize], is[gsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm_load_ps(p0 + xss[k]);
-        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
-
-        if (L == 1) {
-          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
-                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
-                               : _mm_shuffle_ps(is[k2], is[k2], 78);
-        } else if (L == 2) {
-          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
-          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
-          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
-          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
-          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, ms, xss, qs[0], state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 ru, iu, rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm_set1_ps(v[j]);
-        iu = _mm_set1_ps(v[j + 1]);
-        rn = _mm_mul_ps(rs[0], ru);
-        in = _mm_mul_ps(rs[0], iu);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm_set1_ps(v[j]);
-          iu = _mm_set1_ps(v[j + 1]);
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-    FillControlledMatrixH<H, 2>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 2 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, unsigned q0, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m128 rn, in;
-      __m128 rs[gsize], is[gsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm_load_ps(p0 + xss[k]);
-        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
-
-        if (L == 1) {
-          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
-                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
-                               : _mm_shuffle_ps(is[k2], is[k2], 78);
-        } else if (L == 2) {
-          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
-          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
-          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
-          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
-          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H + L)];
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-
-    unsigned r = 2 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    if (CH) {
-      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
-      FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get());
-    } else {
-      auto m = GetMasks10<L, 2>(state.num_qubits(), qs, cqs, cvals);
-      FillControlledMatrixL<H, L, 2>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get());
-    }
-  }
-
-  template <unsigned H>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss,
-                const fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 ru, iu, rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm_set1_ps(v[j]);
-        iu = _mm_set1_ps(v[j + 1]);
-        rn = _mm_mul_ps(rs[0], ru);
-        in = _mm_mul_ps(rs[0], iu);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm_set1_ps(v[j]);
-          iu = _mm_set1_ps(v[j + 1]);
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
-
-          j += 2;
-        }
-
-        __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[k], rn), _mm_mul_ps(is[k], in));
-        __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[k], in), _mm_mul_ps(is[k], rn));
-
-        re += detail::HorizontalSumSSE(v_re);
-        im += detail::HorizontalSumSSE(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, unsigned q0,
-                const fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m128 rn, in;
-      __m128 rs[gsize], is[gsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm_load_ps(p0 + xss[k]);
-        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
-
-        if (L == 1) {
-          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
-                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
-                               : _mm_shuffle_ps(is[k2], is[k2], 78);
-        } else if (L == 2) {
-          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
-          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
-          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
-          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
-          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
-        }
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        unsigned m = lsize * k;
-
-        __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in));
-        __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn));
-
-        re += detail::HorizontalSumSSE(v_re);
-        im += detail::HorizontalSumSSE(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], state.get());
-  }
-
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_SSE_H_
diff --git a/qsim/statespace.h b/qsim/statespace.h
deleted file mode 100644
index 2b0c9af..0000000
--- a/qsim/statespace.h
+++ /dev/null
@@ -1,145 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_H_
-#define STATESPACE_H_
-
-#include <cmath>
-#include <cstdint>
-#include <cstdlib>
-#include <vector>
-
-#include "util.h"
-
-namespace qsim {
-
-/**
- * Abstract class containing context and routines for general state-vector
- * manipulations. "AVX", "AVX512", "Basic", and "SSE" implementations are
- * provided.
- */
-template <typename Impl,
-          template<typename...> class VectorSpace, typename... VSTypeParams>
-class StateSpace : public VectorSpace<Impl, VSTypeParams...> {
- private:
-  using Base = VectorSpace<Impl, VSTypeParams...>;
-
- public:
-  using fp_type = typename Base::fp_type;
-  using State = typename Base::Vector;
-
-  /**
-   * The observed state from a Measurement gate.
-   */
-  struct MeasurementResult {
-    /**
-     * A bitmask of all qubits measured in this result. In this format, if the
-     * qubit at index `i` is measured, the `i`th bit of `mask` is a one.
-     */
-    uint64_t mask;
-    /**
-     * A bitwise representation of the measured states. In this format, the
-     * qubit at index `i` is represented by the `i`th bit of `bits`.
-     * If `valid` is true, `mask` has already been applied to this field
-     * (i.e. `bits == bits & mask`).
-     */
-    uint64_t bits;
-    /**
-     * Observed states of the measured qubits. This vector only includes qubits
-     * specified by the associated Measurement gate.
-     */
-    std::vector<unsigned> bitstring;
-    /**
-     * Validation bit. If this is false, the measurement failed and all other
-     * fields of the result are invalid.
-     */
-    bool valid;
-  };
-
-  template <typename... Args>
-  StateSpace(Args&&... args) : Base(args...) {}
-
-  double Norm(const State& state) const {
-    auto partial_norms = static_cast<const Impl&>(*this).PartialNorms(state);
-
-    double norm = partial_norms[0];
-    for (std::size_t i = 1; i < partial_norms.size(); ++i) {
-      norm += partial_norms[i];
-    }
-
-    return norm;
-  }
-
-  template <typename RGen>
-  MeasurementResult Measure(const std::vector<unsigned>& qubits,
-                            RGen& rgen, State& state) const {
-    auto result =
-        static_cast<const Impl&>(*this).VirtualMeasure(qubits, rgen, state);
-
-    if (result.valid) {
-      static_cast<const Impl&>(*this).Collapse(result, state);
-    }
-
-    return result;
-  }
-
-  template <typename RGen>
-  MeasurementResult VirtualMeasure(const std::vector<unsigned>& qubits,
-                                   RGen& rgen, const State& state) const {
-    MeasurementResult result;
-
-    result.valid = true;
-    result.mask = 0;
-
-    for (auto q : qubits) {
-      if (q >= state.num_qubits()) {
-        result.valid = false;
-        return result;
-      }
-
-      result.mask |= uint64_t{1} << q;
-    }
-
-    auto partial_norms = static_cast<const Impl&>(*this).PartialNorms(state);
-
-    for (std::size_t i = 1; i < partial_norms.size(); ++i) {
-      partial_norms[i] += partial_norms[i - 1];
-    }
-
-    auto norm = partial_norms.back();
-    auto r = RandomValue(rgen, norm);
-
-    unsigned m = 0;
-    while (r > partial_norms[m]) ++m;
-    if (m > 0) {
-      r -= partial_norms[m - 1];
-    }
-
-    result.bits = static_cast<const Impl&>(*this).FindMeasuredBits(
-        m, r, result.mask, state);
-
-    result.bitstring.reserve(qubits.size());
-    result.bitstring.resize(0);
-
-    for (auto q : qubits) {
-      result.bitstring.push_back((result.bits >> q) & 1);
-    }
-
-    return result;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_H_
diff --git a/qsim/statespace_avx.h b/qsim/statespace_avx.h
deleted file mode 100644
index 876058b..0000000
--- a/qsim/statespace_avx.h
+++ /dev/null
@@ -1,497 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_AVX_H_
-#define STATESPACE_AVX_H_
-
-#include <immintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <functional>
-
-#include "statespace.h"
-#include "util.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace detail {
-
-inline __m256i GetZeroMaskAVX(uint64_t i, uint64_t mask, uint64_t bits) {
-  __m256i s1 = _mm256_setr_epi64x(i + 0, i + 2, i + 4, i + 6);
-  __m256i s2 = _mm256_setr_epi64x(i + 1, i + 3, i + 5, i + 7);
-  __m256i ma = _mm256_set1_epi64x(mask);
-  __m256i bi = _mm256_set1_epi64x(bits);
-
-  s1 = _mm256_and_si256(s1, ma);
-  s2 = _mm256_and_si256(s2, ma);
-
-  s1 = _mm256_cmpeq_epi64(s1, bi);
-  s2 = _mm256_cmpeq_epi64(s2, bi);
-
-  return _mm256_blend_epi32(s1, s2, 170);  // 10101010
-}
-
-inline double HorizontalSumAVX(__m256 s) {
-  __m128 l = _mm256_castps256_ps128(s);
-  __m128 h = _mm256_extractf128_ps(s, 1);
-  __m128 s1  = _mm_add_ps(h, l);
-  __m128 s1s = _mm_movehdup_ps(s1);
-  __m128 s2 = _mm_add_ps(s1, s1s);
-
-  return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2)));
-}
-
-}  // namespace detail
-
-/**
- * Object containing context and routines for AVX state-vector manipulations.
- * State is a vectorized sequence of eight real components followed by eight
- * imaginary components. Eight single-precison floating numbers can be loaded
- * into an AVX register.
- */
-template <typename For>
-class StateSpaceAVX :
-    public StateSpace<StateSpaceAVX<For>, VectorSpace, For, float> {
- private:
-  using Base = StateSpace<StateSpaceAVX<For>, qsim::VectorSpace, For, float>;
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit StateSpaceAVX(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  void InternalToNormalOrder(State& state) const {
-    if (state.num_qubits() == 1) {
-      fp_type* s = state.get();
-
-      s[2] = s[1];
-      s[1] = s[8];
-      s[3] = s[9];
-
-      for (uint64_t i = 4; i < 16; ++i) {
-        s[i] = 0;
-      }
-    } else if (state.num_qubits() == 2) {
-      fp_type* s = state.get();
-
-      s[6] = s[3];
-      s[4] = s[2];
-      s[2] = s[1];
-      s[1] = s[8];
-      s[3] = s[9];
-      s[5] = s[10];
-      s[7] = s[11];
-
-      for (uint64_t i = 8; i < 16; ++i) {
-        s[i] = 0;
-      }
-    } else {
-      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-        fp_type* s = p + 16 * i;
-
-        fp_type re[7];
-        fp_type im[7];
-
-        for (uint64_t i = 0; i < 7; ++i) {
-          re[i] = s[i + 1];
-          im[i] = s[i + 8];
-        }
-
-        for (uint64_t i = 0; i < 7; ++i) {
-          s[2 * i + 1] = im[i];
-          s[2 * i + 2] = re[i];
-        }
-      };
-
-      Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get());
-    }
-  }
-
-  void NormalToInternalOrder(State& state) const {
-    if (state.num_qubits() == 1) {
-      fp_type* s = state.get();
-
-      s[8] = s[1];
-      s[1] = s[2];
-      s[9] = s[3];
-
-      for (uint64_t i = 2; i < 8; ++i) {
-        s[i] = 0;
-        s[i + 8] = 0;
-      }
-    } else if (state.num_qubits() == 2) {
-      fp_type* s = state.get();
-
-      s[8] = s[1];
-      s[9] = s[3];
-      s[10] = s[5];
-      s[11] = s[7];
-      s[1] = s[2];
-      s[2] = s[4];
-      s[3] = s[6];
-
-      for (uint64_t i = 4; i < 8; ++i) {
-        s[i] = 0;
-        s[i + 8] = 0;
-      }
-    } else {
-      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-        fp_type* s = p + 16 * i;
-
-        fp_type re[7];
-        fp_type im[7];
-
-        for (uint64_t i = 0; i < 7; ++i) {
-          im[i] = s[2 * i + 1];
-          re[i] = s[2 * i + 2];
-        }
-
-        for (uint64_t i = 0; i < 7; ++i) {
-          s[i + 1] = re[i];
-          s[i + 8] = im[i];
-        }
-      };
-
-      Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get());
-    }
-  }
-
-  void SetAllZeros(State& state) const {
-    __m256 val0 = _mm256_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) {
-      _mm256_store_ps(p + 16 * i, val);
-      _mm256_store_ps(p + 16 * i + 8, val);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get());
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    __m256 val0 = _mm256_setzero_ps();
-    __m256 valu;
-
-    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
-
-    switch (state.num_qubits()) {
-    case 1:
-      valu = _mm256_set_ps(0, 0, 0, 0, 0, 0, v, v);
-      break;
-    case 2:
-      valu = _mm256_set_ps(0, 0, 0, 0, v, v, v, v);
-      break;
-    default:
-      valu = _mm256_set1_ps(v);
-      break;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                __m256& val0, __m256 valu, fp_type* p) {
-      _mm256_store_ps(p + 16 * i, valu);
-      _mm256_store_ps(p + 16 * i + 8, val0);
-    };
-
-    Base::for_.Run(
-        MinSize(state.num_qubits()) / 16, f, val0, valu, state.get());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    state.get()[0] = 1;
-  }
-
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    uint64_t k = (16 * (i / 8)) + (i % 8);
-    return std::complex<fp_type>(state.get()[k], state.get()[k + 8]);
-  }
-
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    uint64_t k = (16 * (i / 8)) + (i % 8);
-    state.get()[k] = std::real(ampl);
-    state.get()[k + 8] = std::imag(ampl);
-  }
-
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    uint64_t k = (16 * (i / 8)) + (i % 8);
-    state.get()[k] = re;
-    state.get()[k + 8] = im;
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    __m256 re_reg = _mm256_set1_ps(re);
-    __m256 im_reg = _mm256_set1_ps(im);
-
-    __m256i exclude_reg = _mm256_setzero_si256();
-    if (exclude) {
-      exclude_reg = _mm256_cmpeq_epi32(exclude_reg, exclude_reg);
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
-                uint64_t bitsv, __m256 re_n, __m256 im_n, __m256i exclude_n,
-                fp_type* p) {
-      __m256 ml = _mm256_castsi256_ps(_mm256_xor_si256(
-          detail::GetZeroMaskAVX(8 * i, maskv, bitsv), exclude_n));
-
-      __m256 re = _mm256_load_ps(p + 16 * i);
-      __m256 im = _mm256_load_ps(p + 16 * i + 8);
-
-      re = _mm256_blendv_ps(re, re_n, ml);
-      im = _mm256_blendv_ps(im, im_n, ml);
-
-      _mm256_store_ps(p + 16 * i, re);
-      _mm256_store_ps(p + 16 * i + 8, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, mask, bits, re_reg,
-                   im_reg, exclude_reg, state.get());
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, fp_type* p2) {
-      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
-      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
-      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
-      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
-
-      _mm256_store_ps(p2 + 16 * i, _mm256_add_ps(re1, re2));
-      _mm256_store_ps(p2 + 16 * i + 8, _mm256_add_ps(im1, im2));
-    };
-
-    Base::for_.Run(MinSize(src.num_qubits()) / 16, f, src.get(), dest.get());
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    __m256 r = _mm256_set1_ps(a);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m256 r, fp_type* p) {
-      __m256 re = _mm256_load_ps(p + 16 * i);
-      __m256 im = _mm256_load_ps(p + 16 * i + 8);
-
-      re = _mm256_mul_ps(re, r);
-      im = _mm256_mul_ps(im, r);
-
-      _mm256_store_ps(p + 16 * i, re);
-      _mm256_store_ps(p + 16 * i + 8, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, r, state.get());
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
-      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
-      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
-      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
-      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
-
-      __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2));
-      __m256 ip_im = _mm256_fnmadd_ps(im1, re2, _mm256_mul_ps(re1, im2));
-
-      double re = detail::HorizontalSumAVX(ip_re);
-      double im = detail::HorizontalSumAVX(ip_im);
-
-      return std::complex<double>{re, im};
-    };
-
-    using Op = std::plus<std::complex<double>>;
-    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f,
-                                Op(), state1.get(), state2.get());
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> double {
-      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
-      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
-      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
-      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
-
-      __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2));
-
-      return detail::HorizontalSumAVX(ip_re);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f,
-                                Op(), state1.get(), state2.get());
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      double norm = 0;
-      uint64_t size = MinSize(state.num_qubits()) / 16;
-      const fp_type* p = state.get();
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 8; ++j) {
-          double re = p[16 * k + j];
-          double im = p[16 * k + 8 + j];
-          norm += re * re + im * im;
-        }
-      }
-
-      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
-
-      uint64_t m = 0;
-      double csum = 0;
-      bitstrings.reserve(num_samples);
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 8; ++j) {
-          double re = p[16 * k + j];
-          double im = p[16 * k + 8 + j];
-          csum += re * re + im * im;
-          while (rs[m] < csum && m < num_samples) {
-            bitstrings.emplace_back(8 * k + j);
-            ++m;
-          }
-        }
-      }
-
-      for (; m < num_samples; ++m) {
-        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
-      }
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    auto f1 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
-      __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits);
-
-      __m256 re = _mm256_maskload_ps(p + 16 * i, ml);
-      __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml);
-      __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re));
-
-      return detail::HorizontalSumAVX(s1);
-    };
-
-    using Op = std::plus<double>;
-    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 16, f1,
-                                       Op(), mr.mask, mr.bits, state.get());
-
-    __m256 renorm = _mm256_set1_ps(1.0 / std::sqrt(norm));
-
-    auto f2 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, __m256 renorm, fp_type* p) {
-      __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits);
-
-      __m256 re = _mm256_maskload_ps(p + 16 * i, ml);
-      __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml);
-
-      re = _mm256_mul_ps(re, renorm);
-      im = _mm256_mul_ps(im, renorm);
-
-      _mm256_store_ps(p + 16 * i, re);
-      _mm256_store_ps(p + 16 * i + 8, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 16, f2,
-                   mr.mask, mr.bits, renorm, state.get());
-  }
-
-  std::vector<double> PartialNorms(const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p) -> double {
-      __m256 re = _mm256_load_ps(p + 16 * i);
-      __m256 im = _mm256_load_ps(p + 16 * i + 8);
-      __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re));
-
-      return detail::HorizontalSumAVX(s1);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduceP(
-        MinSize(state.num_qubits()) / 16, f, Op(), state.get());
-  }
-
-  uint64_t FindMeasuredBits(
-      unsigned m, double r, uint64_t mask, const State& state) const {
-    double csum = 0;
-
-    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 16, m);
-    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 16, m);
-
-    const fp_type* p = state.get();
-
-    for (uint64_t k = k0; k < k1; ++k) {
-      for (uint64_t j = 0; j < 8; ++j) {
-        auto re = p[16 * k + j];
-        auto im = p[16 * k + j + 8];
-        csum += re * re + im * im;
-        if (r < csum) {
-          return (8 * k + j) & mask;
-        }
-      }
-    }
-
-    // Return the last bitstring in the unlikely case of underflow.
-    return (8 * k1 - 1) & mask;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_AVX_H_
diff --git a/qsim/statespace_avx512.h b/qsim/statespace_avx512.h
deleted file mode 100644
index 879fd89..0000000
--- a/qsim/statespace_avx512.h
+++ /dev/null
@@ -1,448 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_AVX512_H_
-#define STATESPACE_AVX512_H_
-
-#include <immintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <functional>
-
-#include "statespace.h"
-#include "util.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace detail {
-
-inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) {
-  __m512i s1 = _mm512_setr_epi64(
-      i + 0, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7);
-  __m512i s2 = _mm512_setr_epi64(
-      i + 8, i + 9, i + 10, i + 11, i + 12, i + 13, i + 14, i + 15);
-  __m512i ma = _mm512_set1_epi64(mask);
-  __m512i bi = _mm512_set1_epi64(bits);
-
-  s1 = _mm512_and_si512(s1, ma);
-  s2 = _mm512_and_si512(s2, ma);
-
-  unsigned m1 = _mm512_cmpeq_epu64_mask(s1, bi);
-  unsigned m2 = _mm512_cmpeq_epu64_mask(s2, bi);
-
-  return (m2 << 8) | m1;
-}
-
-inline double HorizontalSumAVX(__m256 s) {
-  __m128 l = _mm256_castps256_ps128(s);
-  __m128 h = _mm256_extractf128_ps(s, 1);
-  __m128 s1  = _mm_add_ps(h, l);
-  __m128 s1s = _mm_movehdup_ps(s1);
-  __m128 s2 = _mm_add_ps(s1, s1s);
-
-  return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2)));
-}
-
-inline double HorizontalSumAVX512(__m512 s) {
-  __m256 l = _mm512_castps512_ps256(s);
-  __m512d sd = _mm512_castps_pd(s);
-  __m256d hd = _mm512_extractf64x4_pd(sd, 1);
-  __m256 h = _mm256_castpd_ps(hd);
-  __m256 p = _mm256_add_ps(h, l);
-
-  return HorizontalSumAVX(p);
-}
-
-}  // namespace detail
-
-/**
- * Object containing context and routines for AVX state-vector manipulations.
- * State is a vectorized sequence of sixteen real components followed by
- * sixteen imaginary components. Sixteen single-precison floating numbers can
- * be loaded into an AVX512 register.
- */
-template <typename For>
-class StateSpaceAVX512 :
-    public StateSpace<StateSpaceAVX512<For>, VectorSpace, For, float> {
- private:
-  using Base = StateSpace<StateSpaceAVX512<For>, qsim::VectorSpace, For, float>;
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit StateSpaceAVX512(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  void InternalToNormalOrder(State& state) const {
-    __m512i idx1 = _mm512_setr_epi32(
-        0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
-    __m512i idx2 = _mm512_setr_epi32(
-        8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                __m512i idx1, __m512i idx2, fp_type* p) {
-      __m512 v1 = _mm512_load_ps(p + 32 * i);
-      __m512 v2 = _mm512_load_ps(p + 32 * i + 16);
-
-      _mm512_store_ps(p + 32 * i,  _mm512_permutex2var_ps(v1, idx1, v2));
-      _mm512_store_ps(p + 32 * i + 16,  _mm512_permutex2var_ps(v1, idx2, v2));
-    };
-
-    Base::for_.Run(
-        MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get());
-  }
-
-  void NormalToInternalOrder(State& state) const {
-    __m512i idx1 = _mm512_setr_epi32(
-        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
-    __m512i idx2 = _mm512_setr_epi32(
-        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                __m512i idx1, __m512i idx2, fp_type* p) {
-      __m512 re = _mm512_load_ps(p + 32 * i);
-      __m512 im = _mm512_load_ps(p + 32 * i + 16);
-
-      _mm512_store_ps(p + 32 * i,  _mm512_permutex2var_ps(re, idx1, im));
-      _mm512_store_ps(p + 32 * i + 16,  _mm512_permutex2var_ps(re, idx2, im));
-    };
-
-    Base::for_.Run(
-        MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get());
-  }
-
-  void SetAllZeros(State& state) const {
-    __m512 val0 = _mm512_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) {
-      _mm512_store_ps(p + 32 * i, val0);
-      _mm512_store_ps(p + 32 * i + 16, val0);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get());
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    __m512 val0 = _mm512_setzero_ps();
-    __m512 valu;
-
-    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
-
-    switch (state.num_qubits()) {
-    case 1:
-      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v);
-      break;
-    case 2:
-      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v);
-      break;
-    case 3:
-      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v, v, v, v, v);
-      break;
-    default:
-      valu = _mm512_set1_ps(v);
-      break;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const __m512& val0, const __m512& valu, fp_type* p) {
-      _mm512_store_ps(p + 32 * i, valu);
-      _mm512_store_ps(p + 32 * i + 16, val0);
-    };
-
-    Base::for_.Run(
-        MinSize(state.num_qubits()) / 32, f, val0, valu, state.get());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    state.get()[0] = 1;
-  }
-
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    uint64_t p = (32 * (i / 16)) + (i % 16);
-    return std::complex<fp_type>(state.get()[p], state.get()[p + 16]);
-  }
-
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    uint64_t p = (32 * (i / 16)) + (i % 16);
-    state.get()[p] = std::real(ampl);
-    state.get()[p + 16] = std::imag(ampl);
-  }
-
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    uint64_t p = (32 * (i / 16)) + (i % 16);
-    state.get()[p] = re;
-    state.get()[p + 16] = im;
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    __m512 re_reg = _mm512_set1_ps(re);
-    __m512 im_reg = _mm512_set1_ps(im);
-
-    __mmask16 exclude_n = exclude ? 0xffff : 0;
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
-                uint64_t bitsv, __m512 re_n, __m512 im_n, __mmask16 exclude_n,
-                fp_type* p) {
-      __m512 re = _mm512_load_ps(p + 32 * i);
-      __m512 im = _mm512_load_ps(p + 32 * i + 16);
-
-      __mmask16 ml =
-          detail::GetZeroMaskAVX512(16 * i, maskv, bitsv) ^ exclude_n;
-
-      re = _mm512_mask_blend_ps(ml, re, re_n);
-      im = _mm512_mask_blend_ps(ml, im, im_n);
-
-      _mm512_store_ps(p + 32 * i, re);
-      _mm512_store_ps(p + 32 * i + 16, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, mask, bits,
-                   re_reg, im_reg, exclude_n, state.get());
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, fp_type* p2) {
-      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
-      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
-      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
-      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
-
-      _mm512_store_ps(p2 + 32 * i, _mm512_add_ps(re1, re2));
-      _mm512_store_ps(p2 + 32 * i + 16, _mm512_add_ps(im1, im2));
-    };
-
-    Base::for_.Run(MinSize(src.num_qubits()) / 32, f, src.get(), dest.get());
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    __m512 r = _mm512_set1_ps(a);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 r, fp_type* p) {
-      __m512 re = _mm512_load_ps(p + 32 * i);
-      __m512 im = _mm512_load_ps(p + 32 * i + 16);
-
-      _mm512_store_ps(p + 32 * i, _mm512_mul_ps(re, r));
-      _mm512_store_ps(p + 32 * i + 16, _mm512_mul_ps(im, r));
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, r, state.get());
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
-      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
-      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
-      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
-      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
-
-      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
-      __m512 ip_im = _mm512_fnmadd_ps(im1, re2, _mm512_mul_ps(re1, im2));
-
-      double re = detail::HorizontalSumAVX512(ip_re);
-      double im = detail::HorizontalSumAVX512(ip_im);
-
-      return std::complex<double>{re, im};
-    };
-
-    using Op = std::plus<std::complex<double>>;
-    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
-                                Op(), state1.get(), state2.get());
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> double {
-      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
-      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
-      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
-      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
-
-      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
-
-      return detail::HorizontalSumAVX512(ip_re);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
-                                Op(), state1.get(), state2.get());
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      double norm = 0;
-      uint64_t size = MinSize(state.num_qubits()) / 32;
-      const fp_type* p = state.get();
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 16; ++j) {
-          double re = p[32 * k + j];
-          double im = p[32 * k + 16 + j];
-          norm += re * re + im * im;
-        }
-      }
-
-      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
-
-      uint64_t m = 0;
-      double csum = 0;
-      bitstrings.reserve(num_samples);
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 16; ++j) {
-          double re = p[32 * k + j];
-          double im = p[32 * k + 16 + j];
-          csum += re * re + im * im;
-          while (rs[m] < csum && m < num_samples) {
-            bitstrings.emplace_back(16 * k + j);
-            ++m;
-          }
-        }
-      }
-
-      for (; m < num_samples; ++m) {
-        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
-      }
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    auto f1 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
-      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
-
-      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
-      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
-      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
-
-      return detail::HorizontalSumAVX512(s1);
-    };
-
-    using Op = std::plus<double>;
-    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 32, f1,
-                                       Op(), mr.mask, mr.bits, state.get());
-
-    __m512 renorm = _mm512_set1_ps(1.0 / std::sqrt(norm));
-
-    auto f2 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, __m512 renorm, fp_type* p) {
-      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
-
-      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
-      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
-
-      re = _mm512_mul_ps(re, renorm);
-      im = _mm512_mul_ps(im, renorm);
-
-      _mm512_store_ps(p + 32 * i, re);
-      _mm512_store_ps(p + 32 * i + 16, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 32, f2,
-                   mr.mask, mr.bits, renorm, state.get());
-  }
-
-  std::vector<double> PartialNorms(const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p) -> double {
-      __m512 re = _mm512_load_ps(p + 32 * i);
-      __m512 im = _mm512_load_ps(p + 32 * i + 16);
-      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
-
-      return detail::HorizontalSumAVX512(s1);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduceP(
-        MinSize(state.num_qubits()) / 32, f, Op(), state.get());
-  }
-
-  uint64_t FindMeasuredBits(
-      unsigned m, double r, uint64_t mask, const State& state) const {
-    double csum = 0;
-
-    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 32, m);
-    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 32, m);
-
-    const fp_type* p = state.get();
-
-    for (uint64_t k = k0; k < k1; ++k) {
-      for (uint64_t j = 0; j < 16; ++j) {
-        auto re = p[32 * k + j];
-        auto im = p[32 * k + j + 16];
-        csum += re * re + im * im;
-        if (r < csum) {
-          return (16 * k + j) & mask;
-        }
-      }
-    }
-
-    // Return the last bitstring in the unlikely case of underflow.
-    return (16 * k1 - 1) & mask;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_AVX512_H_
diff --git a/qsim/statespace_basic.h b/qsim/statespace_basic.h
deleted file mode 100644
index 6468483..0000000
--- a/qsim/statespace_basic.h
+++ /dev/null
@@ -1,300 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_BASIC_H_
-#define STATESPACE_BASIC_H_
-
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <functional>
-
-#include "statespace.h"
-#include "util.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-/**
- * Object containing context and routines for unoptimized state-vector
- * manipulations. State is a non-vectorized sequence of one real amplitude
- * followed by one imaginary amplitude.
- */
-template <typename For, typename FP>
-class StateSpaceBasic :
-    public StateSpace<StateSpaceBasic<For, FP>, VectorSpace, For, FP> {
- private:
-  using Base = StateSpace<StateSpaceBasic<For, FP>, qsim::VectorSpace, For, FP>;
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit StateSpaceBasic(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return 2 * (uint64_t{1} << num_qubits);
-  };
-
-  void InternalToNormalOrder(State& state) const {}
-
-  void NormalToInternalOrder(State& state) const {}
-
-  void SetAllZeros(State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-      p[2 * i] = 0;
-      p[2 * i + 1] = 0;
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get());
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    fp_type val = fp_type{1} / std::sqrt(uint64_t{1} << state.num_qubits());
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                fp_type val, fp_type* p) {
-      p[2 * i] = val;
-      p[2 * i + 1] = 0;
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, val, state.get());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    state.get()[0] = 1;
-  }
-
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    uint64_t p = 2 * i;
-    return std::complex<fp_type>(state.get()[p], state.get()[p + 1]);
-  }
-
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    uint64_t p = 2 * i;
-    state.get()[p] = std::real(ampl);
-    state.get()[p + 1] = std::imag(ampl);
-  }
-
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    uint64_t p = 2 * i;
-    state.get()[p] = re;
-    state.get()[p + 1] = im;
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
-                uint64_t bitsv, fp_type re_n, fp_type im_n, bool excludev,
-                fp_type* p) {
-      auto s = p + 2 * i;
-      bool in_mask = (i & maskv) == bitsv;
-      in_mask ^= excludev;
-      s[0] = in_mask ? re_n : s[0];
-      s[1] = in_mask ? im_n : s[1];
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, mask, bits, re, im,
-                   exclude, state.get());
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, fp_type* p2) {
-      p2[2 * i] += p1[2 * i];
-      p2[2 * i + 1] += p1[2 * i + 1];
-    };
-
-    Base::for_.Run(MinSize(src.num_qubits()) / 2, f, src.get(), dest.get());
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, fp_type a, fp_type* p) {
-      p[2 * i] *= a;
-      p[2 * i + 1] *= a;
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, a, state.get());
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
-      auto s1 = p1 + 2 * i;
-      auto s2 = p2 + 2 * i;
-
-      double re = s1[0] * s2[0] + s1[1] * s2[1];
-      double im = s1[0] * s2[1] - s1[1] * s2[0];
-
-      return std::complex<double>{re, im};
-    };
-
-    using Op = std::plus<std::complex<double>>;
-    return Base::for_.RunReduce(
-        MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get());
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> double {
-      auto s1 = p1 + 2 * i;
-      auto s2 = p2 + 2 * i;
-
-      return s1[0] * s2[0] + s1[1] * s2[1];
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduce(
-        MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get());
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      double norm = 0;
-      uint64_t size = MinSize(state.num_qubits()) / 2;
-
-      const fp_type* p = state.get();
-
-      for (uint64_t k = 0; k < size; ++k) {
-        double re = p[2 * k];
-        double im = p[2 * k + 1];
-        norm += re * re + im * im;
-      }
-
-      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
-
-      uint64_t m = 0;
-      double csum = 0;
-      bitstrings.reserve(num_samples);
-
-      for (uint64_t k = 0; k < size; ++k) {
-        double re = p[2 * k];
-        double im = p[2 * k + 1];
-        csum += re * re + im * im;
-        while (rs[m] < csum && m < num_samples) {
-          bitstrings.emplace_back(k);
-          ++m;
-        }
-      }
-
-      for (; m < num_samples; ++m) {
-        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
-      }
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    auto f1 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
-      auto s = p + 2 * i;
-      return (i & mask) == bits ? s[0] * s[0] + s[1] * s[1] : 0;
-    };
-
-    using Op = std::plus<double>;
-    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 2, f1,
-                                       Op(), mr.mask, mr.bits, state.get());
-
-    double renorm = 1.0 / std::sqrt(norm);
-
-    auto f2 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, fp_type renorm, fp_type* p) {
-      auto s = p + 2 * i;
-      bool not_zero = (i & mask) == bits;
-
-      s[0] = not_zero ? s[0] * renorm : 0;
-      s[1] = not_zero ? s[1] * renorm : 0;
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f2,
-                   mr.mask, mr.bits, renorm, state.get());
-  }
-
-  std::vector<double> PartialNorms(const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p) -> double {
-      auto s = p + 2 * i;
-      return s[0] * s[0] + s[1] * s[1];
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduceP(
-        MinSize(state.num_qubits()) / 2, f, Op(), state.get());
-  }
-
-  uint64_t FindMeasuredBits(
-      unsigned m, double r, uint64_t mask, const State& state) const {
-    double csum = 0;
-
-    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 2, m);
-    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 2, m);
-
-    const fp_type* p = state.get();
-
-    for (uint64_t k = k0; k < k1; ++k) {
-      auto re = p[2 * k];
-      auto im = p[2 * k + 1];
-      csum += re * re + im * im;
-      if (r < csum) {
-        return k & mask;
-      }
-    }
-
-    // Return the last bitstring in the unlikely case of underflow.
-    return (k1 - 1) & mask;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_BASIC_H_
diff --git a/qsim/statespace_cuda.h b/qsim/statespace_cuda.h
deleted file mode 100644
index 660db07..0000000
--- a/qsim/statespace_cuda.h
+++ /dev/null
@@ -1,470 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_CUDA_H_
-#define STATESPACE_CUDA_H_
-
-#ifdef __NVCC__
-  #include <cuda.h>
-#elif __HIP__
-  #include <hip/hip_runtime.h>
-  #include "cuda2hip.h"
-#endif
-
-#include <algorithm>
-#include <complex>
-#include <cstdint>
-
-#include "statespace.h"
-#include "statespace_cuda_kernels.h"
-#include "vectorspace_cuda.h"
-#include "util_cuda.h"
-
-namespace qsim {
-
-/**
- * Object containing context and routines for CUDA state-vector manipulations.
- * State is a vectorized sequence of 32 real components followed by 32
- * imaginary components. 32 floating numbers can be proccessed in parallel by
- * a single warp. It is not recommended to use `GetAmpl` and `SetAmpl`.
- */
-template <typename FP = float>
-class StateSpaceCUDA :
-    public StateSpace<StateSpaceCUDA<FP>, VectorSpaceCUDA, FP> {
- private:
-  using Base = StateSpace<StateSpaceCUDA<FP>, qsim::VectorSpaceCUDA, FP>;
-
- protected:
-  struct Grid {
-    unsigned threads;
-    unsigned dblocks;
-    unsigned blocks;
-  };
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  struct Parameter {
-    /**
-     * The number of threads per block.
-     * Should be 2 to the power of k, where k is in the range [5,10].
-     */
-    unsigned num_threads = 512;
-    /**
-     * The number of data blocks. Each thread processes num_dblocks data
-     * blocks in reductions (norms, inner products, etc).
-     */
-    unsigned num_dblocks = 16;
-  };
-
-  explicit StateSpaceCUDA(const Parameter& param)
-      : param_(param), scratch_(nullptr), scratch_size_(0) {}
-
-  virtual ~StateSpaceCUDA() {
-    if (scratch_ != nullptr) {
-      ErrorCheck(cudaFree(scratch_));
-    }
-  }
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  void InternalToNormalOrder(State& state) const {
-    uint64_t size = MinSize(state.num_qubits()) / 2;
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-    unsigned bytes = 2 * threads * sizeof(fp_type);
-
-    InternalToNormalOrderKernel<<<blocks, threads, bytes>>>(state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  void NormalToInternalOrder(State& state) const {
-    uint64_t size = MinSize(state.num_qubits()) / 2;
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-    unsigned bytes = 2 * threads * sizeof(fp_type);
-
-    NormalToInternalOrderKernel<<<blocks, threads, bytes>>>(state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  void SetAllZeros(State& state) const {
-    ErrorCheck(cudaMemset(state.get(), 0,
-               MinSize(state.num_qubits()) * sizeof(fp_type)));
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    uint64_t size = MinSize(state.num_qubits()) / 2;
-    uint64_t hsize = uint64_t{1} << state.num_qubits();
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-
-    fp_type v = double{1} / std::sqrt(hsize);
-
-    SetStateUniformKernel<<<blocks, threads>>>(v, hsize, state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    fp_type one[1] = {1};
-    ErrorCheck(
-        cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // It is not recommended to use this function.
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    fp_type re, im;
-    auto p = state.get() + 64 * (i / 32) + i % 32;
-    ErrorCheck(cudaMemcpy(&re, p, sizeof(fp_type), cudaMemcpyDeviceToHost));
-    ErrorCheck(
-        cudaMemcpy(&im, p + 32, sizeof(fp_type), cudaMemcpyDeviceToHost));
-    return std::complex<fp_type>(re, im);
-  }
-
-  // It is not recommended to use this function.
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    fp_type re = std::real(ampl);
-    fp_type im = std::imag(ampl);
-    auto p = state.get() + 64 * (i / 32) + i % 32;
-    ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice));
-    ErrorCheck(
-        cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // It is not recommended to use this function.
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    auto p = state.get() + 64 * (i / 32) + i % 32;
-    ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice));
-    ErrorCheck(
-        cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    uint64_t size = MinSize(state.num_qubits()) / 2;
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-
-    BulkSetAmplKernel<<<blocks, threads>>>(
-        mask, bits, re, im, exclude, state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    uint64_t size = MinSize(src.num_qubits());
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-
-    AddKernel<<<blocks, threads>>>(src.get(), dest.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    uint64_t size = MinSize(state.num_qubits());
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-
-    MultiplyKernel<<<blocks, threads>>>(a, state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    using C = Complex<double>;
-    auto r = Reduce<C, C, Product<fp_type>>(state1, state2);
-
-    return {r.re, r.im};
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    return Reduce<double, double, RealProduct<fp_type>>(state1, state2);
-  }
-
-  double Norm(const State& state) const {
-    return Reduce<double, double, RealProduct<fp_type>>(state, state);
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 2);
-      unsigned bytes = g1.threads * sizeof(double);
-
-      unsigned scratch_size = (g1.blocks + 1) * sizeof(double)
-          + num_samples * (sizeof(uint64_t) + sizeof(DistrRealType));
-
-      void* scratch = AllocScratch(scratch_size);
-
-      double* d_res2 = (double*) scratch;
-      double* d_res1 = d_res2 + 1;
-      uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks);
-      DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples);
-
-      auto op1 = RealProduct<fp_type>();
-      auto op2 = Plus<double>();
-
-      Reduce1Kernel<double><<<g1.blocks, g1.threads, bytes>>>(
-          g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1);
-      ErrorCheck(cudaPeekAtLastError());
-      ErrorCheck(cudaDeviceSynchronize());
-
-      double norm;
-
-      if (g1.blocks == 1) {
-        ErrorCheck(
-            cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost));
-      } else {
-        Grid g2 = GetGrid2(g1.blocks);
-        unsigned bytes = g2.threads * sizeof(double);
-
-        auto op3 = Plus<double>();
-
-        Reduce2Kernel<double><<<g2.blocks, g2.threads, bytes>>>(
-            g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2);
-        ErrorCheck(cudaPeekAtLastError());
-        ErrorCheck(cudaDeviceSynchronize());
-
-        ErrorCheck(
-            cudaMemcpy(&norm, d_res2, sizeof(double), cudaMemcpyDeviceToHost));
-      }
-
-      // TODO: generate random values on the device.
-      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
-
-      ErrorCheck(cudaMemcpy(d_rs, rs.data(),
-                            num_samples * sizeof(DistrRealType),
-                            cudaMemcpyHostToDevice));
-
-      SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples,
-                                      d_rs, d_res1, state.get(), d_bitstrings);
-      ErrorCheck(cudaPeekAtLastError());
-      ErrorCheck(cudaDeviceSynchronize());
-
-      bitstrings.resize(num_samples, 0);
-
-      ErrorCheck(cudaMemcpy(bitstrings.data(), d_bitstrings,
-                            num_samples * sizeof(uint64_t),
-                            cudaMemcpyDeviceToHost));
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    using Op = RealProduct<fp_type>;
-    double r = Reduce<double, double, Op>(mr.mask, mr.bits, state, state);
-    fp_type renorm = 1 / std::sqrt(r);
-
-    uint64_t size = MinSize(state.num_qubits()) / 2;
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-
-    CollapseKernel<<<blocks, threads>>>(mr.mask, mr.bits, renorm, state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  std::vector<double> PartialNorms(const State& state) const {
-    Grid g = GetGrid1(MinSize(state.num_qubits()) / 2);
-
-    unsigned scratch_size = g.blocks * sizeof(double);
-    unsigned bytes = g.threads * sizeof(double);
-
-    double* d_res = (double*) AllocScratch(scratch_size);
-
-    auto op1 = RealProduct<fp_type>();
-    auto op2 = Plus<double>();
-
-    Reduce1Kernel<double><<<g.blocks, g.threads, bytes>>>(
-        g.dblocks, op1, op2, op2, state.get(), state.get(), d_res);
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-
-    std::vector<double> norms(g.blocks);
-
-    ErrorCheck(
-        cudaMemcpy(norms.data(), d_res, scratch_size, cudaMemcpyDeviceToHost));
-
-    return norms;
-  }
-
-  uint64_t FindMeasuredBits(
-      unsigned m, double r, uint64_t mask, const State& state) const {
-    Grid g = GetGrid1(MinSize(state.num_qubits()) / 2);
-
-    uint64_t res;
-    uint64_t* d_res = (uint64_t*) AllocScratch(sizeof(uint64_t));
-
-    FindMeasuredBitsKernel<<<1, g.threads>>>(
-        m, g.dblocks, r, state.get(), d_res);
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-
-    ErrorCheck(
-        cudaMemcpy(&res, d_res, sizeof(uint64_t), cudaMemcpyDeviceToHost));
-
-    return res & mask;
-  }
-
- protected:
-  Parameter param_;
-
-  void* AllocScratch(uint64_t size) const {
-    if (size > scratch_size_) {
-      if (scratch_ != nullptr) {
-        ErrorCheck(cudaFree(scratch_));
-      }
-
-      ErrorCheck(cudaMalloc(const_cast<void**>(&scratch_), size));
-
-      const_cast<uint64_t&>(scratch_size_) = size;
-    }
-
-    return scratch_;
-  }
-
-  Grid GetGrid1(uint64_t size) const {
-    Grid grid;
-
-    grid.threads = std::min(size, uint64_t{param_.num_threads});
-    grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks});
-    grid.blocks = size / (grid.threads * grid.dblocks);
-
-    return grid;
-  }
-
-  Grid GetGrid2(unsigned size) const {
-    Grid grid;
-
-    grid.threads = std::min(param_.num_threads, std::max(32U, size));
-    grid.dblocks = std::max(1U, size / grid.threads);
-    grid.blocks = 1;
-
-    return grid;
-  }
-
-  template <typename FP1, typename FP2, typename Op>
-  FP2 Reduce(const State& state1, const State& state2) const {
-    return Reduce<FP1, FP2, Op>(0, 0, state1, state2);
-  }
-
-  template <typename FP1, typename FP2, typename Op>
-  FP2 Reduce(uint64_t mask, uint64_t bits,
-             const State& state1, const State& state2) const {
-    uint64_t size = MinSize(state1.num_qubits()) / 2;
-
-    Grid g1 = GetGrid1(size);
-    unsigned bytes = g1.threads * sizeof(FP1);
-
-    FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2));
-    FP2* d_res1 = d_res2 + 1;
-
-    auto op1 = Op();
-    auto op2 = Plus<FP1>();
-    auto op3 = Plus<typename Scalar<FP1>::type>();
-
-    if (mask == 0) {
-      Reduce1Kernel<FP1><<<g1.blocks, g1.threads, bytes>>>(
-          g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1);
-    } else {
-      Reduce1MaskedKernel<FP1><<<g1.blocks, g1.threads, bytes>>>(
-          g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(),
-          d_res1);
-    }
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-
-    FP2 result;
-
-    if (g1.blocks == 1) {
-      ErrorCheck(
-          cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost));
-    } else {
-      Grid g2 = GetGrid2(g1.blocks);
-      unsigned bytes = g2.threads * sizeof(FP2);
-
-      auto op2 = Plus<FP2>();
-      auto op3 = Plus<typename Scalar<FP2>::type>();
-
-      Reduce2Kernel<FP2><<<g2.blocks, g2.threads, bytes>>>(
-          g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2);
-      ErrorCheck(cudaPeekAtLastError());
-      ErrorCheck(cudaDeviceSynchronize());
-
-      ErrorCheck(
-          cudaMemcpy(&result, d_res2, sizeof(FP2), cudaMemcpyDeviceToHost));
-    }
-
-    return result;
-  }
-
- private:
-  void* scratch_;
-  uint64_t scratch_size_;
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_CUDA_H_
diff --git a/qsim/statespace_cuda_kernels.h b/qsim/statespace_cuda_kernels.h
deleted file mode 100644
index b54ebca..0000000
--- a/qsim/statespace_cuda_kernels.h
+++ /dev/null
@@ -1,355 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_CUDA_KERNELS_H_
-#define STATESPACE_CUDA_KERNELS_H_
-
-#ifdef __NVCC__
-  #include <cuda.h>
-#elif __HIP__
-  #include <hip/hip_runtime.h>
-  #include "cuda2hip.h"
-#endif
-
-#include "util_cuda.h"
-
-namespace qsim {
-
-namespace detail {
-
-template <typename FP1, typename FP2,
-          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
-__device__ __forceinline__ FP1 BlockReduce1(
-    uint64_t n, Op1 op1, Op2 op2, Op3 op3, const FP2* s1, const FP2* s2) {
-  extern __shared__ float shared[];
-  FP1* partial1 = (FP1*) shared;
-
-  unsigned tid = threadIdx.x;
-  unsigned warp = threadIdx.x / warp_size;
-  unsigned lane = threadIdx.x % warp_size;
-
-  uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane;
-  uint64_t k1 = k0 + 2 * n * blockDim.x;
-
-  FP1 r;
-
-  r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]);
-  while ((k0 += 2 * blockDim.x) < k1) {
-    r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]));
-  }
-
-  partial1[tid] = r;
-
-  __shared__ FP1 partial2[warp_size];
-
-  if (tid < warp_size) {
-    partial2[tid] = 0;
-  }
-
-  __syncthreads();
-
-  FP1 val = WarpReduce(partial1[tid], op3);
-
-  if (lane == 0) {
-    partial2[warp] = val;
-  }
-
-  __syncthreads();
-
-  FP1 result = 0;
-
-  if (tid < warp_size) {
-    result = WarpReduce(partial2[tid], op3);
-  }
-
-  return result;
-}
-
-template <typename FP1, typename FP2,
-          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
-__device__ __forceinline__ FP1 BlockReduce1Masked(
-    uint64_t n, uint64_t mask, uint64_t bits, Op1 op1, Op2 op2, Op3 op3,
-    const FP2* s1, const FP2* s2) {
-  extern __shared__ float shared[];
-  FP1* partial1 = (FP1*) shared;
-
-  unsigned tid = threadIdx.x;
-  unsigned warp = threadIdx.x / warp_size;
-  unsigned lane = threadIdx.x % warp_size;
-
-  uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane;
-  uint64_t k1 = k0 + 2 * n * blockDim.x;
-
-  FP1 r = 0;
-
-  if (((k0 + lane) / 2 & mask) == bits) {
-    r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]);
-  }
-  while ((k0 += 2 * blockDim.x) < k1) {
-    if (((k0 + lane) / 2 & mask) == bits) {
-      r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]));
-    }
-  }
-
-  partial1[tid] = r;
-
-  __shared__ FP1 partial2[warp_size];
-
-  if (tid < warp_size) {
-    partial2[tid] = 0;
-  }
-
-  __syncthreads();
-
-  FP1 val = WarpReduce(partial1[tid], op3);
-
-  if (lane == 0) {
-    partial2[warp] = val;
-  }
-
-  __syncthreads();
-
-  FP1 result = 0;
-
-  if (tid < warp_size) {
-    result = WarpReduce(partial2[tid], op3);
-  }
-
-  return result;
-}
-
-template <typename FP1, typename FP2,
-          typename Op2, typename Op3, unsigned warp_size = 32>
-__device__ __forceinline__ FP1 BlockReduce2(
-    uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s) {
-  extern __shared__ float shared[];
-  FP1* partial1 = (FP1*) shared;
-
-  unsigned tid = threadIdx.x;
-  uint64_t k0 = n * blockIdx.x * blockDim.x + tid;
-  uint64_t k1 = k0 + n * blockDim.x;
-
-  FP1 r = 0;
-
-  if (tid < size) {
-    r = s[k0];
-    while ((k0 += blockDim.x) < k1) {
-      r = op2(r, s[k0]);
-    }
-  }
-
-  partial1[tid] = r;
-
-  __shared__ FP1 partial2[warp_size];
-
-  if (tid < warp_size) {
-    partial2[tid] = 0;
-  }
-
-  __syncthreads();
-
-  FP1 val = WarpReduce(partial1[tid], op3);
-
-  if (threadIdx.x % warp_size == 0) {
-    partial2[threadIdx.x / warp_size] = val;
-  }
-
-  __syncthreads();
-
-  FP1 result = 0;
-
-  if (tid < warp_size) {
-    result = WarpReduce(partial2[tid], op3);
-  }
-
-  return result;
-}
-
-}  // namespace detail
-
-template <typename FP1, typename FP2, typename FP3,
-          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
-__global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3,
-                              const FP2* s1, const FP2* s2, FP3* result) {
-  FP1 sum = detail::BlockReduce1<FP1>(n, op1, op2, op3, s1, s2);
-
-  if (threadIdx.x == 0) {
-    result[blockIdx.x] = sum;
-  }
-}
-
-template <typename FP1, typename FP2, typename FP3,
-          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
-__global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits,
-                                    Op1 op1, Op2 op2, Op3 op3,
-                                    const FP2* s1, const FP2* s2, FP3* result) {
-  FP1 sum =
-      detail::BlockReduce1Masked<FP1>(n, mask, bits, op1, op2, op3, s1, s2);
-
-  if (threadIdx.x == 0) {
-    result[blockIdx.x] = sum;
-  }
-}
-
-template <typename FP1, typename FP2, typename FP3,
-          typename Op2, typename Op3, unsigned warp_size = 32>
-__global__ void Reduce2Kernel(
-    uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s, FP3* result) {
-  FP1 sum = detail::BlockReduce2<FP1>(n, size, op2, op3, s);
-
-  if (threadIdx.x == 0) {
-    result[blockIdx.x] = sum;
-  }
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void InternalToNormalOrderKernel(FP* state) {
-  unsigned lane = threadIdx.x % warp_size;
-  unsigned l = 2 * threadIdx.x - lane;
-  uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l;
-
-  extern __shared__ float shared[];
-  FP* buf = (FP*) shared;
-
-  buf[l] = state[k];
-  buf[l + warp_size] = state[k + warp_size];
-
-  __syncthreads();
-
-  state[k + lane] = buf[l];
-  state[k + lane + 1] = buf[l + warp_size];
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void NormalToInternalOrderKernel(FP* state) {
-  unsigned lane = threadIdx.x % warp_size;
-  unsigned l = 2 * threadIdx.x - lane;
-  uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l;
-
-  extern __shared__ float shared[];
-  FP* buf = (FP*) shared;
-
-  buf[l] = state[k];
-  buf[l + warp_size] = state[k + warp_size];
-
-  __syncthreads();
-
-  state[k] = buf[l + lane];
-  state[k + warp_size] = buf[l + lane + 1];
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) {
-  unsigned lane = threadIdx.x % warp_size;
-  uint64_t k = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane;
-
-  state[k] = lane < size ? v : 0;
-  state[k + warp_size] = 0;
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void AddKernel(const FP* state1, FP* state2) {
-  uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
-  state2[k] += state1[k];
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void MultiplyKernel(FP a, FP* state) {
-  uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
-  state[k] *= a;
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) {
-  uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
-  uint64_t k2 = 2 * k1 - threadIdx.x % warp_size;
-
-  if ((k1 & mask) == bits) {
-    state[k2] *= r;
-    state[k2 + warp_size] *= r;
-  } else {
-    state[k2] = 0;
-    state[k2 + warp_size] = 0;
-  }
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void BulkSetAmplKernel(
-    uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) {
-  uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
-  uint64_t k2 = 2 * k1 - threadIdx.x % warp_size;
-
-  bool set = ((k1 & mask) == bits) ^ exclude;
-
-  if (set) {
-    state[k2] = re;
-    state[k2 + warp_size] = im;
-  }
-}
-
-template <typename FP1, typename FP2, typename FP3, unsigned warp_size = 32>
-__global__ void SampleKernel(unsigned num_blocks,
-                             uint64_t n, uint64_t num_samples,
-                             const FP1* rs, const FP2* ps, const FP3* state,
-                             uint64_t *bitstrings) {
-  // Use just one thread. This can be somewhat slow.
-  if (threadIdx.x == 0) {
-    uint64_t m = 0;
-    double csum = 0;
-
-    for (unsigned block_id = 0; block_id < num_blocks; ++block_id) {
-      uint64_t km = n * blockDim.x;
-      uint64_t k0 = block_id * km;
-
-      for (uint64_t k = 0; k < km; ++k) {
-        uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32;
-        FP3 re = state[l];
-        FP3 im = state[l + warp_size];
-        csum += re * re + im * im;
-        while (rs[m] < csum && m < num_samples) {
-          bitstrings[m++] = k0 + k;
-        }
-      }
-    }
-  }
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void FindMeasuredBitsKernel(
-    uint64_t block_id, uint64_t n, double r, const FP* state, uint64_t* res) {
-  // Use just one thread. This can be somewhat slow, however, this is
-  // more or less consistent with CPU implementations.
-  if (threadIdx.x == 0) {
-    double csum = 0;
-    uint64_t km = n * blockDim.x;
-    uint64_t k0 = block_id * km;
-
-    for (uint64_t k = 0; k < km; ++k) {
-      uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32;
-      FP re = state[l];
-      FP im = state[l + warp_size];
-      csum += re * re + im * im;
-      if (r < csum) {
-        *res = k0 + k;
-        return;
-      }
-    }
-
-    *res = k0 + n * blockDim.x - 1;
-  }
-}
-
-}  // namespace qsim
-
-#endif  // STATESPACE_CUDA_KERNELS_H_
diff --git a/qsim/statespace_custatevec.h b/qsim/statespace_custatevec.h
deleted file mode 100644
index f2f5de1..0000000
--- a/qsim/statespace_custatevec.h
+++ /dev/null
@@ -1,376 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_CUSTATEVEC_H_
-#define STATESPACE_CUSTATEVEC_H_
-
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <type_traits>
-#include <vector>
-
-#include <cublas_v2.h>
-#include <cuComplex.h>
-#include <custatevec.h>
-
-#include "statespace.h"
-#include "util_custatevec.h"
-#include "vectorspace_cuda.h"
-
-namespace qsim {
-
-namespace detail {
-
-template <typename FP>
-__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) {
-  uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
-
-  if (k < size) {
-    state[2 * k] = v;
-    state[2 * k + 1] = 0;
-  }
-}
-
-}  // namespace detail
-
-/**
- * Object containing context and routines for cuStateVec state-vector
- * manipulations. It is not recommended to use `GetAmpl` and `SetAmpl`.
- */
-template <typename FP = float>
-class StateSpaceCuStateVec :
-    public StateSpace<StateSpaceCuStateVec<FP>, VectorSpaceCUDA, FP> {
- private:
-  using Base = StateSpace<StateSpaceCuStateVec<FP>, qsim::VectorSpaceCUDA, FP>;
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  static constexpr auto is_float = std::is_same<fp_type, float>::value;
-
-  static constexpr auto kStateType = is_float ? CUDA_C_32F : CUDA_C_64F;
-  static constexpr auto kMatrixType = kStateType;
-  static constexpr auto kExpectType = CUDA_C_64F;
-  static constexpr auto kComputeType =
-      is_float ? CUSTATEVEC_COMPUTE_32F : CUSTATEVEC_COMPUTE_64F;
-  static constexpr auto kMatrixLayout = CUSTATEVEC_MATRIX_LAYOUT_ROW;
-
-  explicit StateSpaceCuStateVec(const cublasHandle_t& cublas_handle,
-                                const custatevecHandle_t& custatevec_handle)
-      : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle),
-        workspace_(nullptr), workspace_size_(0) {}
-
-  virtual ~StateSpaceCuStateVec() {
-    if (workspace_ != nullptr) {
-      ErrorCheck(cudaFree(workspace_));
-    }
-  }
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return 2 * (uint64_t{1} << num_qubits);
-  };
-
-  void InternalToNormalOrder(State& state) const {
-  }
-
-  void NormalToInternalOrder(State& state) const {
-  }
-
-  void SetAllZeros(State& state) const {
-    ErrorCheck(cudaMemset(state.get(), 0,
-                          MinSize(state.num_qubits()) * sizeof(fp_type)));
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    uint64_t size = uint64_t{1} << state.num_qubits();
-
-    unsigned threads = size < 256 ? size : 256;
-    unsigned blocks = size / threads;
-
-    fp_type v = double{1} / std::sqrt(size);
-
-    detail::SetStateUniformKernel<<<blocks, threads>>>(v, size, state.get());
-    ErrorCheck(cudaPeekAtLastError());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    fp_type one[1] = {1};
-    ErrorCheck(
-        cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // It is not recommended to use this function.
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    fp_type a[2];
-    auto p = state.get() + 2 * i;
-    ErrorCheck(cudaMemcpy(a, p, 2 * sizeof(fp_type), cudaMemcpyDeviceToHost));
-    return std::complex<fp_type>(a[0], a[1]);
-  }
-
-  // It is not recommended to use this function.
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    fp_type a[2] = {std::real(ampl), std::imag(ampl)};
-    auto p = state.get() + 2 * i;
-    ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // It is not recommended to use this function.
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    fp_type a[2] = {re, im};
-    auto p = state.get() + 2 * i;
-    ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    // Not implemented.
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    // Not implemented.
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    uint64_t size = uint64_t{1} << src.num_qubits();
-
-    if (is_float) {
-      cuComplex a = {1.0, 0.0};
-      auto p1 = (const cuComplex*) src.get();
-      auto p2 = (cuComplex*) dest.get();
-      ErrorCheck(cublasCaxpy(cublas_handle_, size, &a, p1, 1, p2, 1));
-    } else {
-      cuDoubleComplex a = {1.0, 0.0};
-      auto p1 = (const cuDoubleComplex*) src.get();
-      auto p2 = (cuDoubleComplex*) dest.get();
-      ErrorCheck(cublasZaxpy(cublas_handle_, size, &a, p1, 1, p2, 1));
-    }
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    uint64_t size = uint64_t{1} << state.num_qubits();
-
-    if (is_float) {
-      float a1 = a;
-      auto p = (cuComplex*) state.get();
-      ErrorCheck(cublasCsscal(cublas_handle_, size, &a1, p, 1));
-    } else {
-      double a1 = a;
-      auto p = (cuDoubleComplex*) state.get();
-      ErrorCheck(cublasZdscal(cublas_handle_, size, &a1, p, 1));
-    }
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    uint64_t size = uint64_t{1} << state1.num_qubits();
-
-    if (is_float) {
-      cuComplex result;
-      auto p1 = (const cuComplex*) state1.get();
-      auto p2 = (const cuComplex*) state2.get();
-      ErrorCheck(cublasCdotc(cublas_handle_, size, p1, 1, p2, 1, &result));
-      return {cuCrealf(result), cuCimagf(result)};
-    } else {
-      cuDoubleComplex result;
-      auto p1 = (const cuDoubleComplex*) state1.get();
-      auto p2 = (const cuDoubleComplex*) state2.get();
-      ErrorCheck(cublasZdotc(cublas_handle_, size, p1, 1, p2, 1, &result));
-      return {cuCreal(result), cuCimag(result)};
-    }
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    return std::real(InnerProduct(state1, state2));
-  }
-
-  double Norm(const State& state) const {
-    uint64_t size = uint64_t{1} << state.num_qubits();
-
-    if (is_float) {
-      float result;
-      auto p = (const cuComplex*) state.get();
-      ErrorCheck(cublasScnrm2(cublas_handle_, size, p, 1, &result));
-      return result * result;
-    } else {
-      double result;
-      auto p = (const cuDoubleComplex*) state.get();
-      ErrorCheck(cublasDznrm2(cublas_handle_, size, p, 1, &result));
-      return result * result;
-    }
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      auto rs = GenerateRandomValues<double>(num_samples, seed, 1.0);
-
-      size_t workspace_size;
-      custatevecSamplerDescriptor_t sampler;
-
-      ErrorCheck(custatevecSamplerCreate(
-                     custatevec_handle_, state.get(), kStateType,
-                     state.num_qubits(), &sampler, num_samples,
-                     &workspace_size));
-
-      AllocWorkSpace(workspace_size);
-
-      ErrorCheck(custatevecSamplerPreprocess(
-                     custatevec_handle_, sampler, workspace_, workspace_size));
-
-      std::vector<custatevecIndex_t> bitstrings0(num_samples);
-      std::vector<int32_t> bitordering;
-
-      bitordering.reserve(state.num_qubits());
-      for (unsigned i = 0; i < state.num_qubits(); ++i) {
-        bitordering.push_back(i);
-      }
-
-      ErrorCheck(custatevecSamplerSample(
-                     custatevec_handle_, sampler, bitstrings0.data(),
-                     bitordering.data(), state.num_qubits(), rs.data(),
-                     num_samples, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER));
-
-      bitstrings.reserve(num_samples);
-      for (unsigned i = 0; i < num_samples; ++i) {
-        bitstrings.push_back(bitstrings0[i]);
-      }
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  template <typename RGen>
-  MeasurementResult Measure(const std::vector<unsigned>& qubits,
-                            RGen& rgen, State& state,
-                            bool no_collapse = false) const {
-    auto r = RandomValue(rgen, 1.0);
-
-    MeasurementResult result;
-
-    result.valid = true;
-    result.mask = 0;
-    result.bits = 0;
-    result.bitstring.resize(qubits.size(), 0);
-
-    for (auto q : qubits) {
-      if (q >= state.num_qubits()) {
-        result.valid = false;
-        return result;
-      }
-
-      result.mask |= uint64_t{1} << q;
-    }
-
-    auto collapse = no_collapse ?
-        CUSTATEVEC_COLLAPSE_NONE : CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO;
-
-    ErrorCheck(custatevecBatchMeasure(
-                   custatevec_handle_, state.get(), kStateType,
-                   state.num_qubits(), (int*) result.bitstring.data(),
-                   (int*) qubits.data(), qubits.size(), r, collapse));
-
-    for (std::size_t i = 0; i < result.bitstring.size(); ++i) {
-      result.bits |= result.bitstring[i] << qubits[i];
-    }
-
-    return result;
-  }
-
-  template <typename RGen>
-  MeasurementResult VirtualMeasure(const std::vector<unsigned>& qubits,
-                                   RGen& rgen, const State& state) const {
-    return Measure(qubits, rgen, const_cast<State&>(state), true);
-  }
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    unsigned count = 0;
-
-    std::vector<int> bitstring;
-    std::vector<int> bitordering;
-
-    bitstring.reserve(state.num_qubits());
-    bitordering.reserve(state.num_qubits());
-
-    for (unsigned i = 0; i < state.num_qubits(); ++i) {
-      if (((mr.mask >> i) & 1) != 0) {
-        bitstring.push_back((mr.bits >> i) & 1);
-        bitordering.push_back(i);
-        ++count;
-      }
-    }
-
-    ErrorCheck(custatevecCollapseByBitString(
-                   custatevec_handle_, state.get(), kStateType,
-                   state.num_qubits(), bitstring.data(), bitordering.data(),
-                   count, 1.0));
-
-    // TODO: do we need the following?
-    double norm = Norm(state);
-    Multiply(1.0 / std::sqrt(norm), state);
-  }
-
- private:
-  void* AllocWorkSpace(size_t size) const {
-    if (size > workspace_size_) {
-      if (workspace_ != nullptr) {
-        ErrorCheck(cudaFree(workspace_));
-      }
-
-      ErrorCheck(cudaMalloc(const_cast<void**>(&workspace_), size));
-
-      const_cast<uint64_t&>(workspace_size_) = size;
-    }
-
-    return workspace_;
-  }
-
-  const cublasHandle_t cublas_handle_;
-  const custatevecHandle_t custatevec_handle_;
-
-  void* workspace_;
-  size_t workspace_size_;
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_CUSTATEVEC_H_
diff --git a/qsim/statespace_sse.h b/qsim/statespace_sse.h
deleted file mode 100644
index cf41a09..0000000
--- a/qsim/statespace_sse.h
+++ /dev/null
@@ -1,462 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_SSE_H_
-#define STATESPACE_SSE_H_
-
-#include <smmintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <functional>
-
-#include "statespace.h"
-#include "util.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace detail {
-
-inline __m128i GetZeroMaskSSE(uint64_t i, uint64_t mask, uint64_t bits) {
-  __m128i s1 = _mm_set_epi64x(i + 2, i + 0);
-  __m128i s2 = _mm_set_epi64x(i + 3, i + 1);
-  __m128i ma = _mm_set1_epi64x(mask);
-  __m128i bi = _mm_set1_epi64x(bits);
-
-  s1 = _mm_and_si128(s1, ma);
-  s2 = _mm_and_si128(s2, ma);
-
-  s1 = _mm_cmpeq_epi64(s1, bi);
-  s2 = _mm_cmpeq_epi64(s2, bi);
-
-  return _mm_blend_epi16(s1, s2, 204);  // 11001100
-}
-
-inline double HorizontalSumSSE(__m128 s) {
-  __m128 ss = _mm_movehdup_ps(s);
-  __m128 s1 = _mm_add_ps(s, ss);
-
-  return _mm_cvtss_f32(_mm_add_ss(s1, _mm_movehl_ps(ss, s1)));
-}
-
-}  // namespace detail
-
-/**
- * Object containing context and routines for SSE state-vector manipulations.
- * State is a vectorized sequence of four real components followed by four
- * imaginary components. Four single-precison floating numbers can be loaded
- * into an SSE register.
- */
-template <typename For>
-class StateSpaceSSE :
-    public StateSpace<StateSpaceSSE<For>, VectorSpace, For, float> {
- private:
-  using Base = StateSpace<StateSpaceSSE<For>, qsim::VectorSpace, For, float>;
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit StateSpaceSSE(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  void InternalToNormalOrder(State& state) const {
-    if (state.num_qubits() == 1) {
-      auto s = state.get();
-
-      s[2] = s[1];
-      s[1] = s[4];
-      s[3] = s[5];
-
-      for (uint64_t i = 4; i < 8; ++i) {
-        s[i] = 0;
-      }
-    } else {
-      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-        auto s = p + 8 * i;
-
-        fp_type re[3];
-        fp_type im[3];
-
-        for (uint64_t i = 0; i < 3; ++i) {
-          re[i] = s[i + 1];
-          im[i] = s[i + 4];
-        }
-
-        for (uint64_t i = 0; i < 3; ++i) {
-          s[2 * i + 1] = im[i];
-          s[2 * i + 2] = re[i];
-        }
-      };
-
-      Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get());
-    }
-  }
-
-  void NormalToInternalOrder(State& state) const {
-    if (state.num_qubits() == 1) {
-      auto s = state.get();
-
-      s[4] = s[1];
-      s[1] = s[2];
-      s[5] = s[3];
-
-      s[2] = 0;
-      s[3] = 0;
-      s[6] = 0;
-      s[7] = 0;
-    } else {
-      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-        auto s = p + 8 * i;
-
-        fp_type re[3];
-        fp_type im[3];
-
-        for (uint64_t i = 0; i < 3; ++i) {
-          im[i] = s[2 * i + 1];
-          re[i] = s[2 * i + 2];
-        }
-
-        for (uint64_t i = 0; i < 3; ++i) {
-          s[i + 1] = re[i];
-          s[i + 4] = im[i];
-        }
-      };
-
-      Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get());
-    }
-  }
-
-  void SetAllZeros(State& state) const {
-    __m128 val0 = _mm_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) {
-      _mm_store_ps(p + 8 * i, val0);
-      _mm_store_ps(p + 8 * i + 4, val0);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get());
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    __m128 val0 = _mm_setzero_ps();
-    __m128 valu;
-
-    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
-
-    if (state.num_qubits() == 1) {
-      valu = _mm_set_ps(0, 0, v, v);
-    } else {
-      valu = _mm_set1_ps(v);
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                __m128 val0, __m128 valu, fp_type* p) {
-      _mm_store_ps(p + 8 * i, valu);
-      _mm_store_ps(p + 8 * i + 4, val0);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, valu, state.get());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    state.get()[0] = 1;
-  }
-
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    uint64_t p = (8 * (i / 4)) + (i % 4);
-    return std::complex<fp_type>(state.get()[p], state.get()[p + 4]);
-  }
-
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    uint64_t p = (8 * (i / 4)) + (i % 4);
-    state.get()[p] = std::real(ampl);
-    state.get()[p + 4] = std::imag(ampl);
-  }
-
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    uint64_t p = (8 * (i / 4)) + (i % 4);
-    state.get()[p] = re;
-    state.get()[p + 4] = im;
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val));
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    __m128 re_reg = _mm_set1_ps(re);
-    __m128 im_reg = _mm_set1_ps(im);
-    __m128i exclude_reg = _mm_setzero_si128();
-    if (exclude) {
-      exclude_reg = _mm_cmpeq_epi32(exclude_reg, exclude_reg);
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
-                uint64_t bitsv, __m128 re_n, __m128 im_n, __m128i exclude_n,
-                fp_type* p) {
-      __m128 ml = _mm_castsi128_ps(_mm_xor_si128(
-          detail::GetZeroMaskSSE(4 * i, maskv, bitsv), exclude_n));
-
-      __m128 re = _mm_load_ps(p + 8 * i);
-      __m128 im = _mm_load_ps(p + 8 * i + 4);
-
-      re = _mm_blendv_ps(re, re_n, ml);
-      im = _mm_blendv_ps(im, im_n, ml);
-
-      _mm_store_ps(p + 8 * i, re);
-      _mm_store_ps(p + 8 * i + 4, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, mask, bits, re_reg,
-                   im_reg, exclude_reg, state.get());
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, fp_type* p2) {
-      __m128 re1 = _mm_load_ps(p1 + 8 * i);
-      __m128 im1 = _mm_load_ps(p1 + 8 * i + 4);
-      __m128 re2 = _mm_load_ps(p2 + 8 * i);
-      __m128 im2 = _mm_load_ps(p2 + 8 * i + 4);
-
-      _mm_store_ps(p2 + 8 * i, _mm_add_ps(re1, re2));
-      _mm_store_ps(p2 + 8 * i + 4, _mm_add_ps(im1, im2));
-    };
-
-    Base::for_.Run(MinSize(src.num_qubits()) / 8, f, src.get(), dest.get());
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    __m128 r = _mm_set1_ps(a);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m128 r, fp_type* p) {
-      __m128 re = _mm_load_ps(p + 8 * i);
-      __m128 im = _mm_load_ps(p + 8 * i + 4);
-
-      re = _mm_mul_ps(re, r);
-      im = _mm_mul_ps(im, r);
-
-      _mm_store_ps(p + 8 * i, re);
-      _mm_store_ps(p + 8 * i + 4, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, r, state.get());
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
-      __m128 re1 = _mm_load_ps(p1 + 8 * i);
-      __m128 im1 = _mm_load_ps(p1 + 8 * i + 4);
-      __m128 re2 = _mm_load_ps(p2 + 8 * i);
-      __m128 im2 = _mm_load_ps(p2 + 8 * i + 4);
-
-      __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2));
-      __m128 ip_im = _mm_sub_ps(_mm_mul_ps(re1, im2), _mm_mul_ps(im1, re2));
-
-      double re = detail::HorizontalSumSSE(ip_re);
-      double im = detail::HorizontalSumSSE(ip_im);
-
-      return std::complex<double>{re, im};
-    };
-
-    using Op = std::plus<std::complex<double>>;
-    return Base::for_.RunReduce(
-        MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get());
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> double {
-      __m128 re1 = _mm_load_ps(p1 + 8 * i);
-      __m128 im1 = _mm_load_ps(p1 + 8 * i + 4);
-      __m128 re2 = _mm_load_ps(p2 + 8 * i);
-      __m128 im2 = _mm_load_ps(p2 + 8 * i + 4);
-
-      __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2));
-
-      return detail::HorizontalSumSSE(ip_re);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduce(
-        MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get());
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      double norm = 0;
-      uint64_t size = MinSize(state.num_qubits()) / 8;
-      const fp_type* p = state.get();
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 4; ++j) {
-          double re = p[8 * k + j];
-          double im = p[8 * k + 4 + j];
-          norm += re * re + im * im;
-        }
-      }
-
-      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
-
-      uint64_t m = 0;
-      double csum = 0;
-      bitstrings.reserve(num_samples);
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 4; ++j) {
-          double re = p[8 * k + j];
-          double im = p[8 * k + 4 + j];
-          csum += re * re + im * im;
-          while (rs[m] < csum && m < num_samples) {
-            bitstrings.emplace_back(4 * k + j);
-            ++m;
-          }
-        }
-      }
-
-      for (; m < num_samples; ++m) {
-        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
-      }
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    __m128 zero = _mm_set1_ps(0);
-
-    auto f1 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask,
-                 uint64_t bits, __m128 zero, const fp_type* p) -> double {
-      __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits));
-
-      __m128 re = _mm_load_ps(p + 8 * i);
-      __m128 im = _mm_load_ps(p + 8 * i + 4);
-      __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im));
-
-      s1 = _mm_blendv_ps(zero, s1, ml);
-
-      return detail::HorizontalSumSSE(s1);
-    };
-
-    using Op = std::plus<double>;
-    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 8, f1,
-                                       Op(), mr.mask, mr.bits, zero,
-                                       state.get());
-
-    __m128 renorm = _mm_set1_ps(1.0 / std::sqrt(norm));
-
-    auto f2 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask,
-                 uint64_t bits, __m128 renorm, __m128 zero, fp_type* p) {
-      __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits));
-
-      __m128 re = _mm_load_ps(p + 8 * i);
-      __m128 im = _mm_load_ps(p + 8 * i + 4);
-
-      re = _mm_blendv_ps(zero, _mm_mul_ps(re, renorm), ml);
-      im = _mm_blendv_ps(zero, _mm_mul_ps(im, renorm), ml);
-
-      _mm_store_ps(p + 8 * i, re);
-      _mm_store_ps(p + 8 * i + 4, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f2,
-                   mr.mask, mr.bits, renorm, zero, state.get());
-  }
-
-  std::vector<double> PartialNorms(const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p) -> double {
-      __m128 re = _mm_load_ps(p + 8 * i);
-      __m128 im = _mm_load_ps(p + 8 * i + 4);
-      __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im));
-
-      return detail::HorizontalSumSSE(s1);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduceP(
-        MinSize(state.num_qubits()) / 8, f, Op(), state.get());
-  }
-
-  uint64_t FindMeasuredBits(
-      unsigned m, double r, uint64_t mask, const State& state) const {
-    double csum = 0;
-
-    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 8, m);
-    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 8, m);
-
-    const fp_type* p = state.get();
-
-    for (uint64_t k = k0; k < k1; ++k) {
-      for (uint64_t j = 0; j < 4; ++j) {
-        auto re = p[8 * k + j];
-        auto im = p[8 * k + 4 + j];
-        csum += re * re + im * im;
-        if (r < csum) {
-          return (4 * k + j) & mask;
-        }
-      }
-    }
-
-    // Return the last bitstring in the unlikely case of underflow.
-    return (4 * k1 - 1) & mask;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_SSE_H_
diff --git a/qsim/umux.h b/qsim/umux.h
deleted file mode 100644
index 83b951b..0000000
--- a/qsim/umux.h
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UMUX_H_
-#define UMUX_H_
-
-#ifdef __AVX512F__
-# include "unitary_calculator_avx512.h"
-  namespace qsim {
-  namespace unitary {
-    template <typename For>
-    using UnitaryCalculator = UnitaryCalculatorAVX512<For>;
-  }
-  }
-#elif __AVX2__
-# include "unitary_calculator_avx.h"
-  namespace qsim {
-  namespace unitary {
-    template <typename For>
-    using UnitaryCalculator = UnitaryCalculatorAVX<For>;
-  }
-  }
-#elif __SSE4_1__
-# include "unitary_calculator_sse.h"
-  namespace qsim {
-  namespace unitary {
-    template <typename For>
-    using UnitaryCalculator = UnitaryCalculatorSSE<For>;
-  }
-  }
-#else
-# include "unitary_calculator_basic.h"
-  namespace qsim {
-  namespace unitary {
-    template <typename For>
-    using UnitaryCalculator = UnitaryCalculatorBasic<For>;
-  }
-  }
-#endif
-
-#endif  // UMUX_H_
diff --git a/qsim/unitary_calculator_avx.h b/qsim/unitary_calculator_avx.h
deleted file mode 100644
index 5e566ca..0000000
--- a/qsim/unitary_calculator_avx.h
+++ /dev/null
@@ -1,1028 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARY_CALCULATOR_AVX_H_
-#define UNITARY_CALCULATOR_AVX_H_
-
-#include <immintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "unitaryspace_avx.h"
-
-namespace qsim {
-namespace unitary {
-
-/**
- * Quantum circuit unitary calculator with AVX vectorization.
- */
-template <typename For>
-class UnitaryCalculatorAVX final : public SimulatorBase {
- public:
-  using UnitarySpace = UnitarySpaceAVX<For>;
-  using Unitary = typename UnitarySpace::Unitary;
-  using fp_type = typename UnitarySpace::fp_type;
-
-  using StateSpace = UnitarySpace;
-  using State = Unitary;
-
-  template <typename... ForArgs>
-  explicit UnitaryCalculatorAVX(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using AVX instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 2) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 2) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 2) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 2) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 3>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 2) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 3>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 2) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<3, 3>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using AVX instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 8;
-  }
-
- private:
-
-#ifdef __BMI2__
-
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    auto m = GetMasks1<H, 3>(qs);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, m.imaskh, m.qmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm256_load_ps(p0 + p);
-        is[k2] = _mm256_load_ps(p0 + p + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 3>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    auto m = GetMasks3<H, 3>(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned k = 3 + H + cqs.size();
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks4<H, 3>(state.num_qubits(), qs, cqs, cvals);
-    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 3 + H + cqs.size() - m.cl;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                const __m256i* idx, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm256_load_ps(p0 + p);
-        is[k2] = _mm256_load_ps(p0 + p + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    if (CH) {
-      auto m = GetMasks5<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned k = 3 + H + cqs.size();
-      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
-               m.cvalsh, idx, size, raw_size, state.get());
-    } else {
-      auto m = GetMasks6<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 3>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned k = 3 + H + cqs.size() - m.cl;
-      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
-               m.cvalsh, idx, size, raw_size, state.get());
-    }
-  }
-
-#else  // __BMI2__
-
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = 8 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      uint64_t r = 8 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        rs[k2] = _mm256_load_ps(p0 + xss[k]);
-        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = 8 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = 8 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, const __m256i* idx, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      uint64_t r = 8 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm256_load_ps(p0 + xss[k]);
-        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    if (CH) {
-      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size * size2, f, w, ms, xss, m.cvalsh,
-               m.cmaskh, idx, size, raw_size, state.get());
-    } else {
-      auto m = GetMasks10<L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 3>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size * size2, f, w, ms, xss, m.cvalsh,
-               m.cmaskh, idx, size, raw_size, state.get());
-    }
-  }
-
-#endif  // __BMI2__
-
-  template <unsigned L>
-  static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) {
-    constexpr unsigned lsize = 1 << L;
-
-    for (unsigned i = 0; i < lsize - 1; ++i) {
-      unsigned p[8];
-
-      for (unsigned j = 0; j < 8; ++j) {
-        p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
-      }
-
-      idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]);
-    }
-  }
-
-  For for_;
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARY_CALCULATOR_AVX_H_
diff --git a/qsim/unitary_calculator_avx512.h b/qsim/unitary_calculator_avx512.h
deleted file mode 100644
index 8105367..0000000
--- a/qsim/unitary_calculator_avx512.h
+++ /dev/null
@@ -1,644 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARY_CALCULATOR_AVX512_H_
-#define UNITARY_CALCULATOR_AVX512_H_
-
-#include <immintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "unitaryspace_avx512.h"
-
-namespace qsim {
-namespace unitary {
-
-/**
- * Quantum circuit unitary calculator with AVX512 vectorization.
- */
-template <typename For>
-class UnitaryCalculatorAVX512 final : public SimulatorBase {
- public:
-  using UnitarySpace = UnitarySpaceAVX512<For>;
-  using Unitary = typename UnitarySpace::Unitary;
-  using fp_type = typename UnitarySpace::fp_type;
-
-  using StateSpace = UnitarySpace;
-  using State = Unitary;
-
-  template <typename... ForArgs>
-  explicit UnitaryCalculatorAVX512(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using AVX512 instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 3) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 3) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 3) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 3) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<1, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 4>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 3) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<2, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 4>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 3) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<3, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 4>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using AVX512 instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[3] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 16;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 ru, iu, rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm512_set1_ps(v[j]);
-        iu = _mm512_set1_ps(v[j + 1]);
-        rn = _mm512_mul_ps(rs[0], ru);
-        in = _mm512_mul_ps(rs[0], iu);
-        rn = _mm512_fnmadd_ps(is[0], iu, rn);
-        in = _mm512_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm512_set1_ps(v[j]);
-          iu = _mm512_set1_ps(v[j + 1]);
-          rn = _mm512_fmadd_ps(rs[l], ru, rn);
-          in = _mm512_fmadd_ps(rs[l], iu, in);
-          rn = _mm512_fnmadd_ps(is[l], iu, rn);
-          in = _mm512_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    auto m = GetMasks1<H, 4>(qs);
-
-    unsigned k = 4 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, m.imaskh, m.qmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m512 rn, in;
-      __m512 rs[gsize], is[gsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm512_load_ps(p0 + p);
-        is[k2] = _mm512_load_ps(p0 + p + 16);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
-          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512i idx[1 << L];
-    __m512 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 4>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 4 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 ru, iu, rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm512_set1_ps(v[j]);
-        iu = _mm512_set1_ps(v[j + 1]);
-        rn = _mm512_mul_ps(rs[0], ru);
-        in = _mm512_mul_ps(rs[0], iu);
-        rn = _mm512_fnmadd_ps(is[0], iu, rn);
-        in = _mm512_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm512_set1_ps(v[j]);
-          iu = _mm512_set1_ps(v[j + 1]);
-          rn = _mm512_fmadd_ps(rs[l], ru, rn);
-          in = _mm512_fmadd_ps(rs[l], iu, in);
-          rn = _mm512_fnmadd_ps(is[l], iu, rn);
-          in = _mm512_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    auto m = GetMasks3<H, 4>(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned k = 4 + H + cqs.size();
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks4<H, 4>(state.num_qubits(), qs, cqs, cvals);
-    FillControlledMatrixH<H, 4>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 4 + H + cqs.size() - m.cl;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                const __m512i* idx, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m512 rn, in;
-      __m512 rs[gsize], is[gsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm512_load_ps(p0 + p);
-        is[k2] = _mm512_load_ps(p0 + p + 16);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
-          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512i idx[1 << L];
-    __m512 w[1 << (1 + 2 * H + L)];
-
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    if (CH) {
-      auto m = GetMasks5<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned k = 4 + H + cqs.size();
-      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
-               m.cvalsh, idx, size, raw_size, state.get());
-    } else {
-      auto m = GetMasks6<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 4>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned k = 4 + H + cqs.size() - m.cl;
-      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
-               m.cvalsh, idx, size, raw_size, state.get());
-    }
-  }
-
-  template <unsigned L>
-  static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) {
-    constexpr unsigned lsize = 1 << L;
-
-    for (unsigned i = 0; i < lsize; ++i) {
-      unsigned p[16];
-
-      for (unsigned j = 0; j < 16; ++j) {
-        p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
-      }
-
-      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
-                                p[9], p[8], p[7], p[6], p[5], p[4],
-                                p[3], p[2], p[1], p[0]);
-    }
-  }
-
-  For for_;
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARY_CALCULATOR_AVX512_H_
diff --git a/qsim/unitary_calculator_basic.h b/qsim/unitary_calculator_basic.h
deleted file mode 100644
index 6b1821a..0000000
--- a/qsim/unitary_calculator_basic.h
+++ /dev/null
@@ -1,259 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARY_CALCULATOR_BASIC_H_
-#define UNITARY_CALCULATOR_BASIC_H_
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "unitaryspace_basic.h"
-
-namespace qsim {
-namespace unitary {
-
-/**
- * Quantum circuit unitary calculator without vectorization.
- */
-template <typename For, typename FP = float>
-class UnitaryCalculatorBasic final : public SimulatorBase {
- public:
-  using UnitarySpace = UnitarySpaceBasic<For, FP>;
-  using Unitary = typename UnitarySpace::Unitary;
-  using fp_type = typename UnitarySpace::fp_type;
-
-  using StateSpace = UnitarySpace;
-  using State = Unitary;
-
-  template <typename... ForArgs>
-  explicit UnitaryCalculatorBasic(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using non-vectorized instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      ApplyGateH<1>(qs, matrix, state);
-      break;
-    case 2:
-      ApplyGateH<2>(qs, matrix, state);
-      break;
-    case 3:
-      ApplyGateH<3>(qs, matrix, state);
-      break;
-    case 4:
-      ApplyGateH<4>(qs, matrix, state);
-      break;
-    case 5:
-      ApplyGateH<5>(qs, matrix, state);
-      break;
-    case 6:
-      ApplyGateH<6>(qs, matrix, state);
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using non-vectorized instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 1:
-      ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state);
-      break;
-    case 2:
-      ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state);
-      break;
-    case 3:
-      ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state);
-      break;
-    case 4:
-      ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state);
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 1;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      fp_type rn, in;
-      fp_type rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = *(p0 + xss[k]);
-        is[k] = *(p0 + xss[k] + 1);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = rs[0] * v[j] - is[0] * v[j + 1];
-        in = rs[0] * v[j + 1] + is[0] * v[j];
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn += rs[l] * v[j] - is[l] * v[j + 1];
-          in += rs[l] * v[j + 1] + is[l] * v[j];
-
-          j += 2;
-        }
-
-        *(p0 + xss[k]) = rn;
-        *(p0 + xss[k] + 1) = in;
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateH(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs,
-                            uint64_t cvals, const fp_type* matrix,
-                            State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      fp_type rn, in;
-      fp_type rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) == cvalsh) {
-        auto p0 = rstate + row_size * s + 2 * t;
-
-        for (unsigned k = 0; k < hsize; ++k) {
-          rs[k] = *(p0 + xss[k]);
-          is[k] = *(p0 + xss[k] + 1);
-        }
-
-        uint64_t j = 0;
-
-        for (unsigned k = 0; k < hsize; ++k) {
-          rn = rs[0] * v[j] - is[0] * v[j + 1];
-          in = rs[0] * v[j + 1] + is[0] * v[j];
-
-          j += 2;
-
-          for (unsigned l = 1; l < hsize; ++l) {
-            rn += rs[l] * v[j] - is[l] * v[j + 1];
-            in += rs[l] * v[j + 1] + is[l] * v[j];
-
-            j += 2;
-          }
-
-          *(p0 + xss[k]) = rn;
-          *(p0 + xss[k] + 1) = in;
-        }
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
-  }
-
-  For for_;
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARY_CALCULATOR_BASIC_H_
diff --git a/qsim/unitary_calculator_sse.h b/qsim/unitary_calculator_sse.h
deleted file mode 100644
index a3c3f2e..0000000
--- a/qsim/unitary_calculator_sse.h
+++ /dev/null
@@ -1,639 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARY_CALCULATOR_SSE_H_
-#define UNITARY_CALCULATOR_SSE_H_
-
-#include <smmintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "unitaryspace_sse.h"
-
-namespace qsim {
-namespace unitary {
-
-/**
- * Quantum circuit unitary calculator with SSE vectorization.
- */
-template <typename For>
-class UnitaryCalculatorSSE final : public SimulatorBase {
- public:
-  using UnitarySpace = UnitarySpaceSSE<For>;
-  using Unitary = typename UnitarySpace::Unitary;
-  using fp_type = typename UnitarySpace::fp_type;
-
-  using StateSpace = UnitarySpace;
-  using State = Unitary;
-
-  template <typename... ForArgs>
-  explicit UnitaryCalculatorSSE(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using SSE instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 1) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 1) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 1) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 1) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 1) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 1) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using SSE instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 4;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 ru, iu, rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      uint64_t r = 4 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm_set1_ps(v[j]);
-        iu = _mm_set1_ps(v[j + 1]);
-        rn = _mm_mul_ps(rs[0], ru);
-        in = _mm_mul_ps(rs[0], iu);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm_set1_ps(v[j]);
-          iu = _mm_set1_ps(v[j + 1]);
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, unsigned q0,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m128 rn, in;
-      __m128 rs[gsize], is[gsize];
-
-      uint64_t r = 4 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm_load_ps(p0 + xss[k]);
-        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
-
-        if (L == 1) {
-          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
-                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
-                               : _mm_shuffle_ps(is[k2], is[k2], 78);
-        } else if (L == 2) {
-          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
-          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
-          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
-          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
-          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,  w, ms, xss, qs[0], size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 ru, iu, rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      uint64_t r = 4 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm_set1_ps(v[j]);
-        iu = _mm_set1_ps(v[j + 1]);
-        rn = _mm_mul_ps(rs[0], ru);
-        in = _mm_mul_ps(rs[0], iu);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm_set1_ps(v[j]);
-          iu = _mm_set1_ps(v[j + 1]);
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      uint64_t r = 4 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-    FillControlledMatrixH<H, 2>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, unsigned q0, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m128 rn, in;
-      __m128 rs[gsize], is[gsize];
-
-      uint64_t r = 4 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm_load_ps(p0 + xss[k]);
-        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
-
-        if (L == 1) {
-          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
-                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
-                               : _mm_shuffle_ps(is[k2], is[k2], 78);
-        } else if (L == 2) {
-          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
-          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
-          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
-          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
-          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H + L)];
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    if (CH) {
-      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
-      FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size * size2, f, w, ms, xss,
-               m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get());
-    } else {
-      auto m = GetMasks10<L, 2>(state.num_qubits(), qs, cqs, cvals);
-      FillControlledMatrixL<H, L, 2>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size * size2, f, w, ms, xss,
-               m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get());
-    }
-  }
-
-  For for_;
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARY_CALCULATOR_SSE_H_
diff --git a/qsim/unitaryspace.h b/qsim/unitaryspace.h
deleted file mode 100644
index b5e2691..0000000
--- a/qsim/unitaryspace.h
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARYSPACE_H_
-#define UNITARYSPACE_H_
-
-#include <cstdint>
-
-namespace qsim {
-
-namespace unitary {
-
-/**
- * Abstract class containing routines for general unitary matrix manipulations.
- * "AVX", "AVX512", "Basic", and "SSE" implementations are provided.
- */
-template <typename Impl,
-          template<typename...> class VectorSpace, typename... VSTypeParams>
-class UnitarySpace : public VectorSpace<Impl, VSTypeParams...> {
- private:
-  using Base = VectorSpace<Impl, VSTypeParams...>;
-
- public:
-  using fp_type = typename Base::fp_type;
-  using Unitary = typename Base::Vector;
-
-  template <typename... ForArgs>
-  UnitarySpace(ForArgs&&... args) : Base(args...) {}
-
-  static Unitary CreateUnitary(unsigned num_qubits) {
-    return Base::Create(num_qubits);
-  }
-
-  static Unitary CreateUnitary(fp_type* p, unsigned num_qubits) {
-    return Base::Create(p, num_qubits);
-  }
-
-  static Unitary NullUnitary() {
-    return Base::Null();
-  }
-
-  static uint64_t Size(unsigned num_qubits) {
-    return uint64_t{1} << num_qubits;
-  };
-
-  void CopyUnitary(const Unitary& src, Unitary& dest) const {
-    Base::Copy(src, dest);
-  }
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARYSPACE_H_
diff --git a/qsim/unitaryspace_avx.h b/qsim/unitaryspace_avx.h
deleted file mode 100644
index c1ec59d..0000000
--- a/qsim/unitaryspace_avx.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARYSPACE_AVX_H_
-#define UNITARYSPACE_AVX_H_
-
-#include <immintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-
-#include "unitaryspace.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace unitary {
-
-/**
- * Object containing context and routines for unitary manipulations.
- * Unitary is a vectorized sequence of eight real components followed by eight
- * imaginary components. Eight single-precison floating numbers can be loaded
- * into an AVX register.
- */
-template <typename For>
-struct UnitarySpaceAVX :
-    public UnitarySpace<UnitarySpaceAVX<For>, VectorSpace, For, float> {
- private:
-  using Base = UnitarySpace<UnitarySpaceAVX<For>,
-                            qsim::VectorSpace, For, float>;
-
- public:
-  using Unitary = typename Base::Unitary;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit UnitarySpaceAVX(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinRowSize(unsigned num_qubits) {
-    return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return Base::Size(num_qubits) * MinRowSize(num_qubits);
-  };
-
-  void SetAllZeros(Unitary& state) const {
-    __m256 val0 = _mm256_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) {
-      _mm256_store_ps(p + 16 * i, val);
-      _mm256_store_ps(p + 16 * i + 8, val);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get());
-  }
-
-  void SetIdentity(Unitary& state) {
-    SetAllZeros(state);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                uint64_t row_size, fp_type* p) {
-      p[row_size * i + (16 * (i / 8)) + (i % 8)] = 1;
-    };
-
-    uint64_t size = Base::Size(state.num_qubits());
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    Base::for_.Run(size, f, row_size, state.get());
-  }
-
-  static std::complex<fp_type> GetEntry(const Unitary& state,
-                                        uint64_t i, uint64_t j) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (16 * (j / 8)) + (j % 8);
-    return std::complex<fp_type>(state.get()[row_size * i + k],
-                                 state.get()[row_size * i + k + 8]);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
-                       const std::complex<fp_type>& ampl) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (16 * (j / 8)) + (j % 8);
-    state.get()[row_size * i + k] = std::real(ampl);
-    state.get()[row_size * i + k + 8] = std::imag(ampl);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
-                       fp_type im) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (16 * (j / 8)) + (j % 8);
-    state.get()[row_size * i + k] = re;
-    state.get()[row_size * i + k + 8] = im;
-  }
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARYSPACE_AVX_H_
diff --git a/qsim/unitaryspace_avx512.h b/qsim/unitaryspace_avx512.h
deleted file mode 100644
index 4c23dc9..0000000
--- a/qsim/unitaryspace_avx512.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARYSPACE_AVX512_H_
-#define UNITARYSPACE_AVX512_H_
-
-#include <immintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-
-#include "unitaryspace.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace unitary {
-
-/**
- * Object containing context and routines for unitary manipulations.
- * State is a vectorized sequence of sixteen real components followed by
- * sixteen imaginary components. Sixteen single-precison floating numbers can
- * be loaded into an AVX512 register.
- */
-template <typename For>
-struct UnitarySpaceAVX512 :
-    public UnitarySpace<UnitarySpaceAVX512<For>, VectorSpace, For, float> {
- private:
-  using Base = UnitarySpace<UnitarySpaceAVX512<For>,
-                            qsim::VectorSpace, For, float>;
-
- public:
-  using Unitary = typename Base::Unitary;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit UnitarySpaceAVX512(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinRowSize(unsigned num_qubits) {
-    return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return Base::Size(num_qubits) * MinRowSize(num_qubits);
-  };
-
-  void SetAllZeros(Unitary& state) const {
-    __m512 val0 = _mm512_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) {
-      _mm512_store_ps(p + 32 * i, val0);
-      _mm512_store_ps(p + 32 * i + 16, val0);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get());
-  }
-
-  void SetIdentity(Unitary& state) {
-    SetAllZeros(state);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                uint64_t row_size, fp_type* p) {
-      p[row_size * i + (32 * (i / 16)) + (i % 16)] = 1;
-    };
-
-    uint64_t size = Base::Size(state.num_qubits());
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    Base::for_.Run(size, f, row_size, state.get());
-  }
-
-  static std::complex<fp_type> GetEntry(const Unitary& state,
-                                        uint64_t i, uint64_t j) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (32 * (j / 16)) + (j % 16);
-    return std::complex<fp_type>(state.get()[row_size * i + k],
-                                 state.get()[row_size * i + k + 16]);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
-                       const std::complex<fp_type>& ampl) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (32 * (j / 16)) + (j % 16);
-    state.get()[row_size * i + k] = std::real(ampl);
-    state.get()[row_size * i + k + 16] = std::imag(ampl);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
-                       fp_type im) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (32 * (j / 16)) + (j % 16);
-    state.get()[row_size * i + k] = re;
-    state.get()[row_size * i + k + 16] = im;
-  }
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARYSPACE_AVX512_H_
diff --git a/qsim/unitaryspace_basic.h b/qsim/unitaryspace_basic.h
deleted file mode 100644
index 2db14b6..0000000
--- a/qsim/unitaryspace_basic.h
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARYSPACE_BASIC_H_
-#define UNITARYSPACE_BASIC_H_
-
-#include <cmath>
-#include <complex>
-#include <cstdint>
-
-#include "unitaryspace.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace unitary {
-
-/**
- * Object containing context and routines for unitary manipulations.
- * Unitary is a non-vectorized sequence of one real amplitude followed by
- * one imaginary amplitude.
- */
-template <typename For, typename FP>
-struct UnitarySpaceBasic
-    : public UnitarySpace<UnitarySpaceBasic<For, FP>, VectorSpace, For, FP> {
- private:
-  using Base = UnitarySpace<UnitarySpaceBasic<For, FP>,
-                            qsim::VectorSpace, For, FP>;
-
- public:
-  using Unitary = typename Base::Unitary;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit UnitarySpaceBasic(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinRowSize(unsigned num_qubits) {
-    return 2 * (uint64_t{1} << num_qubits);
-  };
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return Base::Size(num_qubits) * MinRowSize(num_qubits);
-  };
-
-  void SetAllZeros(Unitary& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-      p[2 * i + 0] = 0;
-      p[2 * i + 1] = 0;
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get());
-  }
-
-  void SetIdentity(Unitary& state) {
-    SetAllZeros(state);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                uint64_t row_size, fp_type* p) {
-      p[row_size * i + 2 * i] = 1;
-    };
-
-    uint64_t size = Base::Size(state.num_qubits());
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    Base::for_.Run(size, f, row_size, state.get());
-  }
-
-  static std::complex<fp_type> GetEntry(const Unitary& state,
-                                        uint64_t i, uint64_t j) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    return std::complex<fp_type>(state.get()[row_size * i + 2 * j],
-                                 state.get()[row_size * i + 2 * j + 1]);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
-                       const std::complex<fp_type>& ampl) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    state.get()[row_size * i + 2 * j] = std::real(ampl);
-    state.get()[row_size * i + 2 * j + 1] = std::imag(ampl);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
-                       fp_type re, fp_type im) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    state.get()[row_size * i + 2 * j] = re;
-    state.get()[row_size * i + 2 * j + 1] = im;
-  }
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARYSPACE_BASIC_H_
diff --git a/qsim/unitaryspace_sse.h b/qsim/unitaryspace_sse.h
deleted file mode 100644
index f3762fb..0000000
--- a/qsim/unitaryspace_sse.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARYSPACE_SSE_H_
-#define UNITARYSPACE_SSE_H_
-
-#include <smmintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-
-#include "unitaryspace.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace unitary {
-
-/**
- * Object containing context and routines for unitary manipulations.
- * Unitary is a vectorized sequence of four real components followed by four
- * imaginary components. Four single-precison floating numbers can be loaded
- * into an SSE register.
- */
-template <typename For>
-struct UnitarySpaceSSE :
-    public UnitarySpace<UnitarySpaceSSE<For>, VectorSpace, For, float> {
- private:
-  using Base = UnitarySpace<UnitarySpaceSSE<For>,
-                            qsim::VectorSpace, For, float>;
-
- public:
-  using Unitary = typename Base::Unitary;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit UnitarySpaceSSE(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinRowSize(unsigned num_qubits) {
-    return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return Base::Size(num_qubits) * MinRowSize(num_qubits);
-  };
-
-  void SetAllZeros(Unitary& state) const {
-    __m128 val0 = _mm_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) {
-      _mm_store_ps(p + 8 * i, val0);
-      _mm_store_ps(p + 8 * i + 4, val0);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get());
-  }
-
-  void SetIdentity(Unitary& state) {
-    SetAllZeros(state);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                uint64_t row_size, fp_type* p) {
-      p[row_size * i + (8 * (i / 4)) + (i % 4)] = 1;
-    };
-
-    uint64_t size = Base::Size(state.num_qubits());
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    Base::for_.Run(size, f, row_size, state.get());
-  }
-
-  static std::complex<fp_type> GetEntry(const Unitary& state,
-                                        uint64_t i, uint64_t j) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (8 * (j / 4)) + (j % 4);
-    return std::complex<fp_type>(state.get()[row_size * i + k],
-                                 state.get()[row_size * i + k + 4]);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
-                       const std::complex<fp_type>& ampl) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (8 * (j / 4)) + (j % 4);
-    state.get()[row_size * i + k] = std::real(ampl);
-    state.get()[row_size * i + k + 4] = std::imag(ampl);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
-                       fp_type im) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (8 * (j / 4)) + (j % 4);
-    state.get()[row_size * i + k] = re;
-    state.get()[row_size * i + k + 4] = im;
-  }
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARYSPACE_SSE_H_
diff --git a/qsim/util.h b/qsim/util.h
deleted file mode 100644
index 726a019..0000000
--- a/qsim/util.h
+++ /dev/null
@@ -1,89 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTIL_H_
-#define UTIL_H_
-
-#include <algorithm>
-#include <chrono>
-#include <random>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace qsim {
-
-template <typename Container>
-inline void SplitString(
-    const std::string& str, char delim, Container& words) {
-  words.resize(0);
-
-  std::string word;
-  std::stringstream ss(str);
-
-  while (std::getline(ss, word, delim)) {
-    words.push_back(std::move(word));
-  }
-}
-
-template <typename Op, typename Container>
-inline void SplitString(
-    const std::string& str, char delim, Op op, Container& words) {
-  words.resize(0);
-
-  std::string word;
-  std::stringstream ss(str);
-
-  while (std::getline(ss, word, delim)) {
-    words.push_back(op(word));
-  }
-}
-
-inline double GetTime() {
-  using namespace std::chrono;
-  steady_clock::duration since_epoch = steady_clock::now().time_since_epoch();
-  return double(since_epoch.count() * steady_clock::period::num)
-                                    / steady_clock::period::den;
-}
-
-template <typename DistrRealType, typename RGen>
-inline DistrRealType RandomValue(RGen& rgen, DistrRealType max_value) {
-  std::uniform_real_distribution<DistrRealType> distr(0.0, max_value);
-  return distr(rgen);
-}
-
-template <typename DistrRealType>
-inline std::vector<DistrRealType> GenerateRandomValues(
-    uint64_t num_samples, unsigned seed, DistrRealType max_value) {
-  std::vector<DistrRealType> rs;
-  rs.reserve(num_samples + 1);
-
-  std::mt19937 rgen(seed);
-  std::uniform_real_distribution<DistrRealType> distr(0.0, max_value);
-
-  for (uint64_t i = 0; i < num_samples; ++i) {
-    rs.emplace_back(distr(rgen));
-  }
-
-  std::sort(rs.begin(), rs.end());
-  // Populate the final element to prevent sanitizer errors.
-  rs.emplace_back(max_value);
-
-  return rs;
-}
-
-}  // namespace qsim
-
-#endif  // UTIL_H_
diff --git a/qsim/util_cpu.h b/qsim/util_cpu.h
deleted file mode 100644
index 8e02425..0000000
--- a/qsim/util_cpu.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTIL_CPU_H_
-#define UTIL_CPU_H_
-
-#ifdef __SSE2__
-# include <immintrin.h>
-#endif
-
-namespace qsim {
-
-// This function sets flush-to-zero and denormals-are-zeros MXCSR control
-// flags. This prevents rare cases of performance slowdown potentially at
-// the cost of a tiny precision loss.
-inline void SetFlushToZeroAndDenormalsAreZeros() {
-#ifdef __SSE2__
-  _mm_setcsr(_mm_getcsr() | 0x8040);
-#endif
-}
-
-// This function clears flush-to-zero and denormals-are-zeros MXCSR control
-// flags.
-inline void ClearFlushToZeroAndDenormalsAreZeros() {
-#ifdef __SSE2__
-  _mm_setcsr(_mm_getcsr() & ~unsigned{0x8040});
-#endif
-}
-
-}  // namespace qsim
-
-#endif  // UTIL_CPU_H_
diff --git a/qsim/util_cuda.h b/qsim/util_cuda.h
deleted file mode 100644
index 5d8cb5d..0000000
--- a/qsim/util_cuda.h
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTIL_CUDA_H_
-#define UTIL_CUDA_H_
-
-#ifdef __NVCC__
-  #include <cuda.h>
-#elif __HIP__
-  #include <hip/hip_runtime.h>
-#endif
-
-#include <cstdlib>
-
-#include "io.h"
-
-namespace qsim {
-
-#define ErrorCheck(code) { ErrorAssert((code), __FILE__, __LINE__); }
-
-inline void ErrorAssert(cudaError_t code, const char* file, unsigned line) {
-  if (code != cudaSuccess) {
-    IO::errorf("CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
-    exit(code);
-  }
-}
-
-template <typename T>
-struct Complex {
-  __host__ __device__ __forceinline__ Complex() {}
-
-  __host__ __device__ __forceinline__ Complex(const T& re) : re(re), im(0) {}
-
-  __host__ __device__ __forceinline__ Complex(const T& re, const T& im)
-      : re(re), im(im) {}
-
-  template <typename U>
-  __host__ __device__ __forceinline__ Complex<T>& operator=(
-      const Complex<U>& r) {
-    re = r.re;
-    im = r.im;
-
-    return *this;
-  }
-
-  T re;
-  T im;
-};
-
-template <typename T>
-__host__ __device__ __forceinline__ Complex<T> operator+(
-    const Complex<T>& l, const Complex<T>& r) {
-  return Complex<T>(l.re + r.re, l.im + r.im);
-}
-
-template <typename T, typename U>
-__host__ __device__ __forceinline__ Complex<T> operator+(
-    const Complex<T>& l, const Complex<U>& r) {
-  return Complex<T>(l.re + r.re, l.im + r.im);
-}
-
-template <typename T>
-struct Scalar {
-  using type = T;
-};
-
-template <typename T>
-struct Scalar<Complex<T>> {
-  using type = T;
-};
-
-template <typename T>
-struct Plus {
-  template <typename U>
-  __device__ __forceinline__ T operator()(const T& v1, const U& v2) const {
-    return v1 + v2;
-  }
-};
-
-template <typename T>
-struct Product {
-  __device__ __forceinline__ Complex<T> operator()(
-      const T& re1, const T& im1, const T& re2, const T& im2) const {
-    return Complex<T>(re1 * re2 + im1 * im2, re1 * im2 - im1 * re2);
-  }
-};
-
-template <typename T>
-struct RealProduct {
-  __device__ __forceinline__ T operator()(
-      const T& re1, const T& im1, const T& re2, const T& im2) const {
-    return re1 * re2 + im1 * im2;
-  }
-};
-
-template <typename FP1, typename Op, unsigned warp_size = 32>
-__device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) {
-  for (unsigned i = warp_size / 2; i > 0; i /= 2) {
-    val = op(val, __shfl_down_sync(0xffffffff, val, i));
-  }
-
-  return val;
-}
-
-template <typename FP1, typename Op, unsigned warp_size = 32>
-__device__ __forceinline__ Complex<FP1> WarpReduce(Complex<FP1> val, Op op) {
-  for (unsigned i = warp_size / 2; i > 0; i /= 2) {
-    val.re = op(val.re, __shfl_down_sync(0xffffffff, val.re, i));
-    val.im = op(val.im, __shfl_down_sync(0xffffffff, val.im, i));
-  }
-
-  return val;
-}
-
-}  // namespace qsim
-
-#endif  // UTIL_CUDA_H_
diff --git a/qsim/util_custatevec.h b/qsim/util_custatevec.h
deleted file mode 100644
index 36f29ef..0000000
--- a/qsim/util_custatevec.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTIL_CUSTATEVEC_H_
-#define UTIL_CUSTATEVEC_H_
-
-#include <cublas_v2.h>
-#include <custatevec.h>
-
-#include "io.h"
-#include "util_cuda.h"
-
-namespace qsim {
-
-inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) {
-  if (code != CUBLAS_STATUS_SUCCESS) {
-    IO::errorf("cuBLAS error %i: %s %d\n", code, file, line);
-    exit(code);
-  }
-}
-
-inline void ErrorAssert(
-    custatevecStatus_t code, const char* file, unsigned line) {
-  if (code != CUSTATEVEC_STATUS_SUCCESS) {
-    IO::errorf("custatevec error: %s %s %d\n",
-                custatevecGetErrorString(code), file, line);
-    exit(code);
-  }
-}
-
-}  // namespace qsim
-
-#endif  // UTIL_CUSTATEVEC_H_
diff --git a/qsim/vectorspace.h b/qsim/vectorspace.h
deleted file mode 100644
index 7b33a53..0000000
--- a/qsim/vectorspace.h
+++ /dev/null
@@ -1,185 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef VECTORSPACE_H_
-#define VECTORSPACE_H_
-
-#ifdef _WIN32
-  #include <malloc.h>
-#endif
-
-#include <cstdint>
-#include <cstdlib>
-#include <memory>
-#include <utility>
-
-namespace qsim {
-
-namespace detail {
-
-inline void do_not_free(void*) {}
-
-inline void free(void* ptr) {
-#ifdef _WIN32
-  _aligned_free(ptr);
-#else
-  ::free(ptr);
-#endif
-}
-
-}  // namespace detail
-
-// Routines for vector manipulations.
-template <typename Impl, typename For, typename FP>
-class VectorSpace {
- public:
-  using fp_type = FP;
-
- private:
-  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
-
- public:
-  class Vector {
-   public:
-    Vector() = delete;
-
-    Vector(Pointer&& ptr, unsigned num_qubits)
-        : ptr_(std::move(ptr)), num_qubits_(num_qubits) {}
-
-    fp_type* get() {
-      return ptr_.get();
-    }
-
-    const fp_type* get() const {
-      return ptr_.get();
-    }
-
-    fp_type* release() {
-      num_qubits_ = 0;
-      return ptr_.release();
-    }
-
-    unsigned num_qubits() const {
-      return num_qubits_;
-    }
-
-    bool requires_copy_to_host() const {
-      return false;
-    }
-
-   private:
-    Pointer ptr_;
-    unsigned num_qubits_;
-  };
-
-  template <typename... ForArgs>
-  VectorSpace(ForArgs&&... args) : for_(args...) {}
-
-  static Vector Create(unsigned num_qubits) {
-    auto size = sizeof(fp_type) * Impl::MinSize(num_qubits);
-    #ifdef _WIN32
-      Pointer ptr{(fp_type*) _aligned_malloc(size, 64), &detail::free};
-      return Vector{std::move(ptr), ptr.get() != nullptr ? num_qubits : 0};
-    #else
-      void* p = nullptr;
-      if (posix_memalign(&p, 64, size) == 0) {
-        return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits};
-      } else {
-        return Null();
-      }
-    #endif
-  }
-
-  // It is the client's responsibility to make sure that p has at least
-  // Impl::MinSize(num_qubits) elements.
-  static Vector Create(fp_type* p, unsigned num_qubits) {
-    return Vector{Pointer{p, &detail::do_not_free}, num_qubits};
-  }
-
-  static Vector Null() {
-    return Vector{Pointer{nullptr, &detail::free}, 0};
-  }
-
-  static bool IsNull(const Vector& vec) {
-    return vec.get() == nullptr;
-  }
-
-  static void Free(fp_type* ptr) {
-    detail::free(ptr);
-  }
-
-  bool Copy(const Vector& src, Vector& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* src, fp_type* dest) {
-      dest[i] = src[i];
-    };
-
-    for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest.get());
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that dest has at least
-  // Impl::MinSize(src.num_qubits()) elements.
-  bool Copy(const Vector& src, fp_type* dest) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* src, fp_type* dest) {
-      dest[i] = src[i];
-    };
-
-    for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest);
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that src has at least
-  // Impl::MinSize(dest.num_qubits()) elements.
-  bool Copy(const fp_type* src, Vector& dest) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* src, fp_type* dest) {
-      dest[i] = src[i];
-    };
-
-    for_.Run(Impl::MinSize(dest.num_qubits()), f, src, dest.get());
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that src has at least
-  // min(size, Impl::MinSize(dest.num_qubits())) elements.
-  bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* src, fp_type* dest) {
-      dest[i] = src[i];
-    };
-
-    size = std::min(size, Impl::MinSize(dest.num_qubits()));
-    for_.Run(size, f, src, dest.get());
-
-    return true;
-  }
-
-  void DeviceSync() {}
-
- protected:
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // VECTORSPACE_H_
diff --git a/qsim/vectorspace_cuda.h b/qsim/vectorspace_cuda.h
deleted file mode 100644
index fd91553..0000000
--- a/qsim/vectorspace_cuda.h
+++ /dev/null
@@ -1,172 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef VECTORSPACE_CUDA_H_
-#define VECTORSPACE_CUDA_H_
-
-#ifdef __NVCC__
-  #include <cuda.h>
-  #include <cuda_runtime.h>
-#elif __HIP__
-  #include <hip/hip_runtime.h>
-  #include "cuda2hip.h"
-#endif
-
-#include <memory>
-#include <utility>
-
-namespace qsim {
-
-namespace detail {
-
-inline void do_not_free(void*) {}
-
-inline void free(void* ptr) {
-  ErrorCheck(cudaFree(ptr));
-}
-
-}  // namespace detail
-
-// Routines for vector manipulations.
-template <typename Impl, typename FP>
-class VectorSpaceCUDA {
- public:
-  using fp_type = FP;
-
- private:
-  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
-
- public:
-  class Vector {
-   public:
-    Vector() = delete;
-
-    Vector(Pointer&& ptr, unsigned num_qubits)
-        : ptr_(std::move(ptr)), num_qubits_(num_qubits) {}
-
-    fp_type* get() {
-      return ptr_.get();
-    }
-
-    const fp_type* get() const {
-      return ptr_.get();
-    }
-
-    fp_type* release() {
-      num_qubits_ = 0;
-      return ptr_.release();
-    }
-
-    unsigned num_qubits() const {
-      return num_qubits_;
-    }
-
-    bool requires_copy_to_host() const {
-      return true;
-    }
-
-   private:
-    Pointer ptr_;
-    unsigned num_qubits_;
-  };
-
-  template <typename... Args>
-  VectorSpaceCUDA(Args&&... args) {}
-
-  static Vector Create(unsigned num_qubits) {
-    fp_type* p;
-    auto size = sizeof(fp_type) * Impl::MinSize(num_qubits);
-    auto rc = cudaMalloc(&p, size);
-
-    if (rc == cudaSuccess) {
-      return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits};
-    } else {
-      return Null();
-    }
-  }
-
-  // It is the client's responsibility to make sure that p has at least
-  // Impl::MinSize(num_qubits) elements.
-  static Vector Create(fp_type* p, unsigned num_qubits) {
-    return Vector{Pointer{p, &detail::do_not_free}, num_qubits};
-  }
-
-  static Vector Null() {
-    return Vector{Pointer{nullptr, &detail::free}, 0};
-  }
-
-  static bool IsNull(const Vector& vector) {
-    return vector.get() == nullptr;
-  }
-
-  static void Free(fp_type* ptr) {
-    detail::free(ptr);
-  }
-
-  bool Copy(const Vector& src, Vector& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    ErrorCheck(
-        cudaMemcpy(dest.get(), src.get(),
-                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-                   cudaMemcpyDeviceToDevice));
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that dest has at least
-  // Impl::MinSize(src.num_qubits()) elements.
-  bool Copy(const Vector& src, fp_type* dest) const {
-    ErrorCheck(
-        cudaMemcpy(dest, src.get(),
-                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-                   cudaMemcpyDeviceToHost));
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that src has at least
-  // Impl::MinSize(dest.num_qubits()) elements.
-  bool Copy(const fp_type* src, Vector& dest) const {
-    ErrorCheck(
-        cudaMemcpy(dest.get(), src,
-                   sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
-                   cudaMemcpyHostToDevice));
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that src has at least
-  // min(size, Impl::MinSize(dest.num_qubits())) elements.
-  bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
-    size = std::min(size, Impl::MinSize(dest.num_qubits()));
-    ErrorCheck(
-        cudaMemcpy(dest.get(), src,
-                   sizeof(fp_type) * size,
-                   cudaMemcpyHostToDevice));
-    return true;
-  }
-
-  void DeviceSync() {
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
- protected:
-};
-
-}  // namespace qsim
-
-#endif  // VECTORSPACE_CUDA_H_

From 39223c35d01f8a6a16433bbe86fdbfe2d0e9564c Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Tue, 5 Nov 2024 23:04:56 -0500
Subject: [PATCH 03/64] Clean up results printing and output from Vicente

---
 app/qir-qsim.cc                   | 14 ++++++++++++++
 src/qirqsim/BufferManager.cc      | 15 ++++++++++++++-
 src/qirqsim/BufferManager.hh      |  3 +++
 src/qirqsim/qsimDefaultRuntime.cc | 10 ++++------
 src/qirqsim/qsimDefaultRuntime.hh | 13 ++++++++-----
 src/qirqsim/qsimQuantum.cc        | 16 +++++++++++++++-
 src/qirqsim/qsimQuantum.hh        |  5 ++++-
 7 files changed, 62 insertions(+), 14 deletions(-)

diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc
index 809b686..75f1612 100644
--- a/app/qir-qsim.cc
+++ b/app/qir-qsim.cc
@@ -54,6 +54,20 @@ void run(std::string const& filename,
     for (int i = 0; i < num_shots; i++){    
         execute(sim, *rt);
     }
+
+    std::cout << std::endl;
+    std::cout << "Measurement output:" << std::endl;
+    std::cout << "-------------------" << std::endl;
+    std::cout << "Number of shots: " << num_shots << std::endl;
+    std::cout << "Number of qubits: " << sim.num_qubits() << std::endl;
+    
+    for(int q_index = 0; q_index < sim.num_qubits(); q_index++){
+        int value_0 = 0;
+        int value_1 = 0;
+        if (auto value = sim.manager.getBufferValue("q"+std::to_string(q_index), "0"); value.has_value()){ value_0 = value.value();}
+        if (auto value = sim.manager.getBufferValue("q"+std::to_string(q_index), "1"); value.has_value()){ value_1 = value.value();}
+        std::cout << "q" << q_index << " {0: " << value_0 << "," << " 1: " << value_1 << "}\n";
+    }
 }
 
 //---------------------------------------------------------------------------//
diff --git a/src/qirqsim/BufferManager.cc b/src/qirqsim/BufferManager.cc
index 2e6f646..46931d9 100644
--- a/src/qirqsim/BufferManager.cc
+++ b/src/qirqsim/BufferManager.cc
@@ -23,6 +23,11 @@ void BufferManager::updateBuffer(const std::string& qubit, const std::string& st
     buffer[{qubit, state}] = value + current_frequency;
 }
 
+void BufferManager::updateBuffer(const std::string& key, const int& value) {
+    // Insert or update the key-value pair in the buffer
+    simple_buffer[key] = value;
+}
+
 std::optional<int> BufferManager::getBufferValue(const std::string& qubit, const std::string& state) const {
     std::pair<std::string, std::string> searchKey = {qubit, state};
     auto it = buffer.find(searchKey);
@@ -30,4 +35,12 @@ std::optional<int> BufferManager::getBufferValue(const std::string& qubit, const
         return it->second;  // Key found
     }
     return std::nullopt;  // Key not found
-}
\ No newline at end of file
+}
+
+std::optional<int> BufferManager::getBufferValue(const std::string& key) const {
+    auto it = simple_buffer.find(key);
+    if (it != simple_buffer.end()) {
+        return it->second;  // Key found
+    }
+    return std::nullopt;  // Key not found
+}
diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh
index dc03846..efb3800 100644
--- a/src/qirqsim/BufferManager.hh
+++ b/src/qirqsim/BufferManager.hh
@@ -32,14 +32,17 @@ public:
     
     // Method to update the buffer with a key-value pair
     void updateBuffer(const std::string& qubit, const std::string& state, const int& value);
+    void updateBuffer(const std::string& key, const int& value);
     
     // Retrieve buffer value for storage or evaluation
     std::optional<int> getBufferValue(const std::string& qubit, const std::string& state) const;
+    std::optional<int> getBufferValue(const std::string& key) const;
     
 private:
     
     // Dictionary to store key-value pairs
     std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> buffer;
+    std::unordered_map<std::string, int> simple_buffer;
 };
 
 #endif // BUFFER_MANAGER_H
diff --git a/src/qirqsim/qsimDefaultRuntime.cc b/src/qirqsim/qsimDefaultRuntime.cc
index 955959d..339703a 100644
--- a/src/qirqsim/qsimDefaultRuntime.cc
+++ b/src/qirqsim/qsimDefaultRuntime.cc
@@ -57,15 +57,13 @@ void qsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag)
 void qsimDefaultRuntime::result_record_output(Result r, OptionalCString tag)
 {
     // Access values through the getter
-    // TODO: This prints results 'every time' result_record_output is called. Maybe enough to only print the 'final time'
+    // This prints results every time result_record_output is called
+    // Can comment out if only want to see final results
 
-    if (auto value = sim_.manager.getBufferValue("q"+std::to_string(r.value), "0"); value.has_value()) {
-        std::cout << "q" << std::to_string(r.value) << " |0> freq: " << value.value() << "\n";
+    if (auto value = sim_.manager.getBufferValue("q"+std::to_string(r.value)); value.has_value()) {
+        std::cout << "q" << std::to_string(r.value) << " : " << value.value() << "\n";
     }
 
-    if (auto value = sim_.manager.getBufferValue("q"+std::to_string(r.value), "1"); value.has_value()) {
-        std::cout << "q" << std::to_string(r.value) << " |1> freq: " << value.value() << "\n";
-    }
 }
 
 }  // namespace qiree
diff --git a/src/qirqsim/qsimDefaultRuntime.hh b/src/qirqsim/qsimDefaultRuntime.hh
index 70dfdd4..26f06ab 100644
--- a/src/qirqsim/qsimDefaultRuntime.hh
+++ b/src/qirqsim/qsimDefaultRuntime.hh
@@ -17,11 +17,14 @@ namespace qiree
  *
  * Example for three qubits:
  * \code
- * q0 |0> freq: 509
- * q0 |1> freq: 515
- * q1 |0> freq: 509
- * q1 |1> freq: 515
- * q2 |1> freq: 1024
+ * Measurement output:
+ * -------------------
+ * Number of shots: 1024
+ * Number of qubits: 3
+ * q0 {0: 542, 1: 482}
+ * q1 {0: 521, 1: 503}
+ * q2 {0: 0, 1: 1024}
+ * 
  * \endcode
  */
 
diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/qsimQuantum.cc
index 81f40ef..74d510d 100644
--- a/src/qirqsim/qsimQuantum.cc
+++ b/src/qirqsim/qsimQuantum.cc
@@ -68,6 +68,7 @@ qsimQuantum::State qsimQuantum::init_state_space() { //check if StateSpace is th
 /*
 Prepare to build a quantum circuit for an entry point
 */
+
 void qsimQuantum::set_up(EntryPointAttrs const& attrs) {
     QIREE_VALIDATE(attrs.required_num_qubits > 0,
                    << "input is not a quantum program");
@@ -78,13 +79,20 @@ void qsimQuantum::set_up(EntryPointAttrs const& attrs) {
     state_ = std::make_shared<State>(init_state_space()); // Set the state space? Maybe.
     q_circuit.num_qubits = num_qubits_; // Allocate the number of qubits in the circuit
     execution_time = 0; // Initialize execution time
-
+    static unsigned int rep = 0;
+    rep++;
+    this->repCount(rep);
 }
 
 //---------------------------------------------------------------------------//
 /*
 Complete an execution
 */
+
+void qsimQuantum::repCount(int rep) {
+    repetition = rep;
+}
+
 void qsimQuantum::tear_down() {
     q_circuit = {};
     q_circuit.num_qubits = num_qubits_;
@@ -95,6 +103,7 @@ void qsimQuantum::tear_down() {
 /*
 Reset the qubit
 */
+
 void qsimQuantum::reset(Qubit q) {
     q.value=0;
 }
@@ -103,6 +112,7 @@ void qsimQuantum::reset(Qubit q) {
 /* 
 Read the value of a result. This utilizes the new BufferManager.
 */
+
 QState qsimQuantum::read_result(Result r)
 {
     std::string q_index_string = std::to_string(r.value);
@@ -113,8 +123,10 @@ QState qsimQuantum::read_result(Result r)
         std::string stringResult = std::to_string(bitResult);
         if (stringResult == "1"){
             manager.updateBuffer("q"+q_index_string, "1", 1);
+            manager.updateBuffer("q"+q_index_string, 1);
         } else{
             manager.updateBuffer("q"+q_index_string, "0", 1);
+            manager.updateBuffer("q"+q_index_string, 0);
         }
     } else {
         qsim::IO::errorf("Unexpected measurement results encountered.");
@@ -127,6 +139,7 @@ QState qsimQuantum::read_result(Result r)
 Map a qubit to a result index 
 (TODO: find how to link the classical register to the quantum register in qsim)
 */
+
 void qsimQuantum::mz(Qubit q, Result r) { //we don't classical register yet. 
     QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in the set of qubits, e.g., what happens if q=5 and qubits are {2,3,4,5}, q is less than num_qubits but not it is in the set of qubits. 
     // Add measurement instruction
@@ -139,6 +152,7 @@ void qsimQuantum::mz(Qubit q, Result r) { //we don't classical register yet.
 /*
 Quantum Instruction Mapping
 */
+
 // 1. Entangling gates
 void qsimQuantum::cx(Qubit q1, Qubit q2) {
     q_circuit.gates.push_back(
diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/qsimQuantum.hh
index e720e8c..cfdfc4b 100644
--- a/src/qirqsim/qsimQuantum.hh
+++ b/src/qirqsim/qsimQuantum.hh
@@ -135,8 +135,11 @@ namespace qiree
     qsim::Circuit<qsim::GateQSim<float>> get_circuit() const { return q_circuit; } 
     // Get the state space
     State const& get_state() const { return *state_; }
-    // update the buffer
+    // Update the buffer
     BufferManager manager;
+    // Number of repetitions
+    int repetition;
+    void repCount(int rep);
     
     private:
         //// TYPES ////

From c699ae48cf8a82566e0582853a15efd3d3682e14 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Tue, 5 Nov 2024 23:05:52 -0500
Subject: [PATCH 04/64] Add dynamic BV example from Vicente

---
 examples/dynamicbv.ll | 101 ++++++++++++++++++++++++++++++++++++++++++
 examples/teleport.ll  |   2 +-
 2 files changed, 102 insertions(+), 1 deletion(-)
 create mode 100644 examples/dynamicbv.ll

diff --git a/examples/dynamicbv.ll b/examples/dynamicbv.ll
new file mode 100644
index 0000000..6d48157
--- /dev/null
+++ b/examples/dynamicbv.ll
@@ -0,0 +1,101 @@
+; ModuleID = 'dynamicbv'
+source_filename = "dynamicbv"
+
+; ModuleID = 'BernsteinVazirani'
+source_filename = "bv_algorithm"
+
+%Qubit = type opaque
+%Result = type opaque
+
+define void @main() #0 {
+entry:
+  ; Initialize qubits
+  call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit
+  call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; set ancillary qubit
+  call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ;
+  
+
+  ; Apply CNOT for bit '1'
+  call void @__quantum__qis__cnot__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*)) ; kickback phase on q0
+  call void @__quantum__qis__h__body(%Qubit* null) ; correcting eigenvalue
+  
+  ; Mid-circuit measurement 
+  call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the first bit
+  call i1 @__quantum__qis__read_result__body(%Result* null)
+  call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; just to reset ancillary qubit
+  call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*))  
+  
+  ; Output the results
+  call void @__quantum__rt__array_record_output(i64 2, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* null, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null)
+
+  ; Initialize qubits 
+  call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; set ancillary qubit
+  call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ;
+  call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit
+
+  ; Apply Identity for bit '0'
+  ; Nothing
+
+  ; Mid-circuit measurement 
+  call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the first bit
+  call i1 @__quantum__qis__read_result__body(%Result* null)
+  call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; just to reset ancillary qubit
+  call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*))  
+  
+  ; Output the results
+  call void @__quantum__rt__array_record_output(i64 2, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* null, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null)
+
+  ; Initialize qubits
+  call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; set ancillary qubit
+  call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ;
+  call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit
+
+  ; Apply CNOT for bit '1'
+  call void @__quantum__qis__cnot__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*)) ; kickback phase on q0
+  call void @__quantum__qis__h__body(%Qubit* null) ; correcting eigenvalue
+  
+  ; Mid-circuit measurement 
+  call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the first bit
+  call i1 @__quantum__qis__read_result__body(%Result* null)
+  call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; just to reset ancillary qubit
+  call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*))  
+  
+  ; Output the results
+  call void @__quantum__rt__array_record_output(i64 2, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* null, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null)
+
+  ret void
+}
+
+; Declaration of quantum operations
+declare void @__quantum__qis__h__body(%Qubit*)
+declare void @__quantum__qis__x__body(%Qubit*)
+declare void @__quantum__qis__cnot__body(%Qubit*, %Qubit*)
+declare void @__quantum__qis__mz__body(%Qubit*, %Result*)
+declare i1 @__quantum__qis__read_result__body(%Result*)
+
+; Quantum runtime functions for managing qubits and results
+declare %Qubit* @__quantum__rt__qubit_allocate()
+declare %Result* @__quantum__rt__result_allocate()
+declare void @__quantum__rt__qubit_release(%Qubit*)
+declare void @__quantum__rt__result_release(%Result*)
+declare void @__quantum__rt__result_record_output(%Result*, i8*)
+declare void @__quantum__rt__array_record_output(i64, i8*)
+
+
+
+attributes #0 = { "entry_point" "num_required_qubits"="2" "num_required_results"="2" "output_labeling_schema" "qir_profiles"="custom" }
+attributes #1 = { "irreversible" }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = !{i32 1, !"qir_major_version", i32 1}
+!1 = !{i32 7, !"qir_minor_version", i32 0}
+!2 = !{i32 1, !"dynamic_qubit_management", i1 false}
+!3 = !{i32 1, !"dynamic_result_management", i1 false}
+
diff --git a/examples/teleport.ll b/examples/teleport.ll
index 184359f..6fcb74e 100644
--- a/examples/teleport.ll
+++ b/examples/teleport.ll
@@ -38,7 +38,7 @@ else2:                                            ; preds = %continue
 
 continue3:                                        ; preds = %else2, %then1
   call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Result* inttoptr (i64 2 to %Result*))
-  %2 = call i2 @__quantum__qis__read_result__body(%Result* inttoptr (i64 2 to %Result*))
+  %2 = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 2 to %Result*))
   call void @__quantum__rt__array_record_output(i64 3, i8* null)
   call void @__quantum__rt__result_record_output(%Result* null, i8* null)
   call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null)

From 65054f280877512ae590a5e31247478971c08c06 Mon Sep 17 00:00:00 2001
From: Seth R Johnson <johnsonsr@ornl.gov>
Date: Fri, 22 Nov 2024 13:57:21 -0500
Subject: [PATCH 05/64] Revert changes to upstream and remove tpls

---
 CMakeLists.txt                        |   36 +-
 app/CMakeLists.txt                    |   51 +-
 cmake/FindLLVM.cmake                  |    8 -
 tpls/qsim/bits.h                      |  106 --
 tpls/qsim/bitstring.h                 |   97 --
 tpls/qsim/channel.h                   |  149 ---
 tpls/qsim/channels_cirq.h             |  471 -------
 tpls/qsim/channels_qsim.h             |  117 --
 tpls/qsim/circuit.h                   |   36 -
 tpls/qsim/circuit_noisy.h             |  108 --
 tpls/qsim/circuit_qsim_parser.h       |  442 -------
 tpls/qsim/cuda2hip.h                  |   61 -
 tpls/qsim/expect.h                    |  148 ---
 tpls/qsim/formux.h                    |   30 -
 tpls/qsim/fuser.h                     |  225 ----
 tpls/qsim/fuser_basic.h               |  411 -------
 tpls/qsim/fuser_mqubit.h              | 1095 -----------------
 tpls/qsim/gate.h                      |  216 ----
 tpls/qsim/gate_appl.h                 |  231 ----
 tpls/qsim/gates_cirq.h                | 1640 -------------------------
 tpls/qsim/gates_qsim.h                |  661 ----------
 tpls/qsim/hybrid.h                    |  612 ---------
 tpls/qsim/io.h                        |   44 -
 tpls/qsim/io_file.h                   |   71 --
 tpls/qsim/matrix.h                    |  296 -----
 tpls/qsim/mps_simulator.h             |  246 ----
 tpls/qsim/mps_statespace.h            |  597 ---------
 tpls/qsim/parfor.h                    |  123 --
 tpls/qsim/qtrajectory.h               |  435 -------
 tpls/qsim/run_qsim.h                  |  262 ----
 tpls/qsim/run_qsimh.h                 |  120 --
 tpls/qsim/seqfor.h                    |   68 -
 tpls/qsim/simmux.h                    |   44 -
 tpls/qsim/simmux_gpu.h                |   30 -
 tpls/qsim/simulator.h                 |  516 --------
 tpls/qsim/simulator_avx.h             | 1363 --------------------
 tpls/qsim/simulator_avx512.h          |  846 -------------
 tpls/qsim/simulator_basic.h           |  349 ------
 tpls/qsim/simulator_cuda.h            |  923 --------------
 tpls/qsim/simulator_cuda_kernels.h    |  683 ----------
 tpls/qsim/simulator_custatevec.h      |  209 ----
 tpls/qsim/simulator_sse.h             |  864 -------------
 tpls/qsim/statespace.h                |  145 ---
 tpls/qsim/statespace_avx.h            |  497 --------
 tpls/qsim/statespace_avx512.h         |  448 -------
 tpls/qsim/statespace_basic.h          |  300 -----
 tpls/qsim/statespace_cuda.h           |  470 -------
 tpls/qsim/statespace_cuda_kernels.h   |  355 ------
 tpls/qsim/statespace_custatevec.h     |  376 ------
 tpls/qsim/statespace_sse.h            |  462 -------
 tpls/qsim/umux.h                      |   52 -
 tpls/qsim/unitary_calculator_avx.h    | 1028 ----------------
 tpls/qsim/unitary_calculator_avx512.h |  644 ----------
 tpls/qsim/unitary_calculator_basic.h  |  259 ----
 tpls/qsim/unitary_calculator_sse.h    |  639 ----------
 tpls/qsim/unitaryspace.h              |   65 -
 tpls/qsim/unitaryspace_avx.h          |  112 --
 tpls/qsim/unitaryspace_avx512.h       |  112 --
 tpls/qsim/unitaryspace_basic.h        |  103 --
 tpls/qsim/unitaryspace_sse.h          |  112 --
 tpls/qsim/util.h                      |   89 --
 tpls/qsim/util_cpu.h                  |   43 -
 tpls/qsim/util_cuda.h                 |  128 --
 tpls/qsim/util_custatevec.h           |   44 -
 tpls/qsim/vectorspace.h               |  185 ---
 tpls/qsim/vectorspace_cuda.h          |  172 ---
 66 files changed, 5 insertions(+), 21875 deletions(-)
 delete mode 100644 tpls/qsim/bits.h
 delete mode 100644 tpls/qsim/bitstring.h
 delete mode 100644 tpls/qsim/channel.h
 delete mode 100644 tpls/qsim/channels_cirq.h
 delete mode 100644 tpls/qsim/channels_qsim.h
 delete mode 100644 tpls/qsim/circuit.h
 delete mode 100644 tpls/qsim/circuit_noisy.h
 delete mode 100644 tpls/qsim/circuit_qsim_parser.h
 delete mode 100644 tpls/qsim/cuda2hip.h
 delete mode 100644 tpls/qsim/expect.h
 delete mode 100644 tpls/qsim/formux.h
 delete mode 100644 tpls/qsim/fuser.h
 delete mode 100644 tpls/qsim/fuser_basic.h
 delete mode 100644 tpls/qsim/fuser_mqubit.h
 delete mode 100644 tpls/qsim/gate.h
 delete mode 100644 tpls/qsim/gate_appl.h
 delete mode 100644 tpls/qsim/gates_cirq.h
 delete mode 100644 tpls/qsim/gates_qsim.h
 delete mode 100644 tpls/qsim/hybrid.h
 delete mode 100644 tpls/qsim/io.h
 delete mode 100644 tpls/qsim/io_file.h
 delete mode 100644 tpls/qsim/matrix.h
 delete mode 100644 tpls/qsim/mps_simulator.h
 delete mode 100644 tpls/qsim/mps_statespace.h
 delete mode 100644 tpls/qsim/parfor.h
 delete mode 100644 tpls/qsim/qtrajectory.h
 delete mode 100644 tpls/qsim/run_qsim.h
 delete mode 100644 tpls/qsim/run_qsimh.h
 delete mode 100644 tpls/qsim/seqfor.h
 delete mode 100644 tpls/qsim/simmux.h
 delete mode 100644 tpls/qsim/simmux_gpu.h
 delete mode 100644 tpls/qsim/simulator.h
 delete mode 100644 tpls/qsim/simulator_avx.h
 delete mode 100644 tpls/qsim/simulator_avx512.h
 delete mode 100644 tpls/qsim/simulator_basic.h
 delete mode 100644 tpls/qsim/simulator_cuda.h
 delete mode 100644 tpls/qsim/simulator_cuda_kernels.h
 delete mode 100644 tpls/qsim/simulator_custatevec.h
 delete mode 100644 tpls/qsim/simulator_sse.h
 delete mode 100644 tpls/qsim/statespace.h
 delete mode 100644 tpls/qsim/statespace_avx.h
 delete mode 100644 tpls/qsim/statespace_avx512.h
 delete mode 100644 tpls/qsim/statespace_basic.h
 delete mode 100644 tpls/qsim/statespace_cuda.h
 delete mode 100644 tpls/qsim/statespace_cuda_kernels.h
 delete mode 100644 tpls/qsim/statespace_custatevec.h
 delete mode 100644 tpls/qsim/statespace_sse.h
 delete mode 100644 tpls/qsim/umux.h
 delete mode 100644 tpls/qsim/unitary_calculator_avx.h
 delete mode 100644 tpls/qsim/unitary_calculator_avx512.h
 delete mode 100644 tpls/qsim/unitary_calculator_basic.h
 delete mode 100644 tpls/qsim/unitary_calculator_sse.h
 delete mode 100644 tpls/qsim/unitaryspace.h
 delete mode 100644 tpls/qsim/unitaryspace_avx.h
 delete mode 100644 tpls/qsim/unitaryspace_avx512.h
 delete mode 100644 tpls/qsim/unitaryspace_basic.h
 delete mode 100644 tpls/qsim/unitaryspace_sse.h
 delete mode 100644 tpls/qsim/util.h
 delete mode 100644 tpls/qsim/util_cpu.h
 delete mode 100644 tpls/qsim/util_cuda.h
 delete mode 100644 tpls/qsim/util_custatevec.h
 delete mode 100644 tpls/qsim/vectorspace.h
 delete mode 100644 tpls/qsim/vectorspace_cuda.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a536e86..bd57739 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,15 +43,10 @@ qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS})
 # Assertion handling
 option(QIREE_DEBUG "Enable runtime assertions" ON)
 
-# Enforce mutual exclusivity
-if(QIREE_USE_XACC)
-  set(QIREE_USE_QSIM OFF CACHE BOOL "Build qsim interface" FORCE)
-  message(STATUS "QIREE_USE_XACC is ON, setting QIREE_USE_QSIM to OFF.")
-elseif(QIREE_USE_QSIM)
-  set(QIREE_USE_XACC OFF CACHE BOOL "Build XACC interface" FORCE)
-  message(STATUS "QIREE_USE_QSIM is ON, setting QIREE_USE_XACC to OFF.")
-endif()
+qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS})
 
+# Assertion handling
+option(QIREE_DEBUG "Enable runtime assertions" ON)
 
 #----------------------------------------------------------------------------#
 # CMAKE INTRINSIC OPTIONS
@@ -185,31 +180,6 @@ if(QIREE_BUILD_TESTS)
   add_subdirectory(test)
 endif()
 
-#----------------------------------------------------------------------------#
-# OPENMP
-#----------------------------------------------------------------------------#
-
-# Manually set OpenMP flags for macOS with libomp
-if(APPLE)
-  set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/include")
-  set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/include")
-  set(OpenMP_C_LIB_NAMES "omp")
-  set(OpenMP_CXX_LIB_NAMES "omp")
-  set(OpenMP_omp_LIBRARY "/opt/homebrew/lib/libomp.dylib")
-  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-  link_directories("/opt/homebrew/lib")
-endif()
-
-# Now try to find OpenMP
-find_package(OpenMP REQUIRED)
-
-if(OpenMP_FOUND)
-  message(STATUS "OpenMP found")
-else()
-  message(FATAL_ERROR "OpenMP support is required but was not found.")
-endif()
-
 #----------------------------------------------------------------------------#
 # APPLICATIONS AND BINARIES
 #----------------------------------------------------------------------------#
diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
index fb78caa..ea7589a 100644
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -6,16 +6,15 @@
 
 include(FetchContent)
 FetchContent_Declare(
+  # Command Line Parser for C++ programs
   cli11_proj
   QUIET
-  GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git # Command Line Parser for C++ programs
+  GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git
   GIT_TAG f4d0731cebb123ff0ace712c099dffbcd2c58e5a # v2.4.1
 )
 
 FetchContent_MakeAvailable(cli11_proj)
 
-# Conditionally add XACC-based executable
-
 if(QIREE_USE_XACC)
   qiree_add_executable(qir-xacc
     qir-xacc.cc
@@ -26,50 +25,4 @@ if(QIREE_USE_XACC)
   )
 endif()
 
-# Conditionally download and configure qsim library
-
-if(QIREE_USE_QSIM)
-  FetchContent_Declare(
-    qsim_lib
-    GIT_REPOSITORY https://github.com/quantumlib/qsim.git
-    GIT_TAG master # Use a specific commit/tag if needed
-  )
-  
-  FetchContent_GetProperties(qsim_lib)
-  
-  if(NOT qsim_lib_POPULATED)
-    FetchContent_MakeAvailable(qsim_lib)
-
-    # Copy header files to tpls/qsim
-    file(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/tpls/qsim)
-    message(STATUS "Copying qsim headers to ${CMAKE_SOURCE_DIR}/tpls/qsim")
-    file(GLOB qsim_headers "${qsim_lib_SOURCE_DIR}/lib/*.h")
-    file(COPY ${qsim_headers} DESTINATION ${CMAKE_SOURCE_DIR}/tpls/qsim)
-  endif()
-
-  find_package(OpenMP REQUIRED)
-
-  if(OpenMP_CXX_FOUND)
-    target_link_libraries(qirqsim PUBLIC OpenMP::OpenMP_CXX)
-  endif()
-  # Collect source files for the qsim library
-  #file(GLOB SRC "${CMAKE_SOURCE_DIR}/src/qirqsim/*.cc")
-
-  # Add qsim library with the correct include directories
-
-  #add_library(qsim SHARED ${SRC})
-  #target_include_directories(qsim 
-  #  PUBLIC 
-  #    ${CMAKE_SOURCE_DIR}/tpls/qsim            # qsim headers
-  #    ${CMAKE_SOURCE_DIR}/tpls/qsim/lib        # Additional qsim headers if needed
-  #    )
-  
-  # Add the qir-qsim executable and link it with qsim
-  qiree_add_executable(qir-qsim qir-qsim.cc)
-  target_link_libraries(qir-qsim 
-    PUBLIC QIREE::qiree QIREE::qirqsim 
-    PRIVATE CLI11::CLI11 
-  )
-endif()
-
 #-----------------------------------------------------------------------------#
diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake
index 1994269..f363f67 100644
--- a/cmake/FindLLVM.cmake
+++ b/cmake/FindLLVM.cmake
@@ -28,12 +28,6 @@ This module will set the following variables if found:
 
 include(FindPackageHandleStandardArgs)
 
-# Check if the system is macOS
-if(APPLE)
-  # Set LLVM_DIR to the Homebrew location if using macOS
-  set(LLVM_DIR "/opt/homebrew/opt/llvm/lib/cmake/llvm" CACHE PATH "Path to LLVM on macOS")
-endif()
-
 find_package(LLVM QUIET CONFIG)
 find_package_handle_standard_args(LLVM CONFIG_MODE)
 
@@ -42,8 +36,6 @@ if(LLVM_FOUND)
   target_include_directories(LLVM::headers SYSTEM INTERFACE
     "${LLVM_INCLUDE_DIRS}"
   )
-else()
-  message(WARNING "Could not find LLVM. Make sure LLVM is installed and LLVM_DIR is set.")
 endif()
 
 #-----------------------------------------------------------------------------#
diff --git a/tpls/qsim/bits.h b/tpls/qsim/bits.h
deleted file mode 100644
index 080c866..0000000
--- a/tpls/qsim/bits.h
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef BITS_H_
-#define BITS_H_
-
-#include <vector>
-
-#ifdef __BMI2__
-
-#include <immintrin.h>
-
-#include <cstdint>
-
-namespace qsim {
-namespace bits {
-
-inline uint32_t ExpandBits(uint32_t bits, unsigned n, uint32_t mask) {
-  return _pdep_u32(bits, mask);
-}
-
-inline uint64_t ExpandBits(uint64_t bits, unsigned n, uint64_t mask) {
-  return _pdep_u64(bits, mask);
-}
-
-inline uint32_t CompressBits(uint32_t bits, unsigned n, uint32_t mask) {
-  return _pext_u32(bits, mask);
-}
-
-inline uint64_t CompressBits(uint64_t bits, unsigned n, uint64_t mask) {
-  return _pext_u64(bits, mask);
-}
-
-}  // namespace bits
-}  // namespace qsim
-
-#else  // __BMI2__
-
-namespace qsim {
-namespace bits {
-
-template <typename Integer>
-inline Integer ExpandBits(Integer bits, unsigned n, Integer mask) {
-  Integer ebits = 0;
-  unsigned k = 0;
-
-  for (unsigned i = 0; i < n; ++i) {
-    if ((mask >> i) & 1) {
-      ebits |= ((bits >> k) & 1) << i;
-      ++k;
-    }
-  }
-
-  return ebits;
-}
-
-template <typename Integer>
-inline Integer CompressBits(Integer bits, unsigned n, Integer mask) {
-  Integer sbits = 0;
-  unsigned k = 0;
-
-  for (unsigned i = 0; i < n; ++i) {
-    if ((mask >> i) & 1) {
-      sbits |= ((bits >> i) & 1) << k;
-      ++k;
-    }
-  }
-
-  return sbits;
-}
-
-}  // namespace bits
-}  // namespace qsim
-
-#endif  // __BMI2__
-
-namespace qsim {
-namespace bits {
-
-template <typename Integer>
-inline Integer PermuteBits(
-    Integer bits, unsigned n, const std::vector<unsigned>& perm) {
-  Integer pbits = 0;
-
-  for (unsigned i = 0; i < n; ++i) {
-    pbits |= ((bits >> i) & 1) << perm[i];
-  }
-
-  return pbits;
-}
-
-}  // namespace bits
-}  // namespace qsim
-
-#endif  // BITS_H_
diff --git a/tpls/qsim/bitstring.h b/tpls/qsim/bitstring.h
deleted file mode 100644
index b95584b..0000000
--- a/tpls/qsim/bitstring.h
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef BITSTRING_H_
-#define BITSTRING_H_
-
-#include <cstdint>
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace qsim {
-
-using Bitstring = uint64_t;
-
-/**
- * Reads bitstrings (representing initialized or measured states of qubits)
- * from a provided stream object and stores them in a vector.
- * @param num_qubits Number of qubits represented in each bitstring.
- * @param provider Source of bitstrings; only used for error reporting.
- * @param fs The stream to read bitstrings from.
- * @param bitstrings Output vector of bitstrings. On success, this will contain
- *   all bitstrings read in from 'fs'.
- * @return True if reading succeeded; false otherwise.
- */
-template <typename IO, typename Stream>
-bool BitstringsFromStream(unsigned num_qubits, const std::string& provider,
-                          Stream& fs, std::vector<Bitstring>& bitstrings) {
-  bitstrings.resize(0);
-  bitstrings.reserve(100000);
-
-  // Bitstrings are in text format. One bitstring per line.
-
-  do {
-    char buf[128];
-    fs.getline(buf, 128);
-
-    if (fs) {
-      Bitstring b{0};
-
-      unsigned p = 0;
-      while (p < 128 && (buf[p] == '0' || buf[p] == '1')) {
-        b |= uint64_t(buf[p] - '0') << p;
-        ++p;
-      }
-
-      if (p != num_qubits) {
-        IO::errorf("wrong bitstring length in %s: "
-                   "got %u; should be %u.\n", provider.c_str(), p, num_qubits);
-        bitstrings.resize(0);
-        return false;
-      }
-
-      bitstrings.push_back(b);
-    }
-  } while (fs);
-
-  return true;
-}
-
-/**
- * Reads bitstrings (representing initialized or measured states of qubits)
- * from the given file and stores them in a vector.
- * @param num_qubits Number of qubits represented in each bitstring.
- * @param file The name of the file to read bitstrings from.
- * @param bitstrings Output vector of bitstrings. On success, this will contain
- *   all bitstrings read in from 'file'.
- * @return True if reading succeeded; false otherwise.
- */
-template <typename IO>
-inline bool BitstringsFromFile(unsigned num_qubits, const std::string& file,
-                               std::vector<Bitstring>& bitstrings) {
-  auto fs = IO::StreamFromFile(file);
-
-  if (!fs) {
-    return false;
-  } else {
-    bool rc = BitstringsFromStream<IO>(num_qubits, file, fs, bitstrings);
-    IO::CloseStream(fs);
-    return rc;
-  }
-}
-
-}  // namespace qsim
-
-#endif  // BITSTRING_H_
diff --git a/tpls/qsim/channel.h b/tpls/qsim/channel.h
deleted file mode 100644
index 372a174..0000000
--- a/tpls/qsim/channel.h
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CHANNEL_H_
-#define CHANNEL_H_
-
-#include <set>
-#include <vector>
-
-#include "gate.h"
-#include "matrix.h"
-
-namespace qsim {
-
-/**
- * Kraus operator.
- */
-template <typename Gate>
-struct KrausOperator {
-  using fp_type = typename Gate::fp_type;
-
-  enum Kind {
-    kNormal = 0,
-    kMeasurement = gate::kMeasurement,
-  };
-
-  /**
-   * Kraus operator type;
-   */
-  Kind kind;
-
-  /**
-   * If true, the Kraus operator is a unitary operator times a constant.
-   */
-  bool unitary;
-
-  /**
-   * Lower bound on Kraus operator probability.
-   */
-  double prob;
-
-  /**
-   * Sequence of operations that represent the Kraus operator. This can be just
-   * one operation.
-   */
-  std::vector<Gate> ops;
-
-  /**
-   * Product of K^\dagger and K. This can be empty if unitary = true.
-   */
-  Matrix<fp_type> kd_k;
-
-  /**
-   * Qubits kd_k acts on. This can be empty if unitary = true.
-   */
-  std::vector<unsigned> qubits;
-
-  /**
-   * Calculates the product of "K^\dagger K". Sets qubits "K^\dagger K" acts on.
-   */
-  void CalculateKdKMatrix() {
-    if (ops.size() == 1) {
-      kd_k = ops[0].matrix;
-      MatrixDaggerMultiply(ops[0].qubits.size(), ops[0].matrix, kd_k);
-      qubits = ops[0].qubits;
-    } else if (ops.size() > 1) {
-      std::set<unsigned> qubit_map;
-
-      for (const auto& op : ops) {
-        for (unsigned q : op.qubits) {
-          qubit_map.insert(q);
-        }
-      }
-
-      unsigned num_qubits = qubit_map.size();
-
-      qubits.resize(0);
-      qubits.reserve(num_qubits);
-
-      for (auto it = qubit_map.begin(); it != qubit_map.end(); ++it) {
-        qubits.push_back(*it);
-      }
-
-      MatrixIdentity(unsigned{1} << num_qubits, kd_k);
-
-      for (const auto& op : ops) {
-        if (op.qubits.size() == num_qubits) {
-          MatrixMultiply(num_qubits, op.matrix, kd_k);
-        } else {
-          unsigned mask = 0;
-
-          for (auto q : op.qubits) {
-            for (unsigned i = 0; i < num_qubits; ++i) {
-              if (q == qubits[i]) {
-                mask |= unsigned{1} << i;
-                break;
-              }
-            }
-          }
-
-          MatrixMultiply(mask, op.qubits.size(), op.matrix, num_qubits, kd_k);
-        }
-      }
-
-      auto m = kd_k;
-      MatrixDaggerMultiply(num_qubits, m, kd_k);
-    }
-  }
-};
-
-/**
- * Quantum channel.
- */
-template <typename Gate>
-using Channel = std::vector<KrausOperator<Gate>>;
-
-/**
- * Makes a channel from the gate.
- * @param time The time to place the channel at.
- * @param gate The input gate.
- * @return The output channel.
- */
-template <typename Gate>
-Channel<Gate> MakeChannelFromGate(unsigned time, const Gate& gate) {
-  auto normal = KrausOperator<Gate>::kNormal;
-  auto measurement = KrausOperator<Gate>::kMeasurement;
-
-  auto kind = gate.kind == gate::kMeasurement ? measurement : normal;
-
-  Channel<Gate> channel = {{kind, true, 1, {gate}}};
-  channel[0].ops[0].time = time;
-
-  return channel;
-}
-
-}  // namespace qsim
-
-#endif  // CHANNEL_H_
diff --git a/tpls/qsim/channels_cirq.h b/tpls/qsim/channels_cirq.h
deleted file mode 100644
index 69f1df9..0000000
--- a/tpls/qsim/channels_cirq.h
+++ /dev/null
@@ -1,471 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CHANNELS_CIRQ_H_
-#define CHANNELS_CIRQ_H_
-
-#include <cmath>
-#include <cstdint>
-#include <vector>
-
-#include "channel.h"
-#include "gates_cirq.h"
-
-namespace qsim {
-
-namespace Cirq {
-
-template <typename fp_type>
-using Channel = qsim::Channel<GateCirq<fp_type>>;
-
-/**
- * Asymmetric depolarizing channel factory.
- */
-template <typename fp_type>
-struct AsymmetricDepolarizingChannel {
-  static constexpr char name[] = "asymmetric_depolarize";
-
-  AsymmetricDepolarizingChannel(double p_x, double p_y, double p_z)
-      : p_x(p_x), p_y(p_y), p_z(p_z) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q,
-                                 double p_x, double p_y, double p_z) {
-    double p1 = 1 - p_x - p_y - p_z;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 1, p1, {}},
-            {normal, 1, p_x, {X<fp_type>::Create(time, q)}},
-            {normal, 1, p_y, {Y<fp_type>::Create(time, q)}},
-            {normal, 1, p_z, {Z<fp_type>::Create(time, q)}}};
-  }
-
-  static Channel<fp_type> Create(unsigned time,
-                                 const std::vector<unsigned>& qubits,
-                                 double p_x, double p_y, double p_z) {
-    double p1 = 1 - p_x - p_y - p_z;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    uint64_t size = uint64_t{1} << (2 * qubits.size());
-
-    Channel<fp_type> channel;
-    channel.reserve(size);
-
-    for (uint64_t i = 0; i < size; ++i) {
-      channel.push_back({normal, 1, 0, {}});
-      auto& kop = channel.back();
-
-      kop.ops.reserve(qubits.size());
-
-      double prob = 1;
-
-      for (unsigned q = 0; q < qubits.size(); ++q) {
-        unsigned pauli_index = (i >> (2 * q)) & 3;
-
-        switch (pauli_index) {
-        case 0:
-          prob *= p1;
-          break;
-        case 1:
-          prob *= p_x;
-          kop.ops.push_back(X<fp_type>::Create(time, q));
-          break;
-        case 2:
-          prob *= p_y;
-          kop.ops.push_back(Y<fp_type>::Create(time, q));
-          break;
-        case 3:
-          prob *= p_z;
-          kop.ops.push_back(Z<fp_type>::Create(time, q));
-          break;
-        }
-      }
-
-      kop.prob = prob;
-    }
-
-    return channel;
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, p_x, p_y, p_z);
-  }
-
-  Channel<fp_type> Create(
-      unsigned time, const std::vector<unsigned>& qubits) const {
-    return Create(time, qubits, p_x, p_y, p_z);
-  }
-
-  double p_x = 0;
-  double p_y = 0;
-  double p_z = 0;
-};
-
-/**
- * Returns an asymmetric depolarizing channel factory object.
- */
-template <typename fp_type>
-inline AsymmetricDepolarizingChannel<fp_type> asymmetric_depolarize(
-    double p_x, double p_y, double p_z) {
-  return AsymmetricDepolarizingChannel<fp_type>(p_x, p_y, p_z);
-}
-
-/**
- * Depolarizing channel factory.
- */
-template <typename fp_type>
-struct DepolarizingChannel {
-  static constexpr char name[] = "depolarize";
-
-  DepolarizingChannel(double p) : p(p) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
-    double p1 = 1 - p;
-    double p2 = p / 3;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 1, p1, {}},
-            {normal, 1, p2, {X<fp_type>::Create(time, q)}},
-            {normal, 1, p2, {Y<fp_type>::Create(time, q)}},
-            {normal, 1, p2, {Z<fp_type>::Create(time, q)}}};
-  }
-
-  static Channel<fp_type> Create(
-      unsigned time, const std::vector<unsigned>& qubits, double p) {
-    double p1 = 1 - p;
-    double p2 = p / 3;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    uint64_t size = uint64_t{1} << (2 * qubits.size());
-
-    Channel<fp_type> channel;
-    channel.reserve(size);
-
-    for (uint64_t i = 0; i < size; ++i) {
-      channel.push_back({normal, 1, 0, {}});
-      auto& kop = channel.back();
-
-      kop.ops.reserve(qubits.size());
-
-      double prob = 1;
-
-      for (unsigned q = 0; q < qubits.size(); ++q) {
-        unsigned pauli_index = (i >> (2 * q)) & 3;
-
-        switch (pauli_index) {
-        case 0:
-          prob *= p1;
-          break;
-        case 1:
-          prob *= p2;
-          kop.ops.push_back(X<fp_type>::Create(time, q));
-          break;
-        case 2:
-          prob *= p2;
-          kop.ops.push_back(Y<fp_type>::Create(time, q));
-          break;
-        case 3:
-          prob *= p2;
-          kop.ops.push_back(Z<fp_type>::Create(time, q));
-          break;
-        }
-      }
-
-      kop.prob = prob;
-    }
-
-    return channel;
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, p);
-  }
-
-  Channel<fp_type> Create(
-      unsigned time, const std::vector<unsigned>& qubits) const {
-    return Create(time, qubits, p);
-  }
-
-  double p = 0;
-};
-
-/**
- * Returns a depolarizing channel factory object.
- */
-template <typename fp_type>
-inline DepolarizingChannel<fp_type> depolarize(double p) {
-  return DepolarizingChannel<fp_type>(p);
-}
-
-/**
- * Generalized amplitude damping channel factory.
- */
-template <typename fp_type>
-struct GeneralizedAmplitudeDampingChannel {
-  static constexpr char name[] = "generalized_amplitude_damp";
-
-  GeneralizedAmplitudeDampingChannel(double p, double gamma)
-      : p(p), gamma(gamma) {}
-
-  static Channel<fp_type> Create(
-      unsigned time, unsigned q, double p, double gamma) {
-    double p1 = p * (1 - gamma);
-    double p2 = (1 - p) * (1 - gamma);
-    double p3 = 0;
-
-    fp_type t1 = std::sqrt(p);
-    fp_type r1 = std::sqrt(p * (1 - gamma));
-    fp_type s1 = std::sqrt(p * gamma);
-    fp_type t2 = std::sqrt(1 - p);
-    fp_type r2 = std::sqrt((1 - p) * (1 - gamma));
-    fp_type s2 = std::sqrt((1 - p) * gamma);
-
-    using M = Cirq::MatrixGate1<fp_type>;
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 0, p1,
-             {M::Create(time, q, {t1, 0, 0, 0, 0, 0, r1, 0})},
-             {t1 * t1, 0, 0, 0, 0, 0, r1 * r1, 0}, {q},
-            },
-            {normal, 0, p2,
-             {M::Create(time, q, {r2, 0, 0, 0, 0, 0, t2, 0})},
-             {r2 * r2, 0, 0, 0, 0, 0, t2 * t2, 0}, {q},
-            },
-            {normal, 0, p3,
-             {M::Create(time, q, {0, 0, s1, 0, 0, 0, 0, 0})},
-             {0, 0, 0, 0, 0, 0, s1 * s1, 0}, {q},
-            },
-            {normal, 0, p3,
-             {M::Create(time, q, {0, 0, 0, 0, s2, 0, 0, 0})},
-             {s2 * s2, 0, 0, 0, 0, 0, 0, 0}, {q},
-            },
-           };
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, p, gamma);
-  }
-
-  double p = 1;
-  double gamma = 0;
-};
-
-/**
- * Returns a generalized amplitude damping channel factory object.
- */
-template <typename fp_type>
-inline GeneralizedAmplitudeDampingChannel<fp_type> generalized_amplitude_damp(
-    double p, double gamma) {
-  return GeneralizedAmplitudeDampingChannel<fp_type>(p, gamma);
-}
-
-/**
- * Amplitude damping channel factory.
- */
-template <typename fp_type>
-struct AmplitudeDampingChannel {
-  static constexpr char name[] = "amplitude_damp";
-
-  AmplitudeDampingChannel(double gamma) : gamma(gamma) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q, double gamma) {
-    double p1 = 1 - gamma;
-    double p2 = 0;
-
-    fp_type r = std::sqrt(p1);
-    fp_type s = std::sqrt(gamma);
-
-    using M = Cirq::MatrixGate1<fp_type>;
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 0, p1,
-             {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})},
-             {1, 0, 0, 0, 0, 0, r * r, 0}, {q},
-            },
-            {normal, 0, p2,
-             {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})},
-             {0, 0, 0, 0, 0, 0, s * s, 0}, {q},
-            },
-           };
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, gamma);
-  }
-
-  double gamma = 0;
-};
-
-/**
- * Returns an amplitude damping channel factory object.
- */
-template <typename fp_type>
-inline AmplitudeDampingChannel<fp_type> amplitude_damp(double gamma) {
-  return AmplitudeDampingChannel<fp_type>(gamma);
-}
-
-/**
- *  Phase damping channel factory.
- */
-template <typename fp_type>
-struct PhaseDampingChannel {
-  static constexpr char name[] = "phase_dump";
-
-  PhaseDampingChannel(double gamma) : gamma(gamma) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q, double gamma) {
-    double p1 = 1 - gamma;
-    double p2 = 0;
-
-    fp_type r = std::sqrt(p1);
-    fp_type s = std::sqrt(gamma);
-
-    using M = Cirq::MatrixGate1<fp_type>;
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 0, p1,
-             {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})},
-             {1, 0, 0, 0, 0, 0, r * r, 0}, {q},
-            },
-            {normal, 0, p2,
-             {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})},
-             {0, 0, 0, 0, 0, 0, s * s, 0}, {q},
-            },
-           };
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, gamma);
-  }
-
-  double gamma = 0;
-};
-
-/**
- * Returns a phase damping channel factory object.
- */
-template <typename fp_type>
-inline PhaseDampingChannel<fp_type> phase_damp(double gamma) {
-  return PhaseDampingChannel<fp_type>(gamma);
-}
-
-/**
- *  Reset channel factory.
- */
-template <typename fp_type>
-struct ResetChannel {
-  static constexpr char name[] = "reset";
-
-  static Channel<fp_type> Create(unsigned time, unsigned q) {
-    using M = Cirq::MatrixGate1<fp_type>;
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 0, 0,
-             {M::Create(time, q, {1, 0, 0, 0, 0, 0, 0, 0})},
-             {1, 0, 0, 0, 0, 0, 0, 0}, {q},
-            },
-            {normal, 0, 0,
-             {M::Create(time, q, {0, 0, 1, 0, 0, 0, 0, 0})},
-             {0, 0, 0, 0, 0, 0, 1, 0}, {q},
-            },
-           };
-  }
-};
-
-/**
- * Returns a reset channel factory object.
- */
-template <typename fp_type>
-inline ResetChannel<fp_type> reset() {
-  return ResetChannel<fp_type>();
-}
-
-/**
- *  Phase flip channel factory.
- */
-template <typename fp_type>
-struct PhaseFlipChannel {
-  static constexpr char name[] = "phase_flip";
-
-  PhaseFlipChannel(double p) : p(p) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
-    double p1 = 1 - p;
-    double p2 = p;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 1, p1, {}},
-            {normal, 1, p2, {Z<fp_type>::Create(time, q)}}
-           };
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, p);
-  }
-
-  double p = 0;
-};
-
-/**
- * Returns a phase flip channel factory object.
- */
-template <typename fp_type>
-inline PhaseFlipChannel<fp_type> phase_flip(double p) {
-  return PhaseFlipChannel<fp_type>(p);
-}
-
-/**
- *  Bit flip channel factory.
- */
-template <typename fp_type>
-struct BitFlipChannel {
-  static constexpr char name[] = "bit_flip";
-
-  BitFlipChannel(double p) : p(p) {}
-
-  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
-    double p1 = 1 - p;
-    double p2 = p;
-
-    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
-
-    return {{normal, 1, p1, {}},
-            {normal, 1, p2, {X<fp_type>::Create(time, q)}}
-           };
-  }
-
-  Channel<fp_type> Create(unsigned time, unsigned q) const {
-    return Create(time, q, p);
-  }
-
-  double p = 0;
-};
-
-/**
- * Returns a bit flip channel factory object.
- */
-template <typename fp_type>
-inline BitFlipChannel<fp_type> bit_flip(double p) {
-  return BitFlipChannel<fp_type>(p);
-}
-
-}  // namesapce Cirq
-
-}  // namespace qsim
-
-#endif  // CHANNELS_CIRQ_H_
diff --git a/tpls/qsim/channels_qsim.h b/tpls/qsim/channels_qsim.h
deleted file mode 100644
index 5c07bcc..0000000
--- a/tpls/qsim/channels_qsim.h
+++ /dev/null
@@ -1,117 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CHANNELS_QSIM_H_
-#define CHANNELS_QSIM_H_
-
-#include <cmath>
-#include <cstdint>
-#include <vector>
-
-#include "channel.h"
-#include "gates_qsim.h"
-
-namespace qsim {
-
-/**
- * Amplitude damping channel factory.
- */
-template <typename fp_type>
-struct AmplitudeDampingChannel {
-  AmplitudeDampingChannel(double gamma) : gamma(gamma) {}
-
-  static Channel<GateQSim<fp_type>> Create(
-      unsigned time, unsigned q, double gamma) {
-    double p1 = 1 - gamma;
-    double p2 = 0;
-
-    fp_type r = std::sqrt(p1);
-    fp_type s = std::sqrt(gamma);
-
-    using M = GateMatrix1<fp_type>;
-    auto normal = KrausOperator<GateQSim<fp_type>>::kNormal;
-
-    return {{normal, 0, p1,
-             {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})},
-             {1, 0, 0, 0, 0, 0, r * r, 0}, {q},
-            },
-            {normal, 0, p2,
-             {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})},
-             {0, 0, 0, 0, 0, 0, s * s, 0}, {q},
-            },
-           };
-  }
-
-  Channel<GateQSim<fp_type>> Create(unsigned time, unsigned q) const {
-    return Create(time, q, gamma);
-  }
-
-  double gamma = 0;
-};
-
-/**
- * Returns an amplitude damping channel factory object.
- */
-template <typename fp_type>
-inline AmplitudeDampingChannel<fp_type> amplitude_damp(double gamma) {
-  return AmplitudeDampingChannel<fp_type>(gamma);
-}
-
-/**
- *  Phase damping channel factory.
- */
-template <typename fp_type>
-struct PhaseDampingChannel {
-  PhaseDampingChannel(double gamma) : gamma(gamma) {}
-
-  static Channel<GateQSim<fp_type>> Create(
-      unsigned time, unsigned q, double gamma) {
-    double p1 = 1 - gamma;
-    double p2 = 0;
-
-    fp_type r = std::sqrt(p1);
-    fp_type s = std::sqrt(gamma);
-
-    using M = GateMatrix1<fp_type>;
-    auto normal = KrausOperator<GateQSim<fp_type>>::kNormal;
-
-    return {{normal, 0, p1,
-             {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})},
-             {1, 0, 0, 0, 0, 0, r * r, 0}, {q},
-            },
-            {normal, 0, p2,
-             {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})},
-             {0, 0, 0, 0, 0, 0, s * s, 0}, {q},
-            },
-           };
-  }
-
-  Channel<GateQSim<fp_type>> Create(unsigned time, unsigned q) const {
-    return Create(time, q, gamma);
-  }
-
-  double gamma = 0;
-};
-
-/**
- * Returns a phase damping channel factory object.
- */
-template <typename fp_type>
-inline PhaseDampingChannel<fp_type> phase_damp(double gamma) {
-  return PhaseDampingChannel<fp_type>(gamma);
-}
-
-}  // namespace qsim
-
-#endif  // CHANNELS_QSIM_H_
diff --git a/tpls/qsim/circuit.h b/tpls/qsim/circuit.h
deleted file mode 100644
index 59018ee..0000000
--- a/tpls/qsim/circuit.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CIRCUIT_H_
-#define CIRCUIT_H_
-
-#include <vector>
-
-namespace qsim {
-
-/**
- * A collection of gates. This object is consumed by `QSim[h]Runner.Run()`.
- */
-template <typename Gate>
-struct Circuit {
-  unsigned num_qubits;
-  /**
-   * The set of gates to be run. Gate times should be ordered.
-   */
-  std::vector<Gate> gates;
-};
-
-}  // namespace qsim
-
-#endif  // CIRCUIT_H_
diff --git a/tpls/qsim/circuit_noisy.h b/tpls/qsim/circuit_noisy.h
deleted file mode 100644
index 40a228d..0000000
--- a/tpls/qsim/circuit_noisy.h
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CIRCUIT_NOISY_H_
-#define CIRCUIT_NOISY_H_
-
-#include <vector>
-
-#include "circuit.h"
-#include "channel.h"
-
-namespace qsim {
-
-/**
- * Noisy circuit.
- */
-template <typename Gate>
-struct NoisyCircuit {
-  unsigned num_qubits;
-  std::vector<Channel<Gate>> channels;
-};
-
-template <typename Gate>
-using ncircuit_iterator = typename std::vector<Channel<Gate>>::const_iterator;
-
-/**
- * Makes a noisy circuit from the clean circuit.
- * Channels are added after each qubit of each gate of the clean cicuit.
- * Roughly equivalent to cirq.Circuit.with_noise.
- * @param num_qubits The number of circuit qubits.
- * @param gbeg, gend The iterator range [gbeg, gend) of circuit gates.
- * @param A channel factory to construct channels.
- * @return The output noisy circuit.
- */
-template <typename Gate, typename ChannelFactory>
-inline NoisyCircuit<Gate> MakeNoisy(
-    unsigned num_qubits,
-    typename std::vector<Gate>::const_iterator gbeg,
-    typename std::vector<Gate>::const_iterator gend,
-    const ChannelFactory& channel_factory) {
-  NoisyCircuit<Gate> ncircuit;
-
-  ncircuit.num_qubits = num_qubits;
-  ncircuit.channels.reserve(4 * std::size_t(gend - gbeg));
-
-  for (auto it = gbeg; it != gend; ++it) {
-    const auto& gate = *it;
-
-    ncircuit.channels.push_back(MakeChannelFromGate(2 * gate.time, gate));
-
-    for (auto q : gate.qubits) {
-      ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q));
-    }
-
-    for (auto q : gate.controlled_by) {
-      ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q));
-    }
-  }
-
-  return ncircuit;
-}
-
-/**
- * Makes a noisy circuit from the clean circuit.
- * Channels are added after each qubit of each gate of the clean cicuit.
- * Roughly equivalent to cirq.Circuit.with_noise.
- * @param num_qubits The number of circuit qubits.
- * @param gates The circuit gates.
- * @param A channel factory to construct channels.
- * @return The output noisy circuit.
- */
-template <typename Gate, typename ChannelFactory>
-inline NoisyCircuit<Gate> MakeNoisy(unsigned num_qubits,
-                                    const std::vector<Gate>& gates,
-                                    const ChannelFactory& channel_factory) {
-  return
-      MakeNoisy<Gate>(num_qubits, gates.begin(), gates.end(), channel_factory);
-}
-
-/**
- * Makes a noisy circuit from the clean circuit.
- * Channels are added after each qubit of each gate of the clean cicuit.
- * Roughly equivalent to cirq.Circuit.with_noise.
- * @param circuit The input cicuit.
- * @param A channel factory to construct channels.
- * @return The output noisy circuit.
- */
-template <typename Gate, typename ChannelFactory>
-inline NoisyCircuit<Gate> MakeNoisy(const Circuit<Gate>& circuit,
-                                    const ChannelFactory& channel_factory) {
-  return MakeNoisy<Gate>(circuit.num_qubits, circuit.gates.begin(),
-                         circuit.gates.end(), channel_factory);
-}
-
-}  // namespace qsim
-
-#endif  // CIRCUIT_NOISY_H_
diff --git a/tpls/qsim/circuit_qsim_parser.h b/tpls/qsim/circuit_qsim_parser.h
deleted file mode 100644
index de7bd89..0000000
--- a/tpls/qsim/circuit_qsim_parser.h
+++ /dev/null
@@ -1,442 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef CIRCUIT_QSIM_PARSER_H_
-#define CIRCUIT_QSIM_PARSER_H_
-
-#include <algorithm>
-#include <cctype>
-#include <string>
-#include <sstream>
-#include <vector>
-
-#include "circuit.h"
-#include "gates_qsim.h"
-
-namespace qsim {
-
-/**
- * Parser for the (deprecated) qsim <a href="https://github.com/quantumlib/qsim/blob/master/docs/input_format.md">file input format</a>.
- * The primary supported interface for designing circuits to simulate with qsim
- * is <a href="https://github.com/quantumlib/Cirq">Cirq</a>, which relies on
- * the Python-based qsimcirq interface. For C++ applications, Cirq gates can be
- * explicitly constructed in code.
- */
-template <typename IO>
-class CircuitQsimParser final {
- public:
-  /**
-   * Parses the given input stream into a Circuit object, following the rules
-   * defined in "docs/input_format.md".
-   * @param maxtime Maximum gate "time" to read operations for (inclusive).
-   * @param provider Circuit source; only used for error reporting.
-   * @param fs The stream to read the circuit from.
-   * @param circuit Output circuit object. If parsing is successful, this will
-   *   contain the circuit defined in 'fs'.
-   * @return True if parsing succeeds; false otherwise.
-   */
-  template <typename Stream, typename fp_type>
-  static bool FromStream(unsigned maxtime, const std::string& provider,
-                         Stream& fs, Circuit<GateQSim<fp_type>>& circuit) {
-    circuit.num_qubits = 0;
-
-    circuit.gates.resize(0);
-    circuit.gates.reserve(1024);
-
-    unsigned k = 0;
-
-    std::string line;
-    line.reserve(128);
-
-    unsigned time;
-    std::string gate_name;
-    gate_name.reserve(16);
-
-    unsigned max_time = 0;
-    unsigned prev_mea_time = 0;
-
-    std::vector<unsigned> last_times;
-
-    while (std::getline(fs, line)) {
-      ++k;
-
-      if (line.size() == 0 || line[0] == '#') continue;
-
-      std::stringstream ss(line);
-
-      if (circuit.num_qubits == 0) {
-        ss >> circuit.num_qubits;
-        if (circuit.num_qubits == 0) {
-          IO::errorf("invalid number of qubits in %s in line %u.\n",
-                     provider.c_str(), k);
-          return false;
-        }
-
-        last_times.resize(circuit.num_qubits, unsigned(-1));
-
-        continue;
-      }
-
-      ss >> time >> gate_name;
-
-      if (!ss) {
-        InvalidGateError(provider, k);
-        return false;
-      }
-
-      if (time > maxtime) {
-        break;
-      }
-
-      if (gate_name == "c") {
-        if (!ParseControlledGate<fp_type>(ss, time,
-                                          circuit.num_qubits, circuit.gates)) {
-          InvalidGateError(provider, k);
-          return false;
-        }
-      } else if (!ParseGate<fp_type>(ss, time, circuit.num_qubits,
-                                     gate_name, circuit.gates)) {
-        InvalidGateError(provider, k);
-        return false;
-      }
-
-      const auto& gate = circuit.gates.back();
-
-      if (time < prev_mea_time
-          || (gate.kind == gate::kMeasurement && time < max_time)) {
-        IO::errorf("gate crosses the time boundary set by measurement "
-                   "gates in line %u in %s.\n", k, provider.c_str());
-        return false;
-      }
-
-      if (gate.kind == gate::kMeasurement) {
-        prev_mea_time = time;
-      }
-
-      if (GateIsOutOfOrder(time, gate.qubits, last_times)
-          || GateIsOutOfOrder(time, gate.controlled_by, last_times)) {
-        IO::errorf("gate is out of time order in line %u in %s.\n",
-                   k, provider.c_str());
-        return false;
-      }
-
-      if (time > max_time) {
-        max_time = time;
-      }
-    }
-
-    return true;
-  }
-
-  /**
-   * Parses the given file into a Circuit object, following the rules defined
-   * in "docs/input_format.md".
-   * @param maxtime Maximum gate "time" to read operations for (inclusive).
-   * @param file The name of the file to read the circuit from.
-   * @param circuit Output circuit object. If parsing is successful, this will
-   *   contain the circuit defined in 'file'.
-   * @return True if parsing succeeds; false otherwise.
-   */
-  template <typename fp_type>
-  static bool FromFile(unsigned maxtime, const std::string& file,
-                       Circuit<GateQSim<fp_type>>& circuit) {
-    auto fs = IO::StreamFromFile(file);
-
-    if (!fs) {
-      return false;
-    } else {
-      bool rc = FromStream(maxtime, file, fs, circuit);
-      IO::CloseStream(fs);
-      return rc;
-    }
-  }
-
- private:
-  static void InvalidGateError(const std::string& provider, unsigned line) {
-    IO::errorf("invalid gate in %s in line %u.\n", provider.c_str(), line);
-  }
-
-  /**
-   * Checks formatting for a zero-qubit gate parsed from 'ss'.
-   * @param ss Input stream containing the gate specification.
-   */
-  static bool ValidateGate(std::stringstream& ss) {
-    return ss && ss.peek() == std::stringstream::traits_type::eof();
-  }
-
-  /**
-   * Checks formatting for a single-qubit gate parsed from 'ss'.
-   * @param ss Input stream containing the gate specification.
-   * @param num_qubits Number of qubits, as defined at the start of the file.
-   * @param q0 Index of the affected qubit.
-   */
-  static bool ValidateGate(std::stringstream& ss,
-                           unsigned num_qubits, unsigned q0) {
-    return ss && ss.peek() == std::stringstream::traits_type::eof()
-        && q0 < num_qubits;
-  }
-
-  /**
-   * Checks formatting for a two-qubit gate parsed from 'ss'.
-   * @param ss Input stream containing the gate specification.
-   * @param num_qubits Number of qubits, as defined at the start of the file.
-   * @param q0 Index of the first affected qubit.
-   * @param q1 Index of the second affected qubit.
-   */
-  static bool ValidateGate(std::stringstream& ss,
-                           unsigned num_qubits, unsigned q0, unsigned q1) {
-    return ss && ss.peek() == std::stringstream::traits_type::eof()
-        && q0 < num_qubits && q1 < num_qubits && q0 != q1;
-  }
-
-  /**
-   * Checks formatting for a multiqubit gate parsed from 'ss'.
-   * @param ss Input stream containing the gate specification.
-   * @param num_qubits Number of qubits, as defined at the start of the file.
-   * @param qubits Indices of affected qubits.
-   */
-  static bool ValidateGate(std::stringstream& ss, unsigned num_qubits,
-                           const std::vector<unsigned>& qubits) {
-    return ss && ValidateQubits(num_qubits, qubits);
-  }
-
-  static bool ValidateControlledGate(
-      unsigned num_qubits, const std::vector<unsigned>& qubits,
-      const std::vector<unsigned>& controlled_by) {
-    if (!ValidateQubits(num_qubits, controlled_by)) return false;
-
-    std::size_t i = 0, j = 0;
-
-    while (i < qubits.size() && j < controlled_by.size()) {
-      if (qubits[i] == controlled_by[j]) {
-        return false;
-      } else if (qubits[i] < controlled_by[j]) {
-        ++i;
-      } else {
-        ++j;
-      }
-    }
-
-    return true;
-  }
-
-  static bool ValidateQubits(unsigned num_qubits,
-                             const std::vector<unsigned>& qubits) {
-    if (qubits.size() == 0 || qubits[0] >= num_qubits) return false;
-
-    // qubits should be sorted.
-
-    for (std::size_t i = 1; i < qubits.size(); ++i) {
-      if (qubits[i] >= num_qubits || qubits[i] == qubits[i - 1]) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-
-  static bool GateIsOutOfOrder(unsigned time,
-                               const std::vector<unsigned>& qubits,
-                               std::vector<unsigned>& last_times) {
-    for (auto q : qubits) {
-      if (last_times[q] != unsigned(-1) && time <= last_times[q]) {
-        return true;
-      }
-
-      last_times[q] = time;
-    }
-
-    return false;
-  }
-
-  template <typename fp_type, typename Stream, typename Gate>
-  static bool ParseGate(Stream& ss, unsigned time, unsigned num_qubits,
-                        const std::string& gate_name,
-                        std::vector<Gate>& gates) {
-    unsigned q0, q1;
-    fp_type phi, theta;
-
-    if (gate_name == "p") {
-      ss >> phi;
-      if (!ValidateGate(ss)) return false;
-      gates.push_back(GateGPh<fp_type>::Create(time, phi));
-    } else if (gate_name == "id1") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateId1<fp_type>::Create(time, q0));
-    } else if (gate_name == "h") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateHd<fp_type>::Create(time, q0));
-    } else if (gate_name == "t") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateT<fp_type>::Create(time, q0));
-    } else if (gate_name == "x") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateX<fp_type>::Create(time, q0));
-    } else if (gate_name == "y") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateY<fp_type>::Create(time, q0));
-    } else if (gate_name == "z") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateZ<fp_type>::Create(time, q0));
-    } else if (gate_name == "x_1_2") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateX2<fp_type>::Create(time, q0));
-    } else if (gate_name == "y_1_2") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateY2<fp_type>::Create(time, q0));
-    } else if (gate_name == "rx") {
-      ss >> q0 >> phi;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateRX<fp_type>::Create(time, q0, phi));
-    } else if (gate_name == "ry") {
-      ss >> q0 >> phi;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateRY<fp_type>::Create(time, q0, phi));
-    } else if (gate_name == "rz") {
-      ss >> q0 >> phi;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateRZ<fp_type>::Create(time, q0, phi));
-    } else if (gate_name == "rxy") {
-      ss >> q0 >> theta >> phi;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateRXY<fp_type>::Create(time, q0, theta, phi));
-    } else if (gate_name == "hz_1_2") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateHZ2<fp_type>::Create(time, q0));
-    } else if (gate_name == "s") {
-      ss >> q0;
-      if (!ValidateGate(ss, num_qubits, q0)) return false;
-      gates.push_back(GateS<fp_type>::Create(time, q0));
-    } else if (gate_name == "id2") {
-      ss >> q0 >> q1;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateId2<fp_type>::Create(time, q0, q1));
-    } else if (gate_name == "cz") {
-      ss >> q0 >> q1;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateCZ<fp_type>::Create(time, q0, q1));
-    } else if (gate_name == "cnot" || gate_name == "cx") {
-      ss >> q0 >> q1;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateCNot<fp_type>::Create(time, q0, q1));
-    } else if (gate_name == "sw") {
-      ss >> q0 >> q1;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateSwap<fp_type>::Create(time, q0, q1));
-    } else if (gate_name == "is") {
-      ss >> q0 >> q1;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateIS<fp_type>::Create(time, q0, q1));
-    } else if (gate_name == "fs") {
-      ss >> q0 >> q1 >> theta >> phi;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateFS<fp_type>::Create(time, q0, q1, theta, phi));
-    } else if (gate_name == "cp") {
-      ss >> q0 >> q1 >> phi;
-      if (!ValidateGate(ss, num_qubits, q0, q1)) return false;
-      gates.push_back(GateCP<fp_type>::Create(time, q0, q1, phi));
-    } else if (gate_name == "m") {
-      std::vector<unsigned> qubits;
-      qubits.reserve(num_qubits);
-
-      while (ss.good()) {
-        ss >> q0;
-        if (ss) {
-          qubits.push_back(q0);
-        } else {
-          return false;
-        }
-      }
-
-      gates.push_back(gate::Measurement<GateQSim<fp_type>>::Create(
-          time, std::move(qubits)));
-
-      if (!ValidateQubits(num_qubits, gates.back().qubits)) return false;
-    } else {
-      return false;
-    }
-
-    return true;
-  }
-
-  template <typename fp_type, typename Stream, typename Gate>
-  static bool ParseControlledGate(Stream& ss, unsigned time,
-                                  unsigned num_qubits,
-                                  std::vector<Gate>& gates) {
-    std::vector<unsigned> controlled_by;
-    controlled_by.reserve(64);
-
-    std::string gate_name;
-    gate_name.reserve(16);
-
-    while (1) {
-      while (ss.good()) {
-        if (!std::isblank(ss.get())) {
-          ss.unget();
-          break;
-        }
-      }
-
-      if (!ss.good()) {
-        return false;
-      }
-
-      if (!std::isdigit(ss.peek())) {
-        break;
-      } else {
-        unsigned q;
-        ss >> q;
-
-        if (!ss.good() || !std::isblank(ss.get())) {
-          return false;
-        }
-
-        controlled_by.push_back(q);
-      }
-    }
-
-    if (controlled_by.size() == 0) {
-      return false;
-    }
-
-    ss >> gate_name;
-
-    if (!ss.good() || !ParseGate<fp_type>(ss, time,
-                                          num_qubits, gate_name, gates)) {
-      return false;
-    }
-
-    gates.back().ControlledBy(std::move(controlled_by));
-
-    if (!ValidateControlledGate(num_qubits, gates.back().qubits,
-                                gates.back().controlled_by)) {
-      return false;
-    }
-
-    return true;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // CIRCUIT_QSIM_PARSER_H_
diff --git a/tpls/qsim/cuda2hip.h b/tpls/qsim/cuda2hip.h
deleted file mode 100644
index da2d074..0000000
--- a/tpls/qsim/cuda2hip.h
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2023 Advanced Micro Devices, Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_CUDA2HIP_H_
-#define SIMULATOR_CUDA2HIP_H_
-
-#define cublasCaxpy              hipblasCaxpy
-#define cublasCdotc              hipblasCdotc
-#define cublasCreate             hipblasCreate
-#define cublasCscal              hipblasCscal
-#define cublasCsscal             hipblasCsscal
-#define cublasDestroy            hipblasDestroy
-#define cublasDznrm2             hipblasDznrm2
-#define cublasHandle_t           hipblasHandle_t
-#define cublasScnrm2             hipblasScnrm2
-#define CUBLAS_STATUS_SUCCESS    HIPBLAS_STATUS_SUCCESS
-#define cublasStatus_t           hipblasStatus_t
-#define cublasZaxpy              hipblasZaxpy
-#define cublasZdotc              hipblasZdotc
-#define cublasZdscal             hipblasZdscal
-#define cublasZscal              hipblasZscal
-#define cuCimagf                 hipCimagf
-#define cuCimag                  hipCimag
-#define cuComplex                hipComplex
-#define cuCrealf                 hipCrealf
-#define cuCreal                  hipCreal
-#define CUDA_C_32F               HIPBLAS_C_32F
-#define CUDA_C_64F               HIPBLAS_C_64F
-#define cudaDeviceSynchronize    hipDeviceSynchronize
-#define cudaError_t              hipError_t
-#define cudaFree                 hipFree
-#define cudaGetErrorString       hipGetErrorString
-#define cudaMalloc               hipMalloc
-#define cudaMemcpyAsync          hipMemcpyAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost   hipMemcpyDeviceToHost
-#define cudaMemcpy               hipMemcpy
-#define cudaMemcpyHostToDevice   hipMemcpyHostToDevice
-#define cudaMemset               hipMemset
-#define cudaPeekAtLastError      hipPeekAtLastError
-#define cudaSuccess              hipSuccess
-#define cuDoubleComplex          hipDoubleComplex
-
-template <typename T>
-__device__ __forceinline__ T __shfl_down_sync(
-    unsigned mask, T var, unsigned int delta, int width = warpSize) {
-  return __shfl_down(var, delta, width);
-}
-
-#endif  // SIMULATOR_CUDA2HIP_H_
diff --git a/tpls/qsim/expect.h b/tpls/qsim/expect.h
deleted file mode 100644
index 518d516..0000000
--- a/tpls/qsim/expect.h
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef EXPECT_H_
-#define EXPECT_H_
-
-#include <complex>
-
-#include "fuser.h"
-#include "gate_appl.h"
-
-namespace qsim {
-
-template <typename Gate>
-struct OpString {
-  std::complex<double> weight;
-  std::vector<Gate> ops;
-};
-
-/**
- * Computes the expectation value of the sum of operator strings (operator
- * sequences). Operators can act on any qubits and they can be any supported
- * gates. This function uses a temporary state vector.
- * @param param Options for gate fusion.
- * @param strings Operator strings.
- * @param ss StateSpace object required to copy the state vector and compute
- *   inner products.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param state The state vector of the system.
- * @param ket Temporary state vector.
- * @return The computed expectation value.
- */
-template <typename IO, typename Fuser, typename Gate, typename Simulator>
-std::complex<double> ExpectationValue(
-    const typename Fuser::Parameter& param,
-    const std::vector<OpString<Gate>>& strings,
-    const typename Simulator::StateSpace& state_space,
-    const Simulator& simulator, const typename Simulator::State& state,
-    typename Simulator::State& ket) {
-  std::complex<double> eval = 0;
-
-  if (state_space.IsNull(ket) || ket.num_qubits() < state.num_qubits()) {
-    ket = state_space.Create(state.num_qubits());
-    if (state_space.IsNull(ket)) {
-      IO::errorf("not enough memory: is the number of qubits too large?\n");
-      return eval;
-    }
-  }
-
-  for (const auto& str : strings) {
-    if (str.ops.size() == 0) {
-      eval += str.weight;
-      continue;
-    }
-
-    state_space.Copy(state, ket);
-
-    if (str.ops.size() == 1) {
-      const auto& op = str.ops[0];
-      simulator.ApplyGate(op.qubits, op.matrix.data(), ket);
-    } else {
-      auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops);
-      if (fused_gates.size() == 0) {
-        eval = 0;
-        break;
-      }
-
-      for (const auto& fgate : fused_gates) {
-        ApplyFusedGate(simulator, fgate, ket);
-      }
-    }
-
-    eval += str.weight * state_space.InnerProduct(state, ket);
-  }
-
-  return eval;
-}
-
-/**
- * Computes the expectation value of the sum of operator strings (operator
- * sequences). Operators can act on any qubits and they can be any supported
- * gates except for user-defined controlled gates. Computation is performed
- * in place. No additional memory is allocated. The operator strings should
- * act on no more than six qubits and they should be fusible into one gate.
- * @param strings Operator strings.
- * @param simulator Simulator object. Provides specific implementations for
- *   computing expectation values.
- * @param state The state of the system.
- * @return The computed expectation value.
- */
-template <typename IO, typename Fuser, typename Gate, typename Simulator>
-std::complex<double> ExpectationValue(
-    const std::vector<OpString<Gate>>& strings,
-    const Simulator& simulator, const typename Simulator::State& state) {
-  std::complex<double> eval = 0;
-
-  typename Fuser::Parameter param;
-  param.max_fused_size = 6;
-  for (const auto& str : strings) {
-    if (str.ops.size() == 0) {
-      eval += str.weight;
-    } else if (str.ops.size() == 1) {
-      const auto& op = str.ops[0];
-      auto r = simulator.ExpectationValue(op.qubits, op.matrix.data(), state);
-      eval += str.weight * r;
-    } else {
-      auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops);
-
-      if (fused_gates.size() != 1) {
-        IO::errorf("too many fused gates; "
-                   "cannot compute the expectation value.\n");
-        eval = 0;
-        break;
-      }
-
-      const auto& fgate = fused_gates[0];
-
-      if (fgate.qubits.size() > 6) {
-        IO::errorf("operator string acts on too many qubits; "
-                   "cannot compute the expectation value.\n");
-        eval = 0;
-        break;
-      }
-
-      auto r = simulator.ExpectationValue(
-          fgate.qubits, fgate.matrix.data(), state);
-      eval += str.weight * r;
-    }
-  }
-
-  return eval;
-}
-
-}  // namespace qsim
-
-#endif  // EXPECT_H_
diff --git a/tpls/qsim/formux.h b/tpls/qsim/formux.h
deleted file mode 100644
index 4401e9b..0000000
--- a/tpls/qsim/formux.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FORMUX_H_
-#define FORMUX_H_
-
-#ifdef _OPENMP
-# include "parfor.h"
-  namespace qsim {
-    using For = ParallelFor;
-  }
-#else
-# include "seqfor.h"
-  namespace qsim {
-    using For = SequentialFor;
-  }
-#endif
-
-#endif  // FORMUX_H_
diff --git a/tpls/qsim/fuser.h b/tpls/qsim/fuser.h
deleted file mode 100644
index e4f3c3b..0000000
--- a/tpls/qsim/fuser.h
+++ /dev/null
@@ -1,225 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSER_H_
-#define FUSER_H_
-
-#include <cstdint>
-#include <vector>
-
-#include "gate.h"
-#include "matrix.h"
-
-namespace qsim {
-
-/**
- * A collection of "fused" gates which can be multiplied together before being
- * applied to the state vector.
- */
-template <typename Gate>
-struct GateFused {
-  /**
-   * Kind of the first ("parent") gate.
-   */
-  typename Gate::GateKind kind;
-  /**
-   * The time index of the first ("parent") gate.
-   */
-  unsigned time;
-  /**
-   * A list of qubits these gates act upon. Control qubits for
-   * explicitly-controlled gates are excluded from this list.
-   */
-  std::vector<unsigned> qubits;
-  /**
-   * Pointer to the first ("parent") gate.
-   */
-  const Gate* parent;
-  /**
-   * Ordered list of component gates.
-   */
-  std::vector<const Gate*> gates;
-  /**
-   * Fused gate matrix.
-   */
-  Matrix<typename Gate::fp_type> matrix;
-};
-
-/**
- * A base class for fuser classes with some common functions.
- */
-template <typename IO, typename Gate>
-class Fuser {
- protected:
-  using RGate = typename std::remove_pointer<Gate>::type;
-
-  static const RGate& GateToConstRef(const RGate& gate) {
-    return gate;
-  }
-
-  static const RGate& GateToConstRef(const RGate* gate) {
-    return *gate;
-  }
-
-  static std::vector<unsigned> MergeWithMeasurementTimes(
-      typename std::vector<Gate>::const_iterator gfirst,
-      typename std::vector<Gate>::const_iterator glast,
-      const std::vector<unsigned>& times) {
-    std::vector<unsigned> epochs;
-    epochs.reserve(glast - gfirst + times.size());
-
-    std::size_t last = 0;
-    unsigned max_time = 0;
-
-    for (auto gate_it = gfirst; gate_it < glast; ++gate_it) {
-      const auto& gate = GateToConstRef(*gate_it);
-
-      if (gate.time > max_time) {
-        max_time = gate.time;
-      }
-
-      if (epochs.size() > 0 && gate.time < epochs.back()) {
-        IO::errorf("gate crosses the time boundary.\n");
-        epochs.resize(0);
-        return epochs;
-      }
-
-      if (gate.kind == gate::kMeasurement) {
-        if (epochs.size() == 0 || epochs.back() < gate.time) {
-          if (!AddBoundary(gate.time, max_time, epochs)) {
-            epochs.resize(0);
-            return epochs;
-          }
-        }
-      }
-
-      while (last < times.size() && times[last] <= gate.time) {
-        unsigned prev = times[last++];
-        epochs.push_back(prev);
-        if (!AddBoundary(prev, max_time, epochs)) {
-          epochs.resize(0);
-          return epochs;
-        }
-        while (last < times.size() && times[last] <= prev) ++last;
-      }
-    }
-
-    if (epochs.size() == 0 || epochs.back() < max_time) {
-      epochs.push_back(max_time);
-    }
-
-    return epochs;
-  }
-
-  template <typename GateSeq0, typename Parent, typename GateFused>
-  static void FuseZeroQubitGates(const GateSeq0& gate_seq0,
-                                 Parent parent, std::size_t first,
-                                 std::vector<GateFused>& fused_gates) {
-    GateFused* fuse_to = nullptr;
-
-    for (std::size_t i = first; i < fused_gates.size(); ++i) {
-      auto& fgate = fused_gates[i];
-
-      if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp
-          && fgate.parent->controlled_by.size() == 0
-          && !fgate.parent->unfusible) {
-        fuse_to = &fgate;
-        break;
-      }
-    }
-
-    if (fuse_to != nullptr) {
-      // Fuse zero-qubit gates with the first available fused gate.
-      for (const auto& g : gate_seq0) {
-        fuse_to->gates.push_back(parent(g));
-      }
-    } else {
-      auto g0 = parent(gate_seq0[0]);
-      fused_gates.push_back({g0->kind, g0->time, {}, g0, {g0}, {}});
-
-      for (std::size_t i = 1; i < gate_seq0.size(); ++i) {
-        fused_gates.back().gates.push_back(parent(gate_seq0[i]));
-      }
-    }
-  }
-
- private:
-  static bool AddBoundary(unsigned time, unsigned max_time,
-                          std::vector<unsigned>& boundaries) {
-    if (max_time > time) {
-      IO::errorf("gate crosses the time boundary.\n");
-      return false;
-    }
-
-    boundaries.push_back(time);
-    return true;
-  }
-};
-
-/**
- * Multiplies component gate matrices of a fused gate.
- * @param gate Fused gate.
- */
-template <typename FusedGate>
-inline void CalculateFusedMatrix(FusedGate& gate) {
-  MatrixIdentity(unsigned{1} << gate.qubits.size(), gate.matrix);
-
-  for (auto pgate : gate.gates) {
-    if (pgate->qubits.size() == 0) {
-      MatrixScalarMultiply(pgate->matrix[0], pgate->matrix[1], gate.matrix);
-    } else if (gate.qubits.size() == pgate->qubits.size()) {
-      MatrixMultiply(gate.qubits.size(), pgate->matrix, gate.matrix);
-    } else {
-      unsigned mask = 0;
-
-      for (auto q : pgate->qubits) {
-        for (std::size_t i = 0; i < gate.qubits.size(); ++i) {
-          if (q == gate.qubits[i]) {
-            mask |= unsigned{1} << i;
-            break;
-          }
-        }
-      }
-
-      MatrixMultiply(mask, pgate->qubits.size(), pgate->matrix,
-                     gate.qubits.size(), gate.matrix);
-    }
-  }
-}
-
-/**
- * Multiplies component gate matrices for a range of fused gates.
- * @param gbeg, gend The iterator range [gbeg, gend) of fused gates.
- */
-template <typename Iterator>
-inline void CalculateFusedMatrices(Iterator gbeg, Iterator gend) {
-  for (auto g = gbeg; g != gend; ++g) {
-    if (g->kind != gate::kMeasurement) {
-      CalculateFusedMatrix(*g);
-    }
-  }
-}
-
-/**
- * Multiplies component gate matrices for a vector of fused gates.
- * @param gates The vector of fused gates.
- */
-template <typename FusedGate>
-inline void CalculateFusedMatrices(std::vector<FusedGate>& gates) {
-  CalculateFusedMatrices(gates.begin(), gates.end());
-}
-
-}  // namespace qsim
-
-#endif  // FUSER_H_
diff --git a/tpls/qsim/fuser_basic.h b/tpls/qsim/fuser_basic.h
deleted file mode 100644
index 3191bd2..0000000
--- a/tpls/qsim/fuser_basic.h
+++ /dev/null
@@ -1,411 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSER_BASIC_H_
-#define FUSER_BASIC_H_
-
-#include <map>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "gate.h"
-#include "fuser.h"
-
-namespace qsim {
-
-/**
- * Stateless object with methods for aggregating `Gate`s into `GateFused`.
- * Measurement gates with equal times are fused together.
- * User-defined controlled gates (controlled_by.size() > 0) and gates acting on
- * more than two qubits are not fused.
- * The template parameter Gate can be Gate type or a pointer to Gate type.
- * This class is deprecated. It is recommended to use MultiQubitGateFuser
- * from fuser_mqubit.h.
- */
-template <typename IO, typename Gate>
-class BasicGateFuser final : public Fuser<IO, Gate> {
- private:
-  using Base = Fuser<IO, Gate>;
-  using RGate = typename Base::RGate;
-
- public:
-  using GateFused = qsim::GateFused<RGate>;
-
-  /**
-   * User-specified parameters for gate fusion.
-   * BasicGateFuser does not use any parameters.
-   */
-  struct Parameter {
-    unsigned verbosity = 0;
-  };
-
-  /**
-   * Stores sets of gates that can be applied together. Only one- and
-   * two-qubit gates will get fused. To respect specific time boundaries while
-   * fusing gates, use the other version of this method below.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gates The gates (or pointers to the gates) to be fused.
-   *   Gate times of the gates that act on the same qubits should be ordered.
-   *   Gates that are out of time order should not cross the time boundaries
-   *   set by measurement gates.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(const Parameter& param,
-                                          unsigned max_qubit1,
-                                          const std::vector<Gate>& gates,
-                                          bool fuse_matrix = true) {
-    return FuseGates(
-        param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together. Only one- and
-   * two-qubit gates will get fused.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gates The gates (or pointers to the gates) to be fused.
-   *   Gate times of the gates that act on the same qubits should be ordered.
-   *   Gates that are out of time order should not cross the time boundaries
-   *   set by `times_to_split_at` or by measurement gates.
-   * @param times_to_split_at Ordered list of time steps (boundaries) at which
-   *   to separate fused gates. Each element of the output will contain gates
-   *   from a single 'window' in this list.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param,
-      unsigned max_qubit1, const std::vector<Gate>& gates,
-      const std::vector<unsigned>& times_to_split_at,
-      bool fuse_matrix = true) {
-    return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(),
-                     times_to_split_at, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together. Only one- and
-   * two-qubit gates will get fused. To respect specific time boundaries while
-   * fusing gates, use the other version of this method below.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
-   *   (or pointers to gates) in. Gate times of the gates that act on the same
-   *   qubits should be ordered. Gates that are out of time order should not
-   *   cross the time boundaries set by measurement gates.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param, unsigned max_qubit1,
-      typename std::vector<Gate>::const_iterator gfirst,
-      typename std::vector<Gate>::const_iterator glast,
-      bool fuse_matrix = true) {
-    return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together. Only one- and
-   * two-qubit gates will get fused.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
-   *   (or pointers to gates) in. Gate times of the gates that act on the same
-   *   qubits should be ordered. Gates that are out of time order should not
-   *   cross the time boundaries set by `times_to_split_at` or by measurement
-   *   gates.
-   * @param times_to_split_at Ordered list of time steps (boundaries) at which
-   *   to separate fused gates. Each element of the output will contain gates
-   *   from a single 'window' in this list.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param, unsigned max_qubit1,
-      typename std::vector<Gate>::const_iterator gfirst,
-      typename std::vector<Gate>::const_iterator glast,
-      const std::vector<unsigned>& times_to_split_at,
-      bool fuse_matrix = true) {
-    std::vector<GateFused> gates_fused;
-
-    if (gfirst >= glast) return gates_fused;
-
-    std::size_t num_gates = glast - gfirst;
-
-    gates_fused.reserve(num_gates);
-
-    // Merge with measurement gate times to separate fused gates at.
-    auto times =
-        Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at);
-
-    // Map to keep track of measurement gates with equal times.
-    std::map<unsigned, std::vector<const RGate*>> measurement_gates;
-
-    // Sequence of top level gates the other gates get fused to.
-    std::vector<const RGate*> gates_seq;
-
-    // Sequence of zero-qubit gates.
-    std::vector<const RGate*> gates_seq0;
-
-    // Lattice of gates: qubits "hyperplane" and time direction.
-    std::vector<std::vector<const RGate*>> gates_lat(max_qubit1);
-
-    // Current unfused gate.
-    auto gate_it = gfirst;
-
-    std::size_t last_fused_gate_index = 0;
-
-    for (std::size_t l = 0; l < times.size(); ++l) {
-      gates_seq.resize(0);
-      gates_seq.reserve(num_gates);
-
-      gates_seq0.resize(0);
-      gates_seq0.reserve(num_gates);
-
-      for (unsigned k = 0; k < max_qubit1; ++k) {
-        gates_lat[k].resize(0);
-        gates_lat[k].reserve(128);
-      }
-
-      // Fill gates_seq and gates_lat in.
-      for (; gate_it < glast; ++gate_it) {
-        const auto& gate = Base::GateToConstRef(*gate_it);
-
-        if (gate.time > times[l]) break;
-
-        if (!ValidateGate(gate, max_qubit1, gates_lat)) {
-          gates_fused.resize(0);
-          return gates_fused;
-        }
-
-        if (gate.kind == gate::kMeasurement) {
-          auto& mea_gates_at_time = measurement_gates[gate.time];
-          if (mea_gates_at_time.size() == 0) {
-            gates_seq.push_back(&gate);
-            mea_gates_at_time.reserve(max_qubit1);
-          }
-
-          mea_gates_at_time.push_back(&gate);
-        } else if (gate.controlled_by.size() > 0 || gate.qubits.size() > 2) {
-          for (auto q : gate.qubits) {
-            gates_lat[q].push_back(&gate);
-          }
-          for (auto q : gate.controlled_by) {
-            gates_lat[q].push_back(&gate);
-          }
-          gates_seq.push_back(&gate);
-        } else if (gate.qubits.size() == 1) {
-          gates_lat[gate.qubits[0]].push_back(&gate);
-          if (gate.unfusible) {
-            gates_seq.push_back(&gate);
-          }
-        } else if (gate.qubits.size() == 2) {
-          gates_lat[gate.qubits[0]].push_back(&gate);
-          gates_lat[gate.qubits[1]].push_back(&gate);
-          gates_seq.push_back(&gate);
-        } else {
-          gates_seq0.push_back(&gate);
-        }
-      }
-
-      std::vector<unsigned> last(max_qubit1, 0);
-
-      const RGate* delayed_measurement_gate = nullptr;
-
-      // Fuse gates.
-      for (auto pgate : gates_seq) {
-        if (pgate->kind == gate::kMeasurement) {
-          delayed_measurement_gate = pgate;
-        } else if (pgate->qubits.size() > 2
-                   || pgate->controlled_by.size() > 0) {
-          // Multi-qubit or controlled gate.
-
-          for (auto q : pgate->qubits) {
-            unsigned l = last[q];
-            if (gates_lat[q][l] != pgate) {
-              last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused);
-            }
-            ++last[q];
-          }
-
-          for (auto q : pgate->controlled_by) {
-            unsigned l = last[q];
-            if (gates_lat[q][l] != pgate) {
-              last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused);
-            }
-            ++last[q];
-          }
-
-          gates_fused.push_back({pgate->kind, pgate->time, pgate->qubits,
-                                 pgate, {pgate}, {}});
-        } else if (pgate->qubits.size() == 1) {
-          unsigned q0 = pgate->qubits[0];
-
-          GateFused gate_f = {pgate->kind, pgate->time, {q0}, pgate, {}, {}};
-
-          last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates);
-          gate_f.gates.push_back(gates_lat[q0][last[q0]]);
-          last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates);
-
-          gates_fused.push_back(std::move(gate_f));
-        } else if (pgate->qubits.size() == 2) {
-          unsigned q0 = pgate->qubits[0];
-          unsigned q1 = pgate->qubits[1];
-
-          if (Done(last[q0], pgate->time, gates_lat[q0])) continue;
-
-          GateFused gate_f =
-              {pgate->kind, pgate->time, {q0, q1}, pgate, {}, {}};
-
-          do {
-            last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates);
-            last[q1] = Advance(last[q1], gates_lat[q1], gate_f.gates);
-            // Here gates_lat[q0][last[q0]] == gates_lat[q1][last[q1]].
-
-            gate_f.gates.push_back(gates_lat[q0][last[q0]]);
-
-            last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates);
-            last[q1] = Advance(last[q1] + 1, gates_lat[q1], gate_f.gates);
-          } while (NextGate(last[q0], gates_lat[q0], last[q1], gates_lat[q1]));
-
-          gates_fused.push_back(std::move(gate_f));
-        }
-      }
-
-      for (unsigned q = 0; q < max_qubit1; ++q) {
-        auto l = last[q];
-        if (l == gates_lat[q].size()) continue;
-
-        // Orphaned qubit.
-        AddOrphanedQubit(q, l, gates_lat, gates_fused);
-      }
-
-      if (delayed_measurement_gate != nullptr) {
-        auto pgate = delayed_measurement_gate;
-
-        const auto& mea_gates_at_time = measurement_gates[pgate->time];
-
-        GateFused gate_f = {pgate->kind, pgate->time, {}, pgate, {}, {}};
-        gate_f.gates.reserve(mea_gates_at_time.size());
-
-        // Fuse measurement gates with equal times.
-
-        for (const auto* pgate : mea_gates_at_time) {
-          gate_f.qubits.insert(gate_f.qubits.end(),
-                               pgate->qubits.begin(), pgate->qubits.end());
-          gate_f.gates.push_back(pgate);
-        }
-
-        gates_fused.push_back(std::move(gate_f));
-      }
-
-      if (gates_seq0.size() != 0) {
-        Base::FuseZeroQubitGates(gates_seq0, [](const RGate* g) { return g; },
-                                 last_fused_gate_index, gates_fused);
-      }
-
-      if (gate_it == glast) break;
-
-      last_fused_gate_index = gates_fused.size();
-    }
-
-    if (fuse_matrix) {
-      for (auto& gate_f : gates_fused) {
-        if (gate_f.kind != gate::kMeasurement && gate_f.kind != gate::kDecomp) {
-          CalculateFusedMatrix(gate_f);
-        }
-      }
-    }
-
-    return gates_fused;
-  }
-
- private:
-  static unsigned Advance(unsigned k, const std::vector<const RGate*>& wl,
-                          std::vector<const RGate*>& gates) {
-    while (k < wl.size() && wl[k]->qubits.size() == 1
-           && wl[k]->controlled_by.size() == 0 && !wl[k]->unfusible) {
-      gates.push_back(wl[k++]);
-    }
-
-    return k;
-  }
-
-  static bool Done(
-      unsigned k, unsigned t, const std::vector<const RGate*>& wl) {
-    return k >= wl.size() || wl[k]->time > t;
-  }
-
-  static bool NextGate(unsigned k1, const std::vector<const RGate*>& wl1,
-                       unsigned k2, const std::vector<const RGate*>& wl2) {
-    return k1 < wl1.size() && k2 < wl2.size() && wl1[k1] == wl2[k2]
-        && wl1[k1]->qubits.size() < 3 && wl1[k1]->controlled_by.size() == 0;
-  }
-
-  template <typename GatesLat>
-  static unsigned AddOrphanedQubit(unsigned q, unsigned k,
-                                   const GatesLat& gates_lat,
-                                   std::vector<GateFused>& gates_fused) {
-    auto pgate = gates_lat[q][k];
-
-    GateFused gate_f = {pgate->kind, pgate->time, {q}, pgate, {}, {}};
-    gate_f.gates.push_back(pgate);
-
-    k = Advance(k + 1, gates_lat[q], gate_f.gates);
-
-    gates_fused.push_back(std::move(gate_f));
-
-    return k;
-  }
-
-  template <typename Gate2, typename GatesLat>
-  static bool ValidateGate(const Gate2& gate, unsigned max_qubit1,
-                           const GatesLat& gates_lat) {
-    for (unsigned q : gate.qubits) {
-      if (q >= max_qubit1) {
-        IO::errorf("fuser: gate qubit %u is out of range "
-                   "(should be smaller than %u).\n", q, max_qubit1);
-        return false;
-      }
-      if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) {
-        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
-        return false;
-      }
-    }
-
-    for (unsigned q : gate.controlled_by) {
-      if (q >= max_qubit1) {
-        IO::errorf("fuser: gate qubit %u is out of range "
-                   "(should be smaller than %u).\n", q, max_qubit1);
-        return false;
-      }
-      if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) {
-        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
-        return false;
-      }
-    }
-
-    return true;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // FUSER_BASIC_H_
diff --git a/tpls/qsim/fuser_mqubit.h b/tpls/qsim/fuser_mqubit.h
deleted file mode 100644
index c75b1a0..0000000
--- a/tpls/qsim/fuser_mqubit.h
+++ /dev/null
@@ -1,1095 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef FUSER_MQUBIT_H_
-#define FUSER_MQUBIT_H_
-
-#include <algorithm>
-#include <cstdint>
-#include <limits>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "gate.h"
-#include "fuser.h"
-
-namespace qsim {
-
-/**
- * Multi-qubit gate fuser.
- * Measurement gates with equal times are fused together.
- * User-defined controlled gates (controlled_by.size() > 0) are not fused.
- * The template parameter Gate can be Gate type or a pointer to Gate type.
- */
-template <typename IO, typename Gate>
-class MultiQubitGateFuser final : public Fuser<IO, Gate> {
- private:
-  using Base = Fuser<IO, Gate>;
-  using RGate = typename Base::RGate;
-
-  // Auxillary classes and structs.
-
-  // Manages doubly-linked lists.
-  template <typename T>
-  class LinkManagerT {
-   public:
-    struct Link {
-      T val;
-      Link* next;
-      Link* prev;
-    };
-
-    explicit LinkManagerT(uint64_t size) {
-      links_.reserve(size);
-    }
-
-    Link* AddBack(const T& t, Link* link) {
-      if (link == nullptr) {
-        links_.push_back({t, nullptr, nullptr});
-      } else {
-        links_.push_back({t, link->next, link});
-        link->next = &links_.back();
-      }
-
-      return &links_.back();
-    }
-
-    static void Delete(const Link* link) {
-      if (link->prev != nullptr) {
-        link->prev->next = link->next;
-      }
-      if (link->next != nullptr) {
-        link->next->prev = link->prev;
-      }
-    }
-
-   private:
-    std::vector<Link> links_;
-  };
-
-  struct GateF;
-
-  using LinkManager = LinkManagerT<GateF*>;
-  using Link = typename LinkManager::Link;
-
-  // Intermediate representation of a fused gate.
-  struct GateF {
-    const RGate* parent;
-    std::vector<unsigned> qubits;
-    std::vector<const RGate*> gates;  // Gates that get fused to this gate.
-    std::vector<Link*> links;         // Gate "lattice" links.
-    uint64_t mask;                    // Qubit mask.
-    unsigned visited;
-  };
-
-  // Possible values for visited in GateF.
-  // Note that MakeGateSequence assignes values from kSecond to the number of
-  // gates in the sequence plus one, see below.
-  enum Visited {
-    kZero = 0,             // Start value for "normal" gates.
-    kFirst = 1,            // Value after the first pass for partially fused
-                           // "normal" gates.
-    kSecond = 2,           // Start value to assign values in MakeGateSequence.
-    kCompress = 99999997,  // Used to compress links.
-    kMeaCnt = 99999998,    // Start value for controlled or measurement gates.
-    kFinal = 99999999,     // Value after the second pass for fused "normal"
-                           // gates or for controlled and measurement gates.
-  };
-
-  struct Stat {
-    unsigned num_mea_gates = 0;
-    unsigned num_fused_mea_gates = 0;
-    unsigned num_fused_gates = 0;
-    unsigned num_controlled_gates = 0;
-    std::vector<unsigned> num_gates;
-  };
-
-  // Gate that is added to a sequence of gates to fuse together.
-  struct GateA {
-    GateF* gate;
-    std::vector<unsigned> qubits;  // Added qubits.
-    std::vector<Link*> links;      // Added lattice links.
-  };
-
-  struct Scratch {
-    std::vector<GateA> data;
-    std::vector<GateA*> prev1;
-    std::vector<GateA*> prev2;
-    std::vector<GateA*> next1;
-    std::vector<GateA*> next2;
-    std::vector<GateA*> longest_seq;
-    std::vector<GateA*> stack;
-    std::vector<GateF*> gates;
-    unsigned count = 0;
-  };
-
- public:
-  using GateFused = qsim::GateFused<RGate>;
-
-  /**
-   * User-specified parameters for gate fusion.
-   */
-  struct Parameter {
-    /**
-     * Maximum number of qubits in a fused gate. It can take values from 2 to
-     * 6 (0 and 1 are equivalent to 2). It is not recommended to use 5 or 6 as
-     * that might degrade performance for not very fast machines.
-     */
-    unsigned max_fused_size = 2;
-    unsigned verbosity = 0;
-  };
-
-  /**
-   * Stores sets of gates that can be applied together. To respect specific
-   * time boundaries while fusing gates, use the other version of this method
-   * below.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gates The gates (or pointers to the gates) to be fused.
-   *   Gate times of the gates that act on the same qubits should be ordered.
-   *   Gates that are out of time order should not cross the time boundaries
-   *   set by measurement gates.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(const Parameter& param,
-                                          unsigned max_qubit1,
-                                          const std::vector<Gate>& gates,
-                                          bool fuse_matrix = true) {
-    return FuseGates(
-        param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gates The gates (or pointers to the gates) to be fused.
-   *   Gate times of the gates that act on the same qubits should be ordered.
-   *   Gates that are out of time order should not cross the time boundaries
-   *   set by `times_to_split_at` or by measurement gates.
-   * @param times_to_split_at Ordered list of time steps (boundaries) at which
-   *   to separate fused gates. Each element of the output will contain gates
-   *   from a single 'window' in this list.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param,
-      unsigned max_qubit1, const std::vector<Gate>& gates,
-      const std::vector<unsigned>& times_to_split_at,
-      bool fuse_matrix = true) {
-    return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(),
-                     times_to_split_at, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together. To respect specific
-   * time boundaries while fusing gates, use the other version of this method
-   * below.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
-   *   (or pointers to gates) in. Gate times of the gates that act on the same
-   *   qubits should be ordered. Gates that are out of time order should not
-   *   cross the time boundaries set by measurement gates.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param, unsigned max_qubit1,
-      typename std::vector<Gate>::const_iterator gfirst,
-      typename std::vector<Gate>::const_iterator glast,
-      bool fuse_matrix = true) {
-    return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix);
-  }
-
-  /**
-   * Stores sets of gates that can be applied together.
-   * @param param Options for gate fusion.
-   * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'.
-   * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates
-   *   (or pointers to gates) in. Gate times of the gates that act on the same
-   *   qubits should be ordered. Gates that are out of time order should not
-   *   cross the time boundaries set by `times_to_split_at` or by measurement
-   *   gates.
-   * @param times_to_split_at Ordered list of time steps (boundaries) at which
-   *   to separate fused gates. Each element of the output will contain gates
-   *   from a single 'window' in this list.
-   * @param fuse_matrix If true, multiply gate matrices together.
-   * @return A vector of fused gate objects. Each element is a set of gates
-   *   acting on a specific pair of qubits which can be applied as a group.
-   */
-  static std::vector<GateFused> FuseGates(
-      const Parameter& param, unsigned max_qubit1,
-      typename std::vector<Gate>::const_iterator gfirst,
-      typename std::vector<Gate>::const_iterator glast,
-      const std::vector<unsigned>& times_to_split_at,
-      bool fuse_matrix = true) {
-    std::vector<GateFused> fused_gates;
-
-    if (gfirst >= glast) return fused_gates;
-
-    std::size_t num_gates = glast - gfirst;
-
-    fused_gates.reserve(num_gates);
-
-    // Merge with measurement gate times to separate fused gates at.
-    auto epochs =
-        Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at);
-
-    LinkManager link_manager(max_qubit1 * num_gates);
-
-    // Auxillary data structures.
-    // Sequence of intermediate fused gates.
-    std::vector<GateF> gates_seq;
-    // Gate "lattice".
-    std::vector<Link*> gates_lat;
-    // Sequences of intermediate fused gates ordered by gate size.
-    std::vector<std::vector<GateF*>> fgates(max_qubit1 + 1);
-
-    gates_seq.reserve(num_gates);
-    gates_lat.reserve(max_qubit1);
-
-    Scratch scratch;
-
-    scratch.data.reserve(1024);
-    scratch.prev1.reserve(32);
-    scratch.prev2.reserve(32);
-    scratch.next1.reserve(32);
-    scratch.next2.reserve(32);
-    scratch.longest_seq.reserve(8);
-    scratch.stack.reserve(8);
-
-    Stat stat;
-    stat.num_gates.resize(max_qubit1 + 1, 0);
-
-    unsigned max_fused_size = std::min(unsigned{6}, param.max_fused_size);
-    max_fused_size = std::min(max_fused_size, max_qubit1);
-
-    std::size_t last_fused_gate_index = 0;
-    auto gate_it = gfirst;
-
-    // Iterate over epochs.
-    for (std::size_t l = 0; l < epochs.size(); ++l) {
-      gates_seq.resize(0);
-      gates_lat.resize(0);
-      gates_lat.resize(max_qubit1, nullptr);
-
-      for (unsigned i = 0; i <= max_qubit1; ++i) {
-        fgates[i].resize(0);
-      }
-
-      uint64_t max_gate_size = 0;
-      GateF* last_mea_gate = nullptr;
-
-      // Iterate over input gates.
-      for (; gate_it < glast; ++gate_it) {
-        const auto& gate = Base::GateToConstRef(*gate_it);
-
-        if (gate.time > epochs[l]) break;
-
-        if (!ValidateGate(gate, max_qubit1, gates_lat)) {
-          fused_gates.resize(0);
-          return fused_gates;
-        }
-
-        // Fill in auxillary data structures.
-
-        if (gate.kind == gate::kMeasurement) {
-          // Measurement gate.
-
-          if (last_mea_gate == nullptr
-              || last_mea_gate->parent->time != gate.time) {
-            gates_seq.push_back({&gate, {}, {}, {}, 0, kMeaCnt});
-            last_mea_gate = &gates_seq.back();
-
-            last_mea_gate->qubits.reserve(max_qubit1);
-            last_mea_gate->links.reserve(max_qubit1);
-
-            ++stat.num_fused_mea_gates;
-          }
-
-          for (auto q : gate.qubits) {
-            last_mea_gate->qubits.push_back(q);
-            last_mea_gate->mask |= uint64_t{1} << q;
-            gates_lat[q] = link_manager.AddBack(last_mea_gate, gates_lat[q]);
-            last_mea_gate->links.push_back(gates_lat[q]);
-          }
-
-          last_mea_gate->gates.push_back(&gate);
-
-          ++stat.num_mea_gates;
-        } else {
-          gates_seq.push_back({&gate, {}, {}, {}, 0, kZero});
-          auto& fgate = gates_seq.back();
-
-          if (gate.controlled_by.size() == 0) {
-            if (max_gate_size < gate.qubits.size()) {
-              max_gate_size = gate.qubits.size();
-            }
-
-            unsigned num_gate_qubits = gate.qubits.size();
-            unsigned size = std::max(max_fused_size, num_gate_qubits);
-
-            fgate.qubits.reserve(size);
-            fgate.links.reserve(size);
-            fgate.gates.reserve(4 * size);
-            fgate.links.reserve(size);
-
-            if (fgates[num_gate_qubits].empty()) {
-              fgates[num_gate_qubits].reserve(num_gates);
-            }
-            fgates[num_gate_qubits].push_back(&fgate);
-
-            ++stat.num_gates[num_gate_qubits];
-          } else {
-            // Controlled gate.
-            // Controlled gates are not fused with other gates.
-
-            uint64_t size = gate.qubits.size() + gate.controlled_by.size();
-
-            fgate.qubits.reserve(gate.qubits.size());
-            fgate.links.reserve(size);
-
-            fgate.visited = kMeaCnt;
-            fgate.gates.push_back(&gate);
-
-            ++stat.num_controlled_gates;
-          }
-
-          for (auto q : gate.qubits) {
-            fgate.qubits.push_back(q);
-            fgate.mask |= uint64_t{1} << q;
-            gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]);
-            fgate.links.push_back(gates_lat[q]);
-          }
-
-          for (auto q : gate.controlled_by) {
-            fgate.mask |= uint64_t{1} << q;
-            gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]);
-            fgate.links.push_back(gates_lat[q]);
-          }
-        }
-      }
-
-      // Fuse large gates with smaller gates.
-      FuseGates(max_gate_size, fgates);
-
-      if (max_fused_size > 2) {
-        FuseGateSequences(
-            max_fused_size, max_qubit1, scratch, gates_seq, stat, fused_gates);
-      } else {
-        unsigned prev_time = 0;
-
-        std::vector<GateF*> orphaned_gates;
-        orphaned_gates.reserve(max_qubit1);
-
-        for (auto& fgate : gates_seq) {
-          if (fgate.gates.size() == 0) continue;
-
-          if (prev_time != fgate.parent->time) {
-            if (orphaned_gates.size() > 0) {
-              FuseOrphanedGates(
-                  max_fused_size, stat, orphaned_gates, fused_gates);
-              orphaned_gates.resize(0);
-            }
-
-            prev_time = fgate.parent->time;
-          }
-
-          if (fgate.qubits.size() == 1 && max_fused_size > 1
-              && fgate.visited != kMeaCnt && !fgate.parent->unfusible) {
-            orphaned_gates.push_back(&fgate);
-            continue;
-          }
-
-          // Assume fgate.qubits (gate.qubits) are sorted.
-          fused_gates.push_back({fgate.parent->kind, fgate.parent->time,
-                                 std::move(fgate.qubits), fgate.parent,
-                                 std::move(fgate.gates), {}});
-
-          if (fgate.visited != kMeaCnt) {
-            ++stat.num_fused_gates;
-          }
-        }
-
-        if (orphaned_gates.size() > 0) {
-          FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
-        }
-      }
-
-      if (fgates[0].size() != 0) {
-        Base::FuseZeroQubitGates(fgates[0],
-                                 [](const GateF* g) { return g->parent; },
-                                 last_fused_gate_index, fused_gates);
-      }
-
-      last_fused_gate_index = fused_gates.size();
-    }
-
-    if (fuse_matrix) {
-      for (auto& fgate : fused_gates) {
-        if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp) {
-          CalculateFusedMatrix(fgate);
-        }
-      }
-    }
-
-    PrintStat(param.verbosity, stat, fused_gates);
-
-    return fused_gates;
-  }
-
- private:
-  // Fuse large gates with smaller gates.
-  static void FuseGates(uint64_t max_gate_size,
-                        std::vector<std::vector<GateF*>>& fgates) {
-    // Traverse gates in order of decreasing size.
-    for (uint64_t i = 0; i < max_gate_size; ++i) {
-      std::size_t pos = 0;
-
-      for (auto fgate : fgates[max_gate_size - i]) {
-        if (fgate->visited > kZero) continue;
-
-        fgates[max_gate_size - i][pos++] = fgate;
-
-        fgate->visited = kFirst;
-
-        FusePrev(0, *fgate);
-        fgate->gates.push_back(fgate->parent);
-        FuseNext(0, *fgate);
-      }
-
-      fgates[max_gate_size - i].resize(pos);
-    }
-  }
-
-  // Try to fuse gate sequences as follows. Gate time goes from bottom to top.
-  // Gates are fused either from left to right or from right to left.
-  //
-  // max_fused_size = 3: _-  or  -_
-  //
-  // max_fused_size = 4: _-_
-  //
-  // max_fused_size = 5: _-_-  or  -_-_
-  //
-  // max_fused_size = 6: _-_-_
-  static void FuseGateSequences(unsigned max_fused_size,
-                                unsigned max_qubit1, Scratch& scratch,
-                                std::vector<GateF>& gates_seq, Stat& stat,
-                                std::vector<GateFused>& fused_gates) {
-    unsigned prev_time = 0;
-
-    std::vector<GateF*> orphaned_gates;
-    orphaned_gates.reserve(max_qubit1);
-
-    for (auto& fgate : gates_seq) {
-      if (prev_time != fgate.parent->time) {
-        if (orphaned_gates.size() > 0) {
-          FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
-          orphaned_gates.resize(0);
-        }
-
-        prev_time = fgate.parent->time;
-      }
-
-      if (fgate.visited == kFinal || fgate.gates.size() == 0) continue;
-
-      if (fgate.visited == kMeaCnt || fgate.qubits.size() >= max_fused_size
-          || fgate.parent->unfusible) {
-        if (fgate.visited != kMeaCnt) {
-          ++stat.num_fused_gates;
-        }
-
-        fgate.visited = kFinal;
-
-        fused_gates.push_back({fgate.parent->kind, fgate.parent->time,
-                               std::move(fgate.qubits), fgate.parent,
-                               std::move(fgate.gates), {}});
-
-        continue;
-      }
-
-
-      if (fgate.qubits.size() == 1 && max_fused_size > 1) {
-        orphaned_gates.push_back(&fgate);
-        continue;
-      }
-
-      scratch.data.resize(0);
-      scratch.gates.resize(0);
-      scratch.count = 0;
-
-      MakeGateSequence(max_fused_size, scratch, fgate);
-
-      if (scratch.gates.size() == 0) {
-        orphaned_gates.push_back(&fgate);
-      } else {
-        for (auto fgate : scratch.gates) {
-          std::sort(fgate->qubits.begin(), fgate->qubits.end());
-
-          fused_gates.push_back({fgate->parent->kind, fgate->parent->time,
-                                 std::move(fgate->qubits), fgate->parent,
-                                 std::move(fgate->gates), {}});
-
-          ++stat.num_fused_gates;
-        }
-      }
-    }
-
-    if (orphaned_gates.size() > 0) {
-      FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates);
-    }
-  }
-
-  static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat,
-                                std::vector<GateF*>& orphaned_gates,
-                                std::vector<GateFused>& fused_gates) {
-    for (std::size_t i = 0; i < orphaned_gates.size(); ++i) {
-      auto ogate1 = orphaned_gates[i];
-
-      if (ogate1->visited == kFinal) continue;
-
-      ogate1->visited = kFinal;
-
-      for (std::size_t j = i + 1; j < orphaned_gates.size(); ++j) {
-        auto ogate2 = orphaned_gates[j];
-
-        if (ogate2->visited == kFinal) continue;
-
-        unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size();
-
-        if (cur_size <= max_fused_size) {
-          ogate2->visited = kFinal;
-
-          for (auto q : ogate2->qubits) {
-            ogate1->qubits.push_back(q);
-            ogate1->mask |= uint64_t{1} << q;
-          }
-
-          for (auto l : ogate2->links) {
-            ogate1->links.push_back(l);
-          }
-
-          for (auto gate : ogate2->gates) {
-            ogate1->gates.push_back(gate);
-          }
-        }
-
-        if (cur_size == max_fused_size) {
-          break;
-        }
-      }
-
-      FuseNext(1, *ogate1);
-
-      std::sort(ogate1->qubits.begin(), ogate1->qubits.end());
-
-      fused_gates.push_back({ogate1->parent->kind, ogate1->parent->time,
-                             std::move(ogate1->qubits), ogate1->parent,
-                             std::move(ogate1->gates), {}});
-
-      ++stat.num_fused_gates;
-    }
-  }
-
-  static void MakeGateSequence(
-      unsigned max_fused_size, Scratch& scratch, GateF& fgate) {
-    unsigned level = kSecond + scratch.count;
-
-    FindLongestGateSequence(max_fused_size, level, scratch, fgate);
-
-    auto longest_seq = scratch.longest_seq;
-
-    if (longest_seq.size() == 1 && scratch.count == 0) {
-      fgate.visited = kFirst;
-      return;
-    }
-
-    ++scratch.count;
-
-    for (auto p : longest_seq) {
-      p->gate->visited = kCompress;
-
-      for (auto q : p->qubits) {
-        fgate.qubits.push_back(q);
-        fgate.mask |= uint64_t{1} << q;
-      }
-
-      for (auto l : p->links) {
-        fgate.links.push_back(l);
-      }
-    }
-
-    // Compress links.
-    for (auto& link : fgate.links) {
-      while (link->prev != nullptr && link->prev->val->visited == kCompress) {
-        link = link->prev;
-      }
-
-      while (link->next != nullptr && link->next->val->visited == kCompress) {
-        LinkManager::Delete(link->next);
-      }
-    }
-
-    for (auto p : longest_seq) {
-      p->gate->visited = level;
-    }
-
-    if (longest_seq.size() >= 3) {
-      AddGatesFromNext(longest_seq[2]->gate->gates, fgate);
-    }
-
-    if (longest_seq.size() >= 5) {
-      AddGatesFromNext(longest_seq[4]->gate->gates, fgate);
-    }
-
-    if (longest_seq.size() >= 2) {
-      // May call MakeGateSequence recursively.
-      AddGatesFromPrev(max_fused_size, *longest_seq[1]->gate, scratch, fgate);
-    }
-
-    if (longest_seq.size() >= 4) {
-      // May call MakeGateSequence recursively.
-      AddGatesFromPrev(max_fused_size, *longest_seq[3]->gate, scratch, fgate);
-    }
-
-    for (auto p : longest_seq) {
-      p->gate->visited = kFinal;
-    }
-
-    FuseNext(1, fgate);
-
-    scratch.gates.push_back(&fgate);
-  }
-
-  static void AddGatesFromNext(std::vector<const RGate*>& gates, GateF& fgate) {
-    for (auto gate : gates) {
-      fgate.gates.push_back(gate);
-    }
-  }
-
-  static void AddGatesFromPrev(unsigned max_fused_size, const GateF& pfgate,
-                               Scratch& scratch, GateF& fgate) {
-    for (auto gate : pfgate.gates) {
-        fgate.gates.push_back(gate);
-    }
-
-    for (auto link : pfgate.links) {
-      if (link->prev == nullptr) continue;
-
-      auto pgate = link->prev->val;
-
-      if (pgate->visited == kFirst) {
-        MakeGateSequence(max_fused_size, scratch, *pgate);
-      }
-    }
-  }
-
-  static void FindLongestGateSequence(
-      unsigned max_fused_size, unsigned level, Scratch& scratch, GateF& fgate) {
-    scratch.data.push_back({&fgate, {}, {}});
-
-    scratch.longest_seq.resize(0);
-    scratch.longest_seq.push_back(&scratch.data.back());
-
-    scratch.stack.resize(0);
-    scratch.stack.push_back(&scratch.data.back());
-
-    unsigned cur_size = fgate.qubits.size();
-    fgate.visited = level;
-
-    unsigned max_size = cur_size;
-
-    GetNextAvailableGates(max_fused_size, cur_size, fgate, nullptr,
-                          scratch.data, scratch.next1);
-
-    for (auto n1 : scratch.next1) {
-      unsigned cur_size2 = cur_size + n1->qubits.size();
-      if (cur_size2 > max_fused_size) continue;
-
-      bool feasible = GetPrevAvailableGates(max_fused_size, cur_size,
-                                            level, *n1->gate, nullptr,
-                                            scratch.data, scratch.prev1);
-
-      if (!feasible) continue;
-
-      if (scratch.prev1.size() == 0 && max_fused_size > 3) continue;
-
-      if (cur_size2 == max_fused_size) {
-        std::swap(scratch.longest_seq, scratch.stack);
-        scratch.longest_seq.push_back(n1);
-        return;
-      }
-
-      Push(level, cur_size2, cur_size, max_size, scratch, n1);
-
-      for (auto p1 : scratch.prev1) {
-        unsigned cur_size2 = cur_size + p1->qubits.size();
-
-        if (cur_size2 > max_fused_size) {
-          continue;
-        } else if (cur_size2 == max_fused_size) {
-          std::swap(scratch.longest_seq, scratch.stack);
-          scratch.longest_seq.push_back(p1);
-          return;
-        }
-
-        Push(level, cur_size2, cur_size, max_size, scratch, p1);
-
-        GetNextAvailableGates(max_fused_size, cur_size, *p1->gate, &fgate,
-                              scratch.data, scratch.next2);
-
-        for (auto n2 : scratch.next2) {
-          unsigned cur_size2 = cur_size + n2->qubits.size();
-          if (cur_size2 > max_fused_size) continue;
-
-          bool feasible = GetPrevAvailableGates(max_fused_size, cur_size,
-                                                level, *n2->gate, n1->gate,
-                                                scratch.data, scratch.prev2);
-
-          if (!feasible) continue;
-
-          if (cur_size2 == max_fused_size) {
-            std::swap(scratch.longest_seq, scratch.stack);
-            scratch.longest_seq.push_back(n2);
-            return;
-          }
-
-          Push(level, cur_size2, cur_size, max_size, scratch, n2);
-
-          for (auto p2 : scratch.prev2) {
-            unsigned cur_size2 = cur_size + p2->qubits.size();
-
-            if (cur_size2 > max_fused_size) {
-              continue;
-            } else if (cur_size2 == max_fused_size) {
-              std::swap(scratch.longest_seq, scratch.stack);
-              scratch.longest_seq.push_back(p2);
-              return;
-            }
-
-            if (cur_size2 > max_size) {
-              scratch.stack.push_back(p2);
-              scratch.longest_seq = scratch.stack;
-              scratch.stack.pop_back();
-              max_size = cur_size2;
-            }
-          }
-
-          Pop(cur_size, scratch, n2);
-        }
-
-        Pop(cur_size, scratch, p1);
-      }
-
-      Pop(cur_size, scratch, n1);
-    }
-  }
-
-  static void Push(unsigned level, unsigned cur_size2, unsigned& cur_size,
-                   unsigned& max_size, Scratch& scratch, GateA* agate) {
-    agate->gate->visited = level;
-    cur_size = cur_size2;
-    scratch.stack.push_back(agate);
-
-    if (cur_size > max_size) {
-      scratch.longest_seq = scratch.stack;
-      max_size = cur_size;
-    }
-  }
-
-  static void Pop(unsigned& cur_size, Scratch& scratch, GateA* agate) {
-    agate->gate->visited = kFirst;
-    cur_size -= agate->qubits.size();
-    scratch.stack.pop_back();
-  }
-
-  static void GetNextAvailableGates(unsigned max_fused_size, unsigned cur_size,
-                                    const GateF& pgate1, const GateF* pgate2,
-                                    std::vector<GateA>& scratch,
-                                    std::vector<GateA*>& next_gates) {
-    next_gates.resize(0);
-
-    for (auto link : pgate1.links) {
-      if (link->next == nullptr) continue;
-
-      auto ngate = link->next->val;
-
-      if (ngate->visited > kFirst || ngate->parent->unfusible) continue;
-
-      GateA next = {ngate, {}, {}};
-      next.qubits.reserve(8);
-      next.links.reserve(8);
-
-      GetAddedQubits(pgate1, pgate2, *ngate, next);
-
-      if (cur_size + next.qubits.size() > max_fused_size) continue;
-
-      scratch.push_back(std::move(next));
-      next_gates.push_back(&scratch.back());
-    }
-  }
-
-  static bool GetPrevAvailableGates(unsigned max_fused_size,
-                                    unsigned cur_size, unsigned level,
-                                    const GateF& ngate1, const GateF* ngate2,
-                                    std::vector<GateA>& scratch,
-                                    std::vector<GateA*>& prev_gates) {
-    prev_gates.resize(0);
-
-    for (auto link : ngate1.links) {
-      if (link->prev == nullptr) continue;
-
-      auto pgate = link->prev->val;
-
-      if (pgate->visited == kFinal || pgate->visited == level) continue;
-
-      if (pgate->visited > kFirst || pgate->parent->unfusible) {
-        prev_gates.resize(0);
-        return false;
-      }
-
-      GateA prev = {pgate, {}, {}};
-      prev.qubits.reserve(8);
-      prev.links.reserve(8);
-
-      GetAddedQubits(ngate1, ngate2, *pgate, prev);
-
-      bool all_prev_visited = true;
-
-      for (auto link : pgate->links) {
-        if (link->prev == nullptr) continue;
-
-        if (link->prev->val->visited <= kMeaCnt) {
-          all_prev_visited = false;
-          break;
-        }
-      }
-
-      if (!all_prev_visited) {
-        prev_gates.resize(0);
-        return false;
-      }
-
-      if (cur_size + prev.qubits.size() > max_fused_size) continue;
-
-      if (all_prev_visited) {
-        scratch.push_back(std::move(prev));
-        prev_gates.push_back(&scratch.back());
-      }
-    }
-
-    return true;
-  }
-
-  static void GetAddedQubits(const GateF& fgate0, const GateF* fgate1,
-                             const GateF& fgate2, GateA& added) {
-    for (std::size_t i = 0; i < fgate2.qubits.size(); ++i) {
-      unsigned q2 = fgate2.qubits[i];
-
-      if (std::find(fgate0.qubits.begin(), fgate0.qubits.end(), q2)
-          != fgate0.qubits.end()) continue;
-
-      if (fgate1 != nullptr
-          && std::find(fgate1->qubits.begin(), fgate1->qubits.end(), q2)
-            != fgate1->qubits.end()) continue;
-
-      added.qubits.push_back(q2);
-      added.links.push_back(fgate2.links[i]);
-    }
-  }
-
-  // Fuse smaller gates with fgate back in gate time.
-  static void FusePrev(unsigned pass, GateF& fgate) {
-    std::vector<const RGate*> gates;
-    gates.reserve(fgate.gates.capacity());
-
-    auto neighbor = [](const Link* link) -> const Link* {
-      return link->prev;
-    };
-
-    FusePrevOrNext<std::greater<unsigned>>(pass, neighbor, fgate, gates);
-
-    for (auto it = gates.rbegin(); it != gates.rend(); ++it) {
-      fgate.gates.push_back(*it);
-    }
-  }
-
-  // Fuse smaller gates with fgate forward in gate time.
-  static void FuseNext(unsigned pass, GateF& fgate) {
-    auto neighbor = [](const Link* link) -> const Link* {
-      return link->next;
-    };
-
-    FusePrevOrNext<std::less<unsigned>>(pass, neighbor, fgate, fgate.gates);
-  }
-
-  template <typename R, typename Neighbor>
-  static void FusePrevOrNext(unsigned pass, Neighbor neighb, GateF& fgate,
-                             std::vector<const RGate*>& gates) {
-    uint64_t bad_mask = 0;
-    auto links = fgate.links;
-
-    bool may_have_gates_to_fuse = true;
-
-    while (may_have_gates_to_fuse) {
-      may_have_gates_to_fuse = false;
-
-      std::sort(links.begin(), links.end(),
-                [&neighb](const Link* l, const Link* r) -> bool {
-                  auto ln = neighb(l);
-                  auto rn = neighb(r);
-
-                  if (ln != nullptr && rn != nullptr) {
-                    return R()(ln->val->parent->time, rn->val->parent->time);
-                  } else {
-                    // nullptrs are larger than everything else and
-                    // equivalent among each other.
-                    return ln != nullptr;
-                  }
-                });
-
-      for (auto link : links) {
-        auto n = neighb(link);
-
-        if (n == nullptr) continue;
-
-        auto g = n->val;
-
-        if (!QubitsAreIn(fgate.mask, g->mask) || (g->mask & bad_mask) != 0
-            || g->visited > pass || g->parent->unfusible) {
-          bad_mask |= g->mask;
-        } else {
-          g->visited = pass == 0 ? kFirst : kFinal;
-
-          if (pass == 0) {
-            gates.push_back(g->parent);
-          } else {
-            for (auto gate : g->gates) {
-              gates.push_back(gate);
-            }
-          }
-
-          for (auto link : g->links) {
-            LinkManager::Delete(link);
-          }
-
-          may_have_gates_to_fuse = true;
-          break;
-        }
-      }
-    }
-  }
-
-  static bool QubitsAreIn(uint64_t mask0, uint64_t mask) {
-    return ((mask0 | mask) ^ mask0) == 0;
-  }
-
-  static void PrintStat(unsigned verbosity, const Stat& stat,
-                        const std::vector<GateFused>& fused_gates) {
-    if (verbosity < 3) return;
-
-    if (stat.num_controlled_gates > 0) {
-      IO::messagef("%lu controlled gates\n", stat.num_controlled_gates);
-    }
-
-    if (stat.num_mea_gates > 0) {
-      IO::messagef("%lu measurement gates", stat.num_mea_gates);
-      if (stat.num_fused_mea_gates == stat.num_mea_gates) {
-        IO::messagef("\n");
-      } else {
-        IO::messagef(" are fused into %lu gates\n", stat.num_fused_mea_gates);
-      }
-    }
-
-    bool first = true;
-    for (unsigned i = 1; i < stat.num_gates.size(); ++i) {
-      if (stat.num_gates[i] > 0) {
-        if (first) {
-          first = false;
-        } else {
-          IO::messagef(", ");
-        }
-        IO::messagef("%u %u-qubit", stat.num_gates[i], i);
-      }
-    }
-
-    IO::messagef(" gates are fused into %lu gates\n", stat.num_fused_gates);
-
-    if (verbosity < 5) return;
-
-    IO::messagef("fused gate qubits:\n");
-    for (const auto& g : fused_gates) {
-      IO::messagef("%6u  ", g.parent->time);
-      if (g.parent->kind == gate::kMeasurement) {
-        IO::messagef("m");
-      } else if (g.parent->controlled_by.size() > 0) {
-        IO::messagef("c");
-        for (auto q : g.parent->controlled_by) {
-          IO::messagef("%3u", q);
-        }
-        IO::messagef("  t");
-      } else {
-        IO::messagef(" ");
-      }
-
-      for (auto q : g.qubits) {
-        IO::messagef("%3u", q);
-      }
-      IO::messagef("\n");
-    }
-  }
-
-  template <typename Gate2, typename GatesLat>
-  static bool ValidateGate(const Gate2& gate, unsigned max_qubit1,
-                           const GatesLat& gates_lat) {
-    for (unsigned q : gate.qubits) {
-      if (q >= max_qubit1) {
-        IO::errorf("fuser: gate qubit %u is out of range "
-                   "(should be smaller than %u).\n", q, max_qubit1);
-        return false;
-      }
-      if (gates_lat[q] != nullptr
-          && gate.time <= gates_lat[q]->val->parent->time) {
-        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
-        return false;
-      }
-    }
-
-    for (unsigned q : gate.controlled_by) {
-      if (q >= max_qubit1) {
-        IO::errorf("fuser: gate qubit %u is out of range "
-                   "(should be smaller than %u).\n", q, max_qubit1);
-        return false;
-      }
-      if (gates_lat[q] != nullptr
-          && gate.time <= gates_lat[q]->val->parent->time) {
-        IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time);
-        return false;
-      }
-    }
-
-    return true;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // FUSER_MQUBIT_H_
diff --git a/tpls/qsim/gate.h b/tpls/qsim/gate.h
deleted file mode 100644
index a457acb..0000000
--- a/tpls/qsim/gate.h
+++ /dev/null
@@ -1,216 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GATE_H_
-#define GATE_H_
-
-#include <algorithm>
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-#include "matrix.h"
-
-namespace qsim {
-
-namespace detail {
-
-template <typename Gate, typename GateDef>
-inline void SortQubits(Gate& gate) {
-  for (std::size_t i = 1; i < gate.qubits.size(); ++i) {
-    if (gate.qubits[i - 1] > gate.qubits[i]) {
-      if (!GateDef::symmetric) {
-        auto perm = NormalToGateOrderPermutation(gate.qubits);
-        MatrixShuffle(perm, gate.qubits.size(), gate.matrix);
-      }
-
-      gate.swapped = true;
-      std::sort(gate.qubits.begin(), gate.qubits.end());
-      break;
-    }
-  }
-}
-
-}  // namespace detail
-
-template <typename Qubits = std::vector<unsigned>, typename Gate>
-inline Gate& MakeControlledGate(Qubits&& controlled_by, Gate& gate) {
-  gate.controlled_by = std::forward<Qubits>(controlled_by);
-  gate.cmask = (uint64_t{1} << gate.controlled_by.size()) - 1;
-
-  std::sort(gate.controlled_by.begin(), gate.controlled_by.end());
-
-  return gate;
-}
-
-template <typename Qubits = std::vector<unsigned>, typename Gate>
-inline Gate& MakeControlledGate(Qubits&& controlled_by,
-                               const std::vector<unsigned>& control_values,
-                               Gate& gate) {
-  // Assume controlled_by.size() == control_values.size().
-
-  bool sorted = true;
-
-  for (std::size_t i = 1; i < controlled_by.size(); ++i) {
-    if (controlled_by[i - 1] > controlled_by[i]) {
-      sorted = false;
-      break;
-    }
-  }
-
-  if (sorted) {
-    gate.controlled_by = std::forward<Qubits>(controlled_by);
-    gate.cmask = 0;
-
-    for (std::size_t i = 0; i < control_values.size(); ++i) {
-      gate.cmask |= (control_values[i] & 1) << i;
-    }
-  } else {
-    struct ControlPair {
-      unsigned q;
-      unsigned v;
-    };
-
-    std::vector<ControlPair> cpairs;
-    cpairs.reserve(controlled_by.size());
-
-    for (std::size_t i = 0; i < controlled_by.size(); ++i) {
-      cpairs.push_back({controlled_by[i], control_values[i]});
-    }
-
-    // Sort control qubits and control values.
-    std::sort(cpairs.begin(), cpairs.end(),
-              [](const ControlPair& l, const ControlPair& r) -> bool {
-                return l.q < r.q;
-              });
-
-    gate.cmask = 0;
-    gate.controlled_by.reserve(controlled_by.size());
-
-    for (std::size_t i = 0; i < cpairs.size(); ++i) {
-      gate.cmask |= (cpairs[i].v & 1) << i;
-      gate.controlled_by.push_back(cpairs[i].q);
-    }
-  }
-
-  return gate;
-}
-
-namespace gate {
-
-constexpr int kDecomp = 100001;       // gate from Schmidt decomposition
-constexpr int kMeasurement = 100002;  // measurement gate
-
-}  // namespace gate
-
-enum GateAnyKind {
-  kGateAny = -1,
-};
-
-/**
- * A generic gate to make it easier to use qsim with external gate sets.
- */
-template <typename FP, typename GK = GateAnyKind>
-struct Gate {
-  using fp_type = FP;
-  using GateKind = GK;
-
-  GateKind kind;
-  unsigned time;
-  std::vector<unsigned> qubits;
-  std::vector<unsigned> controlled_by;
-  uint64_t cmask;
-  std::vector<fp_type> params;
-  Matrix<fp_type> matrix;
-  bool unfusible;      // If true, the gate is fused as a parent.
-  bool swapped;        // If true, the gate qubits are swapped to make qubits
-                       // ordered in ascending order. This does not apply to
-                       // control qubits of explicitly-controlled gates.
-
-  template <typename Qubits = std::vector<unsigned>>
-  Gate&& ControlledBy(Qubits&& controlled_by) {
-    MakeControlledGate(std::forward<Qubits>(controlled_by), *this);
-    return std::move(*this);
-  }
-
-  template <typename Qubits = std::vector<unsigned>>
-  Gate&& ControlledBy(Qubits&& controlled_by,
-                      const std::vector<unsigned>& control_values) {
-    MakeControlledGate(
-        std::forward<Qubits>(controlled_by), control_values, *this);
-    return std::move(*this);
-  }
-};
-
-template <typename Gate, typename GateDef,
-          typename Qubits = std::vector<unsigned>,
-          typename M = Matrix<typename Gate::fp_type>>
-inline Gate CreateGate(unsigned time, Qubits&& qubits, M&& matrix = {},
-                       std::vector<typename Gate::fp_type>&& params = {}) {
-  Gate gate = {GateDef::kind, time, std::forward<Qubits>(qubits), {}, 0,
-               std::move(params), std::forward<M>(matrix), false, false};
-
-  if (GateDef::kind != gate::kMeasurement) {
-    switch (gate.qubits.size()) {
-    case 1:
-      break;
-    case 2:
-      if (gate.qubits[0] > gate.qubits[1]) {
-        gate.swapped = true;
-        std::swap(gate.qubits[0], gate.qubits[1]);
-        if (!GateDef::symmetric) {
-          MatrixShuffle({1, 0}, 2, gate.matrix);
-        }
-      }
-      break;
-    default:
-      detail::SortQubits<Gate, GateDef>(gate);
-    }
-  }
-
-  return gate;
-}
-
-namespace gate {
-
-/**
- * A gate that simulates measurement of one or more qubits, collapsing the
- * state vector and storing the measured results.
- */
-template <typename Gate>
-struct Measurement {
-  using GateKind = typename Gate::GateKind;
-
-  static constexpr GateKind kind = GateKind::kMeasurement;
-  static constexpr char name[] = "m";
-  static constexpr bool symmetric = false;
-
-  template <typename Qubits = std::vector<unsigned>>
-  static Gate Create(unsigned time, Qubits&& qubits) {
-    return CreateGate<Gate, Measurement>(time, std::forward<Qubits>(qubits));
-  }
-};
-
-}  // namespace gate
-
-template <typename fp_type>
-using schmidt_decomp_type = std::vector<std::vector<std::vector<fp_type>>>;
-
-template <typename fp_type, typename GateKind>
-schmidt_decomp_type<fp_type> GetSchmidtDecomp(
-    GateKind kind, const std::vector<fp_type>& params);
-
-}  // namespace qsim
-
-#endif  // GATE_H_
diff --git a/tpls/qsim/gate_appl.h b/tpls/qsim/gate_appl.h
deleted file mode 100644
index 8601e6f..0000000
--- a/tpls/qsim/gate_appl.h
+++ /dev/null
@@ -1,231 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GATE_APPL_H_
-#define GATE_APPL_H_
-
-#include <utility>
-#include <vector>
-
-#include "fuser.h"
-#include "gate.h"
-#include "matrix.h"
-
-namespace qsim {
-
-/**
- * Applies the given gate to the simulator state. Ignores measurement gates.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param state The state of the system, to be updated by this method.
- */
-template <typename Simulator, typename Gate>
-inline void ApplyGate(const Simulator& simulator, const Gate& gate,
-                      typename Simulator::State& state) {
-  if (gate.kind != gate::kMeasurement) {
-    if (gate.controlled_by.size() == 0) {
-      simulator.ApplyGate(gate.qubits, gate.matrix.data(), state);
-    } else {
-      simulator.ApplyControlledGate(gate.qubits, gate.controlled_by,
-                                    gate.cmask, gate.matrix.data(), state);
-    }
-  }
-}
-
-/**
- * Applies the given gate dagger to the simulator state. If the gate matrix is
- *   unitary then this is equivalent to applying the inverse gate. Ignores
- *   measurement gates.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param state The state of the system, to be updated by this method.
- */
-template <typename Simulator, typename Gate>
-inline void ApplyGateDagger(const Simulator& simulator, const Gate& gate,
-                            typename Simulator::State& state) {
-  if (gate.kind != gate::kMeasurement) {
-    auto matrix = gate.matrix;
-    MatrixDagger(unsigned{1} << gate.qubits.size(), matrix);
-
-    if (gate.controlled_by.size() == 0) {
-      simulator.ApplyGate(gate.qubits, matrix.data(), state);
-    } else {
-      simulator.ApplyControlledGate(gate.qubits, gate.controlled_by,
-                                    gate.cmask, matrix.data(), state);
-    }
-  }
-}
-
-/**
- * Applies the given gate to the simulator state.
- * @param state_space StateSpace object required to perform measurements.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param rgen Random number generator to perform measurements.
- * @param state The state of the system, to be updated by this method.
- * @param mresults As an input parameter, this can be empty or this can
- *   contain the results of the previous measurements. If gate is a measurement
- *   gate then after a successful run, the measurement result will be added to
- *   this.
- * @return True if the measurement performed successfully; false otherwise.
- */
-template <typename Simulator, typename Gate, typename Rgen>
-inline bool ApplyGate(
-    const typename Simulator::StateSpace& state_space,
-    const Simulator& simulator, const Gate& gate, Rgen& rgen,
-    typename Simulator::State& state,
-    std::vector<typename Simulator::StateSpace::MeasurementResult>& mresults) {
-  if (gate.kind == gate::kMeasurement) {
-    auto measure_result = state_space.Measure(gate.qubits, rgen, state);
-    if (measure_result.valid) {
-      mresults.push_back(std::move(measure_result));
-    } else {
-      return false;
-    }
-  } else {
-    ApplyGate(simulator, gate, state);
-  }
-
-  return true;
-}
-
-/**
- * Applies the given gate to the simulator state, discarding measurement
- *   results.
- * @param state_space StateSpace object required to perform measurements.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param rgen Random number generator to perform measurements.
- * @param state The state of the system, to be updated by this method.
- * @return True if the measurement performed successfully; false otherwise.
- */
-template <typename Simulator, typename Gate, typename Rgen>
-inline bool ApplyGate(const typename Simulator::StateSpace& state_space,
-                      const Simulator& simulator, const Gate& gate, Rgen& rgen,
-                      typename Simulator::State& state) {
-  using MeasurementResult = typename Simulator::StateSpace::MeasurementResult;
-  std::vector<MeasurementResult> discarded_results;
-  return
-      ApplyGate(state_space, simulator, gate, rgen, state, discarded_results);
-}
-
-/**
- * Applies the given fused gate to the simulator state. Ignores measurement
- *   gates.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param state The state of the system, to be updated by this method.
- */
-template <typename Simulator, typename Gate>
-inline void ApplyFusedGate(const Simulator& simulator, const Gate& gate,
-                           typename Simulator::State& state) {
-  if (gate.kind != gate::kMeasurement) {
-    if (gate.parent->controlled_by.size() == 0) {
-      simulator.ApplyGate(gate.qubits, gate.matrix.data(), state);
-    } else {
-      simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by,
-                                    gate.parent->cmask, gate.matrix.data(),
-                                    state);
-    }
-  }
-}
-
-/**
- * Applies the given fused gate dagger to the simulator state. If the gate
- *   matrix is unitary then this is equivalent to applying the inverse gate.
- *   Ignores measurement gates.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param state The state of the system, to be updated by this method.
- */
-template <typename Simulator, typename Gate>
-inline void ApplyFusedGateDagger(const Simulator& simulator, const Gate& gate,
-                                 typename Simulator::State& state) {
-  if (gate.kind != gate::kMeasurement) {
-    auto matrix = gate.matrix;
-    MatrixDagger(unsigned{1} << gate.qubits.size(), matrix);
-
-    if (gate.parent->controlled_by.size() == 0) {
-      simulator.ApplyGate(gate.qubits, matrix.data(), state);
-    } else {
-      simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by,
-                                    gate.parent->cmask, matrix.data(), state);
-    }
-  }
-}
-
-/**
- * Applies the given fused gate to the simulator state.
- * @param state_space StateSpace object required to perform measurements.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param rgen Random number generator to perform measurements.
- * @param state The state of the system, to be updated by this method.
- * @param mresults As an input parameter, this can be empty or this can
- *   contain the results of the previous measurements. If gate is a measurement
- *   gate then after a successful run, the measurement result will be added to
- *   this.
- * @return True if the measurement performed successfully; false otherwise.
- */
-template <typename Simulator, typename Gate, typename Rgen>
-inline bool ApplyFusedGate(
-    const typename Simulator::StateSpace& state_space,
-    const Simulator& simulator, const Gate& gate, Rgen& rgen,
-    typename Simulator::State& state,
-    std::vector<typename Simulator::StateSpace::MeasurementResult>& mresults) {
-  if (gate.kind == gate::kMeasurement) {
-    auto measure_result = state_space.Measure(gate.qubits, rgen, state);
-    if (measure_result.valid) {
-      mresults.push_back(std::move(measure_result));
-    } else {
-      return false;
-    }
-  } else {
-    ApplyFusedGate(simulator, gate, state);
-  }
-
-  return true;
-}
-
-/**
- * Applies the given fused gate to the simulator state, discarding measurement
- *   results.
- * @param state_space StateSpace object required to perform measurements.
- * @param simulator Simulator object. Provides specific implementations for
- *   applying gates.
- * @param gate The gate to be applied.
- * @param rgen Random number generator to perform measurements.
- * @param state The state of the system, to be updated by this method.
- * @return True if the measurement performed successfully; false otherwise.
- */
-template <typename Simulator, typename Gate, typename Rgen>
-inline bool ApplyFusedGate(const typename Simulator::StateSpace& state_space,
-                           const Simulator& simulator, const Gate& gate,
-                           Rgen& rgen, typename Simulator::State& state) {
-  using MeasurementResult = typename Simulator::StateSpace::MeasurementResult;
-  std::vector<MeasurementResult> discarded_results;
-  return ApplyFusedGate(
-      state_space, simulator, gate, rgen, state, discarded_results);
-}
-
-}  // namespace qsim
-
-#endif  // GATE_APPL_H_
diff --git a/tpls/qsim/gates_cirq.h b/tpls/qsim/gates_cirq.h
deleted file mode 100644
index d767959..0000000
--- a/tpls/qsim/gates_cirq.h
+++ /dev/null
@@ -1,1640 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GATES_CIRQ_H_
-#define GATES_CIRQ_H_
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <vector>
-
-#include "gate.h"
-#include "matrix.h"
-
-namespace qsim {
-
-namespace Cirq {
-
-enum GateKind {
-  kI1 = 0,     // One-qubit identity gate.
-  kI2,         // Two-qubit identity gate.
-  kI,          // Multi-qubit identity gate.
-  kXPowGate,
-  kYPowGate,
-  kZPowGate,
-  kHPowGate,
-  kCZPowGate,
-  kCXPowGate,
-  krx,
-  kry,
-  krz,
-  kH,
-  kS,
-  kCZ,
-  kCX,
-  kT,
-  kX,
-  kY,
-  kZ,
-  kPhasedXPowGate,
-  kPhasedXZGate,
-  kXXPowGate,
-  kYYPowGate,
-  kZZPowGate,
-  kXX,
-  kYY,
-  kZZ,
-  kSwapPowGate,
-  kISwapPowGate,
-  kriswap,
-  kSWAP,
-  kISWAP,
-  kPhasedISwapPowGate,
-  kgivens,
-  kFSimGate,
-  kTwoQubitDiagonalGate,
-  kThreeQubitDiagonalGate,
-  kCCZPowGate,
-  kCCXPowGate,
-  kCSwapGate,
-  kCCZ,
-  kCCX,
-  kMatrixGate1,  // One-qubit matrix gate.
-  kMatrixGate2,  // Two-qubit matrix gate.
-  kMatrixGate,   // Multi-qubit matrix gate.
-  kGlobalPhaseGate,
-  kDecomp = gate::kDecomp,
-  kMeasurement = gate::kMeasurement,
-};
-
-template <typename fp_type>
-using GateCirq = Gate<fp_type, GateKind>;
-
-constexpr double h_double = 0.5;
-constexpr double pi_double = 3.14159265358979323846264338327950288;
-constexpr double is2_double = 0.7071067811865475;
-
-// Gates from cirq/ops/global_phase_op.py:
-
-/**
- * The global phase gate.
- */
-template <typename fp_type>
-struct GlobalPhaseGate {
-  static constexpr GateKind kind = kGlobalPhaseGate;
-  static constexpr char name[] = "GlobalPhaseGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, fp_type phi) {
-    return Create(time, std::cos(phi), std::sin(phi));
-  }
-
-  static GateCirq<fp_type> Create(unsigned time, fp_type cp, fp_type sp) {
-    return CreateGate<GateCirq<fp_type>, GlobalPhaseGate>(
-        time, {}, {cp, sp}, {cp, sp});
-  }
-};
-
-template <typename fp_type>
-using global_phase_operation = GlobalPhaseGate<fp_type>;
-
-// Gates from cirq/ops/identity.py:
-
-/**
- * A one-qubit identity gate.
- */
-template <typename fp_type>
-struct I1 {
-  static constexpr GateKind kind = kI1;
-  static constexpr char name[] = "I1";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, I1>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0});
-  }
-};
-
-/**
- * A two-qubit identity gate.
- */
-template <typename fp_type>
-struct I2 {
-  static constexpr GateKind kind = kI2;
-  static constexpr char name[] = "I2";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, I2>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-    };
-  }
-};
-
-/**
- * A multi-qubit identity gate.
- */
-template <typename fp_type>
-struct I {
-  static constexpr GateKind kind = kI;
-  static constexpr char name[] = "I";
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  const std::vector<unsigned>& qubits) {
-    Matrix<fp_type> matrix;
-    MatrixIdentity(1 << qubits.size(), matrix);
-    return CreateGate<GateCirq<fp_type>, I>(time, qubits, std::move(matrix));
-  }
-};
-
-// Gates form cirq/ops/common_gates.py:
-
-/**
- * A gate that rotates around the X axis of the Bloch sphere.
- * This is a generalization of the X gate.
- */
-template <typename fp_type>
-struct XPowGate {
-  static constexpr GateKind kind = kXPowGate;
-  static constexpr char name[] = "XPowGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, XPowGate>(
-        time, {q0}, {c * gc, c * gs, s * gs, -s * gc,
-                     s * gs, -s * gc, c * gc, c * gs},
-        {exponent, global_shift});
-  }
-};
-
-/**
- * A gate that rotates around the Y axis of the Bloch sphere.
- * This is a generalization of the Y gate.
- */
-template <typename fp_type>
-struct YPowGate {
-  static constexpr GateKind kind = kYPowGate;
-  static constexpr char name[] = "YPowGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, YPowGate>(
-        time, {q0}, {c * gc, c * gs, -s * gc, -s * gs,
-                     s * gc, s * gs, c * gc, c * gs}, {exponent, global_shift});
-  }
-};
-
-/**
- * A gate that rotates around the Z axis of the Bloch sphere.
- * This is a generalization of the Z gate.
- */
-template <typename fp_type>
-struct ZPowGate {
-  static constexpr GateKind kind = kZPowGate;
-  static constexpr char name[] = "ZPowGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-
-    return CreateGate<GateCirq<fp_type>, ZPowGate>(
-        time, {q0}, {gc, gs, 0, 0, 0, 0, c * gc - s * gs, c * gs + s * gc},
-        {exponent, global_shift});
-  }
-};
-
-/**
- * A gate that rotates around the X+Z axis of the Bloch sphere.
- * This is a generalization of the Hadamard gate.
- */
-template <typename fp_type>
-struct HPowGate {
-  static constexpr GateKind kind = kHPowGate;
-  static constexpr char name[] = "HPowGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type gs = std::sin(pi * exponent * (0.5 + global_shift));
-
-    fp_type a = s * gs * is2;
-    fp_type b = s * gc * is2;
-
-    return CreateGate<GateCirq<fp_type>, HPowGate>(
-        time, {q0}, {c * gc + a, c * gs - b, a, -b,
-                     a, -b, c * gc - a, c * gs + b}, {exponent, global_shift});
-  }
-};
-
-/**
- * A gate that applies a phase to the |11⟩ state of two qubits.
- * This is a generalization of the CZ gate.
- */
-template <typename fp_type>
-struct CZPowGate {
-  static constexpr GateKind kind = kCZPowGate;
-  static constexpr char name[] = "CZPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (1 + global_shift));
-    fp_type es = std::sin(pi * exponent * (1 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, CZPowGate>(
-        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
-                         0, 0, gc, gs, 0, 0, 0, 0,
-                         0, 0, 0, 0, gc, gs, 0, 0,
-                         0, 0, 0, 0, 0, 0, ec, es}, {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (1 + global_shift));
-    fp_type es = std::sin(pi * exponent * (1 + global_shift));
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {gc, gs, 0, 0, 0, 0, ec, es}},
-    };
-  }
-};
-
-/**
- * A gate that applies a controlled power of an X gate.
- * This is a generalization of the CX (or CNOT) gate.
- */
-template <typename fp_type>
-struct CXPowGate {
-  static constexpr GateKind kind = kCXPowGate;
-  static constexpr char name[] = "CXPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, CXPowGate>(
-        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
-                         0, 0, c * ec, c * es, 0, 0, s * es, -s * ec,
-                         0, 0, 0, 0, gc, gs, 0, 0,
-                         0, 0, s * es, -s * ec, 0, 0, c * ec, c * es},
-        {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {c * ec, c * es, s * es, -s * ec,
-                                  s * es, -s * ec, c * ec, c * es}},
-    };
-  }
-};
-
-/**
- * The `(exponent = phi/pi, global_shift = -0.5)` instance of XPowGate.
- * This is a generalization of the X gate with a fixed global phase.
- * This is a function in Cirq.
- */
-template <typename fp_type>
-struct rx {
-  static constexpr GateKind kind = krx;
-  static constexpr char name[] = "rx";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type c = std::cos(-0.5 * phi);
-    fp_type s = std::sin(-0.5 * phi);
-
-    return CreateGate<GateCirq<fp_type>, rx>(
-        time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi});
-  }
-};
-
-/**
- * The `(exponent = phi/pi, global_shift = -0.5)` instance of YPowGate.
- * This is a generalization of the Y gate with a fixed global phase.
- * This is a function in Cirq.
- */
-template <typename fp_type>
-struct ry {
-  static constexpr GateKind kind = kry;
-  static constexpr char name[] = "ry";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type c = std::cos(-0.5 * phi);
-    fp_type s = std::sin(-0.5 * phi);
-
-    return CreateGate<GateCirq<fp_type>, ry>(
-        time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi});
-  }
-};
-
-/**
- * The `(exponent = phi/pi, global_shift = -0.5)` instance of ZPowGate.
- * This is a generalization of the Z gate with a fixed global phase.
- * This is a function in Cirq.
- */
-template <typename fp_type>
-struct rz {
-  static constexpr GateKind kind = krz;
-  static constexpr char name[] = "rz";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type c = std::cos(-0.5 * phi);
-    fp_type s = std::sin(-0.5 * phi);
-
-    return CreateGate<GateCirq<fp_type>, rz>(
-        time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of HPowGate.
- * This is the canonical Hadamard (or H) gate.
- */
-template <typename fp_type>
-struct H {
-  static constexpr GateKind kind = kH;
-  static constexpr char name[] = "H";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, H>(
-        time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0});
-  }
-};
-
-/**
- * The `(exponent = 0.5, global_shift = 0)` instance of ZPowGate.
- * This is the canonical S gate.
- */
-template <typename fp_type>
-struct S {
-  static constexpr GateKind kind = kS;
-  static constexpr char name[] = "S";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, S>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1});
-  }
-};
-
-/**
- * The `(exponent = 0.25, global_shift = 0)` instance of ZPowGate.
- * This is the canonical T gate.
- */
-template <typename fp_type>
-struct T {
-  static constexpr GateKind kind = kT;
-  static constexpr char name[] = "T";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, T>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of CZPowGate.
- * This is the canonical CZ gate.
- */
-template <typename fp_type>
-struct CZ {
-  static constexpr GateKind kind = kCZ;
-  static constexpr char name[] = "CZ";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, CZ>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, -1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}},
-    };
-  }
-};
-
-template <typename fp_type>
-using CNotPowGate = CXPowGate<fp_type>;
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of CXPowGate.
- * This is the canonical CX (or CNOT) gate.
- */
-template <typename fp_type>
-struct CX {
-  static constexpr GateKind kind = kCX;
-  static constexpr char name[] = "kCX";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, CX>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},
-    };
-  }
-};
-
-template <typename fp_type>
-using CNOT = CX<fp_type>;
-
-// Gates from cirq/ops/pauli_gates.py:
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of XPowGate.
- * This is the canonical Pauli X gate.
- */
-template <typename fp_type>
-struct X : public XPowGate<fp_type> {
-  static constexpr GateKind kind = kX;
-  static constexpr char name[] = "X";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, X>(
-        time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of YPowGate.
- * This is the canonical Pauli Y gate.
- */
-template <typename fp_type>
-struct Y : public YPowGate<fp_type> {
-  static constexpr GateKind kind = kY;
-  static constexpr char name[] = "Y";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, Y>(
-        time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of ZPowGate.
- * This is the canonical Pauli Z gate.
- */
-template <typename fp_type>
-struct Z : public ZPowGate<fp_type> {
-  static constexpr GateKind kind = kZ;
-  static constexpr char name[] = "Z";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateCirq<fp_type>, Z>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0});
-  }
-};
-
-// Gates from cirq/ops/phased_x_gate.py:
-
-/**
- * An XPowGate conjugated by ZPowGate%s.
- * Equivalent to the circuit `───Z^-p───X^t───Z^p───`.
- */
-template <typename fp_type>
-struct PhasedXPowGate {
-  static constexpr GateKind kind = kPhasedXPowGate;
-  static constexpr char name[] = "PhasedXPowGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type phase_exponent, fp_type exponent = 1,
-                                  fp_type global_shift = 0) {
-    fp_type pc = std::cos(pi * phase_exponent);
-    fp_type ps = std::sin(pi * phase_exponent);
-    fp_type ec = std::cos(pi * exponent);
-    fp_type es = std::sin(pi * exponent);
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-
-    fp_type ar = 0.5 * ((1 + ec) * gc - es * gs);
-    fp_type ai = 0.5 * ((1 + ec) * gs + es * gc);
-    fp_type br = -0.5 * ((-1 + ec) * gc - es * gs);
-    fp_type bi = -0.5 * ((-1 + ec) * gs + es * gc);
-
-    return CreateGate<GateCirq<fp_type>, PhasedXPowGate>(
-        time, {q0}, {ar, ai, pc * br + ps * bi, pc * bi - ps * br,
-                     pc * br - ps * bi, pc * bi + ps * br, ar, ai},
-        {phase_exponent, exponent, global_shift});
-  }
-};
-
-// Gates from cirq/ops/phased_x_z_gate.py:
-
-/**
- * A PhasedXPowGate followed by a ZPowGate.
- * Equivalent to the circuit `───Z^(-a)──X^x──Z^a───Z^z───`.
- */
-template <typename fp_type>
-struct PhasedXZGate {
-  static constexpr GateKind kind = kPhasedXZGate;
-  static constexpr char name[] = "PhasedXZGate";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  fp_type x_exponent, fp_type z_exponent,
-                                  fp_type axis_phase_exponent) {
-    fp_type xc = std::cos(pi * x_exponent);
-    fp_type xs = std::sin(pi * x_exponent);
-    fp_type zc = std::cos(pi * z_exponent);
-    fp_type zs = std::sin(pi * z_exponent);
-    fp_type ac = std::cos(pi * axis_phase_exponent);
-    fp_type as = std::sin(pi * axis_phase_exponent);
-
-    fp_type br = 0.5 * (1 + xc);
-    fp_type bi = 0.5 * xs;
-    fp_type cr = -0.5 * (-1 + xc);
-    fp_type ci = -0.5 * xs;
-    fp_type dr = ac * zc - as * zs;
-    fp_type di = ac * zs + as * zc;
-
-    return CreateGate<GateCirq<fp_type>, PhasedXZGate>(
-        time, {q0}, {br, bi, ac * cr + as * ci, ac * ci - as * cr,
-                     dr * cr - di * ci, dr * ci + di * cr,
-                     zc * br - zs * bi, zc * bi + zs * br},
-        {x_exponent, z_exponent, axis_phase_exponent});
-  }
-};
-
-// Gates from cirq/ops/parity_gates.py:
-
-/**
- * The tensor product of two X gates, possibly raised to an exponent.
- */
-template <typename fp_type>
-struct XXPowGate {
-  static constexpr GateKind kind = kXXPowGate;
-  static constexpr char name[] = "XXPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
-    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
-    fp_type xc = 0.5 * ((1 - c) * gc + s * gs);
-    fp_type xs = 0.5 * ((1 - c) * gs - s * gc);
-
-    return CreateGate<GateCirq<fp_type>, XXPowGate>(
-        time, {q0, q1}, {ic, is, 0, 0, 0, 0, xc, xs,
-                         0, 0, ic, is, xc, xs, 0, 0,
-                         0, 0, xc, xs, ic, is, 0, 0,
-                         xc, xs, 0, 0, 0, 0, ic, is}, {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
-    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
-    fp_type xc = 0.5 * ((1 - c) * gc + s * gs);
-    fp_type xs = 0.5 * ((1 - c) * gs - s * gc);
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},
-      {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, xc, xs, xc, xs, 0, 0}},
-    };
-  }
-};
-
-/**
- * The tensor product of two Y gates, possibly raised to an exponent.
- */
-template <typename fp_type>
-struct YYPowGate {
-  static constexpr GateKind kind = kYYPowGate;
-  static constexpr char name[] = "YYPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
-    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
-    fp_type yc = 0.5 * ((1 - c) * gc + s * gs);
-    fp_type ys = 0.5 * ((1 - c) * gs - s * gc);
-
-    return CreateGate<GateCirq<fp_type>, YYPowGate>(
-        time, {q0, q1}, {ic, is, 0, 0, 0, 0, -yc, -ys,
-                         0, 0, ic, is, yc, ys, 0, 0,
-                         0, 0, yc, ys, ic, is, 0, 0,
-                         -yc, -ys, 0, 0, 0, 0, ic, is},
-        {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
-    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
-    fp_type yc = 0.5 * ((1 - c) * gc + s * gs);
-    fp_type ys = 0.5 * ((1 - c) * gs - s * gc);
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},
-      {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, ys, -yc, -ys, yc, 0, 0}},
-    };
-  }
-};
-
-/**
- * The tensor product of two Z gates, possibly raised to an exponent.
- */
-template <typename fp_type>
-struct ZZPowGate {
-  static constexpr GateKind kind = kZZPowGate;
-  static constexpr char name[] = "ZZPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type zc = std::cos(pi * exponent * (1 + global_shift));
-    fp_type zs = std::sin(pi * exponent * (1 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, ZZPowGate>(
-        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
-                         0, 0, zc, zs, 0, 0, 0, 0,
-                         0, 0, 0, 0, zc, zs, 0, 0,
-                         0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent);
-    fp_type s = std::sin(pi * exponent);
-    fp_type ic = 0.5 * ((1 + c) * gc - s * gs);
-    fp_type is = 0.5 * ((1 + c) * gs + s * gc);
-    fp_type zc = 0.5 * ((1 - c) * gc + s * gs);
-    fp_type zs = 0.5 * ((1 - c) * gs - s * gc);
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}},
-      {{1, 0, 0, 0, 0, 0, -1, 0}, {zc, zs, 0, 0, 0, 0, -zc, -zs}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of XXPowGate.
- * This is the tensor product of two X gates.
- */
-template <typename fp_type>
-struct XX {
-  static constexpr GateKind kind = kXX;
-  static constexpr char name[] = "XX";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, XX>(
-        time, {q0, q1}, {0, 0, 0, 0, 0, 0, 1, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         1, 0, 0, 0, 0, 0, 0, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of YYPowGate.
- * This is the tensor product of two Y gates.
- */
-template <typename fp_type>
-struct YY {
-  static constexpr GateKind kind = kYY;
-  static constexpr char name[] = "YY";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, YY>(
-        time, {q0, q1}, {0, 0, 0, 0, 0, 0, -1, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         -1, 0, 0, 0, 0, 0, 0, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, 0, -1, 0, 1, 0, 0}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of ZZPowGate.
- * This is the tensor product of two Z gates.
- */
-template <typename fp_type>
-struct ZZ {
-  static constexpr GateKind kind = kZZ;
-  static constexpr char name[] = "ZZ";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, ZZ>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, -1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, -1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, -1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}},
-    };
-  }
-};
-
-// Gates from cirq/ops/swap_gates.py:
-
-/**
- * The SWAP gate, possibly raised to a power. Exchanges qubits.
- */
-template <typename fp_type>
-struct SwapPowGate {
-  static constexpr GateKind kind = kSwapPowGate;
-  static constexpr char name[] = "SwapPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, SwapPowGate>(
-        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
-                         0, 0, c * ec, c * es, s * es, -s * ec, 0, 0,
-                         0, 0, s * es, -s * ec, c * ec, c * es, 0, 0,
-                         0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
-
-    return schmidt_decomp_type<fp_type>{
-      {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * ec, gs + c * es, 0, 0,
-                                  0, 0, gc + c * ec, gs + c * es}},
-      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * es, -s * ec,
-                                  s * es, -s * ec, 0, 0}},
-      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, -s * ec, -s * es,
-                                   s * ec, s * es, 0, 0}},
-      {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * ec, gs - c * es, 0, 0,
-                                   0, 0, -gc + c * ec, -gs + c * es}},
-    };
-  }
-};
-
-/**
- * Rotates the |01⟩ vs |10⟩ subspace of two qubits around its Bloch X-axis.
- * This is a generalization of the ISWAP gate.
- */
-template <typename fp_type>
-struct ISwapPowGate {
-  static constexpr GateKind kind = kISwapPowGate;
-  static constexpr char name[] = "ISwapPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-
-    return CreateGate<GateCirq<fp_type>, ISwapPowGate>(
-        time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0,
-                         0, 0, c * gc, c * gs, -s * gs, s * gc, 0, 0,
-                         0, 0, -s * gs, s * gc, c * gc, c * gs, 0, 0,
-                         0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type exponent, fp_type global_shift) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-
-    return schmidt_decomp_type<fp_type>{
-      {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * gc, gs + c * gs, 0, 0,
-                                  0, 0, gc + c * gc, gs + c * gs}},
-      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, -s * gs, s * gc,
-                                  -s * gs, s * gc, 0, 0}},
-      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * gc, s * gs,
-                                   -s * gc, -s * gs, 0, 0}},
-      {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * gc, gs - c * gs, 0, 0,
-                                   0, 0, -gc + c * gc, -gs + c * gs}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 2*phi/pi, global_shift = 0)` instance of ISwapPowGate.
- * This is a generalization of the ISWAP gate with a fixed global phase of zero.
- * This is a function in Cirq.
- */
-template <typename fp_type>
-struct riswap {
-  static constexpr GateKind kind = kriswap;
-  static constexpr char name[] = "riswap";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type phi) {
-    fp_type c = std::cos(phi);
-    fp_type s = std::sin(phi);
-
-    return CreateGate<GateCirq<fp_type>, riswap>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, c, 0, 0, s, 0, 0,
-                         0, 0, 0, s, c, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0}, {phi});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
-    fp_type c = std::cos(phi);
-    fp_type s = std::sin(phi);
-
-    return schmidt_decomp_type<fp_type>{
-      {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}},
-      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, 0, s, 0, s, 0, 0}},
-      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}},
-      {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of SwapPowGate.
- * This is the canonical SWAP gate.
- */
-template <typename fp_type>
-struct SWAP {
-  static constexpr GateKind kind = kSWAP;
-  static constexpr char name[] = "SWAP";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, SWAP>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
-      {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}},
-      {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}},
-      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
-    };
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of ISwapPowGate.
- * This is the canonical ISWAP gate.
- */
-template <typename fp_type>
-struct ISWAP {
-  static constexpr GateKind kind = kISWAP;
-  static constexpr char name[] = "ISWAP";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateCirq<fp_type>, ISWAP>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 1, 0, 0,
-                         0, 0, 0, 1, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
-      {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}},
-      {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}},
-      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
-    };
-  }
-};
-
-// Gates from cirq/ops/phased_iswap_gate.py:
-
-/**
- * An ISwapPowGate conjugated by ZPowGate%s.
- * Equivalent to the composition `(Z^-p ⊗ Z^p) ISWAP^t (Z^p ⊗ Z^-p)`.
- */
-template <typename fp_type>
-struct PhasedISwapPowGate {
-  static constexpr GateKind kind = kPhasedISwapPowGate;
-  static constexpr char name[] = "PhasedISwapPowGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type phase_exponent = 0.25,
-                                  fp_type exponent = 1.0) {
-    fp_type fc = std::cos(2 * pi * phase_exponent);
-    fp_type fs = std::sin(2 * pi * phase_exponent);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, PhasedISwapPowGate>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, c, 0, s * fs, s * fc, 0, 0,
-                         0, 0, -s * fs, s * fc, c, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0}, {phase_exponent, exponent});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type phase_exponent, fp_type exponent) {
-    fp_type fc = std::cos(2 * pi * phase_exponent);
-    fp_type fs = std::sin(2 * pi * phase_exponent);
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-
-    return schmidt_decomp_type<fp_type>{
-      {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}},
-      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * fs, s * fc, -s * fs, s * fc, 0, 0}},
-      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * fc, -s * fs,
-                                   -s * fc, -s * fs, 0, 0}},
-      {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}},
-    };
-  }
-};
-
-/**
- * The `(phase_exponent = 0.25, exponent = 2*phi/pi)` instance of
- * PhasedISwapPowGate.
- * This is the "Givens rotation" from numerical linear algebra.
- * This is a function in Cirq.
- */
-template <typename fp_type>
-struct givens {
-  static constexpr GateKind kind = kgivens;
-  static constexpr char name[] = "givens";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
-                                  fp_type phi) {
-    fp_type c = std::cos(phi);
-    fp_type s = std::sin(phi);
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, givens>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, c, 0, s, 0, 0, 0,
-                         0, 0, -s, 0, c, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0}, {phi});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
-    fp_type c = std::cos(phi);
-    fp_type s = std::sin(phi);
-
-    return schmidt_decomp_type<fp_type>{
-      {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}},
-      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}},
-      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, 0, -s, 0, -s, 0, 0}},
-      {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}},
-    };
-  }
-};
-
-// Gates from cirq/ops/fsim_gate.py:
-
-/**
- * The fermionic simulation gate family. Contains all two-qubit interactions
- * that preserve excitations, up to single-qubit rotations and global phase.
- */
-template <typename fp_type>
-struct FSimGate {
-  static constexpr GateKind kind = kFSimGate;
-  static constexpr char name[] = "FSimGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateCirq<fp_type> Create(
-      unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) {
-    if (phi < 0) {
-      phi += 2 * 3.141592653589793;
-    }
-
-    fp_type ct = std::cos(theta);
-    fp_type st = std::sin(theta);
-    fp_type cp = std::cos(phi);
-    fp_type sp = std::sin(phi);
-
-    return CreateGate<GateCirq<fp_type>, FSimGate>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, ct, 0, 0, -st, 0, 0,
-                         0, 0, 0, -st, ct, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type theta, fp_type phi) {
-    fp_type ct = std::cos(theta);
-    fp_type st = std::sin(theta);
-
-    fp_type cp2 = std::cos(0.5 * phi);
-    fp_type sp2 = std::sin(0.5 * phi);
-    fp_type cp4 = std::cos(0.25 * phi);
-    fp_type sp4 = std::sin(0.25 * phi);
-
-    fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct));
-    fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct));
-
-    fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct);
-    fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct);
-
-    fp_type c0 = is2 * a0 * std::cos(p0);
-    fp_type s0 = is2 * a0 * std::sin(p0);
-
-    fp_type c1 = is2 * a1 * std::cos(p1);
-    fp_type s1 = is2 * a1 * std::sin(p1);
-
-    fp_type st2 = 0.5 * std::sqrt(st);
-
-    fp_type a = cp4 * c0 - sp4 * s0;
-    fp_type b = cp4 * s0 + sp4 * c0;
-    fp_type c = cp4 * c0 + sp4 * s0;
-    fp_type d = cp4 * s0 - sp4 * c0;
-
-    fp_type e = cp4 * c1 - sp4 * s1;
-    fp_type f = cp4 * s1 + sp4 * c1;
-    fp_type g = -(cp4 * c1 + sp4 * s1);
-    fp_type h = -(cp4 * s1 - sp4 * c1);
-
-    return schmidt_decomp_type<fp_type>{
-      {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}},
-      {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}},
-      {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}},
-      {{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}},
-    };
-  }
-};
-
-// Gates from cirq/ops/two_qubit_diagonal_gate.py:
-
-/**
- * A two-qubit diagonal gate.
- */
-template <typename fp_type>
-struct TwoQubitDiagonalGate {
-  static constexpr GateKind kind = kTwoQubitDiagonalGate;
-  static constexpr char name[] = "TwoQubitDiagonalGate";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1,
-                                  const std::vector<fp_type>& angles) {
-    std::vector<fp_type> cs;
-    std::vector<fp_type> ss;
-    cs.reserve(4);
-    ss.reserve(4);
-
-    for (std::size_t i = 0; i < angles.size(); ++i) {
-      cs.push_back(std::cos(angles[i]));
-      ss.push_back(std::sin(angles[i]));
-    }
-
-    for (std::size_t i = angles.size(); i < 4; ++i) {
-      cs.push_back(1);
-      ss.push_back(0);
-    }
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, TwoQubitDiagonalGate>(
-        time, {q0, q1}, {cs[0], ss[0], 0, 0, 0, 0, 0, 0,
-                         0, 0, cs[2], ss[2], 0, 0, 0, 0,
-                         0, 0, 0, 0, cs[1], ss[1], 0, 0,
-                         0, 0, 0, 0, 0, 0, cs[3], ss[3]});
-  }
-};
-
-// Gates from cirq/ops/three_qubit_gates.py:
-
-/**
- * A three-qubit diagonal gate.
- */
-template <typename fp_type>
-struct ThreeQubitDiagonalGate {
-  static constexpr GateKind kind = kThreeQubitDiagonalGate;
-  static constexpr char name[] = "ThreeQubitDiagonalGate";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2,
-                                  const std::vector<fp_type>& angles) {
-    std::vector<fp_type> cs;
-    std::vector<fp_type> ss;
-    cs.reserve(8);
-    ss.reserve(8);
-
-    for (std::size_t i = 0; i < angles.size(); ++i) {
-      cs.push_back(std::cos(angles[i]));
-      ss.push_back(std::sin(angles[i]));
-    }
-
-    for (std::size_t i = angles.size(); i < 8; ++i) {
-      cs.push_back(1);
-      ss.push_back(0);
-    }
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, ThreeQubitDiagonalGate>(
-        time, {q0, q1, q2},
-        {cs[0], ss[0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, cs[4], ss[4], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, cs[2], ss[2], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, cs[6], ss[6], 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, cs[1], ss[1], 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[5], ss[5], 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[3], ss[3], 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[7], ss[7]});
-  }
-};
-
-/**
- * A gate that applies a phase to the |111⟩ state of three qubits.
- * This is a generalization of the CCZ gate.
- */
-template <typename fp_type>
-struct CCZPowGate {
-  static constexpr GateKind kind = kCCZPowGate;
-  static constexpr char name[] = "CCZPowGate";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (1 + global_shift));
-    fp_type es = std::sin(pi * exponent * (1 + global_shift));
-
-    return CreateGate<GateCirq<fp_type>, CCZPowGate>(
-        time, {q0, q1, q2}, {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ec, es},
-                            {exponent, global_shift});
-  }
-};
-
-/**
- * A gate that applies a doubly-controlled power of an X gate.
- * This is a generalization of the CCX (or CCNOT) gate.
- */
-template <typename fp_type>
-struct CCXPowGate {
-  static constexpr GateKind kind = kCCXPowGate;
-  static constexpr char name[] = "CCXPowGate";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2,
-                                  fp_type exponent, fp_type global_shift = 0) {
-    fp_type c = std::cos(pi * exponent * 0.5);
-    fp_type s = std::sin(pi * exponent * 0.5);
-    fp_type gc = std::cos(pi * exponent * global_shift);
-    fp_type gs = std::sin(pi * exponent * global_shift);
-    fp_type ec = std::cos(pi * exponent * (0.5 + global_shift));
-    fp_type es = std::sin(pi * exponent * (0.5 + global_shift));
-
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, CCXPowGate>(
-        time, {q0, q1, q2},
-        {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, c * ec, c * es, 0, 0, 0, 0, 0, 0, s * es, -s * ec,
-         0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0,
-         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0,
-         0, 0, 0, 0, 0, 0, s * es, -s * ec, 0, 0, 0, 0, 0, 0, c * ec, c * es},
-        {exponent, global_shift});
-  }
-};
-
-/**
- * A controlled swap gate (the Fredkin gate).
- */
-template <typename fp_type>
-struct CSwapGate {
-  static constexpr GateKind kind = kCSwapGate;
-  static constexpr char name[] = "CSwapGate";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2) {
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, CSwapGate>(
-        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of CCZPowGate.
- * This is the canonical doubly-controlled Z gate.
- */
-template <typename fp_type>
-struct CCZ {
-  static constexpr GateKind kind = kCCZ;
-  static constexpr char name[] = "CCZ";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2) {
-    return CreateGate<GateCirq<fp_type>, CCZ>(
-        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0});
-  }
-};
-
-/**
- * The `(exponent = 1, global_shift = 0)` instance of CCXPowGate.
- * This is the canonical doubly-controlled X gate (the TOFFOLI gate).
- */
-template <typename fp_type>
-struct CCX {
-  static constexpr GateKind kind = kCCX;
-  static constexpr char name[] = "CCX";
-  static constexpr unsigned num_qubits = 3;
-  static constexpr bool symmetric = false;
-
-  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
-
-  static GateCirq<fp_type> Create(unsigned time,
-                                  unsigned q0, unsigned q1, unsigned q2) {
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateCirq<fp_type>, CCX>(
-        time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
-                             0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0});
-  }
-};
-
-template <typename fp_type>
-using CCNotPowGate = CCXPowGate<fp_type>;
-
-template <typename fp_type>
-using TOFFOLI = CCX<fp_type>;
-
-template <typename fp_type>
-using CCNOT = CCX<fp_type>;
-
-template <typename fp_type>
-using CSWAP = CSwapGate<fp_type>;
-
-template <typename fp_type>
-using FREDKIN = CSwapGate<fp_type>;
-
-// Gates from cirq/ops/matrix_gates.py:
-
-/**
- * A one-qubit gate defined entirely by its matrix.
- */
-template <typename fp_type>
-struct MatrixGate1 {
-  static constexpr GateKind kind = kMatrixGate1;
-  static constexpr char name[] = "MatrixGate1";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateCirq<fp_type> Create(unsigned time, unsigned q0,
-                                  const Matrix<fp_type>& m) {
-    auto m2 = m;
-    return
-        CreateGate<GateCirq<fp_type>, MatrixGate1>(time, {q0}, std::move(m2));
-  }
-};
-
-/**
- * A two-qubit gate defined entirely by its matrix.
- */
-template <typename fp_type>
-struct MatrixGate2 {
-  static constexpr GateKind kind = kMatrixGate2;
-  static constexpr char name[] = "MatrixGate2";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  template <typename M = Matrix<fp_type>>
-  static GateCirq<fp_type> Create(
-      unsigned time, unsigned q0, unsigned q1, M&& m) {
-    return CreateGate<GateCirq<fp_type>, MatrixGate2>(time, {q1, q0},
-                                                      std::forward<M>(m));
-  }
-};
-
-/**
- * A multi-qubit gate defined entirely by its matrix.
- */
-template <typename fp_type>
-struct MatrixGate {
-  static constexpr GateKind kind = kMatrixGate;
-  static constexpr char name[] = "MatrixGate";
-  static constexpr bool symmetric = false;
-
-  template <typename M = Matrix<fp_type>>
-  static GateCirq<fp_type> Create(unsigned time,
-                                  std::vector<unsigned> qubits, M&& m) {
-    std::reverse(qubits.begin(), qubits.end());
-    return CreateGate<GateCirq<fp_type>, MatrixGate>(time, std::move(qubits),
-                                                     std::forward<M>(m));
-  }
-};
-
-}  // namesapce Cirq
-
-template <typename fp_type>
-inline schmidt_decomp_type<fp_type> GetSchmidtDecomp(
-    Cirq::GateKind kind, const std::vector<fp_type>& params) {
-  switch (kind) {
-  case Cirq::kI2:
-    return Cirq::I2<fp_type>::SchmidtDecomp();
-  case Cirq::kCZPowGate:
-    return Cirq::CZPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kCXPowGate:
-    return Cirq::CXPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kCZ:
-    return Cirq::CZ<fp_type>::SchmidtDecomp();
-  case Cirq::kCX:
-    return Cirq::CX<fp_type>::SchmidtDecomp();
-  case Cirq::kXXPowGate:
-    return Cirq::XXPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kYYPowGate:
-    return Cirq::YYPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kZZPowGate:
-    return Cirq::ZZPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kXX:
-    return Cirq::XX<fp_type>::SchmidtDecomp();
-  case Cirq::kYY:
-    return Cirq::YY<fp_type>::SchmidtDecomp();
-  case Cirq::kZZ:
-    return Cirq::ZZ<fp_type>::SchmidtDecomp();
-  case Cirq::kSwapPowGate:
-    return Cirq::SwapPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kISwapPowGate:
-    return Cirq::ISwapPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case Cirq::kriswap:
-    return Cirq::riswap<fp_type>::SchmidtDecomp(params[0]);
-  case Cirq::kSWAP:
-    return Cirq::SWAP<fp_type>::SchmidtDecomp();
-  case Cirq::kISWAP:
-    return Cirq::ISWAP<fp_type>::SchmidtDecomp();
-  case Cirq::kPhasedISwapPowGate:
-    return Cirq::PhasedISwapPowGate<fp_type>::SchmidtDecomp(
-        params[0], params[1]);
-  case Cirq::kgivens:
-    return Cirq::givens<fp_type>::SchmidtDecomp(params[0]);
-  case Cirq::kFSimGate:
-    return Cirq::FSimGate<fp_type>::SchmidtDecomp(params[0], params[1]);
-  default:
-    // Single qubit gates of gates with unimplemented Schmidt decomposition.
-    return schmidt_decomp_type<fp_type>{};
-  }
-}
-
-}  // namespace qsim
-
-#endif  // GATES_CIRQ_H_
diff --git a/tpls/qsim/gates_qsim.h b/tpls/qsim/gates_qsim.h
deleted file mode 100644
index 366c4f1..0000000
--- a/tpls/qsim/gates_qsim.h
+++ /dev/null
@@ -1,661 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef GATES_QSIM_H_
-#define GATES_QSIM_H_
-
-#include <array>
-#include <cmath>
-#include <vector>
-
-#include "gate.h"
-
-namespace qsim {
-
-// Gate set implemented in qsim contains the following gates.
-enum GateKind {
-  kGateId1 = 0, // one-qubit Id
-  kGateHd,      // Hadamard
-  kGateT,       // T
-  kGateX,       // X
-  kGateY,       // Y
-  kGateZ,       // Z
-  kGateX2,      // sqrt(X)
-  kGateY2,      // sqrt(Y)
-  kGateRX,      // X-rotation
-  kGateRY,      // Y-rotation
-  kGateRZ,      // Z-rotation
-  kGateRXY,     // XY-rotation (rotation around arbitrary axis in the XY plane)
-  kGateHZ2,     // pi / 2 rotation around the X + Y axis
-  kGateS,       // S
-  kGateId2,     // two-qubit Id
-  kGateCZ,      // CZ
-  kGateCNot,    // CNOT (CX)
-  kGateSwap,    // swap
-  kGateIS,      // iSwap
-  kGateFS,      // fSim
-  kGateCP,      // control phase
-  kGateMatrix1, // one-qubit matrix gate
-  kGateMatrix2, // two-qubit matrix gate
-  kGateGPh,     // global phase gate
-  kDecomp = gate::kDecomp,
-  kMeasurement = gate::kMeasurement,
-};
-
-// Specialization of Gate (defined in gate.h) for the qsim gate set.
-template <typename fp_type>
-using GateQSim = Gate<fp_type, GateKind>;
-
-constexpr double h_double = 0.5;
-constexpr double is2_double = 0.7071067811865475;
-
-// Zero-qubit gates:
-
-/**
- * The global phase gate.
- */
-template <typename fp_type>
-struct GateGPh {
-  static constexpr GateKind kind = kGateGPh;
-  static constexpr char name[] = "p";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, fp_type phi) {
-    return Create(time, std::cos(phi), std::sin(phi));
-  }
-
-  static GateQSim<fp_type> Create(unsigned time, fp_type cp, fp_type sp) {
-    return CreateGate<GateQSim<fp_type>, GateGPh>(
-        time, {}, {cp, sp}, {cp, sp});
-  }
-};
-
-// One-qubit gates:
-
-/**
- * The one-qubit identity gate.
- */
-template <typename fp_type>
-struct GateId1 {
-  static constexpr GateKind kind = kGateId1;
-  static constexpr char name[] = "id1";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateId1>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0});
-  }
-};
-
-/**
- * The Hadamard gate.
- */
-template <typename fp_type>
-struct GateHd {
-  static constexpr GateKind kind = kGateHd;
-  static constexpr char name[] = "h";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateHd>(
-        time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0});
-  }
-};
-
-/**
- * The T gate, equivalent to `Z ^ 0.25`.
- */
-template <typename fp_type>
-struct GateT {
-  static constexpr GateKind kind = kGateT;
-  static constexpr char name[] = "t";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateT>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2});
-  }
-};
-
-/**
- * The Pauli X (or "NOT") gate.
- */
-template <typename fp_type>
-struct GateX {
-  static constexpr GateKind kind = kGateX;
-  static constexpr char name[] = "x";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateX>(
-        time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0});
-  }
-};
-
-/**
- * The Pauli Y gate.
- */
-template <typename fp_type>
-struct GateY {
-  static constexpr GateKind kind = kGateY;
-  static constexpr char name[] = "y";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateY>(
-        time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0});
-  }
-};
-
-/**
- * The Pauli Z gate.
- */
-template <typename fp_type>
-struct GateZ {
-  static constexpr GateKind kind = kGateZ;
-  static constexpr char name[] = "z";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateZ>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0});
-  }
-};
-
-/**
- * The "square root of X" gate.
- */
-template <typename fp_type>
-struct GateX2 {
-  static constexpr GateKind kind = kGateX2;
-  static constexpr char name[] = "x_1_2";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateX2>(
-        time, {q0}, {h, h, h, -h, h, -h, h, h});
-  }
-};
-
-/**
- * The "square root of Y" gate.
- */
-template <typename fp_type>
-struct GateY2 {
-  static constexpr GateKind kind = kGateY2;
-  static constexpr char name[] = "y_1_2";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateY2>(
-        time, {q0}, {h, h, -h, -h, h, h, h, h});
-  }
-};
-
-/**
- * A gate that rotates around the X axis of the Bloch sphere.
- * This is a generalization of the X gate.
- */
-template <typename fp_type>
-struct GateRX {
-  static constexpr GateKind kind = kGateRX;
-  static constexpr char name[] = "rx";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type phi2 = -0.5 * phi;
-    fp_type c = std::cos(phi2);
-    fp_type s = std::sin(phi2);
-
-    return CreateGate<GateQSim<fp_type>, GateRX>(
-        time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi});
-  }
-};
-
-/**
- * A gate that rotates around the Y axis of the Bloch sphere.
- * This is a generalization of the Y gate.
- */
-template <typename fp_type>
-struct GateRY {
-  static constexpr GateKind kind = kGateRY;
-  static constexpr char name[] = "ry";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type phi2 = -0.5 * phi;
-    fp_type c = std::cos(phi2);
-    fp_type s = std::sin(phi2);
-
-    return CreateGate<GateQSim<fp_type>, GateRY>(
-        time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi});
-  }
-};
-
-/**
- * A gate that rotates around the Z axis of the Bloch sphere.
- * This is a generalization of the Z gate.
- */
-template <typename fp_type>
-struct GateRZ {
-  static constexpr GateKind kind = kGateRZ;
-  static constexpr char name[] = "rz";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
-    fp_type phi2 = -0.5 * phi;
-    fp_type c = std::cos(phi2);
-    fp_type s = std::sin(phi2);
-
-    return CreateGate<GateQSim<fp_type>, GateRZ>(
-        time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi});
-  }
-};
-
-/**
- * A gate that rotates around an arbitrary axis in the XY-plane.
- */
-template <typename fp_type>
-struct GateRXY {
-  static constexpr GateKind kind = kGateRXY;
-  static constexpr char name[] = "rxy";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(
-      unsigned time, unsigned q0, fp_type theta, fp_type phi) {
-    fp_type phi2 = -0.5 * phi;
-    fp_type cp = std::cos(phi2);
-    fp_type sp = std::sin(phi2);
-    fp_type ct = std::cos(theta) * sp;
-    fp_type st = std::sin(theta) * sp;
-
-    return CreateGate<GateQSim<fp_type>, GateRXY>(
-        time, {q0}, {cp, 0, st, ct, -st, ct, cp, 0}, {theta, phi});
-  }
-};
-
-/**
- * A pi / 2 rotation around the X + Y axis.
- */
-template <typename fp_type>
-struct GateHZ2 {
-  static constexpr GateKind kind = kGateHZ2;
-  static constexpr char name[] = "hz_1_2";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateHZ2>(
-        time, {q0}, {h, h, 0, -is2, is2, 0, h, h});
-  }
-};
-
-/**
- * The S gate, equivalent to "square root of Z".
- */
-template <typename fp_type>
-struct GateS {
-  static constexpr GateKind kind = kGateS;
-  static constexpr char name[] = "s";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
-    return CreateGate<GateQSim<fp_type>, GateS>(
-        time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1});
-  }
-};
-
-/**
- * A one-qubit gate defined entirely by its matrix.
- */
-template <typename fp_type>
-struct GateMatrix1 {
-  static constexpr GateKind kind = kGateMatrix1;
-  static constexpr char name[] = "mat1";
-  static constexpr unsigned num_qubits = 1;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0,
-                                  const Matrix<fp_type>& m) {
-    auto m2 = m;
-    return
-        CreateGate<GateQSim<fp_type>, GateMatrix1>(time, {q0}, std::move(m2));
-  }
-};
-
-// Two-qubit gates:
-
-/**
- * The two-qubit identity gate.
- */
-template <typename fp_type>
-struct GateId2 {
-  static constexpr GateKind kind = kGateId2;
-  static constexpr char name[] = "id2";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateQSim<fp_type>, GateId2>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-    };
-  }
-};
-
-/**
- * The controlled-Z (CZ) gate.
- */
-template <typename fp_type>
-struct GateCZ {
-  static constexpr GateKind kind = kGateCZ;
-  static constexpr char name[] = "cz";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateQSim<fp_type>, GateCZ>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, -1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}},
-    };
-  }
-};
-
-/**
- * The controlled-X (CX or CNOT) gate.
- */
-template <typename fp_type>
-struct GateCNot {
-  static constexpr GateKind kind = kGateCNot;
-  static constexpr char name[] = "cnot";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    // Matrix is in this form because the simulator uses inverse qubit order.
-    return CreateGate<GateQSim<fp_type>, GateCNot>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},
-    };
-  }
-};
-
-/**
- * The SWAP gate. Exchanges two qubits.
- */
-template <typename fp_type>
-struct GateSwap {
-  static constexpr GateKind kind = kGateSwap;
-  static constexpr char name[] = "sw";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateQSim<fp_type>, GateSwap>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
-      {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}},
-      {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}},
-      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
-    };
-  }
-};
-
-/**
- * The ISWAP gate.
- */
-template <typename fp_type>
-struct GateIS {
-  static constexpr GateKind kind = kGateIS;
-  static constexpr char name[] = "is";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type h = static_cast<fp_type>(h_double);
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
-    return CreateGate<GateQSim<fp_type>, GateIS>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 1, 0, 0,
-                         0, 0, 0, 1, 0, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, 1, 0});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
-    return schmidt_decomp_type<fp_type>{
-      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
-      {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}},
-      {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}},
-      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
-    };
-  }
-};
-
-/**
- * The fermionic simulation (FSim) gate family. Contains all two-qubit
- * interactions that preserve excitations, up to single-qubit rotations and
- * global phase.
- */
-template <typename fp_type>
-struct GateFS {
-  static constexpr GateKind kind = kGateFS;
-  static constexpr char name[] = "fs";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
-
-  static GateQSim<fp_type> Create(
-      unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) {
-    if (phi < 0) {
-      phi += 2 * 3.141592653589793;
-    }
-
-    fp_type ct = std::cos(theta);
-    fp_type st = std::sin(theta);
-    fp_type cp = std::cos(phi);
-    fp_type sp = std::sin(phi);
-
-    return CreateGate<GateQSim<fp_type>, GateFS>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, ct, 0, 0, -st, 0, 0,
-                         0, 0, 0, -st, ct, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(
-      fp_type theta, fp_type phi) {
-    fp_type ct = std::cos(theta);
-    fp_type st = std::sin(theta);
-
-    fp_type cp2 = std::cos(0.5 * phi);
-    fp_type sp2 = std::sin(0.5 * phi);
-    fp_type cp4 = std::cos(0.25 * phi);
-    fp_type sp4 = std::sin(0.25 * phi);
-
-    fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct));
-    fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct));
-
-    fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct);
-    fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct);
-
-    fp_type c0 = is2 * a0 * std::cos(p0);
-    fp_type s0 = is2 * a0 * std::sin(p0);
-
-    fp_type c1 = is2 * a1 * std::cos(p1);
-    fp_type s1 = is2 * a1 * std::sin(p1);
-
-    fp_type st2 = 0.5 * std::sqrt(st);
-
-    fp_type a = cp4 * c0 - sp4 * s0;
-    fp_type b = cp4 * s0 + sp4 * c0;
-    fp_type c = cp4 * c0 + sp4 * s0;
-    fp_type d = cp4 * s0 - sp4 * c0;
-
-    fp_type e = cp4 * c1 - sp4 * s1;
-    fp_type f = cp4 * s1 + sp4 * c1;
-    fp_type g = -(cp4 * c1 + sp4 * s1);
-    fp_type h = -(cp4 * s1 - sp4 * c1);
-
-    return schmidt_decomp_type<fp_type>{
-      {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}},
-      {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}},
-      {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}},
-      {{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}},
-    };
-  }
-};
-
-/**
- * The controlled phase gate. A generalized version of GateCZ.
- */
-template <typename fp_type>
-struct GateCP {
-  static constexpr GateKind kind = kGateCP;
-  static constexpr char name[] = "cp";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = true;
-
-  static GateQSim<fp_type> Create(
-      unsigned time, unsigned q0, unsigned q1, fp_type phi) {
-    fp_type cp = std::cos(phi);
-    fp_type sp = std::sin(phi);
-
-    return CreateGate<GateQSim<fp_type>, GateCP>(
-        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
-                         0, 0, 1, 0, 0, 0, 0, 0,
-                         0, 0, 0, 0, 1, 0, 0, 0,
-                         0, 0, 0, 0, 0, 0, cp, -sp}, {phi});
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
-    fp_type cp = std::cos(phi);
-    fp_type sp = std::sin(phi);
-
-    return schmidt_decomp_type<fp_type>{
-      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
-      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, cp, -sp}},
-    };
-  }
-};
-
-/**
- * A two-qubit gate defined entirely by its matrix.
- */
-template <typename fp_type>
-struct GateMatrix2 {
-  static constexpr GateKind kind = kGateMatrix2;
-  static constexpr char name[] = "mat2";
-  static constexpr unsigned num_qubits = 2;
-  static constexpr bool symmetric = false;
-
-  template <typename M = Matrix<fp_type>>
-  static GateQSim<fp_type> Create(
-      unsigned time, unsigned q0, unsigned q1, M&& m) {
-    return CreateGate<GateQSim<fp_type>, GateMatrix2>(time, {q1, q0},
-                                                      std::forward<M>(m));
-  }
-
-  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
-    // Not implemented.
-    return schmidt_decomp_type<fp_type>{};
-  }
-};
-
-template <typename fp_type>
-inline schmidt_decomp_type<fp_type> GetSchmidtDecomp(
-    GateKind kind, const std::vector<fp_type>& params) {
-  switch (kind) {
-  case kGateId2:
-    return GateId2<fp_type>::SchmidtDecomp();
-  case kGateCZ:
-    return GateCZ<fp_type>::SchmidtDecomp();
-  case kGateCNot:
-    return GateCNot<fp_type>::SchmidtDecomp();
-  case kGateSwap:
-    return GateSwap<fp_type>::SchmidtDecomp();
-  case kGateIS:
-    return GateIS<fp_type>::SchmidtDecomp();
-  case kGateFS:
-    return GateFS<fp_type>::SchmidtDecomp(params[0], params[1]);
-  case kGateCP:
-    return GateCP<fp_type>::SchmidtDecomp(params[0]);
-  default:
-    // Single qubit gates: empty Schmidt decomposition.
-    return schmidt_decomp_type<fp_type>{};
-  }
-}
-
-}  // namespace qsim
-
-#endif  // GATES_QSIM_H_
diff --git a/tpls/qsim/hybrid.h b/tpls/qsim/hybrid.h
deleted file mode 100644
index 44fad5b..0000000
--- a/tpls/qsim/hybrid.h
+++ /dev/null
@@ -1,612 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef HYBRID_H_
-#define HYBRID_H_
-
-#include <algorithm>
-#include <array>
-#include <complex>
-#include <vector>
-
-#include "gate.h"
-#include "gate_appl.h"
-
-namespace qsim {
-
-/**
- * Hybrid Feynman-Schrodinger simulator.
- */
-template <typename IO, typename GateT,
-          template <typename, typename> class FuserT, typename For>
-struct HybridSimulator final {
- public:
-  using Gate = GateT;
-  using GateKind = typename Gate::GateKind;
-  using fp_type = typename Gate::fp_type;
-
- private:
-  // Note that one can use "struct GateHybrid : public Gate {" in C++17.
-  struct GateHybrid {
-    using GateKind = HybridSimulator::GateKind;
-    using fp_type = HybridSimulator::fp_type;
-
-    GateKind kind;
-    unsigned time;
-    std::vector<unsigned> qubits;
-    std::vector<unsigned> controlled_by;
-    uint64_t cmask;
-    std::vector<fp_type> params;
-    Matrix<fp_type> matrix;
-    bool unfusible;
-    bool swapped;
-
-    const Gate* parent;
-    unsigned id;
-  };
-
-  struct GateX {
-    GateHybrid* decomposed0;
-    GateHybrid* decomposed1;
-    schmidt_decomp_type<fp_type> schmidt_decomp;
-    unsigned schmidt_bits;
-    unsigned swapped;
-  };
-
- public:
-  using Fuser = FuserT<IO, GateHybrid>;
-  using GateFused = typename Fuser::GateFused;
-
-  /**
-   * Contextual data for hybrid simulation.
-   */
-  struct HybridData {
-    /**
-     * List of gates on the "0" side of the cut.
-     */
-    std::vector<GateHybrid> gates0;
-    /**
-     * List of gates on the "1" side of the cut.
-     */
-    std::vector<GateHybrid> gates1;
-    /**
-     * List of gates on the cut.
-     */
-    std::vector<GateX> gatexs;
-    /**
-     * Global qubit index to local qubit index map.
-     */
-    std::vector<unsigned> qubit_map;
-    /**
-     * Number of qubits on the "0" side of the cut.
-     */
-    unsigned num_qubits0;
-    /**
-     * Number of qubits on the "1" side of the cut.
-     */
-    unsigned num_qubits1;
-    /**
-     * Number of gates on the cut.
-     */
-    unsigned num_gatexs;
-  };
-
-  /**
-   * User-specified parameters for gate fusion and hybrid simulation.
-   */
-  struct Parameter : public Fuser::Parameter {
-    /**
-     * Fixed bitstring indicating values to assign to Schmidt decomposition
-     * indices of prefix gates.
-     */
-    uint64_t prefix;
-    /**
-     * Number of gates on the cut that are part of the prefix. Indices of these
-     * gates are assigned the value indicated by `prefix`.
-     */
-    unsigned num_prefix_gatexs;
-    /**
-     * Number of gates on the cut that are part of the root. All gates that are
-     * not part of the prefix or root are part of the suffix.
-     */
-    unsigned num_root_gatexs;
-    unsigned num_threads;
-  };
-
-  template <typename... Args>
-  explicit HybridSimulator(Args&&... args) : for_(args...) {}
-
-  /**
-   * Splits the lattice into two parts, using Schmidt decomposition for gates
-   * on the cut.
-   * @param parts Lattice sections to be simulated.
-   * @param gates List of all gates in the circuit.
-   * @param hd Output data with split parts.
-   * @return True if the splitting done successfully; false otherwise.
-   */
-  static bool SplitLattice(const std::vector<unsigned>& parts,
-                           const std::vector<Gate>& gates, HybridData& hd) {
-    hd.num_gatexs = 0;
-    hd.num_qubits0 = 0;
-    hd.num_qubits1 = 0;
-
-    hd.gates0.reserve(gates.size());
-    hd.gates1.reserve(gates.size());
-    hd.qubit_map.reserve(parts.size());
-
-    unsigned count0 = 0;
-    unsigned count1 = 0;
-
-    // Global qubit index to local qubit index map.
-    for (std::size_t i = 0; i < parts.size(); ++i) {
-      parts[i] == 0 ? ++hd.num_qubits0 : ++hd.num_qubits1;
-      hd.qubit_map.push_back(parts[i] == 0 ? count0++ : count1++);
-    }
-
-    // Split the lattice.
-    for (const auto& gate : gates) {
-      if (gate.kind == gate::kMeasurement) {
-        IO::errorf("measurement gates are not suported by qsimh.\n");
-        return false;
-      }
-
-      if (gate.controlled_by.size() > 0) {
-        IO::errorf("controlled gates are not suported by qsimh.\n");
-        return false;
-      }
-
-      switch (gate.qubits.size()) {
-      case 1:  // Single qubit gates.
-        switch (parts[gate.qubits[0]]) {
-        case 0:
-          hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time,
-            {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix,
-            false, false, nullptr, 0});
-          break;
-        case 1:
-          hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time,
-            {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix,
-            false, false, nullptr, 0});
-          break;
-        }
-        break;
-      case 2:  // Two qubit gates.
-        {
-          switch ((parts[gate.qubits[1]] << 1) | parts[gate.qubits[0]]) {
-          case 0:  // Both qubits in part 0.
-            hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time,
-              {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]},
-              {}, 0, gate.params, gate.matrix, false, gate.swapped,
-              nullptr, 0});
-            break;
-          case 1:  // Gate on the cut, qubit 0 in part 1, qubit 1 in part 0.
-            hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
-              {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {},
-              true, gate.swapped, &gate, hd.num_gatexs});
-            hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
-              {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {},
-              true, gate.swapped, &gate, hd.num_gatexs});
-
-            ++hd.num_gatexs;
-            break;
-          case 2:  // Gate on the cut, qubit 0 in part 0, qubit 1 in part 1.
-            hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
-              {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {},
-              true, gate.swapped, &gate, hd.num_gatexs});
-            hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time,
-              {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {},
-              true, gate.swapped, &gate, hd.num_gatexs});
-
-            ++hd.num_gatexs;
-            break;
-          case 3:  // Both qubits in part 1.
-            hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time,
-              {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]},
-              {}, 0, gate.params, gate.matrix, false, gate.swapped,
-              nullptr, 0});
-            break;
-          }
-        }
-        break;
-      default:
-        IO::errorf("multi-qubit gates are not suported by qsimh.\n");
-        return false;
-      }
-    }
-
-    auto compare = [](const GateHybrid& l, const GateHybrid& r) -> bool {
-      return l.time < r.time || (l.time == r.time &&
-          (l.parent < r.parent || (l.parent == r.parent && l.id < r.id)));
-    };
-
-    // Sort gates.
-    std::sort(hd.gates0.begin(), hd.gates0.end(), compare);
-    std::sort(hd.gates1.begin(), hd.gates1.end(), compare);
-
-    hd.gatexs.reserve(hd.num_gatexs);
-
-    // Get Schmidt matrices.
-    for (auto& gate0 : hd.gates0) {
-      if (gate0.parent != nullptr) {
-        auto d = GetSchmidtDecomp(gate0.parent->kind, gate0.parent->params);
-        if (d.size() == 0) {
-          IO::errorf("no Schmidt decomposition for gate kind %u.\n",
-                     gate0.parent->kind);
-          return false;
-        }
-
-        unsigned schmidt_bits = SchmidtBits(d.size());
-        if (schmidt_bits > 2) {
-          IO::errorf("Schmidt rank is too large for gate kind %u.\n",
-                     gate0.parent->kind);
-          return false;
-        }
-
-        unsigned swapped = parts[gate0.parent->qubits[0]];
-        if (gate0.parent->swapped) swapped = 1 - swapped;
-        hd.gatexs.emplace_back(GateX{&gate0, nullptr, std::move(d),
-                                     schmidt_bits, swapped});
-      }
-    }
-
-    unsigned count = 0;
-    for (auto& gate1 : hd.gates1) {
-      if (gate1.parent != nullptr) {
-        hd.gatexs[count++].decomposed1 = &gate1;
-      }
-    }
-
-    for (auto& gatex : hd.gatexs) {
-      if (gatex.schmidt_decomp.size() == 1) {
-        FillSchmidtMatrices(0, gatex);
-      }
-    }
-
-    return true;
-  }
-
-  /**
-   * Runs the hybrid simulator on a sectioned lattice.
-   * @param param Options for parallelism and logging. Also specifies the size
-   *   of the 'prefix' and 'root' sections of the lattice.
-   * @param factory Object to create simulators and state spaces.
-   * @param hd Container object for gates on the boundary between lattice
-   *   sections.
-   * @param parts Lattice sections to be simulated.
-   * @param fgates0 List of gates from one section of the lattice.
-   * @param fgates1 List of gates from the other section of the lattice.
-   * @param bitstrings List of output states to simulate, as bitstrings.
-   * @param results Output vector of amplitudes. After a successful run, this
-   *   will be populated with amplitudes for each state in 'bitstrings'.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Factory, typename Results>
-  bool Run(const Parameter& param, const Factory& factory,
-           HybridData& hd, const std::vector<unsigned>& parts,
-           const std::vector<GateFused>& fgates0,
-           const std::vector<GateFused>& fgates1,
-           const std::vector<uint64_t>& bitstrings, Results& results) const {
-    using Simulator = typename Factory::Simulator;
-    using StateSpace = typename Simulator::StateSpace;
-    using State = typename StateSpace::State;
-
-    unsigned num_p_gates = param.num_prefix_gatexs;
-    unsigned num_pr_gates = num_p_gates + param.num_root_gatexs;
-
-    auto bits = CountSchmidtBits(param, hd.gatexs);
-
-    uint64_t rmax = uint64_t{1} << bits.num_r_bits;
-    uint64_t smax = uint64_t{1} << bits.num_s_bits;
-
-    auto loc0 = CheckpointLocations(param, fgates0);
-    auto loc1 = CheckpointLocations(param, fgates1);
-
-    struct Index {
-      unsigned i0;
-      unsigned i1;
-    };
-
-    std::vector<Index> indices;
-    indices.reserve(bitstrings.size());
-
-    // Bitstring indices for part 0 and part 1. TODO: optimize.
-    for (const auto& bitstring : bitstrings) {
-      Index index{0, 0};
-
-      for (uint64_t i = 0; i < hd.qubit_map.size(); ++i) {
-        unsigned m = ((bitstring >> i) & 1) << hd.qubit_map[i];
-        parts[i] ? index.i1 |= m : index.i0 |= m;
-      }
-
-      indices.push_back(index);
-    }
-
-    StateSpace state_space = factory.CreateStateSpace();
-
-    State* rstate0;
-    State* rstate1;
-
-    State state0p = state_space.Null();
-    State state1p = state_space.Null();
-    State state0r = state_space.Null();
-    State state1r = state_space.Null();
-    State state0s = state_space.Null();
-    State state1s = state_space.Null();
-
-    // Create states.
-
-    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, true,
-                      state0p, state1p, rstate0, rstate1)) {
-      return false;
-    }
-
-    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, rmax > 1,
-                      state0r, state1r, rstate0, rstate1)) {
-      return false;
-    }
-
-    if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, smax > 1,
-                      state0s, state1s, rstate0, rstate1)) {
-      return false;
-    }
-
-    state_space.SetStateZero(state0p);
-    state_space.SetStateZero(state1p);
-
-    Simulator simulator = factory.CreateSimulator();
-
-    std::vector<unsigned> prev(hd.num_gatexs, unsigned(-1));
-
-    // param.prefix encodes the prefix path.
-    unsigned gatex_index = SetSchmidtMatrices(
-        0, num_p_gates, param.prefix, prev, hd.gatexs);
-
-    if (gatex_index == 0) {
-      // Apply gates before the first checkpoint.
-      ApplyGates(fgates0, 0, loc0[0], simulator, state0p);
-      ApplyGates(fgates1, 0, loc1[0], simulator, state1p);
-    } else {
-      IO::errorf("invalid prefix %lu for prefix gate index %u.\n",
-                 param.prefix, gatex_index - 1);
-      return false;
-    }
-
-    // Branch over root gates on the cut. r encodes the root path.
-    for (uint64_t r = 0; r < rmax; ++r) {
-      if (rmax > 1) {
-        state_space.Copy(state0p, state0r);
-        state_space.Copy(state1p, state1r);
-      }
-
-      if (SetSchmidtMatrices(num_p_gates, num_pr_gates,
-                             r, prev, hd.gatexs) == 0) {
-        // Apply gates before the second checkpoint.
-        ApplyGates(fgates0, loc0[0], loc0[1], simulator, state0r);
-        ApplyGates(fgates1, loc1[0], loc1[1], simulator, state1r);
-      } else {
-        continue;
-      }
-
-      // Branch over suffix gates on the cut. s encodes the suffix path.
-      for (uint64_t s = 0; s < smax; ++s) {
-        if (smax > 1) {
-          state_space.Copy(rmax > 1 ? state0r : state0p, state0s);
-          state_space.Copy(rmax > 1 ? state1r : state1p, state1s);
-        }
-
-        if (SetSchmidtMatrices(num_pr_gates, hd.num_gatexs,
-                               s, prev, hd.gatexs) == 0) {
-          // Apply the rest of the gates.
-          ApplyGates(fgates0, loc0[1], fgates0.size(), simulator, state0s);
-          ApplyGates(fgates1, loc1[1], fgates1.size(), simulator, state1s);
-        } else {
-          continue;
-        }
-
-        auto f = [](unsigned n, unsigned m, uint64_t i,
-                    const StateSpace& state_space,
-                    const State& state0, const State& state1,
-                    const std::vector<Index>& indices, Results& results) {
-          // TODO: make it faster for the CUDA state space.
-          auto a0 = state_space.GetAmpl(state0, indices[i].i0);
-          auto a1 = state_space.GetAmpl(state1, indices[i].i1);
-          results[i] += a0 * a1;
-        };
-
-        // Collect results.
-        for_.Run(results.size(), f,
-                 state_space, *rstate0, *rstate1, indices, results);
-      }
-    }
-
-    return true;
-  }
-
- private:
-  /**
-   * Identifies when to save "checkpoints" of the simulation state. These allow
-   * runs with different cut-index values to reuse parts of the simulation.
-   * @param param Options for parallelism and logging. Also specifies the size
-   *   of the 'prefix' and 'root' sections of the lattice.
-   * @param fgates Set of gates for which to find checkpoint locations.
-   * @return A pair of numbers specifying how many gates to apply before the
-   *   first and second checkpoints, respectively.
-   */
-  static std::array<unsigned, 2> CheckpointLocations(
-      const Parameter& param, const std::vector<GateFused>& fgates) {
-    std::array<unsigned, 2> loc{0, 0};
-
-    unsigned num_decomposed = 0;
-    unsigned num_p_gates = param.num_prefix_gatexs;
-    unsigned num_pr_gates = num_p_gates + param.num_root_gatexs;
-
-    for (std::size_t i = 0; i < fgates.size(); ++i) {
-      for (auto gate: fgates[i].gates) {
-        if (gate->parent != nullptr) {
-          ++num_decomposed;
-          // There should be only one decomposed gate in fused gate.
-          break;
-        }
-      }
-
-      if (num_decomposed <= num_p_gates) {
-        loc[0] = i + 1;
-      }
-
-      if (num_decomposed <= num_pr_gates) {
-        loc[1] = i + 1;
-      }
-    }
-
-    return loc;
-  }
-
-  struct Bits {
-    unsigned num_p_bits;
-    unsigned num_r_bits;
-    unsigned num_s_bits;
-  };
-
-  static Bits CountSchmidtBits(
-      const Parameter& param, const std::vector<GateX>& gatexs) {
-    Bits bits{0, 0, 0};
-
-    unsigned num_p_gates = param.num_prefix_gatexs;
-    unsigned num_pr_gates = num_p_gates + param.num_root_gatexs;
-
-    for (std::size_t i = 0; i < gatexs.size(); ++i) {
-      const auto& gatex = gatexs[i];
-      if (i < num_p_gates) {
-        bits.num_p_bits += gatex.schmidt_bits;
-      } else if (i < num_pr_gates) {
-        bits.num_r_bits += gatex.schmidt_bits;
-      } else {
-        bits.num_s_bits += gatex.schmidt_bits;
-      }
-    }
-
-    return bits;
-  }
-
-  static unsigned SetSchmidtMatrices(std::size_t i0, std::size_t i1,
-                                     uint64_t path,
-                                     std::vector<unsigned>& prev_k,
-                                     std::vector<GateX>& gatexs) {
-    unsigned shift_length = 0;
-
-    for (std::size_t i = i0; i < i1; ++i) {
-      const auto& gatex = gatexs[i];
-
-      if (gatex.schmidt_bits == 0) {
-        // Continue if gatex has Schmidt rank 1.
-        continue;
-      }
-
-      unsigned k = (path >> shift_length) & ((1 << gatex.schmidt_bits) - 1);
-      shift_length += gatex.schmidt_bits;
-
-      if (k != prev_k[i]) {
-        if (k >= gatex.schmidt_decomp.size()) {
-          // Invalid path. Returns gatex index plus one to report error in case
-          // of invalid prefix.
-          return i + 1;
-        }
-
-        FillSchmidtMatrices(k, gatex);
-
-        prev_k[i] = k;
-      }
-    }
-
-    return 0;
-  }
-
-  static void FillSchmidtMatrices(unsigned k, const GateX& gatex) {
-    unsigned part0 = gatex.swapped;
-    unsigned part1 = 1 - part0;
-    {
-      gatex.decomposed0->matrix.resize(gatex.schmidt_decomp[k][part0].size());
-      auto begin = gatex.schmidt_decomp[k][part0].begin();
-      auto end = gatex.schmidt_decomp[k][part0].end();
-      std::copy(begin, end, gatex.decomposed0->matrix.begin());
-    }
-    {
-      gatex.decomposed1->matrix.resize(gatex.schmidt_decomp[k][part1].size());
-      auto begin = gatex.schmidt_decomp[k][part1].begin();
-      auto end = gatex.schmidt_decomp[k][part1].end();
-      std::copy(begin, end, gatex.decomposed1->matrix.begin());
-    }
-  }
-
-  template <typename Simulator>
-  static void ApplyGates(const std::vector<GateFused>& gates,
-                         std::size_t i0, std::size_t i1,
-                         const Simulator& simulator,
-                         typename Simulator::State& state) {
-    for (std::size_t i = i0; i < i1; ++i) {
-      if (gates[i].matrix.size() > 0) {
-        ApplyFusedGate(simulator, gates[i], state);
-      } else {
-        auto gate = gates[i];
-        CalculateFusedMatrix(gate);
-        ApplyFusedGate(simulator, gate, state);
-      }
-    }
-  }
-
-  static unsigned SchmidtBits(unsigned size) {
-    switch (size) {
-    case 1:
-      return 0;
-    case 2:
-      return 1;
-    case 3:
-      return 2;
-    case 4:
-      return 2;
-    default:
-      // Not supported.
-      return 42;
-    }
-  }
-
-  template <typename StateSpace>
-  static bool CreateStates(unsigned num_qubits0,unsigned num_qubits1,
-                           const StateSpace& state_space, bool create,
-                           typename StateSpace::State& state0,
-                           typename StateSpace::State& state1,
-                           typename StateSpace::State* (&rstate0),
-                           typename StateSpace::State* (&rstate1)) {
-    if (create) {
-      state0 = state_space.Create(num_qubits0);
-      state1 = state_space.Create(num_qubits1);
-
-      if (state_space.IsNull(state0) || state_space.IsNull(state1)) {
-        IO::errorf("not enough memory: is the number of qubits too large?\n");
-        return false;
-      }
-
-      rstate0 = &state0;
-      rstate1 = &state1;
-    }
-
-    return true;
-  }
-
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // HYBRID_H_
diff --git a/tpls/qsim/io.h b/tpls/qsim/io.h
deleted file mode 100644
index 3b26c7c..0000000
--- a/tpls/qsim/io.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef IO_H_
-#define IO_H_
-
-#include <cstdarg>
-#include <cstdio>
-
-namespace qsim {
-
-/**
- * Controller for output logs.
- */
-struct IO {
-  static void errorf(const char* format, ...) {
-    va_list args;
-    va_start(args, format);
-    vfprintf(stderr, format, args);
-    va_end(args);
-  }
-
-  static void messagef(const char* format, ...) {
-    va_list args;
-    va_start(args, format);
-    vprintf(format, args);
-    va_end(args);
-  }
-};
-
-}  // namespace qsim
-
-#endif  // IO_H_
diff --git a/tpls/qsim/io_file.h b/tpls/qsim/io_file.h
deleted file mode 100644
index 3cfac12..0000000
--- a/tpls/qsim/io_file.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef IO_FILE_H_
-#define IO_FILE_H_
-
-#include <cstdint>
-#include <fstream>
-#include <string>
-
-#include "io.h"
-
-namespace qsim {
-
-/**
- * Controller for output logs with methods for writing to file.
- */
-struct IOFile : public IO {
-  static std::ifstream StreamFromFile(const std::string& file) {
-    std::ifstream fs;
-    fs.open(file);
-    if (!fs) {
-      errorf("cannot open %s for reading.\n", file.c_str());
-    }
-    return fs;
-  }
-
-  static void CloseStream(std::ifstream& fs) {
-    fs.close();
-  }
-
-  static bool WriteToFile(
-      const std::string& file, const std::string& content) {
-    return WriteToFile(file, content.data(), content.size());
-  }
-
-  static bool WriteToFile(
-      const std::string& file, const void* data, uint64_t size) {
-    auto fs = std::fstream(file, std::ios::out | std::ios::binary);
-
-    if (!fs) {
-      errorf("cannot open %s for writing.\n", file.c_str());
-      return false;
-    } else {
-      fs.write((const char*) data, size);
-      if (!fs) {
-        errorf("cannot write to %s.\n", file.c_str());
-        return false;
-      }
-
-      fs.close();
-    }
-
-    return true;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // IO_FILE_H_
diff --git a/tpls/qsim/matrix.h b/tpls/qsim/matrix.h
deleted file mode 100644
index a3c2640..0000000
--- a/tpls/qsim/matrix.h
+++ /dev/null
@@ -1,296 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MATRIX_H_
-#define MATRIX_H_
-
-#include <algorithm>
-#include <utility>
-#include <vector>
-
-#include "bits.h"
-
-namespace qsim {
-
-/**
- * Gate matrix type. Matrices are stored as vectors. The matrix elements are
- * accessed as real(m[i][j]) <- vector[2 * (n * i + j)] and
- * imag(m[i][j]) <- vector[2 * (n * i + j) + 1], where n is the number of rows
- * or columns (n = 2^q, where q is the number of gate qubits).
- */
-template <typename fp_type>
-using Matrix = std::vector<fp_type>;
-
-/**
- * Sets all matrix elements to zero.
- * @m Matrix to be cleared.
- */
-template <typename fp_type>
-inline void MatrixClear(Matrix<fp_type>& m) {
-  for (unsigned i = 0; i < m.size(); ++i) {
-    m[i] = 0;
-  }
-}
-
-/**
- * Sets an identity matrix.
- * @n Number of matrix rows (columns).
- * @m Output identity matrix.
- */
-template <typename fp_type>
-inline void MatrixIdentity(unsigned n, Matrix<fp_type>& m) {
-  m.resize(2 * n * n);
-
-  MatrixClear(m);
-
-  for (unsigned i = 0; i < n; ++i) {
-    m[2 * (n * i + i)] = 1;
-  }
-}
-
-/**
- * Multiplies two gate matrices of equal size: m2 = m1 m2.
- * @q Number of gate qubits. The number of matrix rows (columns) is 2^q.
- * @m1 Matrix m1.
- * @m2 Input matrix m2. Output product of matrices m2 = m1 m2.
- */
-template <typename fp_type1, typename fp_type2>
-inline void MatrixMultiply(
-    unsigned q, const Matrix<fp_type1>& m1, Matrix<fp_type2>& m2) {
-  Matrix<fp_type2> mt = m2;
-  unsigned n = unsigned{1} << q;
-
-  for (unsigned i = 0; i < n; ++i) {
-    for (unsigned j = 0; j < n; ++j) {
-      fp_type2 re = 0;
-      fp_type2 im = 0;
-
-      for (unsigned k = 0; k < n; ++k) {
-        fp_type2 r1 = m1[2 * (n * i + k)];
-        fp_type2 i1 = m1[2 * (n * i + k) + 1];
-        fp_type2 r2 = mt[2 * (n * k + j)];
-        fp_type2 i2 = mt[2 * (n * k + j) + 1];
-
-        re += r1 * r2 - i1 * i2;
-        im += r1 * i2 + i1 * r2;
-      }
-
-      m2[2 * (n * i + j)] = re;
-      m2[2 * (n * i + j) + 1] = im;
-    }
-  }
-}
-
-/**
- * Multiplies two gate matrices of equal size: m2 = m1^\dagger m2.
- * @q Number of gate qubits. The number of matrix rows (columns) is 2^q.
- * @m1 Matrix m1.
- * @m2 Input matrix m2. Output product of matrices m2 = m1 m2.
- */
-template <typename fp_type1, typename fp_type2>
-inline void MatrixDaggerMultiply(
-    unsigned q, const Matrix<fp_type1>& m1, Matrix<fp_type2>& m2) {
-  Matrix<fp_type2> mt = m2;
-  unsigned n = unsigned{1} << q;
-
-  for (unsigned i = 0; i < n; ++i) {
-    for (unsigned j = 0; j < n; ++j) {
-      fp_type2 re = 0;
-      fp_type2 im = 0;
-
-      for (unsigned k = 0; k < n; ++k) {
-        fp_type2 r1 = m1[2 * (n * k + i)];
-        fp_type2 i1 = m1[2 * (n * k + i) + 1];
-        fp_type2 r2 = mt[2 * (n * k + j)];
-        fp_type2 i2 = mt[2 * (n * k + j) + 1];
-
-        re += r1 * r2 + i1 * i2;
-        im += r1 * i2 - i1 * r2;
-      }
-
-      m2[2 * (n * i + j)] = re;
-      m2[2 * (n * i + j) + 1] = im;
-    }
-  }
-}
-
-/**
- * Multiplies two gate matrices: m2 = m1 m2. The size of m1 should not exceed
- *   the size of m2.
- * @mask1 Qubit mask that specifies the subset of qubits m1 acts on.
- * @q1 Number of gate qubits. The number of matrix rows (columns) is 2^q1.
- * @m1 Matrix m1.
- * @q2 Number of gate qubits. The number of matrix rows (columns) is 2^q2.
- * @m2 Input matrix m2. Output product of matrices m2 = m1 m2.
- */
-template <typename fp_type1, typename fp_type2>
-inline void MatrixMultiply(unsigned mask1,
-                           unsigned q1, const Matrix<fp_type1>& m1,
-                           unsigned q2, Matrix<fp_type2>& m2) {
-  if (q1 == q2) {
-    MatrixMultiply(q1, m1, m2);
-  } else {
-    Matrix<fp_type2> mt = m2;
-    unsigned n1 = unsigned{1} << q1;
-    unsigned n2 = unsigned{1} << q2;
-
-    for (unsigned i = 0; i < n2; ++i) {
-      unsigned si = bits::CompressBits(i, q2, mask1);
-
-      for (unsigned j = 0; j < n2; ++j) {
-        fp_type2 re = 0;
-        fp_type2 im = 0;
-
-        for (unsigned k = 0; k < n1; ++k) {
-          unsigned ek = bits::ExpandBits(k, q2, mask1) + (i & ~mask1);
-
-          fp_type2 r1 = m1[2 * (n1 * si + k)];
-          fp_type2 i1 = m1[2 * (n1 * si + k) + 1];
-          fp_type2 r2 = mt[2 * (n2 * ek + j)];
-          fp_type2 i2 = mt[2 * (n2 * ek + j) + 1];
-
-          re += r1 * r2 - i1 * i2;
-          im += r1 * i2 + i1 * r2;
-        }
-
-        m2[2 * (n2 * i + j)] = re;
-        m2[2 * (n2 * i + j) + 1] = im;
-      }
-    }
-  }
-}
-
-/**
- * Multiply a matrix by a real scalar value.
- * @c Scalar value.
- * @m Input matrix to be multiplied. Output matrix.
- */
-template <typename fp_type1, typename fp_type2>
-inline void MatrixScalarMultiply(fp_type1 c, Matrix<fp_type2>& m) {
-  for (unsigned i = 0; i < m.size(); ++i) {
-    m[i] *= c;
-  }
-}
-
-/**
- * Multiply a matrix by a complex scalar value.
- * @re Real part of scalar value.
- * @im Imaginary part of scalar value.
- * @m Input matrix to be multiplied. Output matrix.
- */
-template <typename fp_type1, typename fp_type2>
-inline void MatrixScalarMultiply(
-    fp_type1 re, fp_type1 im, Matrix<fp_type2>& m) {
-  for (unsigned i = 0; i < m.size() / 2; ++i) {
-    fp_type2 re0 = m[2 * i + 0];
-    fp_type2 im0 = m[2 * i + 1];
-    m[2 * i + 0] = re * re0 - im * im0;
-    m[2 * i + 1] = re * im0 + im * re0;
-  }
-}
-
-/**
- * Daggers a matrix.
- * @n Number of matrix rows (columns).
- * @m Input matrix. Output matrix.
- */
-template <typename fp_type>
-inline void MatrixDagger(unsigned n, Matrix<fp_type>& m) {
-  for (unsigned i = 0; i < n; ++i) {
-    m[2 * (n * i + i) + 1] = -m[2 * (n * i + i) + 1];
-
-    for (unsigned j = i + 1; j < n; ++j) {
-      std::swap(m[2 * (n * i + j)], m[2 * (n * j + i)]);
-      fp_type t = m[2 * (n * i + j) + 1];
-      m[2 * (n * i + j) + 1] = -m[2 * (n * j + i) + 1];
-      m[2 * (n * j + i) + 1] = -t;
-    }
-  }
-}
-
-/**
- * Gets a permutation to rearrange qubits from "normal" order to "gate"
- *   order. Qubits are ordered in increasing order for "normal" order.
- *   Qubits are ordered arbitrarily for "gate" order. Returns an empty vector
- *   if the qubits are in "normal" order.
- * @qubits Qubit indices in "gate" order.
- * @return Permutation as a vector.
- */
-inline std::vector<unsigned> NormalToGateOrderPermutation(
-    const std::vector<unsigned>& qubits) {
-  std::vector<unsigned> perm;
-
-  bool normal_order = true;
-
-  for (std::size_t i = 1; i < qubits.size(); ++i) {
-    if (qubits[i] < qubits[i - 1]) {
-      normal_order = false;
-      break;
-    }
-  }
-
-  if (!normal_order) {
-    struct QI {
-      unsigned q;
-      unsigned index;
-    };
-
-    std::vector<QI> qis;
-    qis.reserve(qubits.size());
-
-    for (std::size_t i = 0; i < qubits.size(); ++i) {
-      qis.push_back({qubits[i], unsigned(i)});
-    }
-
-    std::sort(qis.begin(), qis.end(), [](const QI& l, const QI& r) {
-                                        return l.q < r.q;
-                                      });
-
-    perm.reserve(qubits.size());
-
-    for (std::size_t i = 0; i < qubits.size(); ++i) {
-      perm.push_back(qis[i].index);
-    }
-  }
-
-  return perm;
-}
-
-/**
- * Shuffles the gate matrix elements to get the matrix that acts on qubits
- *   that are in "normal" order (in increasing orger).
- * @perm Permutation to rearrange qubits from "normal" order to "gate" order.
- * @q Number of gate qubits. The number of matrix rows (columns) is 2^q.
- * @m Input matrix. Output shuffled matrix.
- */
-template <typename fp_type>
-inline void MatrixShuffle(const std::vector<unsigned>& perm,
-                          unsigned q, Matrix<fp_type>& m) {
-  Matrix<fp_type> mt = m;
-  unsigned n = unsigned{1} << q;
-
-  for (unsigned i = 0; i < n; ++i) {
-    unsigned pi = bits::PermuteBits(i, q, perm);
-    for (unsigned j = 0; j < n; ++j) {
-      unsigned pj = bits::PermuteBits(j, q, perm);
-
-      m[2 * (n * i + j)] = mt[2 * (n * pi + pj)];
-      m[2 * (n * i + j) + 1] = mt[2 * (n * pi + pj) + 1];
-    }
-  }
-}
-
-}  // namespace qsim
-
-#endif  // MATRIX_H_
diff --git a/tpls/qsim/mps_simulator.h b/tpls/qsim/mps_simulator.h
deleted file mode 100644
index 8fbcbae..0000000
--- a/tpls/qsim/mps_simulator.h
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MPS_SIMULATOR_H_
-#define MPS_SIMULATOR_H_
-
-// For templates will take care of parallelization.
-#define EIGEN_DONT_PARALLELIZE 1
-
-#include <cstdint>
-#include <cstdlib>
-#include <cstring>
-#include <memory>
-#include <vector>
-
-#include "../eigen/Eigen/Dense"
-#include "../eigen/Eigen/SVD"
-#include "mps_statespace.h"
-
-namespace qsim {
-
-namespace mps {
-
-/**
- *  Truncated Matrix Product State (MPS) circuit simulator w/ vectorization.
- */
-template <typename For, typename FP = float>
-class MPSSimulator final {
- public:
-  using MPSStateSpace_ = MPSStateSpace<For, FP>;
-  using State = typename MPSStateSpace_::MPS;
-  using fp_type = typename MPSStateSpace_::fp_type;
-
-  using Complex = std::complex<fp_type>;
-  using Matrix =
-      Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-  using ConstMatrixMap = Eigen::Map<const Matrix>;
-  using MatrixMap = Eigen::Map<Matrix>;
-
-  using OneQubitMatrix = Eigen::Matrix<Complex, 2, 2, Eigen::RowMajor>;
-  using ConstOneQubitMap = Eigen::Map<const OneQubitMatrix>;
-
-  // Note: ForArgs are currently unused.
-  template <typename... ForArgs>
-  explicit MPSSimulator(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using non-vectorized instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs, const fp_type* matrix,
-                 State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-      case 1:
-        ApplyGate1(qs, matrix, state);
-        break;
-      case 2:
-        ApplyGate2(qs, matrix, state);
-        break;
-      // case 3:
-      //   ApplyGate3(qs, matrix, state);
-      //   break;
-      // case 4:
-      //   ApplyGate4(qs, matrix, state);
-      //   break;
-      // case 5:
-      //   ApplyGate5(qs, matrix, state);
-      //   break;
-      // case 6:
-      //   ApplyGate6(qs, matrix, state);
-      //   break;
-      default:
-        // Not implemented.
-        break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using eigen3 operations w/ instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cmask Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cmask,
-                           const fp_type* matrix, State& state) const {
-    // TODO.
-  }
-
-  /**
-   * Computes the expectation value of an operator using eigen3 operations
-   * w/ vectorized instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // TODO.
-    return std::complex<double>(-10., -10.);
-  }
-
- private:
-  void ApplyGate1(const std::vector<unsigned>& qs, const fp_type* matrix,
-                  State& state) const {
-    if (qs[0] == state.num_qubits() - 1) {
-      Apply1Right(qs, matrix, state);
-    } else {
-      Apply1LeftOrInterior(qs, matrix, state);
-    }
-  }
-
-  void Apply1LeftOrInterior(const std::vector<unsigned>& qs,
-                            const fp_type* matrix, State& state) const {
-    fp_type* raw_state = state.get();
-    const auto bond_dim = state.bond_dim();
-    const auto l_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]);
-    const auto r_offset = MPSStateSpace_::GetBlockOffset(state, qs[0] + 1);
-    const auto end = MPSStateSpace_::Size(state);
-    ConstOneQubitMap gate_matrix((Complex*) matrix);
-    MatrixMap scratch_block((Complex*)(raw_state + end), 2, bond_dim);
-
-    for (unsigned block_sep = l_offset; block_sep < r_offset;
-         block_sep += 4 * bond_dim) {
-      fp_type* cur_block = raw_state + block_sep;
-      ConstMatrixMap mps_block((Complex*) cur_block, 2, bond_dim);
-      scratch_block.noalias() = gate_matrix * mps_block;
-      memcpy(cur_block, raw_state + end, sizeof(fp_type) * bond_dim * 4);
-    }
-  }
-
-  void Apply1Right(const std::vector<unsigned>& qs, const fp_type* matrix,
-                   State& state) const {
-    fp_type* raw_state = state.get();
-    const auto bond_dim = state.bond_dim();
-    const auto offset = MPSStateSpace_::GetBlockOffset(state, qs[0]);
-    const auto end = MPSStateSpace_::Size(state);
-    ConstOneQubitMap gate_matrix((Complex*) matrix);
-    ConstMatrixMap mps_block((Complex*)(raw_state + offset), bond_dim, 2);
-    MatrixMap scratch_block((Complex*)(raw_state + end), bond_dim, 2);
-    scratch_block.noalias() = mps_block * gate_matrix.transpose();
-    memcpy(raw_state + offset, raw_state + end, sizeof(fp_type) * bond_dim * 4);
-  }
-
-  void ApplyGate2(const std::vector<unsigned>& qs, const fp_type* matrix,
-                  State& state) const {
-    // TODO: micro-benchmark this function and improve performance.
-    const auto bond_dim = state.bond_dim();
-    const auto num_qubits = state.num_qubits();
-    fp_type* raw_state = state.get();
-
-    const auto i_dim = (qs[0] == 0) ? 1 : bond_dim;
-    const auto j_dim = 2;
-    const auto k_dim = bond_dim;
-    const auto l_dim = 2;
-    const auto m_dim = (qs[1] == num_qubits - 1) ? 1 : bond_dim;
-
-    const auto b_0_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]);
-    const auto b_1_offset = MPSStateSpace_::GetBlockOffset(state, qs[1]);
-    const auto end = MPSStateSpace_::Size(state);
-
-    MatrixMap block_0((Complex*)(raw_state + b_0_offset), i_dim * j_dim, k_dim);
-    MatrixMap block_1((Complex*)(raw_state + b_1_offset), k_dim, l_dim * m_dim);
-
-    // Merge both blocks into scratch space.
-    MatrixMap scratch_c((Complex*)(raw_state + end), i_dim * j_dim, l_dim * m_dim);
-    scratch_c.noalias() = block_0 * block_1;
-
-    // Transpose inner dims in-place.
-    MatrixMap scratch_c_t((Complex*)(raw_state + end), i_dim * j_dim * l_dim, m_dim);
-    for (unsigned i = 0; i < i_dim * j_dim * l_dim; i += 4) {
-      scratch_c_t.row(i + 1).swap(scratch_c_t.row(i + 2));
-    }
-
-    // Transpose gate matrix and place in 3rd (last) scratch block.
-    const auto scratch3_offset = end + 8 * bond_dim * bond_dim;
-    ConstMatrixMap gate_matrix((Complex*) matrix, 4, 4);
-    MatrixMap gate_matrix_transpose((Complex*)(raw_state + scratch3_offset), 4, 4);
-    gate_matrix_transpose = gate_matrix.transpose();
-    gate_matrix_transpose.col(1).swap(gate_matrix_transpose.col(2));
-
-    // Contract gate and merged block tensors, placing result in B0B1.
-    for (unsigned i = 0; i < i_dim; ++i) {
-      fp_type* src_block = raw_state + end + i * 8 * m_dim;
-      fp_type* dest_block = raw_state + b_0_offset + i * 8 * m_dim;
-      MatrixMap block_b0b1((Complex*) dest_block, 4, m_dim);
-      ConstMatrixMap scratch_c_i((Complex*) src_block, 4, m_dim);
-      // [i, np, m] = [np, lj] * [i, lj, m]
-      block_b0b1.noalias() = gate_matrix_transpose * scratch_c_i;
-    }
-
-    // SVD B0B1.
-    MatrixMap full_b0b1((Complex*)(raw_state + b_0_offset), 2 * i_dim, 2 * m_dim);
-    Eigen::BDCSVD<Matrix> svd(full_b0b1, Eigen::ComputeThinU | Eigen::ComputeThinV);
-    const auto p = std::min(2 * i_dim, 2 * m_dim);
-
-    // Place U in scratch to truncate and then B0.
-    MatrixMap svd_u((Complex*)(raw_state + end), 2 * i_dim, p);
-    svd_u.noalias() = svd.matrixU();
-    block_0.fill(Complex(0, 0));
-    const auto keep_cols = (svd_u.cols() > bond_dim) ? bond_dim : svd_u.cols();
-    block_0.block(0, 0, svd_u.rows(), keep_cols).noalias() =
-        svd_u(Eigen::indexing::all, Eigen::seq(0, keep_cols - 1));
-
-    // Place row product of S V into scratch to truncate and then B1.
-    MatrixMap svd_v((Complex*)(raw_state + end), p, 2 * m_dim);
-    MatrixMap s_vector((Complex*)(raw_state + end + 8 * bond_dim * bond_dim), p, 1);
-    svd_v.noalias() = svd.matrixV().adjoint();
-    s_vector.noalias() = svd.singularValues();
-    block_1.fill(Complex(0, 0));
-    const auto keep_rows = (svd_v.rows() > bond_dim) ? bond_dim : svd_v.rows();
-    const auto row_seq = Eigen::seq(0, keep_rows - 1);
-    for (unsigned i = 0; i < keep_rows; ++i) {
-      svd_v.row(i) *= s_vector(i);
-    }
-    block_1.block(0, 0, keep_rows, svd_v.cols()).noalias() =
-        svd_v(row_seq, Eigen::indexing::all);
-  }
-
-  For for_;
-};
-
-}  // namespace mps
-}  // namespace qsim
-
-#endif  // MPS_SIMULATOR_H_
diff --git a/tpls/qsim/mps_statespace.h b/tpls/qsim/mps_statespace.h
deleted file mode 100644
index 9b3acf3..0000000
--- a/tpls/qsim/mps_statespace.h
+++ /dev/null
@@ -1,597 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef MPS_STATESPACE_H_
-#define MPS_STATESPACE_H_
-
-// For templates will take care of parallelization.
-#define EIGEN_DONT_PARALLELIZE 1
-
-#ifdef _WIN32
-#include <malloc.h>
-#endif
-
-#include <cstdint>
-#include <cstdlib>
-#include <cstring>
-#include <memory>
-#include <random>
-
-#include "../eigen/Eigen/Dense"
-#include "../eigen/unsupported/Eigen/CXX11/Tensor"
-
-namespace qsim {
-
-namespace mps {
-
-namespace detail {
-
-inline void do_not_free(void*) {}
-
-inline void free(void* ptr) {
-#ifdef _WIN32
-  _aligned_free(ptr);
-#else
-  ::free(ptr);
-#endif
-}
-
-}  // namespace detail
-
-/**
- * Class containing context and routines for fixed bond dimension
- * truncated Matrix Product State (MPS) simulation.
- */
-template <typename For, typename FP = float>
-class MPSStateSpace {
- private:
- public:
-  using fp_type = FP;
-  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
-
-  using Complex = std::complex<fp_type>;
-  using Matrix =
-      Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-  using ConstMatrixMap = Eigen::Map<const Matrix>;
-  using MatrixMap = Eigen::Map<Matrix>;
-
-  // Store MPS tensors with the following shape:
-  // [2, bond_dim], [bond_dim, 2, bond_dim], ... , [bond_dim, 2].
-  class MPS {
-   public:
-    MPS() = delete;
-
-    MPS(Pointer&& ptr, unsigned num_qubits, unsigned bond_dim)
-        : ptr_(std::move(ptr)), num_qubits_(num_qubits), bond_dim_(bond_dim) {}
-
-    fp_type* get() { return ptr_.get(); }
-
-    const fp_type* get() const { return ptr_.get(); }
-
-    fp_type* release() {
-      num_qubits_ = 0;
-      return ptr_.release();
-    }
-
-    unsigned num_qubits() const { return num_qubits_; }
-
-    unsigned bond_dim() const { return bond_dim_; }
-
-   private:
-    Pointer ptr_;
-    unsigned num_qubits_;
-    unsigned bond_dim_;
-  };
-
-  // Note: ForArgs are currently unused.
-  template <typename... ForArgs>
-  MPSStateSpace(ForArgs&&... args) : for_(args...) {}
-
-  // Requires num_qubits >= 2 and bond_dim >= 2.
-  static MPS Create(unsigned num_qubits, unsigned bond_dim) {
-    auto end_sizes = 2 * 4 * bond_dim;
-    auto internal_sizes = 4 * bond_dim * bond_dim * (num_qubits + 1);
-    // Use three extra "internal style" blocks past the end of the
-    //   working allocation for scratch space. Needed for gate
-    //   application.
-    auto size = sizeof(fp_type) * (end_sizes + internal_sizes);
-
-#ifdef _WIN32
-    Pointer ptr{(fp_type*)_aligned_malloc(size, 64), &detail::free};
-    bool is_null = ptr.get() != nullptr;
-    return MPS{std::move(ptr), is_null ? num_qubits : 0,
-               is_null ? bond_dim : 0};
-#else
-    void* p = nullptr;
-    if (posix_memalign(&p, 64, size) == 0) {
-      return MPS{Pointer{(fp_type*)p, &detail::free}, num_qubits, bond_dim};
-    } else {
-      return MPS{Pointer{nullptr, &detail::free}, 0, 0};
-    }
-#endif
-  }
-
-  static unsigned Size(const MPS& state) {
-    auto end_sizes = 2 * 4 * state.bond_dim();
-    auto internal_sizes = 4 * state.bond_dim() * state.bond_dim();
-    return end_sizes + internal_sizes * (state.num_qubits() - 2);
-  }
-
-  static unsigned RawSize(const MPS& state) {
-    return sizeof(fp_type) * Size(state);
-  }
-
-  // Get the pointer offset to the beginning of an MPS block.
-  static unsigned GetBlockOffset(const MPS& state, unsigned i) {
-    if (i == 0) {
-      return 0;
-    }
-    return 4 * state.bond_dim() * (1 + state.bond_dim() * (i - 1));
-  }
-
-  // Copies the state contents of one MPS to another.
-  // Ignores scratch data.
-  static bool Copy(const MPS& src, MPS& dest) {
-    if ((src.num_qubits() != dest.num_qubits()) ||
-        src.bond_dim() != dest.bond_dim()) {
-      return false;
-    }
-    auto size = RawSize(src);
-    memcpy(dest.get(), src.get(), size);
-    return true;
-  }
-
-  // Set the MPS to the |0> state.
-  static void SetStateZero(MPS& state) {
-    auto size = Size(state);
-    memset(state.get(), 0, sizeof(fp_type) * size);
-    auto block_size = 4 * state.bond_dim() * state.bond_dim();
-    state.get()[0] = 1.0;
-    for (unsigned i = 4 * state.bond_dim(); i < size; i += block_size) {
-      state.get()[i] = 1.0;
-    }
-  }
-
-  // Computes Re{<state1 | state2 >} for two equal sized MPS.
-  // Requires: state1.bond_dim() == state2.bond_dim() &&
-  //           state1.num_qubits() == state2.num_qubits()
-  static fp_type RealInnerProduct(MPS& state1, MPS& state2) {
-    return InnerProduct(state1, state2).real();
-  }
-
-  // Computes <state1 | state2 > for two equal sized MPS.
-  // Requires: state1.bond_dim() == state2.bond_dim() &&
-  //           state1.num_qubits() == state2.num_qubits()
-  static std::complex<fp_type> InnerProduct(MPS& state1, MPS& state2) {
-    const auto num_qubits = state1.num_qubits();
-    const auto bond_dim = state1.bond_dim();
-    const auto end = Size(state1);
-    auto offset = 0;
-    fp_type* state1_raw = state1.get();
-    fp_type* state2_raw = state2.get();
-
-    // Contract leftmost blocks together, store result in state1 scratch.
-    ConstMatrixMap top((Complex*)state2_raw, 2, bond_dim);
-    ConstMatrixMap bot((Complex*)state1_raw, 2, bond_dim);
-    MatrixMap partial_contract((Complex*)(state1_raw + end), bond_dim,
-                               bond_dim);
-    MatrixMap partial_contract2(
-        (Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), bond_dim,
-        2 * bond_dim);
-    partial_contract.noalias() = top.adjoint() * bot;
-
-    // Contract all internal blocks together.
-    for (unsigned i = 1; i < num_qubits - 1; ++i) {
-      offset = GetBlockOffset(state1, i);
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim),
-                    bond_dim, 2 * bond_dim);
-
-      // Merge bot into left boundary merged tensor.
-      new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim,
-                                2 * bond_dim);
-      partial_contract2.noalias() = partial_contract * bot;
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim),
-                    2 * bond_dim, bond_dim);
-
-      // Merge top into partial_contract2.
-      new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim,
-                                bond_dim);
-      partial_contract.noalias() = top.adjoint() * partial_contract2;
-    }
-
-    // Contract rightmost bottom block.
-    offset = GetBlockOffset(state1, num_qubits - 1);
-    new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, 2);
-    new (&partial_contract2) MatrixMap(
-        (Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), bond_dim, 2);
-    partial_contract2.noalias() = partial_contract * bot;
-
-    // Contract rightmost top block.
-    new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, 1);
-    new (&partial_contract) MatrixMap((Complex*)(state1_raw + end), 1, 1);
-    new (&partial_contract2)
-        MatrixMap((Complex*)(state1_raw + end + 4 * bond_dim * bond_dim),
-                  2 * bond_dim, 1);
-    partial_contract.noalias() = top.adjoint() * partial_contract2;
-
-    return partial_contract(0, 0);
-  }
-
-  // Compute the 2x2 1-RDM of state on index. Result written to rdm.
-  // Requires: scratch and rdm to be allocated.
-  static void ReduceDensityMatrix(MPS& state, MPS& scratch, int index,
-                                  fp_type* rdm) {
-    const auto num_qubits = state.num_qubits();
-    const auto bond_dim = state.bond_dim();
-    const auto end = Size(state);
-    const bool last_index = (index == num_qubits - 1);
-    const auto right_dim = (last_index ? 1 : bond_dim);
-    auto offset = 0;
-    fp_type* state_raw = state.get();
-    fp_type* scratch_raw = scratch.get();
-    fp_type* state_raw_workspace = state_raw + end + 2 * bond_dim * bond_dim;
-    fp_type* scratch_raw_workspace =
-        scratch_raw + end + 2 * bond_dim * bond_dim;
-
-    Copy(state, scratch);
-
-    // Contract leftmost blocks together, store result in state scratch.
-    ConstMatrixMap top((Complex*)scratch_raw, 2, bond_dim);
-    ConstMatrixMap bot((Complex*)state_raw, 2, bond_dim);
-    MatrixMap partial_contract((Complex*)(state_raw + end), bond_dim, bond_dim);
-    MatrixMap partial_contract2((Complex*)(state_raw_workspace), bond_dim,
-                                2 * bond_dim);
-
-    partial_contract.setZero();
-    partial_contract(0, 0) = 1;
-    if (index > 0) {
-      partial_contract.noalias() = top.adjoint() * bot;
-    }
-
-    // Contract all internal blocks together.
-    for (unsigned i = 1; i < index; ++i) {
-      offset = GetBlockOffset(state, i);
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state_raw_workspace), bond_dim, 2 * bond_dim);
-
-      // Merge bot into left boundary merged tensor.
-      new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
-                                2 * bond_dim);
-      partial_contract2.noalias() = partial_contract * bot;
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state_raw_workspace), 2 * bond_dim, bond_dim);
-
-      // Merge top into partial_contract2.
-      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
-                                bond_dim);
-      partial_contract.noalias() = top.adjoint() * partial_contract2;
-    }
-
-    // The [bond_dim, bond_dim] block in state_raw now contains the contraction
-    // up to, but not including index.
-    // Contract rightmost blocks.
-    offset = GetBlockOffset(state, num_qubits - 1);
-    new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, 2);
-    new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2);
-    new (&partial_contract)
-        MatrixMap((Complex*)(scratch_raw + end), bond_dim, bond_dim);
-    new (&partial_contract2)
-        MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim);
-
-    partial_contract.setZero();
-    partial_contract(0, 0) = 1;
-    if (index < num_qubits - 1) {
-      partial_contract.noalias() = top * bot.adjoint();
-    }
-
-    for (unsigned i = num_qubits - 2; i > index; --i) {
-      offset = GetBlockOffset(state, i);
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(scratch_raw_workspace), 2 * bond_dim, bond_dim);
-
-      // Merge bot into left boundary merged tensor.
-      new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), 2 * bond_dim,
-                                bond_dim);
-      partial_contract2.noalias() = bot * partial_contract.adjoint();
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim);
-
-      // Merge top into partial_contract2.
-      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim,
-                                2 * bond_dim);
-      // [bd, bd] = [bd, 2bd] @ [bd, 2bd]
-      partial_contract.noalias() = top * partial_contract2.adjoint();
-    }
-
-    // The [bond_dim, bond_dim] block in scratch_raw now contains the
-    // contraction down from the end, but not including the index. Begin final
-    // contraction steps.
-
-    // Get leftmost [bd, bd] contraction and contract with top.
-
-    offset = GetBlockOffset(state, index);
-    new (&partial_contract)
-        MatrixMap((Complex*)(state_raw + end), bond_dim, bond_dim);
-    new (&top)
-        ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2 * right_dim);
-    new (&partial_contract2)
-        MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * right_dim);
-    partial_contract2.noalias() = partial_contract * top.conjugate();
-    // copy the bottom contraction scratch_raw to state_raw to save space.
-    memcpy(state_raw + end, scratch_raw + end,
-           bond_dim * bond_dim * 2 * sizeof(fp_type));
-
-    // Contract top again for correct shape.
-    fp_type* contract3_target = (last_index ? rdm : scratch_raw);
-    MatrixMap partial_contract3((Complex*)contract3_target, 2 * right_dim,
-                                2 * right_dim);
-    partial_contract3.noalias() = top.transpose() * partial_contract2;
-
-    // If we are contracting the last index, all the needed transforms are done.
-    if (last_index) {
-      return;
-    }
-
-    // Conduct final tensor contraction operations. Cannot be easily compiled to
-    // matmul.
-    const Eigen::TensorMap<const Eigen::Tensor<Complex, 4, Eigen::RowMajor>>
-        t_4d((Complex*)scratch_raw, 2, bond_dim, 2, bond_dim);
-    const Eigen::TensorMap<const Eigen::Tensor<Complex, 2, Eigen::RowMajor>>
-        t_2d((Complex*)(state_raw + end), bond_dim, bond_dim);
-
-    const Eigen::array<Eigen::IndexPair<int>, 2> product_dims = {
-        Eigen::IndexPair<int>(1, 0),
-        Eigen::IndexPair<int>(3, 1),
-    };
-    Eigen::TensorMap<Eigen::Tensor<Complex, 2, Eigen::RowMajor>> out(
-        (Complex*)rdm, 2, 2);
-    out = t_4d.contract(t_2d, product_dims);
-  }
-
-  // Draw a single bitstring sample from state using scratch and scratch2
-  // as working space.
-  static void SampleOnce(MPS& state, MPS& scratch, MPS& scratch2,
-                         std::mt19937* random_gen, std::vector<bool>* sample) {
-    // TODO: carefully profile with perf and optimize temp storage
-    //  locations for cache friendliness.
-    const auto bond_dim = state.bond_dim();
-    const auto num_qubits = state.num_qubits();
-    const auto end = Size(state);
-    const auto left_frontier_offset = GetBlockOffset(state, num_qubits + 1);
-    std::default_random_engine generator;
-    fp_type* state_raw = state.get();
-    fp_type* scratch_raw = scratch.get();
-    fp_type* scratch2_raw = scratch2.get();
-    fp_type rdm[8];
-
-    sample->reserve(num_qubits);
-    Copy(state, scratch);
-    Copy(state, scratch2);
-
-    // Store prefix contractions in scratch2.
-    auto offset = GetBlockOffset(state, num_qubits - 1);
-    ConstMatrixMap top((Complex*)(state_raw + offset), bond_dim, 2);
-    ConstMatrixMap bot((Complex*)(scratch_raw + offset), bond_dim, 2);
-    MatrixMap partial_contract((Complex*)(scratch2_raw + offset), bond_dim,
-                               bond_dim);
-    MatrixMap partial_contract2((Complex*)(scratch_raw + end), bond_dim,
-                                2 * bond_dim);
-    partial_contract.noalias() = top * bot.adjoint();
-
-    for (unsigned i = num_qubits - 2; i > 0; --i) {
-      offset = GetBlockOffset(state, i);
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(scratch_raw + end), 2 * bond_dim, bond_dim);
-
-      // Merge bot into left boundary merged tensor.
-      new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
-                                bond_dim);
-      partial_contract2.noalias() = bot * partial_contract.adjoint();
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(scratch_raw + end), bond_dim, 2 * bond_dim);
-
-      // Merge top into partial_contract2.
-      new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
-                                2 * bond_dim);
-
-      // merge into partial_contract -> scracth2_raw.
-      new (&partial_contract)
-          MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
-      partial_contract.noalias() = top * partial_contract2.adjoint();
-    }
-
-    // Compute RDM-0 and draw first sample.
-    offset = GetBlockOffset(state, 1);
-    new (&top) ConstMatrixMap((Complex*)state_raw, 2, bond_dim);
-    new (&bot) ConstMatrixMap((Complex*)scratch_raw, 2, bond_dim);
-    new (&partial_contract)
-        MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
-    new (&partial_contract2)
-        MatrixMap((Complex*)(scratch_raw + end), 2, bond_dim);
-
-    partial_contract2.noalias() = bot * partial_contract.adjoint();
-
-    new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2);
-    partial_contract.noalias() = top * partial_contract2.adjoint();
-    auto p0 = rdm[0] / (rdm[0] + rdm[6]);
-    std::bernoulli_distribution distribution(1 - p0);
-    auto bit_val = distribution(*random_gen);
-    sample->push_back(bit_val);
-
-    // collapse state.
-    new (&partial_contract) MatrixMap((Complex*)scratch_raw, 2, bond_dim);
-    partial_contract.row(!bit_val).setZero();
-
-    // Prepare left contraction frontier.
-    new (&partial_contract2) MatrixMap(
-        (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
-    partial_contract2.noalias() =
-        partial_contract.transpose() * partial_contract.conjugate();
-
-    // Compute RDM-i and draw internal tensor samples.
-    for (unsigned i = 1; i < num_qubits - 1; i++) {
-      // Get leftmost [bd, bd] contraction and contract with top.
-      offset = GetBlockOffset(state, i);
-      new (&partial_contract) MatrixMap(
-          (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
-      new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim,
-                                2 * bond_dim);
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim);
-      partial_contract2.noalias() = partial_contract * top.conjugate();
-
-      // Contract top again for correct shape.
-      MatrixMap partial_contract3((Complex*)(scratch_raw + end), 2 * bond_dim,
-                                  2 * bond_dim);
-      partial_contract3.noalias() = top.transpose() * partial_contract2;
-
-      // Conduct final tensor contraction operations. Cannot be easily compiled
-      // to matmul. Perf reports shows only ~6% of runtime spent here on large
-      // systems.
-      offset = GetBlockOffset(state, i + 1);
-      const Eigen::TensorMap<const Eigen::Tensor<Complex, 4, Eigen::RowMajor>>
-          t_4d((Complex*)(scratch_raw + end), 2, bond_dim, 2, bond_dim);
-      const Eigen::TensorMap<const Eigen::Tensor<Complex, 2, Eigen::RowMajor>>
-          t_2d((Complex*)(scratch2_raw + offset), bond_dim, bond_dim);
-
-      const Eigen::array<Eigen::IndexPair<int>, 2> product_dims = {
-          Eigen::IndexPair<int>(1, 0),
-          Eigen::IndexPair<int>(3, 1),
-      };
-      Eigen::TensorMap<Eigen::Tensor<Complex, 2, Eigen::RowMajor>> out(
-          (Complex*)rdm, 2, 2);
-      out = t_4d.contract(t_2d, product_dims);
-
-      // Sample bit and collapse state.
-      p0 = rdm[0] / (rdm[0] + rdm[6]);
-      distribution = std::bernoulli_distribution(1 - p0);
-      bit_val = distribution(*random_gen);
-
-      sample->push_back(bit_val);
-      offset = GetBlockOffset(state, i);
-      new (&partial_contract)
-          MatrixMap((Complex*)(scratch_raw + offset), bond_dim * 2, bond_dim);
-      for (unsigned j = !bit_val; j < 2 * bond_dim; j += 2) {
-        partial_contract.row(j).setZero();
-      }
-
-      // Update left frontier.
-      new (&partial_contract) MatrixMap(
-          (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim);
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim);
-
-      // Merge bot into left boundary merged tensor.
-      new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim,
-                                2 * bond_dim);
-      partial_contract2.noalias() = partial_contract * bot.conjugate();
-
-      // reshape:
-      new (&partial_contract2)
-          MatrixMap((Complex*)(state_raw + end), 2 * bond_dim, bond_dim);
-
-      // Merge top into partial_contract2.
-      new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim,
-                                bond_dim);
-      partial_contract.noalias() = top.transpose() * partial_contract2;
-    }
-
-    // Compute RDM-(n-1) and sample.
-    offset = GetBlockOffset(state, num_qubits - 1);
-    new (&partial_contract2)
-        MatrixMap((Complex*)(state_raw + end), bond_dim, 2);
-
-    new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2);
-    partial_contract2.noalias() = partial_contract * top.conjugate();
-    new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2);
-    partial_contract.noalias() = top.transpose() * partial_contract2;
-
-    p0 = rdm[0] / (rdm[0] + rdm[6]);
-    distribution = std::bernoulli_distribution(1 - p0);
-    bit_val = distribution(*random_gen);
-    sample->push_back(bit_val);
-  }
-
-  // Draw num_samples bitstring samples from state and store the result
-  // bit vectors in results. Uses scratch and scratch2 as workspace.
-  static void Sample(MPS& state, MPS& scratch, MPS& scratch2,
-                     unsigned num_samples, unsigned seed,
-                     std::vector<std::vector<bool>>* results) {
-    std::mt19937 rand_source(seed);
-    results->reserve(num_samples);
-    for (unsigned i = 0; i < num_samples; i++) {
-      SampleOnce(state, scratch, scratch2, &rand_source, &(*results)[i]);
-    }
-  }
-
-  // Testing only. Convert the MPS to a wavefunction under "normal" ordering.
-  // Requires: wf be allocated beforehand with bond_dim * 2 ^ num_qubits -1
-  // memory.
-  static void ToWaveFunction(MPS& state, fp_type* wf) {
-    const auto bond_dim = state.bond_dim();
-    const auto num_qubits = state.num_qubits();
-    fp_type* raw_state = state.get();
-
-    ConstMatrixMap accum = ConstMatrixMap((Complex*)(raw_state), 2, bond_dim);
-    ConstMatrixMap next_block = ConstMatrixMap(nullptr, 0, 0);
-    MatrixMap result2 = MatrixMap(nullptr, 0, 0);
-    auto offset = 0;
-    auto result2_size = 2;
-
-    for (unsigned i = 1; i < num_qubits - 1; i++) {
-      offset = GetBlockOffset(state, i);
-      // use of new does not trigger any expensive operations.
-      new (&next_block) ConstMatrixMap((Complex*)(raw_state + offset), bond_dim,
-                                       2 * bond_dim);
-      new (&result2) MatrixMap((Complex*)(wf), result2_size, 2 * bond_dim);
-
-      // temp variable used since result2 and accum point to same memory.
-      result2 = accum * next_block;
-      result2_size *= 2;
-      new (&accum) ConstMatrixMap((Complex*)(wf), result2_size, bond_dim);
-    }
-    offset = GetBlockOffset(state, num_qubits - 1);
-    new (&next_block)
-        ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, 2);
-    new (&result2) MatrixMap((Complex*)(wf), result2_size, 2);
-    result2 = accum * next_block;
-  }
-
- protected:
-  For for_;
-};
-
-}  // namespace mps
-}  // namespace qsim
-
-#endif  // MPS_STATESPACE_H_
diff --git a/tpls/qsim/parfor.h b/tpls/qsim/parfor.h
deleted file mode 100644
index 8a3a4d6..0000000
--- a/tpls/qsim/parfor.h
+++ /dev/null
@@ -1,123 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef PARFOR_H_
-#define PARFOR_H_
-
-#include <omp.h>
-
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-namespace qsim {
-
-/**
- * Helper struct for executing for-loops in parallel across multiple threads.
- */
-template <uint64_t MIN_SIZE>
-struct ParallelForT {
-  explicit ParallelForT(unsigned num_threads) : num_threads(num_threads) {}
-
-  // GetIndex0 and GetIndex1 are useful when we need to know how work was
-  // divided between threads, for instance, for reusing partial sums obtained
-  // by RunReduceP.
-  uint64_t GetIndex0(uint64_t size, unsigned thread_id) const {
-    return size >= MIN_SIZE ? size * thread_id / num_threads : 0;
-  }
-
-  uint64_t GetIndex1(uint64_t size, unsigned thread_id) const {
-    return size >= MIN_SIZE ? size * (thread_id + 1) / num_threads : size;
-  }
-
-  template <typename Function, typename... Args>
-  void Run(uint64_t size, Function&& func, Args&&... args) const {
-    if (num_threads > 1 && size >= MIN_SIZE) {
-      #pragma omp parallel num_threads(num_threads)
-      {
-        unsigned n = omp_get_num_threads();
-        unsigned m = omp_get_thread_num();
-
-        uint64_t i0 = GetIndex0(size, m);
-        uint64_t i1 = GetIndex1(size, m);
-
-        for (uint64_t i = i0; i < i1; ++i) {
-          func(n, m, i, args...);
-        }
-      }
-    } else {
-      for (uint64_t i = 0; i < size; ++i) {
-        func(1, 0, i, args...);
-      }
-    }
-  }
-
-  template <typename Function, typename Op, typename... Args>
-  std::vector<typename Op::result_type> RunReduceP(
-      uint64_t size, Function&& func, Op&& op, Args&&... args) const {
-    std::vector<typename Op::result_type> partial_results;
-
-    if (num_threads > 1 && size >= MIN_SIZE) {
-      partial_results.resize(num_threads, 0);
-
-      #pragma omp parallel num_threads(num_threads)
-      {
-        unsigned n = omp_get_num_threads();
-        unsigned m = omp_get_thread_num();
-
-        uint64_t i0 = GetIndex0(size, m);
-        uint64_t i1 = GetIndex1(size, m);
-
-        typename Op::result_type partial_result = 0;
-
-        for (uint64_t i = i0; i < i1; ++i) {
-          partial_result = op(partial_result, func(n, m, i, args...));
-        }
-
-        partial_results[m] = partial_result;
-      }
-    } else if (num_threads > 0) {
-      typename Op::result_type result = 0;
-      for (uint64_t i = 0; i < size; ++i) {
-        result = op(result, func(1, 0, i, args...));
-      }
-
-      partial_results.resize(1, result);
-    }
-
-    return partial_results;
-  }
-
-  template <typename Function, typename Op, typename... Args>
-  typename Op::result_type RunReduce(uint64_t size, Function&& func,
-                                     Op&& op, Args&&... args) const {
-    auto partial_results = RunReduceP(size, func, std::move(op), args...);
-
-    typename Op::result_type result = 0;
-
-    for (auto partial_result : partial_results) {
-      result = op(result, partial_result);
-    }
-
-    return result;
-  }
-
-  unsigned num_threads;
-};
-
-using ParallelFor = ParallelForT<1024>;
-
-}  // namespace qsim
-
-#endif  // PARFOR_H_
diff --git a/tpls/qsim/qtrajectory.h b/tpls/qsim/qtrajectory.h
deleted file mode 100644
index 1da6692..0000000
--- a/tpls/qsim/qtrajectory.h
+++ /dev/null
@@ -1,435 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef QTRAJECTORY_H_
-#define QTRAJECTORY_H_
-
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <random>
-#include <vector>
-
-#include "circuit_noisy.h"
-#include "gate.h"
-#include "gate_appl.h"
-
-namespace qsim {
-
-/**
- * Quantum trajectory simulator.
- */
-template <typename IO, typename Gate,
-          template <typename, typename> class FuserT, typename Simulator,
-          typename RGen = std::mt19937>
-class QuantumTrajectorySimulator {
- public:
-  using Fuser = FuserT<IO, const Gate*>;
-  using StateSpace = typename Simulator::StateSpace;
-  using State = typename Simulator::State;
-  using MeasurementResult = typename StateSpace::MeasurementResult;
-
-  /**
-   * User-specified parameters for the simulator.
-   */
-  struct Parameter : public Fuser::Parameter {
-    /**
-     * If true, collect statistics of sampled Kraus operator indices.
-     */
-    bool collect_kop_stat = false;
-    /**
-     * If true, collect statistics of measured bitstrings.
-     */
-    bool collect_mea_stat = false;
-    /**
-     * If true, normalize the state vector before performing measurements.
-     */
-    bool normalize_before_mea_gates = true;
-    /**
-     * If false, do not apply deferred operators after the main loop for
-     * the "primary" noise trajectory, that is the trajectory in which
-     * the primary (the first operators in their respective channels) Kraus
-     * operators are sampled for each channel and there are no measurements
-     * in the computational basis. This can be used to speed up simulations
-     * of circuits with weak noise and without measurements by reusing
-     * the primary trajectory results. There is an additional condition for
-     * RunBatch. In this case, the deferred operators after the main loop are
-     * still applied for the first occurence of the primary trajectory.
-     * The primary Kraus operators should have the highest sampling
-     * probabilities to achieve the highest speedup.
-     *
-     * It is the client's responsibility to collect the primary trajectory
-     * results and to reuse them.
-     */
-    bool apply_last_deferred_ops = true;
-  };
-
-  /**
-   * Struct with statistics to populate by RunBatch and RunOnce methods.
-   */
-  struct Stat {
-    /**
-     * Indices of sampled Kraus operator indices and/or measured bitstrings.
-     */
-    std::vector<uint64_t> samples;
-    /**
-     * True if the "primary" noise trajectory is sampled, false otherwise.
-     */
-    bool primary;
-  };
-
-  /**
-   * Runs the given noisy circuit performing repetitions. Each repetition is
-   * seeded by repetition ID.
-   * @param param Options for the quantum trajectory simulator.
-   * @param circuit The noisy circuit to be simulated.
-   * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions.
-   * @param state_space StateSpace object required to manipulate state vector.
-   * @param simulator Simulator object. Provides specific implementations for
-   *   applying gates.
-   * @param measure Function that performs measurements (in the sense of
-   *   computing expectation values, etc). This function should have three
-   *   required parameters [repetition ID (uint64_t), final state vector
-   *   (const State&), statistics of sampled Kraus operator indices and/or
-   *   measured bitstrings (const Stat&)] and any number of optional parameters.
-   * @param args Optional arguments for the 'measure' function.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename MeasurementFunc, typename... Args>
-  static bool RunBatch(const Parameter& param,
-                       const NoisyCircuit<Gate>& circuit,
-                       uint64_t r0, uint64_t r1, const StateSpace& state_space,
-                       const Simulator& simulator, MeasurementFunc&& measure,
-                       Args&&... args) {
-    return RunBatch(param, circuit.num_qubits, circuit.channels.begin(),
-                    circuit.channels.end(), r0, r1, state_space, simulator,
-                    measure, args...);
-  }
-
-  /**
-   * Runs the given noisy circuit performing repetitions. Each repetition is
-   * seeded by repetition ID.
-   * @param param Options for the quantum trajectory simulator.
-   * @param num_qubits The number of qubits acted on by the circuit.
-   * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit.
-   * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions.
-   * @param state_space StateSpace object required to manipulate state vector.
-   * @param simulator Simulator object. Provides specific implementations for
-   *   applying gates.
-   * @param measure Function that performs measurements (in the sense of
-   *   computing expectation values, etc). This function should have three
-   *   required parameters [repetition ID (uint64_t), final state vector
-   *   (const State&), statistics of sampled Kraus operator indices and/or
-   *   measured bitstrings (const Stat&)] and any number of optional parameters.
-   * @param args Optional arguments for the 'measure' function.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename MeasurementFunc, typename... Args>
-  static bool RunBatch(const Parameter& param, unsigned num_qubits,
-                       ncircuit_iterator<Gate> cbeg,
-                       ncircuit_iterator<Gate> cend,
-                       uint64_t r0, uint64_t r1, const StateSpace& state_space,
-                       const Simulator& simulator, MeasurementFunc&& measure,
-                       Args&&... args) {
-    std::vector<const Gate*> gates;
-    gates.reserve(4 * std::size_t(cend - cbeg));
-
-    State state = state_space.Null();
-
-    Stat stat;
-    bool had_primary_realization = false;
-
-    for (uint64_t r = r0; r < r1; ++r) {
-      if (!state_space.IsNull(state)) {
-        state_space.SetStateZero(state);
-      }
-
-      bool apply_last_deferred_ops =
-          param.apply_last_deferred_ops || !had_primary_realization;
-
-      if (!RunIteration(param, apply_last_deferred_ops, num_qubits, cbeg, cend,
-                        r, state_space, simulator, gates, state, stat)) {
-        return false;
-      }
-
-      if (stat.primary && !had_primary_realization) {
-        had_primary_realization = true;
-      }
-
-      measure(r, state, stat, args...);
-    }
-
-    return true;
-  }
-
-  /**
-   * Runs the given noisy circuit one time.
-   * @param param Options for the quantum trajectory simulator.
-   * @param circuit The noisy circuit to be simulated.
-   * @param r The repetition ID. The random number generator is seeded by 'r'.
-   * @param state_space StateSpace object required to manipulate state vector.
-   * @param simulator Simulator object. Provides specific implementations for
-   *   applying gates.
-   * @param state The state of the system, to be updated by this method.
-   * @param stat Statistics of sampled Kraus operator indices and/or measured
-   *   bitstrings, to be populated by this method.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  static bool RunOnce(const Parameter& param,
-                      const NoisyCircuit<Gate>& circuit, uint64_t r,
-                      const StateSpace& state_space, const Simulator& simulator,
-                      State& state, Stat& stat) {
-    return RunOnce(param, circuit.num_qubits, circuit.channels.begin(),
-                   circuit.channels.end(), r, state_space, simulator,
-                   state, stat);
-  }
-
-  /**
-   * Runs the given noisy circuit one time.
-   * @param param Options for the quantum trajectory simulator.
-   * @param num_qubits The number of qubits acted on by the circuit.
-   * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit.
-   * @param circuit The noisy circuit to be simulated.
-   * @param r The repetition ID. The random number generator is seeded by 'r'.
-   * @param state_space StateSpace object required to manipulate state vector.
-   * @param simulator Simulator object. Provides specific implementations for
-   *   applying gates.
-   * @param state The state of the system, to be updated by this method.
-   * @param stat Statistics of sampled Kraus operator indices and/or measured
-   *   bitstrings, to be populated by this method.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  static bool RunOnce(const Parameter& param, unsigned num_qubits,
-                      ncircuit_iterator<Gate> cbeg,
-                      ncircuit_iterator<Gate> cend,
-                      uint64_t r, const StateSpace& state_space,
-                      const Simulator& simulator, State& state, Stat& stat) {
-    std::vector<const Gate*> gates;
-    gates.reserve(4 * std::size_t(cend - cbeg));
-
-    if (!RunIteration(param, param.apply_last_deferred_ops, num_qubits, cbeg,
-                      cend, r, state_space, simulator, gates, state, stat)) {
-      return false;
-    }
-
-    return true;
-  }
-
- private:
-  static bool RunIteration(const Parameter& param,
-                           bool apply_last_deferred_ops, unsigned num_qubits,
-                           ncircuit_iterator<Gate> cbeg,
-                           ncircuit_iterator<Gate> cend,
-                           uint64_t rep, const StateSpace& state_space,
-                           const Simulator& simulator,
-                           std::vector<const Gate*>& gates,
-                           State& state, Stat& stat) {
-    if (param.collect_kop_stat || param.collect_mea_stat) {
-      stat.samples.reserve(std::size_t(cend - cbeg));
-      stat.samples.resize(0);
-    }
-
-    if (state_space.IsNull(state)) {
-      state = CreateState(num_qubits, state_space);
-      if (state_space.IsNull(state)) {
-        return false;
-      }
-
-      state_space.SetStateZero(state);
-    }
-
-    gates.resize(0);
-
-    RGen rgen(rep);
-    std::uniform_real_distribution<double> distr(0.0, 1.0);
-
-    bool unitary = true;
-    stat.primary = true;
-
-    for (auto it = cbeg; it != cend; ++it) {
-      const auto& channel = *it;
-
-      if (channel.size() == 0) continue;
-
-      if (channel[0].kind == gate::kMeasurement) {
-        // Measurement channel.
-
-        if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
-          return false;
-        }
-
-        bool normalize = !unitary && param.normalize_before_mea_gates;
-        NormalizeState(normalize, state_space, unitary, state);
-
-        auto mresult = ApplyMeasurementGate(state_space, channel[0].ops[0],
-                                            rgen, state);
-
-        if (!mresult.valid) {
-          return false;
-        }
-
-        CollectStat(param.collect_mea_stat, mresult.bits, stat);
-
-        stat.primary = false;
-
-        continue;
-      }
-
-      // "Normal" channel.
-
-      double r = distr(rgen);
-      double cp = 0;
-
-      // Perform sampling of Kraus operators using probability bounds.
-      for (std::size_t i = 0; i < channel.size(); ++i) {
-        const auto& kop = channel[i];
-
-        cp += kop.prob;
-
-        if (r < cp) {
-          DeferOps(kop.ops, gates);
-          CollectStat(param.collect_kop_stat, i, stat);
-
-          unitary = unitary && kop.unitary;
-
-          break;
-        }
-      }
-
-      if (r < cp) continue;
-
-      if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
-        return false;
-      }
-
-      NormalizeState(!unitary, state_space, unitary, state);
-
-      double max_prob = 0;
-      std::size_t max_prob_index = 0;
-
-      // Perform sampling of Kraus operators using norms of updated states.
-      for (std::size_t i = 0; i < channel.size(); ++i) {
-        const auto& kop = channel[i];
-
-        if (kop.unitary) continue;
-
-        double prob = std::real(
-            simulator.ExpectationValue(kop.qubits, kop.kd_k.data(), state));
-
-        if (prob > max_prob) {
-          max_prob = prob;
-          max_prob_index = i;
-        }
-
-        cp += prob - kop.prob;
-
-        if (r < cp || i == channel.size() - 1) {
-          // Sample ith Kraus operator if r < cp
-          // Sample the highest probability Kraus operator if r is greater
-          // than the sum of all probablities due to round-off errors.
-          uint64_t k = r < cp ? i : max_prob_index;
-
-          DeferOps(channel[k].ops, gates);
-          CollectStat(param.collect_kop_stat, k, stat);
-
-          unitary = false;
-
-          break;
-        }
-      }
-    }
-
-    if (apply_last_deferred_ops || !stat.primary) {
-      if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) {
-        return false;
-      }
-
-      NormalizeState(!unitary, state_space, unitary, state);
-    }
-
-    return true;
-  }
-
-  static State CreateState(unsigned num_qubits, const StateSpace& state_space) {
-    auto state = state_space.Create(num_qubits);
-    if (state_space.IsNull(state)) {
-      IO::errorf("not enough memory: is the number of qubits too large?\n");
-      return state_space.Null();
-    }
-
-    return state;
-  }
-
-  static bool ApplyDeferredOps(
-      const Parameter& param, unsigned num_qubits, const Simulator& simulator,
-      std::vector<const Gate*>& gates, State& state) {
-    if (gates.size() > 0) {
-      auto fgates = Fuser::FuseGates(param, num_qubits, gates);
-
-      gates.resize(0);
-
-      if (fgates.size() == 0) {
-        return false;
-      }
-
-      for (const auto& fgate : fgates) {
-        ApplyFusedGate(simulator, fgate, state);
-      }
-    }
-
-    return true;
-  }
-
-  static MeasurementResult ApplyMeasurementGate(
-      const StateSpace& state_space, const Gate& gate,
-      RGen& rgen, State& state) {
-    auto result = state_space.Measure(gate.qubits, rgen, state);
-
-    if (!result.valid) {
-      IO::errorf("measurement failed.\n");
-    }
-
-    return result;
-  }
-
-  static void DeferOps(
-      const std::vector<Gate>& ops, std::vector<const Gate*>& gates) {
-    for (const auto& op : ops) {
-      gates.push_back(&op);
-    }
-  }
-
-  static void CollectStat(bool collect_stat, uint64_t i, Stat& stat) {
-    if (collect_stat) {
-      stat.samples.push_back(i);
-    }
-
-    if (i != 0) {
-      stat.primary = false;
-    }
-  }
-
-  static void NormalizeState(bool normalize, const StateSpace& state_space,
-                             bool& flag, State& state) {
-    if (normalize) {
-      double a = 1.0 / std::sqrt(state_space.Norm(state));
-      state_space.Multiply(a, state);
-      flag = true;
-    }
-  }
-};
-
-}  // namespace qsim
-
-#endif  // QTRAJECTORY_H_
diff --git a/tpls/qsim/run_qsim.h b/tpls/qsim/run_qsim.h
deleted file mode 100644
index 3752915..0000000
--- a/tpls/qsim/run_qsim.h
+++ /dev/null
@@ -1,262 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef RUN_QSIM_H_
-#define RUN_QSIM_H_
-
-#include <random>
-#include <string>
-#include <vector>
-
-#include "gate.h"
-#include "gate_appl.h"
-#include "util.h"
-
-namespace qsim {
-
-/**
- * Helper struct for running qsim.
- */
-template <typename IO, typename Fuser, typename Factory,
-          typename RGen = std::mt19937>
-struct QSimRunner final {
- public:
-  using Simulator = typename Factory::Simulator;
-  using StateSpace = typename Simulator::StateSpace;
-  using State = typename StateSpace::State;
-  using MeasurementResult = typename StateSpace::MeasurementResult;
-
-  /**
-   * User-specified parameters for gate fusion and simulation.
-   */
-  struct Parameter : public Fuser::Parameter {
-    /**
-     * Random number generator seed to apply measurement gates.
-     */
-    uint64_t seed;
-  };
-
-  /**
-   * Runs the given circuit, only measuring at the end.
-   * @param param Options for gate fusion, parallelism and logging.
-   * @param factory Object to create simulators and state spaces.
-   * @param circuit The circuit to be simulated.
-   * @param measure Function that performs measurements (in the sense of
-   *   computing expectation values, etc).
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Circuit, typename MeasurementFunc>
-  static bool Run(const Parameter& param, const Factory& factory,
-                  const Circuit& circuit, MeasurementFunc measure) {
-    return Run(param, factory, {circuit.gates.back().time}, circuit, measure);
-  }
-
-  /**
-   * Runs the given circuit, measuring at user-specified times.
-   * @param param Options for gate fusion, parallelism and logging.
-   * @param factory Object to create simulators and state spaces.
-   * @param times_to_measure_at Time steps at which to perform measurements.
-   * @param circuit The circuit to be simulated.
-   * @param measure Function that performs measurements (in the sense of
-   *   computing expectation values, etc).
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Circuit, typename MeasurementFunc>
-  static bool Run(const Parameter& param, const Factory& factory,
-                  const std::vector<unsigned>& times_to_measure_at,
-                  const Circuit& circuit, MeasurementFunc measure) {
-    double t0 = 0.0;
-    double t1 = 0.0;
-
-    if (param.verbosity > 1) {
-      t0 = GetTime();
-    }
-
-    RGen rgen(param.seed);
-
-    StateSpace state_space = factory.CreateStateSpace();
-
-    auto state = state_space.Create(circuit.num_qubits);
-    if (state_space.IsNull(state)) {
-      IO::errorf("not enough memory: is the number of qubits too large?\n");
-      return false;
-    }
-
-    state_space.SetStateZero(state);
-    Simulator simulator = factory.CreateSimulator();
-
-    if (param.verbosity > 1) {
-      t1 = GetTime();
-      IO::messagef("init time is %g seconds.\n", t1 - t0);
-      t0 = GetTime();
-    }
-
-    auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits,
-                                        circuit.gates, times_to_measure_at);
-
-    if (fused_gates.size() == 0 && circuit.gates.size() > 0) {
-      return false;
-    }
-
-    if (param.verbosity > 1) {
-      t1 = GetTime();
-      IO::messagef("fuse time is %g seconds.\n", t1 - t0);
-    }
-
-    if (param.verbosity > 0) {
-      t0 = GetTime();
-    }
-
-    unsigned cur_time_index = 0;
-
-    // Apply fused gates.
-    for (std::size_t i = 0; i < fused_gates.size(); ++i) {
-      if (param.verbosity > 3) {
-        t1 = GetTime();
-      }
-
-      if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen,
-                          state)) {
-        IO::errorf("measurement failed.\n");
-        return false;
-      }
-
-      if (param.verbosity > 3) {
-        state_space.DeviceSync();
-        double t2 = GetTime();
-        IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1);
-      }
-
-      unsigned t = times_to_measure_at[cur_time_index];
-
-      if (i == fused_gates.size() - 1 || t < fused_gates[i + 1].time) {
-        // Call back to perform measurements.
-        measure(cur_time_index, state_space, state);
-        ++cur_time_index;
-      }
-    }
-
-    if (param.verbosity > 0) {
-      state_space.DeviceSync();
-      double t2 = GetTime();
-      IO::messagef("time is %g seconds.\n", t2 - t0);
-    }
-
-    return true;
-  }
-
-  /**
-   * Runs the given circuit and make the final state available to the caller,
-   * recording the result of any intermediate measurements in the circuit.
-   * @param param Options for gate fusion, parallelism and logging.
-   * @param factory Object to create simulators and state spaces.
-   * @param circuit The circuit to be simulated.
-   * @param state As an input parameter, this should contain the initial state
-   *   of the system. After a successful run, it will be populated with the
-   *   final state of the system.
-   * @param measure_results As an input parameter, this should be empty.
-   *   After a successful run, this will contain all measurements results from
-   *   the run, ordered by time and qubit index.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Circuit>
-  static bool Run(const Parameter& param, const Factory& factory,
-                  const Circuit& circuit, State& state,
-                  std::vector<MeasurementResult>& measure_results) {
-    double t0 = 0.0;
-    double t1 = 0.0;
-
-    if (param.verbosity > 1) {
-      t0 = GetTime();
-    }
-
-    RGen rgen(param.seed);
-
-    StateSpace state_space = factory.CreateStateSpace();
-    Simulator simulator = factory.CreateSimulator();
-
-    if (param.verbosity > 1) {
-      t1 = GetTime();
-      IO::messagef("init time is %g seconds.\n", t1 - t0);
-      t0 = GetTime();
-    }
-
-    auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits,
-                                        circuit.gates);
-
-    if (fused_gates.size() == 0 && circuit.gates.size() > 0) {
-      return false;
-    }
-
-    measure_results.reserve(fused_gates.size());
-
-    if (param.verbosity > 1) {
-      t1 = GetTime();
-      IO::messagef("fuse time is %g seconds.\n", t1 - t0);
-    }
-
-    if (param.verbosity > 0) {
-      t0 = GetTime();
-    }
-
-    // Apply fused gates.
-    for (std::size_t i = 0; i < fused_gates.size(); ++i) {
-      if (param.verbosity > 3) {
-        t1 = GetTime();
-      }
-
-      if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, state,
-                          measure_results)) {
-        IO::errorf("measurement failed.\n");
-        return false;
-      }
-
-      if (param.verbosity > 3) {
-        state_space.DeviceSync();
-        double t2 = GetTime();
-        IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1);
-      }
-    }
-
-    if (param.verbosity > 0) {
-      state_space.DeviceSync();
-      double t2 = GetTime();
-      IO::messagef("simu time is %g seconds.\n", t2 - t0);
-    }
-
-    return true;
-  }
-
-  /**
-   * Runs the given circuit and make the final state available to the caller,
-   * discarding the result of any intermediate measurements in the circuit.
-   * @param param Options for gate fusion, parallelism and logging.
-   * @param factory Object to create simulators and state spaces.
-   * @param circuit The circuit to be simulated.
-   * @param state As an input parameter, this should contain the initial state
-   *   of the system. After a successful run, it will be populated with the
-   *   final state of the system.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Circuit>
-  static bool Run(const Parameter& param, const Factory& factory,
-                  const Circuit& circuit, State& state) {
-    std::vector<MeasurementResult> discarded_results;
-    return Run(param, factory, circuit, state, discarded_results);
-  }
-};
-
-}  // namespace qsim
-
-#endif  // RUN_QSIM_H_
diff --git a/tpls/qsim/run_qsimh.h b/tpls/qsim/run_qsimh.h
deleted file mode 100644
index c1534d3..0000000
--- a/tpls/qsim/run_qsimh.h
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef RUN_QSIMH_H_
-#define RUN_QSIMH_H_
-
-#include <string>
-#include <vector>
-
-#include "hybrid.h"
-#include "util.h"
-
-namespace qsim {
-
-/**
- * Helper struct for running qsimh.
- */
-template <typename IO, typename HybridSimulator>
-struct QSimHRunner final {
-  using Gate = typename HybridSimulator::Gate;
-  using fp_type = typename HybridSimulator::fp_type;
-
-  using Parameter = typename HybridSimulator::Parameter;
-  using HybridData = typename HybridSimulator::HybridData;
-  using Fuser = typename HybridSimulator::Fuser;
-
-  /**
-   * Evaluates the amplitudes for a given circuit and set of output states.
-   * @param param Options for gate fusion, parallelism and logging. Also
-   *   specifies the size of the 'prefix' and 'root' sections of the lattice.
-   * @param factory Object to create simulators and state spaces.
-   * @param circuit The circuit to be simulated.
-   * @param parts Lattice sections to be simulated.
-   * @param bitstrings List of output states to simulate, as bitstrings.
-   * @param results Output vector of amplitudes. After a successful run, this
-   *   will be populated with amplitudes for each state in 'bitstrings'.
-   * @return True if the simulation completed successfully; false otherwise.
-   */
-  template <typename Factory, typename Circuit>
-  static bool Run(const Parameter& param, const Factory& factory,
-                  const Circuit& circuit, const std::vector<unsigned>& parts,
-                  const std::vector<uint64_t>& bitstrings,
-                  std::vector<std::complex<fp_type>>& results) {
-    if (circuit.num_qubits != parts.size()) {
-      IO::errorf("parts size is not equal to the number of qubits.");
-      return false;
-    }
-
-    double t0 = 0.0;
-
-    if (param.verbosity > 0) {
-      t0 = GetTime();
-    }
-
-    HybridData hd;
-    bool rc = HybridSimulator::SplitLattice(parts, circuit.gates, hd);
-
-    if (!rc) {
-      return false;
-    }
-
-    if (hd.num_gatexs < param.num_prefix_gatexs + param.num_root_gatexs) {
-      IO::errorf("error: num_prefix_gates (%u) plus num_root gates (%u) is "
-                 "greater than num_gates_on_the_cut (%u).\n",
-                 param.num_prefix_gatexs, param.num_root_gatexs,
-                 hd.num_gatexs);
-      return false;
-    }
-
-    if (param.verbosity > 0) {
-      PrintInfo(param, hd);
-    }
-
-    auto fgates0 = Fuser::FuseGates(param, hd.num_qubits0, hd.gates0);
-    if (fgates0.size() == 0 && hd.gates0.size() > 0) {
-      return false;
-    }
-
-    auto fgates1 = Fuser::FuseGates(param, hd.num_qubits1, hd.gates1);
-    if (fgates1.size() == 0 && hd.gates1.size() > 0) {
-      return false;
-    }
-
-    rc = HybridSimulator(param.num_threads).Run(
-        param, factory, hd, parts, fgates0, fgates1, bitstrings, results);
-
-    if (rc && param.verbosity > 0) {
-      double t1 = GetTime();
-      IO::messagef("time elapsed %g seconds.\n", t1 - t0);
-    }
-
-    return rc;
-  }
-
- private:
-  static void PrintInfo(const Parameter& param, const HybridData& hd) {
-    unsigned num_suffix_gates =
-        hd.num_gatexs - param.num_prefix_gatexs - param.num_root_gatexs;
-
-    IO::messagef("part 0: %u, part 1: %u\n", hd.num_qubits0, hd.num_qubits1);
-    IO::messagef("%u gates on the cut\n", hd.num_gatexs);
-    IO::messagef("breakup: %up+%ur+%us\n", param.num_prefix_gatexs,
-                 param.num_root_gatexs, num_suffix_gates);
-  }
-};
-
-}  // namespace qsim
-
-#endif  // RUN_QSIM_H_
diff --git a/tpls/qsim/seqfor.h b/tpls/qsim/seqfor.h
deleted file mode 100644
index 3ebf07c..0000000
--- a/tpls/qsim/seqfor.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SEQFOR_H_
-#define SEQFOR_H_
-
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-namespace qsim {
-
-/**
- * Helper struct for executing for loops in series.
- */
-struct SequentialFor {
-  explicit SequentialFor(unsigned num_threads) {}
-
-  // SequentialFor does not have any state. So all its methods can be static.
-
-  static uint64_t GetIndex0(uint64_t size, unsigned thread_id) {
-    return 0;
-  }
-
-  static uint64_t GetIndex1(uint64_t size, unsigned thread_id) {
-    return size;
-  }
-
-  template <typename Function, typename... Args>
-  static void Run(uint64_t size, Function&& func, Args&&... args) {
-    for (uint64_t i = 0; i < size; ++i) {
-      func(1, 0, i, args...);
-    }
-  }
-
-  template <typename Function, typename Op, typename... Args>
-  static std::vector<typename Op::result_type> RunReduceP(
-      uint64_t size, Function&& func, Op&& op, Args&&... args) {
-    typename Op::result_type result = 0;
-
-    for (uint64_t i = 0; i < size; ++i) {
-      result = op(result, func(1, 0, i, args...));
-    }
-
-    return std::vector<typename Op::result_type>(1, result);
-  }
-
-  template <typename Function, typename Op, typename... Args>
-  static typename Op::result_type RunReduce(uint64_t size, Function&& func,
-                                            Op&& op, Args&&... args) {
-    return RunReduceP(size, func, std::move(op), args...)[0];
-  }
-};
-
-}  // namespace qsim
-
-#endif  // SEQFOR_H_
diff --git a/tpls/qsim/simmux.h b/tpls/qsim/simmux.h
deleted file mode 100644
index d3c4074..0000000
--- a/tpls/qsim/simmux.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMMUX_H_
-#define SIMMUX_H_
-
-#ifdef __AVX512F__
-# include "simulator_avx512.h"
-  namespace qsim {
-    template <typename For>
-    using Simulator = SimulatorAVX512<For>;
-  }
-#elif __AVX2__
-# include "simulator_avx.h"
-  namespace qsim {
-    template <typename For>
-    using Simulator = SimulatorAVX<For>;
-  }
-#elif __SSE4_1__
-# include "simulator_sse.h"
-  namespace qsim {
-    template <typename For>
-    using Simulator = SimulatorSSE<For>;
-  }
-#else
-# include "simulator_basic.h"
-  namespace qsim {
-    template <typename For>
-    using Simulator = SimulatorBasic<For>;
-  }
-#endif
-
-#endif  // SIMMUX_H_
diff --git a/tpls/qsim/simmux_gpu.h b/tpls/qsim/simmux_gpu.h
deleted file mode 100644
index 1f0bb59..0000000
--- a/tpls/qsim/simmux_gpu.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2023 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMMUX_GPU_H_
-#define SIMMUX_GPU_H_
-
-#ifdef __CUSTATEVEC__
-# include "simulator_custatevec.h"
-  namespace qsim {
-    using SimulatorGpu = SimulatorCuStateVec<>;
-  }
-#else
-# include "simulator_cuda.h"
-  namespace qsim {
-    using SimulatorGpu = SimulatorCUDA<>;
-  }
-#endif
-
-#endif  // SIMMUX_GPU_H_
diff --git a/tpls/qsim/simulator.h b/tpls/qsim/simulator.h
deleted file mode 100644
index eff5441..0000000
--- a/tpls/qsim/simulator.h
+++ /dev/null
@@ -1,516 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_H_
-#define SIMULATOR_H_
-
-#include <cstdint>
-
-#include "bits.h"
-
-namespace qsim {
-
-/**
- * Base class for simulator classes.
- */
-class SimulatorBase {
- protected:
-  // The follwoing template parameters are used for functions below.
-  // H - the number of high (target) qubits.
-  // L - the number of low (target) qubits.
-  // R - SIMD register width in floats.
-
-  // Fills the table of masks (ms) that is used to calculate base state indices
-  // and the table of offset indices (xss) that is used to access the state
-  // vector entries in matrix-vector multiplication functions. This function is
-  // used in simulator_basic.h, simulator_sse.h and simulator_avx.h (no bmi2
-  // version).
-  template <unsigned H, unsigned L = 0>
-  static void FillIndices(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          uint64_t* ms, uint64_t* xss) {
-    constexpr unsigned hsize = 1 << H;
-
-    if (H == 0) {
-      ms[0] = uint64_t(-1);
-      xss[0] = 0;
-    } else {
-      uint64_t xs[H + 1];
-
-      xs[0] = uint64_t{1} << (qs[L] + 1);
-      ms[0] = (uint64_t{1} << qs[L]) - 1;
-      for (unsigned i = 1; i < H; ++i) {
-        xs[i] = uint64_t{1} << (qs[L + i] + 1);
-        ms[i] = ((uint64_t{1} << qs[L + i]) - 1) ^ (xs[i - 1] - 1);
-      }
-      ms[H] = ((uint64_t{1} << num_qubits) - 1) ^ (xs[H - 1] - 1);
-
-      for (unsigned i = 0; i < hsize; ++i) {
-        uint64_t a = 0;
-        for (uint64_t k = 0; k < H; ++k) {
-          a += xs[k] * ((i >> k) & 1);
-        }
-        xss[i] = a;
-      }
-    }
-  }
-
-  // Fills gate matrix entries for gates with low qubits.
-  template <unsigned H, unsigned L, unsigned R, typename fp_type>
-  static void FillMatrix(unsigned qmaskl, const fp_type* matrix, fp_type* w) {
-    constexpr unsigned gsize = 1 << (H + L);
-    constexpr unsigned hsize = 1 << H;
-    constexpr unsigned lsize = 1 << L;
-    constexpr unsigned rsize = 1 << R;
-
-    unsigned s = 0;
-
-    for (unsigned i = 0; i < hsize; ++i) {
-      for (unsigned j = 0; j < gsize; ++j) {
-        unsigned p0 = 2 * i * lsize * gsize + 2 * lsize * (j / lsize);
-
-        for (unsigned k = 0; k < rsize; ++k) {
-          unsigned l = bits::CompressBits(k, R, qmaskl);
-          unsigned p = p0 + 2 * (gsize * l + (j + l) % lsize);
-
-          w[s + 0] = matrix[p];
-          w[s + rsize] = matrix[p + 1];
-
-          ++s;
-        }
-
-        s += rsize;
-      }
-    }
-  }
-
-  // Fills gate matrix entries for controlled gates with high target qubits
-  // and low control qubits.
-  template <unsigned H, unsigned R, typename fp_type>
-  static void FillControlledMatrixH(uint64_t cvalsl, uint64_t cmaskl,
-                                    const fp_type* matrix, fp_type* w) {
-    constexpr unsigned hsize = 1 << H;
-    constexpr unsigned rsize = 1 << R;
-
-    unsigned s = 0;
-
-    for (unsigned i = 0; i < hsize; ++i) {
-      for (unsigned j = 0; j < hsize; ++j) {
-        unsigned p = hsize * i + j;
-        fp_type v = i == j ? 1 : 0;
-
-        for (unsigned k = 0; k < rsize; ++k) {
-          w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v;
-          w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0;
-
-          ++s;
-        }
-
-        s += rsize;
-      }
-    }
-  }
-
-  // Fills gate matrix entries for controlled gates with low target qubits
-  // and low control qubits.
-  template <unsigned H, unsigned L, unsigned R, typename fp_type>
-  static void FillControlledMatrixL(uint64_t cvalsl, uint64_t cmaskl,
-                                    unsigned qmaskl, const fp_type* matrix,
-                                    fp_type* w) {
-    constexpr unsigned gsize = 1 << (H + L);
-    constexpr unsigned hsize = 1 << H;
-    constexpr unsigned lsize = 1 << L;
-    constexpr unsigned rsize = 1 << R;
-
-    unsigned s = 0;
-
-    for (unsigned i = 0; i < hsize; ++i) {
-      for (unsigned j = 0; j < gsize; ++j) {
-        unsigned p0 = i * lsize * gsize + lsize * (j / lsize);
-
-        for (unsigned k = 0; k < rsize; ++k) {
-          unsigned l = bits::CompressBits(k, R, qmaskl);
-          unsigned p = p0 + gsize * l + (j + l) % lsize;
-
-          fp_type v = p / gsize == p % gsize ? 1 : 0;
-
-          w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v;
-          w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0;
-
-          ++s;
-        }
-
-        s += rsize;
-      }
-    }
-  }
-
-/*
-  The GetMasks* functions below provide various masks and related values.
-  GetMasks1, GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6 are
-  used in simulator_avx.h (BMI2 version) and in simulator_avx512.h. GetMasks7,
-  GetMasks8, GetMasks9, GetMasks10 and GetMasks11 are used in simulator_avx.h
-  (no BMI2 version) and in simulator_sse.h.
-
-  imaskh - inverted mask of high qubits (high control and target qubits).
-  qmaskh - mask of high qubits (high target qubits).
-  cvalsh - control bit values of high control qubits placed in correct
-           positions.
-  cvalsl - control bit values of low control qubits placed in correct positions.
-  cmaskh - mask of high control qubits.
-  cmaskl - mask of low control qubits.
-  qmaskl - mask of low qubits (low target qubits).
-  cl - the number of low control qubits.
-
-  Note that imaskh, qmaskh and cvalsh are multiplied by two in GetMasks1,
-  GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6.
-*/
-
-  struct Masks1 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-  };
-
-  template <unsigned H, unsigned R>
-  static Masks1 GetMasks1(const std::vector<unsigned>& qs) {
-    uint64_t qmaskh = 0;
-
-    for (unsigned i = 0; i < H; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh};
-  }
-
-  struct Masks2 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-    unsigned qmaskl;
-  };
-
-  template <unsigned H, unsigned L, unsigned R>
-  static Masks2 GetMasks2(const std::vector<unsigned>& qs) {
-    uint64_t qmaskh = 0;
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    for (unsigned i = L; i < H + L; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh, qmaskl};
-  }
-
-  struct Masks3 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-    uint64_t cvalsh;
-  };
-
-  template <unsigned H, unsigned R>
-  static Masks3 GetMasks3(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    uint64_t qmaskh = 0;
-    uint64_t cmaskh = 0;
-
-    for (unsigned i = 0; i < H; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    for (auto q : cqs) {
-      cmaskh |= uint64_t{1} << q;
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
-
-    uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1);
-
-    return {2 * maskh, 2 * qmaskh, 2 * cvalsh};
-  }
-
-  struct Masks4 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-    uint64_t cvalsh;
-    uint64_t cvalsl;
-    uint64_t cmaskl;
-    unsigned cl;
-  };
-
-  template <unsigned H, unsigned R>
-  static Masks4 GetMasks4(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    unsigned cl = 0;
-    uint64_t qmaskh = 0;
-    uint64_t cmaskh = 0;
-    uint64_t cmaskl = 0;
-
-    for (unsigned i = 0; i < H; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    for (auto q : cqs) {
-      if (q >= R) {
-        cmaskh |= uint64_t{1} << q;
-      } else {
-        ++cl;
-        cmaskl |= uint64_t{1} << q;
-      }
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh);
-    uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl);
-
-    uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1);
-
-    return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, cl};
-  }
-
-  struct Masks5 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-    uint64_t cvalsh;
-    unsigned qmaskl;
-  };
-
-  template <unsigned H, unsigned L, unsigned R>
-  static Masks5 GetMasks5(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    uint64_t qmaskh = 0;
-    uint64_t cmaskh = 0;
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    for (unsigned i = L; i < H + L; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    for (auto q : cqs) {
-      cmaskh |= uint64_t{1} << q;
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
-
-    uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1);
-
-    return {2 * maskh, 2 * qmaskh, 2 * cvalsh, qmaskl};
-  }
-
-  struct Masks6 {
-    uint64_t imaskh;
-    uint64_t qmaskh;
-    uint64_t cvalsh;
-    uint64_t cvalsl;
-    uint64_t cmaskl;
-    unsigned qmaskl;
-    unsigned cl;
-  };
-
-  template <unsigned H, unsigned L, unsigned R>
-  static Masks6 GetMasks6(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    unsigned cl = 0;
-    uint64_t qmaskh = 0;
-    uint64_t cmaskh = 0;
-    uint64_t cmaskl = 0;
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    for (unsigned i = L; i < H + L; ++i) {
-      qmaskh |= uint64_t{1} << qs[i];
-    }
-
-    for (auto q : cqs) {
-      if (q >= R) {
-        cmaskh |= uint64_t{1} << q;
-      } else {
-        ++cl;
-        cmaskl |= uint64_t{1} << q;
-      }
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh);
-    uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl);
-
-    uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1);
-
-    return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, qmaskl, cl};
-  }
-
-  struct Masks7 {
-    uint64_t cvalsh;
-    uint64_t cmaskh;
-  };
-
-  static Masks7 GetMasks7(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    uint64_t cmaskh = 0;
-
-    for (auto q : cqs) {
-      cmaskh |= uint64_t{1} << q;
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
-
-    return {cvalsh, cmaskh};
-  }
-
-  struct Masks8 {
-    uint64_t cvalsh;
-    uint64_t cmaskh;
-    uint64_t cvalsl;
-    uint64_t cmaskl;
-  };
-
-  template <unsigned R>
-  static Masks8 GetMasks8(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    unsigned cl = 0;
-    uint64_t cmaskh = 0;
-    uint64_t cmaskl = 0;
-
-    for (auto q : cqs) {
-      if (q >= R) {
-        cmaskh |= uint64_t{1} << q;
-      } else {
-        ++cl;
-        cmaskl |= uint64_t{1} << q;
-      }
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh);
-    uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl);
-
-    return {cvalsh, cmaskh, cvalsl, cmaskl};
-  }
-
-  struct Masks9 {
-    uint64_t cvalsh;
-    uint64_t cmaskh;
-    unsigned qmaskl;
-  };
-
-  template <unsigned L>
-  static Masks9 GetMasks9(unsigned num_qubits, const std::vector<unsigned>& qs,
-                          const std::vector<unsigned>& cqs, uint64_t cvals) {
-    uint64_t cmaskh = 0;
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    for (auto q : cqs) {
-      cmaskh |= uint64_t{1} << q;
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
-
-    return {cvalsh, cmaskh, qmaskl};
-  }
-
-  struct Masks10 {
-    uint64_t cvalsh;
-    uint64_t cmaskh;
-    uint64_t cvalsl;
-    uint64_t cmaskl;
-    unsigned qmaskl;
-  };
-
-  template <unsigned L, unsigned R>
-  static Masks10 GetMasks10(unsigned num_qubits,
-                            const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals) {
-    unsigned cl = 0;
-    uint64_t cmaskh = 0;
-    uint64_t cmaskl = 0;
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    for (auto q : cqs) {
-      if (q >= R) {
-        cmaskh |= uint64_t{1} << q;
-      } else {
-        ++cl;
-        cmaskl |= uint64_t{1} << q;
-      }
-    }
-
-    uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh);
-    uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl);
-
-    return {cvalsh, cmaskh, cvalsl, cmaskl, qmaskl};
-  }
-
-  struct Masks11 {
-    unsigned qmaskl;
-  };
-
-  template <unsigned L>
-  static Masks11 GetMasks11(const std::vector<unsigned>& qs) {
-    unsigned qmaskl = 0;
-
-    for (unsigned i = 0; i < L; ++i) {
-      qmaskl |= 1 << qs[i];
-    }
-
-    return {qmaskl};
-  }
-
-  template <unsigned R>
-  static unsigned MaskedAdd(
-      unsigned a, unsigned b, unsigned mask, unsigned lsize) {
-    unsigned c = bits::CompressBits(a, R, mask);
-    return bits::ExpandBits((c + b) % lsize, R, mask);
-  }
-};
-
-template <>
-inline void SimulatorBase::FillIndices<0, 1>(unsigned num_qubits,
-                                             const std::vector<unsigned>& qs,
-                                             uint64_t* ms, uint64_t* xss) {
-  ms[0] = -1;
-  xss[0] = 0;
-}
-
-template <>
-inline void SimulatorBase::FillIndices<0, 2>(unsigned num_qubits,
-                                             const std::vector<unsigned>& qs,
-                                             uint64_t* ms, uint64_t* xss) {
-  ms[0] = -1;
-  xss[0] = 0;
-}
-
-template <>
-inline void SimulatorBase::FillIndices<0, 3>(unsigned num_qubits,
-                                             const std::vector<unsigned>& qs,
-                                             uint64_t* ms, uint64_t* xss) {
-  ms[0] = -1;
-  xss[0] = 0;
-}
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_H_
diff --git a/tpls/qsim/simulator_avx.h b/tpls/qsim/simulator_avx.h
deleted file mode 100644
index 9742849..0000000
--- a/tpls/qsim/simulator_avx.h
+++ /dev/null
@@ -1,1363 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_AVX_H_
-#define SIMULATOR_AVX_H_
-
-#include <immintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "statespace_avx.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator with AVX vectorization.
- */
-template <typename For>
-class SimulatorAVX final : public SimulatorBase {
- public:
-  using StateSpace = StateSpaceAVX<For>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  template <typename... ForArgs>
-  explicit SimulatorAVX(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using AVX instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 0:
-      ApplyGateH<0>(qs, matrix, state);
-      break;
-    case 1:
-      if (qs[0] > 2) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 2) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 2) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 2) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 3>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 2) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 3>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 2) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<3, 3>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using AVX instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 0:
-      if (cqs[0] > 2) {
-        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
-      } else {
-        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
-      }
-      break;
-    case 1:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using AVX instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 2) {
-        return ExpectationValueH<1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 2) {
-        return ExpectationValueH<2>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        return ExpectationValueL<1, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 2) {
-        return ExpectationValueH<3>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        return ExpectationValueL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        return ExpectationValueL<1, 2>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 2) {
-        return ExpectationValueH<4>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        return ExpectationValueL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        return ExpectationValueL<2, 2>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<1, 3>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 2) {
-        return ExpectationValueH<5>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        return ExpectationValueL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        return ExpectationValueL<3, 2>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<2, 3>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 2) {
-        return ExpectationValueH<6>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        return ExpectationValueL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        return ExpectationValueL<4, 2>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<3, 3>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-
-    return 0;
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 8;
-  }
-
- private:
-#ifdef __BMI2__
-
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    auto m = GetMasks1<H, 3>(qs);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm256_load_ps(p0 + p);
-        is[k2] = _mm256_load_ps(p0 + p + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 3>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    auto m = GetMasks3<H, 3>(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned k = 3 + H + cqs.size();
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks4<H, 3>(state.num_qubits(), qs, cqs, cvals);
-    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H + cqs.size() - m.cl;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                const __m256i* idx, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm256_load_ps(p0 + p);
-        is[k2] = _mm256_load_ps(p0 + p + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    if (CH) {
-      auto m = GetMasks5<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned r = 3 + H + cqs.size();
-      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
-    } else {
-      auto m = GetMasks6<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 3>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned r = 3 + H + cqs.size() - m.cl;
-      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
-    }
-  }
-
-  template <unsigned H>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn));
-        __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in));
-
-        re += detail::HorizontalSumAVX(v_re);
-        im += detail::HorizontalSumAVX(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    auto m = GetMasks1<H, 3>(qs);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return
-        for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
-                const fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm256_load_ps(p0 + p);
-        is[k2] = _mm256_load_ps(p0 + p + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        unsigned m = lsize * k;
-
-        __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn));
-        __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in));
-
-        re += detail::HorizontalSumAVX(v_re);
-        im += detail::HorizontalSumAVX(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 3>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return
-        for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get());
-  }
-
-#else  // __BMI2__
-
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        rs[k2] = _mm256_load_ps(p0 + xss[k]);
-        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, ms, xss, idx, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, const __m256i* idx, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm256_load_ps(p0 + xss[k]);
-        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    if (CH) {
-      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get());
-    } else {
-      auto m = GetMasks10<L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 3>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get());
-    }
-  }
-
-  template <unsigned H>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss,
-                const fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn));
-        __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in));
-
-        re += detail::HorizontalSumAVX(v_re);
-        im += detail::HorizontalSumAVX(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
-                const fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      i *= 8;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm256_load_ps(p0 + xss[k]);
-        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        unsigned m = lsize * k;
-
-        __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn));
-        __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in));
-
-        re += detail::HorizontalSumAVX(v_re);
-        im += detail::HorizontalSumAVX(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 3 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return for_.RunReduce(size, f, Op(), w, ms, xss, idx, state.get());
-  }
-
-#endif  // __BMI2__
-
-  template <unsigned L>
-  static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) {
-    constexpr unsigned lsize = 1 << L;
-
-    for (unsigned i = 0; i < lsize - 1; ++i) {
-      unsigned p[8];
-
-      for (unsigned j = 0; j < 8; ++j) {
-        p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
-      }
-
-      idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]);
-    }
-  }
-
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_AVX_H_
diff --git a/tpls/qsim/simulator_avx512.h b/tpls/qsim/simulator_avx512.h
deleted file mode 100644
index 21a2e9d..0000000
--- a/tpls/qsim/simulator_avx512.h
+++ /dev/null
@@ -1,846 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_AVX512_H_
-#define SIMULATOR_AVX512_H_
-
-#include <immintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "statespace_avx512.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator with AVX512 vectorization.
- */
-template <typename For>
-class SimulatorAVX512 final : public SimulatorBase {
- public:
-  using StateSpace = StateSpaceAVX512<For>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  template <typename... ForArgs>
-  explicit SimulatorAVX512(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using AVX512 instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 0:
-      ApplyGateH<0>(qs, matrix, state);
-      break;
-    case 1:
-      if (qs[0] > 3) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 3) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 3) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 3) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<1, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 4>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 3) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<2, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 4>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 3) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<3, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 4>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using AVX512 instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 0:
-      if (cqs[0] > 3) {
-        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
-      } else {
-        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
-      }
-      break;
-    case 1:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[3] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using AVX512 instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 3) {
-        return ExpectationValueH<1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 3) {
-        return ExpectationValueH<2>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        return ExpectationValueL<1, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 3) {
-        return ExpectationValueH<3>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        return ExpectationValueL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        return ExpectationValueL<1, 2>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 3) {
-        return ExpectationValueH<4>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        return ExpectationValueL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        return ExpectationValueL<2, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        return ExpectationValueL<1, 3>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 4>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 3) {
-        return ExpectationValueH<5>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        return ExpectationValueL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        return ExpectationValueL<3, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        return ExpectationValueL<2, 3>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<1, 4>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 3) {
-        return ExpectationValueH<6>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        return ExpectationValueL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        return ExpectationValueL<4, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        return ExpectationValueL<3, 3>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<2, 4>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-
-    return 0;
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 16;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 ru, iu, rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm512_set1_ps(v[j]);
-        iu = _mm512_set1_ps(v[j + 1]);
-        rn = _mm512_mul_ps(rs[0], ru);
-        in = _mm512_mul_ps(rs[0], iu);
-        rn = _mm512_fnmadd_ps(is[0], iu, rn);
-        in = _mm512_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm512_set1_ps(v[j]);
-          iu = _mm512_set1_ps(v[j + 1]);
-          rn = _mm512_fmadd_ps(rs[l], ru, rn);
-          in = _mm512_fmadd_ps(rs[l], iu, in);
-          rn = _mm512_fnmadd_ps(is[l], iu, rn);
-          in = _mm512_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    auto m = GetMasks1<H, 4>(qs);
-
-    unsigned k = 4 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m512 rn, in;
-      __m512 rs[gsize], is[gsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm512_load_ps(p0 + p);
-        is[k2] = _mm512_load_ps(p0 + p + 16);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
-          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512i idx[1 << L];
-    __m512 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 4>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 4 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 ru, iu, rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm512_set1_ps(v[j]);
-        iu = _mm512_set1_ps(v[j + 1]);
-        rn = _mm512_mul_ps(rs[0], ru);
-        in = _mm512_mul_ps(rs[0], iu);
-        rn = _mm512_fnmadd_ps(is[0], iu, rn);
-        in = _mm512_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm512_set1_ps(v[j]);
-          iu = _mm512_set1_ps(v[j + 1]);
-          rn = _mm512_fmadd_ps(rs[l], ru, rn);
-          in = _mm512_fmadd_ps(rs[l], iu, in);
-          rn = _mm512_fnmadd_ps(is[l], iu, rn);
-          in = _mm512_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    auto m = GetMasks3<H, 4>(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned k = 4 + H + cqs.size();
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks4<H, 4>(state.num_qubits(), qs, cqs, cvals);
-    FillControlledMatrixH<H, 4>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 4 + H + cqs.size() - m.cl;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                const __m512i* idx, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m512 rn, in;
-      __m512 rs[gsize], is[gsize];
-
-      auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm512_load_ps(p0 + p);
-        is[k2] = _mm512_load_ps(p0 + p + 16);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
-          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512i idx[1 << L];
-    __m512 w[1 << (1 + 2 * H + L)];
-
-    if (CH) {
-      auto m = GetMasks5<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned r = 4 + H + cqs.size();
-      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
-    } else {
-      auto m = GetMasks6<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 4>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned r = 4 + H + cqs.size() - m.cl;
-      unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get());
-    }
-  }
-
-  template <unsigned H>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 ru, iu, rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm512_set1_ps(v[j]);
-        iu = _mm512_set1_ps(v[j + 1]);
-        rn = _mm512_mul_ps(rs[0], ru);
-        in = _mm512_mul_ps(rs[0], iu);
-        rn = _mm512_fnmadd_ps(is[0], iu, rn);
-        in = _mm512_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm512_set1_ps(v[j]);
-          iu = _mm512_set1_ps(v[j + 1]);
-          rn = _mm512_fmadd_ps(rs[l], ru, rn);
-          in = _mm512_fmadd_ps(rs[l], iu, in);
-          rn = _mm512_fnmadd_ps(is[l], iu, rn);
-          in = _mm512_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        __m512 v_re = _mm512_fmadd_ps(is[k], in, _mm512_mul_ps(rs[k], rn));
-        __m512 v_im = _mm512_fnmadd_ps(is[k], rn, _mm512_mul_ps(rs[k], in));
-
-        re += detail::HorizontalSumAVX512(v_re);
-        im += detail::HorizontalSumAVX512(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    auto m = GetMasks1<H, 4>(qs);
-
-    unsigned k = 4 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return
-        for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
-                const fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m512 rn, in;
-      __m512 rs[gsize], is[gsize];
-
-      auto p0 = rstate + _pdep_u64(i, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm512_load_ps(p0 + p);
-        is[k2] = _mm512_load_ps(p0 + p + 16);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
-          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
-        }
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        unsigned m = lsize * k;
-
-        __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn));
-        __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in));
-
-        re += detail::HorizontalSumAVX512(v_re);
-        im += detail::HorizontalSumAVX512(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    __m512i idx[1 << L];
-    __m512 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 4>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 4 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return
-        for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get());
-  }
-
-  template <unsigned L>
-  static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) {
-    constexpr unsigned lsize = 1 << L;
-
-    for (unsigned i = 0; i < lsize; ++i) {
-      unsigned p[16];
-
-      for (unsigned j = 0; j < 16; ++j) {
-        p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
-      }
-
-      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
-                                p[9], p[8], p[7], p[6], p[5], p[4],
-                                p[3], p[2], p[1], p[0]);
-    }
-  }
-
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_AVX512_H_
diff --git a/tpls/qsim/simulator_basic.h b/tpls/qsim/simulator_basic.h
deleted file mode 100644
index 752eeb5..0000000
--- a/tpls/qsim/simulator_basic.h
+++ /dev/null
@@ -1,349 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_BASIC_H_
-#define SIMULATOR_BASIC_H_
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "statespace_basic.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator without vectorization.
- */
-template <typename For, typename FP = float>
-class SimulatorBasic final : public SimulatorBase {
- public:
-  using StateSpace = StateSpaceBasic<For, FP>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  template <typename... ForArgs>
-  explicit SimulatorBasic(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using non-vectorized instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 0:
-      ApplyGateH<0>(qs, matrix, state);
-      break;
-    case 1:
-      ApplyGateH<1>(qs, matrix, state);
-      break;
-    case 2:
-      ApplyGateH<2>(qs, matrix, state);
-      break;
-    case 3:
-      ApplyGateH<3>(qs, matrix, state);
-      break;
-    case 4:
-      ApplyGateH<4>(qs, matrix, state);
-      break;
-    case 5:
-      ApplyGateH<5>(qs, matrix, state);
-      break;
-    case 6:
-      ApplyGateH<6>(qs, matrix, state);
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using non-vectorized instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 0:
-      ApplyControlledGateH<0>(qs, cqs, cvals, matrix, state);
-      break;
-    case 1:
-      ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state);
-      break;
-    case 2:
-      ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state);
-      break;
-    case 3:
-      ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state);
-      break;
-    case 4:
-      ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state);
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using non-vectorized
-   * instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      return ExpectationValueH<1>(qs, matrix, state);
-      break;
-    case 2:
-      return ExpectationValueH<2>(qs, matrix, state);
-      break;
-    case 3:
-      return ExpectationValueH<3>(qs, matrix, state);
-      break;
-    case 4:
-      return ExpectationValueH<4>(qs, matrix, state);
-      break;
-    case 5:
-      return ExpectationValueH<5>(qs, matrix, state);
-      break;
-    case 6:
-      return ExpectationValueH<6>(qs, matrix, state);
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-
-    return 0;
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 1;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      fp_type rn, in;
-      fp_type rs[hsize], is[hsize];
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = *(p0 + xss[k]);
-        is[k] = *(p0 + xss[k] + 1);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = rs[0] * v[j] - is[0] * v[j + 1];
-        in = rs[0] * v[j + 1] + is[0] * v[j];
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn += rs[l] * v[j] - is[l] * v[j + 1];
-          in += rs[l] * v[j + 1] + is[l] * v[j];
-
-          j += 2;
-        }
-
-        *(p0 + xss[k]) = rn;
-        *(p0 + xss[k] + 1) = in;
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateH(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs,
-                            uint64_t cvals, const fp_type* matrix,
-                            State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss,
-                uint64_t cvalsh, uint64_t cmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      fp_type rn, in;
-      fp_type rs[hsize], is[hsize];
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) == cvalsh) {
-        auto p0 = rstate + 2 * ii;
-
-        for (unsigned k = 0; k < hsize; ++k) {
-          rs[k] = *(p0 + xss[k]);
-          is[k] = *(p0 + xss[k] + 1);
-        }
-
-        uint64_t j = 0;
-
-        for (unsigned k = 0; k < hsize; ++k) {
-          rn = rs[0] * v[j] - is[0] * v[j + 1];
-          in = rs[0] * v[j + 1] + is[0] * v[j];
-
-          j += 2;
-
-          for (unsigned l = 1; l < hsize; ++l) {
-            rn += rs[l] * v[j] - is[l] * v[j + 1];
-            in += rs[l] * v[j + 1] + is[l] * v[j];
-
-            j += 2;
-          }
-
-          *(p0 + xss[k]) = rn;
-          *(p0 + xss[k] + 1) = in;
-        }
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
-  }
-
-  template <unsigned H>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss,
-                const fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      fp_type rn, in;
-      fp_type rs[hsize], is[hsize];
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = *(p0 + xss[k]);
-        is[k] = *(p0 + xss[k] + 1);
-      }
-
-      double re = 0;
-      double im = 0;
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = rs[0] * v[j] - is[0] * v[j + 1];
-        in = rs[0] * v[j + 1] + is[0] * v[j];
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn += rs[l] * v[j] - is[l] * v[j + 1];
-          in += rs[l] * v[j + 1] + is[l] * v[j];
-
-          j += 2;
-        }
-
-        re += rs[k] * rn + is[k] * in;
-        im += rs[k] * in - is[k] * rn;
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
-  }
-
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_BASIC_H_
diff --git a/tpls/qsim/simulator_cuda.h b/tpls/qsim/simulator_cuda.h
deleted file mode 100644
index 5743bea..0000000
--- a/tpls/qsim/simulator_cuda.h
+++ /dev/null
@@ -1,923 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_CUDA_H_
-#define SIMULATOR_CUDA_H_
-
-#include "simulator_cuda_kernels.h"
-
-#include <algorithm>
-#include <complex>
-#include <cstdint>
-#include <cstring>
-#include <vector>
-
-#include "bits.h"
-#include "statespace_cuda.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator with GPU vectorization.
- */
-template <typename FP = float>
-class SimulatorCUDA final {
- private:
-  using idx_type = uint64_t;
-  using Complex = qsim::Complex<double>;
-
-  // The maximum buffer size for indices and gate matrices.
-  // The maximum gate matrix size (for 6-qubit gates) is
-  // 2 * 2^6 * 2^6 * sizeof(FP) = 8192 * sizeof(FP). The maximum index size is
-  // 128 * sizeof(idx_type) + 96 * sizeof(unsigned).
-  static constexpr unsigned max_buf_size = 8192 * sizeof(FP)
-      + 128 * sizeof(idx_type) + 96 * sizeof(unsigned);
-
- public:
-  using StateSpace = StateSpaceCUDA<FP>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  SimulatorCUDA() : scratch_(nullptr), scratch_size_(0) {
-    ErrorCheck(cudaMalloc(&d_ws, max_buf_size));
-  }
-
-  ~SimulatorCUDA() {
-    ErrorCheck(cudaFree(d_ws));
-
-    if (scratch_ != nullptr) {
-      ErrorCheck(cudaFree(scratch_));
-    }
-  }
-
-  /**
-   * Applies a gate using CUDA instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    if (qs.size() == 0) {
-      ApplyGateH<0>(qs, matrix, state);
-    } else if (qs[0] > 4) {
-      switch (qs.size()) {
-      case 1:
-        ApplyGateH<1>(qs, matrix, state);
-        break;
-      case 2:
-        ApplyGateH<2>(qs, matrix, state);
-        break;
-      case 3:
-        ApplyGateH<3>(qs, matrix, state);
-        break;
-      case 4:
-        ApplyGateH<4>(qs, matrix, state);
-        break;
-      case 5:
-        ApplyGateH<5>(qs, matrix, state);
-        break;
-      case 6:
-        ApplyGateH<6>(qs, matrix, state);
-        break;
-      default:
-        // Not implemented.
-        break;
-      }
-    } else {
-      switch (qs.size()) {
-      case 1:
-        ApplyGateL<1>(qs, matrix, state);
-        break;
-      case 2:
-        ApplyGateL<2>(qs, matrix, state);
-        break;
-      case 3:
-        ApplyGateL<3>(qs, matrix, state);
-        break;
-      case 4:
-        ApplyGateL<4>(qs, matrix, state);
-        break;
-      case 5:
-        ApplyGateL<5>(qs, matrix, state);
-        break;
-      case 6:
-        ApplyGateL<6>(qs, matrix, state);
-        break;
-      default:
-        // Not implemented.
-        break;
-      }
-    }
-  }
-
-  /**
-   * Applies a controlled gate using CUDA instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    if (cqs[0] < 5) {
-      switch (qs.size()) {
-      case 0:
-        ApplyControlledGateL<0>(qs, cqs, cvals, matrix, state);
-        break;
-      case 1:
-        ApplyControlledGateL<1>(qs, cqs, cvals, matrix, state);
-        break;
-      case 2:
-        ApplyControlledGateL<2>(qs, cqs, cvals, matrix, state);
-        break;
-      case 3:
-        ApplyControlledGateL<3>(qs, cqs, cvals, matrix, state);
-        break;
-      case 4:
-        ApplyControlledGateL<4>(qs, cqs, cvals, matrix, state);
-        break;
-      default:
-        // Not implemented.
-        break;
-      }
-    } else {
-      if (qs.size() == 0) {
-        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
-      } else if (qs[0] > 4) {
-        switch (qs.size()) {
-        case 1:
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-          break;
-        case 2:
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-          break;
-        case 3:
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-          break;
-        case 4:
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-          break;
-        default:
-          // Not implemented.
-          break;
-        }
-      } else {
-        switch (qs.size()) {
-        case 1:
-          ApplyControlledGateLH<1>(qs, cqs, cvals, matrix, state);
-          break;
-        case 2:
-          ApplyControlledGateLH<2>(qs, cqs, cvals, matrix, state);
-          break;
-        case 3:
-          ApplyControlledGateLH<3>(qs, cqs, cvals, matrix, state);
-          break;
-        case 4:
-          ApplyControlledGateLH<4>(qs, cqs, cvals, matrix, state);
-          break;
-        default:
-          // Not implemented.
-          break;
-        }
-      }
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using CUDA instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    if (qs[0] > 4) {
-      switch (qs.size()) {
-      case 1:
-        return ExpectationValueH<1>(qs, matrix, state);
-      case 2:
-        return ExpectationValueH<2>(qs, matrix, state);
-      case 3:
-        return ExpectationValueH<3>(qs, matrix, state);
-      case 4:
-        return ExpectationValueH<4>(qs, matrix, state);
-      case 5:
-        return ExpectationValueH<5>(qs, matrix, state);
-      case 6:
-        return ExpectationValueH<6>(qs, matrix, state);
-      default:
-        // Not implemented.
-        break;
-      }
-    } else {
-      switch (qs.size()) {
-      case 1:
-        return ExpectationValueL<1>(qs, matrix, state);
-      case 2:
-        return ExpectationValueL<2>(qs, matrix, state);
-      case 3:
-        return ExpectationValueL<3>(qs, matrix, state);
-      case 4:
-        return ExpectationValueL<4>(qs, matrix, state);
-      case 5:
-        return ExpectationValueL<5>(qs, matrix, state);
-      case 6:
-        return ExpectationValueL<6>(qs, matrix, state);
-      default:
-        // Not implemented.
-        break;
-      }
-    }
-
-    return 0;
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 32;
-  }
-
- private:
-  // The following indices are used in kernels.
-  // xss - indices to access the state vector entries in global memory.
-  // ms  - masks to access the state vector entries in global memory.
-  // tis - indices to access the state vector entries in shared memory
-  //       in the presence of low gate qubits.
-  // qis - indices to access the state vector entries in shared memory
-  //       in the presence of low gate qubits.
-  // cis - additional indices to access the state vector entries in global
-  //       memory in the presence of low control qubits.
-
-  template <unsigned G>
-  struct IndicesH {
-    static constexpr unsigned gsize = 1 << G;
-    static constexpr unsigned matrix_size = 2 * gsize * gsize * sizeof(fp_type);
-    static constexpr unsigned xss_size = 32 * sizeof(idx_type) * (1 + (G == 6));
-    static constexpr unsigned ms_size = 32 * sizeof(idx_type);
-    static constexpr unsigned xss_offs = matrix_size;
-    static constexpr unsigned ms_offs = xss_offs + xss_size;
-    static constexpr unsigned buf_size = ms_offs + ms_size;
-
-    IndicesH(char* p)
-        : xss((idx_type*) (p + xss_offs)), ms((idx_type*) (p + ms_offs)) {}
-
-    idx_type* xss;
-    idx_type* ms;
-  };
-
-  template <unsigned G>
-  struct IndicesL : public IndicesH<G> {
-    using Base = IndicesH<G>;
-    static constexpr unsigned qis_size = 32 * sizeof(unsigned) * (1 + (G == 6));
-    static constexpr unsigned tis_size = 32 * sizeof(unsigned);
-    static constexpr unsigned qis_offs = Base::buf_size;
-    static constexpr unsigned tis_offs = qis_offs + qis_size;
-    static constexpr unsigned buf_size = tis_offs + tis_size;
-
-    IndicesL(char* p)
-        : Base(p), qis((unsigned*) (p + qis_offs)),
-          tis((unsigned*) (p + tis_offs)) {}
-
-    unsigned* qis;
-    unsigned* tis;
-  };
-
-  template <unsigned G>
-  struct IndicesLC : public IndicesL<G> {
-    using Base = IndicesL<G>;
-    static constexpr unsigned cis_size = 32 * sizeof(idx_type);
-    static constexpr unsigned cis_offs = Base::buf_size;
-    static constexpr unsigned buf_size = cis_offs + cis_size;
-
-    IndicesLC(char* p) : Base(p), cis((idx_type*) (p + cis_offs)) {}
-
-    idx_type* cis;
-  };
-
-  struct DataC {
-    idx_type cvalsh;
-    unsigned num_aqs;
-    unsigned num_effective_qs;
-    unsigned remaining_low_cqs;
-  };
-
-  template <unsigned G>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesH<G> h_i(h_ws);
-    GetIndicesH(num_qubits, qs, qs.size(), h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + G;
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-    unsigned threads = 64U;
-    unsigned blocks = std::max(1U, size / 2);
-
-    IndicesH<G> d_i(d_ws);
-
-    ApplyGateH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, state.get());
-  }
-
-  template <unsigned G>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesL<G> h_i(h_ws);
-    auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + num_effective_qs;
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-    unsigned threads = 32;
-    unsigned blocks = size;
-
-    IndicesL<G> d_i(d_ws);
-
-    ApplyGateL_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
-        1 << num_effective_qs, state.get());
-  }
-
-  template <unsigned G>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, idx_type cvals,
-                             const fp_type* matrix, State& state) const {
-    unsigned aqs[64];
-    idx_type cmaskh = 0;
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesH<G> h_i(h_ws);
-
-    unsigned num_aqs = GetHighQubits(qs, 0, cqs, 0, 0, cmaskh, aqs);
-    GetMs(num_qubits, aqs, num_aqs, h_i.ms);
-    GetXss(num_qubits, qs, qs.size(), h_i.xss);
-
-    idx_type cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + G + cqs.size();
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-    unsigned threads = 64U;
-    unsigned blocks = std::max(1U, size / 2);
-
-    IndicesH<G> d_i(d_ws);
-
-    ApplyControlledGateH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get());
-  }
-
-  template <unsigned G>
-  void ApplyControlledGateLH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesL<G> h_i(h_ws);
-    auto d = GetIndicesLC(num_qubits, qs, cqs, cvals, h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + G + cqs.size();
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-    unsigned threads = 32;
-    unsigned blocks = size;
-
-    IndicesL<G> d_i(d_ws);
-
-    ApplyControlledGateLH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
-        d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get());
-  }
-
-  template <unsigned G>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesLC<G> h_i(h_ws);
-    auto d = GetIndicesLCL(num_qubits, qs, cqs, cvals, h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + G + cqs.size();
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-    unsigned threads = 32;
-    unsigned blocks = size;
-
-    IndicesLC<G> d_i(d_ws);
-
-    ApplyControlledGateL_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis,
-        d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs,
-        1 << (5 - d.remaining_low_cqs), state.get());
-  }
-
-  template <unsigned G>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesH<G> h_i(h_ws);
-    GetIndicesH(num_qubits, qs, qs.size(), h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + G;
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-
-    unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U);
-    unsigned threads = 64U;
-    unsigned blocks = std::max(1U, (size / 2) >> s);
-    unsigned num_iterations_per_block = 1 << s;
-
-    constexpr unsigned m = 16;
-
-    Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex));
-    Complex* d_res2 = d_res1 + blocks;
-
-    IndicesH<G> d_i(d_ws);
-
-    ExpectationValueH_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block,
-        state.get(), Plus<double>(), d_res1);
-
-    double mul = size == 1 ? 0.5 : 1.0;
-
-    return ExpectationValueReduceFinal<m>(blocks, mul, d_res1, d_res2);
-  }
-
-  template <unsigned G>
-  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    unsigned num_qubits = state.num_qubits();
-
-    IndicesL<G> h_i(h_ws);
-    auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i);
-
-    std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size);
-    ErrorCheck(
-        cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice));
-
-    unsigned k = 5 + num_effective_qs;
-    unsigned n = num_qubits > k ? num_qubits - k : 0;
-    unsigned size = unsigned{1} << n;
-
-    unsigned s = std::min(n >= 13 ? n - 13 : 0, 5U);
-    unsigned threads = 32;
-    unsigned blocks = size >> s;
-    unsigned num_iterations_per_block = 1 << s;
-
-    constexpr unsigned m = 16;
-
-    Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex));
-    Complex* d_res2 = d_res1 + blocks;
-
-    IndicesL<G> d_i(d_ws);
-
-    ExpectationValueL_Kernel<G><<<blocks, threads>>>(
-        (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis,
-        num_iterations_per_block, state.get(), Plus<double>(), d_res1);
-
-    double mul = double(1 << (5 + num_effective_qs - G)) / 32;
-
-    return ExpectationValueReduceFinal<m>(blocks, mul, d_res1, d_res2);
-  }
-
-  template <unsigned m>
-  std::complex<double> ExpectationValueReduceFinal(
-      unsigned blocks, double mul,
-      const Complex* d_res1, Complex* d_res2) const {
-    Complex res2[m];
-
-    if (blocks <= 16) {
-      ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex),
-                            cudaMemcpyDeviceToHost));
-    } else {
-      unsigned threads2 = std::min(1024U, blocks);
-      unsigned blocks2 = std::min(m, blocks / threads2);
-
-      unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2));
-      unsigned bytes = threads2 * sizeof(Complex);
-
-      Reduce2Kernel<Complex><<<blocks2, threads2, bytes>>>(
-          dblocks, blocks, Plus<Complex>(), Plus<double>(), d_res1, d_res2);
-
-      ErrorCheck(cudaMemcpy(res2, d_res2, blocks2 * sizeof(Complex),
-                            cudaMemcpyDeviceToHost));
-
-      blocks = blocks2;
-    }
-
-    double re = 0;
-    double im = 0;
-
-    for (unsigned i = 0; i < blocks; ++i) {
-      re += res2[i].re;
-      im += res2[i].im;
-    }
-
-    return {mul * re, mul * im};
-  }
-
-  template <typename AQ>
-  unsigned GetHighQubits(const std::vector<unsigned>& qs, unsigned qi,
-                         const std::vector<unsigned>& cqs, unsigned ci,
-                         unsigned ai, idx_type& cmaskh, AQ& aqs) const {
-    while (1) {
-      if (qi < qs.size() && (ci == cqs.size() || qs[qi] < cqs[ci])) {
-        aqs[ai++] = qs[qi++];
-      } else if (ci < cqs.size()) {
-        cmaskh |= idx_type{1} << cqs[ci];
-        aqs[ai++] = cqs[ci++];
-      } else {
-        break;
-      }
-    }
-
-    return ai;
-  }
-
-  template <typename QS>
-  void GetMs(unsigned num_qubits, const QS& qs, unsigned qs_size,
-             idx_type* ms) const {
-    if (qs_size == 0) {
-      ms[0] = idx_type(-1);
-    } else {
-      idx_type xs = idx_type{1} << (qs[0] + 1);
-      ms[0] = (idx_type{1} << qs[0]) - 1;
-      for (unsigned i = 1; i < qs_size; ++i) {
-        ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs - 1);
-        xs = idx_type{1} << (qs[i] + 1);
-      }
-      ms[qs_size] = ((idx_type{1} << num_qubits) - 1) ^ (xs - 1);
-    }
-  }
-
-  template <typename QS>
-  void GetXss(unsigned num_qubits, const QS& qs, unsigned qs_size,
-              idx_type* xss) const {
-    if (qs_size == 0) {
-      xss[0] = 0;
-    } else {
-      unsigned g = qs_size;
-      unsigned gsize = 1 << qs_size;
-
-      idx_type xs[64];
-
-      xs[0] = idx_type{1} << (qs[0] + 1);
-      for (unsigned i = 1; i < g; ++i) {
-        xs[i] = idx_type{1} << (qs[i] + 1);
-      }
-
-      for (unsigned i = 0; i < gsize; ++i) {
-        idx_type a = 0;
-        for (unsigned k = 0; k < g; ++k) {
-          a += xs[k] * ((i >> k) & 1);
-        }
-        xss[i] = a;
-      }
-    }
-  }
-
-  template <unsigned G, typename qs_type>
-  void GetIndicesH(unsigned num_qubits, const qs_type& qs, unsigned qs_size,
-                   IndicesH<G>& indices) const {
-    if (qs_size == 0) {
-      indices.ms[0] = idx_type(-1);
-      indices.xss[0] = 0;
-    } else {
-      unsigned g = qs_size;
-      unsigned gsize = 1 << qs_size;
-
-      idx_type xs[64];
-
-      xs[0] = idx_type{1} << (qs[0] + 1);
-      indices.ms[0] = (idx_type{1} << qs[0]) - 1;
-      for (unsigned i = 1; i < g; ++i) {
-        xs[i] = idx_type{1} << (qs[i] + 1);
-        indices.ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs[i - 1] - 1);
-      }
-      indices.ms[g] = ((idx_type{1} << num_qubits) - 1) ^ (xs[g - 1] - 1);
-
-      for (unsigned i = 0; i < gsize; ++i) {
-        idx_type a = 0;
-        for (unsigned k = 0; k < g; ++k) {
-          a += xs[k] * ((i >> k) & 1);
-        }
-        indices.xss[i] = a;
-      }
-    }
-  }
-
-  template <unsigned G>
-  void GetIndicesL(unsigned num_effective_qs, unsigned qmask,
-                   IndicesL<G>& indices) const {
-    for (unsigned i = num_effective_qs + 1; i < (G + 1); ++i) {
-      indices.ms[i] = 0;
-    }
-
-    for (unsigned i = (1 << num_effective_qs); i < indices.gsize; ++i) {
-      indices.xss[i] = 0;
-    }
-
-    for (unsigned i = 0; i < indices.gsize; ++i) {
-      indices.qis[i] = bits::ExpandBits(i, 5 + num_effective_qs, qmask);
-    }
-
-    unsigned tmask = ((1 << (5 + num_effective_qs)) - 1) ^ qmask;
-    for (unsigned i = 0; i < 32; ++i) {
-      indices.tis[i] = bits::ExpandBits(i, 5 + num_effective_qs, tmask);
-    }
-  }
-
-  template <unsigned G>
-  unsigned GetIndicesL(unsigned num_qubits, const std::vector<unsigned>& qs,
-                       IndicesL<G>& indices) const {
-    unsigned eqs[32];
-
-    unsigned qmaskh = 0;
-    unsigned qmaskl = 0;
-
-    unsigned qi = 0;
-
-    while (qi < qs.size() && qs[qi] < 5) {
-      qmaskl |= 1 << qs[qi++];
-    }
-
-    unsigned nq = std::max(5U, num_qubits);
-    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
-
-    unsigned l = 0;
-    unsigned ei = 0;
-    unsigned num_low_qs = qi;
-
-    if (qs.size() == num_low_qs) {
-      while (ei < num_effective_qs && l++ < num_low_qs) {
-        eqs[ei] = ei + 5;
-        ++ei;
-      }
-    } else {
-      while (ei < num_effective_qs && l < num_low_qs) {
-        unsigned ei5 = ei + 5;
-        eqs[ei] = ei5;
-        if (qi < qs.size() && qs[qi] == ei5) {
-          ++qi;
-          qmaskh |= 1 << ei5;
-        } else {
-          ++l;
-        }
-        ++ei;
-      }
-
-      while (ei < num_effective_qs) {
-        eqs[ei] = qs[qi++];
-        qmaskh |= 1 << (ei + 5);
-        ++ei;
-      }
-    }
-
-    GetIndicesH(num_qubits, eqs, num_effective_qs, indices);
-    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);
-
-    return num_effective_qs;
-  }
-
-  template <unsigned G>
-  DataC GetIndicesLC(unsigned num_qubits, const std::vector<unsigned>& qs,
-                     const std::vector<unsigned>& cqs, uint64_t cvals,
-                     IndicesL<G>& indices) const {
-    unsigned aqs[64];
-    unsigned eqs[32];
-
-    unsigned qmaskh = 0;
-    unsigned qmaskl = 0;
-    idx_type cmaskh = 0;
-
-    unsigned qi = 0;
-
-    while (qi < qs.size() && qs[qi] < 5) {
-      qmaskl |= 1 << qs[qi++];
-    }
-
-    unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size()));
-    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
-
-    unsigned l = 0;
-    unsigned ai = 5;
-    unsigned ci = 0;
-    unsigned ei = 0;
-    unsigned num_low_qs = qi;
-
-    while (ai < num_qubits && l < num_low_qs) {
-      aqs[ai - 5] = ai;
-      if (qi < qs.size() && qs[qi] == ai) {
-        ++qi;
-        eqs[ei++] = ai;
-        qmaskh |= 1 << (ai - ci);
-      } else if (ci < cqs.size() && cqs[ci] == ai) {
-        ++ci;
-        cmaskh |= idx_type{1} << ai;
-      } else {
-        ++l;
-        eqs[ei++] = ai;
-      }
-      ++ai;
-    }
-
-    unsigned i = ai;
-    unsigned j = qi;
-
-    while (ei < num_effective_qs) {
-      eqs[ei++] = qs[j++];
-      qmaskh |= 1 << (i++ - ci);
-    }
-
-    unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs);
-    GetMs(num_qubits, aqs, num_aqs, indices.ms);
-    GetXss(num_qubits, eqs, num_effective_qs, indices.xss);
-    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);
-
-    idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh);
-
-    return {cvalsh, num_aqs, num_effective_qs};
-  }
-
-  template <unsigned G>
-  DataC GetIndicesLCL(unsigned num_qubits, const std::vector<unsigned>& qs,
-                      const std::vector<unsigned>& cqs, uint64_t cvals,
-                      IndicesLC<G>& indices) const {
-    unsigned aqs[64];
-    unsigned eqs[32];
-
-    unsigned qmaskh = 0;
-    unsigned qmaskl = 0;
-    idx_type cmaskh = 0;
-    idx_type cmaskl = 0;
-    idx_type cis_mask = 0;
-
-    unsigned qi = 0;
-    unsigned ci = 0;
-
-    for (unsigned k = 0; k < 5; ++k) {
-      if (qi < qs.size() && qs[qi] == k) {
-        qmaskl |= 1 << (k - ci);
-        ++qi;
-      } else if (ci < cqs.size() && cqs[ci] == k) {
-        cmaskl |= idx_type{1} << k;
-        ++ci;
-      }
-    }
-
-    unsigned num_low_qs = qi;
-    unsigned num_low_cqs = ci;
-
-    unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size()));
-    unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size()));
-
-    unsigned l = 0;
-    unsigned ai = 5;
-    unsigned ei = 0;
-    unsigned num_low = num_low_qs + num_low_cqs;
-    unsigned remaining_low_cqs = num_low_cqs;
-    unsigned effective_low_qs = num_low_qs;
-    unsigned highest_cis_bit = 0;
-
-    while (ai < num_qubits && l < num_low) {
-      aqs[ai - 5] = ai;
-      if (qi < qs.size() && qs[qi] == ai) {
-        ++qi;
-        if ((ai - ci) > 4) {
-          eqs[ei++] = ai;
-          qmaskh |= 1 << (ai - ci);
-        } else {
-          highest_cis_bit = ai;
-          cis_mask |= idx_type{1} << ai;
-          qmaskl |= 1 << (ai - ci);
-          --remaining_low_cqs;
-          ++effective_low_qs;
-        }
-      } else if (ci < cqs.size() && cqs[ci] == ai) {
-        ++ci;
-        cmaskh |= idx_type{1} << ai;
-      } else {
-        ++l;
-        if (remaining_low_cqs == 0) {
-          eqs[ei++] = ai;
-        } else {
-          highest_cis_bit = ai;
-          cis_mask |= idx_type{1} << ai;
-          --remaining_low_cqs;
-        }
-      }
-      ++ai;
-    }
-
-    unsigned i = ai;
-    unsigned j = effective_low_qs;
-
-    while (ei < num_effective_qs) {
-      eqs[ei++] = qs[j++];
-      qmaskh |= 1 << (i++ - ci);
-    }
-
-    unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs);
-    GetMs(num_qubits, aqs, num_aqs, indices.ms);
-    GetXss(num_qubits, eqs, num_effective_qs, indices.xss);
-    GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices);
-
-    idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh);
-    idx_type cvalsl = bits::ExpandBits(idx_type(cvals), 5, cmaskl);
-
-    cis_mask |= 31 ^ cmaskl;
-    highest_cis_bit = highest_cis_bit < 5 ? 5 : highest_cis_bit;
-    for (idx_type i = 0; i < 32; ++i) {
-      auto c = bits::ExpandBits(i, highest_cis_bit + 1, cis_mask);
-      indices.cis[i] = 2 * (c & 0xffffffe0) | (c & 0x1f) | cvalsl;
-    }
-
-    return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs};
-  }
-
-
-  void* AllocScratch(uint64_t size) const {
-    if (size > scratch_size_) {
-      if (scratch_ != nullptr) {
-        ErrorCheck(cudaFree(scratch_));
-      }
-
-      ErrorCheck(cudaMalloc(const_cast<void**>(&scratch_), size));
-
-      const_cast<uint64_t&>(scratch_size_) = size;
-    }
-
-    return scratch_;
-  }
-
-  char* d_ws;
-  char h_ws0[max_buf_size];
-  char* h_ws = (char*) h_ws0;
-
-  void* scratch_;
-  uint64_t scratch_size_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_CUDA_H_
diff --git a/tpls/qsim/simulator_cuda_kernels.h b/tpls/qsim/simulator_cuda_kernels.h
deleted file mode 100644
index e21a9d6..0000000
--- a/tpls/qsim/simulator_cuda_kernels.h
+++ /dev/null
@@ -1,683 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_CUDA_KERNELS_H_
-#define SIMULATOR_CUDA_KERNELS_H_
-
-#ifdef __NVCC__
-  #include <cuda.h>
-  #include <cuda_runtime.h>
-
-  #include "util_cuda.h"
-#elif __HIP__
-  #include <hip/hip_runtime.h>
-  #include "cuda2hip.h"
-#endif
-
-namespace qsim {
-
-template <unsigned G, typename fp_type, typename idx_type>
-__global__ void ApplyGateH_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
-    const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) {
-  // blockDim.x must be equal to 64.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned rows =
-      G < 4 ? gsize : (sizeof(fp_type) == 4 ?
-                       (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ idx_type xss[64];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  if (threadIdx.x < gsize) {
-    xss[threadIdx.x] = xss0[threadIdx.x];
-  }
-
-  if (G <= 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  __syncthreads();
-
-  idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
-  idx_type ii = i & mss[0];
-  for (unsigned j = 1; j <= G; ++j) {
-    i *= 2;
-    ii |= i & mss[j];
-  }
-
-  auto p0 = rstate + 2 * ii + threadIdx.x % 32;
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    rs[k] = *(p0 + xss[k]);
-    is[k] = *(p0 + xss[k] + 32);
-  }
-
-  for (unsigned s = 0; s < gsize / rows; ++s) {
-    if (s > 0) {
-      __syncthreads();
-
-      for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-      }
-
-      __syncthreads();
-    }
-
-    unsigned j = 0;
-
-    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-      fp_type rn = 0;
-      fp_type in = 0;
-
-      for (unsigned l = 0; l < gsize; ++l) {
-        fp_type rm = v[j++];
-        fp_type im = v[j++];
-        rn += rs[l] * rm;
-        rn -= is[l] * im;
-        in += rs[l] * im;
-        in += is[l] * rm;
-      }
-
-      *(p0 + xss[k]) = rn;
-      *(p0 + xss[k] + 32) = in;
-    }
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type>
-__global__ void ApplyGateL_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
-    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
-    const unsigned* __restrict__ tis, unsigned esize,
-    fp_type* __restrict__ rstate) {
-  // blockDim.x must be equal to 32.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned
-      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
-                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ fp_type v[2 * gsize * rows];
-  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
-
-  if (G < 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  idx_type i = 32 * idx_type{blockIdx.x};
-  idx_type ii = i & mss[0];
-  for (unsigned j = 1; j <= G; ++j) {
-    i *= 2;
-    ii |= i & mss[j];
-  }
-
-  auto p0 = rstate + 2 * ii + threadIdx.x;
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    rs0[threadIdx.x][k] = *(p0 + xss[k]);
-    is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
-  }
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    unsigned i = tis[threadIdx.x] | qis[k];
-    unsigned m = i & 0x1f;
-    unsigned n = i / 32;
-
-    rs[k] = rs0[m][n];
-    is[k] = is0[m][n];
-  }
-
-  for (unsigned s = 0; s < gsize / rows; ++s) {
-    if (s > 0) {
-      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-      }
-    }
-
-    unsigned j = 0;
-
-    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-      fp_type rn = 0;
-      fp_type in = 0;
-
-      for (unsigned l = 0; l < gsize; ++l) {
-        fp_type rm = v[j++];
-        fp_type im = v[j++];
-        rn += rs[l] * rm;
-        rn -= is[l] * im;
-        in += rs[l] * im;
-        in += is[l] * rm;
-      }
-
-      unsigned i = tis[threadIdx.x] | qis[k];
-      unsigned m = i & 0x1f;
-      unsigned n = i / 32;
-
-      rs0[m][n] = rn;
-      is0[m][n] = in;
-    }
-  }
-
-  for (unsigned k = 0; k < esize; ++k) {
-    *(p0 + xss[k]) = rs0[threadIdx.x][k];
-    *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type>
-__global__ void ApplyControlledGateH_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
-    const idx_type* __restrict__ mss, unsigned num_mss, idx_type cvalsh,
-    fp_type* __restrict__ rstate) {
-  // blockDim.x must be equal to 64.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned rows =
-      G < 4 ? gsize : (sizeof(fp_type) == 4 ?
-                           (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ idx_type xss[64];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  if (threadIdx.x < gsize) {
-    xss[threadIdx.x] = xss0[threadIdx.x];
-  }
-
-  if (G <= 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  __syncthreads();
-
-  idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0;
-  idx_type ii = i & mss[0];
-  for (unsigned j = 1; j < num_mss; ++j) {
-    i *= 2;
-    ii |= i & mss[j];
-  }
-
-  ii |= cvalsh;
-
-  auto p0 = rstate + 2 * ii + threadIdx.x % 32;
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    rs[k] = *(p0 + xss[k]);
-    is[k] = *(p0 + xss[k] + 32);
-  }
-
-  for (unsigned s = 0; s < gsize / rows; ++s) {
-    if (s > 0) {
-      __syncthreads();
-
-      for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-      }
-
-      __syncthreads();
-    }
-
-    unsigned j = 0;
-
-    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-      fp_type rn = 0;
-      fp_type in = 0;
-
-      for (unsigned l = 0; l < gsize; ++l) {
-        fp_type rm = v[j++];
-        fp_type im = v[j++];
-        rn += rs[l] * rm;
-        rn -= is[l] * im;
-        in += rs[l] * im;
-        in += is[l] * rm;
-      }
-
-      *(p0 + xss[k]) = rn;
-      *(p0 + xss[k] + 32) = in;
-    }
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type>
-__global__ void ApplyControlledGateLH_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
-    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
-    const unsigned* __restrict__ tis, unsigned num_mss, idx_type cvalsh,
-    unsigned esize, fp_type* __restrict__ rstate) {
-  // blockDim.x must be equal to 32.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned
-      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
-                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  idx_type i = 32 * idx_type{blockIdx.x};
-  idx_type ii = i & mss[0];
-  for (unsigned j = 1; j < num_mss; ++j) {
-    i *= 2;
-    ii |= i & mss[j];
-  }
-
-  ii |= cvalsh;
-
-  auto p0 = rstate + 2 * ii + threadIdx.x;
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    rs0[threadIdx.x][k] = *(p0 + xss[k]);
-    is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
-  }
-
-  if (G < 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    unsigned i = tis[threadIdx.x] | qis[k];
-    unsigned m = i & 0x1f;
-    unsigned n = i / 32;
-
-    rs[k] = rs0[m][n];
-    is[k] = is0[m][n];
-  }
-
-  for (unsigned s = 0; s < gsize / rows; ++s) {
-    if (s > 0) {
-      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-      }
-    }
-
-    unsigned j = 0;
-
-    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-      fp_type rn = 0;
-      fp_type in = 0;
-
-      for (unsigned l = 0; l < gsize; ++l) {
-        fp_type rm = v[j++];
-        fp_type im = v[j++];
-        rn += rs[l] * rm;
-        rn -= is[l] * im;
-        in += rs[l] * im;
-        in += is[l] * rm;
-      }
-
-      unsigned i = tis[threadIdx.x] | qis[k];
-      unsigned m = i & 0x1f;
-      unsigned n = i / 32;
-
-      rs0[m][n] = rn;
-      is0[m][n] = in;
-    }
-  }
-
-  for (unsigned k = 0; k < esize; ++k) {
-    *(p0 + xss[k]) = rs0[threadIdx.x][k];
-    *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type>
-__global__ void ApplyControlledGateL_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
-    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
-    const unsigned* __restrict__ tis, const idx_type* __restrict__ cis,
-    unsigned num_mss, idx_type cvalsh, unsigned esize, unsigned rwthreads,
-    fp_type* __restrict__ rstate) {
-  // blockDim.x must be equal to 32.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned
-      rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ?
-                              (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  idx_type i = 32 * idx_type{blockIdx.x};
-  idx_type ii = i & mss[0];
-  for (unsigned j = 1; j < num_mss; ++j) {
-    i *= 2;
-    ii |= i & mss[j];
-  }
-
-  ii |= cvalsh;
-
-  auto p0 = rstate + 2 * ii + cis[threadIdx.x];
-
-  if (threadIdx.x < rwthreads) {
-    for (unsigned k = 0; k < gsize; ++k) {
-      rs0[threadIdx.x][k] = *(p0 + xss[k]);
-      is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
-    }
-  }
-
-  if (G < 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  for (unsigned k = 0; k < gsize; ++k) {
-    unsigned i = tis[threadIdx.x] | qis[k];
-    unsigned m = i & 0x1f;
-    unsigned n = i / 32;
-
-    rs[k] = rs0[m][n];
-    is[k] = is0[m][n];
-  }
-
-  for (unsigned s = 0; s < gsize / rows; ++s) {
-    if (s > 0) {
-      for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-        v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-      }
-    }
-
-    unsigned j = 0;
-
-    for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-      fp_type rn = 0;
-      fp_type in = 0;
-
-      for (unsigned l = 0; l < gsize; ++l) {
-        fp_type rm = v[j++];
-        fp_type im = v[j++];
-        rn += rs[l] * rm;
-        rn -= is[l] * im;
-        in += rs[l] * im;
-        in += is[l] * rm;
-      }
-
-      unsigned i = tis[threadIdx.x] | qis[k];
-      unsigned m = i & 0x1f;
-      unsigned n = i / 32;
-
-      rs0[m][n] = rn;
-      is0[m][n] = in;
-    }
-  }
-
-  if (threadIdx.x < rwthreads) {
-    for (unsigned k = 0; k < esize; ++k) {
-      *(p0 + xss[k]) = rs0[threadIdx.x][k];
-      *(p0 + xss[k] + 32) = is0[threadIdx.x][k];
-    }
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type, typename Op,
-          typename cfp_type>
-__global__ void ExpectationValueH_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0,
-    const idx_type* __restrict__ mss, unsigned num_iterations_per_block,
-    const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
-  // blockDim.x must be equal to 64.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned rows =
-      G < 5 ? gsize : (sizeof(fp_type) == 4 ? (G < 6 ? 4 : 8) : 8);
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ idx_type xss[64];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  if (threadIdx.x < gsize) {
-    xss[threadIdx.x] = xss0[threadIdx.x];
-  }
-
-  if (G <= 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  __syncthreads();
-
-  double re = 0;
-  double im = 0;
-
-  for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) {
-    idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter;
-
-    idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0;
-    idx_type ii = i & mss[0];
-    for (unsigned j = 1; j <= G; ++j) {
-      i *= 2;
-      ii |= i & mss[j];
-    }
-
-    auto p0 = rstate + 2 * ii + threadIdx.x % 32;
-
-    for (unsigned k = 0; k < gsize; ++k) {
-      rs[k] = *(p0 + xss[k]);
-      is[k] = *(p0 + xss[k] + 32);
-    }
-
-    for (unsigned s = 0; s < gsize / rows; ++s) {
-      if (s > 0 || iter > 0) {
-        __syncthreads();
-
-        for (unsigned m = 0; m < 2 * gsize * rows; m += 64) {
-          v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-        }
-
-        __syncthreads();
-      }
-
-      unsigned j = 0;
-
-      for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-        fp_type rn = 0;
-        fp_type in = 0;
-
-        for (unsigned l = 0; l < gsize; ++l) {
-          fp_type rm = v[j++];
-          fp_type im = v[j++];
-          rn += rs[l] * rm;
-          rn -= is[l] * im;
-          in += rs[l] * im;
-          in += is[l] * rm;
-        }
-
-        re += rs[k] * rn;
-        re += is[k] * in;
-        im += rs[k] * in;
-        im -= is[k] * rn;
-      }
-    }
-  }
-
-  __shared__ cfp_type partial1[64];
-  __shared__ cfp_type partial2[2];
-
-  partial1[threadIdx.x].re = re;
-  partial1[threadIdx.x].im = im;
-
-  auto val = WarpReduce(partial1[threadIdx.x], op);
-
-  if (threadIdx.x % 32 == 0) {
-    partial2[threadIdx.x / 32] = val;
-  }
-
-  __syncthreads();
-
-  if (threadIdx.x == 0) {
-    result[blockIdx.x].re = partial2[0].re + partial2[1].re;
-    result[blockIdx.x].im = partial2[0].im + partial2[1].im;
-  }
-}
-
-template <unsigned G, typename fp_type, typename idx_type,
-          typename Op, typename cfp_type>
-__global__ void ExpectationValueL_Kernel(
-    const fp_type* __restrict__ v0, const idx_type* __restrict__ xss,
-    const idx_type* __restrict__ mss, const unsigned* __restrict__ qis,
-    const unsigned* __restrict__ tis, unsigned num_iterations_per_block,
-    const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) {
-  // blockDim.x must be equal to 32.
-
-  static_assert(G < 7, "gates acting on more than 6 qubits are not supported.");
-
-  constexpr unsigned gsize = 1 << G;
-  constexpr unsigned rows = G < 5 ? gsize : (sizeof(fp_type) == 4 ?
-                                             (G < 6 ? 4 : 2) : (G < 6 ? 2 : 1));
-
-  fp_type rs[gsize], is[gsize];
-
-  __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1];
-  __shared__ fp_type v[2 * gsize * rows];
-
-  if (G < 2) {
-    if (threadIdx.x < 2 * gsize * gsize) {
-      v[threadIdx.x] = v0[threadIdx.x];
-    }
-  } else {
-    for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-      v[m + threadIdx.x] = v0[m + threadIdx.x];
-    }
-  }
-
-  double re = 0;
-  double im = 0;
-
-  for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) {
-    idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter);
-    idx_type ii = i & mss[0];
-    for (unsigned j = 1; j <= G; ++j) {
-      i *= 2;
-      ii |= i & mss[j];
-    }
-
-    auto p0 = rstate + 2 * ii + threadIdx.x;
-
-    for (unsigned k = 0; k < gsize; ++k) {
-      rs0[threadIdx.x][k] = *(p0 + xss[k]);
-      is0[threadIdx.x][k] = *(p0 + xss[k] + 32);
-    }
-
-    for (unsigned k = 0; k < gsize; ++k) {
-      unsigned i = tis[threadIdx.x] | qis[k];
-      unsigned m = i & 0x1f;
-      unsigned n = i / 32;
-
-      rs[k] = rs0[m][n];
-      is[k] = is0[m][n];
-    }
-
-    for (unsigned s = 0; s < gsize / rows; ++s) {
-      if (s > 0 || iter > 0) {
-        for (unsigned m = 0; m < 2 * gsize * rows; m += 32) {
-          v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x];
-        }
-      }
-
-      unsigned j = 0;
-
-      for (unsigned k = rows * s; k < rows * (s + 1); ++k) {
-        fp_type rn = 0;
-        fp_type in = 0;
-
-        for (unsigned l = 0; l < gsize; ++l) {
-          fp_type rm = v[j++];
-          fp_type im = v[j++];
-          rn += rs[l] * rm;
-          rn -= is[l] * im;
-          in += rs[l] * im;
-          in += is[l] * rm;
-        }
-
-        re += rs[k] * rn;
-        re += is[k] * in;
-        im += rs[k] * in;
-        im -= is[k] * rn;
-      }
-    }
-  }
-
-  __shared__ cfp_type partial[32];
-
-  partial[threadIdx.x].re = re;
-  partial[threadIdx.x].im = im;
-
-  auto val = WarpReduce(partial[threadIdx.x], op);
-
-  if (threadIdx.x == 0) {
-    result[blockIdx.x].re = val.re;
-    result[blockIdx.x].im = val.im;
-  }
-}
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_CUDA_KERNELS_H_
diff --git a/tpls/qsim/simulator_custatevec.h b/tpls/qsim/simulator_custatevec.h
deleted file mode 100644
index 40d1902..0000000
--- a/tpls/qsim/simulator_custatevec.h
+++ /dev/null
@@ -1,209 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_CUSTATEVEC_H_
-#define SIMULATOR_CUSTATEVEC_H_
-
-#include <complex>
-#include <cstdint>
-#include <type_traits>
-
-#include <cublas_v2.h>
-#include <cuComplex.h>
-#include <custatevec.h>
-
-#include "io.h"
-#include "statespace_custatevec.h"
-#include "util_custatevec.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator using the NVIDIA cuStateVec library.
- */
-template <typename FP = float>
-class SimulatorCuStateVec final {
- public:
-  using StateSpace = StateSpaceCuStateVec<FP>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  static constexpr auto kStateType = StateSpace::kStateType;
-  static constexpr auto kMatrixType = StateSpace::kMatrixType;
-  static constexpr auto kExpectType = StateSpace::kExpectType;
-  static constexpr auto kComputeType = StateSpace::kComputeType;
-  static constexpr auto kMatrixLayout = StateSpace::kMatrixLayout;
-
-  explicit SimulatorCuStateVec(const cublasHandle_t& cublas_handle,
-                               const custatevecHandle_t& custatevec_handle)
-      : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle),
-      workspace_(nullptr), workspace_size_(0) {}
-
-  ~SimulatorCuStateVec() {
-    ErrorCheck(cudaFree(workspace_));
-  }
-
-  /**
-   * Applies a gate using the NVIDIA cuStateVec library.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    if (qs.size() == 0) {
-      uint64_t size = uint64_t{1} << state.num_qubits();
-
-      if (StateSpace::is_float) {
-        cuComplex a = {matrix[0], matrix[1]};
-        auto p = (cuComplex*) state.get();
-        ErrorCheck(cublasCscal(cublas_handle_, size, &a, p, 1));
-      } else {
-        cuDoubleComplex a = {matrix[0], matrix[1]};
-        auto p = (cuDoubleComplex*) state.get();
-        ErrorCheck(cublasZscal(cublas_handle_, size, &a, p, 1));
-      }
-    } else {
-      auto workspace_size = ApplyGateWorkSpaceSize(
-          state.num_qubits(), qs.size(), 0, matrix);
-      AllocWorkSpace(workspace_size);
-
-      ErrorCheck(custatevecApplyMatrix(
-                     custatevec_handle_, state.get(), kStateType,
-                     state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0,
-                     (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0,
-                     kComputeType, workspace_, workspace_size));
-    }
-  }
-
-  /**
-   * Applies a controlled gate using the NVIDIA cuStateVec library.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cmask Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cmask,
-                           const fp_type* matrix, State& state) const {
-    if (qs.size() == 0) {
-      IO::errorf(
-          "error: controlled global phase gate is not implemented %s %d\n",
-          __FILE__, __LINE__);
-      exit(1);
-    } else {
-      std::vector<int32_t> control_bits;
-      control_bits.reserve(cqs.size());
-
-      for (std::size_t i = 0; i < cqs.size(); ++i) {
-        control_bits.push_back((cmask >> i) & 1);
-      }
-
-      auto workspace_size = ApplyGateWorkSpaceSize(
-          state.num_qubits(), qs.size(), cqs.size(), matrix);
-      AllocWorkSpace(workspace_size);
-
-      ErrorCheck(custatevecApplyMatrix(
-                     custatevec_handle_, state.get(), kStateType,
-                     state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0,
-                     (int32_t*) qs.data(), qs.size(),
-                     (int32_t*) cqs.data(), control_bits.data(), cqs.size(),
-                     kComputeType, workspace_, workspace_size));
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using the NVIDIA cuStateVec
-   * library.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    auto workspace_size = ExpectationValueWorkSpaceSize(
-        state.num_qubits(), qs.size(), matrix);
-    AllocWorkSpace(workspace_size);
-
-    cuDoubleComplex eval;
-
-    ErrorCheck(custatevecComputeExpectation(
-                   custatevec_handle_, state.get(), kStateType,
-                   state.num_qubits(), &eval, kExpectType, nullptr, matrix,
-                   kMatrixType, kMatrixLayout, (int32_t*) qs.data(), qs.size(),
-                   kComputeType, workspace_, workspace_size));
-
-    return {cuCreal(eval), cuCimag(eval)};
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 32;
-  }
-
- private:
-  size_t ApplyGateWorkSpaceSize(
-      unsigned num_qubits, unsigned num_targets, unsigned num_controls,
-      const fp_type* matrix) const {
-    size_t size;
-
-    ErrorCheck(custatevecApplyMatrixGetWorkspaceSize(
-                   custatevec_handle_, kStateType, num_qubits, matrix,
-                   kMatrixType, kMatrixLayout, 0, num_targets, num_controls,
-                   kComputeType, &size));
-
-    return size;
-  }
-
-  size_t ExpectationValueWorkSpaceSize(
-      unsigned num_qubits, unsigned num_targets, const fp_type* matrix) const {
-    size_t size;
-
-    ErrorCheck(custatevecComputeExpectationGetWorkspaceSize(
-                   custatevec_handle_, kStateType, num_qubits, matrix,
-                   kMatrixType, kMatrixLayout, num_targets, kComputeType,
-                   &size));
-
-    return size;
-  }
-
-  void* AllocWorkSpace(size_t size) const {
-    if (size > workspace_size_) {
-      if (workspace_ != nullptr) {
-        ErrorCheck(cudaFree(workspace_));
-      }
-
-      ErrorCheck(cudaMalloc(const_cast<void**>(&workspace_), size));
-
-      const_cast<uint64_t&>(workspace_size_) = size;
-    }
-
-    return workspace_;
-  }
-
-  const cublasHandle_t cublas_handle_;
-  const custatevecHandle_t custatevec_handle_;
-
-  void* workspace_;
-  size_t workspace_size_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_CUSTATEVEC_H_
diff --git a/tpls/qsim/simulator_sse.h b/tpls/qsim/simulator_sse.h
deleted file mode 100644
index 5256c53..0000000
--- a/tpls/qsim/simulator_sse.h
+++ /dev/null
@@ -1,864 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SIMULATOR_SSE_H_
-#define SIMULATOR_SSE_H_
-
-#include <smmintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "statespace_sse.h"
-
-namespace qsim {
-
-/**
- * Quantum circuit simulator with SSE vectorization.
- */
-template <typename For>
-class SimulatorSSE final : public SimulatorBase {
- public:
-  using StateSpace = StateSpaceSSE<For>;
-  using State = typename StateSpace::State;
-  using fp_type = typename StateSpace::fp_type;
-
-  template <typename... ForArgs>
-  explicit SimulatorSSE(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using SSE instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 0:
-      ApplyGateH<0>(qs, matrix, state);
-      break;
-    case 1:
-      if (qs[0] > 1) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 1) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 1) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 1) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 1) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 1) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using SSE instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 0:
-      if (cqs[0] > 1) {
-        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
-      } else {
-        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
-      }
-      break;
-    case 1:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Computes the expectation value of an operator using SSE instructions.
-   * @param qs Indices of the qubits the operator acts on.
-   * @param matrix The operator matrix.
-   * @param state The state of the system.
-   * @return The computed expectation value.
-   */
-  std::complex<double> ExpectationValue(const std::vector<unsigned>& qs,
-                                        const fp_type* matrix,
-                                        const State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 1) {
-        return ExpectationValueH<1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 1) {
-        return ExpectationValueH<2>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        return ExpectationValueL<1, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 1) {
-        return ExpectationValueH<3>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        return ExpectationValueL<2, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<1, 2>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 1) {
-        return ExpectationValueH<4>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        return ExpectationValueL<3, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<2, 2>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 1) {
-        return ExpectationValueH<5>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        return ExpectationValueL<4, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<3, 2>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 1) {
-        return ExpectationValueH<6>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        return ExpectationValueL<5, 1>(qs, matrix, state);
-      } else {
-        return ExpectationValueL<4, 2>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-
-    return 0;
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 4;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 ru, iu, rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm_set1_ps(v[j]);
-        iu = _mm_set1_ps(v[j + 1]);
-        rn = _mm_mul_ps(rs[0], ru);
-        in = _mm_mul_ps(rs[0], iu);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm_set1_ps(v[j]);
-          iu = _mm_set1_ps(v[j + 1]);
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss,
-                unsigned q0, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m128 rn, in;
-      __m128 rs[gsize], is[gsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm_load_ps(p0 + xss[k]);
-        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
-
-        if (L == 1) {
-          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
-                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
-                               : _mm_shuffle_ps(is[k2], is[k2], 78);
-        } else if (L == 2) {
-          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
-          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
-          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
-          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
-          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, ms, xss, qs[0], state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 ru, iu, rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm_set1_ps(v[j]);
-        iu = _mm_set1_ps(v[j + 1]);
-        rn = _mm_mul_ps(rs[0], ru);
-        in = _mm_mul_ps(rs[0], iu);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm_set1_ps(v[j]);
-          iu = _mm_set1_ps(v[j + 1]);
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-    FillControlledMatrixH<H, 2>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned r = 2 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, unsigned q0, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m128 rn, in;
-      __m128 rs[gsize], is[gsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      if ((ii & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm_load_ps(p0 + xss[k]);
-        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
-
-        if (L == 1) {
-          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
-                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
-                               : _mm_shuffle_ps(is[k2], is[k2], 78);
-        } else if (L == 2) {
-          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
-          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
-          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
-          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
-          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H + L)];
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-
-    unsigned r = 2 + H;
-    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    if (CH) {
-      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
-      FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get());
-    } else {
-      auto m = GetMasks10<L, 2>(state.num_qubits(), qs, cqs, cvals);
-      FillControlledMatrixL<H, L, 2>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get());
-    }
-  }
-
-  template <unsigned H>
-  std::complex<double> ExpectationValueH(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss,
-                const fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 ru, iu, rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm_set1_ps(v[j]);
-        iu = _mm_set1_ps(v[j + 1]);
-        rn = _mm_mul_ps(rs[0], ru);
-        in = _mm_mul_ps(rs[0], iu);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm_set1_ps(v[j]);
-          iu = _mm_set1_ps(v[j + 1]);
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
-
-          j += 2;
-        }
-
-        __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[k], rn), _mm_mul_ps(is[k], in));
-        __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[k], in), _mm_mul_ps(is[k], rn));
-
-        re += detail::HorizontalSumSSE(v_re);
-        im += detail::HorizontalSumSSE(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  std::complex<double> ExpectationValueL(const std::vector<unsigned>& qs,
-                                         const fp_type* matrix,
-                                         const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, unsigned q0,
-                const fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m128 rn, in;
-      __m128 rs[gsize], is[gsize];
-
-      i *= 4;
-
-      uint64_t ii = i & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        i *= 2;
-        ii |= i & ms[j];
-      }
-
-      auto p0 = rstate + 2 * ii;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm_load_ps(p0 + xss[k]);
-        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
-
-        if (L == 1) {
-          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
-                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
-                               : _mm_shuffle_ps(is[k2], is[k2], 78);
-        } else if (L == 2) {
-          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
-          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
-          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
-          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
-          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
-        }
-      }
-
-      double re = 0;
-      double im = 0;
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        unsigned m = lsize * k;
-
-        __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in));
-        __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn));
-
-        re += detail::HorizontalSumSSE(v_re);
-        im += detail::HorizontalSumSSE(v_im);
-      }
-
-      return std::complex<double>{re, im};
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-
-    using Op = std::plus<std::complex<double>>;
-    return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], state.get());
-  }
-
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // SIMULATOR_SSE_H_
diff --git a/tpls/qsim/statespace.h b/tpls/qsim/statespace.h
deleted file mode 100644
index 2b0c9af..0000000
--- a/tpls/qsim/statespace.h
+++ /dev/null
@@ -1,145 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_H_
-#define STATESPACE_H_
-
-#include <cmath>
-#include <cstdint>
-#include <cstdlib>
-#include <vector>
-
-#include "util.h"
-
-namespace qsim {
-
-/**
- * Abstract class containing context and routines for general state-vector
- * manipulations. "AVX", "AVX512", "Basic", and "SSE" implementations are
- * provided.
- */
-template <typename Impl,
-          template<typename...> class VectorSpace, typename... VSTypeParams>
-class StateSpace : public VectorSpace<Impl, VSTypeParams...> {
- private:
-  using Base = VectorSpace<Impl, VSTypeParams...>;
-
- public:
-  using fp_type = typename Base::fp_type;
-  using State = typename Base::Vector;
-
-  /**
-   * The observed state from a Measurement gate.
-   */
-  struct MeasurementResult {
-    /**
-     * A bitmask of all qubits measured in this result. In this format, if the
-     * qubit at index `i` is measured, the `i`th bit of `mask` is a one.
-     */
-    uint64_t mask;
-    /**
-     * A bitwise representation of the measured states. In this format, the
-     * qubit at index `i` is represented by the `i`th bit of `bits`.
-     * If `valid` is true, `mask` has already been applied to this field
-     * (i.e. `bits == bits & mask`).
-     */
-    uint64_t bits;
-    /**
-     * Observed states of the measured qubits. This vector only includes qubits
-     * specified by the associated Measurement gate.
-     */
-    std::vector<unsigned> bitstring;
-    /**
-     * Validation bit. If this is false, the measurement failed and all other
-     * fields of the result are invalid.
-     */
-    bool valid;
-  };
-
-  template <typename... Args>
-  StateSpace(Args&&... args) : Base(args...) {}
-
-  double Norm(const State& state) const {
-    auto partial_norms = static_cast<const Impl&>(*this).PartialNorms(state);
-
-    double norm = partial_norms[0];
-    for (std::size_t i = 1; i < partial_norms.size(); ++i) {
-      norm += partial_norms[i];
-    }
-
-    return norm;
-  }
-
-  template <typename RGen>
-  MeasurementResult Measure(const std::vector<unsigned>& qubits,
-                            RGen& rgen, State& state) const {
-    auto result =
-        static_cast<const Impl&>(*this).VirtualMeasure(qubits, rgen, state);
-
-    if (result.valid) {
-      static_cast<const Impl&>(*this).Collapse(result, state);
-    }
-
-    return result;
-  }
-
-  template <typename RGen>
-  MeasurementResult VirtualMeasure(const std::vector<unsigned>& qubits,
-                                   RGen& rgen, const State& state) const {
-    MeasurementResult result;
-
-    result.valid = true;
-    result.mask = 0;
-
-    for (auto q : qubits) {
-      if (q >= state.num_qubits()) {
-        result.valid = false;
-        return result;
-      }
-
-      result.mask |= uint64_t{1} << q;
-    }
-
-    auto partial_norms = static_cast<const Impl&>(*this).PartialNorms(state);
-
-    for (std::size_t i = 1; i < partial_norms.size(); ++i) {
-      partial_norms[i] += partial_norms[i - 1];
-    }
-
-    auto norm = partial_norms.back();
-    auto r = RandomValue(rgen, norm);
-
-    unsigned m = 0;
-    while (r > partial_norms[m]) ++m;
-    if (m > 0) {
-      r -= partial_norms[m - 1];
-    }
-
-    result.bits = static_cast<const Impl&>(*this).FindMeasuredBits(
-        m, r, result.mask, state);
-
-    result.bitstring.reserve(qubits.size());
-    result.bitstring.resize(0);
-
-    for (auto q : qubits) {
-      result.bitstring.push_back((result.bits >> q) & 1);
-    }
-
-    return result;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_H_
diff --git a/tpls/qsim/statespace_avx.h b/tpls/qsim/statespace_avx.h
deleted file mode 100644
index 876058b..0000000
--- a/tpls/qsim/statespace_avx.h
+++ /dev/null
@@ -1,497 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_AVX_H_
-#define STATESPACE_AVX_H_
-
-#include <immintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <functional>
-
-#include "statespace.h"
-#include "util.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace detail {
-
-inline __m256i GetZeroMaskAVX(uint64_t i, uint64_t mask, uint64_t bits) {
-  __m256i s1 = _mm256_setr_epi64x(i + 0, i + 2, i + 4, i + 6);
-  __m256i s2 = _mm256_setr_epi64x(i + 1, i + 3, i + 5, i + 7);
-  __m256i ma = _mm256_set1_epi64x(mask);
-  __m256i bi = _mm256_set1_epi64x(bits);
-
-  s1 = _mm256_and_si256(s1, ma);
-  s2 = _mm256_and_si256(s2, ma);
-
-  s1 = _mm256_cmpeq_epi64(s1, bi);
-  s2 = _mm256_cmpeq_epi64(s2, bi);
-
-  return _mm256_blend_epi32(s1, s2, 170);  // 10101010
-}
-
-inline double HorizontalSumAVX(__m256 s) {
-  __m128 l = _mm256_castps256_ps128(s);
-  __m128 h = _mm256_extractf128_ps(s, 1);
-  __m128 s1  = _mm_add_ps(h, l);
-  __m128 s1s = _mm_movehdup_ps(s1);
-  __m128 s2 = _mm_add_ps(s1, s1s);
-
-  return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2)));
-}
-
-}  // namespace detail
-
-/**
- * Object containing context and routines for AVX state-vector manipulations.
- * State is a vectorized sequence of eight real components followed by eight
- * imaginary components. Eight single-precison floating numbers can be loaded
- * into an AVX register.
- */
-template <typename For>
-class StateSpaceAVX :
-    public StateSpace<StateSpaceAVX<For>, VectorSpace, For, float> {
- private:
-  using Base = StateSpace<StateSpaceAVX<For>, qsim::VectorSpace, For, float>;
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit StateSpaceAVX(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  void InternalToNormalOrder(State& state) const {
-    if (state.num_qubits() == 1) {
-      fp_type* s = state.get();
-
-      s[2] = s[1];
-      s[1] = s[8];
-      s[3] = s[9];
-
-      for (uint64_t i = 4; i < 16; ++i) {
-        s[i] = 0;
-      }
-    } else if (state.num_qubits() == 2) {
-      fp_type* s = state.get();
-
-      s[6] = s[3];
-      s[4] = s[2];
-      s[2] = s[1];
-      s[1] = s[8];
-      s[3] = s[9];
-      s[5] = s[10];
-      s[7] = s[11];
-
-      for (uint64_t i = 8; i < 16; ++i) {
-        s[i] = 0;
-      }
-    } else {
-      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-        fp_type* s = p + 16 * i;
-
-        fp_type re[7];
-        fp_type im[7];
-
-        for (uint64_t i = 0; i < 7; ++i) {
-          re[i] = s[i + 1];
-          im[i] = s[i + 8];
-        }
-
-        for (uint64_t i = 0; i < 7; ++i) {
-          s[2 * i + 1] = im[i];
-          s[2 * i + 2] = re[i];
-        }
-      };
-
-      Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get());
-    }
-  }
-
-  void NormalToInternalOrder(State& state) const {
-    if (state.num_qubits() == 1) {
-      fp_type* s = state.get();
-
-      s[8] = s[1];
-      s[1] = s[2];
-      s[9] = s[3];
-
-      for (uint64_t i = 2; i < 8; ++i) {
-        s[i] = 0;
-        s[i + 8] = 0;
-      }
-    } else if (state.num_qubits() == 2) {
-      fp_type* s = state.get();
-
-      s[8] = s[1];
-      s[9] = s[3];
-      s[10] = s[5];
-      s[11] = s[7];
-      s[1] = s[2];
-      s[2] = s[4];
-      s[3] = s[6];
-
-      for (uint64_t i = 4; i < 8; ++i) {
-        s[i] = 0;
-        s[i + 8] = 0;
-      }
-    } else {
-      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-        fp_type* s = p + 16 * i;
-
-        fp_type re[7];
-        fp_type im[7];
-
-        for (uint64_t i = 0; i < 7; ++i) {
-          im[i] = s[2 * i + 1];
-          re[i] = s[2 * i + 2];
-        }
-
-        for (uint64_t i = 0; i < 7; ++i) {
-          s[i + 1] = re[i];
-          s[i + 8] = im[i];
-        }
-      };
-
-      Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get());
-    }
-  }
-
-  void SetAllZeros(State& state) const {
-    __m256 val0 = _mm256_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) {
-      _mm256_store_ps(p + 16 * i, val);
-      _mm256_store_ps(p + 16 * i + 8, val);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get());
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    __m256 val0 = _mm256_setzero_ps();
-    __m256 valu;
-
-    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
-
-    switch (state.num_qubits()) {
-    case 1:
-      valu = _mm256_set_ps(0, 0, 0, 0, 0, 0, v, v);
-      break;
-    case 2:
-      valu = _mm256_set_ps(0, 0, 0, 0, v, v, v, v);
-      break;
-    default:
-      valu = _mm256_set1_ps(v);
-      break;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                __m256& val0, __m256 valu, fp_type* p) {
-      _mm256_store_ps(p + 16 * i, valu);
-      _mm256_store_ps(p + 16 * i + 8, val0);
-    };
-
-    Base::for_.Run(
-        MinSize(state.num_qubits()) / 16, f, val0, valu, state.get());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    state.get()[0] = 1;
-  }
-
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    uint64_t k = (16 * (i / 8)) + (i % 8);
-    return std::complex<fp_type>(state.get()[k], state.get()[k + 8]);
-  }
-
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    uint64_t k = (16 * (i / 8)) + (i % 8);
-    state.get()[k] = std::real(ampl);
-    state.get()[k + 8] = std::imag(ampl);
-  }
-
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    uint64_t k = (16 * (i / 8)) + (i % 8);
-    state.get()[k] = re;
-    state.get()[k + 8] = im;
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    __m256 re_reg = _mm256_set1_ps(re);
-    __m256 im_reg = _mm256_set1_ps(im);
-
-    __m256i exclude_reg = _mm256_setzero_si256();
-    if (exclude) {
-      exclude_reg = _mm256_cmpeq_epi32(exclude_reg, exclude_reg);
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
-                uint64_t bitsv, __m256 re_n, __m256 im_n, __m256i exclude_n,
-                fp_type* p) {
-      __m256 ml = _mm256_castsi256_ps(_mm256_xor_si256(
-          detail::GetZeroMaskAVX(8 * i, maskv, bitsv), exclude_n));
-
-      __m256 re = _mm256_load_ps(p + 16 * i);
-      __m256 im = _mm256_load_ps(p + 16 * i + 8);
-
-      re = _mm256_blendv_ps(re, re_n, ml);
-      im = _mm256_blendv_ps(im, im_n, ml);
-
-      _mm256_store_ps(p + 16 * i, re);
-      _mm256_store_ps(p + 16 * i + 8, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, mask, bits, re_reg,
-                   im_reg, exclude_reg, state.get());
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, fp_type* p2) {
-      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
-      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
-      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
-      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
-
-      _mm256_store_ps(p2 + 16 * i, _mm256_add_ps(re1, re2));
-      _mm256_store_ps(p2 + 16 * i + 8, _mm256_add_ps(im1, im2));
-    };
-
-    Base::for_.Run(MinSize(src.num_qubits()) / 16, f, src.get(), dest.get());
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    __m256 r = _mm256_set1_ps(a);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m256 r, fp_type* p) {
-      __m256 re = _mm256_load_ps(p + 16 * i);
-      __m256 im = _mm256_load_ps(p + 16 * i + 8);
-
-      re = _mm256_mul_ps(re, r);
-      im = _mm256_mul_ps(im, r);
-
-      _mm256_store_ps(p + 16 * i, re);
-      _mm256_store_ps(p + 16 * i + 8, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, r, state.get());
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
-      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
-      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
-      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
-      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
-
-      __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2));
-      __m256 ip_im = _mm256_fnmadd_ps(im1, re2, _mm256_mul_ps(re1, im2));
-
-      double re = detail::HorizontalSumAVX(ip_re);
-      double im = detail::HorizontalSumAVX(ip_im);
-
-      return std::complex<double>{re, im};
-    };
-
-    using Op = std::plus<std::complex<double>>;
-    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f,
-                                Op(), state1.get(), state2.get());
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> double {
-      __m256 re1 = _mm256_load_ps(p1 + 16 * i);
-      __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8);
-      __m256 re2 = _mm256_load_ps(p2 + 16 * i);
-      __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8);
-
-      __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2));
-
-      return detail::HorizontalSumAVX(ip_re);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f,
-                                Op(), state1.get(), state2.get());
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      double norm = 0;
-      uint64_t size = MinSize(state.num_qubits()) / 16;
-      const fp_type* p = state.get();
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 8; ++j) {
-          double re = p[16 * k + j];
-          double im = p[16 * k + 8 + j];
-          norm += re * re + im * im;
-        }
-      }
-
-      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
-
-      uint64_t m = 0;
-      double csum = 0;
-      bitstrings.reserve(num_samples);
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 8; ++j) {
-          double re = p[16 * k + j];
-          double im = p[16 * k + 8 + j];
-          csum += re * re + im * im;
-          while (rs[m] < csum && m < num_samples) {
-            bitstrings.emplace_back(8 * k + j);
-            ++m;
-          }
-        }
-      }
-
-      for (; m < num_samples; ++m) {
-        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
-      }
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    auto f1 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
-      __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits);
-
-      __m256 re = _mm256_maskload_ps(p + 16 * i, ml);
-      __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml);
-      __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re));
-
-      return detail::HorizontalSumAVX(s1);
-    };
-
-    using Op = std::plus<double>;
-    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 16, f1,
-                                       Op(), mr.mask, mr.bits, state.get());
-
-    __m256 renorm = _mm256_set1_ps(1.0 / std::sqrt(norm));
-
-    auto f2 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, __m256 renorm, fp_type* p) {
-      __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits);
-
-      __m256 re = _mm256_maskload_ps(p + 16 * i, ml);
-      __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml);
-
-      re = _mm256_mul_ps(re, renorm);
-      im = _mm256_mul_ps(im, renorm);
-
-      _mm256_store_ps(p + 16 * i, re);
-      _mm256_store_ps(p + 16 * i + 8, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 16, f2,
-                   mr.mask, mr.bits, renorm, state.get());
-  }
-
-  std::vector<double> PartialNorms(const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p) -> double {
-      __m256 re = _mm256_load_ps(p + 16 * i);
-      __m256 im = _mm256_load_ps(p + 16 * i + 8);
-      __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re));
-
-      return detail::HorizontalSumAVX(s1);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduceP(
-        MinSize(state.num_qubits()) / 16, f, Op(), state.get());
-  }
-
-  uint64_t FindMeasuredBits(
-      unsigned m, double r, uint64_t mask, const State& state) const {
-    double csum = 0;
-
-    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 16, m);
-    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 16, m);
-
-    const fp_type* p = state.get();
-
-    for (uint64_t k = k0; k < k1; ++k) {
-      for (uint64_t j = 0; j < 8; ++j) {
-        auto re = p[16 * k + j];
-        auto im = p[16 * k + j + 8];
-        csum += re * re + im * im;
-        if (r < csum) {
-          return (8 * k + j) & mask;
-        }
-      }
-    }
-
-    // Return the last bitstring in the unlikely case of underflow.
-    return (8 * k1 - 1) & mask;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_AVX_H_
diff --git a/tpls/qsim/statespace_avx512.h b/tpls/qsim/statespace_avx512.h
deleted file mode 100644
index 879fd89..0000000
--- a/tpls/qsim/statespace_avx512.h
+++ /dev/null
@@ -1,448 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_AVX512_H_
-#define STATESPACE_AVX512_H_
-
-#include <immintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <functional>
-
-#include "statespace.h"
-#include "util.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace detail {
-
-inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) {
-  __m512i s1 = _mm512_setr_epi64(
-      i + 0, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7);
-  __m512i s2 = _mm512_setr_epi64(
-      i + 8, i + 9, i + 10, i + 11, i + 12, i + 13, i + 14, i + 15);
-  __m512i ma = _mm512_set1_epi64(mask);
-  __m512i bi = _mm512_set1_epi64(bits);
-
-  s1 = _mm512_and_si512(s1, ma);
-  s2 = _mm512_and_si512(s2, ma);
-
-  unsigned m1 = _mm512_cmpeq_epu64_mask(s1, bi);
-  unsigned m2 = _mm512_cmpeq_epu64_mask(s2, bi);
-
-  return (m2 << 8) | m1;
-}
-
-inline double HorizontalSumAVX(__m256 s) {
-  __m128 l = _mm256_castps256_ps128(s);
-  __m128 h = _mm256_extractf128_ps(s, 1);
-  __m128 s1  = _mm_add_ps(h, l);
-  __m128 s1s = _mm_movehdup_ps(s1);
-  __m128 s2 = _mm_add_ps(s1, s1s);
-
-  return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2)));
-}
-
-inline double HorizontalSumAVX512(__m512 s) {
-  __m256 l = _mm512_castps512_ps256(s);
-  __m512d sd = _mm512_castps_pd(s);
-  __m256d hd = _mm512_extractf64x4_pd(sd, 1);
-  __m256 h = _mm256_castpd_ps(hd);
-  __m256 p = _mm256_add_ps(h, l);
-
-  return HorizontalSumAVX(p);
-}
-
-}  // namespace detail
-
-/**
- * Object containing context and routines for AVX state-vector manipulations.
- * State is a vectorized sequence of sixteen real components followed by
- * sixteen imaginary components. Sixteen single-precison floating numbers can
- * be loaded into an AVX512 register.
- */
-template <typename For>
-class StateSpaceAVX512 :
-    public StateSpace<StateSpaceAVX512<For>, VectorSpace, For, float> {
- private:
-  using Base = StateSpace<StateSpaceAVX512<For>, qsim::VectorSpace, For, float>;
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit StateSpaceAVX512(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  void InternalToNormalOrder(State& state) const {
-    __m512i idx1 = _mm512_setr_epi32(
-        0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
-    __m512i idx2 = _mm512_setr_epi32(
-        8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                __m512i idx1, __m512i idx2, fp_type* p) {
-      __m512 v1 = _mm512_load_ps(p + 32 * i);
-      __m512 v2 = _mm512_load_ps(p + 32 * i + 16);
-
-      _mm512_store_ps(p + 32 * i,  _mm512_permutex2var_ps(v1, idx1, v2));
-      _mm512_store_ps(p + 32 * i + 16,  _mm512_permutex2var_ps(v1, idx2, v2));
-    };
-
-    Base::for_.Run(
-        MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get());
-  }
-
-  void NormalToInternalOrder(State& state) const {
-    __m512i idx1 = _mm512_setr_epi32(
-        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
-    __m512i idx2 = _mm512_setr_epi32(
-        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                __m512i idx1, __m512i idx2, fp_type* p) {
-      __m512 re = _mm512_load_ps(p + 32 * i);
-      __m512 im = _mm512_load_ps(p + 32 * i + 16);
-
-      _mm512_store_ps(p + 32 * i,  _mm512_permutex2var_ps(re, idx1, im));
-      _mm512_store_ps(p + 32 * i + 16,  _mm512_permutex2var_ps(re, idx2, im));
-    };
-
-    Base::for_.Run(
-        MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get());
-  }
-
-  void SetAllZeros(State& state) const {
-    __m512 val0 = _mm512_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) {
-      _mm512_store_ps(p + 32 * i, val0);
-      _mm512_store_ps(p + 32 * i + 16, val0);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get());
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    __m512 val0 = _mm512_setzero_ps();
-    __m512 valu;
-
-    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
-
-    switch (state.num_qubits()) {
-    case 1:
-      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v);
-      break;
-    case 2:
-      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v);
-      break;
-    case 3:
-      valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v, v, v, v, v);
-      break;
-    default:
-      valu = _mm512_set1_ps(v);
-      break;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const __m512& val0, const __m512& valu, fp_type* p) {
-      _mm512_store_ps(p + 32 * i, valu);
-      _mm512_store_ps(p + 32 * i + 16, val0);
-    };
-
-    Base::for_.Run(
-        MinSize(state.num_qubits()) / 32, f, val0, valu, state.get());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    state.get()[0] = 1;
-  }
-
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    uint64_t p = (32 * (i / 16)) + (i % 16);
-    return std::complex<fp_type>(state.get()[p], state.get()[p + 16]);
-  }
-
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    uint64_t p = (32 * (i / 16)) + (i % 16);
-    state.get()[p] = std::real(ampl);
-    state.get()[p + 16] = std::imag(ampl);
-  }
-
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    uint64_t p = (32 * (i / 16)) + (i % 16);
-    state.get()[p] = re;
-    state.get()[p + 16] = im;
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    __m512 re_reg = _mm512_set1_ps(re);
-    __m512 im_reg = _mm512_set1_ps(im);
-
-    __mmask16 exclude_n = exclude ? 0xffff : 0;
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
-                uint64_t bitsv, __m512 re_n, __m512 im_n, __mmask16 exclude_n,
-                fp_type* p) {
-      __m512 re = _mm512_load_ps(p + 32 * i);
-      __m512 im = _mm512_load_ps(p + 32 * i + 16);
-
-      __mmask16 ml =
-          detail::GetZeroMaskAVX512(16 * i, maskv, bitsv) ^ exclude_n;
-
-      re = _mm512_mask_blend_ps(ml, re, re_n);
-      im = _mm512_mask_blend_ps(ml, im, im_n);
-
-      _mm512_store_ps(p + 32 * i, re);
-      _mm512_store_ps(p + 32 * i + 16, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, mask, bits,
-                   re_reg, im_reg, exclude_n, state.get());
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, fp_type* p2) {
-      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
-      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
-      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
-      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
-
-      _mm512_store_ps(p2 + 32 * i, _mm512_add_ps(re1, re2));
-      _mm512_store_ps(p2 + 32 * i + 16, _mm512_add_ps(im1, im2));
-    };
-
-    Base::for_.Run(MinSize(src.num_qubits()) / 32, f, src.get(), dest.get());
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    __m512 r = _mm512_set1_ps(a);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 r, fp_type* p) {
-      __m512 re = _mm512_load_ps(p + 32 * i);
-      __m512 im = _mm512_load_ps(p + 32 * i + 16);
-
-      _mm512_store_ps(p + 32 * i, _mm512_mul_ps(re, r));
-      _mm512_store_ps(p + 32 * i + 16, _mm512_mul_ps(im, r));
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, r, state.get());
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
-      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
-      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
-      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
-      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
-
-      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
-      __m512 ip_im = _mm512_fnmadd_ps(im1, re2, _mm512_mul_ps(re1, im2));
-
-      double re = detail::HorizontalSumAVX512(ip_re);
-      double im = detail::HorizontalSumAVX512(ip_im);
-
-      return std::complex<double>{re, im};
-    };
-
-    using Op = std::plus<std::complex<double>>;
-    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
-                                Op(), state1.get(), state2.get());
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> double {
-      __m512 re1 = _mm512_load_ps(p1 + 32 * i);
-      __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16);
-      __m512 re2 = _mm512_load_ps(p2 + 32 * i);
-      __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16);
-
-      __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2));
-
-      return detail::HorizontalSumAVX512(ip_re);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f,
-                                Op(), state1.get(), state2.get());
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      double norm = 0;
-      uint64_t size = MinSize(state.num_qubits()) / 32;
-      const fp_type* p = state.get();
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 16; ++j) {
-          double re = p[32 * k + j];
-          double im = p[32 * k + 16 + j];
-          norm += re * re + im * im;
-        }
-      }
-
-      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
-
-      uint64_t m = 0;
-      double csum = 0;
-      bitstrings.reserve(num_samples);
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 16; ++j) {
-          double re = p[32 * k + j];
-          double im = p[32 * k + 16 + j];
-          csum += re * re + im * im;
-          while (rs[m] < csum && m < num_samples) {
-            bitstrings.emplace_back(16 * k + j);
-            ++m;
-          }
-        }
-      }
-
-      for (; m < num_samples; ++m) {
-        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
-      }
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    auto f1 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
-      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
-
-      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
-      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
-      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
-
-      return detail::HorizontalSumAVX512(s1);
-    };
-
-    using Op = std::plus<double>;
-    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 32, f1,
-                                       Op(), mr.mask, mr.bits, state.get());
-
-    __m512 renorm = _mm512_set1_ps(1.0 / std::sqrt(norm));
-
-    auto f2 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, __m512 renorm, fp_type* p) {
-      __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits);
-
-      __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i);
-      __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16);
-
-      re = _mm512_mul_ps(re, renorm);
-      im = _mm512_mul_ps(im, renorm);
-
-      _mm512_store_ps(p + 32 * i, re);
-      _mm512_store_ps(p + 32 * i + 16, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 32, f2,
-                   mr.mask, mr.bits, renorm, state.get());
-  }
-
-  std::vector<double> PartialNorms(const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p) -> double {
-      __m512 re = _mm512_load_ps(p + 32 * i);
-      __m512 im = _mm512_load_ps(p + 32 * i + 16);
-      __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re));
-
-      return detail::HorizontalSumAVX512(s1);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduceP(
-        MinSize(state.num_qubits()) / 32, f, Op(), state.get());
-  }
-
-  uint64_t FindMeasuredBits(
-      unsigned m, double r, uint64_t mask, const State& state) const {
-    double csum = 0;
-
-    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 32, m);
-    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 32, m);
-
-    const fp_type* p = state.get();
-
-    for (uint64_t k = k0; k < k1; ++k) {
-      for (uint64_t j = 0; j < 16; ++j) {
-        auto re = p[32 * k + j];
-        auto im = p[32 * k + j + 16];
-        csum += re * re + im * im;
-        if (r < csum) {
-          return (16 * k + j) & mask;
-        }
-      }
-    }
-
-    // Return the last bitstring in the unlikely case of underflow.
-    return (16 * k1 - 1) & mask;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_AVX512_H_
diff --git a/tpls/qsim/statespace_basic.h b/tpls/qsim/statespace_basic.h
deleted file mode 100644
index 6468483..0000000
--- a/tpls/qsim/statespace_basic.h
+++ /dev/null
@@ -1,300 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_BASIC_H_
-#define STATESPACE_BASIC_H_
-
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <functional>
-
-#include "statespace.h"
-#include "util.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-/**
- * Object containing context and routines for unoptimized state-vector
- * manipulations. State is a non-vectorized sequence of one real amplitude
- * followed by one imaginary amplitude.
- */
-template <typename For, typename FP>
-class StateSpaceBasic :
-    public StateSpace<StateSpaceBasic<For, FP>, VectorSpace, For, FP> {
- private:
-  using Base = StateSpace<StateSpaceBasic<For, FP>, qsim::VectorSpace, For, FP>;
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit StateSpaceBasic(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return 2 * (uint64_t{1} << num_qubits);
-  };
-
-  void InternalToNormalOrder(State& state) const {}
-
-  void NormalToInternalOrder(State& state) const {}
-
-  void SetAllZeros(State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-      p[2 * i] = 0;
-      p[2 * i + 1] = 0;
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get());
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    fp_type val = fp_type{1} / std::sqrt(uint64_t{1} << state.num_qubits());
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                fp_type val, fp_type* p) {
-      p[2 * i] = val;
-      p[2 * i + 1] = 0;
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, val, state.get());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    state.get()[0] = 1;
-  }
-
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    uint64_t p = 2 * i;
-    return std::complex<fp_type>(state.get()[p], state.get()[p + 1]);
-  }
-
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    uint64_t p = 2 * i;
-    state.get()[p] = std::real(ampl);
-    state.get()[p + 1] = std::imag(ampl);
-  }
-
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    uint64_t p = 2 * i;
-    state.get()[p] = re;
-    state.get()[p + 1] = im;
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
-                uint64_t bitsv, fp_type re_n, fp_type im_n, bool excludev,
-                fp_type* p) {
-      auto s = p + 2 * i;
-      bool in_mask = (i & maskv) == bitsv;
-      in_mask ^= excludev;
-      s[0] = in_mask ? re_n : s[0];
-      s[1] = in_mask ? im_n : s[1];
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, mask, bits, re, im,
-                   exclude, state.get());
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, fp_type* p2) {
-      p2[2 * i] += p1[2 * i];
-      p2[2 * i + 1] += p1[2 * i + 1];
-    };
-
-    Base::for_.Run(MinSize(src.num_qubits()) / 2, f, src.get(), dest.get());
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, fp_type a, fp_type* p) {
-      p[2 * i] *= a;
-      p[2 * i + 1] *= a;
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, a, state.get());
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
-      auto s1 = p1 + 2 * i;
-      auto s2 = p2 + 2 * i;
-
-      double re = s1[0] * s2[0] + s1[1] * s2[1];
-      double im = s1[0] * s2[1] - s1[1] * s2[0];
-
-      return std::complex<double>{re, im};
-    };
-
-    using Op = std::plus<std::complex<double>>;
-    return Base::for_.RunReduce(
-        MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get());
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> double {
-      auto s1 = p1 + 2 * i;
-      auto s2 = p2 + 2 * i;
-
-      return s1[0] * s2[0] + s1[1] * s2[1];
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduce(
-        MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get());
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      double norm = 0;
-      uint64_t size = MinSize(state.num_qubits()) / 2;
-
-      const fp_type* p = state.get();
-
-      for (uint64_t k = 0; k < size; ++k) {
-        double re = p[2 * k];
-        double im = p[2 * k + 1];
-        norm += re * re + im * im;
-      }
-
-      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
-
-      uint64_t m = 0;
-      double csum = 0;
-      bitstrings.reserve(num_samples);
-
-      for (uint64_t k = 0; k < size; ++k) {
-        double re = p[2 * k];
-        double im = p[2 * k + 1];
-        csum += re * re + im * im;
-        while (rs[m] < csum && m < num_samples) {
-          bitstrings.emplace_back(k);
-          ++m;
-        }
-      }
-
-      for (; m < num_samples; ++m) {
-        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
-      }
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    auto f1 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, const fp_type* p) -> double {
-      auto s = p + 2 * i;
-      return (i & mask) == bits ? s[0] * s[0] + s[1] * s[1] : 0;
-    };
-
-    using Op = std::plus<double>;
-    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 2, f1,
-                                       Op(), mr.mask, mr.bits, state.get());
-
-    double renorm = 1.0 / std::sqrt(norm);
-
-    auto f2 = [](unsigned n, unsigned m, uint64_t i,
-                 uint64_t mask, uint64_t bits, fp_type renorm, fp_type* p) {
-      auto s = p + 2 * i;
-      bool not_zero = (i & mask) == bits;
-
-      s[0] = not_zero ? s[0] * renorm : 0;
-      s[1] = not_zero ? s[1] * renorm : 0;
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f2,
-                   mr.mask, mr.bits, renorm, state.get());
-  }
-
-  std::vector<double> PartialNorms(const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p) -> double {
-      auto s = p + 2 * i;
-      return s[0] * s[0] + s[1] * s[1];
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduceP(
-        MinSize(state.num_qubits()) / 2, f, Op(), state.get());
-  }
-
-  uint64_t FindMeasuredBits(
-      unsigned m, double r, uint64_t mask, const State& state) const {
-    double csum = 0;
-
-    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 2, m);
-    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 2, m);
-
-    const fp_type* p = state.get();
-
-    for (uint64_t k = k0; k < k1; ++k) {
-      auto re = p[2 * k];
-      auto im = p[2 * k + 1];
-      csum += re * re + im * im;
-      if (r < csum) {
-        return k & mask;
-      }
-    }
-
-    // Return the last bitstring in the unlikely case of underflow.
-    return (k1 - 1) & mask;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_BASIC_H_
diff --git a/tpls/qsim/statespace_cuda.h b/tpls/qsim/statespace_cuda.h
deleted file mode 100644
index 660db07..0000000
--- a/tpls/qsim/statespace_cuda.h
+++ /dev/null
@@ -1,470 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_CUDA_H_
-#define STATESPACE_CUDA_H_
-
-#ifdef __NVCC__
-  #include <cuda.h>
-#elif __HIP__
-  #include <hip/hip_runtime.h>
-  #include "cuda2hip.h"
-#endif
-
-#include <algorithm>
-#include <complex>
-#include <cstdint>
-
-#include "statespace.h"
-#include "statespace_cuda_kernels.h"
-#include "vectorspace_cuda.h"
-#include "util_cuda.h"
-
-namespace qsim {
-
-/**
- * Object containing context and routines for CUDA state-vector manipulations.
- * State is a vectorized sequence of 32 real components followed by 32
- * imaginary components. 32 floating numbers can be proccessed in parallel by
- * a single warp. It is not recommended to use `GetAmpl` and `SetAmpl`.
- */
-template <typename FP = float>
-class StateSpaceCUDA :
-    public StateSpace<StateSpaceCUDA<FP>, VectorSpaceCUDA, FP> {
- private:
-  using Base = StateSpace<StateSpaceCUDA<FP>, qsim::VectorSpaceCUDA, FP>;
-
- protected:
-  struct Grid {
-    unsigned threads;
-    unsigned dblocks;
-    unsigned blocks;
-  };
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  struct Parameter {
-    /**
-     * The number of threads per block.
-     * Should be 2 to the power of k, where k is in the range [5,10].
-     */
-    unsigned num_threads = 512;
-    /**
-     * The number of data blocks. Each thread processes num_dblocks data
-     * blocks in reductions (norms, inner products, etc).
-     */
-    unsigned num_dblocks = 16;
-  };
-
-  explicit StateSpaceCUDA(const Parameter& param)
-      : param_(param), scratch_(nullptr), scratch_size_(0) {}
-
-  virtual ~StateSpaceCUDA() {
-    if (scratch_ != nullptr) {
-      ErrorCheck(cudaFree(scratch_));
-    }
-  }
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  void InternalToNormalOrder(State& state) const {
-    uint64_t size = MinSize(state.num_qubits()) / 2;
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-    unsigned bytes = 2 * threads * sizeof(fp_type);
-
-    InternalToNormalOrderKernel<<<blocks, threads, bytes>>>(state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  void NormalToInternalOrder(State& state) const {
-    uint64_t size = MinSize(state.num_qubits()) / 2;
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-    unsigned bytes = 2 * threads * sizeof(fp_type);
-
-    NormalToInternalOrderKernel<<<blocks, threads, bytes>>>(state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  void SetAllZeros(State& state) const {
-    ErrorCheck(cudaMemset(state.get(), 0,
-               MinSize(state.num_qubits()) * sizeof(fp_type)));
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    uint64_t size = MinSize(state.num_qubits()) / 2;
-    uint64_t hsize = uint64_t{1} << state.num_qubits();
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-
-    fp_type v = double{1} / std::sqrt(hsize);
-
-    SetStateUniformKernel<<<blocks, threads>>>(v, hsize, state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    fp_type one[1] = {1};
-    ErrorCheck(
-        cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // It is not recommended to use this function.
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    fp_type re, im;
-    auto p = state.get() + 64 * (i / 32) + i % 32;
-    ErrorCheck(cudaMemcpy(&re, p, sizeof(fp_type), cudaMemcpyDeviceToHost));
-    ErrorCheck(
-        cudaMemcpy(&im, p + 32, sizeof(fp_type), cudaMemcpyDeviceToHost));
-    return std::complex<fp_type>(re, im);
-  }
-
-  // It is not recommended to use this function.
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    fp_type re = std::real(ampl);
-    fp_type im = std::imag(ampl);
-    auto p = state.get() + 64 * (i / 32) + i % 32;
-    ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice));
-    ErrorCheck(
-        cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // It is not recommended to use this function.
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    auto p = state.get() + 64 * (i / 32) + i % 32;
-    ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice));
-    ErrorCheck(
-        cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude);
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    uint64_t size = MinSize(state.num_qubits()) / 2;
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-
-    BulkSetAmplKernel<<<blocks, threads>>>(
-        mask, bits, re, im, exclude, state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    uint64_t size = MinSize(src.num_qubits());
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-
-    AddKernel<<<blocks, threads>>>(src.get(), dest.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    uint64_t size = MinSize(state.num_qubits());
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-
-    MultiplyKernel<<<blocks, threads>>>(a, state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    using C = Complex<double>;
-    auto r = Reduce<C, C, Product<fp_type>>(state1, state2);
-
-    return {r.re, r.im};
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    return Reduce<double, double, RealProduct<fp_type>>(state1, state2);
-  }
-
-  double Norm(const State& state) const {
-    return Reduce<double, double, RealProduct<fp_type>>(state, state);
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 2);
-      unsigned bytes = g1.threads * sizeof(double);
-
-      unsigned scratch_size = (g1.blocks + 1) * sizeof(double)
-          + num_samples * (sizeof(uint64_t) + sizeof(DistrRealType));
-
-      void* scratch = AllocScratch(scratch_size);
-
-      double* d_res2 = (double*) scratch;
-      double* d_res1 = d_res2 + 1;
-      uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks);
-      DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples);
-
-      auto op1 = RealProduct<fp_type>();
-      auto op2 = Plus<double>();
-
-      Reduce1Kernel<double><<<g1.blocks, g1.threads, bytes>>>(
-          g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1);
-      ErrorCheck(cudaPeekAtLastError());
-      ErrorCheck(cudaDeviceSynchronize());
-
-      double norm;
-
-      if (g1.blocks == 1) {
-        ErrorCheck(
-            cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost));
-      } else {
-        Grid g2 = GetGrid2(g1.blocks);
-        unsigned bytes = g2.threads * sizeof(double);
-
-        auto op3 = Plus<double>();
-
-        Reduce2Kernel<double><<<g2.blocks, g2.threads, bytes>>>(
-            g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2);
-        ErrorCheck(cudaPeekAtLastError());
-        ErrorCheck(cudaDeviceSynchronize());
-
-        ErrorCheck(
-            cudaMemcpy(&norm, d_res2, sizeof(double), cudaMemcpyDeviceToHost));
-      }
-
-      // TODO: generate random values on the device.
-      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
-
-      ErrorCheck(cudaMemcpy(d_rs, rs.data(),
-                            num_samples * sizeof(DistrRealType),
-                            cudaMemcpyHostToDevice));
-
-      SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples,
-                                      d_rs, d_res1, state.get(), d_bitstrings);
-      ErrorCheck(cudaPeekAtLastError());
-      ErrorCheck(cudaDeviceSynchronize());
-
-      bitstrings.resize(num_samples, 0);
-
-      ErrorCheck(cudaMemcpy(bitstrings.data(), d_bitstrings,
-                            num_samples * sizeof(uint64_t),
-                            cudaMemcpyDeviceToHost));
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    using Op = RealProduct<fp_type>;
-    double r = Reduce<double, double, Op>(mr.mask, mr.bits, state, state);
-    fp_type renorm = 1 / std::sqrt(r);
-
-    uint64_t size = MinSize(state.num_qubits()) / 2;
-
-    unsigned threads = std::min(size, uint64_t{param_.num_threads});
-    unsigned blocks = size / threads;
-
-    CollapseKernel<<<blocks, threads>>>(mr.mask, mr.bits, renorm, state.get());
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
-  std::vector<double> PartialNorms(const State& state) const {
-    Grid g = GetGrid1(MinSize(state.num_qubits()) / 2);
-
-    unsigned scratch_size = g.blocks * sizeof(double);
-    unsigned bytes = g.threads * sizeof(double);
-
-    double* d_res = (double*) AllocScratch(scratch_size);
-
-    auto op1 = RealProduct<fp_type>();
-    auto op2 = Plus<double>();
-
-    Reduce1Kernel<double><<<g.blocks, g.threads, bytes>>>(
-        g.dblocks, op1, op2, op2, state.get(), state.get(), d_res);
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-
-    std::vector<double> norms(g.blocks);
-
-    ErrorCheck(
-        cudaMemcpy(norms.data(), d_res, scratch_size, cudaMemcpyDeviceToHost));
-
-    return norms;
-  }
-
-  uint64_t FindMeasuredBits(
-      unsigned m, double r, uint64_t mask, const State& state) const {
-    Grid g = GetGrid1(MinSize(state.num_qubits()) / 2);
-
-    uint64_t res;
-    uint64_t* d_res = (uint64_t*) AllocScratch(sizeof(uint64_t));
-
-    FindMeasuredBitsKernel<<<1, g.threads>>>(
-        m, g.dblocks, r, state.get(), d_res);
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-
-    ErrorCheck(
-        cudaMemcpy(&res, d_res, sizeof(uint64_t), cudaMemcpyDeviceToHost));
-
-    return res & mask;
-  }
-
- protected:
-  Parameter param_;
-
-  void* AllocScratch(uint64_t size) const {
-    if (size > scratch_size_) {
-      if (scratch_ != nullptr) {
-        ErrorCheck(cudaFree(scratch_));
-      }
-
-      ErrorCheck(cudaMalloc(const_cast<void**>(&scratch_), size));
-
-      const_cast<uint64_t&>(scratch_size_) = size;
-    }
-
-    return scratch_;
-  }
-
-  Grid GetGrid1(uint64_t size) const {
-    Grid grid;
-
-    grid.threads = std::min(size, uint64_t{param_.num_threads});
-    grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks});
-    grid.blocks = size / (grid.threads * grid.dblocks);
-
-    return grid;
-  }
-
-  Grid GetGrid2(unsigned size) const {
-    Grid grid;
-
-    grid.threads = std::min(param_.num_threads, std::max(32U, size));
-    grid.dblocks = std::max(1U, size / grid.threads);
-    grid.blocks = 1;
-
-    return grid;
-  }
-
-  template <typename FP1, typename FP2, typename Op>
-  FP2 Reduce(const State& state1, const State& state2) const {
-    return Reduce<FP1, FP2, Op>(0, 0, state1, state2);
-  }
-
-  template <typename FP1, typename FP2, typename Op>
-  FP2 Reduce(uint64_t mask, uint64_t bits,
-             const State& state1, const State& state2) const {
-    uint64_t size = MinSize(state1.num_qubits()) / 2;
-
-    Grid g1 = GetGrid1(size);
-    unsigned bytes = g1.threads * sizeof(FP1);
-
-    FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2));
-    FP2* d_res1 = d_res2 + 1;
-
-    auto op1 = Op();
-    auto op2 = Plus<FP1>();
-    auto op3 = Plus<typename Scalar<FP1>::type>();
-
-    if (mask == 0) {
-      Reduce1Kernel<FP1><<<g1.blocks, g1.threads, bytes>>>(
-          g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1);
-    } else {
-      Reduce1MaskedKernel<FP1><<<g1.blocks, g1.threads, bytes>>>(
-          g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(),
-          d_res1);
-    }
-    ErrorCheck(cudaPeekAtLastError());
-    ErrorCheck(cudaDeviceSynchronize());
-
-    FP2 result;
-
-    if (g1.blocks == 1) {
-      ErrorCheck(
-          cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost));
-    } else {
-      Grid g2 = GetGrid2(g1.blocks);
-      unsigned bytes = g2.threads * sizeof(FP2);
-
-      auto op2 = Plus<FP2>();
-      auto op3 = Plus<typename Scalar<FP2>::type>();
-
-      Reduce2Kernel<FP2><<<g2.blocks, g2.threads, bytes>>>(
-          g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2);
-      ErrorCheck(cudaPeekAtLastError());
-      ErrorCheck(cudaDeviceSynchronize());
-
-      ErrorCheck(
-          cudaMemcpy(&result, d_res2, sizeof(FP2), cudaMemcpyDeviceToHost));
-    }
-
-    return result;
-  }
-
- private:
-  void* scratch_;
-  uint64_t scratch_size_;
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_CUDA_H_
diff --git a/tpls/qsim/statespace_cuda_kernels.h b/tpls/qsim/statespace_cuda_kernels.h
deleted file mode 100644
index b54ebca..0000000
--- a/tpls/qsim/statespace_cuda_kernels.h
+++ /dev/null
@@ -1,355 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_CUDA_KERNELS_H_
-#define STATESPACE_CUDA_KERNELS_H_
-
-#ifdef __NVCC__
-  #include <cuda.h>
-#elif __HIP__
-  #include <hip/hip_runtime.h>
-  #include "cuda2hip.h"
-#endif
-
-#include "util_cuda.h"
-
-namespace qsim {
-
-namespace detail {
-
-template <typename FP1, typename FP2,
-          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
-__device__ __forceinline__ FP1 BlockReduce1(
-    uint64_t n, Op1 op1, Op2 op2, Op3 op3, const FP2* s1, const FP2* s2) {
-  extern __shared__ float shared[];
-  FP1* partial1 = (FP1*) shared;
-
-  unsigned tid = threadIdx.x;
-  unsigned warp = threadIdx.x / warp_size;
-  unsigned lane = threadIdx.x % warp_size;
-
-  uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane;
-  uint64_t k1 = k0 + 2 * n * blockDim.x;
-
-  FP1 r;
-
-  r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]);
-  while ((k0 += 2 * blockDim.x) < k1) {
-    r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]));
-  }
-
-  partial1[tid] = r;
-
-  __shared__ FP1 partial2[warp_size];
-
-  if (tid < warp_size) {
-    partial2[tid] = 0;
-  }
-
-  __syncthreads();
-
-  FP1 val = WarpReduce(partial1[tid], op3);
-
-  if (lane == 0) {
-    partial2[warp] = val;
-  }
-
-  __syncthreads();
-
-  FP1 result = 0;
-
-  if (tid < warp_size) {
-    result = WarpReduce(partial2[tid], op3);
-  }
-
-  return result;
-}
-
-template <typename FP1, typename FP2,
-          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
-__device__ __forceinline__ FP1 BlockReduce1Masked(
-    uint64_t n, uint64_t mask, uint64_t bits, Op1 op1, Op2 op2, Op3 op3,
-    const FP2* s1, const FP2* s2) {
-  extern __shared__ float shared[];
-  FP1* partial1 = (FP1*) shared;
-
-  unsigned tid = threadIdx.x;
-  unsigned warp = threadIdx.x / warp_size;
-  unsigned lane = threadIdx.x % warp_size;
-
-  uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane;
-  uint64_t k1 = k0 + 2 * n * blockDim.x;
-
-  FP1 r = 0;
-
-  if (((k0 + lane) / 2 & mask) == bits) {
-    r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]);
-  }
-  while ((k0 += 2 * blockDim.x) < k1) {
-    if (((k0 + lane) / 2 & mask) == bits) {
-      r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]));
-    }
-  }
-
-  partial1[tid] = r;
-
-  __shared__ FP1 partial2[warp_size];
-
-  if (tid < warp_size) {
-    partial2[tid] = 0;
-  }
-
-  __syncthreads();
-
-  FP1 val = WarpReduce(partial1[tid], op3);
-
-  if (lane == 0) {
-    partial2[warp] = val;
-  }
-
-  __syncthreads();
-
-  FP1 result = 0;
-
-  if (tid < warp_size) {
-    result = WarpReduce(partial2[tid], op3);
-  }
-
-  return result;
-}
-
-template <typename FP1, typename FP2,
-          typename Op2, typename Op3, unsigned warp_size = 32>
-__device__ __forceinline__ FP1 BlockReduce2(
-    uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s) {
-  extern __shared__ float shared[];
-  FP1* partial1 = (FP1*) shared;
-
-  unsigned tid = threadIdx.x;
-  uint64_t k0 = n * blockIdx.x * blockDim.x + tid;
-  uint64_t k1 = k0 + n * blockDim.x;
-
-  FP1 r = 0;
-
-  if (tid < size) {
-    r = s[k0];
-    while ((k0 += blockDim.x) < k1) {
-      r = op2(r, s[k0]);
-    }
-  }
-
-  partial1[tid] = r;
-
-  __shared__ FP1 partial2[warp_size];
-
-  if (tid < warp_size) {
-    partial2[tid] = 0;
-  }
-
-  __syncthreads();
-
-  FP1 val = WarpReduce(partial1[tid], op3);
-
-  if (threadIdx.x % warp_size == 0) {
-    partial2[threadIdx.x / warp_size] = val;
-  }
-
-  __syncthreads();
-
-  FP1 result = 0;
-
-  if (tid < warp_size) {
-    result = WarpReduce(partial2[tid], op3);
-  }
-
-  return result;
-}
-
-}  // namespace detail
-
-template <typename FP1, typename FP2, typename FP3,
-          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
-__global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3,
-                              const FP2* s1, const FP2* s2, FP3* result) {
-  FP1 sum = detail::BlockReduce1<FP1>(n, op1, op2, op3, s1, s2);
-
-  if (threadIdx.x == 0) {
-    result[blockIdx.x] = sum;
-  }
-}
-
-template <typename FP1, typename FP2, typename FP3,
-          typename Op1, typename Op2, typename Op3, unsigned warp_size = 32>
-__global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits,
-                                    Op1 op1, Op2 op2, Op3 op3,
-                                    const FP2* s1, const FP2* s2, FP3* result) {
-  FP1 sum =
-      detail::BlockReduce1Masked<FP1>(n, mask, bits, op1, op2, op3, s1, s2);
-
-  if (threadIdx.x == 0) {
-    result[blockIdx.x] = sum;
-  }
-}
-
-template <typename FP1, typename FP2, typename FP3,
-          typename Op2, typename Op3, unsigned warp_size = 32>
-__global__ void Reduce2Kernel(
-    uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s, FP3* result) {
-  FP1 sum = detail::BlockReduce2<FP1>(n, size, op2, op3, s);
-
-  if (threadIdx.x == 0) {
-    result[blockIdx.x] = sum;
-  }
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void InternalToNormalOrderKernel(FP* state) {
-  unsigned lane = threadIdx.x % warp_size;
-  unsigned l = 2 * threadIdx.x - lane;
-  uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l;
-
-  extern __shared__ float shared[];
-  FP* buf = (FP*) shared;
-
-  buf[l] = state[k];
-  buf[l + warp_size] = state[k + warp_size];
-
-  __syncthreads();
-
-  state[k + lane] = buf[l];
-  state[k + lane + 1] = buf[l + warp_size];
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void NormalToInternalOrderKernel(FP* state) {
-  unsigned lane = threadIdx.x % warp_size;
-  unsigned l = 2 * threadIdx.x - lane;
-  uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l;
-
-  extern __shared__ float shared[];
-  FP* buf = (FP*) shared;
-
-  buf[l] = state[k];
-  buf[l + warp_size] = state[k + warp_size];
-
-  __syncthreads();
-
-  state[k] = buf[l + lane];
-  state[k + warp_size] = buf[l + lane + 1];
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) {
-  unsigned lane = threadIdx.x % warp_size;
-  uint64_t k = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane;
-
-  state[k] = lane < size ? v : 0;
-  state[k + warp_size] = 0;
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void AddKernel(const FP* state1, FP* state2) {
-  uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
-  state2[k] += state1[k];
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void MultiplyKernel(FP a, FP* state) {
-  uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
-  state[k] *= a;
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) {
-  uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
-  uint64_t k2 = 2 * k1 - threadIdx.x % warp_size;
-
-  if ((k1 & mask) == bits) {
-    state[k2] *= r;
-    state[k2 + warp_size] *= r;
-  } else {
-    state[k2] = 0;
-    state[k2 + warp_size] = 0;
-  }
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void BulkSetAmplKernel(
-    uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) {
-  uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
-  uint64_t k2 = 2 * k1 - threadIdx.x % warp_size;
-
-  bool set = ((k1 & mask) == bits) ^ exclude;
-
-  if (set) {
-    state[k2] = re;
-    state[k2 + warp_size] = im;
-  }
-}
-
-template <typename FP1, typename FP2, typename FP3, unsigned warp_size = 32>
-__global__ void SampleKernel(unsigned num_blocks,
-                             uint64_t n, uint64_t num_samples,
-                             const FP1* rs, const FP2* ps, const FP3* state,
-                             uint64_t *bitstrings) {
-  // Use just one thread. This can be somewhat slow.
-  if (threadIdx.x == 0) {
-    uint64_t m = 0;
-    double csum = 0;
-
-    for (unsigned block_id = 0; block_id < num_blocks; ++block_id) {
-      uint64_t km = n * blockDim.x;
-      uint64_t k0 = block_id * km;
-
-      for (uint64_t k = 0; k < km; ++k) {
-        uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32;
-        FP3 re = state[l];
-        FP3 im = state[l + warp_size];
-        csum += re * re + im * im;
-        while (rs[m] < csum && m < num_samples) {
-          bitstrings[m++] = k0 + k;
-        }
-      }
-    }
-  }
-}
-
-template <typename FP, unsigned warp_size = 32>
-__global__ void FindMeasuredBitsKernel(
-    uint64_t block_id, uint64_t n, double r, const FP* state, uint64_t* res) {
-  // Use just one thread. This can be somewhat slow, however, this is
-  // more or less consistent with CPU implementations.
-  if (threadIdx.x == 0) {
-    double csum = 0;
-    uint64_t km = n * blockDim.x;
-    uint64_t k0 = block_id * km;
-
-    for (uint64_t k = 0; k < km; ++k) {
-      uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32;
-      FP re = state[l];
-      FP im = state[l + warp_size];
-      csum += re * re + im * im;
-      if (r < csum) {
-        *res = k0 + k;
-        return;
-      }
-    }
-
-    *res = k0 + n * blockDim.x - 1;
-  }
-}
-
-}  // namespace qsim
-
-#endif  // STATESPACE_CUDA_KERNELS_H_
diff --git a/tpls/qsim/statespace_custatevec.h b/tpls/qsim/statespace_custatevec.h
deleted file mode 100644
index f2f5de1..0000000
--- a/tpls/qsim/statespace_custatevec.h
+++ /dev/null
@@ -1,376 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_CUSTATEVEC_H_
-#define STATESPACE_CUSTATEVEC_H_
-
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <type_traits>
-#include <vector>
-
-#include <cublas_v2.h>
-#include <cuComplex.h>
-#include <custatevec.h>
-
-#include "statespace.h"
-#include "util_custatevec.h"
-#include "vectorspace_cuda.h"
-
-namespace qsim {
-
-namespace detail {
-
-template <typename FP>
-__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) {
-  uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
-
-  if (k < size) {
-    state[2 * k] = v;
-    state[2 * k + 1] = 0;
-  }
-}
-
-}  // namespace detail
-
-/**
- * Object containing context and routines for cuStateVec state-vector
- * manipulations. It is not recommended to use `GetAmpl` and `SetAmpl`.
- */
-template <typename FP = float>
-class StateSpaceCuStateVec :
-    public StateSpace<StateSpaceCuStateVec<FP>, VectorSpaceCUDA, FP> {
- private:
-  using Base = StateSpace<StateSpaceCuStateVec<FP>, qsim::VectorSpaceCUDA, FP>;
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  static constexpr auto is_float = std::is_same<fp_type, float>::value;
-
-  static constexpr auto kStateType = is_float ? CUDA_C_32F : CUDA_C_64F;
-  static constexpr auto kMatrixType = kStateType;
-  static constexpr auto kExpectType = CUDA_C_64F;
-  static constexpr auto kComputeType =
-      is_float ? CUSTATEVEC_COMPUTE_32F : CUSTATEVEC_COMPUTE_64F;
-  static constexpr auto kMatrixLayout = CUSTATEVEC_MATRIX_LAYOUT_ROW;
-
-  explicit StateSpaceCuStateVec(const cublasHandle_t& cublas_handle,
-                                const custatevecHandle_t& custatevec_handle)
-      : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle),
-        workspace_(nullptr), workspace_size_(0) {}
-
-  virtual ~StateSpaceCuStateVec() {
-    if (workspace_ != nullptr) {
-      ErrorCheck(cudaFree(workspace_));
-    }
-  }
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return 2 * (uint64_t{1} << num_qubits);
-  };
-
-  void InternalToNormalOrder(State& state) const {
-  }
-
-  void NormalToInternalOrder(State& state) const {
-  }
-
-  void SetAllZeros(State& state) const {
-    ErrorCheck(cudaMemset(state.get(), 0,
-                          MinSize(state.num_qubits()) * sizeof(fp_type)));
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    uint64_t size = uint64_t{1} << state.num_qubits();
-
-    unsigned threads = size < 256 ? size : 256;
-    unsigned blocks = size / threads;
-
-    fp_type v = double{1} / std::sqrt(size);
-
-    detail::SetStateUniformKernel<<<blocks, threads>>>(v, size, state.get());
-    ErrorCheck(cudaPeekAtLastError());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    fp_type one[1] = {1};
-    ErrorCheck(
-        cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // It is not recommended to use this function.
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    fp_type a[2];
-    auto p = state.get() + 2 * i;
-    ErrorCheck(cudaMemcpy(a, p, 2 * sizeof(fp_type), cudaMemcpyDeviceToHost));
-    return std::complex<fp_type>(a[0], a[1]);
-  }
-
-  // It is not recommended to use this function.
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    fp_type a[2] = {std::real(ampl), std::imag(ampl)};
-    auto p = state.get() + 2 * i;
-    ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // It is not recommended to use this function.
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    fp_type a[2] = {re, im};
-    auto p = state.get() + 2 * i;
-    ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice));
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    // Not implemented.
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    // Not implemented.
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    uint64_t size = uint64_t{1} << src.num_qubits();
-
-    if (is_float) {
-      cuComplex a = {1.0, 0.0};
-      auto p1 = (const cuComplex*) src.get();
-      auto p2 = (cuComplex*) dest.get();
-      ErrorCheck(cublasCaxpy(cublas_handle_, size, &a, p1, 1, p2, 1));
-    } else {
-      cuDoubleComplex a = {1.0, 0.0};
-      auto p1 = (const cuDoubleComplex*) src.get();
-      auto p2 = (cuDoubleComplex*) dest.get();
-      ErrorCheck(cublasZaxpy(cublas_handle_, size, &a, p1, 1, p2, 1));
-    }
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    uint64_t size = uint64_t{1} << state.num_qubits();
-
-    if (is_float) {
-      float a1 = a;
-      auto p = (cuComplex*) state.get();
-      ErrorCheck(cublasCsscal(cublas_handle_, size, &a1, p, 1));
-    } else {
-      double a1 = a;
-      auto p = (cuDoubleComplex*) state.get();
-      ErrorCheck(cublasZdscal(cublas_handle_, size, &a1, p, 1));
-    }
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    uint64_t size = uint64_t{1} << state1.num_qubits();
-
-    if (is_float) {
-      cuComplex result;
-      auto p1 = (const cuComplex*) state1.get();
-      auto p2 = (const cuComplex*) state2.get();
-      ErrorCheck(cublasCdotc(cublas_handle_, size, p1, 1, p2, 1, &result));
-      return {cuCrealf(result), cuCimagf(result)};
-    } else {
-      cuDoubleComplex result;
-      auto p1 = (const cuDoubleComplex*) state1.get();
-      auto p2 = (const cuDoubleComplex*) state2.get();
-      ErrorCheck(cublasZdotc(cublas_handle_, size, p1, 1, p2, 1, &result));
-      return {cuCreal(result), cuCimag(result)};
-    }
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    return std::real(InnerProduct(state1, state2));
-  }
-
-  double Norm(const State& state) const {
-    uint64_t size = uint64_t{1} << state.num_qubits();
-
-    if (is_float) {
-      float result;
-      auto p = (const cuComplex*) state.get();
-      ErrorCheck(cublasScnrm2(cublas_handle_, size, p, 1, &result));
-      return result * result;
-    } else {
-      double result;
-      auto p = (const cuDoubleComplex*) state.get();
-      ErrorCheck(cublasDznrm2(cublas_handle_, size, p, 1, &result));
-      return result * result;
-    }
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      auto rs = GenerateRandomValues<double>(num_samples, seed, 1.0);
-
-      size_t workspace_size;
-      custatevecSamplerDescriptor_t sampler;
-
-      ErrorCheck(custatevecSamplerCreate(
-                     custatevec_handle_, state.get(), kStateType,
-                     state.num_qubits(), &sampler, num_samples,
-                     &workspace_size));
-
-      AllocWorkSpace(workspace_size);
-
-      ErrorCheck(custatevecSamplerPreprocess(
-                     custatevec_handle_, sampler, workspace_, workspace_size));
-
-      std::vector<custatevecIndex_t> bitstrings0(num_samples);
-      std::vector<int32_t> bitordering;
-
-      bitordering.reserve(state.num_qubits());
-      for (unsigned i = 0; i < state.num_qubits(); ++i) {
-        bitordering.push_back(i);
-      }
-
-      ErrorCheck(custatevecSamplerSample(
-                     custatevec_handle_, sampler, bitstrings0.data(),
-                     bitordering.data(), state.num_qubits(), rs.data(),
-                     num_samples, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER));
-
-      bitstrings.reserve(num_samples);
-      for (unsigned i = 0; i < num_samples; ++i) {
-        bitstrings.push_back(bitstrings0[i]);
-      }
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  template <typename RGen>
-  MeasurementResult Measure(const std::vector<unsigned>& qubits,
-                            RGen& rgen, State& state,
-                            bool no_collapse = false) const {
-    auto r = RandomValue(rgen, 1.0);
-
-    MeasurementResult result;
-
-    result.valid = true;
-    result.mask = 0;
-    result.bits = 0;
-    result.bitstring.resize(qubits.size(), 0);
-
-    for (auto q : qubits) {
-      if (q >= state.num_qubits()) {
-        result.valid = false;
-        return result;
-      }
-
-      result.mask |= uint64_t{1} << q;
-    }
-
-    auto collapse = no_collapse ?
-        CUSTATEVEC_COLLAPSE_NONE : CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO;
-
-    ErrorCheck(custatevecBatchMeasure(
-                   custatevec_handle_, state.get(), kStateType,
-                   state.num_qubits(), (int*) result.bitstring.data(),
-                   (int*) qubits.data(), qubits.size(), r, collapse));
-
-    for (std::size_t i = 0; i < result.bitstring.size(); ++i) {
-      result.bits |= result.bitstring[i] << qubits[i];
-    }
-
-    return result;
-  }
-
-  template <typename RGen>
-  MeasurementResult VirtualMeasure(const std::vector<unsigned>& qubits,
-                                   RGen& rgen, const State& state) const {
-    return Measure(qubits, rgen, const_cast<State&>(state), true);
-  }
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    unsigned count = 0;
-
-    std::vector<int> bitstring;
-    std::vector<int> bitordering;
-
-    bitstring.reserve(state.num_qubits());
-    bitordering.reserve(state.num_qubits());
-
-    for (unsigned i = 0; i < state.num_qubits(); ++i) {
-      if (((mr.mask >> i) & 1) != 0) {
-        bitstring.push_back((mr.bits >> i) & 1);
-        bitordering.push_back(i);
-        ++count;
-      }
-    }
-
-    ErrorCheck(custatevecCollapseByBitString(
-                   custatevec_handle_, state.get(), kStateType,
-                   state.num_qubits(), bitstring.data(), bitordering.data(),
-                   count, 1.0));
-
-    // TODO: do we need the following?
-    double norm = Norm(state);
-    Multiply(1.0 / std::sqrt(norm), state);
-  }
-
- private:
-  void* AllocWorkSpace(size_t size) const {
-    if (size > workspace_size_) {
-      if (workspace_ != nullptr) {
-        ErrorCheck(cudaFree(workspace_));
-      }
-
-      ErrorCheck(cudaMalloc(const_cast<void**>(&workspace_), size));
-
-      const_cast<uint64_t&>(workspace_size_) = size;
-    }
-
-    return workspace_;
-  }
-
-  const cublasHandle_t cublas_handle_;
-  const custatevecHandle_t custatevec_handle_;
-
-  void* workspace_;
-  size_t workspace_size_;
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_CUSTATEVEC_H_
diff --git a/tpls/qsim/statespace_sse.h b/tpls/qsim/statespace_sse.h
deleted file mode 100644
index cf41a09..0000000
--- a/tpls/qsim/statespace_sse.h
+++ /dev/null
@@ -1,462 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef STATESPACE_SSE_H_
-#define STATESPACE_SSE_H_
-
-#include <smmintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-#include <functional>
-
-#include "statespace.h"
-#include "util.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace detail {
-
-inline __m128i GetZeroMaskSSE(uint64_t i, uint64_t mask, uint64_t bits) {
-  __m128i s1 = _mm_set_epi64x(i + 2, i + 0);
-  __m128i s2 = _mm_set_epi64x(i + 3, i + 1);
-  __m128i ma = _mm_set1_epi64x(mask);
-  __m128i bi = _mm_set1_epi64x(bits);
-
-  s1 = _mm_and_si128(s1, ma);
-  s2 = _mm_and_si128(s2, ma);
-
-  s1 = _mm_cmpeq_epi64(s1, bi);
-  s2 = _mm_cmpeq_epi64(s2, bi);
-
-  return _mm_blend_epi16(s1, s2, 204);  // 11001100
-}
-
-inline double HorizontalSumSSE(__m128 s) {
-  __m128 ss = _mm_movehdup_ps(s);
-  __m128 s1 = _mm_add_ps(s, ss);
-
-  return _mm_cvtss_f32(_mm_add_ss(s1, _mm_movehl_ps(ss, s1)));
-}
-
-}  // namespace detail
-
-/**
- * Object containing context and routines for SSE state-vector manipulations.
- * State is a vectorized sequence of four real components followed by four
- * imaginary components. Four single-precison floating numbers can be loaded
- * into an SSE register.
- */
-template <typename For>
-class StateSpaceSSE :
-    public StateSpace<StateSpaceSSE<For>, VectorSpace, For, float> {
- private:
-  using Base = StateSpace<StateSpaceSSE<For>, qsim::VectorSpace, For, float>;
-
- public:
-  using State = typename Base::State;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit StateSpaceSSE(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  void InternalToNormalOrder(State& state) const {
-    if (state.num_qubits() == 1) {
-      auto s = state.get();
-
-      s[2] = s[1];
-      s[1] = s[4];
-      s[3] = s[5];
-
-      for (uint64_t i = 4; i < 8; ++i) {
-        s[i] = 0;
-      }
-    } else {
-      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-        auto s = p + 8 * i;
-
-        fp_type re[3];
-        fp_type im[3];
-
-        for (uint64_t i = 0; i < 3; ++i) {
-          re[i] = s[i + 1];
-          im[i] = s[i + 4];
-        }
-
-        for (uint64_t i = 0; i < 3; ++i) {
-          s[2 * i + 1] = im[i];
-          s[2 * i + 2] = re[i];
-        }
-      };
-
-      Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get());
-    }
-  }
-
-  void NormalToInternalOrder(State& state) const {
-    if (state.num_qubits() == 1) {
-      auto s = state.get();
-
-      s[4] = s[1];
-      s[1] = s[2];
-      s[5] = s[3];
-
-      s[2] = 0;
-      s[3] = 0;
-      s[6] = 0;
-      s[7] = 0;
-    } else {
-      auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-        auto s = p + 8 * i;
-
-        fp_type re[3];
-        fp_type im[3];
-
-        for (uint64_t i = 0; i < 3; ++i) {
-          im[i] = s[2 * i + 1];
-          re[i] = s[2 * i + 2];
-        }
-
-        for (uint64_t i = 0; i < 3; ++i) {
-          s[i + 1] = re[i];
-          s[i + 4] = im[i];
-        }
-      };
-
-      Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get());
-    }
-  }
-
-  void SetAllZeros(State& state) const {
-    __m128 val0 = _mm_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) {
-      _mm_store_ps(p + 8 * i, val0);
-      _mm_store_ps(p + 8 * i + 4, val0);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get());
-  }
-
-  // Uniform superposition.
-  void SetStateUniform(State& state) const {
-    __m128 val0 = _mm_setzero_ps();
-    __m128 valu;
-
-    fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits());
-
-    if (state.num_qubits() == 1) {
-      valu = _mm_set_ps(0, 0, v, v);
-    } else {
-      valu = _mm_set1_ps(v);
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                __m128 val0, __m128 valu, fp_type* p) {
-      _mm_store_ps(p + 8 * i, valu);
-      _mm_store_ps(p + 8 * i + 4, val0);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, valu, state.get());
-  }
-
-  // |0> state.
-  void SetStateZero(State& state) const {
-    SetAllZeros(state);
-    state.get()[0] = 1;
-  }
-
-  static std::complex<fp_type> GetAmpl(const State& state, uint64_t i) {
-    uint64_t p = (8 * (i / 4)) + (i % 4);
-    return std::complex<fp_type>(state.get()[p], state.get()[p + 4]);
-  }
-
-  static void SetAmpl(
-      State& state, uint64_t i, const std::complex<fp_type>& ampl) {
-    uint64_t p = (8 * (i / 4)) + (i % 4);
-    state.get()[p] = std::real(ampl);
-    state.get()[p + 4] = std::imag(ampl);
-  }
-
-  static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) {
-    uint64_t p = (8 * (i / 4)) + (i % 4);
-    state.get()[p] = re;
-    state.get()[p + 4] = im;
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits,
-                   const std::complex<fp_type>& val,
-                   bool exclude = false) const {
-    BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val));
-  }
-
-  // Sets state[i] = complex(re, im) where (i & mask) == bits.
-  // if `exclude` is true then the criteria becomes (i & mask) != bits.
-  void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re,
-                   fp_type im, bool exclude = false) const {
-    __m128 re_reg = _mm_set1_ps(re);
-    __m128 im_reg = _mm_set1_ps(im);
-    __m128i exclude_reg = _mm_setzero_si128();
-    if (exclude) {
-      exclude_reg = _mm_cmpeq_epi32(exclude_reg, exclude_reg);
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv,
-                uint64_t bitsv, __m128 re_n, __m128 im_n, __m128i exclude_n,
-                fp_type* p) {
-      __m128 ml = _mm_castsi128_ps(_mm_xor_si128(
-          detail::GetZeroMaskSSE(4 * i, maskv, bitsv), exclude_n));
-
-      __m128 re = _mm_load_ps(p + 8 * i);
-      __m128 im = _mm_load_ps(p + 8 * i + 4);
-
-      re = _mm_blendv_ps(re, re_n, ml);
-      im = _mm_blendv_ps(im, im_n, ml);
-
-      _mm_store_ps(p + 8 * i, re);
-      _mm_store_ps(p + 8 * i + 4, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, mask, bits, re_reg,
-                   im_reg, exclude_reg, state.get());
-  }
-
-  // Does the equivalent of dest += src elementwise.
-  bool Add(const State& src, State& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, fp_type* p2) {
-      __m128 re1 = _mm_load_ps(p1 + 8 * i);
-      __m128 im1 = _mm_load_ps(p1 + 8 * i + 4);
-      __m128 re2 = _mm_load_ps(p2 + 8 * i);
-      __m128 im2 = _mm_load_ps(p2 + 8 * i + 4);
-
-      _mm_store_ps(p2 + 8 * i, _mm_add_ps(re1, re2));
-      _mm_store_ps(p2 + 8 * i + 4, _mm_add_ps(im1, im2));
-    };
-
-    Base::for_.Run(MinSize(src.num_qubits()) / 8, f, src.get(), dest.get());
-
-    return true;
-  }
-
-  // Does the equivalent of state *= a elementwise.
-  void Multiply(fp_type a, State& state) const {
-    __m128 r = _mm_set1_ps(a);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m128 r, fp_type* p) {
-      __m128 re = _mm_load_ps(p + 8 * i);
-      __m128 im = _mm_load_ps(p + 8 * i + 4);
-
-      re = _mm_mul_ps(re, r);
-      im = _mm_mul_ps(im, r);
-
-      _mm_store_ps(p + 8 * i, re);
-      _mm_store_ps(p + 8 * i + 4, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, r, state.get());
-  }
-
-  std::complex<double> InnerProduct(
-      const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> std::complex<double> {
-      __m128 re1 = _mm_load_ps(p1 + 8 * i);
-      __m128 im1 = _mm_load_ps(p1 + 8 * i + 4);
-      __m128 re2 = _mm_load_ps(p2 + 8 * i);
-      __m128 im2 = _mm_load_ps(p2 + 8 * i + 4);
-
-      __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2));
-      __m128 ip_im = _mm_sub_ps(_mm_mul_ps(re1, im2), _mm_mul_ps(im1, re2));
-
-      double re = detail::HorizontalSumSSE(ip_re);
-      double im = detail::HorizontalSumSSE(ip_im);
-
-      return std::complex<double>{re, im};
-    };
-
-    using Op = std::plus<std::complex<double>>;
-    return Base::for_.RunReduce(
-        MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get());
-  }
-
-  double RealInnerProduct(const State& state1, const State& state2) const {
-    if (state1.num_qubits() != state2.num_qubits()) {
-      return std::nan("");
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p1, const fp_type* p2) -> double {
-      __m128 re1 = _mm_load_ps(p1 + 8 * i);
-      __m128 im1 = _mm_load_ps(p1 + 8 * i + 4);
-      __m128 re2 = _mm_load_ps(p2 + 8 * i);
-      __m128 im2 = _mm_load_ps(p2 + 8 * i + 4);
-
-      __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2));
-
-      return detail::HorizontalSumSSE(ip_re);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduce(
-        MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get());
-  }
-
-  template <typename DistrRealType = double>
-  std::vector<uint64_t> Sample(
-      const State& state, uint64_t num_samples, unsigned seed) const {
-    std::vector<uint64_t> bitstrings;
-
-    if (num_samples > 0) {
-      double norm = 0;
-      uint64_t size = MinSize(state.num_qubits()) / 8;
-      const fp_type* p = state.get();
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 4; ++j) {
-          double re = p[8 * k + j];
-          double im = p[8 * k + 4 + j];
-          norm += re * re + im * im;
-        }
-      }
-
-      auto rs = GenerateRandomValues<DistrRealType>(num_samples, seed, norm);
-
-      uint64_t m = 0;
-      double csum = 0;
-      bitstrings.reserve(num_samples);
-
-      for (uint64_t k = 0; k < size; ++k) {
-        for (unsigned j = 0; j < 4; ++j) {
-          double re = p[8 * k + j];
-          double im = p[8 * k + 4 + j];
-          csum += re * re + im * im;
-          while (rs[m] < csum && m < num_samples) {
-            bitstrings.emplace_back(4 * k + j);
-            ++m;
-          }
-        }
-      }
-
-      for (; m < num_samples; ++m) {
-        bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1);
-      }
-    }
-
-    return bitstrings;
-  }
-
-  using MeasurementResult = typename Base::MeasurementResult;
-
-  void Collapse(const MeasurementResult& mr, State& state) const {
-    __m128 zero = _mm_set1_ps(0);
-
-    auto f1 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask,
-                 uint64_t bits, __m128 zero, const fp_type* p) -> double {
-      __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits));
-
-      __m128 re = _mm_load_ps(p + 8 * i);
-      __m128 im = _mm_load_ps(p + 8 * i + 4);
-      __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im));
-
-      s1 = _mm_blendv_ps(zero, s1, ml);
-
-      return detail::HorizontalSumSSE(s1);
-    };
-
-    using Op = std::plus<double>;
-    double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 8, f1,
-                                       Op(), mr.mask, mr.bits, zero,
-                                       state.get());
-
-    __m128 renorm = _mm_set1_ps(1.0 / std::sqrt(norm));
-
-    auto f2 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask,
-                 uint64_t bits, __m128 renorm, __m128 zero, fp_type* p) {
-      __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits));
-
-      __m128 re = _mm_load_ps(p + 8 * i);
-      __m128 im = _mm_load_ps(p + 8 * i + 4);
-
-      re = _mm_blendv_ps(zero, _mm_mul_ps(re, renorm), ml);
-      im = _mm_blendv_ps(zero, _mm_mul_ps(im, renorm), ml);
-
-      _mm_store_ps(p + 8 * i, re);
-      _mm_store_ps(p + 8 * i + 4, im);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f2,
-                   mr.mask, mr.bits, renorm, zero, state.get());
-  }
-
-  std::vector<double> PartialNorms(const State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* p) -> double {
-      __m128 re = _mm_load_ps(p + 8 * i);
-      __m128 im = _mm_load_ps(p + 8 * i + 4);
-      __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im));
-
-      return detail::HorizontalSumSSE(s1);
-    };
-
-    using Op = std::plus<double>;
-    return Base::for_.RunReduceP(
-        MinSize(state.num_qubits()) / 8, f, Op(), state.get());
-  }
-
-  uint64_t FindMeasuredBits(
-      unsigned m, double r, uint64_t mask, const State& state) const {
-    double csum = 0;
-
-    uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 8, m);
-    uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 8, m);
-
-    const fp_type* p = state.get();
-
-    for (uint64_t k = k0; k < k1; ++k) {
-      for (uint64_t j = 0; j < 4; ++j) {
-        auto re = p[8 * k + j];
-        auto im = p[8 * k + 4 + j];
-        csum += re * re + im * im;
-        if (r < csum) {
-          return (4 * k + j) & mask;
-        }
-      }
-    }
-
-    // Return the last bitstring in the unlikely case of underflow.
-    return (4 * k1 - 1) & mask;
-  }
-};
-
-}  // namespace qsim
-
-#endif  // STATESPACE_SSE_H_
diff --git a/tpls/qsim/umux.h b/tpls/qsim/umux.h
deleted file mode 100644
index 83b951b..0000000
--- a/tpls/qsim/umux.h
+++ /dev/null
@@ -1,52 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UMUX_H_
-#define UMUX_H_
-
-#ifdef __AVX512F__
-# include "unitary_calculator_avx512.h"
-  namespace qsim {
-  namespace unitary {
-    template <typename For>
-    using UnitaryCalculator = UnitaryCalculatorAVX512<For>;
-  }
-  }
-#elif __AVX2__
-# include "unitary_calculator_avx.h"
-  namespace qsim {
-  namespace unitary {
-    template <typename For>
-    using UnitaryCalculator = UnitaryCalculatorAVX<For>;
-  }
-  }
-#elif __SSE4_1__
-# include "unitary_calculator_sse.h"
-  namespace qsim {
-  namespace unitary {
-    template <typename For>
-    using UnitaryCalculator = UnitaryCalculatorSSE<For>;
-  }
-  }
-#else
-# include "unitary_calculator_basic.h"
-  namespace qsim {
-  namespace unitary {
-    template <typename For>
-    using UnitaryCalculator = UnitaryCalculatorBasic<For>;
-  }
-  }
-#endif
-
-#endif  // UMUX_H_
diff --git a/tpls/qsim/unitary_calculator_avx.h b/tpls/qsim/unitary_calculator_avx.h
deleted file mode 100644
index 5e566ca..0000000
--- a/tpls/qsim/unitary_calculator_avx.h
+++ /dev/null
@@ -1,1028 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARY_CALCULATOR_AVX_H_
-#define UNITARY_CALCULATOR_AVX_H_
-
-#include <immintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "unitaryspace_avx.h"
-
-namespace qsim {
-namespace unitary {
-
-/**
- * Quantum circuit unitary calculator with AVX vectorization.
- */
-template <typename For>
-class UnitaryCalculatorAVX final : public SimulatorBase {
- public:
-  using UnitarySpace = UnitarySpaceAVX<For>;
-  using Unitary = typename UnitarySpace::Unitary;
-  using fp_type = typename UnitarySpace::fp_type;
-
-  using StateSpace = UnitarySpace;
-  using State = Unitary;
-
-  template <typename... ForArgs>
-  explicit UnitaryCalculatorAVX(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using AVX instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 2) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 2) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 2) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 2) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 3>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 2) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 3>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 2) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 2) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 2) {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<3, 3>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using AVX instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 2) {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 2) {
-          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 8;
-  }
-
- private:
-
-#ifdef __BMI2__
-
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    auto m = GetMasks1<H, 3>(qs);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, m.imaskh, m.qmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m256i* idx,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm256_load_ps(p0 + p);
-        is[k2] = _mm256_load_ps(p0 + p + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 3>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    auto m = GetMasks3<H, 3>(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned k = 3 + H + cqs.size();
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm256_load_ps(p0 + p);
-        is[k] = _mm256_load_ps(p0 + p + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks4<H, 3>(state.num_qubits(), qs, cqs, cvals);
-    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 3 + H + cqs.size() - m.cl;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                const __m256i* idx, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm256_load_ps(p0 + p);
-        is[k2] = _mm256_load_ps(p0 + p + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm256_store_ps(p0 + p, rn);
-        _mm256_store_ps(p0 + p + 8, in);
-      }
-    };
-
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    if (CH) {
-      auto m = GetMasks5<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned k = 3 + H + cqs.size();
-      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
-               m.cvalsh, idx, size, raw_size, state.get());
-    } else {
-      auto m = GetMasks6<H, L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 3>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned k = 3 + H + cqs.size() - m.cl;
-      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
-               m.cvalsh, idx, size, raw_size, state.get());
-    }
-  }
-
-#else  // __BMI2__
-
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = 8 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, const __m256i* idx,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      uint64_t r = 8 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        rs[k2] = _mm256_load_ps(p0 + xss[k]);
-        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 ru, iu, rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = 8 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm256_set1_ps(v[j]);
-        iu = _mm256_set1_ps(v[j + 1]);
-        rn = _mm256_mul_ps(rs[0], ru);
-        in = _mm256_mul_ps(rs[0], iu);
-        rn = _mm256_fnmadd_ps(is[0], iu, rn);
-        in = _mm256_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm256_set1_ps(v[j]);
-          iu = _mm256_set1_ps(v[j + 1]);
-          rn = _mm256_fmadd_ps(rs[l], ru, rn);
-          in = _mm256_fmadd_ps(rs[l], iu, in);
-          rn = _mm256_fnmadd_ps(is[l], iu, rn);
-          in = _mm256_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m256 rn, in;
-      __m256 rs[hsize], is[hsize];
-
-      uint64_t r = 8 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm256_load_ps(p0 + xss[k]);
-        is[k] = _mm256_load_ps(p0 + xss[k] + 8);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-    FillControlledMatrixH<H, 3>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, const __m256i* idx, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m256 rn, in;
-      __m256 rs[gsize], is[gsize];
-
-      uint64_t r = 8 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm256_load_ps(p0 + xss[k]);
-        is[k2] = _mm256_load_ps(p0 + xss[k] + 8);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]);
-          is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm256_mul_ps(rs[0], w[j]);
-        in = _mm256_mul_ps(rs[0], w[j + 1]);
-        rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm256_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm256_fmadd_ps(rs[l], w[j], rn);
-          in = _mm256_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm256_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        _mm256_store_ps(p0 + xss[k], rn);
-        _mm256_store_ps(p0 + xss[k] + 8, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m256i idx[1 << L];
-    __m256 w[1 << (1 + 2 * H + L)];
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 3 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    if (CH) {
-      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 3>(m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size * size2, f, w, ms, xss, m.cvalsh,
-               m.cmaskh, idx, size, raw_size, state.get());
-    } else {
-      auto m = GetMasks10<L, 3>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 3>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size * size2, f, w, ms, xss, m.cvalsh,
-               m.cmaskh, idx, size, raw_size, state.get());
-    }
-  }
-
-#endif  // __BMI2__
-
-  template <unsigned L>
-  static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) {
-    constexpr unsigned lsize = 1 << L;
-
-    for (unsigned i = 0; i < lsize - 1; ++i) {
-      unsigned p[8];
-
-      for (unsigned j = 0; j < 8; ++j) {
-        p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
-      }
-
-      idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]);
-    }
-  }
-
-  For for_;
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARY_CALCULATOR_AVX_H_
diff --git a/tpls/qsim/unitary_calculator_avx512.h b/tpls/qsim/unitary_calculator_avx512.h
deleted file mode 100644
index 8105367..0000000
--- a/tpls/qsim/unitary_calculator_avx512.h
+++ /dev/null
@@ -1,644 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARY_CALCULATOR_AVX512_H_
-#define UNITARY_CALCULATOR_AVX512_H_
-
-#include <immintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "unitaryspace_avx512.h"
-
-namespace qsim {
-namespace unitary {
-
-/**
- * Quantum circuit unitary calculator with AVX512 vectorization.
- */
-template <typename For>
-class UnitaryCalculatorAVX512 final : public SimulatorBase {
- public:
-  using UnitarySpace = UnitarySpaceAVX512<For>;
-  using Unitary = typename UnitarySpace::Unitary;
-  using fp_type = typename UnitarySpace::fp_type;
-
-  using StateSpace = UnitarySpace;
-  using State = Unitary;
-
-  template <typename... ForArgs>
-  explicit UnitaryCalculatorAVX512(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using AVX512 instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 3) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 3) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 3) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 3>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 3) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<1, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 4>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 3) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<2, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 4>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 3) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 3) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else if (qs[2] > 3) {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      } else if (qs[3] > 3) {
-        ApplyGateL<3, 3>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 4>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using AVX512 instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[2] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[3] > 3) {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 3) {
-          ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 16;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 ru, iu, rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm512_set1_ps(v[j]);
-        iu = _mm512_set1_ps(v[j + 1]);
-        rn = _mm512_mul_ps(rs[0], ru);
-        in = _mm512_mul_ps(rs[0], iu);
-        rn = _mm512_fnmadd_ps(is[0], iu, rn);
-        in = _mm512_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm512_set1_ps(v[j]);
-          iu = _mm512_set1_ps(v[j + 1]);
-          rn = _mm512_fmadd_ps(rs[l], ru, rn);
-          in = _mm512_fmadd_ps(rs[l], iu, in);
-          rn = _mm512_fnmadd_ps(is[l], iu, rn);
-          in = _mm512_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    auto m = GetMasks1<H, 4>(qs);
-
-    unsigned k = 4 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, m.imaskh, m.qmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, const __m512i* idx,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m512 rn, in;
-      __m512 rs[gsize], is[gsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm512_load_ps(p0 + p);
-        is[k2] = _mm512_load_ps(p0 + p + 16);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
-          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512i idx[1 << L];
-    __m512 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks2<H, L, 4>(qs);
-    FillPermutationIndices<L>(m.qmaskl, idx);
-    FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 4 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 ru, iu, rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm512_set1_ps(v[j]);
-        iu = _mm512_set1_ps(v[j + 1]);
-        rn = _mm512_mul_ps(rs[0], ru);
-        in = _mm512_mul_ps(rs[0], iu);
-        rn = _mm512_fnmadd_ps(is[0], iu, rn);
-        in = _mm512_fmadd_ps(is[0], ru, in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm512_set1_ps(v[j]);
-          iu = _mm512_set1_ps(v[j + 1]);
-          rn = _mm512_fmadd_ps(rs[l], ru, rn);
-          in = _mm512_fmadd_ps(rs[l], iu, in);
-          rn = _mm512_fnmadd_ps(is[l], iu, rn);
-          in = _mm512_fmadd_ps(is[l], ru, in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    auto m = GetMasks3<H, 4>(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned k = 4 + H + cqs.size();
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m512 rn, in;
-      __m512 rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k] = _mm512_load_ps(p0 + p);
-        is[k] = _mm512_load_ps(p0 + p + 16);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks4<H, 4>(state.num_qubits(), qs, cqs, cvals);
-    FillControlledMatrixH<H, 4>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 4 + H + cqs.size() - m.cl;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w,
-                uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh,
-                const __m512i* idx, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m512 rn, in;
-      __m512 rs[gsize], is[gsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh);
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        rs[k2] = _mm512_load_ps(p0 + p);
-        is[k2] = _mm512_load_ps(p0 + p + 16);
-
-        for (unsigned l = 1; l < lsize; ++l) {
-          rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]);
-          is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm512_mul_ps(rs[0], w[j]);
-        in = _mm512_mul_ps(rs[0], w[j + 1]);
-        rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn);
-        in = _mm512_fmadd_ps(is[0], w[j], in);
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm512_fmadd_ps(rs[l], w[j], rn);
-          in = _mm512_fmadd_ps(rs[l], w[j + 1], in);
-          rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn);
-          in = _mm512_fmadd_ps(is[l], w[j], in);
-
-          j += 2;
-        }
-
-        uint64_t p = _pdep_u64(k, qmaskh);
-
-        _mm512_store_ps(p0 + p, rn);
-        _mm512_store_ps(p0 + p + 16, in);
-      }
-    };
-
-    __m512i idx[1 << L];
-    __m512 w[1 << (1 + 2 * H + L)];
-
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    if (CH) {
-      auto m = GetMasks5<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillMatrix<H, L, 4>(m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned k = 4 + H + cqs.size();
-      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
-               m.cvalsh, idx, size, raw_size, state.get());
-    } else {
-      auto m = GetMasks6<H, L, 4>(state.num_qubits(), qs, cqs, cvals);
-      FillPermutationIndices<L>(m.qmaskl, idx);
-      FillControlledMatrixL<H, L, 4>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      unsigned k = 4 + H + cqs.size() - m.cl;
-      unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-      uint64_t size = uint64_t{1} << n;
-
-      for_.Run(size * size2, f, w, m.imaskh, m.qmaskh,
-               m.cvalsh, idx, size, raw_size, state.get());
-    }
-  }
-
-  template <unsigned L>
-  static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) {
-    constexpr unsigned lsize = 1 << L;
-
-    for (unsigned i = 0; i < lsize; ++i) {
-      unsigned p[16];
-
-      for (unsigned j = 0; j < 16; ++j) {
-        p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl));
-      }
-
-      idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10],
-                                p[9], p[8], p[7], p[6], p[5], p[4],
-                                p[3], p[2], p[1], p[0]);
-    }
-  }
-
-  For for_;
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARY_CALCULATOR_AVX512_H_
diff --git a/tpls/qsim/unitary_calculator_basic.h b/tpls/qsim/unitary_calculator_basic.h
deleted file mode 100644
index 6b1821a..0000000
--- a/tpls/qsim/unitary_calculator_basic.h
+++ /dev/null
@@ -1,259 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARY_CALCULATOR_BASIC_H_
-#define UNITARY_CALCULATOR_BASIC_H_
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "unitaryspace_basic.h"
-
-namespace qsim {
-namespace unitary {
-
-/**
- * Quantum circuit unitary calculator without vectorization.
- */
-template <typename For, typename FP = float>
-class UnitaryCalculatorBasic final : public SimulatorBase {
- public:
-  using UnitarySpace = UnitarySpaceBasic<For, FP>;
-  using Unitary = typename UnitarySpace::Unitary;
-  using fp_type = typename UnitarySpace::fp_type;
-
-  using StateSpace = UnitarySpace;
-  using State = Unitary;
-
-  template <typename... ForArgs>
-  explicit UnitaryCalculatorBasic(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using non-vectorized instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      ApplyGateH<1>(qs, matrix, state);
-      break;
-    case 2:
-      ApplyGateH<2>(qs, matrix, state);
-      break;
-    case 3:
-      ApplyGateH<3>(qs, matrix, state);
-      break;
-    case 4:
-      ApplyGateH<4>(qs, matrix, state);
-      break;
-    case 5:
-      ApplyGateH<5>(qs, matrix, state);
-      break;
-    case 6:
-      ApplyGateH<6>(qs, matrix, state);
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using non-vectorized instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 1:
-      ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state);
-      break;
-    case 2:
-      ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state);
-      break;
-    case 3:
-      ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state);
-      break;
-    case 4:
-      ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state);
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 1;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      fp_type rn, in;
-      fp_type rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = *(p0 + xss[k]);
-        is[k] = *(p0 + xss[k] + 1);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = rs[0] * v[j] - is[0] * v[j + 1];
-        in = rs[0] * v[j + 1] + is[0] * v[j];
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn += rs[l] * v[j] - is[l] * v[j + 1];
-          in += rs[l] * v[j + 1] + is[l] * v[j];
-
-          j += 2;
-        }
-
-        *(p0 + xss[k]) = rn;
-        *(p0 + xss[k] + 1) = in;
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateH(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs,
-                            uint64_t cvals, const fp_type* matrix,
-                            State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      fp_type rn, in;
-      fp_type rs[hsize], is[hsize];
-
-      uint64_t r = i % size;
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) == cvalsh) {
-        auto p0 = rstate + row_size * s + 2 * t;
-
-        for (unsigned k = 0; k < hsize; ++k) {
-          rs[k] = *(p0 + xss[k]);
-          is[k] = *(p0 + xss[k] + 1);
-        }
-
-        uint64_t j = 0;
-
-        for (unsigned k = 0; k < hsize; ++k) {
-          rn = rs[0] * v[j] - is[0] * v[j + 1];
-          in = rs[0] * v[j + 1] + is[0] * v[j];
-
-          j += 2;
-
-          for (unsigned l = 1; l < hsize; ++l) {
-            rn += rs[l] * v[j] - is[l] * v[j + 1];
-            in += rs[l] * v[j + 1] + is[l] * v[j];
-
-            j += 2;
-          }
-
-          *(p0 + xss[k]) = rn;
-          *(p0 + xss[k] + 1) = in;
-        }
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-
-    unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
-  }
-
-  For for_;
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARY_CALCULATOR_BASIC_H_
diff --git a/tpls/qsim/unitary_calculator_sse.h b/tpls/qsim/unitary_calculator_sse.h
deleted file mode 100644
index a3c3f2e..0000000
--- a/tpls/qsim/unitary_calculator_sse.h
+++ /dev/null
@@ -1,639 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARY_CALCULATOR_SSE_H_
-#define UNITARY_CALCULATOR_SSE_H_
-
-#include <smmintrin.h>
-
-#include <complex>
-#include <cstdint>
-#include <functional>
-#include <vector>
-
-#include "simulator.h"
-#include "unitaryspace_sse.h"
-
-namespace qsim {
-namespace unitary {
-
-/**
- * Quantum circuit unitary calculator with SSE vectorization.
- */
-template <typename For>
-class UnitaryCalculatorSSE final : public SimulatorBase {
- public:
-  using UnitarySpace = UnitarySpaceSSE<For>;
-  using Unitary = typename UnitarySpace::Unitary;
-  using fp_type = typename UnitarySpace::fp_type;
-
-  using StateSpace = UnitarySpace;
-  using State = Unitary;
-
-  template <typename... ForArgs>
-  explicit UnitaryCalculatorSSE(ForArgs&&... args) : for_(args...) {}
-
-  /**
-   * Applies a gate using SSE instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyGate(const std::vector<unsigned>& qs,
-                 const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 1) {
-        ApplyGateH<1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 1>(qs, matrix, state);
-      }
-      break;
-    case 2:
-      if (qs[0] > 1) {
-        ApplyGateH<2>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<1, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<0, 2>(qs, matrix, state);
-      }
-      break;
-    case 3:
-      if (qs[0] > 1) {
-        ApplyGateH<3>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<2, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<1, 2>(qs, matrix, state);
-      }
-      break;
-    case 4:
-      if (qs[0] > 1) {
-        ApplyGateH<4>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<3, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<2, 2>(qs, matrix, state);
-      }
-      break;
-    case 5:
-      if (qs[0] > 1) {
-        ApplyGateH<5>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<4, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<3, 2>(qs, matrix, state);
-      }
-      break;
-    case 6:
-      if (qs[0] > 1) {
-        ApplyGateH<6>(qs, matrix, state);
-      } else if (qs[1] > 1) {
-        ApplyGateL<5, 1>(qs, matrix, state);
-      } else {
-        ApplyGateL<4, 2>(qs, matrix, state);
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * Applies a controlled gate using SSE instructions.
-   * @param qs Indices of the qubits affected by this gate.
-   * @param cqs Indices of control qubits.
-   * @param cvals Bit mask of control qubit values.
-   * @param matrix Matrix representation of the gate to be applied.
-   * @param state The state of the system, to be updated by this method.
-   */
-  void ApplyControlledGate(const std::vector<unsigned>& qs,
-                           const std::vector<unsigned>& cqs, uint64_t cvals,
-                           const fp_type* matrix, State& state) const {
-    // Assume qs[0] < qs[1] < qs[2] < ... .
-    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
-
-    if (cqs.size() == 0) {
-      ApplyGate(qs, matrix, state);
-      return;
-    }
-
-    switch (qs.size()) {
-    case 1:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 2:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 3:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    case 4:
-      if (qs[0] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
-        }
-      } else if (qs[1] > 1) {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
-        }
-      } else {
-        if (cqs[0] > 1) {
-          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
-        } else {
-          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
-        }
-      }
-      break;
-    default:
-      // Not implemented.
-      break;
-    }
-  }
-
-  /**
-   * @return The size of SIMD register if applicable.
-   */
-  static unsigned SIMDRegisterSize() {
-    return 4;
-  }
-
- private:
-  template <unsigned H>
-  void ApplyGateH(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t size,
-                uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 ru, iu, rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      uint64_t r = 4 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm_set1_ps(v[j]);
-        iu = _mm_set1_ps(v[j + 1]);
-        rn = _mm_mul_ps(rs[0], ru);
-        in = _mm_mul_ps(rs[0], iu);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm_set1_ps(v[j]);
-          iu = _mm_set1_ps(v[j + 1]);
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L>
-  void ApplyGateL(const std::vector<unsigned>& qs,
-                  const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, unsigned q0,
-                uint64_t size, uint64_t row_size, fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m128 rn, in;
-      __m128 rs[gsize], is[gsize];
-
-      uint64_t r = 4 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm_load_ps(p0 + xss[k]);
-        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
-
-        if (L == 1) {
-          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
-                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
-                               : _mm_shuffle_ps(is[k2], is[k2], 78);
-        } else if (L == 2) {
-          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
-          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
-          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
-          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
-          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H + L)];
-
-    auto m = GetMasks11<L>(qs);
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-    FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,  w, ms, xss, qs[0], size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHH(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 ru, iu, rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      uint64_t r = 4 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        ru = _mm_set1_ps(v[j]);
-        iu = _mm_set1_ps(v[j + 1]);
-        rn = _mm_mul_ps(rs[0], ru);
-        in = _mm_mul_ps(rs[0], iu);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          ru = _mm_set1_ps(v[j]);
-          iu = _mm_set1_ps(v[j + 1]);
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-
-    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H>
-  void ApplyControlledGateHL(const std::vector<unsigned>& qs,
-                             const std::vector<unsigned>& cqs, uint64_t cvals,
-                             const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned hsize = 1 << H;
-
-      __m128 rn, in;
-      __m128 rs[hsize], is[hsize];
-
-      uint64_t r = 4 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rs[k] = _mm_load_ps(p0 + xss[k]);
-        is[k] = _mm_load_ps(p0 + xss[k] + 4);
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < hsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H)];
-
-    auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals);
-    FillIndices<H>(state.num_qubits(), qs, ms, xss);
-    FillControlledMatrixH<H, 2>(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    for_.Run(size * size2, f,
-             w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get());
-  }
-
-  template <unsigned H, unsigned L, bool CH>
-  void ApplyControlledGateL(const std::vector<unsigned>& qs,
-                            const std::vector<unsigned>& cqs, uint64_t cvals,
-                            const fp_type* matrix, State& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
-                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
-                uint64_t cmaskh, unsigned q0, uint64_t size, uint64_t row_size,
-                fp_type* rstate) {
-      constexpr unsigned gsize = 1 << (H + L);
-      constexpr unsigned hsize = 1 << H;
-      constexpr unsigned lsize = 1 << L;
-
-      __m128 rn, in;
-      __m128 rs[gsize], is[gsize];
-
-      uint64_t r = 4 * (i % size);
-      uint64_t s = i / size;
-
-      uint64_t t = r & ms[0];
-      for (unsigned j = 1; j <= H; ++j) {
-        r *= 2;
-        t |= r & ms[j];
-      }
-
-      if ((t & cmaskh) != cvalsh) return;
-
-      auto p0 = rstate + row_size * s + 2 * t;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        unsigned k2 = lsize * k;
-
-        rs[k2] = _mm_load_ps(p0 + xss[k]);
-        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
-
-        if (L == 1) {
-          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
-                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
-                               : _mm_shuffle_ps(is[k2], is[k2], 78);
-        } else if (L == 2) {
-          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
-          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
-          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
-          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
-          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
-          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
-        }
-      }
-
-      uint64_t j = 0;
-
-      for (unsigned k = 0; k < hsize; ++k) {
-        rn = _mm_mul_ps(rs[0], w[j]);
-        in = _mm_mul_ps(rs[0], w[j + 1]);
-        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
-        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
-
-        j += 2;
-
-        for (unsigned l = 1; l < gsize; ++l) {
-          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
-          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
-          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
-          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
-
-          j += 2;
-        }
-
-        _mm_store_ps(p0 + xss[k], rn);
-        _mm_store_ps(p0 + xss[k] + 4, in);
-      }
-    };
-
-    uint64_t ms[H + 1];
-    uint64_t xss[1 << H];
-    __m128 w[1 << (1 + 2 * H + L)];
-
-    FillIndices<H, L>(state.num_qubits(), qs, ms, xss);
-
-    unsigned k = 2 + H;
-    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
-    uint64_t size = uint64_t{1} << n;
-    uint64_t size2 = uint64_t{1} << state.num_qubits();
-    uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits());
-
-    if (CH) {
-      auto m = GetMasks9<L>(state.num_qubits(), qs, cqs, cvals);
-      FillMatrix<H, L, 2>(m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size * size2, f, w, ms, xss,
-               m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get());
-    } else {
-      auto m = GetMasks10<L, 2>(state.num_qubits(), qs, cqs, cvals);
-      FillControlledMatrixL<H, L, 2>(
-          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
-
-      for_.Run(size * size2, f, w, ms, xss,
-               m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get());
-    }
-  }
-
-  For for_;
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARY_CALCULATOR_SSE_H_
diff --git a/tpls/qsim/unitaryspace.h b/tpls/qsim/unitaryspace.h
deleted file mode 100644
index b5e2691..0000000
--- a/tpls/qsim/unitaryspace.h
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARYSPACE_H_
-#define UNITARYSPACE_H_
-
-#include <cstdint>
-
-namespace qsim {
-
-namespace unitary {
-
-/**
- * Abstract class containing routines for general unitary matrix manipulations.
- * "AVX", "AVX512", "Basic", and "SSE" implementations are provided.
- */
-template <typename Impl,
-          template<typename...> class VectorSpace, typename... VSTypeParams>
-class UnitarySpace : public VectorSpace<Impl, VSTypeParams...> {
- private:
-  using Base = VectorSpace<Impl, VSTypeParams...>;
-
- public:
-  using fp_type = typename Base::fp_type;
-  using Unitary = typename Base::Vector;
-
-  template <typename... ForArgs>
-  UnitarySpace(ForArgs&&... args) : Base(args...) {}
-
-  static Unitary CreateUnitary(unsigned num_qubits) {
-    return Base::Create(num_qubits);
-  }
-
-  static Unitary CreateUnitary(fp_type* p, unsigned num_qubits) {
-    return Base::Create(p, num_qubits);
-  }
-
-  static Unitary NullUnitary() {
-    return Base::Null();
-  }
-
-  static uint64_t Size(unsigned num_qubits) {
-    return uint64_t{1} << num_qubits;
-  };
-
-  void CopyUnitary(const Unitary& src, Unitary& dest) const {
-    Base::Copy(src, dest);
-  }
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARYSPACE_H_
diff --git a/tpls/qsim/unitaryspace_avx.h b/tpls/qsim/unitaryspace_avx.h
deleted file mode 100644
index c1ec59d..0000000
--- a/tpls/qsim/unitaryspace_avx.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARYSPACE_AVX_H_
-#define UNITARYSPACE_AVX_H_
-
-#include <immintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-
-#include "unitaryspace.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace unitary {
-
-/**
- * Object containing context and routines for unitary manipulations.
- * Unitary is a vectorized sequence of eight real components followed by eight
- * imaginary components. Eight single-precison floating numbers can be loaded
- * into an AVX register.
- */
-template <typename For>
-struct UnitarySpaceAVX :
-    public UnitarySpace<UnitarySpaceAVX<For>, VectorSpace, For, float> {
- private:
-  using Base = UnitarySpace<UnitarySpaceAVX<For>,
-                            qsim::VectorSpace, For, float>;
-
- public:
-  using Unitary = typename Base::Unitary;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit UnitarySpaceAVX(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinRowSize(unsigned num_qubits) {
-    return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return Base::Size(num_qubits) * MinRowSize(num_qubits);
-  };
-
-  void SetAllZeros(Unitary& state) const {
-    __m256 val0 = _mm256_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) {
-      _mm256_store_ps(p + 16 * i, val);
-      _mm256_store_ps(p + 16 * i + 8, val);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get());
-  }
-
-  void SetIdentity(Unitary& state) {
-    SetAllZeros(state);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                uint64_t row_size, fp_type* p) {
-      p[row_size * i + (16 * (i / 8)) + (i % 8)] = 1;
-    };
-
-    uint64_t size = Base::Size(state.num_qubits());
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    Base::for_.Run(size, f, row_size, state.get());
-  }
-
-  static std::complex<fp_type> GetEntry(const Unitary& state,
-                                        uint64_t i, uint64_t j) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (16 * (j / 8)) + (j % 8);
-    return std::complex<fp_type>(state.get()[row_size * i + k],
-                                 state.get()[row_size * i + k + 8]);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
-                       const std::complex<fp_type>& ampl) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (16 * (j / 8)) + (j % 8);
-    state.get()[row_size * i + k] = std::real(ampl);
-    state.get()[row_size * i + k + 8] = std::imag(ampl);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
-                       fp_type im) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (16 * (j / 8)) + (j % 8);
-    state.get()[row_size * i + k] = re;
-    state.get()[row_size * i + k + 8] = im;
-  }
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARYSPACE_AVX_H_
diff --git a/tpls/qsim/unitaryspace_avx512.h b/tpls/qsim/unitaryspace_avx512.h
deleted file mode 100644
index 4c23dc9..0000000
--- a/tpls/qsim/unitaryspace_avx512.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARYSPACE_AVX512_H_
-#define UNITARYSPACE_AVX512_H_
-
-#include <immintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-
-#include "unitaryspace.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace unitary {
-
-/**
- * Object containing context and routines for unitary manipulations.
- * State is a vectorized sequence of sixteen real components followed by
- * sixteen imaginary components. Sixteen single-precison floating numbers can
- * be loaded into an AVX512 register.
- */
-template <typename For>
-struct UnitarySpaceAVX512 :
-    public UnitarySpace<UnitarySpaceAVX512<For>, VectorSpace, For, float> {
- private:
-  using Base = UnitarySpace<UnitarySpaceAVX512<For>,
-                            qsim::VectorSpace, For, float>;
-
- public:
-  using Unitary = typename Base::Unitary;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit UnitarySpaceAVX512(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinRowSize(unsigned num_qubits) {
-    return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return Base::Size(num_qubits) * MinRowSize(num_qubits);
-  };
-
-  void SetAllZeros(Unitary& state) const {
-    __m512 val0 = _mm512_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) {
-      _mm512_store_ps(p + 32 * i, val0);
-      _mm512_store_ps(p + 32 * i + 16, val0);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get());
-  }
-
-  void SetIdentity(Unitary& state) {
-    SetAllZeros(state);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                uint64_t row_size, fp_type* p) {
-      p[row_size * i + (32 * (i / 16)) + (i % 16)] = 1;
-    };
-
-    uint64_t size = Base::Size(state.num_qubits());
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    Base::for_.Run(size, f, row_size, state.get());
-  }
-
-  static std::complex<fp_type> GetEntry(const Unitary& state,
-                                        uint64_t i, uint64_t j) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (32 * (j / 16)) + (j % 16);
-    return std::complex<fp_type>(state.get()[row_size * i + k],
-                                 state.get()[row_size * i + k + 16]);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
-                       const std::complex<fp_type>& ampl) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (32 * (j / 16)) + (j % 16);
-    state.get()[row_size * i + k] = std::real(ampl);
-    state.get()[row_size * i + k + 16] = std::imag(ampl);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
-                       fp_type im) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (32 * (j / 16)) + (j % 16);
-    state.get()[row_size * i + k] = re;
-    state.get()[row_size * i + k + 16] = im;
-  }
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARYSPACE_AVX512_H_
diff --git a/tpls/qsim/unitaryspace_basic.h b/tpls/qsim/unitaryspace_basic.h
deleted file mode 100644
index 2db14b6..0000000
--- a/tpls/qsim/unitaryspace_basic.h
+++ /dev/null
@@ -1,103 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARYSPACE_BASIC_H_
-#define UNITARYSPACE_BASIC_H_
-
-#include <cmath>
-#include <complex>
-#include <cstdint>
-
-#include "unitaryspace.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace unitary {
-
-/**
- * Object containing context and routines for unitary manipulations.
- * Unitary is a non-vectorized sequence of one real amplitude followed by
- * one imaginary amplitude.
- */
-template <typename For, typename FP>
-struct UnitarySpaceBasic
-    : public UnitarySpace<UnitarySpaceBasic<For, FP>, VectorSpace, For, FP> {
- private:
-  using Base = UnitarySpace<UnitarySpaceBasic<For, FP>,
-                            qsim::VectorSpace, For, FP>;
-
- public:
-  using Unitary = typename Base::Unitary;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit UnitarySpaceBasic(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinRowSize(unsigned num_qubits) {
-    return 2 * (uint64_t{1} << num_qubits);
-  };
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return Base::Size(num_qubits) * MinRowSize(num_qubits);
-  };
-
-  void SetAllZeros(Unitary& state) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) {
-      p[2 * i + 0] = 0;
-      p[2 * i + 1] = 0;
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get());
-  }
-
-  void SetIdentity(Unitary& state) {
-    SetAllZeros(state);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                uint64_t row_size, fp_type* p) {
-      p[row_size * i + 2 * i] = 1;
-    };
-
-    uint64_t size = Base::Size(state.num_qubits());
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    Base::for_.Run(size, f, row_size, state.get());
-  }
-
-  static std::complex<fp_type> GetEntry(const Unitary& state,
-                                        uint64_t i, uint64_t j) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    return std::complex<fp_type>(state.get()[row_size * i + 2 * j],
-                                 state.get()[row_size * i + 2 * j + 1]);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
-                       const std::complex<fp_type>& ampl) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    state.get()[row_size * i + 2 * j] = std::real(ampl);
-    state.get()[row_size * i + 2 * j + 1] = std::imag(ampl);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
-                       fp_type re, fp_type im) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    state.get()[row_size * i + 2 * j] = re;
-    state.get()[row_size * i + 2 * j + 1] = im;
-  }
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARYSPACE_BASIC_H_
diff --git a/tpls/qsim/unitaryspace_sse.h b/tpls/qsim/unitaryspace_sse.h
deleted file mode 100644
index f3762fb..0000000
--- a/tpls/qsim/unitaryspace_sse.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UNITARYSPACE_SSE_H_
-#define UNITARYSPACE_SSE_H_
-
-#include <smmintrin.h>
-
-#include <algorithm>
-#include <cmath>
-#include <complex>
-#include <cstdint>
-
-#include "unitaryspace.h"
-#include "vectorspace.h"
-
-namespace qsim {
-
-namespace unitary {
-
-/**
- * Object containing context and routines for unitary manipulations.
- * Unitary is a vectorized sequence of four real components followed by four
- * imaginary components. Four single-precison floating numbers can be loaded
- * into an SSE register.
- */
-template <typename For>
-struct UnitarySpaceSSE :
-    public UnitarySpace<UnitarySpaceSSE<For>, VectorSpace, For, float> {
- private:
-  using Base = UnitarySpace<UnitarySpaceSSE<For>,
-                            qsim::VectorSpace, For, float>;
-
- public:
-  using Unitary = typename Base::Unitary;
-  using fp_type = typename Base::fp_type;
-
-  template <typename... ForArgs>
-  explicit UnitarySpaceSSE(ForArgs&&... args) : Base(args...) {}
-
-  static uint64_t MinRowSize(unsigned num_qubits) {
-    return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits));
-  };
-
-  static uint64_t MinSize(unsigned num_qubits) {
-    return Base::Size(num_qubits) * MinRowSize(num_qubits);
-  };
-
-  void SetAllZeros(Unitary& state) const {
-    __m128 val0 = _mm_setzero_ps();
-
-    auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) {
-      _mm_store_ps(p + 8 * i, val0);
-      _mm_store_ps(p + 8 * i + 4, val0);
-    };
-
-    Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get());
-  }
-
-  void SetIdentity(Unitary& state) {
-    SetAllZeros(state);
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                uint64_t row_size, fp_type* p) {
-      p[row_size * i + (8 * (i / 4)) + (i % 4)] = 1;
-    };
-
-    uint64_t size = Base::Size(state.num_qubits());
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    Base::for_.Run(size, f, row_size, state.get());
-  }
-
-  static std::complex<fp_type> GetEntry(const Unitary& state,
-                                        uint64_t i, uint64_t j) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (8 * (j / 4)) + (j % 4);
-    return std::complex<fp_type>(state.get()[row_size * i + k],
-                                 state.get()[row_size * i + k + 4]);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j,
-                       const std::complex<fp_type>& ampl) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (8 * (j / 4)) + (j % 4);
-    state.get()[row_size * i + k] = std::real(ampl);
-    state.get()[row_size * i + k + 4] = std::imag(ampl);
-  }
-
-  static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re,
-                       fp_type im) {
-    uint64_t row_size = MinRowSize(state.num_qubits());
-    uint64_t k = (8 * (j / 4)) + (j % 4);
-    state.get()[row_size * i + k] = re;
-    state.get()[row_size * i + k + 4] = im;
-  }
-};
-
-}  // namespace unitary
-}  // namespace qsim
-
-#endif  // UNITARYSPACE_SSE_H_
diff --git a/tpls/qsim/util.h b/tpls/qsim/util.h
deleted file mode 100644
index 726a019..0000000
--- a/tpls/qsim/util.h
+++ /dev/null
@@ -1,89 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTIL_H_
-#define UTIL_H_
-
-#include <algorithm>
-#include <chrono>
-#include <random>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace qsim {
-
-template <typename Container>
-inline void SplitString(
-    const std::string& str, char delim, Container& words) {
-  words.resize(0);
-
-  std::string word;
-  std::stringstream ss(str);
-
-  while (std::getline(ss, word, delim)) {
-    words.push_back(std::move(word));
-  }
-}
-
-template <typename Op, typename Container>
-inline void SplitString(
-    const std::string& str, char delim, Op op, Container& words) {
-  words.resize(0);
-
-  std::string word;
-  std::stringstream ss(str);
-
-  while (std::getline(ss, word, delim)) {
-    words.push_back(op(word));
-  }
-}
-
-inline double GetTime() {
-  using namespace std::chrono;
-  steady_clock::duration since_epoch = steady_clock::now().time_since_epoch();
-  return double(since_epoch.count() * steady_clock::period::num)
-                                    / steady_clock::period::den;
-}
-
-template <typename DistrRealType, typename RGen>
-inline DistrRealType RandomValue(RGen& rgen, DistrRealType max_value) {
-  std::uniform_real_distribution<DistrRealType> distr(0.0, max_value);
-  return distr(rgen);
-}
-
-template <typename DistrRealType>
-inline std::vector<DistrRealType> GenerateRandomValues(
-    uint64_t num_samples, unsigned seed, DistrRealType max_value) {
-  std::vector<DistrRealType> rs;
-  rs.reserve(num_samples + 1);
-
-  std::mt19937 rgen(seed);
-  std::uniform_real_distribution<DistrRealType> distr(0.0, max_value);
-
-  for (uint64_t i = 0; i < num_samples; ++i) {
-    rs.emplace_back(distr(rgen));
-  }
-
-  std::sort(rs.begin(), rs.end());
-  // Populate the final element to prevent sanitizer errors.
-  rs.emplace_back(max_value);
-
-  return rs;
-}
-
-}  // namespace qsim
-
-#endif  // UTIL_H_
diff --git a/tpls/qsim/util_cpu.h b/tpls/qsim/util_cpu.h
deleted file mode 100644
index 8e02425..0000000
--- a/tpls/qsim/util_cpu.h
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTIL_CPU_H_
-#define UTIL_CPU_H_
-
-#ifdef __SSE2__
-# include <immintrin.h>
-#endif
-
-namespace qsim {
-
-// This function sets flush-to-zero and denormals-are-zeros MXCSR control
-// flags. This prevents rare cases of performance slowdown potentially at
-// the cost of a tiny precision loss.
-inline void SetFlushToZeroAndDenormalsAreZeros() {
-#ifdef __SSE2__
-  _mm_setcsr(_mm_getcsr() | 0x8040);
-#endif
-}
-
-// This function clears flush-to-zero and denormals-are-zeros MXCSR control
-// flags.
-inline void ClearFlushToZeroAndDenormalsAreZeros() {
-#ifdef __SSE2__
-  _mm_setcsr(_mm_getcsr() & ~unsigned{0x8040});
-#endif
-}
-
-}  // namespace qsim
-
-#endif  // UTIL_CPU_H_
diff --git a/tpls/qsim/util_cuda.h b/tpls/qsim/util_cuda.h
deleted file mode 100644
index 5d8cb5d..0000000
--- a/tpls/qsim/util_cuda.h
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTIL_CUDA_H_
-#define UTIL_CUDA_H_
-
-#ifdef __NVCC__
-  #include <cuda.h>
-#elif __HIP__
-  #include <hip/hip_runtime.h>
-#endif
-
-#include <cstdlib>
-
-#include "io.h"
-
-namespace qsim {
-
-#define ErrorCheck(code) { ErrorAssert((code), __FILE__, __LINE__); }
-
-inline void ErrorAssert(cudaError_t code, const char* file, unsigned line) {
-  if (code != cudaSuccess) {
-    IO::errorf("CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
-    exit(code);
-  }
-}
-
-template <typename T>
-struct Complex {
-  __host__ __device__ __forceinline__ Complex() {}
-
-  __host__ __device__ __forceinline__ Complex(const T& re) : re(re), im(0) {}
-
-  __host__ __device__ __forceinline__ Complex(const T& re, const T& im)
-      : re(re), im(im) {}
-
-  template <typename U>
-  __host__ __device__ __forceinline__ Complex<T>& operator=(
-      const Complex<U>& r) {
-    re = r.re;
-    im = r.im;
-
-    return *this;
-  }
-
-  T re;
-  T im;
-};
-
-template <typename T>
-__host__ __device__ __forceinline__ Complex<T> operator+(
-    const Complex<T>& l, const Complex<T>& r) {
-  return Complex<T>(l.re + r.re, l.im + r.im);
-}
-
-template <typename T, typename U>
-__host__ __device__ __forceinline__ Complex<T> operator+(
-    const Complex<T>& l, const Complex<U>& r) {
-  return Complex<T>(l.re + r.re, l.im + r.im);
-}
-
-template <typename T>
-struct Scalar {
-  using type = T;
-};
-
-template <typename T>
-struct Scalar<Complex<T>> {
-  using type = T;
-};
-
-template <typename T>
-struct Plus {
-  template <typename U>
-  __device__ __forceinline__ T operator()(const T& v1, const U& v2) const {
-    return v1 + v2;
-  }
-};
-
-template <typename T>
-struct Product {
-  __device__ __forceinline__ Complex<T> operator()(
-      const T& re1, const T& im1, const T& re2, const T& im2) const {
-    return Complex<T>(re1 * re2 + im1 * im2, re1 * im2 - im1 * re2);
-  }
-};
-
-template <typename T>
-struct RealProduct {
-  __device__ __forceinline__ T operator()(
-      const T& re1, const T& im1, const T& re2, const T& im2) const {
-    return re1 * re2 + im1 * im2;
-  }
-};
-
-template <typename FP1, typename Op, unsigned warp_size = 32>
-__device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) {
-  for (unsigned i = warp_size / 2; i > 0; i /= 2) {
-    val = op(val, __shfl_down_sync(0xffffffff, val, i));
-  }
-
-  return val;
-}
-
-template <typename FP1, typename Op, unsigned warp_size = 32>
-__device__ __forceinline__ Complex<FP1> WarpReduce(Complex<FP1> val, Op op) {
-  for (unsigned i = warp_size / 2; i > 0; i /= 2) {
-    val.re = op(val.re, __shfl_down_sync(0xffffffff, val.re, i));
-    val.im = op(val.im, __shfl_down_sync(0xffffffff, val.im, i));
-  }
-
-  return val;
-}
-
-}  // namespace qsim
-
-#endif  // UTIL_CUDA_H_
diff --git a/tpls/qsim/util_custatevec.h b/tpls/qsim/util_custatevec.h
deleted file mode 100644
index 36f29ef..0000000
--- a/tpls/qsim/util_custatevec.h
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef UTIL_CUSTATEVEC_H_
-#define UTIL_CUSTATEVEC_H_
-
-#include <cublas_v2.h>
-#include <custatevec.h>
-
-#include "io.h"
-#include "util_cuda.h"
-
-namespace qsim {
-
-inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) {
-  if (code != CUBLAS_STATUS_SUCCESS) {
-    IO::errorf("cuBLAS error %i: %s %d\n", code, file, line);
-    exit(code);
-  }
-}
-
-inline void ErrorAssert(
-    custatevecStatus_t code, const char* file, unsigned line) {
-  if (code != CUSTATEVEC_STATUS_SUCCESS) {
-    IO::errorf("custatevec error: %s %s %d\n",
-                custatevecGetErrorString(code), file, line);
-    exit(code);
-  }
-}
-
-}  // namespace qsim
-
-#endif  // UTIL_CUSTATEVEC_H_
diff --git a/tpls/qsim/vectorspace.h b/tpls/qsim/vectorspace.h
deleted file mode 100644
index 7b33a53..0000000
--- a/tpls/qsim/vectorspace.h
+++ /dev/null
@@ -1,185 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef VECTORSPACE_H_
-#define VECTORSPACE_H_
-
-#ifdef _WIN32
-  #include <malloc.h>
-#endif
-
-#include <cstdint>
-#include <cstdlib>
-#include <memory>
-#include <utility>
-
-namespace qsim {
-
-namespace detail {
-
-inline void do_not_free(void*) {}
-
-inline void free(void* ptr) {
-#ifdef _WIN32
-  _aligned_free(ptr);
-#else
-  ::free(ptr);
-#endif
-}
-
-}  // namespace detail
-
-// Routines for vector manipulations.
-template <typename Impl, typename For, typename FP>
-class VectorSpace {
- public:
-  using fp_type = FP;
-
- private:
-  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
-
- public:
-  class Vector {
-   public:
-    Vector() = delete;
-
-    Vector(Pointer&& ptr, unsigned num_qubits)
-        : ptr_(std::move(ptr)), num_qubits_(num_qubits) {}
-
-    fp_type* get() {
-      return ptr_.get();
-    }
-
-    const fp_type* get() const {
-      return ptr_.get();
-    }
-
-    fp_type* release() {
-      num_qubits_ = 0;
-      return ptr_.release();
-    }
-
-    unsigned num_qubits() const {
-      return num_qubits_;
-    }
-
-    bool requires_copy_to_host() const {
-      return false;
-    }
-
-   private:
-    Pointer ptr_;
-    unsigned num_qubits_;
-  };
-
-  template <typename... ForArgs>
-  VectorSpace(ForArgs&&... args) : for_(args...) {}
-
-  static Vector Create(unsigned num_qubits) {
-    auto size = sizeof(fp_type) * Impl::MinSize(num_qubits);
-    #ifdef _WIN32
-      Pointer ptr{(fp_type*) _aligned_malloc(size, 64), &detail::free};
-      return Vector{std::move(ptr), ptr.get() != nullptr ? num_qubits : 0};
-    #else
-      void* p = nullptr;
-      if (posix_memalign(&p, 64, size) == 0) {
-        return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits};
-      } else {
-        return Null();
-      }
-    #endif
-  }
-
-  // It is the client's responsibility to make sure that p has at least
-  // Impl::MinSize(num_qubits) elements.
-  static Vector Create(fp_type* p, unsigned num_qubits) {
-    return Vector{Pointer{p, &detail::do_not_free}, num_qubits};
-  }
-
-  static Vector Null() {
-    return Vector{Pointer{nullptr, &detail::free}, 0};
-  }
-
-  static bool IsNull(const Vector& vec) {
-    return vec.get() == nullptr;
-  }
-
-  static void Free(fp_type* ptr) {
-    detail::free(ptr);
-  }
-
-  bool Copy(const Vector& src, Vector& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* src, fp_type* dest) {
-      dest[i] = src[i];
-    };
-
-    for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest.get());
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that dest has at least
-  // Impl::MinSize(src.num_qubits()) elements.
-  bool Copy(const Vector& src, fp_type* dest) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* src, fp_type* dest) {
-      dest[i] = src[i];
-    };
-
-    for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest);
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that src has at least
-  // Impl::MinSize(dest.num_qubits()) elements.
-  bool Copy(const fp_type* src, Vector& dest) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* src, fp_type* dest) {
-      dest[i] = src[i];
-    };
-
-    for_.Run(Impl::MinSize(dest.num_qubits()), f, src, dest.get());
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that src has at least
-  // min(size, Impl::MinSize(dest.num_qubits())) elements.
-  bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
-    auto f = [](unsigned n, unsigned m, uint64_t i,
-                const fp_type* src, fp_type* dest) {
-      dest[i] = src[i];
-    };
-
-    size = std::min(size, Impl::MinSize(dest.num_qubits()));
-    for_.Run(size, f, src, dest.get());
-
-    return true;
-  }
-
-  void DeviceSync() {}
-
- protected:
-  For for_;
-};
-
-}  // namespace qsim
-
-#endif  // VECTORSPACE_H_
diff --git a/tpls/qsim/vectorspace_cuda.h b/tpls/qsim/vectorspace_cuda.h
deleted file mode 100644
index fd91553..0000000
--- a/tpls/qsim/vectorspace_cuda.h
+++ /dev/null
@@ -1,172 +0,0 @@
-// Copyright 2019 Google LLC. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     https://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef VECTORSPACE_CUDA_H_
-#define VECTORSPACE_CUDA_H_
-
-#ifdef __NVCC__
-  #include <cuda.h>
-  #include <cuda_runtime.h>
-#elif __HIP__
-  #include <hip/hip_runtime.h>
-  #include "cuda2hip.h"
-#endif
-
-#include <memory>
-#include <utility>
-
-namespace qsim {
-
-namespace detail {
-
-inline void do_not_free(void*) {}
-
-inline void free(void* ptr) {
-  ErrorCheck(cudaFree(ptr));
-}
-
-}  // namespace detail
-
-// Routines for vector manipulations.
-template <typename Impl, typename FP>
-class VectorSpaceCUDA {
- public:
-  using fp_type = FP;
-
- private:
-  using Pointer = std::unique_ptr<fp_type, decltype(&detail::free)>;
-
- public:
-  class Vector {
-   public:
-    Vector() = delete;
-
-    Vector(Pointer&& ptr, unsigned num_qubits)
-        : ptr_(std::move(ptr)), num_qubits_(num_qubits) {}
-
-    fp_type* get() {
-      return ptr_.get();
-    }
-
-    const fp_type* get() const {
-      return ptr_.get();
-    }
-
-    fp_type* release() {
-      num_qubits_ = 0;
-      return ptr_.release();
-    }
-
-    unsigned num_qubits() const {
-      return num_qubits_;
-    }
-
-    bool requires_copy_to_host() const {
-      return true;
-    }
-
-   private:
-    Pointer ptr_;
-    unsigned num_qubits_;
-  };
-
-  template <typename... Args>
-  VectorSpaceCUDA(Args&&... args) {}
-
-  static Vector Create(unsigned num_qubits) {
-    fp_type* p;
-    auto size = sizeof(fp_type) * Impl::MinSize(num_qubits);
-    auto rc = cudaMalloc(&p, size);
-
-    if (rc == cudaSuccess) {
-      return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits};
-    } else {
-      return Null();
-    }
-  }
-
-  // It is the client's responsibility to make sure that p has at least
-  // Impl::MinSize(num_qubits) elements.
-  static Vector Create(fp_type* p, unsigned num_qubits) {
-    return Vector{Pointer{p, &detail::do_not_free}, num_qubits};
-  }
-
-  static Vector Null() {
-    return Vector{Pointer{nullptr, &detail::free}, 0};
-  }
-
-  static bool IsNull(const Vector& vector) {
-    return vector.get() == nullptr;
-  }
-
-  static void Free(fp_type* ptr) {
-    detail::free(ptr);
-  }
-
-  bool Copy(const Vector& src, Vector& dest) const {
-    if (src.num_qubits() != dest.num_qubits()) {
-      return false;
-    }
-
-    ErrorCheck(
-        cudaMemcpy(dest.get(), src.get(),
-                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-                   cudaMemcpyDeviceToDevice));
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that dest has at least
-  // Impl::MinSize(src.num_qubits()) elements.
-  bool Copy(const Vector& src, fp_type* dest) const {
-    ErrorCheck(
-        cudaMemcpy(dest, src.get(),
-                   sizeof(fp_type) * Impl::MinSize(src.num_qubits()),
-                   cudaMemcpyDeviceToHost));
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that src has at least
-  // Impl::MinSize(dest.num_qubits()) elements.
-  bool Copy(const fp_type* src, Vector& dest) const {
-    ErrorCheck(
-        cudaMemcpy(dest.get(), src,
-                   sizeof(fp_type) * Impl::MinSize(dest.num_qubits()),
-                   cudaMemcpyHostToDevice));
-
-    return true;
-  }
-
-  // It is the client's responsibility to make sure that src has at least
-  // min(size, Impl::MinSize(dest.num_qubits())) elements.
-  bool Copy(const fp_type* src, uint64_t size, Vector& dest) const {
-    size = std::min(size, Impl::MinSize(dest.num_qubits()));
-    ErrorCheck(
-        cudaMemcpy(dest.get(), src,
-                   sizeof(fp_type) * size,
-                   cudaMemcpyHostToDevice));
-    return true;
-  }
-
-  void DeviceSync() {
-    ErrorCheck(cudaDeviceSynchronize());
-  }
-
- protected:
-};
-
-}  // namespace qsim
-
-#endif  // VECTORSPACE_CUDA_H_

From 63874bc7ddf8c2eafd31691cc0a1cf92c2830a41 Mon Sep 17 00:00:00 2001
From: Seth R Johnson <johnsonsr@ornl.gov>
Date: Fri, 22 Nov 2024 14:15:50 -0500
Subject: [PATCH 06/64] Update goldfinger configuration

---
 CMakeLists.txt                        | 5 +++--
 scripts/cmake-presets/goldfinger.json | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd57739..79e8162 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -117,8 +117,9 @@ set(QIREE_RUNTIME_OUTPUT_DIRECTORY
 enable_language(C) # Needed for LLVM
 find_package(LLVM REQUIRED)
 if((LLVM_VERSION VERSION_LESS 14)
-  OR (LLVM_VERSION VERSION_GREATER_EQUAL 19))
-  message(WARNING "QIR-EE is only tested with LLVM 14-18: found version ${LLVM_VERSION}")
+  OR (LLVM_VERSION VERSION_GREATER_EQUAL 20))
+  message(WARNING "QIR-EE is only tested with LLVM 14-19: found version ${LLVM_VERSION}")
+endif()
 endif()
 
 if(QIREE_USE_XACC)
diff --git a/scripts/cmake-presets/goldfinger.json b/scripts/cmake-presets/goldfinger.json
index 0e21a45..a056420 100644
--- a/scripts/cmake-presets/goldfinger.json
+++ b/scripts/cmake-presets/goldfinger.json
@@ -9,7 +9,7 @@
       "cacheVariables": {
         "CMAKE_BUILD_TYPE":      {"type": "STRING", "value": "Debug"},
         "CMAKE_EXPORT_COMPILE_COMMANDS": {"type": "BOOL",   "value": "ON"},
-        "CMAKE_OSX_DEPLOYMENT_TARGET": {"type": "STRING", "value": "14"},
+        "CMAKE_OSX_DEPLOYMENT_TARGET": {"type": "STRING", "value": "15"},
         "CMAKE_CXX_STANDARD":   {"type": "STRING",   "value": "17"},
         "CMAKE_CXX_EXTENSIONS": {"type": "BOOL",   "value": "OFF"},
         "CMAKE_FIND_FRAMEWORK": {"type": "STRING", "value": "LAST"},
@@ -17,7 +17,7 @@
         "CMAKE_CXX_FLAGS": "-Wall -Wextra -Werror -Wno-error=deprecated -pedantic -fdiagnostics-color=always"
       },
       "environment": {
-        "CMAKE_PREFIX_PATH": "/opt/homebrew/Cellar/llvm/18.1.8:/opt/spack/var/spack/environments/xacc/.spack-env/view:$env{HOME}/Code/xacc/install"
+        "CMAKE_PREFIX_PATH": "/opt/homebrew/opt/llvm:/opt/spack/var/spack/environments/xacc/.spack-env/view:$env{HOME}/Code/xacc/install"
       }
     },
     {

From 91753d642b8668ee1305e2876b7b2e8662636daf Mon Sep 17 00:00:00 2001
From: Seth R Johnson <johnsonsr@ornl.gov>
Date: Fri, 22 Nov 2024 14:29:42 -0500
Subject: [PATCH 07/64] Add qsim

---
 CMakeLists.txt             | 35 +++++++++++++++++++++++++----------
 app/CMakeLists.txt         | 19 ++++++++++++++++++-
 src/qiree_config.h.in      |  2 ++
 src/qirqsim/CMakeLists.txt |  1 +
 src/qirqsim/qsimQuantum.cc | 28 ++++++++++++++--------------
 src/qirqsim/qsimQuantum.hh | 33 ++++++++++++++++-----------------
 6 files changed, 76 insertions(+), 42 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 79e8162..864093d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #----------------------------------------------------------------------------#
 
-cmake_minimum_required(VERSION 3.12)
+cmake_minimum_required(VERSION 3.18)
 
 # Set QIREE_VERSION using git tags using the following format
 set(CGV_TAG_REGEX "v([0-9.]+)(-dev|-rc.[0-9]+)?")
@@ -12,12 +12,13 @@ include("${CMAKE_CURRENT_LIST_DIR}/cmake/CgvFindVersion.cmake")
 cgv_find_version(QIREE)
 
 project(QIREE VERSION "${QIREE_VERSION}" LANGUAGES CXX)
-cmake_policy(VERSION 3.12...3.22)
+cmake_policy(VERSION 3.18...3.30)
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake")
 
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
+include(FetchContent)
 include("${CMAKE_CURRENT_LIST_DIR}/cmake/QIREEUtils.cmake")
 
 macro(qiree_set_default name value)
@@ -34,14 +35,10 @@ endmacro()
 
 # Components
 option(QIREE_BUILD_DOCS "Build QIR-EE documentation" OFF)
-option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" OFF)
+option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" ON)
 option(QIREE_BUILD_EXAMPLES "Build QIR-EE examples" OFF)
-option(QIREE_USE_XACC "Build XACC interface" OFF)
-option(QIREE_USE_QSIM "Build qsim interface" OFF)
-qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS})
-
-# Assertion handling
-option(QIREE_DEBUG "Enable runtime assertions" ON)
+option(QIREE_USE_QSIM "Download and build Google qsim backend" OFF)
+option(QIREE_USE_XACC "Build XACC interface" ON)
 
 qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS})
 
@@ -120,6 +117,24 @@ if((LLVM_VERSION VERSION_LESS 14)
   OR (LLVM_VERSION VERSION_GREATER_EQUAL 20))
   message(WARNING "QIR-EE is only tested with LLVM 14-19: found version ${LLVM_VERSION}")
 endif()
+
+if(QIREE_USE_QSIM)
+  # Declare and download qsim: it's header-only and the code is in "lib",
+  # so download it into "external/qsim" directory and include "external"
+  FetchContent_Declare(
+    qsim_content
+    QUIET
+    GIT_REPOSITORY https://github.com/quantumlib/qsim.git
+    GIT_TAG e5817518b16858e0732269b56525f72bcdb30764 # v0.21.0
+    SOURCE_SUBDIR "lib" # Don't load top-level cmake file
+    SOURCE_DIR "external/qsim"
+  )
+  FetchContent_MakeAvailable(qsim_content)
+  add_library(qiree_qsim INTERFACE)
+  add_library(QIREE::qsim ALIAS qiree_qsim)
+  target_include_directories(qiree_qsim SYSTEM INTERFACE
+    "${CMAKE_CURRENT_BINARY_DIR}/external"
+  )
 endif()
 
 if(QIREE_USE_XACC)
@@ -192,7 +207,7 @@ add_subdirectory(app)
 #----------------------------------------------------------------------------#
 
 if(QIREE_BUILD_EXAMPLES)
-   add_subdirectory(examples)
+  add_subdirectory(examples)
 endif()
 
 #----------------------------------------------------------------------------#
diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
index ea7589a..4bf7330 100644
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -4,7 +4,6 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #-----------------------------------------------------------------------------#
 
-include(FetchContent)
 FetchContent_Declare(
   # Command Line Parser for C++ programs
   cli11_proj
@@ -15,6 +14,24 @@ FetchContent_Declare(
 
 FetchContent_MakeAvailable(cli11_proj)
 
+#-----------------------------------------------------------------------------#
+# QSIM FRONT END
+#-----------------------------------------------------------------------------#
+
+if(QIREE_USE_QSIM)
+  qiree_add_executable(qir-qsim
+    qir-qsim.cc
+  )
+  target_link_libraries(qir-qsim
+    PUBLIC QIREE::qiree QIREE::qirqsim
+    PRIVATE CLI11::CLI11
+  )
+endif()
+
+#-----------------------------------------------------------------------------#
+# XACC FRONT END
+#-----------------------------------------------------------------------------#
+
 if(QIREE_USE_XACC)
   qiree_add_executable(qir-xacc
     qir-xacc.cc
diff --git a/src/qiree_config.h.in b/src/qiree_config.h.in
index 475c792..d46b752 100644
--- a/src/qiree_config.h.in
+++ b/src/qiree_config.h.in
@@ -10,5 +10,7 @@
 #define qiree_config_h
 
 #cmakedefine01 QIREE_DEBUG
+#cmakedefine01 QIREE_USE_QSIM
+#cmakedefine01 QIREE_USE_XACC
 
 #endif /* qiree_config_h */
diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt
index 09a0511..380bbac 100644
--- a/src/qirqsim/CMakeLists.txt
+++ b/src/qirqsim/CMakeLists.txt
@@ -15,6 +15,7 @@ qiree_add_library(qirqsim
 #Link the qsim library to qiree and any other relevant libraries
 target_link_libraries(qirqsim
   PUBLIC QIREE::qiree  # Link to qiree
+  PRIVATE QIREE::qsim
 )
 
 #----------------------------------------------------------------------------#
diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/qsimQuantum.cc
index 74d510d..0ee6746 100644
--- a/src/qirqsim/qsimQuantum.cc
+++ b/src/qirqsim/qsimQuantum.cc
@@ -19,20 +19,20 @@
 #include "qiree/Assert.hh"
 
 // Qsim
-#include "../../tpls/qsim/simulator_basic.h"
-#include "../../tpls/qsim/statespace_basic.h"
-#include "../../tpls/qsim/gates_qsim.h"
-#include "../../tpls/qsim/circuit.h"
-#include "../../tpls/qsim/run_qsim.h"
-#include "../../tpls/qsim/io.h"
-#include "../../tpls/qsim/fuser.h"
-#include "../../tpls/qsim/circuit_qsim_parser.h" 
-#include "../../tpls/qsim/fuser_mqubit.h"
-#include "../../tpls/qsim/io_file.h"
-#include "../../tpls/qsim/simmux.h"
-#include "../../tpls/qsim/util_cpu.h"
-#include "../../tpls/qsim/formux.h"
-#include "../../tpls/qsim/gate.h"
+#include <qsim/lib/circuit.h>
+#include <qsim/lib/circuit_qsim_parser.h>
+#include <qsim/lib/formux.h>
+#include <qsim/lib/fuser.h>
+#include <qsim/lib/fuser_mqubit.h>
+#include <qsim/lib/gate.h>
+#include <qsim/lib/gates_qsim.h>
+#include <qsim/lib/io.h>
+#include <qsim/lib/io_file.h>
+#include <qsim/lib/run_qsim.h>
+#include <qsim/lib/simmux.h>
+#include <qsim/lib/simulator_basic.h>
+#include <qsim/lib/statespace_basic.h>
+#include <qsim/lib/util_cpu.h>
 //
 
 namespace qiree{
diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/qsimQuantum.hh
index cfdfc4b..024209f 100644
--- a/src/qirqsim/qsimQuantum.hh
+++ b/src/qirqsim/qsimQuantum.hh
@@ -7,33 +7,32 @@
 //---------------------------------------------------------------------------//
 #pragma once
 
+#include <cassert>
 #include <initializer_list>
 #include <map>
 #include <memory>
 #include <ostream>
 #include <vector>
-#include <cassert>
+#include <qsim/lib/circuit.h>
+#include <qsim/lib/circuit_qsim_parser.h>
+#include <qsim/lib/formux.h>
+#include <qsim/lib/fuser.h>
+#include <qsim/lib/fuser_mqubit.h>
+#include <qsim/lib/gate.h>
+#include <qsim/lib/gates_qsim.h>
+#include <qsim/lib/io.h>
+#include <qsim/lib/io_file.h>
+#include <qsim/lib/run_qsim.h>
+#include <qsim/lib/simmux.h>
+#include <qsim/lib/simulator_basic.h>
+#include <qsim/lib/statespace_basic.h>
+#include <qsim/lib/util_cpu.h>
 
+#include "BufferManager.hh"
 #include "qiree/Macros.hh"
 #include "qiree/QuantumNotImpl.hh"
 #include "qiree/RuntimeInterface.hh"
 #include "qiree/Types.hh"
-#include "BufferManager.hh"
-
-#include "../../tpls/qsim/simulator_basic.h"
-#include "../../tpls/qsim/statespace_basic.h"
-#include "../../tpls/qsim/gates_qsim.h"
-#include "../../tpls/qsim/circuit.h"
-#include "../../tpls/qsim/run_qsim.h"
-#include "../../tpls/qsim/io.h"
-#include "../../tpls/qsim/fuser.h"
-#include "../../tpls/qsim/circuit_qsim_parser.h" 
-#include "../../tpls/qsim/fuser_mqubit.h"
-#include "../../tpls/qsim/io_file.h"
-#include "../../tpls/qsim/simmux.h"
-#include "../../tpls/qsim/util_cpu.h"
-#include "../../tpls/qsim/formux.h"
-#include "../../tpls/qsim/gate.h"
 
 struct Factory { // Factory class for creating simulators in qsim 
     Factory(unsigned num_threads) : num_threads(num_threads) {}

From e289381d75d9084f551026677abd2b15b021f22a Mon Sep 17 00:00:00 2001
From: Seth R Johnson <johnsonsr@ornl.gov>
Date: Fri, 22 Nov 2024 14:33:57 -0500
Subject: [PATCH 08/64] Update version to get include directories working

---
 CMakeLists.txt             | 7 ++++---
 src/qirqsim/CMakeLists.txt | 2 +-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 864093d..3eb2675 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -125,15 +125,16 @@ if(QIREE_USE_QSIM)
     qsim_content
     QUIET
     GIT_REPOSITORY https://github.com/quantumlib/qsim.git
-    GIT_TAG e5817518b16858e0732269b56525f72bcdb30764 # v0.21.0
+    GIT_TAG 55b4d0e7ea8f085a1709c2c06ff1e28b3aa93357 # 'main' on 22 Nov 2024
     SOURCE_SUBDIR "lib" # Don't load top-level cmake file
     SOURCE_DIR "external/qsim"
   )
   FetchContent_MakeAvailable(qsim_content)
-  add_library(qiree_qsim INTERFACE)
+  qiree_add_library(qiree_qsim INTERFACE)
   add_library(QIREE::qsim ALIAS qiree_qsim)
   target_include_directories(qiree_qsim SYSTEM INTERFACE
-    "${CMAKE_CURRENT_BINARY_DIR}/external"
+    "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/external>"
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/external>"
   )
 endif()
 
diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt
index 380bbac..b0cf690 100644
--- a/src/qirqsim/CMakeLists.txt
+++ b/src/qirqsim/CMakeLists.txt
@@ -15,7 +15,7 @@ qiree_add_library(qirqsim
 #Link the qsim library to qiree and any other relevant libraries
 target_link_libraries(qirqsim
   PUBLIC QIREE::qiree  # Link to qiree
-  PRIVATE QIREE::qsim
+  PUBLIC QIREE::qsim #FIXME: make private
 )
 
 #----------------------------------------------------------------------------#

From ac7c24402879e5404f20385c7635d879ed8708a4 Mon Sep 17 00:00:00 2001
From: Seth R Johnson <johnsonsr@ornl.gov>
Date: Fri, 22 Nov 2024 14:49:06 -0500
Subject: [PATCH 09/64] Run clang format

---
 src/qirqsim/BufferManager.cc      |  31 +++--
 src/qirqsim/BufferManager.hh      |  40 +++---
 src/qirqsim/qsimDefaultRuntime.cc |  24 ++--
 src/qirqsim/qsimDefaultRuntime.hh |   8 +-
 src/qirqsim/qsimQuantum.cc        | 222 +++++++++++++++++++-----------
 src/qirqsim/qsimQuantum.hh        |  95 ++++++-------
 src/qirqsim/qsimTupleRuntime.cc   |  10 +-
 7 files changed, 255 insertions(+), 175 deletions(-)

diff --git a/src/qirqsim/BufferManager.cc b/src/qirqsim/BufferManager.cc
index 46931d9..b340604 100644
--- a/src/qirqsim/BufferManager.cc
+++ b/src/qirqsim/BufferManager.cc
@@ -7,39 +7,50 @@
 //---------------------------------------------------------------------------//
 
 #include "BufferManager.hh"
-#include <unordered_map>
-#include <string>
+
 #include <optional>
+#include <string>
+#include <unordered_map>
 
-void BufferManager::updateBuffer(const std::string& qubit, const std::string& state, const int& value) {
+void BufferManager::updateBuffer(std::string const& qubit,
+                                 std::string const& state,
+                                 int const& value)
+{
     // Insert or update the key-value pair in the buffer
     std::pair<std::string, std::string> searchKey = {qubit, state};
     int current_frequency = 0;
     auto it = buffer.find(searchKey);
-    if (it != buffer.end()){
-        current_frequency = it -> second;
+    if (it != buffer.end())
+    {
+        current_frequency = it->second;
     }
     // Accumulate counts with every shot
     buffer[{qubit, state}] = value + current_frequency;
 }
 
-void BufferManager::updateBuffer(const std::string& key, const int& value) {
+void BufferManager::updateBuffer(std::string const& key, int const& value)
+{
     // Insert or update the key-value pair in the buffer
     simple_buffer[key] = value;
 }
 
-std::optional<int> BufferManager::getBufferValue(const std::string& qubit, const std::string& state) const {
+std::optional<int> BufferManager::getBufferValue(std::string const& qubit,
+                                                 std::string const& state) const
+{
     std::pair<std::string, std::string> searchKey = {qubit, state};
     auto it = buffer.find(searchKey);
-    if (it != buffer.end()) {
+    if (it != buffer.end())
+    {
         return it->second;  // Key found
     }
     return std::nullopt;  // Key not found
 }
 
-std::optional<int> BufferManager::getBufferValue(const std::string& key) const {
+std::optional<int> BufferManager::getBufferValue(std::string const& key) const
+{
     auto it = simple_buffer.find(key);
-    if (it != simple_buffer.end()) {
+    if (it != simple_buffer.end())
+    {
         return it->second;  // Key found
     }
     return std::nullopt;  // Key not found
diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh
index efb3800..9bac1b5 100644
--- a/src/qirqsim/BufferManager.hh
+++ b/src/qirqsim/BufferManager.hh
@@ -9,17 +9,19 @@
 #ifndef BUFFER_MANAGER_H
 #define BUFFER_MANAGER_H
 
-#include <unordered_map>
-#include <string>
-#include <optional>
 #include <functional>
+#include <optional>
+#include <string>
+#include <unordered_map>
 #include <utility>
 
 // Define a hash function for std::pair
 
-struct pair_hash {
-    template <class T1, class T2>
-    std::size_t operator()(const std::pair<T1, T2>& pair) const {
+struct pair_hash
+{
+    template<class T1, class T2>
+    std::size_t operator()(std::pair<T1, T2> const& pair) const
+    {
         auto hash1 = std::hash<T1>{}(pair.first);
         auto hash2 = std::hash<T2>{}(pair.second);
         // Combine the two hash values
@@ -27,22 +29,24 @@ struct pair_hash {
     }
 };
 
-class BufferManager {
-public:
-    
+class BufferManager
+{
+  public:
     // Method to update the buffer with a key-value pair
-    void updateBuffer(const std::string& qubit, const std::string& state, const int& value);
-    void updateBuffer(const std::string& key, const int& value);
-    
+    void updateBuffer(std::string const& qubit,
+                      std::string const& state,
+                      int const& value);
+    void updateBuffer(std::string const& key, int const& value);
+
     // Retrieve buffer value for storage or evaluation
-    std::optional<int> getBufferValue(const std::string& qubit, const std::string& state) const;
-    std::optional<int> getBufferValue(const std::string& key) const;
-    
-private:
-    
+    std::optional<int>
+    getBufferValue(std::string const& qubit, std::string const& state) const;
+    std::optional<int> getBufferValue(std::string const& key) const;
+
+  private:
     // Dictionary to store key-value pairs
     std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> buffer;
     std::unordered_map<std::string, int> simple_buffer;
 };
 
-#endif // BUFFER_MANAGER_H
+#endif  // BUFFER_MANAGER_H
diff --git a/src/qirqsim/qsimDefaultRuntime.cc b/src/qirqsim/qsimDefaultRuntime.cc
index 339703a..924e1e6 100644
--- a/src/qirqsim/qsimDefaultRuntime.cc
+++ b/src/qirqsim/qsimDefaultRuntime.cc
@@ -6,9 +6,11 @@
 //! \file qirqsim/qsimDefaultRuntime.cc
 //---------------------------------------------------------------------------//
 #include "qsimDefaultRuntime.hh"
+
 #include <iostream>
+
 #include "qiree/Assert.hh"
- 
+
 namespace qiree
 {
 //---------------------------------------------------------------------------//
@@ -32,9 +34,9 @@ void qsimDefaultRuntime::initialize(OptionalCString env)
 
 void qsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag)
 {
-    //this->execute_if_needed();
-    //output_ << "array " << (tag ? tag : "<null>") << " length " << s
-    //        << std::endl;
+    // this->execute_if_needed();
+    // output_ << "array " << (tag ? tag : "<null>") << " length " << s
+    //         << std::endl;
 }
 
 //---------------------------------------------------------------------------//
@@ -45,9 +47,9 @@ void qsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag)
 
 void qsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag)
 {
-    //this->execute_if_needed();
-    //output_ << "tuple " << (tag ? tag : "<null>") << " length " << s
-    //        << std::endl;
+    // this->execute_if_needed();
+    // output_ << "tuple " << (tag ? tag : "<null>") << " length " << s
+    //         << std::endl;
 }
 
 //---------------------------------------------------------------------------//
@@ -60,10 +62,12 @@ void qsimDefaultRuntime::result_record_output(Result r, OptionalCString tag)
     // This prints results every time result_record_output is called
     // Can comment out if only want to see final results
 
-    if (auto value = sim_.manager.getBufferValue("q"+std::to_string(r.value)); value.has_value()) {
-        std::cout << "q" << std::to_string(r.value) << " : " << value.value() << "\n";
+    if (auto value = sim_.manager.getBufferValue("q" + std::to_string(r.value));
+        value.has_value())
+    {
+        std::cout << "q" << std::to_string(r.value) << " : " << value.value()
+                  << "\n";
     }
-
 }
 
 }  // namespace qiree
diff --git a/src/qirqsim/qsimDefaultRuntime.hh b/src/qirqsim/qsimDefaultRuntime.hh
index 26f06ab..e76308e 100644
--- a/src/qirqsim/qsimDefaultRuntime.hh
+++ b/src/qirqsim/qsimDefaultRuntime.hh
@@ -13,7 +13,7 @@ namespace qiree
 {
 
 /*!
- * Print per-qubit measurement statistics. 
+ * Print per-qubit measurement statistics.
  *
  * Example for three qubits:
  * \code
@@ -24,7 +24,7 @@ namespace qiree
  * q0 {0: 542, 1: 482}
  * q1 {0: 521, 1: 503}
  * q2 {0: 0, 1: 1024}
- * 
+ *
  * \endcode
  */
 
@@ -34,9 +34,7 @@ class qsimDefaultRuntime final : virtual public RuntimeInterface
     /*!
      * Construct \c qsimDefaultRuntime.
      */
-    qsimDefaultRuntime(std::ostream& output,
-                       qsimQuantum& sim
-                       )
+    qsimDefaultRuntime(std::ostream& output, qsimQuantum& sim)
         : output_(output), sim_(sim)
     {
     }
diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/qsimQuantum.cc
index 0ee6746..0fe9a98 100644
--- a/src/qirqsim/qsimQuantum.cc
+++ b/src/qirqsim/qsimQuantum.cc
@@ -9,12 +9,12 @@
 #include "qsimQuantum.hh"
 
 #include <algorithm>
+#include <cassert>
 #include <iostream>
-#include <utility>
+#include <optional>
 #include <stdexcept>
 #include <thread>
-#include <optional>
-#include <cassert>
+#include <utility>
 
 #include "qiree/Assert.hh"
 
@@ -35,50 +35,64 @@
 #include <qsim/lib/util_cpu.h>
 //
 
-namespace qiree{
+namespace qiree
+{
 //---------------------------------------------------------------------------//
 /*
 Initialize the qsim simulator
 */
 
-qsimQuantum::State qsimQuantum::init_state_space() { //check if StateSpace is the proper type for the output, problably it is just State from the Fatory struct.
-    std::srand(static_cast<unsigned int>(std::time(nullptr))); // Seed the random number generator
-    qsimParam.seed = std::rand(); // Set the seed for qsim parameters
-    numThreads = std::max(1, static_cast<int>(std::thread::hardware_concurrency())); // Get the number of threads
-    qsimParam.max_fused_size = 2; // Set the maximum size of fused gates
-    qsimParam.verbosity = 0; // see verbosity in run_qsim.h 
+qsimQuantum::State qsimQuantum::init_state_space()
+{  // check if StateSpace is the proper type for the output, probably it is
+   // just State from the Factory struct.
+    std::srand(static_cast<unsigned int>(std::time(nullptr)));  // Seed the
+                                                                // random
+                                                                // number
+                                                                // generator
+    qsimParam.seed = std::rand();  // Set the seed for qsim parameters
+    numThreads = std::max(
+        1, static_cast<int>(std::thread::hardware_concurrency()));  // Get the
+                                                                    // number
+                                                                    // of
+                                                                    // threads
+    qsimParam.max_fused_size = 2;  // Set the maximum size of fused gates
+    qsimParam.verbosity = 0;  // see verbosity in run_qsim.h
     // Initialize the qsim simulator
-    qsimQuantum::StateSpace state_space = Factory(numThreads).CreateStateSpace(); // Create the state space
-    State state = state_space.Create(this->num_qubits()); // Create the state
+    qsimQuantum::StateSpace state_space
+        = Factory(numThreads).CreateStateSpace();  // Create the state space
+    State state = state_space.Create(this->num_qubits());  // Create the state
     // Check if the state is null
-    if (state_space.IsNull(state)) {
-        qsim::IO::errorf("not enough memory: is the number of qubits too large?\n");
+    if (state_space.IsNull(state))
+    {
+        qsim::IO::errorf(
+            "not enough memory: is the number of qubits too large?\n");
     }
-    state_space.SetStateZero(state); // Set the state to zero, TODO: the initial state is not necessarily zero
-  return state;
-  }
-  
-  qsimQuantum::qsimQuantum(std::ostream& os,
-                 size_type shots)
-  : output_(os)
-  {
-  }
+    state_space.SetStateZero(state);  // Set the state to zero, TODO: the
+                                      // initial state is not necessarily zero
+    return state;
+}
+
+qsimQuantum::qsimQuantum(std::ostream& os, size_type shots) : output_(os) {}
 
 //---------------------------------------------------------------------------//
 /*
 Prepare to build a quantum circuit for an entry point
 */
 
-void qsimQuantum::set_up(EntryPointAttrs const& attrs) {
+void qsimQuantum::set_up(EntryPointAttrs const& attrs)
+{
     QIREE_VALIDATE(attrs.required_num_qubits > 0,
                    << "input is not a quantum program");
-    // Resize the result_to_qubit_ vector, based on the required number of results...
-    // the idea is to have as many classical registers as qubits (probably not true in general)
+    // Resize the result_to_qubit_ vector, based on the required number of
+    // results... the idea is to have as many classical registers as qubits
+    // (probably not true in general)
     result_to_qubit_.resize(attrs.required_num_results);
-    num_qubits_ = attrs.required_num_qubits; // Set the number of qubits
-    state_ = std::make_shared<State>(init_state_space()); // Set the state space? Maybe.
-    q_circuit.num_qubits = num_qubits_; // Allocate the number of qubits in the circuit
-    execution_time = 0; // Initialize execution time
+    num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
+    state_ = std::make_shared<State>(init_state_space());  // Set the state
+                                                           // space? Maybe.
+    q_circuit.num_qubits = num_qubits_;  // Allocate the number of qubits in
+                                         // the circuit
+    execution_time = 0;  // Initialize execution time
     static unsigned int rep = 0;
     rep++;
     this->repCount(rep);
@@ -89,11 +103,13 @@ void qsimQuantum::set_up(EntryPointAttrs const& attrs) {
 Complete an execution
 */
 
-void qsimQuantum::repCount(int rep) {
+void qsimQuantum::repCount(int rep)
+{
     repetition = rep;
 }
 
-void qsimQuantum::tear_down() {
+void qsimQuantum::tear_down()
+{
     q_circuit = {};
     q_circuit.num_qubits = num_qubits_;
     state_ = std::make_shared<State>(init_state_space());
@@ -104,12 +120,13 @@ void qsimQuantum::tear_down() {
 Reset the qubit
 */
 
-void qsimQuantum::reset(Qubit q) {
-    q.value=0;
+void qsimQuantum::reset(Qubit q)
+{
+    q.value = 0;
 }
 
 //----------------------------------------------------------------------------//
-/* 
+/*
 Read the value of a result. This utilizes the new BufferManager.
 */
 
@@ -117,18 +134,24 @@ QState qsimQuantum::read_result(Result r)
 {
     std::string q_index_string = std::to_string(r.value);
     auto meas_results = execute_if_needed();
-    if (meas_results.size() == 1 && meas_results[0].bitstring.size() == 1) {
-        const auto bitResult = meas_results[0].bitstring[0];
+    if (meas_results.size() == 1 && meas_results[0].bitstring.size() == 1)
+    {
+        auto const bitResult = meas_results[0].bitstring[0];
         assert(bitResult == 0 || bitResult == 1);
         std::string stringResult = std::to_string(bitResult);
-        if (stringResult == "1"){
-            manager.updateBuffer("q"+q_index_string, "1", 1);
-            manager.updateBuffer("q"+q_index_string, 1);
-        } else{
-            manager.updateBuffer("q"+q_index_string, "0", 1);
-            manager.updateBuffer("q"+q_index_string, 0);
+        if (stringResult == "1")
+        {
+            manager.updateBuffer("q" + q_index_string, "1", 1);
+            manager.updateBuffer("q" + q_index_string, 1);
+        }
+        else
+        {
+            manager.updateBuffer("q" + q_index_string, "0", 1);
+            manager.updateBuffer("q" + q_index_string, 0);
         }
-    } else {
+    }
+    else
+    {
         qsim::IO::errorf("Unexpected measurement results encountered.");
     }
     return static_cast<QState>(meas_results[0].bitstring[0]);
@@ -136,12 +159,18 @@ QState qsimQuantum::read_result(Result r)
 
 //---------------------------------------------------------------------------//
 /*
-Map a qubit to a result index 
+Map a qubit to a result index
 (TODO: find how to link the classical register to the quantum register in qsim)
 */
 
-void qsimQuantum::mz(Qubit q, Result r) { //we don't classical register yet. 
-    QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in the set of qubits, e.g., what happens if q=5 and qubits are {2,3,4,5}, q is less than num_qubits but not it is in the set of qubits. 
+void qsimQuantum::mz(Qubit q, Result r)
+{  // we don't have a classical register yet.
+    QIREE_EXPECT(q.value < this->num_qubits());  // TODO: q must be in the set
+                                                 // of qubits, e.g., what
+                                                 // happens if q=5 and qubits
+                                                 // are {2,3,4,5}: q is not
+                                                 // less than num_qubits but
+                                                 // it is in the set of qubits.
     // Add measurement instruction
     this->q_circuit.gates.push_back(
         qsim::gate::Measurement<qsim::GateQSim<float>>::Create(
@@ -154,79 +183,106 @@ Quantum Instruction Mapping
 */
 
 // 1. Entangling gates
-void qsimQuantum::cx(Qubit q1, Qubit q2) {
-    q_circuit.gates.push_back(
-        qsim::GateCNot<float>::Create(execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
+void qsimQuantum::cx(Qubit q1, Qubit q2)
+{
+    q_circuit.gates.push_back(qsim::GateCNot<float>::Create(
+        execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
 }
-void qsimQuantum::cnot(Qubit q1, Qubit q2) {
-    q_circuit.gates.push_back(
-        qsim::GateCNot<float>::Create(execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
+void qsimQuantum::cnot(Qubit q1, Qubit q2)
+{
+    q_circuit.gates.push_back(qsim::GateCNot<float>::Create(
+        execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
 }
-void qsimQuantum::cz(Qubit q1, Qubit q2) {
-    q_circuit.gates.push_back(
-        qsim::GateCZ<float>::Create(execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
+void qsimQuantum::cz(Qubit q1, Qubit q2)
+{
+    q_circuit.gates.push_back(qsim::GateCZ<float>::Create(
+        execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
 }
 // 2. Local gates
-void qsimQuantum::h(Qubit q) {
+void qsimQuantum::h(Qubit q)
+{
     q_circuit.gates.push_back(
         qsim::GateHd<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
-void qsimQuantum::s(Qubit q) {
+void qsimQuantum::s(Qubit q)
+{
     q_circuit.gates.push_back(
         qsim::GateS<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
-void qsimQuantum::t(Qubit q) {
+void qsimQuantum::t(Qubit q)
+{
     q_circuit.gates.push_back(
         qsim::GateT<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
 // 2.1 Pauli gates
-void qsimQuantum::x(Qubit q) {
+void qsimQuantum::x(Qubit q)
+{
     q_circuit.gates.push_back(
         qsim::GateX<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
-void qsimQuantum::y(Qubit q) {
+void qsimQuantum::y(Qubit q)
+{
     q_circuit.gates.push_back(
         qsim::GateY<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
-void qsimQuantum::z(Qubit q) {
+void qsimQuantum::z(Qubit q)
+{
     q_circuit.gates.push_back(
         qsim::GateZ<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
 // 2.2 rotation gates
-void qsimQuantum::rx(double theta, Qubit q) {
-    q_circuit.gates.push_back(
-        qsim::GateRX<float>::Create(execution_time++, this->getQubitIndex(q), theta));
+void qsimQuantum::rx(double theta, Qubit q)
+{
+    q_circuit.gates.push_back(qsim::GateRX<float>::Create(
+        execution_time++, this->getQubitIndex(q), theta));
 }
-void qsimQuantum::ry(double theta, Qubit q) {
-    q_circuit.gates.push_back(
-        qsim::GateRY<float>::Create(execution_time++, this->getQubitIndex(q), theta));
+void qsimQuantum::ry(double theta, Qubit q)
+{
+    q_circuit.gates.push_back(qsim::GateRY<float>::Create(
+        execution_time++, this->getQubitIndex(q), theta));
 }
-void qsimQuantum::rz(double theta, Qubit q) {
-    q_circuit.gates.push_back(
-        qsim::GateRZ<float>::Create(execution_time++, this->getQubitIndex(q), theta));
+void qsimQuantum::rz(double theta, Qubit q)
+{
+    q_circuit.gates.push_back(qsim::GateRZ<float>::Create(
+        execution_time++, this->getQubitIndex(q), theta));
 }
 
-Qubit qsimQuantum::result_to_qubit(Result r) {
-    // TODO: This function is not working. Giving 0 every time. Maybe not needed.
+Qubit qsimQuantum::result_to_qubit(Result r)
+{
+    // TODO: This function is not working. Giving 0 every time. Maybe not
+    // needed.
     QIREE_EXPECT(r.value < this->num_results());
-    return result_to_qubit_[r.value]; // just copied this from the qirxacc, I have no idea if we need to do something else here
+    return result_to_qubit_[r.value];  // just copied this from the qirxacc, I
+                                       // have no idea if we need to do
+                                       // something else here
 }
 
-void qsimQuantum::print_accelbuf() {
-    // TODO: to be implemented, we can create a buffer class to store the results
+void qsimQuantum::print_accelbuf()
+{
+    // TODO: to be implemented, we can create a buffer class to store the
+    // results
 }
 
-qsimQuantum::VecMeas qsimQuantum::execute_if_needed() {
-    std::vector<StateSpace::MeasurementResult> meas_results; // Vector to hold measurement results, this must be empty before running
+qsimQuantum::VecMeas qsimQuantum::execute_if_needed()
+{
+    std::vector<StateSpace::MeasurementResult> meas_results;  // Vector to hold
+                                                              // measurement
+                                                              // results, this
+                                                              // must be empty
+                                                              // before running
     std::string stringResult;
     static unsigned long int seed = 0;
     qsimParam.seed = seed++;
-    const bool run_success = Runner::Run(qsimParam, Factory(numThreads), q_circuit, *state_, meas_results); // Run the simulation
-    assert(run_success); // Ensure the run was successful
-	// reset circuit here 
-	q_circuit = {};
+    bool const run_success = Runner::Run(qsimParam,
+                                         Factory(numThreads),
+                                         q_circuit,
+                                         *state_,
+                                         meas_results);  // Run the simulation
+    assert(run_success);  // Ensure the run was successful
+    // reset circuit here
+    q_circuit = {};
     q_circuit.num_qubits = num_qubits_;
     return meas_results;
 }
 
-} // namespace qiree
+}  // namespace qiree
diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/qsimQuantum.hh
index 024209f..745da51 100644
--- a/src/qirqsim/qsimQuantum.hh
+++ b/src/qirqsim/qsimQuantum.hh
@@ -34,24 +34,24 @@
 #include "qiree/RuntimeInterface.hh"
 #include "qiree/Types.hh"
 
-struct Factory { // Factory class for creating simulators in qsim 
+struct Factory
+{  // Factory class for creating simulators in qsim
     Factory(unsigned num_threads) : num_threads(num_threads) {}
     using Simulator = qsim::Simulator<qsim::For>;
     using StateSpace = Simulator::StateSpace;
-    StateSpace CreateStateSpace() const { return StateSpace(num_threads); } 
+    StateSpace CreateStateSpace() const { return StateSpace(num_threads); }
     Simulator CreateSimulator() const { return Simulator(num_threads); }
     unsigned num_threads;
 };
 
 namespace qiree
 {
-    class qsimQuantum final : virtual public QuantumNotImpl
-    {
-
-    public: 
-
+class qsimQuantum final : virtual public QuantumNotImpl
+{
+  public:
     // Define constructors and destructors
-    qsimQuantum(std::ostream& os, size_type shots); // Construct with number of shots
+    qsimQuantum(std::ostream& os, size_type shots);  // Construct with number
+                                                     // of shots
 
     // Define types
     using Simulator = qsim::Simulator<qsim::For>;
@@ -63,15 +63,17 @@ namespace qiree
 
     State init_state_space();
 
-    QIREE_DELETE_COPY_MOVE(qsimQuantum); // Delete copy and move constructors
+    QIREE_DELETE_COPY_MOVE(qsimQuantum);  // Delete copy and move constructors
 
     //!@{
     //! \name Accessors
     size_type num_results() const { return result_to_qubit_.size(); }
     size_type num_qubits() const { return num_qubits_; }
-    
-    unsigned getQubitIndex(Qubit q) {
-    return static_cast<unsigned>(q.value); // Return the value of the qubit
+
+    unsigned getQubitIndex(Qubit q)
+    {
+        return static_cast<unsigned>(q.value);  // Return the value of the
+                                                // qubit
     }
     //!@}
 
@@ -96,8 +98,8 @@ namespace qiree
     Qubit result_to_qubit(Result);
 
     // Wrapper for qsim
-    //std::map<std::string, int>
-    //get_marginal_counts(std::vector<Qubit> const& qubits);
+    // std::map<std::string, int>
+    // get_marginal_counts(std::vector<Qubit> const& qubits);
 
     // Run the circuit on the accelerator if we have not already. Returns true
     // if the circuit was executed.
@@ -131,7 +133,10 @@ namespace qiree
     //!@}
 
     // Get the quantum circuit
-    qsim::Circuit<qsim::GateQSim<float>> get_circuit() const { return q_circuit; } 
+    qsim::Circuit<qsim::GateQSim<float>> get_circuit() const
+    {
+        return q_circuit;
+    }
     // Get the state space
     State const& get_state() const { return *state_; }
     // Update the buffer
@@ -139,39 +144,37 @@ namespace qiree
     // Number of repetitions
     int repetition;
     void repCount(int rep);
-    
-    private:
-        //// TYPES ////
-        enum class Endianness
-        {
-            little,
-            big
-        };
-        unsigned numThreads; // Number of threads to use
-        unsigned max_fused_size; // Maximum size of fused gates
-        qsim::Circuit<qsim::GateQSim<float>> q_circuit; // Quantum circuit object
-        
-        Runner::Parameter qsimParam; // Parameters for qsim
-        size_t execution_time; // when the quantum operation will be executed
-
-        bool executed;
-        size_type num_qubits_{};
-        std::vector<Qubit> result_to_qubit_;
-        Endianness endian_;
-
-        std::ostream& output_;
-        std::shared_ptr<Simulator> simulator_;
-        std::shared_ptr<StateSpace> statespace_;
-        std::shared_ptr<State> state_;
 
+  private:
+    //// TYPES ////
+    enum class Endianness
+    {
+        little,
+        big
     };
+    unsigned numThreads;  // Number of threads to use
+    unsigned max_fused_size;  // Maximum size of fused gates
+    qsim::Circuit<qsim::GateQSim<float>> q_circuit;  // Quantum circuit object
+
+    Runner::Parameter qsimParam;  // Parameters for qsim
+    size_t execution_time;  // when the quantum operation will be executed
+
+    bool executed;
+    size_type num_qubits_{};
+    std::vector<Qubit> result_to_qubit_;
+    Endianness endian_;
+
+    std::ostream& output_;
+    std::shared_ptr<Simulator> simulator_;
+    std::shared_ptr<StateSpace> statespace_;
+    std::shared_ptr<State> state_;
+};
 
-    class buffer {
-    public:
-        buffer(size_t size) : size(size) {}
-        size_t size;
-    }; 
+class buffer
+{
+  public:
+    buffer(size_t size) : size(size) {}
+    size_t size;
+};
 
 }  // namespace qiree
-
-    
diff --git a/src/qirqsim/qsimTupleRuntime.cc b/src/qirqsim/qsimTupleRuntime.cc
index 5366b79..bf88e6d 100644
--- a/src/qirqsim/qsimTupleRuntime.cc
+++ b/src/qirqsim/qsimTupleRuntime.cc
@@ -76,7 +76,7 @@ void qsimTupleRuntime::start_tracking(GroupingType type,
 {
     QIREE_EXPECT(!valid_);
     valid_ = true;
-    type_ = type; 
+    type_ = type;
     tag_ = tag;
     num_results_ = num_results;
     qubits_.clear();
@@ -109,8 +109,12 @@ void qsimTupleRuntime::print_header(size_type num_distinct)
 
 void qsimTupleRuntime::finish_tuple()
 {
-    //auto counts = sim_.get_marginal_counts(qubits_);
-    std::map<std::string, int> counts = {{"0", 0}, {"1", 0}}; // Placeholder for actual counts, TODO: replace with actual counts
+    // auto counts = sim_.get_marginal_counts(qubits_);
+    std::map<std::string, int> counts = {{"0", 0}, {"1", 0}};  // Placeholder
+                                                               // for actual
+                                                               // counts, TODO:
+                                                               // replace with
+                                                               // actual counts
     print_header(counts.size());
     auto name = get_name();
     for (auto& [bits, count] : counts)

From ece521f5fd02a20aa867a19eb9c80f9bd778aaa6 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Mon, 25 Nov 2024 10:00:29 -0500
Subject: [PATCH 10/64] Add qsim dynamicbv test

Remark: For now the test only compares against empty output, until we find a
way to store the results properly instead of just printing them as we go.

Temporarily using lowercase "qsim" in the class names. They will be renamed
later, at the same time as the other qsim classes.
---
 test/CMakeLists.txt              | 10 ++++
 test/qirqsim/qsimQuantum.test.cc | 98 ++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+)
 create mode 100644 test/qirqsim/qsimQuantum.test.cc

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index c4191cc..716bcbc 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -55,3 +55,13 @@ if(QIREE_USE_XACC)
 endif()
 
 #---------------------------------------------------------------------------##
+
+#---------------------------------------------------------------------------##
+# QIRQSIM TESTS
+#---------------------------------------------------------------------------##
+
+if(QIREE_USE_QSIM)
+  qiree_add_test(qirqsim qsimQuantum)
+endif()
+
+#---------------------------------------------------------------------------##
diff --git a/test/qirqsim/qsimQuantum.test.cc b/test/qirqsim/qsimQuantum.test.cc
new file mode 100644
index 0000000..3d29034
--- /dev/null
+++ b/test/qirqsim/qsimQuantum.test.cc
@@ -0,0 +1,98 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirxacc/XaccQuantum.test.cc
+//---------------------------------------------------------------------------//
+#include "qirqsim/qsimQuantum.hh"
+
+#include <regex>
+
+#include "qiree/Types.hh"
+#include "qiree_test.hh"
+#include "qirqsim/qsimDefaultRuntime.hh"
+
+namespace qiree
+{
+namespace test
+{
+//---------------------------------------------------------------------------//
+
+class qsimQuantumTest : public ::qiree::test::Test
+{
+  protected:
+    void SetUp() override {}
+
+    static std::string clean_output(std::string&& s)
+    {
+        std::string result = std::move(s);
+        static std::regex const subs_ptr("0x[0-9a-f]+");
+        result = std::regex_replace(result, subs_ptr, "0x0");
+        return result;
+    }
+};
+
+
+TEST_F(qsimQuantumTest, sim_dynamicbv)
+{
+    using Q = Qubit;
+    using R = Result;
+
+    std::ostringstream os;
+    os << '\n';
+
+    // Create a simulator that will write to the string stream
+    qsimQuantum qsim_sim{os, 1};
+    qsimDefaultRuntime qsim_rt{os, qsim_sim};
+
+    // Call functions in the same sequence that dynamicbv.ll would
+    qsim_sim.set_up([] {
+        EntryPointAttrs attrs;
+        attrs.required_num_qubits = 2;
+        attrs.required_num_results = 2;
+        return attrs;
+    }());
+    qsim_sim.h(Q{0});
+    qsim_sim.x(Q{1});
+    qsim_sim.h(Q{1});
+    qsim_sim.cnot(Q{0},Q{1});
+    qsim_sim.h(Q{0});
+    qsim_sim.mz(Q{0}, R{0});
+    qsim_sim.read_result(R{0});
+    qsim_sim.mz(Q{1}, R{1});
+    qsim_sim.read_result(R{1});
+    qsim_rt.array_record_output(2,"");
+    qsim_rt.result_record_output(R{0},"");
+    qsim_rt.result_record_output(R{1},"");
+    qsim_sim.h(Q{0});
+    qsim_sim.x(Q{1});
+    qsim_sim.h(Q{1});
+    qsim_sim.mz(Q{0}, R{0});
+    qsim_sim.read_result(R{0});
+    qsim_sim.mz(Q{1}, R{1});
+    qsim_sim.read_result(R{1});
+    qsim_rt.array_record_output(2,"");
+    qsim_rt.result_record_output(R{0},"");
+    qsim_rt.result_record_output(R{1},"");
+    qsim_sim.h(Q{0});
+    qsim_sim.x(Q{1});
+    qsim_sim.h(Q{1});
+    qsim_sim.cnot(Q{0},Q{1});
+    qsim_sim.h(Q{0});
+    qsim_sim.mz(Q{0}, R{0});
+    qsim_sim.read_result(R{0});
+    qsim_sim.mz(Q{1}, R{1});
+    qsim_sim.read_result(R{1});
+    qsim_rt.array_record_output(2,"");
+    qsim_rt.result_record_output(R{0},"");
+    qsim_rt.result_record_output(R{1},"");
+    qsim_sim.tear_down();
+    auto result = clean_output(os.str());
+    EXPECT_EQ(R"(
+)", result) << result; // TODO: Modify qsimDefaultRuntime.cc so that it stores a result to be compared here (currently just prints as it goes...)
+}
+
+//---------------------------------------------------------------------------//
+}  // namespace test
+}  // namespace qiree

From a34f190ed977727662c2838bf41ae8e957dfc4ce Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Mon, 25 Nov 2024 20:25:08 -0500
Subject: [PATCH 11/64] Capitalize class names q -> Q

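This commit will not build (and so fails tests) on its own: it switches the
class names, #include paths, and CMake source/test names to the capitalized
spelling, while the files themselves are renamed separately in the next commit.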
---
 app/qir-qsim.cc                   | 12 +++----
 src/qirqsim/CMakeLists.txt        |  6 ++--
 src/qirqsim/qsimDefaultRuntime.cc | 12 +++----
 src/qirqsim/qsimDefaultRuntime.hh | 12 +++----
 src/qirqsim/qsimQuantum.cc        | 52 +++++++++++++++----------------
 src/qirqsim/qsimQuantum.hh        |  8 ++---
 src/qirqsim/qsimTupleRuntime.cc   | 22 ++++++-------
 src/qirqsim/qsimTupleRuntime.hh   | 16 +++++-----
 test/CMakeLists.txt               |  2 +-
 test/qirqsim/qsimQuantum.test.cc  | 16 +++++-----
 10 files changed, 79 insertions(+), 79 deletions(-)

diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc
index 75f1612..e5d3e72 100644
--- a/app/qir-qsim.cc
+++ b/app/qir-qsim.cc
@@ -20,9 +20,9 @@
 #include "qiree/Module.hh"
 #include "qiree/QuantumNotImpl.hh"
 
-#include "qirqsim/qsimDefaultRuntime.hh"
-#include "qirqsim/qsimQuantum.hh"
-#include "qirqsim/qsimTupleRuntime.hh"
+#include "qirqsim/QsimDefaultRuntime.hh"
+#include "qirqsim/QsimQuantum.hh"
+#include "qirqsim/QsimTupleRuntime.hh"
 
 using namespace std::string_view_literals;
 
@@ -38,15 +38,15 @@ void run(std::string const& filename,
     Executor execute{Module{filename}};
     
     // Set up qsim
-    qsimQuantum sim(std::cout, num_shots);
+    QsimQuantum sim(std::cout, num_shots);
     
     // Collect the statistics 
     std::unique_ptr<RuntimeInterface> rt;
     //if (group_tuples){
-    //    rt = std::make_unique<qsimTupleRuntime>(
+    //    rt = std::make_unique<QsimTupleRuntime>(
     //        std::cout, sim);
     //} else {
-        rt = std::make_unique<qsimDefaultRuntime>(
+        rt = std::make_unique<QsimDefaultRuntime>(
             std::cout, sim);
     //}
 
diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt
index b0cf690..b11018d 100644
--- a/src/qirqsim/CMakeLists.txt
+++ b/src/qirqsim/CMakeLists.txt
@@ -6,9 +6,9 @@
 
 # Adding qsim as a library to qiree
 qiree_add_library(qirqsim
-  qsimQuantum.cc
-  qsimDefaultRuntime.cc
-  qsimTupleRuntime.cc
+  QsimQuantum.cc
+  QsimDefaultRuntime.cc
+  QsimTupleRuntime.cc
   BufferManager.cc
 )
 
diff --git a/src/qirqsim/qsimDefaultRuntime.cc b/src/qirqsim/qsimDefaultRuntime.cc
index 924e1e6..4ece7c1 100644
--- a/src/qirqsim/qsimDefaultRuntime.cc
+++ b/src/qirqsim/qsimDefaultRuntime.cc
@@ -3,9 +3,9 @@
 // See the top-level COPYRIGHT file for details.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //---------------------------------------------------------------------------//
-//! \file qirqsim/qsimDefaultRuntime.cc
+//! \file qirqsim/QsimDefaultRuntime.cc
 //---------------------------------------------------------------------------//
-#include "qsimDefaultRuntime.hh"
+#include "QsimDefaultRuntime.hh"
 
 #include <iostream>
 
@@ -18,7 +18,7 @@ namespace qiree
  * Initialize the execution environment, resetting qubits.
  */
 
-void qsimDefaultRuntime::initialize(OptionalCString env)
+void QsimDefaultRuntime::initialize(OptionalCString env)
 {
     if (env)
     {
@@ -32,7 +32,7 @@ void qsimDefaultRuntime::initialize(OptionalCString env)
  * named tag
  */
 
-void qsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag)
+void QsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag)
 {
     // this->execute_if_needed();
     // output_ << "array " << (tag ? tag : "<null>") << " length " << s
@@ -45,7 +45,7 @@ void qsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag)
  * named tag
  */
 
-void qsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag)
+void QsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag)
 {
     // this->execute_if_needed();
     // output_ << "tuple " << (tag ? tag : "<null>") << " length " << s
@@ -56,7 +56,7 @@ void qsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag)
 /*!
  * Execute circuit and report a single measurement result
  */
-void qsimDefaultRuntime::result_record_output(Result r, OptionalCString tag)
+void QsimDefaultRuntime::result_record_output(Result r, OptionalCString tag)
 {
     // Access values through the getter
     // This prints results every time result_record_output is called
diff --git a/src/qirqsim/qsimDefaultRuntime.hh b/src/qirqsim/qsimDefaultRuntime.hh
index e76308e..fb0b7a3 100644
--- a/src/qirqsim/qsimDefaultRuntime.hh
+++ b/src/qirqsim/qsimDefaultRuntime.hh
@@ -3,11 +3,11 @@
 // See the top-level COPYRIGHT file for details.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //---------------------------------------------------------------------------//
-//! \file qirqsim/qsimDefaultRuntime.hh
+//! \file qirqsim/QsimDefaultRuntime.hh
 //---------------------------------------------------------------------------//
 #pragma once
 
-#include "qsimQuantum.hh"
+#include "QsimQuantum.hh"
 
 namespace qiree
 {
@@ -28,13 +28,13 @@ namespace qiree
  * \endcode
  */
 
-class qsimDefaultRuntime final : virtual public RuntimeInterface
+class QsimDefaultRuntime final : virtual public RuntimeInterface
 {
   public:
     /*!
-     * Construct \c qsimDefaultRuntime.
+     * Construct \c QsimDefaultRuntime.
      */
-    qsimDefaultRuntime(std::ostream& output, qsimQuantum& sim)
+    QsimDefaultRuntime(std::ostream& output, QsimQuantum& sim)
         : output_(output), sim_(sim)
     {
     }
@@ -56,7 +56,7 @@ class qsimDefaultRuntime final : virtual public RuntimeInterface
 
   private:
     std::ostream& output_;
-    qsimQuantum& sim_;
+    QsimQuantum& sim_;
 };
 
 }  // namespace qiree
diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/qsimQuantum.cc
index 0fe9a98..5ae1e9f 100644
--- a/src/qirqsim/qsimQuantum.cc
+++ b/src/qirqsim/qsimQuantum.cc
@@ -3,10 +3,10 @@
 // See the top-level COPYRIGHT file for details.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //---------------------------------------------------------------------------//
-//! \file qirxacc/qsimQuantum.cc
+//! \file qirqsim/QsimQuantum.cc
 //---------------------------------------------------------------------------//
 
-#include "qsimQuantum.hh"
+#include "QsimQuantum.hh"
 
 #include <algorithm>
 #include <cassert>
@@ -42,7 +42,7 @@ namespace qiree
 Initialize the qsim simulator
 */
 
-qsimQuantum::State qsimQuantum::init_state_space()
+QsimQuantum::State QsimQuantum::init_state_space()
 {  // check if StateSpace is the proper type for the output, problably it is
    // just State from the Fatory struct.
     std::srand(static_cast<unsigned int>(std::time(nullptr)));  // Seed the
@@ -58,7 +58,7 @@ qsimQuantum::State qsimQuantum::init_state_space()
     qsimParam.max_fused_size = 2;  // Set the maximum size of fused gates
     qsimParam.verbosity = 0;  // see verbosity in run_qsim.h
     // Initialize the qsim simulator
-    qsimQuantum::StateSpace state_space
+    QsimQuantum::StateSpace state_space
         = Factory(numThreads).CreateStateSpace();  // Create the state space
     State state = state_space.Create(this->num_qubits());  // Create the state
     // Check if the state is null
@@ -72,14 +72,14 @@ qsimQuantum::State qsimQuantum::init_state_space()
     return state;
 }
 
-qsimQuantum::qsimQuantum(std::ostream& os, size_type shots) : output_(os) {}
+QsimQuantum::QsimQuantum(std::ostream& os, size_type shots) : output_(os) {}
 
 //---------------------------------------------------------------------------//
 /*
 Prepare to build a quantum circuit for an entry point
 */
 
-void qsimQuantum::set_up(EntryPointAttrs const& attrs)
+void QsimQuantum::set_up(EntryPointAttrs const& attrs)
 {
     QIREE_VALIDATE(attrs.required_num_qubits > 0,
                    << "input is not a quantum program");
@@ -103,12 +103,12 @@ void qsimQuantum::set_up(EntryPointAttrs const& attrs)
 Complete an execution
 */
 
-void qsimQuantum::repCount(int rep)
+void QsimQuantum::repCount(int rep)
 {
     repetition = rep;
 }
 
-void qsimQuantum::tear_down()
+void QsimQuantum::tear_down()
 {
     q_circuit = {};
     q_circuit.num_qubits = num_qubits_;
@@ -120,7 +120,7 @@ void qsimQuantum::tear_down()
 Reset the qubit
 */
 
-void qsimQuantum::reset(Qubit q)
+void QsimQuantum::reset(Qubit q)
 {
     q.value = 0;
 }
@@ -130,7 +130,7 @@ void qsimQuantum::reset(Qubit q)
 Read the value of a result. This utilizes the new BufferManager.
 */
 
-QState qsimQuantum::read_result(Result r)
+QState QsimQuantum::read_result(Result r)
 {
     std::string q_index_string = std::to_string(r.value);
     auto meas_results = execute_if_needed();
@@ -163,7 +163,7 @@ Map a qubit to a result index
 (TODO: find how to link the classical register to the quantum register in qsim)
 */
 
-void qsimQuantum::mz(Qubit q, Result r)
+void QsimQuantum::mz(Qubit q, Result r)
 {  // we don't classical register yet.
     QIREE_EXPECT(q.value < this->num_qubits());  // TODO: q must be in the set
                                                  // of qubits, e.g., what
@@ -183,71 +183,71 @@ Quantum Instruction Mapping
 */
 
 // 1. Entangling gates
-void qsimQuantum::cx(Qubit q1, Qubit q2)
+void QsimQuantum::cx(Qubit q1, Qubit q2)
 {
     q_circuit.gates.push_back(qsim::GateCNot<float>::Create(
         execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
 }
-void qsimQuantum::cnot(Qubit q1, Qubit q2)
+void QsimQuantum::cnot(Qubit q1, Qubit q2)
 {
     q_circuit.gates.push_back(qsim::GateCNot<float>::Create(
         execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
 }
-void qsimQuantum::cz(Qubit q1, Qubit q2)
+void QsimQuantum::cz(Qubit q1, Qubit q2)
 {
     q_circuit.gates.push_back(qsim::GateCZ<float>::Create(
         execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
 }
 // 2. Local gates
-void qsimQuantum::h(Qubit q)
+void QsimQuantum::h(Qubit q)
 {
     q_circuit.gates.push_back(
         qsim::GateHd<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
-void qsimQuantum::s(Qubit q)
+void QsimQuantum::s(Qubit q)
 {
     q_circuit.gates.push_back(
         qsim::GateS<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
-void qsimQuantum::t(Qubit q)
+void QsimQuantum::t(Qubit q)
 {
     q_circuit.gates.push_back(
         qsim::GateT<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
 // 2.1 Pauli gates
-void qsimQuantum::x(Qubit q)
+void QsimQuantum::x(Qubit q)
 {
     q_circuit.gates.push_back(
         qsim::GateX<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
-void qsimQuantum::y(Qubit q)
+void QsimQuantum::y(Qubit q)
 {
     q_circuit.gates.push_back(
         qsim::GateY<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
-void qsimQuantum::z(Qubit q)
+void QsimQuantum::z(Qubit q)
 {
     q_circuit.gates.push_back(
         qsim::GateZ<float>::Create(execution_time++, this->getQubitIndex(q)));
 }
 // 2.2 rotation gates
-void qsimQuantum::rx(double theta, Qubit q)
+void QsimQuantum::rx(double theta, Qubit q)
 {
     q_circuit.gates.push_back(qsim::GateRX<float>::Create(
         execution_time++, this->getQubitIndex(q), theta));
 }
-void qsimQuantum::ry(double theta, Qubit q)
+void QsimQuantum::ry(double theta, Qubit q)
 {
     q_circuit.gates.push_back(qsim::GateRY<float>::Create(
         execution_time++, this->getQubitIndex(q), theta));
 }
-void qsimQuantum::rz(double theta, Qubit q)
+void QsimQuantum::rz(double theta, Qubit q)
 {
     q_circuit.gates.push_back(qsim::GateRZ<float>::Create(
         execution_time++, this->getQubitIndex(q), theta));
 }
 
-Qubit qsimQuantum::result_to_qubit(Result r)
+Qubit QsimQuantum::result_to_qubit(Result r)
 {
     // TODO: This function is not working. Giving 0 every time. Maybe not
     // needed.
@@ -257,13 +257,13 @@ Qubit qsimQuantum::result_to_qubit(Result r)
                                        // something else here
 }
 
-void qsimQuantum::print_accelbuf()
+void QsimQuantum::print_accelbuf()
 {
     // TODO: to be implemented, we can create a buffer class to store the
     // results
 }
 
-qsimQuantum::VecMeas qsimQuantum::execute_if_needed()
+QsimQuantum::VecMeas QsimQuantum::execute_if_needed()
 {
     std::vector<StateSpace::MeasurementResult> meas_results;  // Vector to hold
                                                               // measurement
diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/qsimQuantum.hh
index 745da51..db88dd4 100644
--- a/src/qirqsim/qsimQuantum.hh
+++ b/src/qirqsim/qsimQuantum.hh
@@ -3,7 +3,7 @@
 // See the top-level COPYRIGHT file for details.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //---------------------------------------------------------------------------//
-//! \file qirqsim/qsimQuantum.hh
+//! \file qirqsim/QsimQuantum.hh
 //---------------------------------------------------------------------------//
 #pragma once
 
@@ -46,11 +46,11 @@ struct Factory
 
 namespace qiree
 {
-class qsimQuantum final : virtual public QuantumNotImpl
+class QsimQuantum final : virtual public QuantumNotImpl
 {
   public:
     // Define constructors and destructors
-    qsimQuantum(std::ostream& os, size_type shots);  // Construct with number
+    QsimQuantum(std::ostream& os, size_type shots);  // Construct with number
                                                      // of shots
 
     // Define types
@@ -63,7 +63,7 @@ class qsimQuantum final : virtual public QuantumNotImpl
 
     State init_state_space();
 
-    QIREE_DELETE_COPY_MOVE(qsimQuantum);  // Delete copy and move constructors
+    QIREE_DELETE_COPY_MOVE(QsimQuantum);  // Delete copy and move constructors
 
     //!@{
     //! \name Accessors
diff --git a/src/qirqsim/qsimTupleRuntime.cc b/src/qirqsim/qsimTupleRuntime.cc
index bf88e6d..34a7440 100644
--- a/src/qirqsim/qsimTupleRuntime.cc
+++ b/src/qirqsim/qsimTupleRuntime.cc
@@ -3,9 +3,9 @@
 // See the top-level COPYRIGHT file for details.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //---------------------------------------------------------------------------//
-//! \file qirqsim/qsimTupleRuntime.cc
+//! \file qirqsim/QsimTupleRuntime.cc
 //---------------------------------------------------------------------------//
-#include "qsimTupleRuntime.hh"
+#include "QsimTupleRuntime.hh"
 
 #include "qiree/Assert.hh"
 
@@ -15,7 +15,7 @@ namespace qiree
 /*!
  * Initialize the execution environment, resetting qubits.
  */
-void qsimTupleRuntime::initialize(OptionalCString env)
+void QsimTupleRuntime::initialize(OptionalCString env)
 {
     if (env)
     {
@@ -28,7 +28,7 @@ void qsimTupleRuntime::initialize(OptionalCString env)
  * Execute circuit and mark the following N results as being part of an array
  * named tag
  */
-void qsimTupleRuntime::array_record_output(size_type s, OptionalCString tag)
+void QsimTupleRuntime::array_record_output(size_type s, OptionalCString tag)
 {
     execute_if_needed();
     start_tracking(GroupingType::array, tag, s);
@@ -39,7 +39,7 @@ void qsimTupleRuntime::array_record_output(size_type s, OptionalCString tag)
  * Execute circuit and mark the following N results as being part of a tuple
  * named tag
  */
-void qsimTupleRuntime::tuple_record_output(size_type s, OptionalCString tag)
+void QsimTupleRuntime::tuple_record_output(size_type s, OptionalCString tag)
 {
     execute_if_needed();
     start_tracking(GroupingType::tuple, tag, s);
@@ -49,7 +49,7 @@ void qsimTupleRuntime::tuple_record_output(size_type s, OptionalCString tag)
 /*!
  * Execute circuit and report a single measurement result
  */
-void qsimTupleRuntime::result_record_output(Result r, OptionalCString tag)
+void QsimTupleRuntime::result_record_output(Result r, OptionalCString tag)
 {
     execute_if_needed();
     Qubit q = sim_.result_to_qubit(r);
@@ -60,7 +60,7 @@ void qsimTupleRuntime::result_record_output(Result r, OptionalCString tag)
 // PRIVATE FUNCTIONS
 //---------------------------------------------------------------------------//
 
-void qsimTupleRuntime::execute_if_needed()
+void QsimTupleRuntime::execute_if_needed()
 {
     /*
     if (sim_.execute_if_needed() && print_accelbuf_)
@@ -70,7 +70,7 @@ void qsimTupleRuntime::execute_if_needed()
     */
 }
 
-void qsimTupleRuntime::start_tracking(GroupingType type,
+void QsimTupleRuntime::start_tracking(GroupingType type,
                                       std::string tag,
                                       size_type num_results)
 {
@@ -89,7 +89,7 @@ void qsimTupleRuntime::start_tracking(GroupingType type,
     }
 }
 
-void qsimTupleRuntime::push_result(Qubit q)
+void QsimTupleRuntime::push_result(Qubit q)
 {
     QIREE_EXPECT(valid_);
     QIREE_EXPECT(qubits_.size() < num_results_);
@@ -100,14 +100,14 @@ void qsimTupleRuntime::push_result(Qubit q)
     }
 }
 
-void qsimTupleRuntime::print_header(size_type num_distinct)
+void QsimTupleRuntime::print_header(size_type num_distinct)
 {
     auto name = get_name();
     output_ << name << " " << tag_ << " length " << qubits_.size()
             << " distinct results " << num_distinct << std::endl;
 }
 
-void qsimTupleRuntime::finish_tuple()
+void QsimTupleRuntime::finish_tuple()
 {
     // auto counts = sim_.get_marginal_counts(qubits_);
     std::map<std::string, int> counts = {{"0", 0}, {"1", 0}};  // Placeholder
diff --git a/src/qirqsim/qsimTupleRuntime.hh b/src/qirqsim/qsimTupleRuntime.hh
index fa153f4..d6cafbe 100644
--- a/src/qirqsim/qsimTupleRuntime.hh
+++ b/src/qirqsim/qsimTupleRuntime.hh
@@ -3,18 +3,18 @@
 // See the top-level COPYRIGHT file for details.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //---------------------------------------------------------------------------//
-//! \file qirqsim/qsimTupleRuntime.hh
+//! \file qirqsim/QsimTupleRuntime.hh
 //---------------------------------------------------------------------------//
 #pragma once
 
-#include "qsimQuantum.hh"
+#include "QsimQuantum.hh"
 
 namespace qiree
 {
 
 /*!
  * Print per-tuple (or per-array) measurement statistics. (Compare with \ref
- * qsimDefaultRuntime.)
+ * QsimDefaultRuntime.)
  *
  * Example:
  * \code
@@ -24,16 +24,16 @@ namespace qiree
  * \endcode
  */
 
-class qsimTupleRuntime final : virtual public RuntimeInterface
+class QsimTupleRuntime final : virtual public RuntimeInterface
 {
   public:
     /*!
-     * Construct an \c qsimTupleRuntime.
+     * Construct an \c QsimTupleRuntime.
      * The \c print_accelbuf argument determines whether the qsim \c
      * AcceleratorBuffer is dumped after execution.
      */
-    qsimTupleRuntime(std::ostream& output,
-                     qsimQuantum& sim,
+    QsimTupleRuntime(std::ostream& output,
+                     QsimQuantum& sim,
                      bool print_accelbuf = true)
         : output_(output)
         , sim_(sim)
@@ -67,7 +67,7 @@ class qsimTupleRuntime final : virtual public RuntimeInterface
     };
 
     std::ostream& output_;
-    qsimQuantum& sim_;
+    QsimQuantum& sim_;
     bool const print_accelbuf_;
     bool valid_;
     GroupingType type_;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 716bcbc..af87510 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -61,7 +61,7 @@ endif()
 #---------------------------------------------------------------------------##
 
 if(QIREE_USE_QSIM)
-  qiree_add_test(qirqsim qsimQuantum)
+  qiree_add_test(qirqsim QsimQuantum)
 endif()
 
 #---------------------------------------------------------------------------##
diff --git a/test/qirqsim/qsimQuantum.test.cc b/test/qirqsim/qsimQuantum.test.cc
index 3d29034..066f9bf 100644
--- a/test/qirqsim/qsimQuantum.test.cc
+++ b/test/qirqsim/qsimQuantum.test.cc
@@ -3,15 +3,15 @@
 // See the top-level COPYRIGHT file for details.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //---------------------------------------------------------------------------//
-//! \file qirxacc/XaccQuantum.test.cc
+//! \file qirqsim/QsimQuantum.test.cc
 //---------------------------------------------------------------------------//
-#include "qirqsim/qsimQuantum.hh"
+#include "qirqsim/QsimQuantum.hh"
 
 #include <regex>
 
 #include "qiree/Types.hh"
 #include "qiree_test.hh"
-#include "qirqsim/qsimDefaultRuntime.hh"
+#include "qirqsim/QsimDefaultRuntime.hh"
 
 namespace qiree
 {
@@ -19,7 +19,7 @@ namespace test
 {
 //---------------------------------------------------------------------------//
 
-class qsimQuantumTest : public ::qiree::test::Test
+class QsimQuantumTest : public ::qiree::test::Test
 {
   protected:
     void SetUp() override {}
@@ -34,7 +34,7 @@ class qsimQuantumTest : public ::qiree::test::Test
 };
 
 
-TEST_F(qsimQuantumTest, sim_dynamicbv)
+TEST_F(QsimQuantumTest, sim_dynamicbv)
 {
     using Q = Qubit;
     using R = Result;
@@ -43,8 +43,8 @@ TEST_F(qsimQuantumTest, sim_dynamicbv)
     os << '\n';
 
     // Create a simulator that will write to the string stream
-    qsimQuantum qsim_sim{os, 1};
-    qsimDefaultRuntime qsim_rt{os, qsim_sim};
+    QsimQuantum qsim_sim{os, 1};
+    QsimDefaultRuntime qsim_rt{os, qsim_sim};
 
     // Call functions in the same sequence that dynamicbv.ll would
     qsim_sim.set_up([] {
@@ -90,7 +90,7 @@ TEST_F(qsimQuantumTest, sim_dynamicbv)
     qsim_sim.tear_down();
     auto result = clean_output(os.str());
     EXPECT_EQ(R"(
-)", result) << result; // TODO: Modify qsimDefaultRuntime.cc so that it stores a result to be compared here (currently just prints as it goes...)
+)", result) << result; // TODO: Modify QsimDefaultRuntime.cc so that it stores a result to be compared here (currently just prints as it goes...)
 }
 
 //---------------------------------------------------------------------------//

From 8fb40f3f22c7abba35764824b91cb4b48ec07126 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Mon, 25 Nov 2024 20:28:08 -0500
Subject: [PATCH 12/64] Update qsim file names

---
 src/qirqsim/{qsimDefaultRuntime.cc => QsimDefaultRuntime.cc} | 0
 src/qirqsim/{qsimDefaultRuntime.hh => QsimDefaultRuntime.hh} | 0
 src/qirqsim/{qsimQuantum.cc => QsimQuantum.cc}               | 0
 src/qirqsim/{qsimQuantum.hh => QsimQuantum.hh}               | 0
 src/qirqsim/{qsimTupleRuntime.cc => QsimTupleRuntime.cc}     | 0
 src/qirqsim/{qsimTupleRuntime.hh => QsimTupleRuntime.hh}     | 0
 test/qirqsim/{qsimQuantum.test.cc => QsimQuantum.test.cc}    | 0
 7 files changed, 0 insertions(+), 0 deletions(-)
 rename src/qirqsim/{qsimDefaultRuntime.cc => QsimDefaultRuntime.cc} (100%)
 rename src/qirqsim/{qsimDefaultRuntime.hh => QsimDefaultRuntime.hh} (100%)
 rename src/qirqsim/{qsimQuantum.cc => QsimQuantum.cc} (100%)
 rename src/qirqsim/{qsimQuantum.hh => QsimQuantum.hh} (100%)
 rename src/qirqsim/{qsimTupleRuntime.cc => QsimTupleRuntime.cc} (100%)
 rename src/qirqsim/{qsimTupleRuntime.hh => QsimTupleRuntime.hh} (100%)
 rename test/qirqsim/{qsimQuantum.test.cc => QsimQuantum.test.cc} (100%)

diff --git a/src/qirqsim/qsimDefaultRuntime.cc b/src/qirqsim/QsimDefaultRuntime.cc
similarity index 100%
rename from src/qirqsim/qsimDefaultRuntime.cc
rename to src/qirqsim/QsimDefaultRuntime.cc
diff --git a/src/qirqsim/qsimDefaultRuntime.hh b/src/qirqsim/QsimDefaultRuntime.hh
similarity index 100%
rename from src/qirqsim/qsimDefaultRuntime.hh
rename to src/qirqsim/QsimDefaultRuntime.hh
diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/QsimQuantum.cc
similarity index 100%
rename from src/qirqsim/qsimQuantum.cc
rename to src/qirqsim/QsimQuantum.cc
diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
similarity index 100%
rename from src/qirqsim/qsimQuantum.hh
rename to src/qirqsim/QsimQuantum.hh
diff --git a/src/qirqsim/qsimTupleRuntime.cc b/src/qirqsim/QsimTupleRuntime.cc
similarity index 100%
rename from src/qirqsim/qsimTupleRuntime.cc
rename to src/qirqsim/QsimTupleRuntime.cc
diff --git a/src/qirqsim/qsimTupleRuntime.hh b/src/qirqsim/QsimTupleRuntime.hh
similarity index 100%
rename from src/qirqsim/qsimTupleRuntime.hh
rename to src/qirqsim/QsimTupleRuntime.hh
diff --git a/test/qirqsim/qsimQuantum.test.cc b/test/qirqsim/QsimQuantum.test.cc
similarity index 100%
rename from test/qirqsim/qsimQuantum.test.cc
rename to test/qirqsim/QsimQuantum.test.cc

From 25d94333271ec8d9ca3009832ec8408e061726ae Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Mon, 25 Nov 2024 20:28:52 -0500
Subject: [PATCH 13/64] Add examples

Add bell_ccx.ll to the examples folder and dynamicbv.ll to the test data folder.
---
 examples/bell_ccx.ll   |  43 ++++++++++++++++++
 test/data/dynamicbv.ll | 101 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 144 insertions(+)
 create mode 100644 examples/bell_ccx.ll
 create mode 100644 test/data/dynamicbv.ll

diff --git a/examples/bell_ccx.ll b/examples/bell_ccx.ll
new file mode 100644
index 0000000..e5b2ea7
--- /dev/null
+++ b/examples/bell_ccx.ll
@@ -0,0 +1,43 @@
+; ModuleID = 'Bell_ccx'
+source_filename = "Bell_ccx"
+
+%Qubit = type opaque
+%Result = type opaque
+
+define void @main() #0 { ; entry point; attributes #0 request 3 qubits and 3 results
+entry:
+  call void @__quantum__qis__h__body(%Qubit* null) ; H on qubit 0 (null pointer encodes index 0)
+  call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; X on qubit 1
+  call void @__quantum__qis__ccx__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*), %Qubit* inttoptr (i64 2 to %Qubit*)) ; CCX on qubits 0, 1, 2 (presumably controls 0 and 1, target 2 - confirm against the QIS definition)
+  call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; measure qubit 0 into result 0
+  call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; measure qubit 1 into result 1
+  call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) ; measure qubit 2 into result 2
+  call void @__quantum__rt__array_record_output(i64 3, i8* null) ; announce an output array of 3 entries (i8* argument is null)
+  call void @__quantum__rt__result_record_output(%Result* null, i8* null) ; record result 0
+  call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) ; record result 1
+  call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 2 to %Result*), i8* null) ; record result 2
+  ret void
+}
+
+declare void @__quantum__qis__h__body(%Qubit*)
+
+declare void @__quantum__qis__x__body(%Qubit*)
+
+declare void @__quantum__qis__ccx__body(%Qubit*, %Qubit*, %Qubit*)
+
+declare void @__quantum__qis__mz__body(%Qubit*, %Result* writeonly) #1
+
+declare void @__quantum__rt__array_record_output(i64, i8*)
+
+declare void @__quantum__rt__result_record_output(%Result*, i8*)
+
+attributes #0 = { "entry_point" "num_required_qubits"="3" "num_required_results"="3" "output_labeling_schema" "qir_profiles"="custom" }
+attributes #1 = { "irreversible" }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = !{i32 1, !"qir_major_version", i32 1}
+!1 = !{i32 7, !"qir_minor_version", i32 0}
+!2 = !{i32 1, !"dynamic_qubit_management", i1 false}
+!3 = !{i32 1, !"dynamic_result_management", i1 false}
+
diff --git a/test/data/dynamicbv.ll b/test/data/dynamicbv.ll
new file mode 100644
index 0000000..6d48157
--- /dev/null
+++ b/test/data/dynamicbv.ll
@@ -0,0 +1,101 @@
+; ModuleID = 'dynamicbv'
+source_filename = "dynamicbv"
+
+; ModuleID = 'BernsteinVazirani'
+source_filename = "bv_algorithm"
+
+%Qubit = type opaque
+%Result = type opaque
+
+define void @main() #0 { ; three rounds of a 2-qubit Bernstein-Vazirani-style circuit, one secret bit per round
+entry:
+  ; Round 1: initialize qubits
+  call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit (q0)
+  call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; flip ancillary qubit (q1)
+  call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; Hadamard on ancilla (X then H; gives |-> if q1 started in |0>)
+  
+
+  ; Apply CNOT for bit '1'
+  call void @__quantum__qis__cnot__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*)) ; kickback phase on q0
+  call void @__quantum__qis__h__body(%Qubit* null) ; second Hadamard on q0 before measuring it
+  
+  ; Mid-circuit measurement
+  call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the first bit
+  call i1 @__quantum__qis__read_result__body(%Result* null) ; returned i1 is not used
+  call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; measure the ancilla (NOTE(review): mz is a measurement; no explicit reset is issued)
+  call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*))  ; returned i1 is not used
+  
+  ; Output the results
+  call void @__quantum__rt__array_record_output(i64 2, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* null, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null)
+
+  ; Round 2: initialize qubits (NOTE(review): no reset call precedes this; confirm the runtime re-prepares |0>)
+  call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; flip ancillary qubit (q1)
+  call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; Hadamard on ancilla
+  call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit
+
+  ; Apply identity for bit '0'
+  ; (no gate is applied in this round)
+
+  ; Mid-circuit measurement
+  call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the second bit
+  call i1 @__quantum__qis__read_result__body(%Result* null) ; returned i1 is not used
+  call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; measure the ancilla (no explicit reset is issued)
+  call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*))  ; returned i1 is not used
+  
+  ; Output the results
+  call void @__quantum__rt__array_record_output(i64 2, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* null, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null)
+
+  ; Round 3: initialize qubits (NOTE(review): no reset call precedes this; confirm the runtime re-prepares |0>)
+  call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; flip ancillary qubit (q1)
+  call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; Hadamard on ancilla
+  call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit
+
+  ; Apply CNOT for bit '1'
+  call void @__quantum__qis__cnot__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*)) ; kickback phase on q0
+  call void @__quantum__qis__h__body(%Qubit* null) ; second Hadamard on q0 before measuring it
+  
+  ; Mid-circuit measurement
+  call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the third bit
+  call i1 @__quantum__qis__read_result__body(%Result* null) ; returned i1 is not used
+  call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; measure the ancilla (no explicit reset is issued)
+  call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*))  ; returned i1 is not used
+  
+  ; Output the results
+  call void @__quantum__rt__array_record_output(i64 2, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* null, i8* null)
+  call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null)
+
+  ret void
+}
+
+; Declaration of quantum operations
+declare void @__quantum__qis__h__body(%Qubit*)
+declare void @__quantum__qis__x__body(%Qubit*)
+declare void @__quantum__qis__cnot__body(%Qubit*, %Qubit*)
+declare void @__quantum__qis__mz__body(%Qubit*, %Result*)
+declare i1 @__quantum__qis__read_result__body(%Result*)
+
+; Quantum runtime functions for managing qubits and results
+declare %Qubit* @__quantum__rt__qubit_allocate()
+declare %Result* @__quantum__rt__result_allocate()
+declare void @__quantum__rt__qubit_release(%Qubit*)
+declare void @__quantum__rt__result_release(%Result*)
+declare void @__quantum__rt__result_record_output(%Result*, i8*)
+declare void @__quantum__rt__array_record_output(i64, i8*)
+
+
+
+attributes #0 = { "entry_point" "num_required_qubits"="2" "num_required_results"="2" "output_labeling_schema" "qir_profiles"="custom" }
+attributes #1 = { "irreversible" }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = !{i32 1, !"qir_major_version", i32 1}
+!1 = !{i32 7, !"qir_minor_version", i32 0}
+!2 = !{i32 1, !"dynamic_qubit_management", i1 false}
+!3 = !{i32 1, !"dynamic_result_management", i1 false}
+

From 22d0ac9880b07fb331beb296b9b7a2b923d45f65 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Mon, 25 Nov 2024 20:44:20 -0500
Subject: [PATCH 14/64] Minor formatting

---
 src/qirqsim/QsimQuantum.cc | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc
index 5ae1e9f..d6f1a98 100644
--- a/src/qirqsim/QsimQuantum.cc
+++ b/src/qirqsim/QsimQuantum.cc
@@ -62,11 +62,8 @@ QsimQuantum::State QsimQuantum::init_state_space()
         = Factory(numThreads).CreateStateSpace();  // Create the state space
     State state = state_space.Create(this->num_qubits());  // Create the state
     // Check if the state is null
-    if (state_space.IsNull(state))
-    {
-        qsim::IO::errorf(
-            "not enough memory: is the number of qubits too large?\n");
-    }
+    QIREE_VALIDATE(!state_space.IsNull(state),
+            << "not enough memory: is the number of qubits too large?";
     state_space.SetStateZero(state);  // Set the state to zero, TODO: the
                                       // initial state is not necessarily zero
     return state;
@@ -76,7 +73,7 @@ QsimQuantum::QsimQuantum(std::ostream& os, size_type shots) : output_(os) {}
 
 //---------------------------------------------------------------------------//
 /*
-Prepare to build a quantum circuit for an entry point
+* Prepare to build a quantum circuit for an entry point
 */
 
 void QsimQuantum::set_up(EntryPointAttrs const& attrs)
@@ -100,7 +97,7 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs)
 
 //---------------------------------------------------------------------------//
 /*
-Complete an execution
+* Complete an execution
 */
 
 void QsimQuantum::repCount(int rep)
@@ -117,7 +114,7 @@ void QsimQuantum::tear_down()
 
 //---------------------------------------------------------------------------//
 /*
-Reset the qubit
+* Reset the qubit
 */
 
 void QsimQuantum::reset(Qubit q)
@@ -127,7 +124,7 @@ void QsimQuantum::reset(Qubit q)
 
 //----------------------------------------------------------------------------//
 /*
-Read the value of a result. This utilizes the new BufferManager.
+* Read the value of a result. This utilizes the new BufferManager.
 */
 
 QState QsimQuantum::read_result(Result r)
@@ -159,8 +156,8 @@ QState QsimQuantum::read_result(Result r)
 
 //---------------------------------------------------------------------------//
 /*
-Map a qubit to a result index
-(TODO: find how to link the classical register to the quantum register in qsim)
+* Map a qubit to a result index
+* (TODO: find how to link the classical register to the quantum register in qsim)
 */
 
 void QsimQuantum::mz(Qubit q, Result r)
@@ -179,7 +176,7 @@ void QsimQuantum::mz(Qubit q, Result r)
 
 //---------------------------------------------------------------------------//
 /*
-Quantum Instruction Mapping
+* Quantum Instruction Mapping
 */
 
 // 1. Entangling gates

From 29be7aef6803a02d4a4f3efb3dfebaa46c02efa8 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Wed, 27 Nov 2024 01:10:26 -0500
Subject: [PATCH 15/64] Resolve seed issue

---
 app/qir-qsim.cc            |  2 +-
 src/qirqsim/QsimQuantum.cc | 26 +++++++++++---------------
 src/qirqsim/QsimQuantum.hh |  5 +++--
 3 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc
index e5d3e72..5493f26 100644
--- a/app/qir-qsim.cc
+++ b/app/qir-qsim.cc
@@ -38,7 +38,7 @@ void run(std::string const& filename,
     Executor execute{Module{filename}};
     
     // Set up qsim
-    QsimQuantum sim(std::cout, num_shots);
+    QsimQuantum sim(std::cout, 0);
     
     // Collect the statistics 
     std::unique_ptr<RuntimeInterface> rt;
diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc
index d6f1a98..938fb99 100644
--- a/src/qirqsim/QsimQuantum.cc
+++ b/src/qirqsim/QsimQuantum.cc
@@ -39,22 +39,18 @@ namespace qiree
 {
 //---------------------------------------------------------------------------//
 /*
-Initialize the qsim simulator
+* Initialize the qsim simulator
 */
 
 QsimQuantum::State QsimQuantum::init_state_space()
-{  // check if StateSpace is the proper type for the output, problably it is
-   // just State from the Fatory struct.
-    std::srand(static_cast<unsigned int>(std::time(nullptr)));  // Seed the
-                                                                // random
-                                                                // number
-                                                                // generator
-    qsimParam.seed = std::rand();  // Set the seed for qsim parameters
-    numThreads = std::max(
-        1, static_cast<int>(std::thread::hardware_concurrency()));  // Get the
-                                                                    // number
-                                                                    // of
-                                                                    // threads
+{   
+    // check if StateSpace is the proper type for the output, problably it is
+    // just State from the Factory struct.
+    qsimParam.seed = seed_;
+    seed_++;
+    // Get the number of threads
+    numThreads
+        = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
     qsimParam.max_fused_size = 2;  // Set the maximum size of fused gates
     qsimParam.verbosity = 0;  // see verbosity in run_qsim.h
     // Initialize the qsim simulator
@@ -63,13 +59,13 @@ QsimQuantum::State QsimQuantum::init_state_space()
     State state = state_space.Create(this->num_qubits());  // Create the state
     // Check if the state is null
     QIREE_VALIDATE(!state_space.IsNull(state),
-            << "not enough memory: is the number of qubits too large?";
+            << "not enough memory: is the number of qubits too large?");
     state_space.SetStateZero(state);  // Set the state to zero, TODO: the
                                       // initial state is not necessarily zero
     return state;
 }
 
-QsimQuantum::QsimQuantum(std::ostream& os, size_type shots) : output_(os) {}
+QsimQuantum::QsimQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed) {}
 
 //---------------------------------------------------------------------------//
 /*
diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
index db88dd4..fa13a3d 100644
--- a/src/qirqsim/QsimQuantum.hh
+++ b/src/qirqsim/QsimQuantum.hh
@@ -50,8 +50,8 @@ class QsimQuantum final : virtual public QuantumNotImpl
 {
   public:
     // Define constructors and destructors
-    QsimQuantum(std::ostream& os, size_type shots);  // Construct with number
-                                                     // of shots
+    // Construct with number of shots
+    QsimQuantum(std::ostream& os, size_type shots);  
 
     // Define types
     using Simulator = qsim::Simulator<qsim::For>;
@@ -157,6 +157,7 @@ class QsimQuantum final : virtual public QuantumNotImpl
     qsim::Circuit<qsim::GateQSim<float>> q_circuit;  // Quantum circuit object
 
     Runner::Parameter qsimParam;  // Parameters for qsim
+    unsigned long int seed_;
     size_t execution_time;  // when the quantum operation will be executed
 
     bool executed;

From 19867f36bb8bfccbdabe3e795b9a51d915c87ed6 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Wed, 27 Nov 2024 01:11:05 -0500
Subject: [PATCH 16/64] Minor formatting

---
 CMakePresets.json            | 92 +++++++++++++++++++++++-------------
 src/qirqsim/BufferManager.hh |  5 +-
 2 files changed, 62 insertions(+), 35 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index 287e268..4d9e63b 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,33 +1,61 @@
 {
-  "version": 3,
-  "cmakeMinimumRequired": {"major": 3, "minor": 21, "patch": 0},
-  "configurePresets": [
-    {
-      "name": "default",
-      "displayName": "Automatic options (debug with tests)",
-      "description": "Dependencies are enabled based on environment probing",
-      "binaryDir": "${sourceDir}/build-${presetName}",
-      "generator": "Ninja",
-      "cacheVariables": {
-        "BUILD_SHARED_LIBS":    {"type": "BOOL",   "value": "ON"},
-        "CMAKE_BUILD_TYPE":     {"type": "STRING", "value": "Debug"},
-        "CMAKE_INSTALL_PREFIX": "${sourceDir}/install-${presetName}"
-      }
-    }
-  ],
-  "buildPresets": [
-    {
-      "name": "default",
-      "jobs": 0,
-      "configurePreset": "default"
-    }
-  ],
-  "testPresets": [
-    {
-      "name": "default",
-      "configurePreset": "default",
-      "output": {"outputOnFailure": true},
-      "execution": {"noTestsAction": "error", "stopOnFailure": false, "jobs": 8}
-    }
-  ]
-}
+"version": 3,
+    "cmakeMinimumRequired": {
+        "major": 3,
+        "minor": 21,
+        "patch": 0
+    },
+    "configurePresets": [
+        {
+            "name": "default",
+            "displayName": "Automatic options (debug with tests)",
+            "description": "Dependencies are enabled based on environment probing",
+            "binaryDir": "${sourceDir}/build-${presetName}",
+            "generator": "Ninja",
+            "cacheVariables": {
+                "BUILD_SHARED_LIBS": {
+                    "type": "BOOL",
+                    "value": "ON"
+                },
+                "CMAKE_BUILD_TYPE": {
+                    "type": "STRING",
+                    "value": "Debug"
+                },
+                "CMAKE_INSTALL_PREFIX": "${sourceDir}/install-${presetName}"
+            }
+        },
+        {
+            "name": "clang-16",
+            "displayName": "Clang 16.0.6 x86_64-pc-linux-gnu",
+            "description": "Using compilers: C = /usr/bin/clang-16, CXX = /usr/bin/clang++-16",
+            "binaryDir": "${sourceDir}/out/build/${presetName}",
+            "cacheVariables": {
+                "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}",
+                "CMAKE_C_COMPILER": "/usr/bin/clang-16",
+                "CMAKE_CXX_COMPILER": "/usr/bin/clang++-16",
+                "CMAKE_BUILD_TYPE": "Debug"
+            }
+        }
+    ],
+    "buildPresets": [
+        {
+            "name": "default",
+            "jobs": 0,
+            "configurePreset": "default"
+        }
+    ],
+    "testPresets": [
+        {
+            "name": "default",
+            "configurePreset": "default",
+            "output": {
+                "outputOnFailure": true
+            },
+            "execution": {
+                "noTestsAction": "error",
+                "stopOnFailure": false,
+                "jobs": 8
+            }
+        }
+    ]
+}
\ No newline at end of file
diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh
index 9bac1b5..01035f8 100644
--- a/src/qirqsim/BufferManager.hh
+++ b/src/qirqsim/BufferManager.hh
@@ -6,8 +6,7 @@
 //! \file qirqsim/BufferManager.hh
 //---------------------------------------------------------------------------//
 
-#ifndef BUFFER_MANAGER_H
-#define BUFFER_MANAGER_H
+#pragma once
 
 #include <functional>
 #include <optional>
@@ -49,4 +48,4 @@ class BufferManager
     std::unordered_map<std::string, int> simple_buffer;
 };
 
-#endif  // BUFFER_MANAGER_H
+// BUFFER_MANAGER_H

From ede6e4acc00a2e08405d1db5b0d4c8cba390f0c4 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Wed, 27 Nov 2024 01:11:31 -0500
Subject: [PATCH 17/64] Update qsim unit test

---
 test/qirqsim/QsimQuantum.test.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/test/qirqsim/QsimQuantum.test.cc b/test/qirqsim/QsimQuantum.test.cc
index 066f9bf..e01dca7 100644
--- a/test/qirqsim/QsimQuantum.test.cc
+++ b/test/qirqsim/QsimQuantum.test.cc
@@ -88,9 +88,12 @@ TEST_F(QsimQuantumTest, sim_dynamicbv)
     qsim_rt.result_record_output(R{0},"");
     qsim_rt.result_record_output(R{1},"");
     qsim_sim.tear_down();
-    auto result = clean_output(os.str());
-    EXPECT_EQ(R"(
-)", result) << result; // TODO: Modify QsimDefaultRuntime.cc so that it stores a result to be compared here (currently just prints as it goes...)
+
+    ASSERT_EQ(2, qsim_sim.num_qubits());
+    EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q0", "0").value());
+    EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q0", "1").value());
+    EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q1", "0").value());
+    EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q1", "1").value());
 }
 
 //---------------------------------------------------------------------------//

From 6fe34744c7ec4d7777390dcbc92cc6ab92f1ba52 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Wed, 27 Nov 2024 11:19:09 -0500
Subject: [PATCH 18/64] Refactor to move public to private types

---
 src/qirqsim/BufferManager.hh |   1 +
 src/qirqsim/CMakeLists.txt   |   2 +-
 src/qirqsim/QsimQuantum.cc   | 148 ++++++++++++++++++++---------------
 src/qirqsim/QsimQuantum.hh   |  58 ++------------
 4 files changed, 93 insertions(+), 116 deletions(-)

diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh
index 01035f8..deac635 100644
--- a/src/qirqsim/BufferManager.hh
+++ b/src/qirqsim/BufferManager.hh
@@ -32,6 +32,7 @@ class BufferManager
 {
   public:
     // Method to update the buffer with a key-value pair
+    // TODO: Don't use strings here
     void updateBuffer(std::string const& qubit,
                       std::string const& state,
                       int const& value);
diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt
index b11018d..c75b054 100644
--- a/src/qirqsim/CMakeLists.txt
+++ b/src/qirqsim/CMakeLists.txt
@@ -15,7 +15,7 @@ qiree_add_library(qirqsim
 #Link the qsim library to qiree and any other relevant libraries
 target_link_libraries(qirqsim
   PUBLIC QIREE::qiree  # Link to qiree
-  PUBLIC QIREE::qsim #FIXME: make private
+  PRIVATE QIREE::qsim
 )
 
 #----------------------------------------------------------------------------#
diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc
index 938fb99..d40f27c 100644
--- a/src/qirqsim/QsimQuantum.cc
+++ b/src/qirqsim/QsimQuantum.cc
@@ -37,40 +37,31 @@
 
 namespace qiree
 {
+
+struct Factory
+{  // Factory class for creating simulators in qsim
+    Factory(unsigned num_threads) : num_threads(num_threads) {}
+    using Simulator = qsim::Simulator<qsim::For>;
+    using StateSpace = Simulator::StateSpace;
+    StateSpace CreateStateSpace() const { return StateSpace(num_threads); }
+    Simulator CreateSimulator() const { return Simulator(num_threads); }
+    unsigned num_threads;
+};
+
 //---------------------------------------------------------------------------//
 /*
-* Initialize the qsim simulator
-*/
+ * Initialize the qsim simulator
+ */
 
-QsimQuantum::State QsimQuantum::init_state_space()
-{   
-    // check if StateSpace is the proper type for the output, problably it is
-    // just State from the Factory struct.
-    qsimParam.seed = seed_;
-    seed_++;
-    // Get the number of threads
-    numThreads
-        = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
-    qsimParam.max_fused_size = 2;  // Set the maximum size of fused gates
-    qsimParam.verbosity = 0;  // see verbosity in run_qsim.h
-    // Initialize the qsim simulator
-    QsimQuantum::StateSpace state_space
-        = Factory(numThreads).CreateStateSpace();  // Create the state space
-    State state = state_space.Create(this->num_qubits());  // Create the state
-    // Check if the state is null
-    QIREE_VALIDATE(!state_space.IsNull(state),
-            << "not enough memory: is the number of qubits too large?");
-    state_space.SetStateZero(state);  // Set the state to zero, TODO: the
-                                      // initial state is not necessarily zero
-    return state;
+QsimQuantum::QsimQuantum(std::ostream& os, unsigned long int seed)
+    : output_(os), seed_(seed)
+{
 }
 
-QsimQuantum::QsimQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed) {}
-
 //---------------------------------------------------------------------------//
 /*
-* Prepare to build a quantum circuit for an entry point
-*/
+ * Prepare to build a quantum circuit for an entry point
+ */
 
 void QsimQuantum::set_up(EntryPointAttrs const& attrs)
 {
@@ -81,10 +72,28 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs)
     // (probably not true in general)
     result_to_qubit_.resize(attrs.required_num_results);
     num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
-    state_ = std::make_shared<State>(init_state_space());  // Set the state
-                                                           // space? Maybe.
-    q_circuit.num_qubits = num_qubits_;  // Allocate the number of qubits in
-                                         // the circuit
+
+    // Get the number of threads
+    numThreads
+        = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
+
+    // Initialize the qsim simulator
+    QsimQuantum::StateSpace state_space
+        = Factory(numThreads).CreateStateSpace();  // Create the state space
+    
+    // Create the state
+    State state = state_space.Create(this->num_qubits());
+    // Check if the state is null
+    QIREE_VALIDATE(!state_space.IsNull(state),
+                   << "not enough memory: is the number of qubits too large?");
+    
+    state_space.SetStateZero(state);  // Set the state to zero, TODO: the
+                                      // initial state is not necessarily zero
+
+    state_ = std::make_shared<State>(std::move(state));  
+
+    // Allocate the number of qubits in the circuit
+    q_circuit.num_qubits = num_qubits_;  
     execution_time = 0;  // Initialize execution time
     static unsigned int rep = 0;
     rep++;
@@ -93,8 +102,8 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs)
 
 //---------------------------------------------------------------------------//
 /*
-* Complete an execution
-*/
+ * Complete an execution
+ */
 
 void QsimQuantum::repCount(int rep)
 {
@@ -104,14 +113,12 @@ void QsimQuantum::repCount(int rep)
 void QsimQuantum::tear_down()
 {
     q_circuit = {};
-    q_circuit.num_qubits = num_qubits_;
-    state_ = std::make_shared<State>(init_state_space());
 }
 
 //---------------------------------------------------------------------------//
 /*
-* Reset the qubit
-*/
+ * Reset the qubit
+ */
 
 void QsimQuantum::reset(Qubit q)
 {
@@ -120,18 +127,43 @@ void QsimQuantum::reset(Qubit q)
 
 //----------------------------------------------------------------------------//
 /*
-* Read the value of a result. This utilizes the new BufferManager.
-*/
+ * Read the value of a result. This utilizes the new BufferManager.
+ */
 
 QState QsimQuantum::read_result(Result r)
 {
-    std::string q_index_string = std::to_string(r.value);
-    auto meas_results = execute_if_needed();
+    using Fuser = qsim::MultiQubitGateFuser<qsim::IO, qsim::GateQSim<float>>;
+    using Runner = qsim::QSimRunner<qsim::IO, Fuser, Factory>;
+    using VecMeas = std::vector<StateSpace::MeasurementResult>;
+
+    // Vector to hold measurement results, this must be empty before running
+    std::vector<StateSpace::MeasurementResult> meas_results;
+    std::string stringResult;
+
+    Runner::Parameter qsimParam;  // Parameters for qsim
+    qsimParam.seed = seed_;
+    seed_++;
+    qsimParam.max_fused_size = 2;  // Set the maximum size of fused gates
+    qsimParam.verbosity = 0;  // see verbosity in run_qsim.h
+
+    // Run the simulation
+    bool const run_success = Runner::Run(qsimParam,
+                                         Factory(numThreads),
+                                         q_circuit,
+                                         *state_,
+                                         meas_results);
+
+    assert(run_success);  // Ensure the run was successful
+    // reset circuit here
+    q_circuit = {};
+    q_circuit.num_qubits = num_qubits_;
+
     if (meas_results.size() == 1 && meas_results[0].bitstring.size() == 1)
     {
         auto const bitResult = meas_results[0].bitstring[0];
         assert(bitResult == 0 || bitResult == 1);
         std::string stringResult = std::to_string(bitResult);
+        std::string q_index_string = std::to_string(r.value);
         if (stringResult == "1")
         {
             manager.updateBuffer("q" + q_index_string, "1", 1);
@@ -152,9 +184,10 @@ QState QsimQuantum::read_result(Result r)
 
 //---------------------------------------------------------------------------//
 /*
-* Map a qubit to a result index
-* (TODO: find how to link the classical register to the quantum register in qsim)
-*/
+ * Map a qubit to a result index
+ * (TODO: find how to link the classical register to the quantum register in
+ * qsim)
+ */
 
 void QsimQuantum::mz(Qubit q, Result r)
 {  // we don't classical register yet.
@@ -164,6 +197,8 @@ void QsimQuantum::mz(Qubit q, Result r)
                                                  // are {2,3,4,5}, q is less
                                                  // than num_qubits but not it
                                                  // is in the set of qubits.
+    // TODO: maybe not what we want long term
+    QIREE_EXPECT(q.value == r.value);
     // Add measurement instruction
     this->q_circuit.gates.push_back(
         qsim::gate::Measurement<qsim::GateQSim<float>>::Create(
@@ -172,8 +207,8 @@ void QsimQuantum::mz(Qubit q, Result r)
 
 //---------------------------------------------------------------------------//
 /*
-* Quantum Instruction Mapping
-*/
+ * Quantum Instruction Mapping
+ */
 
 // 1. Entangling gates
 void QsimQuantum::cx(Qubit q1, Qubit q2)
@@ -256,26 +291,9 @@ void QsimQuantum::print_accelbuf()
     // results
 }
 
-QsimQuantum::VecMeas QsimQuantum::execute_if_needed()
+void QsimQuantum::execute_if_needed()
 {
-    std::vector<StateSpace::MeasurementResult> meas_results;  // Vector to hold
-                                                              // measurement
-                                                              // results, this
-                                                              // must be empty
-                                                              // before running
-    std::string stringResult;
-    static unsigned long int seed = 0;
-    qsimParam.seed = seed++;
-    bool const run_success = Runner::Run(qsimParam,
-                                         Factory(numThreads),
-                                         q_circuit,
-                                         *state_,
-                                         meas_results);  // Run the simulation
-    assert(run_success);  // Ensure the run was successful
-    // reset circuit here
-    q_circuit = {};
-    q_circuit.num_qubits = num_qubits_;
-    return meas_results;
+    QIREE_EXPECT(false);
 }
 
 }  // namespace qiree
diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
index fa13a3d..f2d10c6 100644
--- a/src/qirqsim/QsimQuantum.hh
+++ b/src/qirqsim/QsimQuantum.hh
@@ -13,20 +13,6 @@
 #include <memory>
 #include <ostream>
 #include <vector>
-#include <qsim/lib/circuit.h>
-#include <qsim/lib/circuit_qsim_parser.h>
-#include <qsim/lib/formux.h>
-#include <qsim/lib/fuser.h>
-#include <qsim/lib/fuser_mqubit.h>
-#include <qsim/lib/gate.h>
-#include <qsim/lib/gates_qsim.h>
-#include <qsim/lib/io.h>
-#include <qsim/lib/io_file.h>
-#include <qsim/lib/run_qsim.h>
-#include <qsim/lib/simmux.h>
-#include <qsim/lib/simulator_basic.h>
-#include <qsim/lib/statespace_basic.h>
-#include <qsim/lib/util_cpu.h>
 
 #include "BufferManager.hh"
 #include "qiree/Macros.hh"
@@ -34,16 +20,6 @@
 #include "qiree/RuntimeInterface.hh"
 #include "qiree/Types.hh"
 
-struct Factory
-{  // Factory class for creating simulators in qsim
-    Factory(unsigned num_threads) : num_threads(num_threads) {}
-    using Simulator = qsim::Simulator<qsim::For>;
-    using StateSpace = Simulator::StateSpace;
-    StateSpace CreateStateSpace() const { return StateSpace(num_threads); }
-    Simulator CreateSimulator() const { return Simulator(num_threads); }
-    unsigned num_threads;
-};
-
 namespace qiree
 {
 class QsimQuantum final : virtual public QuantumNotImpl
@@ -53,16 +29,6 @@ class QsimQuantum final : virtual public QuantumNotImpl
     // Construct with number of shots
     QsimQuantum(std::ostream& os, size_type shots);  
 
-    // Define types
-    using Simulator = qsim::Simulator<qsim::For>;
-    using StateSpace = Simulator::StateSpace;
-    using State = StateSpace::State;
-    using Fuser = qsim::MultiQubitGateFuser<qsim::IO, qsim::GateQSim<float>>;
-    using Runner = qsim::QSimRunner<qsim::IO, Fuser, Factory>;
-    using VecMeas = std::vector<StateSpace::MeasurementResult>;
-
-    State init_state_space();
-
     QIREE_DELETE_COPY_MOVE(QsimQuantum);  // Delete copy and move constructors
 
     //!@{
@@ -103,7 +69,7 @@ class QsimQuantum final : virtual public QuantumNotImpl
 
     // Run the circuit on the accelerator if we have not already. Returns true
     // if the circuit was executed.
-    VecMeas execute_if_needed();
+    void execute_if_needed();
 
     void print_accelbuf();
     //!@}
@@ -132,13 +98,6 @@ class QsimQuantum final : virtual public QuantumNotImpl
     void z(Qubit) final;
     //!@}
 
-    // Get the quantum circuit
-    qsim::Circuit<qsim::GateQSim<float>> get_circuit() const
-    {
-        return q_circuit;
-    }
-    // Get the state space
-    State const& get_state() const { return *state_; }
     // Update the buffer
     BufferManager manager;
     // Number of repetitions
@@ -146,17 +105,23 @@ class QsimQuantum final : virtual public QuantumNotImpl
     void repCount(int rep);
 
   private:
+
     //// TYPES ////
+
+    using Simulator = qsim::Simulator<qsim::For>;
+    using StateSpace = Simulator::StateSpace;
+    using State = StateSpace::State;
+
     enum class Endianness
     {
         little,
         big
     };
+    
     unsigned numThreads;  // Number of threads to use
     unsigned max_fused_size;  // Maximum size of fused gates
     qsim::Circuit<qsim::GateQSim<float>> q_circuit;  // Quantum circuit object
 
-    Runner::Parameter qsimParam;  // Parameters for qsim
     unsigned long int seed_;
     size_t execution_time;  // when the quantum operation will be executed
 
@@ -171,11 +136,4 @@ class QsimQuantum final : virtual public QuantumNotImpl
     std::shared_ptr<State> state_;
 };
 
-class buffer
-{
-  public:
-    buffer(size_t size) : size(size) {}
-    size_t size;
-};
-
 }  // namespace qiree

From 24a06bb89aac2fe4b3bf79c11e248357f953c438 Mon Sep 17 00:00:00 2001
From: Seth R Johnson <johnsonsr@ornl.gov>
Date: Wed, 27 Nov 2024 11:25:12 -0500
Subject: [PATCH 19/64] Unused variables and naming

---
 src/qirqsim/QsimQuantum.cc | 90 ++++++++++++++++----------------------
 src/qirqsim/QsimQuantum.hh | 33 +++-----------
 2 files changed, 44 insertions(+), 79 deletions(-)

diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc
index d40f27c..ce0834a 100644
--- a/src/qirqsim/QsimQuantum.cc
+++ b/src/qirqsim/QsimQuantum.cc
@@ -49,20 +49,18 @@ struct Factory
 };
 
 //---------------------------------------------------------------------------//
-/*
+/*!
  * Initialize the qsim simulator
  */
-
 QsimQuantum::QsimQuantum(std::ostream& os, unsigned long int seed)
     : output_(os), seed_(seed)
 {
 }
 
 //---------------------------------------------------------------------------//
-/*
+/*!
  * Prepare to build a quantum circuit for an entry point
  */
-
 void QsimQuantum::set_up(EntryPointAttrs const& attrs)
 {
     QIREE_VALIDATE(attrs.required_num_qubits > 0,
@@ -74,62 +72,51 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs)
     num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
 
     // Get the number of threads
-    numThreads
+    num_threads_
         = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
 
     // Initialize the qsim simulator
     QsimQuantum::StateSpace state_space
-        = Factory(numThreads).CreateStateSpace();  // Create the state space
-    
+        = Factory(num_threads_).CreateStateSpace();  // Create the state space
+
     // Create the state
     State state = state_space.Create(this->num_qubits());
     // Check if the state is null
     QIREE_VALIDATE(!state_space.IsNull(state),
                    << "not enough memory: is the number of qubits too large?");
-    
+
     state_space.SetStateZero(state);  // Set the state to zero, TODO: the
                                       // initial state is not necessarily zero
 
-    state_ = std::make_shared<State>(std::move(state));  
+    state_ = std::make_shared<State>(std::move(state));
 
     // Allocate the number of qubits in the circuit
-    q_circuit.num_qubits = num_qubits_;  
-    execution_time = 0;  // Initialize execution time
-    static unsigned int rep = 0;
-    rep++;
-    this->repCount(rep);
+    q_circuit.num_qubits = num_qubits_;
+    gate_index_ = 0;  // Initialize execution time
 }
 
 //---------------------------------------------------------------------------//
-/*
+/*!
  * Complete an execution
  */
-
-void QsimQuantum::repCount(int rep)
-{
-    repetition = rep;
-}
-
 void QsimQuantum::tear_down()
 {
     q_circuit = {};
 }
 
 //---------------------------------------------------------------------------//
-/*
+/*!
  * Reset the qubit
  */
-
 void QsimQuantum::reset(Qubit q)
 {
     q.value = 0;
 }
 
 //----------------------------------------------------------------------------//
-/*
+/*!
  * Read the value of a result. This utilizes the new BufferManager.
  */
-
 QState QsimQuantum::read_result(Result r)
 {
     using Fuser = qsim::MultiQubitGateFuser<qsim::IO, qsim::GateQSim<float>>;
@@ -147,11 +134,8 @@ QState QsimQuantum::read_result(Result r)
     qsimParam.verbosity = 0;  // see verbosity in run_qsim.h
 
     // Run the simulation
-    bool const run_success = Runner::Run(qsimParam,
-                                         Factory(numThreads),
-                                         q_circuit,
-                                         *state_,
-                                         meas_results);
+    bool const run_success = Runner::Run(
+        qsimParam, Factory(num_threads_), q_circuit, *state_, meas_results);
 
     assert(run_success);  // Ensure the run was successful
     // reset circuit here
@@ -183,12 +167,12 @@ QState QsimQuantum::read_result(Result r)
 }
 
 //---------------------------------------------------------------------------//
-/*
- * Map a qubit to a result index
+/*!
+ * Map a qubit to a result index.
+ *
  * (TODO: find how to link the classical register to the quantum register in
  * qsim)
  */
-
 void QsimQuantum::mz(Qubit q, Result r)
 {  // we don't classical register yet.
     QIREE_EXPECT(q.value < this->num_qubits());  // TODO: q must be in the set
@@ -201,8 +185,8 @@ void QsimQuantum::mz(Qubit q, Result r)
     QIREE_EXPECT(q.value == r.value);
     // Add measurement instruction
     this->q_circuit.gates.push_back(
-        qsim::gate::Measurement<qsim::GateQSim<float>>::Create(
-            execution_time++, {this->getQubitIndex(q)}));
+        qsim::gate::Measurement<qsim::GateQSim<float>>::Create(gate_index_++,
+                                                               {q.value}));
 }
 
 //---------------------------------------------------------------------------//
@@ -213,66 +197,66 @@ void QsimQuantum::mz(Qubit q, Result r)
 // 1. Entangling gates
 void QsimQuantum::cx(Qubit q1, Qubit q2)
 {
-    q_circuit.gates.push_back(qsim::GateCNot<float>::Create(
-        execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
+    q_circuit.gates.push_back(
+        qsim::GateCNot<float>::Create(gate_index_++, q1.value, q2.value));
 }
 void QsimQuantum::cnot(Qubit q1, Qubit q2)
 {
-    q_circuit.gates.push_back(qsim::GateCNot<float>::Create(
-        execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
+    q_circuit.gates.push_back(
+        qsim::GateCNot<float>::Create(gate_index_++, q1.value, q2.value));
 }
 void QsimQuantum::cz(Qubit q1, Qubit q2)
 {
-    q_circuit.gates.push_back(qsim::GateCZ<float>::Create(
-        execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2)));
+    q_circuit.gates.push_back(
+        qsim::GateCZ<float>::Create(gate_index_++, q1.value, q2.value));
 }
 // 2. Local gates
 void QsimQuantum::h(Qubit q)
 {
     q_circuit.gates.push_back(
-        qsim::GateHd<float>::Create(execution_time++, this->getQubitIndex(q)));
+        qsim::GateHd<float>::Create(gate_index_++, q.value));
 }
 void QsimQuantum::s(Qubit q)
 {
     q_circuit.gates.push_back(
-        qsim::GateS<float>::Create(execution_time++, this->getQubitIndex(q)));
+        qsim::GateS<float>::Create(gate_index_++, q.value));
 }
 void QsimQuantum::t(Qubit q)
 {
     q_circuit.gates.push_back(
-        qsim::GateT<float>::Create(execution_time++, this->getQubitIndex(q)));
+        qsim::GateT<float>::Create(gate_index_++, q.value));
 }
 // 2.1 Pauli gates
 void QsimQuantum::x(Qubit q)
 {
     q_circuit.gates.push_back(
-        qsim::GateX<float>::Create(execution_time++, this->getQubitIndex(q)));
+        qsim::GateX<float>::Create(gate_index_++, q.value));
 }
 void QsimQuantum::y(Qubit q)
 {
     q_circuit.gates.push_back(
-        qsim::GateY<float>::Create(execution_time++, this->getQubitIndex(q)));
+        qsim::GateY<float>::Create(gate_index_++, q.value));
 }
 void QsimQuantum::z(Qubit q)
 {
     q_circuit.gates.push_back(
-        qsim::GateZ<float>::Create(execution_time++, this->getQubitIndex(q)));
+        qsim::GateZ<float>::Create(gate_index_++, q.value));
 }
 // 2.2 rotation gates
 void QsimQuantum::rx(double theta, Qubit q)
 {
-    q_circuit.gates.push_back(qsim::GateRX<float>::Create(
-        execution_time++, this->getQubitIndex(q), theta));
+    q_circuit.gates.push_back(
+        qsim::GateRX<float>::Create(gate_index_++, q.value, theta));
 }
 void QsimQuantum::ry(double theta, Qubit q)
 {
-    q_circuit.gates.push_back(qsim::GateRY<float>::Create(
-        execution_time++, this->getQubitIndex(q), theta));
+    q_circuit.gates.push_back(
+        qsim::GateRY<float>::Create(gate_index_++, q.value, theta));
 }
 void QsimQuantum::rz(double theta, Qubit q)
 {
-    q_circuit.gates.push_back(qsim::GateRZ<float>::Create(
-        execution_time++, this->getQubitIndex(q), theta));
+    q_circuit.gates.push_back(
+        qsim::GateRZ<float>::Create(gate_index_++, q.value, theta));
 }
 
 Qubit QsimQuantum::result_to_qubit(Result r)
diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
index f2d10c6..e42507f 100644
--- a/src/qirqsim/QsimQuantum.hh
+++ b/src/qirqsim/QsimQuantum.hh
@@ -22,12 +22,15 @@
 
 namespace qiree
 {
+//---------------------------------------------------------------------------//
+/*!
+ * Create and execute quantum circuits using google Qsim.
+ */
 class QsimQuantum final : virtual public QuantumNotImpl
 {
   public:
-    // Define constructors and destructors
     // Construct with number of shots
-    QsimQuantum(std::ostream& os, size_type shots);  
+    QsimQuantum(std::ostream& os, size_type shots);
 
     QIREE_DELETE_COPY_MOVE(QsimQuantum);  // Delete copy and move constructors
 
@@ -35,12 +38,6 @@ class QsimQuantum final : virtual public QuantumNotImpl
     //! \name Accessors
     size_type num_results() const { return result_to_qubit_.size(); }
     size_type num_qubits() const { return num_qubits_; }
-
-    unsigned getQubitIndex(Qubit q)
-    {
-        return static_cast<unsigned>(q.value);  // Return the value of the
-                                                // qubit
-    }
     //!@}
 
     //!@{
@@ -63,10 +60,6 @@ class QsimQuantum final : virtual public QuantumNotImpl
     // Get runtime qubit corresponding to a runtime result
     Qubit result_to_qubit(Result);
 
-    // Wrapper for qsim
-    // std::map<std::string, int>
-    // get_marginal_counts(std::vector<Qubit> const& qubits);
-
     // Run the circuit on the accelerator if we have not already. Returns true
     // if the circuit was executed.
     void execute_if_needed();
@@ -100,9 +93,6 @@ class QsimQuantum final : virtual public QuantumNotImpl
 
     // Update the buffer
     BufferManager manager;
-    // Number of repetitions
-    int repetition;
-    void repCount(int rep);
 
   private:
 
@@ -112,23 +102,14 @@ class QsimQuantum final : virtual public QuantumNotImpl
     using StateSpace = Simulator::StateSpace;
     using State = StateSpace::State;
 
-    enum class Endianness
-    {
-        little,
-        big
-    };
-    
-    unsigned numThreads;  // Number of threads to use
-    unsigned max_fused_size;  // Maximum size of fused gates
+    unsigned num_threads_;  // Number of threads to use
     qsim::Circuit<qsim::GateQSim<float>> q_circuit;  // Quantum circuit object
 
     unsigned long int seed_;
-    size_t execution_time;  // when the quantum operation will be executed
+    size_t gate_index_;  // when the quantum operation will be executed
 
-    bool executed;
     size_type num_qubits_{};
     std::vector<Qubit> result_to_qubit_;
-    Endianness endian_;
 
     std::ostream& output_;
     std::shared_ptr<Simulator> simulator_;

From aadbb8a4e586bbbab04b000c81c0a3691caa0d2f Mon Sep 17 00:00:00 2001
From: Seth R Johnson <johnsonsr@ornl.gov>
Date: Wed, 27 Nov 2024 11:33:31 -0500
Subject: [PATCH 20/64] Use PIMPL

---
 src/qirqsim/QsimQuantum.cc      | 81 ++++++++++++++++++++-------------
 src/qirqsim/QsimQuantum.hh      | 18 ++++----
 src/qirqsim/QsimTupleRuntime.cc |  1 +
 3 files changed, 59 insertions(+), 41 deletions(-)

diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc
index ce0834a..c1e2770 100644
--- a/src/qirqsim/QsimQuantum.cc
+++ b/src/qirqsim/QsimQuantum.cc
@@ -9,7 +9,6 @@
 #include "QsimQuantum.hh"
 
 #include <algorithm>
-#include <cassert>
 #include <iostream>
 #include <optional>
 #include <stdexcept>
@@ -37,26 +36,44 @@
 
 namespace qiree
 {
-
-struct Factory
-{  // Factory class for creating simulators in qsim
+//---------------------------------------------------------------------------//
+/*!
+ * Factory class for creating simulators in qsim.
+ */
+struct QsimQuantum::Factory
+{
     Factory(unsigned num_threads) : num_threads(num_threads) {}
     using Simulator = qsim::Simulator<qsim::For>;
     using StateSpace = Simulator::StateSpace;
+
     StateSpace CreateStateSpace() const { return StateSpace(num_threads); }
     Simulator CreateSimulator() const { return Simulator(num_threads); }
     unsigned num_threads;
 };
 
+//---------------------------------------------------------------------------//
+/*!
+ * Quantum state and circuit.
+ */
+struct QsimQuantum::State
+{
+    qsim::Circuit<qsim::GateQSim<float>> circuit;
+    Factory::StateSpace::State state;
+};
+
 //---------------------------------------------------------------------------//
 /*!
  * Initialize the qsim simulator
  */
 QsimQuantum::QsimQuantum(std::ostream& os, unsigned long int seed)
-    : output_(os), seed_(seed)
+    : output_(os), seed_(seed), state_{std::make_unique<State>()}
 {
 }
 
+//---------------------------------------------------------------------------//
+//! Default destructor
+QsimQuantum::~QsimQuantum() = default;
+
 //---------------------------------------------------------------------------//
 /*!
  * Prepare to build a quantum circuit for an entry point
@@ -76,11 +93,12 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs)
         = std::max(1, static_cast<int>(std::thread::hardware_concurrency()));
 
     // Initialize the qsim simulator
-    QsimQuantum::StateSpace state_space
-        = Factory(num_threads_).CreateStateSpace();  // Create the state space
+    auto state_space = Factory(num_threads_).CreateStateSpace();  // Create the
+                                                                  // state
+                                                                  // space
 
     // Create the state
-    State state = state_space.Create(this->num_qubits());
+    state_->state = state_space.Create(this->num_qubits());
     // Check if the state is null
     QIREE_VALIDATE(!state_space.IsNull(state),
                    << "not enough memory: is the number of qubits too large?");
@@ -88,10 +106,8 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs)
     state_space.SetStateZero(state);  // Set the state to zero, TODO: the
                                       // initial state is not necessarily zero
 
-    state_ = std::make_shared<State>(std::move(state));
-
     // Allocate the number of qubits in the circuit
-    q_circuit.num_qubits = num_qubits_;
+    state_->circuit.num_qubits = num_qubits_;
     gate_index_ = 0;  // Initialize execution time
 }
 
@@ -101,7 +117,7 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs)
  */
 void QsimQuantum::tear_down()
 {
-    q_circuit = {};
+    state_->circuit = {};
 }
 
 //---------------------------------------------------------------------------//
@@ -134,18 +150,21 @@ QState QsimQuantum::read_result(Result r)
     qsimParam.verbosity = 0;  // see verbosity in run_qsim.h
 
     // Run the simulation
-    bool const run_success = Runner::Run(
-        qsimParam, Factory(num_threads_), q_circuit, *state_, meas_results);
+    bool const run_success = Runner::Run(qsimParam,
+                                         Factory(num_threads_),
+                                         state_->circuit,
+                                         state_->state,
+                                         meas_results);
 
-    assert(run_success);  // Ensure the run was successful
+    QIREE_ASSERT(run_success);  // Ensure the run was successful
     // reset circuit here
-    q_circuit = {};
-    q_circuit.num_qubits = num_qubits_;
+    state_->circuit = {};
+    state_->circuit.num_qubits = num_qubits_;
 
     if (meas_results.size() == 1 && meas_results[0].bitstring.size() == 1)
     {
         auto const bitResult = meas_results[0].bitstring[0];
-        assert(bitResult == 0 || bitResult == 1);
+        QIREE_ASSERT(bitResult == 0 || bitResult == 1);
         std::string stringResult = std::to_string(bitResult);
         std::string q_index_string = std::to_string(r.value);
         if (stringResult == "1")
@@ -184,7 +203,7 @@ void QsimQuantum::mz(Qubit q, Result r)
     // TODO: maybe not what we want long term
     QIREE_EXPECT(q.value == r.value);
     // Add measurement instruction
-    this->q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::gate::Measurement<qsim::GateQSim<float>>::Create(gate_index_++,
                                                                {q.value}));
 }
@@ -197,65 +216,65 @@ void QsimQuantum::mz(Qubit q, Result r)
 // 1. Entangling gates
 void QsimQuantum::cx(Qubit q1, Qubit q2)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateCNot<float>::Create(gate_index_++, q1.value, q2.value));
 }
 void QsimQuantum::cnot(Qubit q1, Qubit q2)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateCNot<float>::Create(gate_index_++, q1.value, q2.value));
 }
 void QsimQuantum::cz(Qubit q1, Qubit q2)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateCZ<float>::Create(gate_index_++, q1.value, q2.value));
 }
 // 2. Local gates
 void QsimQuantum::h(Qubit q)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateHd<float>::Create(gate_index_++, q.value));
 }
 void QsimQuantum::s(Qubit q)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateS<float>::Create(gate_index_++, q.value));
 }
 void QsimQuantum::t(Qubit q)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateT<float>::Create(gate_index_++, q.value));
 }
 // 2.1 Pauli gates
 void QsimQuantum::x(Qubit q)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateX<float>::Create(gate_index_++, q.value));
 }
 void QsimQuantum::y(Qubit q)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateY<float>::Create(gate_index_++, q.value));
 }
 void QsimQuantum::z(Qubit q)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateZ<float>::Create(gate_index_++, q.value));
 }
 // 2.2 rotation gates
 void QsimQuantum::rx(double theta, Qubit q)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateRX<float>::Create(gate_index_++, q.value, theta));
 }
 void QsimQuantum::ry(double theta, Qubit q)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateRY<float>::Create(gate_index_++, q.value, theta));
 }
 void QsimQuantum::rz(double theta, Qubit q)
 {
-    q_circuit.gates.push_back(
+    state_->circuit.gates.push_back(
         qsim::GateRZ<float>::Create(gate_index_++, q.value, theta));
 }
 
diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
index e42507f..1292e8b 100644
--- a/src/qirqsim/QsimQuantum.hh
+++ b/src/qirqsim/QsimQuantum.hh
@@ -7,9 +7,6 @@
 //---------------------------------------------------------------------------//
 #pragma once
 
-#include <cassert>
-#include <initializer_list>
-#include <map>
 #include <memory>
 #include <ostream>
 #include <vector>
@@ -31,6 +28,7 @@ class QsimQuantum final : virtual public QuantumNotImpl
   public:
     // Construct with number of shots
     QsimQuantum(std::ostream& os, size_type shots);
+    ~QsimQuantum();
 
     QIREE_DELETE_COPY_MOVE(QsimQuantum);  // Delete copy and move constructors
 
@@ -98,12 +96,12 @@ class QsimQuantum final : virtual public QuantumNotImpl
 
     //// TYPES ////
 
-    using Simulator = qsim::Simulator<qsim::For>;
-    using StateSpace = Simulator::StateSpace;
-    using State = StateSpace::State;
+    struct Factory;
+    struct State;
+
+    //// DATA ////
 
     unsigned num_threads_;  // Number of threads to use
-    qsim::Circuit<qsim::GateQSim<float>> q_circuit;  // Quantum circuit object
 
     unsigned long int seed_;
     size_t gate_index_;  // when the quantum operation will be executed
@@ -112,9 +110,9 @@ class QsimQuantum final : virtual public QuantumNotImpl
     std::vector<Qubit> result_to_qubit_;
 
     std::ostream& output_;
-    std::shared_ptr<Simulator> simulator_;
-    std::shared_ptr<StateSpace> statespace_;
-    std::shared_ptr<State> state_;
+
+    // Quantum circuit, simulator, and measured results
+    std::unique_ptr<State> state_;
 };
 
 }  // namespace qiree
diff --git a/src/qirqsim/QsimTupleRuntime.cc b/src/qirqsim/QsimTupleRuntime.cc
index 34a7440..aa06798 100644
--- a/src/qirqsim/QsimTupleRuntime.cc
+++ b/src/qirqsim/QsimTupleRuntime.cc
@@ -7,6 +7,7 @@
 //---------------------------------------------------------------------------//
 #include "QsimTupleRuntime.hh"
 
+#include
 #include "qiree/Assert.hh"
 
 namespace qiree

From 6f13b04356c134acd0c1e1f0ba2e01b75cac3745 Mon Sep 17 00:00:00 2001
From: Seth R Johnson <johnsonsr@ornl.gov>
Date: Wed, 27 Nov 2024 11:38:32 -0500
Subject: [PATCH 21/64] Delete tuple runtime, fix errors

---
 app/qir-qsim.cc                 |  32 +++-----
 src/qirqsim/CMakeLists.txt      |   1 -
 src/qirqsim/QsimQuantum.cc      |  16 ++--
 src/qirqsim/QsimQuantum.hh      |  14 ++--
 src/qirqsim/QsimTupleRuntime.cc | 128 --------------------------------
 src/qirqsim/QsimTupleRuntime.hh |  93 -----------------------
 6 files changed, 24 insertions(+), 260 deletions(-)
 delete mode 100644 src/qirqsim/QsimTupleRuntime.cc
 delete mode 100644 src/qirqsim/QsimTupleRuntime.hh

diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc
index 5493f26..92f4669 100644
--- a/app/qir-qsim.cc
+++ b/app/qir-qsim.cc
@@ -12,17 +12,12 @@
 #include <CLI/CLI.hpp>
 
 #include "qiree_version.h"
-#include "qiree/Executor.hh"
-#include "qiree/Module.hh"
-#include "qiree/QuantumNotImpl.hh"
 
 #include "qiree/Executor.hh"
 #include "qiree/Module.hh"
 #include "qiree/QuantumNotImpl.hh"
-
 #include "qirqsim/QsimDefaultRuntime.hh"
 #include "qirqsim/QsimQuantum.hh"
-#include "qirqsim/QsimTupleRuntime.hh"
 
 using namespace std::string_view_literals;
 
@@ -36,22 +31,17 @@ void run(std::string const& filename,
 {
     // Load the input
     Executor execute{Module{filename}};
-    
+
     // Set up qsim
     QsimQuantum sim(std::cout, 0);
-    
-    // Collect the statistics 
+
+    // Collect the statistics
     std::unique_ptr<RuntimeInterface> rt;
-    //if (group_tuples){
-    //    rt = std::make_unique<QsimTupleRuntime>(
-    //        std::cout, sim);
-    //} else {
-        rt = std::make_unique<QsimDefaultRuntime>(
-            std::cout, sim);
-    //}
+    rt = std::make_unique<QsimDefaultRuntime>(std::cout, sim);
 
     // Run several time = shots (default 1)
-    for (int i = 0; i < num_shots; i++){    
+    for (int i = 0; i < num_shots; i++)
+    {
         execute(sim, *rt);
     }
 
@@ -60,7 +50,7 @@ void run(std::string const& filename,
     std::cout << "-------------------" << std::endl;
     std::cout << "Number of shots: " << num_shots << std::endl;
     std::cout << "Number of qubits: " << sim.num_qubits() << std::endl;
-    
+
     for(int q_index = 0; q_index < sim.num_qubits(); q_index++){
         int value_0 = 0;
         int value_1 = 0;
@@ -83,7 +73,7 @@ int main(int argc, char* argv[])
     int num_shots{1};
     std::string filename;
     //bool group_tuples{false};
- 
+
     CLI::App app;
 
     auto* filename_opt
@@ -93,16 +83,16 @@ int main(int argc, char* argv[])
     auto* nshot_opt
         = app.add_option("-s,--shots", num_shots, "Number of shots");
     nshot_opt->capture_default_str();
-    
+
     //app.add_flag("--group-tuples,!--no-group-tuples",
     //            group_tuples,
     //            "Print per-tuple measurement statistics rather than "
     //            "per-qubit");
-    
+
     CLI11_PARSE(app, argc, argv);
 
     //qiree::app::run(filename, num_shots, group_tuples);
     qiree::app::run(filename, num_shots);
-        
+
     return EXIT_SUCCESS;
 }
diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt
index c75b054..f0c34d6 100644
--- a/src/qirqsim/CMakeLists.txt
+++ b/src/qirqsim/CMakeLists.txt
@@ -8,7 +8,6 @@
 qiree_add_library(qirqsim
   QsimQuantum.cc
   QsimDefaultRuntime.cc
-  QsimTupleRuntime.cc
   BufferManager.cc
 )
 
diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc
index c1e2770..b71ccf4 100644
--- a/src/qirqsim/QsimQuantum.cc
+++ b/src/qirqsim/QsimQuantum.cc
@@ -58,7 +58,7 @@ struct QsimQuantum::Factory
 struct QsimQuantum::State
 {
     qsim::Circuit<qsim::GateQSim<float>> circuit;
-    Factory::StateSpace::State state;
+    std::optional<Factory::StateSpace::State> state;
 };
 
 //---------------------------------------------------------------------------//
@@ -100,11 +100,11 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs)
     // Create the state
     state_->state = state_space.Create(this->num_qubits());
     // Check if the state is null
-    QIREE_VALIDATE(!state_space.IsNull(state),
+    QIREE_VALIDATE(!state_space.IsNull(*state_->state),
                    << "not enough memory: is the number of qubits too large?");
 
-    state_space.SetStateZero(state);  // Set the state to zero, TODO: the
-                                      // initial state is not necessarily zero
+    // TODO: initial states shouldn't necessarily be zero
+    state_space.SetStateZero(*state_->state);
 
     // Allocate the number of qubits in the circuit
     state_->circuit.num_qubits = num_qubits_;
@@ -137,7 +137,7 @@ QState QsimQuantum::read_result(Result r)
 {
     using Fuser = qsim::MultiQubitGateFuser<qsim::IO, qsim::GateQSim<float>>;
     using Runner = qsim::QSimRunner<qsim::IO, Fuser, Factory>;
-    using VecMeas = std::vector<StateSpace::MeasurementResult>;
+    using StateSpace = Factory::StateSpace;
 
     // Vector to hold measurement results, this must be empty before running
     std::vector<StateSpace::MeasurementResult> meas_results;
@@ -153,7 +153,7 @@ QState QsimQuantum::read_result(Result r)
     bool const run_success = Runner::Run(qsimParam,
                                          Factory(num_threads_),
                                          state_->circuit,
-                                         state_->state,
+                                         *state_->state,
                                          meas_results);
 
     QIREE_ASSERT(run_success);  // Ensure the run was successful
@@ -204,8 +204,8 @@ void QsimQuantum::mz(Qubit q, Result r)
     QIREE_EXPECT(q.value == r.value);
     // Add measurement instruction
     state_->circuit.gates.push_back(
-        qsim::gate::Measurement<qsim::GateQSim<float>>::Create(gate_index_++,
-                                                               {q.value}));
+        qsim::gate::Measurement<qsim::GateQSim<float>>::Create(
+            gate_index_++, {static_cast<unsigned int>(q.value)}));
 }
 
 //---------------------------------------------------------------------------//
diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
index 1292e8b..94813e6 100644
--- a/src/qirqsim/QsimQuantum.hh
+++ b/src/qirqsim/QsimQuantum.hh
@@ -27,7 +27,7 @@ class QsimQuantum final : virtual public QuantumNotImpl
 {
   public:
     // Construct with number of shots
-    QsimQuantum(std::ostream& os, size_type shots);
+    QsimQuantum(std::ostream& os, unsigned long int shots);
     ~QsimQuantum();
 
     QIREE_DELETE_COPY_MOVE(QsimQuantum);  // Delete copy and move constructors
@@ -101,18 +101,14 @@ class QsimQuantum final : virtual public QuantumNotImpl
 
     //// DATA ////
 
-    unsigned num_threads_;  // Number of threads to use
+    std::ostream& output_;
+    unsigned long int seed_{};
+    std::unique_ptr<State> state_;
 
-    unsigned long int seed_;
+    unsigned num_threads_{};  // Number of threads to use
     size_t gate_index_;  // when the quantum operation will be executed
-
     size_type num_qubits_{};
     std::vector<Qubit> result_to_qubit_;
-
-    std::ostream& output_;
-
-    // Quantum circuit, simulator, and measured results
-    std::unique_ptr<State> state_;
 };
 
 }  // namespace qiree
diff --git a/src/qirqsim/QsimTupleRuntime.cc b/src/qirqsim/QsimTupleRuntime.cc
deleted file mode 100644
index aa06798..0000000
--- a/src/qirqsim/QsimTupleRuntime.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
-// See the top-level COPYRIGHT file for details.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//---------------------------------------------------------------------------//
-//! \file qirqsim/QsimTupleRuntime.cc
-//---------------------------------------------------------------------------//
-#include "QsimTupleRuntime.hh"
-
-#include
-#include "qiree/Assert.hh"
-
-namespace qiree
-{
-//---------------------------------------------------------------------------//
-/*!
- * Initialize the execution environment, resetting qubits.
- */
-void QsimTupleRuntime::initialize(OptionalCString env)
-{
-    if (env)
-    {
-        output_ << "Argument to initialize: " << env << std::endl;
-    }
-}
-
-//---------------------------------------------------------------------------//
-/*!
- * Execute circuit and mark the following N results as being part of an array
- * named tag
- */
-void QsimTupleRuntime::array_record_output(size_type s, OptionalCString tag)
-{
-    execute_if_needed();
-    start_tracking(GroupingType::array, tag, s);
-}
-
-//---------------------------------------------------------------------------//
-/*!
- * Execute circuit and mark the following N results as being part of a tuple
- * named tag
- */
-void QsimTupleRuntime::tuple_record_output(size_type s, OptionalCString tag)
-{
-    execute_if_needed();
-    start_tracking(GroupingType::tuple, tag, s);
-}
-
-//---------------------------------------------------------------------------//
-/*!
- * Execute circuit and report a single measurement result
- */
-void QsimTupleRuntime::result_record_output(Result r, OptionalCString tag)
-{
-    execute_if_needed();
-    Qubit q = sim_.result_to_qubit(r);
-    push_result(q);
-}
-
-//---------------------------------------------------------------------------//
-// PRIVATE FUNCTIONS
-//---------------------------------------------------------------------------//
-
-void QsimTupleRuntime::execute_if_needed()
-{
-    /*
-    if (sim_.execute_if_needed() && print_accelbuf_)
-    {
-        sim_.print_accelbuf();
-    }
-    */
-}
-
-void QsimTupleRuntime::start_tracking(GroupingType type,
-                                      std::string tag,
-                                      size_type num_results)
-{
-    QIREE_EXPECT(!valid_);
-    valid_ = true;
-    type_ = type;
-    tag_ = tag;
-    num_results_ = num_results;
-    qubits_.clear();
-
-    if (!num_results_)
-    {
-        // Edge case
-        print_header(0);
-        valid_ = false;
-    }
-}
-
-void QsimTupleRuntime::push_result(Qubit q)
-{
-    QIREE_EXPECT(valid_);
-    QIREE_EXPECT(qubits_.size() < num_results_);
-    qubits_.push_back(q);
-    if (qubits_.size() == num_results_)
-    {
-        finish_tuple();
-    }
-}
-
-void QsimTupleRuntime::print_header(size_type num_distinct)
-{
-    auto name = get_name();
-    output_ << name << " " << tag_ << " length " << qubits_.size()
-            << " distinct results " << num_distinct << std::endl;
-}
-
-void QsimTupleRuntime::finish_tuple()
-{
-    // auto counts = sim_.get_marginal_counts(qubits_);
-    std::map<std::string, int> counts = {{"0", 0}, {"1", 0}};  // Placeholder
-                                                               // for actual
-                                                               // counts, TODO:
-                                                               // replace with
-                                                               // actual counts
-    print_header(counts.size());
-    auto name = get_name();
-    for (auto& [bits, count] : counts)
-    {
-        output_ << name << " " << tag_ << " result " << bits << " count "
-                << count << std::endl;
-    }
-    valid_ = false;
-}
-}  // namespace qiree
diff --git a/src/qirqsim/QsimTupleRuntime.hh b/src/qirqsim/QsimTupleRuntime.hh
deleted file mode 100644
index d6cafbe..0000000
--- a/src/qirqsim/QsimTupleRuntime.hh
+++ /dev/null
@@ -1,93 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
-// See the top-level COPYRIGHT file for details.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//---------------------------------------------------------------------------//
-//! \file qirqsim/QsimTupleRuntime.hh
-//---------------------------------------------------------------------------//
-#pragma once
-
-#include "QsimQuantum.hh"
-
-namespace qiree
-{
-
-/*!
- * Print per-tuple (or per-array) measurement statistics. (Compare with \ref
- * QsimDefaultRuntime.)
- *
- * Example:
- * \code
- * tuple ret length 2 distinct results 2
- * tuple ret result 00 count 512
- * tuple ret result 11 count 512
- * \endcode
- */
-
-class QsimTupleRuntime final : virtual public RuntimeInterface
-{
-  public:
-    /*!
-     * Construct an \c QsimTupleRuntime.
-     * The \c print_accelbuf argument determines whether the qsim \c
-     * AcceleratorBuffer is dumped after execution.
-     */
-    QsimTupleRuntime(std::ostream& output,
-                     QsimQuantum& sim,
-                     bool print_accelbuf = true)
-        : output_(output)
-        , sim_(sim)
-        , print_accelbuf_(print_accelbuf)
-        , valid_(false)
-    {
-    }
-
-    //!@{
-    //! \name Runtime interface
-    // Initialize the execution environment, resetting qubits
-    void initialize(OptionalCString env) override;
-
-    // Execute circuit and mark the following N results as being part of an
-    // array named tag
-    void array_record_output(size_type, OptionalCString tag) final;
-
-    // Execute circuit and mark the following N results as being part of a
-    // tuple named tag
-    void tuple_record_output(size_type, OptionalCString) final;
-
-    // Execute circuit and report a single measurement result
-    void result_record_output(Result result, OptionalCString tag) final;
-    //!@}
-
-  private:
-    enum class GroupingType
-    {
-        tuple,
-        array,
-    };
-
-    std::ostream& output_;
-    QsimQuantum& sim_;
-    bool const print_accelbuf_;
-    bool valid_;
-    GroupingType type_;
-    std::string tag_;
-    size_type num_results_;
-    std::vector<Qubit> qubits_;
-
-    void execute_if_needed();
-    void
-    start_tracking(GroupingType type, std::string tag, size_type num_results);
-    void push_result(Qubit q);
-    void print_header(size_type num_distinct);
-    void finish_tuple();
-
-    inline std::string get_name()
-    {
-        return type_ == GroupingType::tuple   ? "tuple"
-               : type_ == GroupingType::array ? "array"
-                                              : "grouping";
-    }
-};
-
-}  // namespace qiree

From 35778add90605e1624898aae4e135c48acc7dcb8 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Wed, 27 Nov 2024 11:44:03 -0500
Subject: [PATCH 22/64] Fix qsim test

---
 test/qirqsim/QsimQuantum.test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/qirqsim/QsimQuantum.test.cc b/test/qirqsim/QsimQuantum.test.cc
index e01dca7..21395cc 100644
--- a/test/qirqsim/QsimQuantum.test.cc
+++ b/test/qirqsim/QsimQuantum.test.cc
@@ -43,7 +43,7 @@ TEST_F(QsimQuantumTest, sim_dynamicbv)
     os << '\n';
 
     // Create a simulator that will write to the string stream
-    QsimQuantum qsim_sim{os, 1};
+    QsimQuantum qsim_sim{os, 0};
     QsimDefaultRuntime qsim_rt{os, qsim_sim};
 
     // Call functions in the same sequence that dynamicbv.ll would

From bdd35c413bb5bb21588adf5dd7201b77d92ba34b Mon Sep 17 00:00:00 2001
From: Vicente <vicenley@gmail.com>
Date: Mon, 23 Dec 2024 13:18:40 -0500
Subject: [PATCH 23/64] including OutputDistribution into the libs list

---
 src/qiree/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/qiree/CMakeLists.txt b/src/qiree/CMakeLists.txt
index 1c2be37..b19ff14 100644
--- a/src/qiree/CMakeLists.txt
+++ b/src/qiree/CMakeLists.txt
@@ -19,6 +19,7 @@ qiree_add_library(qiree
   Module.cc
   Executor.cc
   QuantumNotImpl.cc
+  OutputDistribution.cc
 )
 target_compile_features(qiree PUBLIC cxx_std_17)
 target_link_libraries(qiree

From cb373bad6331a45c13c3fe1abbc2464785fdffbf Mon Sep 17 00:00:00 2001
From: Vicente <vicenley@gmail.com>
Date: Mon, 23 Dec 2024 13:19:14 -0500
Subject: [PATCH 24/64] old BufferManager now in qiree namespace

---
 src/qiree/OutputDistribution.cc | 64 ++++++++++++++++++++++++++++++++
 src/qiree/OutputDistribution.hh | 65 +++++++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+)
 create mode 100644 src/qiree/OutputDistribution.cc
 create mode 100644 src/qiree/OutputDistribution.hh

diff --git a/src/qiree/OutputDistribution.cc b/src/qiree/OutputDistribution.cc
new file mode 100644
index 0000000..0d0297f
--- /dev/null
+++ b/src/qiree/OutputDistribution.cc
@@ -0,0 +1,64 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qiree/Buffer.hh
+//---------------------------------------------------------------------------//
+
+#include "OutputDistribution.hh"
+
+#include <optional>
+#include <string>
+#include <unordered_map>
+
+namespace qiree
+{
+
+
+
+void Buffer::updateBuffer(std::string const& qubit,
+                                 std::string const& state,
+                                 int const& value)
+{
+    // Insert or update the key-value pair in the buffer
+    std::pair<std::string, std::string> searchKey = {qubit, state};
+    int current_frequency = 0;
+    auto it = buffer.find(searchKey);
+    if (it != buffer.end())
+    {
+        current_frequency = it->second;
+    }
+    // Accumulate counts with every shot
+    buffer[{qubit, state}] = value + current_frequency;
+}
+
+void Buffer::updateBuffer(std::string const& key, int const& value)
+{
+    // Insert or update the key-value pair in the buffer
+    simple_buffer[key] = value;
+}
+
+std::optional<int> Buffer::getBufferValue(std::string const& qubit,
+                                                 std::string const& state) const
+{
+    std::pair<std::string, std::string> searchKey = {qubit, state};
+    auto it = buffer.find(searchKey);
+    if (it != buffer.end())
+    {
+        return it->second;  // Key found
+    }
+    return std::nullopt;  // Key not found
+}
+
+std::optional<int> Buffer::getBufferValue(std::string const& key) const
+{
+    auto it = simple_buffer.find(key);
+    if (it != simple_buffer.end())
+    {
+        return it->second;  // Key found
+    }
+    return std::nullopt;  // Key not found
+}
+
+}  // namespace qiree
\ No newline at end of file
diff --git a/src/qiree/OutputDistribution.hh b/src/qiree/OutputDistribution.hh
new file mode 100644
index 0000000..398c88e
--- /dev/null
+++ b/src/qiree/OutputDistribution.hh
@@ -0,0 +1,65 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qiree/Buffer.hh
+//---------------------------------------------------------------------------//
+
+#pragma once
+
+#include <functional>
+#include <optional>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+namespace qiree
+{
+
+// Define a hash function for std::pair
+
+struct pair_hash
+{
+    template<class T1, class T2>
+    std::size_t operator()(std::pair<T1, T2> const& pair) const
+    {
+        auto hash1 = std::hash<T1>{}(pair.first);
+        auto hash2 = std::hash<T2>{}(pair.second);
+        // Combine the two hash values
+        return hash1 ^ (hash2 << 1);  // Shift and XOR
+    }
+};
+
+class Buffer
+{
+  public:
+    // Method to update the buffer with a key-value pair
+    // TODO: Don't use strings here
+    void updateBuffer(std::string const& qubit,
+                      std::string const& state,
+                      int const& value);
+    void updateBuffer(std::string const& key, int const& value);
+
+    // Retrieve buffer value for storage or evaluation
+    std::optional<int>
+    getBufferValue(std::string const& qubit, std::string const& state) const;
+    std::optional<int> getBufferValue(std::string const& key) const;
+
+  private:
+    // Dictionary to store key-value pairs
+    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> buffer;
+    std::unordered_map<std::string, int> simple_buffer;
+};
+
+// BUFFER_H
+
+} // namespace qiree
+
+
+
+
+
+
+
+

From 181737d6c7850a6fa388493036289d93044c8b28 Mon Sep 17 00:00:00 2001
From: Vicente <vicenley@gmail.com>
Date: Mon, 23 Dec 2024 13:19:47 -0500
Subject: [PATCH 25/64] deleting the old BufferManager

---
 src/qirqsim/BufferManager.cc | 57 ------------------------------------
 src/qirqsim/BufferManager.hh | 52 --------------------------------
 2 files changed, 109 deletions(-)
 delete mode 100644 src/qirqsim/BufferManager.cc
 delete mode 100644 src/qirqsim/BufferManager.hh

diff --git a/src/qirqsim/BufferManager.cc b/src/qirqsim/BufferManager.cc
deleted file mode 100644
index b340604..0000000
--- a/src/qirqsim/BufferManager.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
-// See the top-level COPYRIGHT file for details.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//---------------------------------------------------------------------------//
-//! \file qirqsim/BufferManager.hh
-//---------------------------------------------------------------------------//
-
-#include "BufferManager.hh"
-
-#include <optional>
-#include <string>
-#include <unordered_map>
-
-void BufferManager::updateBuffer(std::string const& qubit,
-                                 std::string const& state,
-                                 int const& value)
-{
-    // Insert or update the key-value pair in the buffer
-    std::pair<std::string, std::string> searchKey = {qubit, state};
-    int current_frequency = 0;
-    auto it = buffer.find(searchKey);
-    if (it != buffer.end())
-    {
-        current_frequency = it->second;
-    }
-    // Accumulate counts with every shot
-    buffer[{qubit, state}] = value + current_frequency;
-}
-
-void BufferManager::updateBuffer(std::string const& key, int const& value)
-{
-    // Insert or update the key-value pair in the buffer
-    simple_buffer[key] = value;
-}
-
-std::optional<int> BufferManager::getBufferValue(std::string const& qubit,
-                                                 std::string const& state) const
-{
-    std::pair<std::string, std::string> searchKey = {qubit, state};
-    auto it = buffer.find(searchKey);
-    if (it != buffer.end())
-    {
-        return it->second;  // Key found
-    }
-    return std::nullopt;  // Key not found
-}
-
-std::optional<int> BufferManager::getBufferValue(std::string const& key) const
-{
-    auto it = simple_buffer.find(key);
-    if (it != simple_buffer.end())
-    {
-        return it->second;  // Key found
-    }
-    return std::nullopt;  // Key not found
-}
diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh
deleted file mode 100644
index deac635..0000000
--- a/src/qirqsim/BufferManager.hh
+++ /dev/null
@@ -1,52 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
-// See the top-level COPYRIGHT file for details.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//---------------------------------------------------------------------------//
-//! \file qirqsim/BufferManager.hh
-//---------------------------------------------------------------------------//
-
-#pragma once
-
-#include <functional>
-#include <optional>
-#include <string>
-#include <unordered_map>
-#include <utility>
-
-// Define a hash function for std::pair
-
-struct pair_hash
-{
-    template<class T1, class T2>
-    std::size_t operator()(std::pair<T1, T2> const& pair) const
-    {
-        auto hash1 = std::hash<T1>{}(pair.first);
-        auto hash2 = std::hash<T2>{}(pair.second);
-        // Combine the two hash values
-        return hash1 ^ (hash2 << 1);  // Shift and XOR
-    }
-};
-
-class BufferManager
-{
-  public:
-    // Method to update the buffer with a key-value pair
-    // TODO: Don't use strings here
-    void updateBuffer(std::string const& qubit,
-                      std::string const& state,
-                      int const& value);
-    void updateBuffer(std::string const& key, int const& value);
-
-    // Retrieve buffer value for storage or evaluation
-    std::optional<int>
-    getBufferValue(std::string const& qubit, std::string const& state) const;
-    std::optional<int> getBufferValue(std::string const& key) const;
-
-  private:
-    // Dictionary to store key-value pairs
-    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> buffer;
-    std::unordered_map<std::string, int> simple_buffer;
-};
-
-// BUFFER_MANAGER_H

From a92aacfb7df5acdaa0ab0010edce3a877f6c4c28 Mon Sep 17 00:00:00 2001
From: Vicente <vicenley@gmail.com>
Date: Mon, 23 Dec 2024 13:20:43 -0500
Subject: [PATCH 26/64] updating libs list

---
 src/qirqsim/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt
index f0c34d6..0d81dec 100644
--- a/src/qirqsim/CMakeLists.txt
+++ b/src/qirqsim/CMakeLists.txt
@@ -8,7 +8,6 @@
 qiree_add_library(qirqsim
   QsimQuantum.cc
   QsimDefaultRuntime.cc
-  BufferManager.cc
 )
 
 #Link the qsim library to qiree and any other relevant libraries

From 040b834b079a6c3a4079d058127a63f2dd23004b Mon Sep 17 00:00:00 2001
From: Vicente <vicenley@gmail.com>
Date: Mon, 23 Dec 2024 13:22:04 -0500
Subject: [PATCH 27/64] updating manager (Buffer)

---
 src/qirqsim/QsimQuantum.hh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
index 94813e6..cde10a8 100644
--- a/src/qirqsim/QsimQuantum.hh
+++ b/src/qirqsim/QsimQuantum.hh
@@ -11,11 +11,11 @@
 #include <ostream>
 #include <vector>
 
-#include "BufferManager.hh"
 #include "qiree/Macros.hh"
 #include "qiree/QuantumNotImpl.hh"
 #include "qiree/RuntimeInterface.hh"
 #include "qiree/Types.hh"
+#include "qiree/OutputDistribution.hh"
 
 namespace qiree
 {
@@ -90,7 +90,7 @@ class QsimQuantum final : virtual public QuantumNotImpl
     //!@}
 
     // Update the buffer
-    BufferManager manager;
+    Buffer manager;
 
   private:
 

From c5d2f04da4baa7d7050d638ac7d3a05556b38af5 Mon Sep 17 00:00:00 2001
From: wongey <25296194+wongey@users.noreply.github.com>
Date: Tue, 14 Jan 2025 14:13:41 -0500
Subject: [PATCH 28/64] REVERT ME Delete OutputDistribution and qsim app
 temporarily

---
 app/CMakeLists.txt              | 14 -----
 app/qir-qsim.cc                 | 98 ---------------------------------
 src/qiree/CMakeLists.txt        |  1 -
 src/qiree/OutputDistribution.cc | 64 ---------------------
 src/qiree/OutputDistribution.hh | 65 ----------------------
 5 files changed, 242 deletions(-)
 delete mode 100644 app/qir-qsim.cc
 delete mode 100644 src/qiree/OutputDistribution.cc
 delete mode 100644 src/qiree/OutputDistribution.hh

diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
index 4bf7330..d0640b4 100644
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -14,20 +14,6 @@ FetchContent_Declare(
 
 FetchContent_MakeAvailable(cli11_proj)
 
-#-----------------------------------------------------------------------------#
-# QSIM FRONT END
-#-----------------------------------------------------------------------------#
-
-if(QIREE_USE_QSIM)
-  qiree_add_executable(qir-qsim
-    qir-qsim.cc
-  )
-  target_link_libraries(qir-qsim
-    PUBLIC QIREE::qiree QIREE::qirqsim
-    PRIVATE CLI11::CLI11
-  )
-endif()
-
 #-----------------------------------------------------------------------------#
 # XACC FRONT END
 #-----------------------------------------------------------------------------#
diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc
deleted file mode 100644
index 92f4669..0000000
--- a/app/qir-qsim.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
-// See the top-level COPYRIGHT file for details.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//---------------------------------------------------------------------------//
-//! \file qir-xacc/qir-xacc.cc
-//---------------------------------------------------------------------------//
-#include <cstdlib>
-#include <iostream>
-#include <string>
-#include <string_view>
-#include <CLI/CLI.hpp>
-
-#include "qiree_version.h"
-
-#include "qiree/Executor.hh"
-#include "qiree/Module.hh"
-#include "qiree/QuantumNotImpl.hh"
-#include "qirqsim/QsimDefaultRuntime.hh"
-#include "qirqsim/QsimQuantum.hh"
-
-using namespace std::string_view_literals;
-
-namespace qiree
-{
-namespace app
-{
-void run(std::string const& filename,
-         int num_shots)
-         // bool group_tuples = false)
-{
-    // Load the input
-    Executor execute{Module{filename}};
-
-    // Set up qsim
-    QsimQuantum sim(std::cout, 0);
-
-    // Collect the statistics
-    std::unique_ptr<RuntimeInterface> rt;
-    rt = std::make_unique<QsimDefaultRuntime>(std::cout, sim);
-
-    // Run several time = shots (default 1)
-    for (int i = 0; i < num_shots; i++)
-    {
-        execute(sim, *rt);
-    }
-
-    std::cout << std::endl;
-    std::cout << "Measurement output:" << std::endl;
-    std::cout << "-------------------" << std::endl;
-    std::cout << "Number of shots: " << num_shots << std::endl;
-    std::cout << "Number of qubits: " << sim.num_qubits() << std::endl;
-
-    for(int q_index = 0; q_index < sim.num_qubits(); q_index++){
-        int value_0 = 0;
-        int value_1 = 0;
-        if (auto value = sim.manager.getBufferValue("q"+std::to_string(q_index), "0"); value.has_value()){ value_0 = value.value();}
-        if (auto value = sim.manager.getBufferValue("q"+std::to_string(q_index), "1"); value.has_value()){ value_1 = value.value();}
-        std::cout << "q" << q_index << " {0: " << value_0 << "," << " 1: " << value_1 << "}\n";
-    }
-}
-
-//---------------------------------------------------------------------------//
-}  // namespace app
-}  // namespace qiree
-
-//---------------------------------------------------------------------------//
-/*!
- * Execute and run.
- */
-int main(int argc, char* argv[])
-{
-    int num_shots{1};
-    std::string filename;
-    //bool group_tuples{false};
-
-    CLI::App app;
-
-    auto* filename_opt
-        = app.add_option("--input,-i,input", filename, "QIR input file");
-    filename_opt->required();
-
-    auto* nshot_opt
-        = app.add_option("-s,--shots", num_shots, "Number of shots");
-    nshot_opt->capture_default_str();
-
-    //app.add_flag("--group-tuples,!--no-group-tuples",
-    //            group_tuples,
-    //            "Print per-tuple measurement statistics rather than "
-    //            "per-qubit");
-
-    CLI11_PARSE(app, argc, argv);
-
-    //qiree::app::run(filename, num_shots, group_tuples);
-    qiree::app::run(filename, num_shots);
-
-    return EXIT_SUCCESS;
-}
diff --git a/src/qiree/CMakeLists.txt b/src/qiree/CMakeLists.txt
index b19ff14..1c2be37 100644
--- a/src/qiree/CMakeLists.txt
+++ b/src/qiree/CMakeLists.txt
@@ -19,7 +19,6 @@ qiree_add_library(qiree
   Module.cc
   Executor.cc
   QuantumNotImpl.cc
-  OutputDistribution.cc
 )
 target_compile_features(qiree PUBLIC cxx_std_17)
 target_link_libraries(qiree
diff --git a/src/qiree/OutputDistribution.cc b/src/qiree/OutputDistribution.cc
deleted file mode 100644
index 0d0297f..0000000
--- a/src/qiree/OutputDistribution.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
-// See the top-level COPYRIGHT file for details.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//---------------------------------------------------------------------------//
-//! \file qiree/Buffer.hh
-//---------------------------------------------------------------------------//
-
-#include "OutputDistribution.hh"
-
-#include <optional>
-#include <string>
-#include <unordered_map>
-
-namespace qiree
-{
-
-
-
-void Buffer::updateBuffer(std::string const& qubit,
-                                 std::string const& state,
-                                 int const& value)
-{
-    // Insert or update the key-value pair in the buffer
-    std::pair<std::string, std::string> searchKey = {qubit, state};
-    int current_frequency = 0;
-    auto it = buffer.find(searchKey);
-    if (it != buffer.end())
-    {
-        current_frequency = it->second;
-    }
-    // Accumulate counts with every shot
-    buffer[{qubit, state}] = value + current_frequency;
-}
-
-void Buffer::updateBuffer(std::string const& key, int const& value)
-{
-    // Insert or update the key-value pair in the buffer
-    simple_buffer[key] = value;
-}
-
-std::optional<int> Buffer::getBufferValue(std::string const& qubit,
-                                                 std::string const& state) const
-{
-    std::pair<std::string, std::string> searchKey = {qubit, state};
-    auto it = buffer.find(searchKey);
-    if (it != buffer.end())
-    {
-        return it->second;  // Key found
-    }
-    return std::nullopt;  // Key not found
-}
-
-std::optional<int> Buffer::getBufferValue(std::string const& key) const
-{
-    auto it = simple_buffer.find(key);
-    if (it != simple_buffer.end())
-    {
-        return it->second;  // Key found
-    }
-    return std::nullopt;  // Key not found
-}
-
-}  // namespace qiree
\ No newline at end of file
diff --git a/src/qiree/OutputDistribution.hh b/src/qiree/OutputDistribution.hh
deleted file mode 100644
index 398c88e..0000000
--- a/src/qiree/OutputDistribution.hh
+++ /dev/null
@@ -1,65 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
-// See the top-level COPYRIGHT file for details.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//---------------------------------------------------------------------------//
-//! \file qiree/Buffer.hh
-//---------------------------------------------------------------------------//
-
-#pragma once
-
-#include <functional>
-#include <optional>
-#include <string>
-#include <unordered_map>
-#include <utility>
-
-namespace qiree
-{
-
-// Define a hash function for std::pair
-
-struct pair_hash
-{
-    template<class T1, class T2>
-    std::size_t operator()(std::pair<T1, T2> const& pair) const
-    {
-        auto hash1 = std::hash<T1>{}(pair.first);
-        auto hash2 = std::hash<T2>{}(pair.second);
-        // Combine the two hash values
-        return hash1 ^ (hash2 << 1);  // Shift and XOR
-    }
-};
-
-class Buffer
-{
-  public:
-    // Method to update the buffer with a key-value pair
-    // TODO: Don't use strings here
-    void updateBuffer(std::string const& qubit,
-                      std::string const& state,
-                      int const& value);
-    void updateBuffer(std::string const& key, int const& value);
-
-    // Retrieve buffer value for storage or evaluation
-    std::optional<int>
-    getBufferValue(std::string const& qubit, std::string const& state) const;
-    std::optional<int> getBufferValue(std::string const& key) const;
-
-  private:
-    // Dictionary to store key-value pairs
-    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> buffer;
-    std::unordered_map<std::string, int> simple_buffer;
-};
-
-// BUFFER_H
-
-} // namespace qiree
-
-
-
-
-
-
-
-

From 10ed1ae013abdd1aa102a46e4885c71ed867e71d Mon Sep 17 00:00:00 2001
From: Seth R Johnson <johnsonsr@ornl.gov>
Date: Tue, 14 Jan 2025 14:18:49 -0500
Subject: [PATCH 29/64] Fix build (some to be reverted)

---
 CMakePresets.json                 | 14 +-------------
 src/qirqsim/QsimDefaultRuntime.cc |  9 ++++++---
 src/qirqsim/QsimQuantum.cc        |  4 +++-
 src/qirqsim/QsimQuantum.hh        |  4 ----
 test/qirqsim/QsimQuantum.test.cc  |  2 ++
 5 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/CMakePresets.json b/CMakePresets.json
index 4d9e63b..8de704f 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -23,18 +23,6 @@
                 },
                 "CMAKE_INSTALL_PREFIX": "${sourceDir}/install-${presetName}"
             }
-        },
-        {
-            "name": "default",
-            "displayName": "Clang 16.0.6 x86_64-pc-linux-gnu",
-            "description": "Using compilers: C = /usr/bin/clang-16, CXX = /usr/bin/clang++-16",
-            "binaryDir": "${sourceDir}/out/build/${presetName}",
-            "cacheVariables": {
-                "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}",
-                "CMAKE_C_COMPILER": "/usr/bin/clang-16",
-                "CMAKE_CXX_COMPILER": "/usr/bin/clang++-16",
-                "CMAKE_BUILD_TYPE": "Debug"
-            }
         }
     ],
     "buildPresets": [
@@ -58,4 +46,4 @@
             }
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/src/qirqsim/QsimDefaultRuntime.cc b/src/qirqsim/QsimDefaultRuntime.cc
index 4ece7c1..d9571f8 100644
--- a/src/qirqsim/QsimDefaultRuntime.cc
+++ b/src/qirqsim/QsimDefaultRuntime.cc
@@ -32,7 +32,7 @@ void QsimDefaultRuntime::initialize(OptionalCString env)
  * named tag
  */
 
-void QsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag)
+void QsimDefaultRuntime::array_record_output(size_type, OptionalCString)
 {
     // this->execute_if_needed();
     // output_ << "array " << (tag ? tag : "<null>") << " length " << s
@@ -45,7 +45,7 @@ void QsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag)
  * named tag
  */
 
-void QsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag)
+void QsimDefaultRuntime::tuple_record_output(size_type, OptionalCString)
 {
     // this->execute_if_needed();
     // output_ << "tuple " << (tag ? tag : "<null>") << " length " << s
@@ -56,18 +56,21 @@ void QsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag)
 /*!
  * Execute circuit and report a single measurement result
  */
-void QsimDefaultRuntime::result_record_output(Result r, OptionalCString tag)
+void QsimDefaultRuntime::result_record_output(Result, OptionalCString)
 {
     // Access values through the getter
     // This prints results every time result_record_output is called
     // Can comment out if only want to see final results
 
+#if 0
     if (auto value = sim_.manager.getBufferValue("q" + std::to_string(r.value));
         value.has_value())
     {
         std::cout << "q" << std::to_string(r.value) << " : " << value.value()
                   << "\n";
     }
+#endif
+    (void)sizeof(sim_);
 }
 
 }  // namespace qiree
diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc
index b71ccf4..c25c297 100644
--- a/src/qirqsim/QsimQuantum.cc
+++ b/src/qirqsim/QsimQuantum.cc
@@ -133,7 +133,7 @@ void QsimQuantum::reset(Qubit q)
 /*!
  * Read the value of a result. This utilizes the new BufferManager.
  */
-QState QsimQuantum::read_result(Result r)
+QState QsimQuantum::read_result(Result)
 {
     using Fuser = qsim::MultiQubitGateFuser<qsim::IO, qsim::GateQSim<float>>;
     using Runner = qsim::QSimRunner<qsim::IO, Fuser, Factory>;
@@ -161,6 +161,7 @@ QState QsimQuantum::read_result(Result r)
     state_->circuit = {};
     state_->circuit.num_qubits = num_qubits_;
 
+#if 0
     if (meas_results.size() == 1 && meas_results[0].bitstring.size() == 1)
     {
         auto const bitResult = meas_results[0].bitstring[0];
@@ -182,6 +183,7 @@ QState QsimQuantum::read_result(Result r)
     {
         qsim::IO::errorf("Unexpected measurement results encountered.");
     }
+#endif
     return static_cast<QState>(meas_results[0].bitstring[0]);
 }
 
diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
index cde10a8..83533da 100644
--- a/src/qirqsim/QsimQuantum.hh
+++ b/src/qirqsim/QsimQuantum.hh
@@ -15,7 +15,6 @@
 #include "qiree/QuantumNotImpl.hh"
 #include "qiree/RuntimeInterface.hh"
 #include "qiree/Types.hh"
-#include "qiree/OutputDistribution.hh"
 
 namespace qiree
 {
@@ -89,9 +88,6 @@ class QsimQuantum final : virtual public QuantumNotImpl
     void z(Qubit) final;
     //!@}
 
-    // Update the buffer
-    Buffer manager;
-
   private:
 
     //// TYPES ////
diff --git a/test/qirqsim/QsimQuantum.test.cc b/test/qirqsim/QsimQuantum.test.cc
index 21395cc..b9ef992 100644
--- a/test/qirqsim/QsimQuantum.test.cc
+++ b/test/qirqsim/QsimQuantum.test.cc
@@ -90,10 +90,12 @@ TEST_F(QsimQuantumTest, sim_dynamicbv)
     qsim_sim.tear_down();
 
     ASSERT_EQ(2, qsim_sim.num_qubits());
+#if 0
     EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q0", "0").value());
     EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q0", "1").value());
     EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q1", "0").value());
     EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q1", "1").value());
+#endif
 }
 
 //---------------------------------------------------------------------------//

From f5f54cc8038539fc78276a332c123cfe3a721b5e Mon Sep 17 00:00:00 2001
From: Seth R Johnson <johnsonsr@ornl.gov>
Date: Tue, 14 Jan 2025 14:36:26 -0500
Subject: [PATCH 30/64] Store results in a buffer for post-entrypoint retrieval

---
 src/qirqsim/QsimQuantum.cc       | 89 ++++++++++++++------------------
 src/qirqsim/QsimQuantum.hh       | 24 ++++++++-
 test/qirqsim/QsimQuantum.test.cc | 20 ++++---
 3 files changed, 74 insertions(+), 59 deletions(-)

diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc
index c25c297..3e4011f 100644
--- a/src/qirqsim/QsimQuantum.cc
+++ b/src/qirqsim/QsimQuantum.cc
@@ -82,10 +82,11 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs)
 {
     QIREE_VALIDATE(attrs.required_num_qubits > 0,
                    << "input is not a quantum program");
+
     // Resize the result_to_qubit_ vector, based on the required number of
     // results... the idea is to have as many classical registers as qubits
     // (probably not true in general)
-    result_to_qubit_.resize(attrs.required_num_results);
+    results_.resize(attrs.required_num_results);
     num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
 
     // Get the number of threads
@@ -129,12 +130,25 @@ void QsimQuantum::reset(Qubit q)
     q.value = 0;
 }
 
-//----------------------------------------------------------------------------//
+//---------------------------------------------------------------------------//
 /*!
- * Read the value of a result. This utilizes the new BufferManager.
+ * Map a qubit to a result index.
+ *
+ * (TODO: find how to link the classical register to the quantum register in
+ * qsim)
  */
-QState QsimQuantum::read_result(Result)
+void QsimQuantum::mz(Qubit q, Result r)
 {
+    QIREE_EXPECT(q.value < this->num_qubits());
+    QIREE_EXPECT(r.value < this->num_results());
+
+    // Add measurement instruction
+    state_->circuit.gates.push_back(
+        qsim::gate::Measurement<qsim::GateQSim<float>>::Create(
+            gate_index_++, {static_cast<unsigned int>(q.value)}));
+
+    //// EXECUTE CIRCUIT ////
+
     using Fuser = qsim::MultiQubitGateFuser<qsim::IO, qsim::GateQSim<float>>;
     using Runner = qsim::QSimRunner<qsim::IO, Fuser, Factory>;
     using StateSpace = Factory::StateSpace;
@@ -149,72 +163,47 @@ QState QsimQuantum::read_result(Result)
     qsimParam.max_fused_size = 2;  // Set the maximum size of fused gates
     qsimParam.verbosity = 0;  // see verbosity in run_qsim.h
 
-    // Run the simulation
+    // Run the simulation and check that it passed
     bool const run_success = Runner::Run(qsimParam,
                                          Factory(num_threads_),
                                          state_->circuit,
                                          *state_->state,
                                          meas_results);
+    QIREE_ASSERT(run_success);
+    QIREE_VALIDATE(
+        meas_results.size() == 1 && meas_results[0].bitstring.size() == 1,
+        << "inconsistent measured results size (" << meas_results.size()
+        << "), bitstring size");
+
+    //// RESET CIRCUIT ////
 
-    QIREE_ASSERT(run_success);  // Ensure the run was successful
-    // reset circuit here
     state_->circuit = {};
     state_->circuit.num_qubits = num_qubits_;
 
-#if 0
-    if (meas_results.size() == 1 && meas_results[0].bitstring.size() == 1)
-    {
-        auto const bitResult = meas_results[0].bitstring[0];
-        QIREE_ASSERT(bitResult == 0 || bitResult == 1);
-        std::string stringResult = std::to_string(bitResult);
-        std::string q_index_string = std::to_string(r.value);
-        if (stringResult == "1")
-        {
-            manager.updateBuffer("q" + q_index_string, "1", 1);
-            manager.updateBuffer("q" + q_index_string, 1);
-        }
-        else
-        {
-            manager.updateBuffer("q" + q_index_string, "0", 1);
-            manager.updateBuffer("q" + q_index_string, 0);
-        }
-    }
-    else
-    {
-        qsim::IO::errorf("Unexpected measurement results encountered.");
-    }
-#endif
-    return static_cast<QState>(meas_results[0].bitstring[0]);
+    //// STORE RESULT ////
+
+    auto result = meas_results[0].bitstring[0];
+    QIREE_ASSERT(result == 0 || result == 1);
+
+    results_[r.value] = result;
 }
 
-//---------------------------------------------------------------------------//
+//----------------------------------------------------------------------------//
 /*!
- * Map a qubit to a result index.
+ * Read the value of a result.
  *
- * (TODO: find how to link the classical register to the quantum register in
- * qsim)
+ * \todo We could add assertions to check that we actually measured into the
+ * given result.
  */
-void QsimQuantum::mz(Qubit q, Result r)
-{  // we don't classical register yet.
-    QIREE_EXPECT(q.value < this->num_qubits());  // TODO: q must be in the set
-                                                 // of qubits, e.g., what
-                                                 // happens if q=5 and qubits
-                                                 // are {2,3,4,5}, q is less
-                                                 // than num_qubits but not it
-                                                 // is in the set of qubits.
-    // TODO: maybe not what we want long term
-    QIREE_EXPECT(q.value == r.value);
-    // Add measurement instruction
-    state_->circuit.gates.push_back(
-        qsim::gate::Measurement<qsim::GateQSim<float>>::Create(
-            gate_index_++, {static_cast<unsigned int>(q.value)}));
+QState QsimQuantum::read_result(Result r)
+{
+    return this->get_result(r);
 }
 
 //---------------------------------------------------------------------------//
 /*
  * Quantum Instruction Mapping
  */
-
 // 1. Entangling gates
 void QsimQuantum::cx(Qubit q1, Qubit q2)
 {
diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
index 83533da..8ca79c8 100644
--- a/src/qirqsim/QsimQuantum.hh
+++ b/src/qirqsim/QsimQuantum.hh
@@ -11,6 +11,7 @@
 #include <ostream>
 #include <vector>
 
+#include "qiree/Assert.hh"
 #include "qiree/Macros.hh"
 #include "qiree/QuantumNotImpl.hh"
 #include "qiree/RuntimeInterface.hh"
@@ -33,8 +34,15 @@ class QsimQuantum final : virtual public QuantumNotImpl
 
     //!@{
     //! \name Accessors
-    size_type num_results() const { return result_to_qubit_.size(); }
+
+    //! Number of qubits in the circuit
     size_type num_qubits() const { return num_qubits_; }
+
+    //! Number of classical result registers
+    size_type num_results() const { return results_.size(); }
+
+    // Get the result from a classical register
+    inline QState get_result(Result r) const;
     //!@}
 
     //!@{
@@ -88,6 +96,8 @@ class QsimQuantum final : virtual public QuantumNotImpl
     void z(Qubit) final;
     //!@}
 
+    //
+
   private:
 
     //// TYPES ////
@@ -100,6 +110,7 @@ class QsimQuantum final : virtual public QuantumNotImpl
     std::ostream& output_;
     unsigned long int seed_{};
     std::unique_ptr<State> state_;
+    std::vector<bool> results_;
 
     unsigned num_threads_{};  // Number of threads to use
     size_t gate_index_;  // when the quantum operation will be executed
@@ -107,4 +118,15 @@ class QsimQuantum final : virtual public QuantumNotImpl
     std::vector<Qubit> result_to_qubit_;
 };
 
+//---------------------------------------------------------------------------//
+/*!
+ * Get the result from a classical register.
+ */
+QState QsimQuantum::get_result(Result r) const
+{
+    QIREE_EXPECT(r.value < results_.size());
+    auto result_bool = static_cast<bool>(results_[r.value]);
+    return static_cast<QState>(result_bool);
+}
+
 }  // namespace qiree
diff --git a/test/qirqsim/QsimQuantum.test.cc b/test/qirqsim/QsimQuantum.test.cc
index b9ef992..711c2a0 100644
--- a/test/qirqsim/QsimQuantum.test.cc
+++ b/test/qirqsim/QsimQuantum.test.cc
@@ -53,6 +53,9 @@ TEST_F(QsimQuantumTest, sim_dynamicbv)
         attrs.required_num_results = 2;
         return attrs;
     }());
+    ASSERT_EQ(2, qsim_sim.num_qubits());
+    ASSERT_EQ(2, qsim_sim.num_results());
+
     qsim_sim.h(Q{0});
     qsim_sim.x(Q{1});
     qsim_sim.h(Q{1});
@@ -65,6 +68,9 @@ TEST_F(QsimQuantumTest, sim_dynamicbv)
     qsim_rt.array_record_output(2,"");
     qsim_rt.result_record_output(R{0},"");
     qsim_rt.result_record_output(R{1},"");
+    EXPECT_EQ(QState::one, qsim_sim.get_result(R{0}));
+    EXPECT_EQ(QState::one, qsim_sim.get_result(R{1}));
+
     qsim_sim.h(Q{0});
     qsim_sim.x(Q{1});
     qsim_sim.h(Q{1});
@@ -75,6 +81,9 @@ TEST_F(QsimQuantumTest, sim_dynamicbv)
     qsim_rt.array_record_output(2,"");
     qsim_rt.result_record_output(R{0},"");
     qsim_rt.result_record_output(R{1},"");
+    EXPECT_EQ(QState::zero, qsim_sim.get_result(R{0}));
+    EXPECT_EQ(QState::zero, qsim_sim.get_result(R{1}));
+
     qsim_sim.h(Q{0});
     qsim_sim.x(Q{1});
     qsim_sim.h(Q{1});
@@ -87,15 +96,10 @@ TEST_F(QsimQuantumTest, sim_dynamicbv)
     qsim_rt.array_record_output(2,"");
     qsim_rt.result_record_output(R{0},"");
     qsim_rt.result_record_output(R{1},"");
-    qsim_sim.tear_down();
+    EXPECT_EQ(QState::one, qsim_sim.get_result(R{0}));
+    EXPECT_EQ(QState::zero, qsim_sim.get_result(R{1}));
 
-    ASSERT_EQ(2, qsim_sim.num_qubits());
-#if 0
-    EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q0", "0").value());
-    EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q0", "1").value());
-    EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q1", "0").value());
-    EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q1", "1").value());
-#endif
+    qsim_sim.tear_down();
 }
 
 //---------------------------------------------------------------------------//

From 43c5b79b593157a53c76a03ce66d1e1f041997ef Mon Sep 17 00:00:00 2001
From: Seth R Johnson <johnsonsr@ornl.gov>
Date: Tue, 14 Jan 2025 14:46:23 -0500
Subject: [PATCH 31/64] Add helper function

---
 src/qirqsim/QsimQuantum.cc | 43 +++++++++++++++++---------------------
 src/qirqsim/QsimQuantum.hh |  5 +++++
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc
index 3e4011f..9a15ef0 100644
--- a/src/qirqsim/QsimQuantum.cc
+++ b/src/qirqsim/QsimQuantum.cc
@@ -207,66 +207,54 @@ QState QsimQuantum::read_result(Result r)
 // 1. Entangling gates
 void QsimQuantum::cx(Qubit q1, Qubit q2)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateCNot<float>::Create(gate_index_++, q1.value, q2.value));
+    this->add_gate<qsim::GateCNot>(q1.value, q2.value);
 }
 void QsimQuantum::cnot(Qubit q1, Qubit q2)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateCNot<float>::Create(gate_index_++, q1.value, q2.value));
+    this->add_gate<qsim::GateCNot>(q1.value, q2.value);
 }
 void QsimQuantum::cz(Qubit q1, Qubit q2)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateCZ<float>::Create(gate_index_++, q1.value, q2.value));
+    this->add_gate<qsim::GateCZ>(q1.value, q2.value);
 }
 // 2. Local gates
 void QsimQuantum::h(Qubit q)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateHd<float>::Create(gate_index_++, q.value));
+    this->add_gate<qsim::GateHd>(q.value);
 }
 void QsimQuantum::s(Qubit q)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateS<float>::Create(gate_index_++, q.value));
+    this->add_gate<qsim::GateS>(q.value);
 }
 void QsimQuantum::t(Qubit q)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateT<float>::Create(gate_index_++, q.value));
+    this->add_gate<qsim::GateT>(q.value);
 }
 // 2.1 Pauli gates
 void QsimQuantum::x(Qubit q)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateX<float>::Create(gate_index_++, q.value));
+    this->add_gate<qsim::GateX>(q.value);
 }
 void QsimQuantum::y(Qubit q)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateY<float>::Create(gate_index_++, q.value));
+    this->add_gate<qsim::GateY>(q.value);
 }
 void QsimQuantum::z(Qubit q)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateZ<float>::Create(gate_index_++, q.value));
+    this->add_gate<qsim::GateZ>(q.value);
 }
 // 2.2 rotation gates
 void QsimQuantum::rx(double theta, Qubit q)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateRX<float>::Create(gate_index_++, q.value, theta));
+    this->add_gate<qsim::GateRX>(q.value, theta);
 }
 void QsimQuantum::ry(double theta, Qubit q)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateRY<float>::Create(gate_index_++, q.value, theta));
+    this->add_gate<qsim::GateRY>(q.value, theta);
 }
 void QsimQuantum::rz(double theta, Qubit q)
 {
-    state_->circuit.gates.push_back(
-        qsim::GateRZ<float>::Create(gate_index_++, q.value, theta));
+    this->add_gate<qsim::GateRZ>(q.value, theta);
 }
 
 Qubit QsimQuantum::result_to_qubit(Result r)
@@ -290,4 +278,11 @@ void QsimQuantum::execute_if_needed()
     QIREE_EXPECT(false);
 }
 
+template<template<class> class Gate, class... Ts>
+void QsimQuantum::add_gate(Ts&&... args)
+{
+    state_->circuit.gates.push_back(
+        Gate<float>::Create(gate_index_++, std::forward<Ts>(args)...));
+}
+
 }  // namespace qiree
diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
index 8ca79c8..0ad6942 100644
--- a/src/qirqsim/QsimQuantum.hh
+++ b/src/qirqsim/QsimQuantum.hh
@@ -116,6 +116,11 @@ class QsimQuantum final : virtual public QuantumNotImpl
     size_t gate_index_;  // when the quantum operation will be executed
     size_type num_qubits_{};
     std::vector<Qubit> result_to_qubit_;
+
+    //// HELPER FUNCTIONS ////
+
+    template<template<class> class Gate, class... Ts>
+    void add_gate(Ts&&... args);
 };
 
 //---------------------------------------------------------------------------//

From 5e32f3e3942993640612f3fa26b2e6bc115db4c6 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Tue, 14 Jan 2025 22:16:17 +0000
Subject: [PATCH 32/64] add qirlightning runtime

---
 src/qirlightning/CMakeLists.txt               |   28 +
 src/qirlightning/LightningDefaultRuntime.cc   |   73 ++
 src/qirlightning/LightningDefaultRuntime.hh   |   62 +
 src/qirlightning/LightningQuantum.cc          |  195 ++++
 src/qirlightning/LightningQuantum.hh          |  114 ++
 src/qirlightning/catalyst_runtime/.clang-tidy |  232 ++++
 src/qirlightning/catalyst_runtime/.gitignore  |    3 +
 .../catalyst_runtime/CMakeLists.txt           |  133 +++
 src/qirlightning/catalyst_runtime/Makefile    |  121 ++
 src/qirlightning/catalyst_runtime/README.rst  |  118 ++
 .../catalyst_runtime/include/DataView.hpp     |  148 +++
 .../include/DynamicLibraryLoader.hpp          |   79 ++
 .../catalyst_runtime/include/Exception.hpp    |   87 ++
 .../include/QuantumDevice.hpp                 |  364 ++++++
 .../catalyst_runtime/include/RuntimeCAPI.h    |  112 ++
 .../catalyst_runtime/include/Types.h          |  165 +++
 .../catalyst_runtime/lib/CMakeLists.txt       |    3 +
 .../lib/backend/CMakeLists.txt                |    7 +
 .../lib/backend/common/CacheManager.hpp       |  199 ++++
 .../lib/backend/common/QubitManager.hpp       |  146 +++
 .../lib/backend/common/Utils.hpp              |  304 +++++
 .../catalyst_runtime/lib/capi/CMakeLists.txt  |   57 +
 .../lib/capi/ExecutionContext.hpp             |  367 ++++++
 .../catalyst_runtime/lib/capi/MemRefUtils.hpp |   48 +
 .../catalyst_runtime/lib/capi/RuntimeCAPI.cpp | 1012 +++++++++++++++++
 .../lib/registry/CMakeLists.txt               |   33 +
 .../lib/registry/Registry.cpp                 |  179 +++
 27 files changed, 4389 insertions(+)
 create mode 100644 src/qirlightning/CMakeLists.txt
 create mode 100644 src/qirlightning/LightningDefaultRuntime.cc
 create mode 100644 src/qirlightning/LightningDefaultRuntime.hh
 create mode 100644 src/qirlightning/LightningQuantum.cc
 create mode 100644 src/qirlightning/LightningQuantum.hh
 create mode 100644 src/qirlightning/catalyst_runtime/.clang-tidy
 create mode 100644 src/qirlightning/catalyst_runtime/.gitignore
 create mode 100644 src/qirlightning/catalyst_runtime/CMakeLists.txt
 create mode 100644 src/qirlightning/catalyst_runtime/Makefile
 create mode 100644 src/qirlightning/catalyst_runtime/README.rst
 create mode 100644 src/qirlightning/catalyst_runtime/include/DataView.hpp
 create mode 100644 src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp
 create mode 100644 src/qirlightning/catalyst_runtime/include/Exception.hpp
 create mode 100644 src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp
 create mode 100644 src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h
 create mode 100644 src/qirlightning/catalyst_runtime/include/Types.h
 create mode 100644 src/qirlightning/catalyst_runtime/lib/CMakeLists.txt
 create mode 100644 src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt
 create mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp
 create mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp
 create mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp
 create mode 100644 src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt
 create mode 100644 src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp
 create mode 100644 src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp
 create mode 100644 src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp
 create mode 100644 src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt
 create mode 100644 src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp

diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt
new file mode 100644
index 0000000..0d81dec
--- /dev/null
+++ b/src/qirlightning/CMakeLists.txt
@@ -0,0 +1,28 @@
+#---------------------------------*-CMake-*----------------------------------#
+# Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+# See the top-level COPYRIGHT file for details.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#----------------------------------------------------------------------------#
+
+# Add the Lightning backend as a library, built from this directory's sources
+qiree_add_library(qirlightning
+  LightningQuantum.cc
+  LightningDefaultRuntime.cc
+)
+
+# Link to qiree; the Lightning device is opened from a shared library at runtime
+target_link_libraries(qirlightning
+  PUBLIC QIREE::qiree  # Link to qiree
+  PRIVATE ${CMAKE_DL_LIBS}  # presumably needed for dlopen -- TODO confirm
+)
+
+#----------------------------------------------------------------------------#
+# HEADERS
+#----------------------------------------------------------------------------#
+
+# Install headers, matching the .hh/.h files under this directory
+install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/qirlightning"
+  COMPONENT development
+  FILES_MATCHING REGEX ".*\\.hh?$"
+)
diff --git a/src/qirlightning/LightningDefaultRuntime.cc b/src/qirlightning/LightningDefaultRuntime.cc
new file mode 100644
index 0000000..2440ee0
--- /dev/null
+++ b/src/qirlightning/LightningDefaultRuntime.cc
@@ -0,0 +1,73 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirlightning/LightningDefaultRuntime.cc
+//---------------------------------------------------------------------------//
+#include "LightningDefaultRuntime.hh"
+
+#include <iostream>
+
+#include "qiree/Assert.hh"
+
+namespace qiree
+{
+//---------------------------------------------------------------------------//
+/*!
+ * Initialize the execution environment, resetting qubits.
+ */
+
+void LightningDefaultRuntime::initialize(OptionalCString env)
+{
+    if (env)
+    {
+        output_ << "Argument to initialize: " << env << std::endl;
+    }
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Execute circuit and mark the following N results as being part of an array
+ * named tag
+ */
+
+void LightningDefaultRuntime::array_record_output(size_type s, OptionalCString tag)
+{
+    // this->execute_if_needed();
+    // output_ << "array " << (tag ? tag : "<null>") << " length " << s
+    //         << std::endl;
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Execute circuit and mark the following N results as being part of a tuple
+ * named tag
+ */
+
+void LightningDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag)
+{
+    // this->execute_if_needed();
+    // output_ << "tuple " << (tag ? tag : "<null>") << " length " << s
+    //         << std::endl;
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Execute circuit and report a single measurement result
+ */
+void LightningDefaultRuntime::result_record_output(Result r, OptionalCString tag)
+{
+    // Access values through the getter
+    // This prints results every time result_record_output is called
+    // Can comment out if only want to see final results
+
+    if (auto value = sim_.manager.getBufferValue("q" + std::to_string(r.value));
+        value.has_value())
+    {
+        std::cout << "q" << std::to_string(r.value) << " : " << value.value()
+                  << "\n";
+    }
+}
+
+}  // namespace qiree
diff --git a/src/qirlightning/LightningDefaultRuntime.hh b/src/qirlightning/LightningDefaultRuntime.hh
new file mode 100644
index 0000000..cac9c1e
--- /dev/null
+++ b/src/qirlightning/LightningDefaultRuntime.hh
@@ -0,0 +1,62 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirlightning/LightningDefaultRuntime.hh
+//---------------------------------------------------------------------------//
+#pragma once
+
+#include "LightningQuantum.hh"
+
+namespace qiree
+{
+
+/*!
+ * Print per-qubit measurement statistics.
+ *
+ * Example for three qubits:
+ * \code
+ * Measurement output:
+ * -------------------
+ * Number of shots: 1024
+ * Number of qubits: 3
+ * q0 {0: 542, 1: 482}
+ * q1 {0: 521, 1: 503}
+ * q2 {0: 0, 1: 1024}
+ *
+ * \endcode
+ */
+
+class LightningDefaultRuntime final : virtual public RuntimeInterface
+{
+  public:
+    /*!
+     * Construct \c LightningDefaultRuntime.
+     */
+    LightningDefaultRuntime(std::ostream& output, LightningQuantum& sim)
+        : output_(output), sim_(sim)
+    {
+    }
+
+    //!@{
+    //! \name Runtime interface
+    // Initialize the execution environment, resetting qubits
+    void initialize(OptionalCString env) override;
+
+    //! Mark the following N results as being part of an array named tag
+    void array_record_output(size_type, OptionalCString tag) final;
+
+    //! Mark the following N results as being part of a tuple named tag
+    void tuple_record_output(size_type, OptionalCString) final;
+
+    // Save one result
+    void result_record_output(Result result, OptionalCString tag) final;
+    //!@}
+
+  private:
+    std::ostream& output_;
+    LightningQuantum& sim_;
+};
+
+}  // namespace qiree
diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
new file mode 100644
index 0000000..3f39825
--- /dev/null
+++ b/src/qirlightning/LightningQuantum.cc
@@ -0,0 +1,195 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirlightning/LightningQuantum.cc
+//---------------------------------------------------------------------------//
+
+#include "LightningQuantum.hh"
+
+#include <algorithm>
+#include <iostream>
+#include <optional>
+#include <stdexcept>
+#include <thread>
+#include <utility>
+
+#include "qiree/Assert.hh"
+
+// Lightning
+#include "catalyst_runtime/lib/capi/ExecutionContext.hpp"
+
+namespace qiree
+{
+using namespace Catalyst::Runtime;
+
+static inline std::shared_ptr<RTDevice> loadRTDevice(const std::string &rtd_lib,
+                                                   const std::string &rtd_name = {},
+                                                   const std::string &rtd_kwargs = {})
+{
+    ExecutionContext context;
+    return context.getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs);
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Bind the stream and seed and load the device (TODO: unhardcode the path).
+ */
+LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed)
+    : output_(os), seed_(seed)  // output_ is a reference: must be bound here
+{
+    auto RTDevice = loadRTDevice("/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning/liblightning_gpu_catalyst.so", "LightningGPUSimulator", "");
+}
+
+//---------------------------------------------------------------------------//
+//! Default destructor
+LightningQuantum::~LightningQuantum() = default;
+
+//---------------------------------------------------------------------------//
+/*!
+ * Prepare to build a quantum circuit for an entry point
+ */
+void LightningQuantum::set_up(EntryPointAttrs const& attrs)
+{
+    QIREE_VALIDATE(attrs.required_num_qubits > 0,
+                   << "input is not a quantum program");
+    
+    num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
+
+    RTDevice->getQuantumDevicePtr()->AllocateQubits(num_qubits_);
+
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Complete an execution
+ */
+void LightningQuantum::tear_down()
+{
+    context->deactivateDevice(RTDevice);
+    RTDevice = nullptr;
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Reset the qubit
+ */
+void LightningQuantum::reset(Qubit q)
+{
+    q.value = 0;
+}
+
+//----------------------------------------------------------------------------//
+/*!
+ * Read the value of a result (TODO: not yet wired to the Lightning device).
+ */
+QState LightningQuantum::read_result(Result r)
+{
+
+    return static_cast<QState>(meas_results[0].bitstring[0]);
+}
+
+//---------------------------------------------------------------------------//
+/*!
+ * Map a qubit to a result index.
+ *
+ * (TODO: find how to link the classical register to the quantum register in
+ * qsim)
+ */
+void LightningQuantum::mz(Qubit q, Result r)
+{  // we don't have a classical register yet.
+    /* QIREE_EXPECT(q.value < this->num_qubits()); */  // TODO: q must be in the set
+                                                 // of qubits, e.g., what
+                                                 // happens if q=5 and qubits
+                                                 // are {2,3,4,5}, q is less
+                                                 // than num_qubits but not it
+                                                 // is in the set of qubits.
+    // TODO: maybe not what we want long term
+    QIREE_EXPECT(q.value == r.value);
+    // Add measurement instruction
+    Measure(q.value, std::nullopt);
+    // RETURN MEASURE RESULT??
+
+}
+
+//---------------------------------------------------------------------------//
+/*
+ * Quantum Instruction Mapping
+ */
+
+// 1. Entangling gates
+void LightningQuantum::cx(Qubit q1, Qubit q2)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("CNOT", {}, {q1.value, q2.value});
+}
+void LightningQuantum::cnot(Qubit q1, Qubit q2)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("CNOT", {}, {q1.value, q2.value});
+}
+void LightningQuantum::cz(Qubit q1, Qubit q2)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("CZ", {}, {q1.value, q2.value});
+}
+// 2. Local gates
+void LightningQuantum::h(Qubit q)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {q.value});
+}
+void LightningQuantum::s(Qubit q)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("S", {}, {q.value});
+}
+void LightningQuantum::t(Qubit q)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("T", {}, {q.value});
+}
+// 2.1 Pauli gates
+void LightningQuantum::x(Qubit q)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("PauliX", {}, {q.value});
+}
+void LightningQuantum::y(Qubit q)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("PauliY", {}, {q.value});
+}
+void LightningQuantum::z(Qubit q)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("PauliZ", {}, {q.value});
+}
+// 2.2 rotation gates
+void LightningQuantum::rx(double theta, Qubit q)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("RX", {theta}, {q.value});
+}
+void LightningQuantum::ry(double theta, Qubit q)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("RY", {theta}, {q.value});
+}
+void LightningQuantum::rz(double theta, Qubit q)
+{
+    RTDevice->getQuantumDevicePtr()->NamedOperation("RZ", {theta}, {q.value});
+}
+
+Qubit LightningQuantum::result_to_qubit(Result r)
+{
+    // TODO: This function is not working. Giving 0 every time. Maybe not
+    // needed.
+    /* QIREE_EXPECT(r.value < this->num_results()); */
+    return result_to_qubit_[r.value];  // just copied this from the qirxacc, I
+                                       // have no idea if we need to do
+                                       // something else here
+}
+
+void LightningQuantum::print_accelbuf()
+{
+    // TODO: to be implemented, we can create a buffer class to store the
+    // results
+}
+
+void LightningQuantum::execute_if_needed()
+{
+    /* QIREE_EXPECT(false); */
+}
+
+}  // namespace qiree
diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh
new file mode 100644
index 0000000..e9b8bb2
--- /dev/null
+++ b/src/qirlightning/LightningQuantum.hh
@@ -0,0 +1,114 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirlightning/LightningQuantum.hh
+//---------------------------------------------------------------------------//
+#pragma once
+
+#include <memory>
+#include <ostream>
+#include <vector>
+
+#include "qiree/Macros.hh"
+#include "qiree/QuantumNotImpl.hh"
+#include "qiree/RuntimeInterface.hh"
+#include "qiree/Types.hh"
+#include "qiree/OutputDistribution.hh"
+
+namespace qiree
+{
+//---------------------------------------------------------------------------//
+/*!
+ * Create and execute quantum circuits using Pennylane Lightning.
+ */
+class LightningQuantum final : virtual public QuantumNotImpl
+{
+  public:
+    // Construct with number of shots
+    LightningQuantum(std::ostream& os, unsigned long int shots);
+    ~LightningQuantum();
+
+    QIREE_DELETE_COPY_MOVE(LightningQuantum);  // Delete copy and move constructors
+
+    //!@{
+    //! \name Accessors
+    size_type num_results() const { return result_to_qubit_.size(); }
+    size_type num_qubits() const { return num_qubits_; }
+    //!@}
+
+    //!@{
+    //! \name Quantum interface
+    // Prepare to build a quantum circuit for an entry point
+    void set_up(EntryPointAttrs const&) override;
+
+    // Complete an execution
+    void tear_down() override;
+
+    // Map a qubit to a result index
+    void mz(Qubit, Result) final;
+
+    // Read the value of a result.
+    QState read_result(Result) final;
+    //!@}
+
+    //!@{
+    //! \name Utilities for runtime
+    // Get runtime qubit corresponding to a runtime result
+    Qubit result_to_qubit(Result);
+
+    // Run the circuit on the accelerator if we have not already. Declared
+    // void, so no executed/not-executed status is returned.
+    void execute_if_needed();
+
+    void print_accelbuf();
+    //!@}
+
+    //!@{
+    //! \name Circuit construction
+    // void ccx(Qubit, Qubit) final;
+    void ccnot(Qubit, Qubit, Qubit);  // TODO: not in examples or qir runner
+    void cnot(Qubit, Qubit) final;
+    void cx(Qubit, Qubit) final;
+    // void cy(Qubit, Qubit) final;
+    void cz(Qubit, Qubit) final;
+    void h(Qubit) final;
+    void reset(Qubit) final;
+    void rx(double, Qubit) final;
+    void ry(double, Qubit) final;
+    void rz(double, Qubit) final;
+    // void rzz(double, Qubit, Qubit) final;
+    void s(Qubit) final;
+    // void s_adj(Qubit) final;
+    // void swap(Qubit, Qubit) final;
+    void t(Qubit) final;
+    // void t_adj(Qubit) final;
+    void x(Qubit) final;
+    void y(Qubit) final;
+    void z(Qubit) final;
+    //!@}
+
+    // Update the buffer
+    Buffer manager;
+
+  private:
+
+    //// TYPES ////
+
+    struct Factory;
+    struct State;
+
+    //// DATA ////
+
+    std::ostream& output_;
+    unsigned long int seed_{};
+    std::unique_ptr<State> state_;
+
+    unsigned num_threads_{};  // Number of threads to use
+    size_t gate_index_;  // when the quantum operation will be executed
+    size_type num_qubits_{};
+    std::vector<Qubit> result_to_qubit_;
+};
+
+}  // namespace qiree
diff --git a/src/qirlightning/catalyst_runtime/.clang-tidy b/src/qirlightning/catalyst_runtime/.clang-tidy
new file mode 100644
index 0000000..e7ca11f
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/.clang-tidy
@@ -0,0 +1,232 @@
+---
+Checks:          '-*,clang-diagnostic-*,clang-analyzer-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-modernize-avoid-c-arrays,-readability-magic-numbers,hicpp-*,-hicpp-no-array-decay,-hicpp-avoid-c-arrays,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions,-readability-identifier-length'
+WarningsAsErrors: '*'
+HeaderFilterRegex: '.*'
+# AnalyzeTemporaryDtors intentionally omitted: deprecated in clang-tidy 16, later removed, and rejected as an unknown key.
+FormatStyle:     none
+InheritParentConfig: true
+User:            mlxd
+CheckOptions:
+  - key:             modernize-replace-auto-ptr.IncludeStyle
+    value:           llvm
+  - key:             performance-move-const-arg.CheckTriviallyCopyableMove
+    value:           'true'
+  - key:             modernize-use-auto.MinTypeNameLength
+    value:           '5'
+  - key:             readability-static-accessed-through-instance.NameSpecifierNestingThreshold
+    value:           '3'
+  - key:             readability-function-size.VariableThreshold
+    value:           '4294967295'
+  - key:             cert-dcl16-c.NewSuffixes
+    value:           'L;LL;LU;LLU'
+  - key:             readability-identifier-naming.GetConfigPerFile
+    value:           'true'
+  - key:             readability-inconsistent-declaration-parameter-name.Strict
+    value:           'false'
+  - key:             readability-magic-numbers.IgnoredIntegerValues
+    value:           '1;2;3;4;'
+  - key:             modernize-use-default-member-init.UseAssignment
+    value:           'false'
+  - key:             readability-function-size.NestingThreshold
+    value:           '4294967295'
+  - key:             modernize-use-override.AllowOverrideAndFinal
+    value:           'false'
+  - key:             readability-function-size.ParameterThreshold
+    value:           '4294967295'
+  - key:             openmp-exception-escape.IgnoredExceptions
+    value:           ''
+  - key:             modernize-pass-by-value.ValuesOnly
+    value:           'false'
+  - key:             modernize-loop-convert.IncludeStyle
+    value:           llvm
+  - key:             cert-str34-c.DiagnoseSignedUnsignedCharComparisons
+    value:           '0'
+  - key:             readability-identifier-naming.AggressiveDependentMemberLookup
+    value:           'false'
+  - key:             readability-redundant-smartptr-get.IgnoreMacros
+    value:           'true'
+  - key:             modernize-use-emplace.TupleTypes
+    value:           '::std::pair;::std::tuple'
+  - key:             modernize-use-emplace.TupleMakeFunctions
+    value:           '::std::make_pair;::std::make_tuple'
+  - key:             modernize-use-nodiscard.ReplacementString
+    value:           '[[nodiscard]]'
+  - key:             modernize-loop-convert.MakeReverseRangeHeader
+    value:           ''
+  - key:             modernize-replace-random-shuffle.IncludeStyle
+    value:           llvm
+  - key:             modernize-use-bool-literals.IgnoreMacros
+    value:           'true'
+  - key:             google-readability-namespace-comments.ShortNamespaceLines
+    value:           '10'
+  - key:             modernize-avoid-bind.PermissiveParameterList
+    value:           'false'
+  - key:             modernize-use-override.FinalSpelling
+    value:           final
+  - key:             performance-move-constructor-init.IncludeStyle
+    value:           llvm
+  - key:             modernize-loop-convert.UseCxx20ReverseRanges
+    value:           'true'
+  - key:             modernize-use-noexcept.ReplacementString
+    value:           ''
+  - key:             modernize-use-using.IgnoreMacros
+    value:           'true'
+  - key:             performance-type-promotion-in-math-fn.IncludeStyle
+    value:           llvm
+  - key:             modernize-loop-convert.NamingStyle
+    value:           CamelCase
+  - key:             modernize-loop-convert.MakeReverseRangeFunction
+    value:           ''
+  - key:             readability-inconsistent-declaration-parameter-name.IgnoreMacros
+    value:           'true'
+  - key:             performance-no-automatic-move.AllowedTypes
+    value:           ''
+  - key:             performance-for-range-copy.WarnOnAllAutoCopies
+    value:           'false'
+  - key:             readability-identifier-naming.IgnoreFailedSplit
+    value:           'false'
+  - key:             modernize-pass-by-value.IncludeStyle
+    value:           llvm
+  - key:             readability-qualified-auto.AddConstToQualified
+    value:           'true'
+  - key:             readability-simplify-boolean-expr.ChainedConditionalReturn
+    value:           'false'
+  - key:             readability-else-after-return.WarnOnConditionVariables
+    value:           'true'
+  - key:             readability-uppercase-literal-suffix.IgnoreMacros
+    value:           'true'
+  - key:             modernize-use-nullptr.NullMacros
+    value:           'NULL'
+  - key:             modernize-make-shared.IgnoreMacros
+    value:           'true'
+  - key:             performance-unnecessary-copy-initialization.AllowedTypes
+    value:           ''
+  - key:             modernize-use-transparent-functors.SafeMode
+    value:           'false'
+  - key:             modernize-make-shared.IgnoreDefaultInitialization
+    value:           'true'
+  - key:             modernize-make-shared.IncludeStyle
+    value:           llvm
+  - key:             readability-simplify-boolean-expr.ChainedConditionalAssignment
+    value:           'false'
+  - key:             cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField
+    value:           '0'
+  - key:             readability-function-size.LineThreshold
+    value:           '4294967295'
+  - key:             performance-inefficient-vector-operation.EnableProto
+    value:           'false'
+  - key:             modernize-use-override.IgnoreDestructors
+    value:           'false'
+  - key:             modernize-loop-convert.MaxCopySize
+    value:           '16'
+  - key:             modernize-make-shared.MakeSmartPtrFunction
+    value:           'std::make_shared'
+  - key:             portability-simd-intrinsics.Suggest
+    value:           'false'
+  - key:             cppcoreguidelines-explicit-virtual-functions.IgnoreDestructors
+    value:           '1'
+  - key:             modernize-make-unique.IgnoreMacros
+    value:           'true'
+  - key:             modernize-make-shared.MakeSmartPtrFunctionHeader
+    value:           '<memory>'
+  - key:             performance-for-range-copy.AllowedTypes
+    value:           ''
+  - key:             readability-redundant-string-init.StringNames
+    value:           '::std::basic_string_view;::std::basic_string'
+  - key:             modernize-make-unique.IgnoreDefaultInitialization
+    value:           'true'
+  - key:             modernize-use-emplace.ContainersWithPushBack
+    value:           '::std::vector;::std::list;::std::deque'
+  - key:             readability-magic-numbers.IgnoreBitFieldsWidths
+    value:           'true'
+  - key:             modernize-make-unique.IncludeStyle
+    value:           llvm
+  - key:             readability-braces-around-statements.ShortStatementLines
+    value:           '0'
+  - key:             modernize-use-override.OverrideSpelling
+    value:           override
+  - key:             readability-magic-numbers.IgnoredFloatingPointValues
+    value:           '1.0;100.0;'
+  - key:             performance-inefficient-string-concatenation.StrictMode
+    value:           'false'
+  - key:             readability-implicit-bool-conversion.AllowPointerConditions
+    value:           'false'
+  - key:             readability-redundant-declaration.IgnoreMacros
+    value:           'true'
+  - key:             google-readability-braces-around-statements.ShortStatementLines
+    value:           '1'
+  - key:             modernize-make-unique.MakeSmartPtrFunction
+    value:           'std::make_unique'
+  - key:             portability-restrict-system-includes.Includes
+    value:           '*'
+  - key:             readability-else-after-return.WarnOnUnfixable
+    value:           'true'
+  - key:             modernize-use-emplace.IgnoreImplicitConstructors
+    value:           'false'
+  - key:             modernize-make-unique.MakeSmartPtrFunctionHeader
+    value:           '<memory>'
+  - key:             modernize-use-equals-delete.IgnoreMacros
+    value:           'true'
+  - key:             readability-magic-numbers.IgnoreAllFloatingPointValues
+    value:           'false'
+  - key:             readability-uppercase-literal-suffix.NewSuffixes
+    value:           ''
+  - key:             modernize-loop-convert.MinConfidence
+    value:           reasonable
+  - key:             performance-unnecessary-value-param.AllowedTypes
+    value:           ''
+  - key:             modernize-use-noexcept.UseNoexceptFalse
+    value:           'true'
+  - key:             google-readability-namespace-comments.SpacesBeforeComments
+    value:           '2'
+  - key:             readability-function-cognitive-complexity.Threshold
+    value:           '100'
+  - key:             readability-function-cognitive-complexity.IgnoreMacros
+    value:           'true'
+  - key:             cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic
+    value:           '1'
+  - key:             performance-faster-string-find.StringLikeClasses
+    value:           '::std::basic_string;::std::basic_string_view'
+  - key:             readability-function-size.BranchThreshold
+    value:           '4294967295'
+  - key:             readability-implicit-bool-conversion.AllowIntegerConditions
+    value:           'false'
+  - key:             readability-function-size.StatementThreshold
+    value:           '800'
+  - key:             modernize-use-default-member-init.IgnoreMacros
+    value:           'true'
+  - key:             llvm-qualified-auto.AddConstToQualified
+    value:           '0'
+  - key:             readability-identifier-naming.IgnoreMainLikeFunctions
+    value:           'false'
+  - key:             google-readability-function-size.StatementThreshold
+    value:           '800'
+  - key:             llvm-else-after-return.WarnOnConditionVariables
+    value:           '0'
+  - key:             modernize-raw-string-literal.DelimiterStem
+    value:           lit
+  - key:             modernize-use-equals-default.IgnoreMacros
+    value:           'true'
+  - key:             modernize-raw-string-literal.ReplaceShorterLiterals
+    value:           'false'
+  - key:             modernize-use-emplace.SmartPointers
+    value:           '::std::shared_ptr;::std::unique_ptr;::std::auto_ptr;::std::weak_ptr'
+  - key:             performance-inefficient-vector-operation.VectorLikeClasses
+    value:           '::std::vector'
+  - key:             modernize-use-auto.RemoveStars
+    value:           'false'
+  - key:             readability-magic-numbers.IgnorePowersOf2IntegerValues
+    value:           'true'
+  - key:             portability-simd-intrinsics.Std
+    value:           ''
+  - key:             readability-redundant-member-init.IgnoreBaseInCopyConstructors
+    value:           'false'
+  - key:             performance-unnecessary-value-param.IncludeStyle
+    value:           llvm
+  - key:             modernize-replace-disallow-copy-and-assign-macro.MacroName
+    value:           DISALLOW_COPY_AND_ASSIGN
+  - key:             llvm-else-after-return.WarnOnUnfixable
+    value:           '0'
+  - key:             readability-simplify-subscript-expr.Types
+    value:           '::std::basic_string;::std::basic_string_view;::std::vector;::std::array'
+...
diff --git a/src/qirlightning/catalyst_runtime/.gitignore b/src/qirlightning/catalyst_runtime/.gitignore
new file mode 100644
index 0000000..4258b32
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/.gitignore
@@ -0,0 +1,3 @@
+build
+build_cov
+bin/__pycache__/
diff --git a/src/qirlightning/catalyst_runtime/CMakeLists.txt b/src/qirlightning/catalyst_runtime/CMakeLists.txt
new file mode 100644
index 0000000..1651851
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/CMakeLists.txt
@@ -0,0 +1,133 @@
+cmake_minimum_required(VERSION 3.26)
+
+project(catalyst_runtime)
+include(FetchContent)
+include(ExternalProject)
+
+set(CMAKE_CXX_STANDARD  20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Compiler options
+option(ENABLE_CODE_COVERAGE "Enable code coverage" OFF)
+option(ENABLE_ADDRESS_SANITIZER "Enable address sanitizer" OFF)
+option(RUNTIME_CLANG_TIDY "Enable Clang Tidy" OFF)
+
+option(ENABLE_OPENQASM "Build OpenQasm backend device" OFF)
+
+set(CMAKE_VERBOSE_MAKEFILE ON)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+set(runtime_includes "${PROJECT_SOURCE_DIR}/include")
+set(capi_utils_includes "${PROJECT_SOURCE_DIR}/lib/capi")
+set(backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/common")
+
+
+# Read the pinned LLVM commit hash ("llvm=<hex>") from the .dep-versions file one level above this directory.
+file(READ "${CMAKE_CURRENT_SOURCE_DIR}/../.dep-versions" DEPENDENCY_VERSIONS)
+string(REGEX MATCH "llvm=([0-9a-f]+)" _ "${DEPENDENCY_VERSIONS}")
+set(LLVM_HASH "${CMAKE_MATCH_1}")
+message(STATUS "Detected LLVM version - ${LLVM_HASH}")
+
+FetchContent_Declare(
+    MLIRRunnerUtils
+    URL                 https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/RunnerUtils.h
+    DOWNLOAD_NO_EXTRACT True
+    SOURCE_DIR        mlir/ExecutionEngine
+)
+
+FetchContent_Declare(
+    MLIRCRunnerUtils
+    URL                 https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h
+    DOWNLOAD_NO_EXTRACT True
+    SOURCE_DIR          mlir/ExecutionEngine
+)
+
+FetchContent_Declare(
+    MLIRFloat16Bits
+    URL                 https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/Float16bits.h
+    DOWNLOAD_NO_EXTRACT True
+    SOURCE_DIR          mlir/ExecutionEngine
+)
+
+# Note on pybind11 vs python discovery order:
+# If Python is looked for first, then we have to look for all the components needed by pybind11.
+# In particular, if pybind11::embed is used, then we need to find both headers (Development.Module)
+# and the shared library (Development.Embed) before pybind11 is discovered.
+# With the other order PyBind will discover everything it needs.
+# Note on flags:
+# - PYTHON_EXECUTABLE is a pybind11 specific flag used by its own (legacy) Python discovery process,
+#   it will not affect find_package(Python) calls.
+# - Python_EXECUTABLE is a cmake flag used in find_package(Python) to guide the discovery.
+# Note that pybind11 can be made to use find_python (instead of its legacy discovery), and thus
+# respect Python_EXECUTABLE), via the PYBIND11_FINDPYTHON flag.
+
+# Here, we look for the desired Python version early to avoid any problems with mismatched packages.
+# The desired Python environment should be specified ahead of time via -DPython_EXECUTABLE=...
+# The optional component is only used for the C++ test suite (to spin up its own interpreter),
+# and requires libpython.so to be available on the system.
+find_package(Python REQUIRED
+    COMPONENTS Interpreter Development.Module
+    OPTIONAL_COMPONENTS Development.Embed Development.SABIModule
+)
+
+if(RUNTIME_ENABLE_WARNINGS)
+    message(STATUS "Building with compiler warnings as errors enabled.")
+    add_compile_options(-Werror -Wall)
+endif()
+
+message(STATUS "ENABLE_OPENQASM is ${ENABLE_OPENQASM}.")
+
+set(devices_list)
+list(APPEND devices_list rtd_null_qubit)
+list(APPEND backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/null_qubit")
+
+if(ENABLE_OPENQASM)
+    list(APPEND backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/openqasm")
+    list(APPEND devices_list rtd_openqasm)
+endif()
+
+add_library(catalyst_qir_runtime INTERFACE)
+
+target_link_libraries(catalyst_qir_runtime INTERFACE ${devices_list} rt_capi)
+
+target_include_directories(catalyst_qir_runtime INTERFACE
+    ${runtime_includes}
+    ${backend_includes}
+)
+
+if(ENABLE_CODE_COVERAGE)
+    message(STATUS "ENABLE_CODE_COVERAGE is ON.")
+    if(APPLE)
+        target_compile_options(catalyst_qir_runtime INTERFACE -fprofile-instr-generate -fcoverage-mapping)
+        target_link_options(catalyst_qir_runtime INTERFACE -fprofile-instr-generate -fcoverage-mapping)
+    else()
+        target_compile_options(catalyst_qir_runtime INTERFACE -fprofile-arcs -ftest-coverage)
+        target_link_libraries(catalyst_qir_runtime INTERFACE gcov)
+    endif()
+endif()
+
+
+if(ENABLE_ADDRESS_SANITIZER)
+    message(STATUS "ENABLE_ADDRESS_SANITIZER is ON.")
+    add_compile_options(-fsanitize=address)
+    add_link_options(-fsanitize=address)
+endif()
+
+add_subdirectory(lib)
+add_subdirectory(tests)
+
+if(APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
+# Build the LAPACKE shim only once: skip the external project when the dylib is already in the build tree.
+if(NOT EXISTS "${CMAKE_BINARY_DIR}/lib/liblapacke.3.dylib")
+    ExternalProject_Add(lapacke-accelerate
+        GIT_REPOSITORY https://github.com/lepus2589/accelerate-lapacke.git
+        GIT_TAG master
+        PREFIX _lapacke-accelerate
+        CMAKE_ARGS "--preset accelerate-lapacke32"
+                   "-DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/_lapacke-accelerate/install"
+        INSTALL_COMMAND ${CMAKE_COMMAND} --build . --target install
+        COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/_lapacke-accelerate/install/lib/liblapacke.3.dylib" "${CMAKE_BINARY_DIR}/lib"
+    )
+    add_dependencies(rt_capi lapacke-accelerate)  # automatically build with the runtime
+endif()
+endif()
diff --git a/src/qirlightning/catalyst_runtime/Makefile b/src/qirlightning/catalyst_runtime/Makefile
new file mode 100644
index 0000000..55733a4
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/Makefile
@@ -0,0 +1,121 @@
+PYTHON?=$(shell which python3)
+PYTHON_PREFIX:=$(shell $(PYTHON) -c "import sys; print(sys.prefix)")
+PYTHON_VERSION:=$(shell $(PYTHON) -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
+C_COMPILER?=$(shell which clang)
+CXX_COMPILER?=$(shell which clang++)
+COMPILER_LAUNCHER?=$(shell which ccache)
+NPROC?=$(shell $(PYTHON) -c "import os; print(os.cpu_count())")
+
+MK_ABSPATH := $(abspath $(lastword $(MAKEFILE_LIST)))
+MK_DIR := $(dir $(MK_ABSPATH))
+RT_BUILD_DIR?=$(MK_DIR)/build
+CODE_COVERAGE?=OFF
+BUILD_TYPE?=RelWithDebInfo
+ENABLE_OPENQASM?=ON
+ENABLE_ASAN?=OFF
+
+BUILD_TARGETS := rt_capi rtd_null_qubit
+TEST_TARGETS := runner_tests_qir_runtime
+
+PLATFORM := $(shell uname -s)
+
+ifeq ($(ENABLE_OPENQASM), ON)
+	BUILD_TARGETS += rtd_openqasm
+	TEST_TARGETS += runner_tests_openqasm
+endif
+
+.PHONY: help
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  all                to build Catalyst Runtime"
+	@echo "  coverage           to generate a coverage report using lcov"
+	@echo "  clean              to delete all temporary, cache, and build files"
+	@echo "  test               to run the Catalyst runtime test suite"
+	@echo "  format [check=1]   to apply C++ formatter; use with 'check=1' to check instead of modify (requires clang-format)"
+	@echo "  format [version=?] to apply C++ formatter; use with 'version={version}' to run clang-format-{version} instead of clang-format"
+	@echo "  check-tidy         to build Catalyst Runtime with RUNTIME_CLANG_TIDY=ON (requires clang-tidy)"
+
+.PHONY: configure
+configure:
+	@echo "Configure Catalyst Runtime"
+
+	cmake -G Ninja -B $(RT_BUILD_DIR) . \
+		-DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \
+		-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=$(RT_BUILD_DIR)/lib \
+		-DCMAKE_C_COMPILER=$(C_COMPILER) \
+		-DCMAKE_CXX_COMPILER=$(CXX_COMPILER) \
+		-DCMAKE_C_COMPILER_LAUNCHER=$(COMPILER_LAUNCHER) \
+		-DCMAKE_CXX_COMPILER_LAUNCHER=$(COMPILER_LAUNCHER) \
+		-DENABLE_OPENQASM=$(ENABLE_OPENQASM) \
+		-DENABLE_CODE_COVERAGE=$(CODE_COVERAGE) \
+        -DPython_EXECUTABLE=$(PYTHON) \
+		-DENABLE_ADDRESS_SANITIZER=$(ENABLE_ASAN)
+
+.PHONY: runtime
+runtime: configure
+	cmake --build $(RT_BUILD_DIR) --target $(BUILD_TARGETS) -j$(NPROC) --verbose
+
+.PHONY: test_runner
+test_runner: configure
+	cmake --build $(RT_BUILD_DIR) --target $(TEST_TARGETS) -j$(NPROC) --verbose
+
+.PHONY: test
+test: CODE_COVERAGE=OFF
+test: BUILD_TYPE?=RelWithDebInfo
+test: test_runner
+	@echo "Catalyst runtime test suite - NullQubit"
+	$(ASAN_COMMAND) $(RT_BUILD_DIR)/tests/runner_tests_qir_runtime
+ifeq ($(ENABLE_OPENQASM), ON)
+	# Test the OpenQasm devices C++ tests
+	$(ASAN_COMMAND) $(RT_BUILD_DIR)/tests/runner_tests_openqasm
+endif
+
+.PHONY: coverage
+coverage: RT_BUILD_DIR := $(RT_BUILD_DIR)_cov
+coverage: CODE_COVERAGE=ON
+coverage: BUILD_TYPE=Debug
+coverage: C_COMPILER=$(shell which gcc)
+coverage: CXX_COMPILER=$(shell which g++)
+coverage: export LLVM_PROFILE_FILE := $(RT_BUILD_DIR)/tests/%m.profraw
+coverage: test_runner
+	@echo "check C++ code coverage"
+	$(RT_BUILD_DIR)/tests/runner_tests_qir_runtime
+ifeq ($(ENABLE_OPENQASM), ON)
+	$(RT_BUILD_DIR)/tests/runner_tests_openqasm
+endif
+ifeq ($(PLATFORM),Linux)
+	lcov --directory $(RT_BUILD_DIR) -b $(MK_DIR)/lib --capture --output-file $(RT_BUILD_DIR)/coverage.info
+	lcov --remove $(RT_BUILD_DIR)/coverage.info '/usr/*' '*/_deps/*' '*/envs/*' '*/mlir/*' --output-file $(RT_BUILD_DIR)/coverage.info
+	genhtml $(RT_BUILD_DIR)/coverage.info --output-directory $(RT_BUILD_DIR)/cov -t "Catalyst Runtime C++ Coverage" --num-spaces 4
+else
+	xcrun llvm-profdata merge $(RT_BUILD_DIR)/tests/*.profraw -o $(RT_BUILD_DIR)/tests/rt_test_coverage.profdata
+	xcrun llvm-cov show -instr-profile $(RT_BUILD_DIR)/tests/rt_test_coverage.profdata \
+		-object $(RT_BUILD_DIR)/tests/runner_tests_openqasm \
+		$(RT_BUILD_DIR)/tests/runner_tests_qir_runtime \
+		-format=html -output-dir=$(RT_BUILD_DIR)/coverage_html \
+		$(MK_DIR)/include $(MK_DIR)/lib $(MK_DIR)/tests
+endif
+
+.PHONY: clean
+clean:
+	@echo "clean build files"
+	rm -rf $(RT_BUILD_DIR) $(RT_BUILD_DIR)_cov cov coverage.info $(MK_DIR)/BuildTidy
+
+.PHONY: format
+format:
+ifdef check
+	$(PYTHON) ../bin/format.py --check $(if $(version:-=),--cfversion $(version)) .
+else
+	$(PYTHON) ../bin/format.py $(if $(version:-=),--cfversion $(version)) .
+endif
+
+.PHONY: check-tidy
+check-tidy:
+	@echo "build Catalyst Runtime with RUNTIME_CLANG_TIDY=ON"
+	cmake -G Ninja -B $(MK_DIR)/BuildTidy . \
+		-DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \
+		-DCMAKE_C_COMPILER=$(C_COMPILER) \
+		-DCMAKE_CXX_COMPILER=$(CXX_COMPILER) \
+		-DRUNTIME_CLANG_TIDY=ON
+
+	cmake --build $(MK_DIR)/BuildTidy --target rt_capi -j$(NPROC)
diff --git a/src/qirlightning/catalyst_runtime/README.rst b/src/qirlightning/catalyst_runtime/README.rst
new file mode 100644
index 0000000..8a881e5
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/README.rst
@@ -0,0 +1,118 @@
+.. runtime-start-inclusion-marker-do-not-remove
+
+Catalyst Quantum Runtime
+########################
+
+The Catalyst Runtime is a C++ QIR runtime that enables the execution of Catalyst-compiled
+quantum programs, and is currently backed by `PennyLane-Lightning <https://docs.pennylane.ai/projects/lightning/en/stable>`_
+state-vector simulators, and `Amazon Braket <https://amazon-braket-pennylane-plugin-python.readthedocs.io>`__
+devices. Additional hardware support, including QPUs, to come.
+
+The runtime employs the `QuantumDevice <https://docs.pennylane.ai/projects/catalyst/en/stable/api/structCatalyst_1_1Runtime_1_1QuantumDevice.html#exhale-struct-structcatalyst-1-1runtime-1-1quantumdevice>`_
+public interface to support an extensible list of backend devices. This interface comprises two collections of abstract methods:
+
+- The Qubit management, device shot noise, and quantum tape recording methods are utilized for the implementation of Quantum Runtime (QR) instructions.
+
+- The quantum operations, observables, measurements, and gradient methods are used to implement Quantum Instruction Set (QIS) instructions.
+
+A complete list of instructions supported by the runtime can be found in
+`RuntimeCAPI.h <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include/RuntimeCAPI.h>`_.
+
+Contents
+========
+
+The directory is structured as follows:
+
+- `include <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include>`_:
+    This contains the public header files of the runtime including the ``QuantumDevice`` API
+    for backend quantum devices and the runtime CAPI.
+
+- `lib <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib>`_:
+    The core modules of the runtime are structured into ``lib/capi`` and ``lib/backend``.
+    `lib/capi <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib/capi>`_  implements the semantics for
+    QIR instructions lowered to our custom runtime. `lib/backend <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib/backend>`_
+    contains implementations of the ``QuantumDevice`` API for backend simulators.
+
+- `tests <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/tests>`_:
+    A collection of C++ tests for modules and methods in the runtime.
+
+Backend Devices
+===============
+
+New device backends for the runtime can be realized by implementing the quantum device interface.
+The following table shows the available devices along with supported features:
+
+.. list-table::
+   :widths: 25 25 25 25
+   :header-rows: 0
+
+   * - **Features**
+     - **PennyLane-Lightning-Qubit**
+     - **PennyLane-Lightning-Kokkos** and **PennyLane-Lightning-GPU**
+     - **Amazon-Braket-OpenQasm**
+   * - Qubit Management
+     - Dynamic allocation/deallocation
+     - Static allocation/deallocation
+     - Static allocation/deallocation
+   * - Gate Operations
+     - `Lightning operations <https://github.com/PennyLaneAI/pennylane-lightning/blob/master/pennylane_lightning/core/src/gates/GateOperation.hpp>`_
+     - `Lightning operations <https://github.com/PennyLaneAI/pennylane-lightning/blob/master/pennylane_lightning/core/src/gates/GateOperation.hpp>`_ without controlled gates support
+     - `Braket operations <https://github.com/PennyLaneAI/catalyst/blob/e812afbadbd777209862d5c76f394e3f0c43ffb6/runtime/lib/backend/openqasm/OpenQasmBuilder.hpp#L49>`_
+   * - Quantum Observables
+     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables
+     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables
+     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, and Tensor Product of Observables
+   * - Expectation Value
+     - All observables; Finite-shots supported
+     - All observables; Finite-shots supported
+     - All observables; Finite-shots supported
+   * - Variance
+     - All observables; Finite-shots supported
+     - All observables; Finite-shots supported
+     - All observables; Finite-shots supported
+   * - Probability
+     - Only for the computational basis on the supplied qubits; Finite-shots supported
+     - Only for the computational basis on the supplied qubits; Finite-shots supported
+     - The computational basis on all active qubits; Finite-shots supported
+   * - Sampling
+     - Only for the computational basis on the supplied qubits
+     - Only for the computational basis on the supplied qubits
+     - The computational basis on all active qubits; Finite-shots supported
+   * - Mid-Circuit Measurement
+     - Only for the computational basis on the supplied qubit
+     - Only for the computational basis on the supplied qubit
+     - Not supported
+   * - Gradient
+     - The Adjoint-Jacobian method for expectation values on all observables
+     - The Adjoint-Jacobian method for expectation values on all observables
+     - Not supported
+
+Requirements
+============
+
+To build the runtime from source, you need an up-to-date C/C++ compiler such as gcc or clang
+with support for the C++20 standard library.
+
+Installation
+============
+
+By default, the runtime builds all supported backend devices.
+You can build the runtime with custom devices from the list of Backend Devices.
+
+You can use ``ENABLE_OPENQASM=OFF`` to disable building the runtime with `Amazon-Braket-OpenQasm <https://aws.amazon.com/braket/>`_:
+
+.. code-block:: console
+
+    make runtime ENABLE_OPENQASM=OFF
+
+This device currently offers generators for the `OpenQasm3 <https://openqasm.com/versions/3.0/index.html>`_ specification and
+`Amazon Braket <https://docs.aws.amazon.com/braket/latest/developerguide/braket-openqasm-supported-features.html>`__ assembly extension.
+Moreover, the generated assembly can be executed on Amazon Braket devices leveraging `amazon-braket-sdk-python <https://github.com/aws/amazon-braket-sdk-python>`_.
+
+To check the runtime test suite from the root directory:
+
+.. code-block:: console
+
+    make test-runtime
+
+.. runtime-end-inclusion-marker-do-not-remove
diff --git a/src/qirlightning/catalyst_runtime/include/DataView.hpp b/src/qirlightning/catalyst_runtime/include/DataView.hpp
new file mode 100644
index 0000000..6cf50f2
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/include/DataView.hpp
@@ -0,0 +1,148 @@
+// Copyright 2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <Exception.hpp>
+
+/**
+ * A multi-dimensional, non-owning view for MemRef-like and std::vector<T> types.
+ *
+ * @tparam T The underlying data type
+ * @tparam R The Rank (R > 0)
+ *
+ * @note A forward iterator is implemented in this view for traversing all elements of
+ * MemRef types in row-major index order, i.e. the last dimension (R-1) varies fastest.
+ * For example, the DataView iterator for MemRef<T, 2> starts from index (0, 0) and traverses
+ * elements in the following order:
+ * (0, 0), ..., (0, sizes[1]-1), (1, 0), ..., (1, sizes[1]-1), ... (sizes[0]-1, sizes[1]-1).
+ */
+template <typename T, size_t R> class DataView {
+  private:
+    T *data_aligned;         // element storage; not owned by the view
+    size_t offset;           // physical index of the first element
+    size_t sizes[R] = {0};   // extent of each axis
+    size_t strides[R] = {0}; // distance, in elements, between neighbours along each axis
+
+  public:
+    class iterator {
+      private:
+        const DataView<T, R> &view;
+
+        int64_t loc;             // physical index; -1 marks the exhausted (end) state
+        size_t indices[R] = {0}; // logical index along each axis
+
+      public:
+        using iterator_category = std::forward_iterator_tag; // LCOV_EXCL_LINE
+        using value_type = T;                                // LCOV_EXCL_LINE
+        using difference_type = std::ptrdiff_t;              // LCOV_EXCL_LINE
+        using pointer = T *;                                 // LCOV_EXCL_LINE
+        using reference = T &;                               // LCOV_EXCL_LINE
+
+        iterator(const DataView<T, R> &_view, int64_t begin_idx) : view(_view), loc(begin_idx) {}
+        pointer operator->() const { return &view.data_aligned[loc]; }
+        reference operator*() const { return view.data_aligned[loc]; }
+        iterator &operator++()
+        {
+            // Odometer-style step: bump the innermost axis that still has room; every axis
+            // that overflows is reset to 0 and its contribution to `loc` is rewound.
+            int64_t next_axis = -1;
+            for (int64_t i = R; i > 0; --i) {
+                const int64_t idx = i - 1;
+                // `++index < size` avoids the unsigned wrap-around of `size - 1` on size 0.
+                if (++indices[idx] < view.sizes[idx]) {
+                    next_axis = idx;
+                    break;
+                }
+                indices[idx] = 0;
+                loc -= (view.sizes[idx] - 1) * view.strides[idx];
+            }
+
+            loc = next_axis == -1 ? -1 : loc + view.strides[next_axis];
+            return *this;
+        }
+        iterator operator++(int)
+        {
+            auto tmp = *this;
+            ++(*this); // reuse the pre-increment logic instead of duplicating it
+            return tmp;
+        }
+        bool operator==(const iterator &other) const
+        {
+            return (loc == other.loc && view.data_aligned == other.view.data_aligned);
+        }
+        bool operator!=(const iterator &other) const { return !(*this == other); }
+    };
+
+    /** @brief View a whole std::vector as a rank-1 DataView (offset 0, unit stride). */
+    explicit DataView(std::vector<T> &buffer) : data_aligned(buffer.data()), offset(0)
+    {
+        static_assert(R == 1, "[Class: DataView] Assertion: R == 1");
+        sizes[0] = buffer.size();
+        strides[0] = 1;
+    }
+
+    /** @brief View a raw MemRef-style buffer given its pointer, offset, sizes and strides.
+     *  @note If `_sizes` or `_strides` is nullptr, all sizes and strides stay 0. */
+    explicit DataView(T *_data_aligned, size_t _offset, const size_t *_sizes,
+                      const size_t *_strides)
+        : data_aligned(_data_aligned), offset(_offset)
+    {
+        static_assert(R > 0, "[Class: DataView] Assertion: R > 0");
+        if (_sizes != nullptr && _strides != nullptr) {
+            for (size_t i = 0; i < R; i++) {
+                sizes[i] = _sizes[i];
+                strides[i] = _strides[i];
+            }
+        } // else sizes = {0}, strides = {0}
+    }
+
+    /** @brief Number of elements (product of all sizes); 0 when the data pointer is null. */
+    [[nodiscard]] auto size() const -> size_t
+    {
+        if (!data_aligned) {
+            return 0;
+        }
+
+        size_t tsize = 1;
+        for (size_t i = 0; i < R; i++) {
+            tsize *= sizes[i];
+        }
+        return tsize;
+    }
+
+    /** @brief Element at the given R indices; RT_ASSERTs that each index is within bounds. */
+    template <typename... I> T &operator()(I... idxs) const
+    {
+        static_assert(sizeof...(idxs) == R,
+                      "[Class: DataView] Error in Catalyst Runtime: Wrong number of indices");
+        size_t indices[] = {static_cast<size_t>(idxs)...};
+
+        size_t loc = offset;
+        for (size_t axis = 0; axis < R; axis++) {
+            RT_ASSERT(indices[axis] < sizes[axis]);
+            loc += indices[axis] * strides[axis];
+        }
+        return data_aligned[loc];
+    }
+
+    /** @brief Iterator to the first element, or end() if the view holds no elements. */
+    iterator begin() const
+    {
+        return size() == 0 ? end() : iterator{*this, static_cast<int64_t>(offset)};
+    }
+
+    /** @brief Past-the-end sentinel (physical index -1). */
+    iterator end() const { return iterator{*this, -1}; }
+};
diff --git a/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp b/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp
new file mode 100644
index 0000000..1c25ab8
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp
@@ -0,0 +1,79 @@
+// Copyright 2024 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <dlfcn.h>
+#include <string_view>
+
+#include "Exception.hpp"
+
+/**
+ * @brief A utility struct to handle opening, closing and retrieving symbols
+ *        from dynamic shared objects.
+ */
+struct DynamicLibraryLoader {
+    void *handle;
+
+    DynamicLibraryLoader(std::string_view library_name, int mode = RTLD_LAZY | RTLD_NODELETE)
+    {
+        // Load the shared library (copied: a string_view need not be NUL-terminated)
+        handle = dlopen(std::string(library_name).c_str(), mode);
+        if (!handle) {
+            const char *err_msg = dlerror();
+            RT_FAIL(err_msg ? err_msg : "dlopen failed without an error message");
+        }
+    }
+
+    ~DynamicLibraryLoader()
+    {
+        if (handle) {
+            // TODO: This is nonsensical.
+            // We are using RTLD_NODELETE, why would calling dlclose have a side-effect?
+            // Worst of all, the side-effect is not in our code.
+            // When we have dlclose, everything works well the first time.
+            // However, when trying to compile a second time, we will find that jaxlib will now
+            // raise a StopIteration exception. This doesn't really make any sense.
+            // My guess is that somehow dlclosing here will unload the StopIteration symbol (?),
+            // rebind it with another equivalent (but with a different id?)
+            // and then the MLIR python bindings are unable to catch it and stop the iteration and
+            // it gets propagated upwards.
+            //
+            // Is not calling dlclose bad?
+            // A little bit, although dlclose implies intent and does not create any requirements
+            // upon the implementation. See here:
+            // https://pubs.opengroup.org/onlinepubs/000095399/functions/dlclose.html
+            // https://github.com/pybind/pybind11/blob/75e48c5f959b4f0a49d8c664e059b6fb4b497102/include/pybind11/detail/internals.h#L108-L113
+            //
+#ifndef __APPLE__
+            dlclose(handle);
+#endif
+        }
+    }
+
+    // Get symbol from library; fails via RT_FAIL if dlerror() reports an error after dlsym
+    template <typename T> T getSymbol(std::string_view symbol_name)
+    {
+        // Clear any existing errors
+        dlerror();
+
+        // Retrieve symbol (name copied so that dlsym receives a NUL-terminated string)
+        T symbol = reinterpret_cast<T>(dlsym(handle, std::string(symbol_name).c_str()));
+        const char *err_msg = dlerror();
+        if (err_msg != nullptr) {
+            RT_FAIL(err_msg);
+        }
+        return symbol;
+    }
+};
diff --git a/src/qirlightning/catalyst_runtime/include/Exception.hpp b/src/qirlightning/catalyst_runtime/include/Exception.hpp
new file mode 100644
index 0000000..a76da14
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/include/Exception.hpp
@@ -0,0 +1,87 @@
+// Copyright 2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <exception>
+#include <iostream>
+
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+/**
+ * @brief Throws `RuntimeException` with the given message, tagged with file, line and function.
+ */
+#define RT_FAIL(message) Catalyst::Runtime::_abort((message), __FILE__, __LINE__, __func__)
+
+/**
+ * @brief Throws `RuntimeException` if expression evaluates to true. Expands to a bare
+ * `if` statement, so wrap it in braces when it is used as the body of an if/else.
+ */
+#define RT_FAIL_IF(expression, message)                                                            \
+    if ((expression)) {                                                                            \
+        RT_FAIL(message);                                                                          \
+    }
+
+/**
+ * @brief Throws `RuntimeException` carrying the stringified expression and the source
+ * location if expression evaluates to false.
+ */
+#define RT_ASSERT(expression) RT_FAIL_IF(!(expression), "Assertion: " #expression)
+
+namespace Catalyst::Runtime {
+
+/**
+ * @brief The exception type raised for Catalyst runtime errors; it derives from
+ * `std::exception` and carries an immutable, human-readable message.
+ */
+class RuntimeException : public std::exception {
+  public:
+    explicit RuntimeException(std::string msg) noexcept
+        : text_{std::move(msg)} {}          // LCOV_EXCL_LINE
+    ~RuntimeException() override = default; // LCOV_EXCL_LINE
+
+    // Copy- and move-constructible, but never assignable.
+    RuntimeException(RuntimeException &&) noexcept = default;
+    RuntimeException(const RuntimeException &) = default;
+    RuntimeException &operator=(RuntimeException &&) = delete;
+    RuntimeException &operator=(const RuntimeException &) = delete;
+
+    [[nodiscard]] const char *what() const noexcept override
+    {
+        return text_.c_str();
+    } // LCOV_EXCL_LINE
+
+  private:
+    const std::string text_;
+};
+
+/**
+ * @brief Builds the located error text and throws it as a `RuntimeException`.
+ *
+ * @note Meant to be reached through the RT_* macros rather than called directly.
+ */
+[[noreturn]] inline void _abort(const char *message, const char *file_name, size_t line,
+                                const char *function_name)
+{
+    std::ostringstream report;
+    report << "[" << file_name << "][Line:" << line << "][Function:" << function_name;
+    report << "] Error in Catalyst Runtime: " << message;
+
+    throw RuntimeException(report.str());
+} // LCOV_EXCL_LINE
+
+} // namespace Catalyst::Runtime
diff --git a/src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp b/src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp
new file mode 100644
index 0000000..ccdb606
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp
@@ -0,0 +1,364 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <complex>
+#include <memory>
+#include <optional>
+#include <random>
+#include <vector>
+
+#include "DataView.hpp"
+#include "Types.h"
+
+// A helper macro generating the extern "C" <IDENTIFIER>Factory function, which returns
+// `new CONSTRUCTOR(std::string(kwargs))`; kwargs must therefore be a non-null C string.
+// Guideline: https://docs.pennylane.ai/projects/catalyst/en/stable/dev/custom_devices.html
+#define GENERATE_DEVICE_FACTORY(IDENTIFIER, CONSTRUCTOR)                                           \
+    extern "C" Catalyst::Runtime::QuantumDevice *IDENTIFIER##Factory(const char *kwargs)           \
+    {                                                                                              \
+        return new CONSTRUCTOR(std::string(kwargs));                                               \
+    }
+
+namespace Catalyst::Runtime {
+
+/**
+ * @brief struct API for backend quantum devices.
+ *
+ * This device API contains,
+ * - a set of methods to manage qubit allocations and deallocations, device shot
+ *   noise, and quantum tape recording as well as reference values for the result
+ *   data-type; these are used to implement Quantum Runtime (QR) instructions.
+ *
+ * - a set of methods for quantum operations, observables, measurements, and gradient
+ *   of the device; these are used to implement Quantum Instruction Set (QIS) instructions.
+ *
+ */
+struct QuantumDevice {
+    QuantumDevice() = default;          // LCOV_EXCL_LINE
+    virtual ~QuantumDevice() = default; // LCOV_EXCL_LINE
+
+    QuantumDevice &operator=(const QuantumDevice &) = delete;
+    QuantumDevice(const QuantumDevice &) = delete;
+    QuantumDevice(QuantumDevice &&) = delete;
+    QuantumDevice &operator=(QuantumDevice &&) = delete;
+
+    /**
+     * @brief Allocate a qubit.
+     *
+     * @return `QubitIdType`
+     */
+    virtual auto AllocateQubit() -> QubitIdType = 0;
+
+    /**
+     * @brief Allocate a vector of qubits.
+     *
+     * @param num_qubits The number of qubits to allocate.
+     *
+     * @return `std::vector<QubitIdType>`
+     */
+    virtual auto AllocateQubits(size_t num_qubits) -> std::vector<QubitIdType> = 0;
+
+    /**
+     * @brief Release a qubit.
+     *
+     * @param qubit The id of the qubit
+     */
+    virtual void ReleaseQubit(QubitIdType qubit) = 0;
+
+    /**
+     * @brief Release all qubits.
+     */
+    virtual void ReleaseAllQubits() = 0;
+
+    /**
+     * @brief Get the number of allocated qubits.
+     *
+     * @return `size_t`
+     */
+    [[nodiscard]] virtual auto GetNumQubits() const -> size_t = 0;
+
+    /**
+     * @brief Set the number of device shots.
+     *
+     * @param shots The number of noise shots
+     */
+    virtual void SetDeviceShots(size_t shots) = 0;
+
+    /**
+     * @brief Get the number of device shots.
+     *
+     * @return `size_t`
+     */
+    [[nodiscard]] virtual auto GetDeviceShots() const -> size_t = 0;
+
+    /**
+     * @brief Set the PRNG of the device.
+     *
+     * The Catalyst runtime enables seeded program execution on non-hardware devices.
+     * A random number generator instance is managed by the runtime to predictably
+     * generate results for non-deterministic programs, such as those involving `Measure`
+     * calls.
+     * Devices implementing support for this feature do not need to use the provided
+     * PRNG instance as their sole source of random numbers, but it is expected that
+     * the same instance state will predictably and reproducibly generate the same
+     * program results. It is also expected that the provided PRNG state is evolved
+     * sufficiently so that two device executions sharing the same instance do not produce
+     * identical results.
+     * The provided PRNG instance is not thread-locked, and devices wishing to share it
+     * across threads will need to provide their own thread-safety.
+     *
+     * @param gen The std::mt19937 PRNG object.
+     */
+    virtual void SetDevicePRNG([[maybe_unused]] std::mt19937 *gen){};
+
+    /**
+     * @brief Start recording a quantum tape if provided.
+     *
+     * @note This is backed by the `Catalyst::Runtime::CacheManager<ComplexT>` property in
+     * the device implementation.
+     */
+    virtual void StartTapeRecording() = 0;
+
+    /**
+     * @brief Stop recording a quantum tape if provided.
+     *
+     * @note This is backed by the `Catalyst::Runtime::CacheManager<ComplexT>` property in
+     * the device implementation.
+     */
+    virtual void StopTapeRecording() = 0;
+
+    /**
+     * @brief Result value for "Zero" used in the measurement process.
+     *
+     * @return `Result`
+     */
+    [[nodiscard]] virtual auto Zero() const -> Result = 0;
+
+    /**
+     * @brief Result value for "One" used in the measurement process.
+     *
+     * @return `Result`
+     */
+    [[nodiscard]] virtual auto One() const -> Result = 0;
+
+    /**
+     * @brief A helper method to print the state vector of a device.
+     */
+    virtual void PrintState() = 0;
+
+    /**
+     * @brief Prepare subsystems using the given ket vector in the computational basis.
+     *
+     * @param state A state vector of size 2**len(wires)
+     * @param wires The wire(s) the operation acts on
+     */
+    virtual void SetState([[maybe_unused]] DataView<std::complex<double>, 1> &state,
+                          [[maybe_unused]] std::vector<QubitIdType> &wires)
+    {
+        RT_FAIL("Unsupported functionality");
+    }
+
+    /**
+     * @brief Prepares a single computational basis state.
+     *
+     * @param n Prepares the basis state |n>, where n is an array of integers from the set {0, 1}
+     * @param wires The wire(s) the operation acts on
+     */
+    virtual void SetBasisState([[maybe_unused]] DataView<int8_t, 1> &n,
+                               [[maybe_unused]] std::vector<QubitIdType> &wires)
+    {
+        RT_FAIL("Unsupported functionality");
+    }
+
+    /**
+     * @brief Apply a single gate to the state vector of a device with its name if this is
+     * supported.
+     *
+     * @param name The name of the gate to apply
+     * @param params Optional parameter list for parametric gates
+     * @param wires Wires to apply gate to
+     * @param inverse Indicates whether to use inverse of gate
+     * @param controlled_wires Optional controlled wires applied to the operation
+     * @param controlled_values Optional controlled values applied to the operation
+     */
+    virtual void
+    NamedOperation(const std::string &name, const std::vector<double> &params,
+                   const std::vector<QubitIdType> &wires, [[maybe_unused]] bool inverse = false,
+                   [[maybe_unused]] const std::vector<QubitIdType> &controlled_wires = {},
+                   [[maybe_unused]] const std::vector<bool> &controlled_values = {}) = 0;
+
+    /**
+     * @brief Apply a given matrix directly to the state vector of a device.
+     *
+     * @param matrix The matrix of data in row-major format
+     * @param wires Wires to apply gate to
+     * @param inverse Indicates whether to use inverse of gate
+     * @param controlled_wires Controlled wires applied to the operation
+     * @param controlled_values Controlled values applied to the operation
+     */
+    virtual void
+    MatrixOperation(const std::vector<std::complex<double>> &matrix,
+                    const std::vector<QubitIdType> &wires, [[maybe_unused]] bool inverse = false,
+                    [[maybe_unused]] const std::vector<QubitIdType> &controlled_wires = {},
+                    [[maybe_unused]] const std::vector<bool> &controlled_values = {}) = 0;
+
+    /**
+     * @brief Construct a named (Identity, PauliX, PauliY, PauliZ, and Hadamard)
+     * or Hermitian observable.
+     *
+     * @param id The type of the observable
+     * @param matrix The matrix of data to construct a hermitian observable
+     * @param wires Wires to apply observable to
+     *
+     * @return `ObsIdType` Index of the constructed observable
+     */
+    virtual auto Observable(ObsId id, const std::vector<std::complex<double>> &matrix,
+                            const std::vector<QubitIdType> &wires) -> ObsIdType = 0;
+
+    /**
+     * @brief Construct a tensor product of observables.
+     *
+     * @param obs The vector of observables indices of type ObsIdType
+     *
+     * @return `ObsIdType` Index of the constructed observable
+     */
+    virtual auto TensorObservable(const std::vector<ObsIdType> &obs) -> ObsIdType = 0;
+
+    /**
+     * @brief Construct a Hamiltonian observable.
+     *
+     * @param coeffs The vector of coefficients
+     * @param obs The vector of observables indices of size `coeffs`
+     *
+     * @return `ObsIdType` Index of the constructed observable
+     */
+    virtual auto HamiltonianObservable(const std::vector<double> &coeffs,
+                                       const std::vector<ObsIdType> &obs) -> ObsIdType = 0;
+
+    /**
+     * @brief Compute the expected value of an observable.
+     *
+     * @param obsKey The index of the constructed observable
+     *
+     * @return `double` The expected value
+     */
+    virtual auto Expval(ObsIdType obsKey) -> double = 0;
+
+    /**
+     * @brief Compute the variance of an observable.
+     *
+     * @param obsKey The index of the constructed observable
+     *
+     * @return `double` The variance
+     */
+    virtual auto Var(ObsIdType obsKey) -> double = 0;
+
+    /**
+     * @brief Get the state-vector of a device.
+     *
+     * @param state The pre-allocated `DataView<complex<double>, 1>`
+     */
+    virtual void State(DataView<std::complex<double>, 1> &state) = 0;
+
+    /**
+     * @brief Compute the probabilities of each computational basis state.
+     *
+     * @param probs The pre-allocated `DataView<double, 1>`
+     */
+    virtual void Probs(DataView<double, 1> &probs) = 0;
+
+    /**
+     * @brief Compute the probabilities for a subset of the full system.
+     *
+     * @param probs The pre-allocated `DataView<double, 1>`
+     * @param wires Wires will restrict probabilities to a subset of the full system
+     */
+    virtual void PartialProbs(DataView<double, 1> &probs,
+                              const std::vector<QubitIdType> &wires) = 0;
+
+    /**
+     * @brief Compute samples with the number of shots on the entire wires,
+     * returning raw samples.
+     *
+     * @param samples The pre-allocated `DataView<double, 2>` representing a matrix of
+     * shape `shots * numQubits`. The built-in iterator in `DataView<double, 2>`
+     * iterates over all elements of `samples` row-wise.
+     * @param shots The number of shots
+     */
+    virtual void Sample(DataView<double, 2> &samples, size_t shots) = 0;
+
+    /**
+     * @brief Compute partial samples with the number of shots on `wires`,
+     * returning raw samples.
+     *
+     * @param samples The pre-allocated `DataView<double, 2>` representing a matrix of
+     * shape `shots * numWires`. The built-in iterator in `DataView<double, 2>`
+     * iterates over all elements of `samples` row-wise.
+     * @param wires Wires to compute samples on
+     * @param shots The number of shots
+     */
+    virtual void PartialSample(DataView<double, 2> &samples, const std::vector<QubitIdType> &wires,
+                               size_t shots) = 0;
+
+    /**
+     * @brief Sample with the number of shots on the entire wires, returning the
+     * number of counts for each sample.
+     *
+     * @param eigvals The pre-allocated `DataView<double, 1>`
+     * @param counts The pre-allocated `DataView<int64_t, 1>`
+     * @param shots The number of shots
+     */
+    virtual void Counts(DataView<double, 1> &eigvals, DataView<int64_t, 1> &counts,
+                        size_t shots) = 0;
+
+    /**
+     * @brief Partial sample with the number of shots on `wires`, returning the
+     * number of counts for each sample.
+     *
+     * @param eigvals The pre-allocated `DataView<double, 1>`
+     * @param counts The pre-allocated `DataView<int64_t, 1>`
+     * @param wires Wires to compute samples on
+     * @param shots The number of shots
+     */
+    virtual void PartialCounts(DataView<double, 1> &eigvals, DataView<int64_t, 1> &counts,
+                               const std::vector<QubitIdType> &wires, size_t shots) = 0;
+
+    /**
+     * @brief A general measurement method that acts on a single wire.
+     *
+     * @param wire The wire to compute Measure on
+     * @param postselect Which basis state to postselect after a mid-circuit measurement (-1
+     * denotes no post-selection)
+     *
+     * @return `Result` The measurement result
+     */
+    virtual auto Measure(QubitIdType wire, std::optional<int32_t> postselect) -> Result = 0;
+
+    /**
+     * @brief Compute the gradient of a quantum tape, that is cached using
+     * `Catalyst::Runtime::Simulator::CacheManager`, for a specific set of trainable
+     * parameters.
+     *
+     * @param gradients The vector of pre-allocated `DataView<double, 1>*`
+     * to store gradients results for the list of cached observables.
+     * @param trainParams The vector of trainable parameters; if none, all parameters
+     * would be assumed trainable
+     *
+     */
+    virtual void Gradient(std::vector<DataView<double, 1>> &gradients,
+                          const std::vector<size_t> &trainParams) = 0;
+};
+} // namespace Catalyst::Runtime
diff --git a/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h b/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h
new file mode 100644
index 0000000..b0f63ca
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h
@@ -0,0 +1,112 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#ifndef RUNTIMECAPI_H
+#define RUNTIMECAPI_H
+
+#include "Types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Quantum Runtime Instructions
+void __catalyst__rt__fail_cstr(const char *);
+void __catalyst__rt__initialize(uint32_t *seed);
+void __catalyst__rt__device_init(int8_t *, int8_t *, int8_t *, int64_t shots);
+void __catalyst__rt__device_release();
+void __catalyst__rt__finalize();
+void __catalyst__rt__toggle_recorder(bool);
+void __catalyst__rt__print_state();
+void __catalyst__rt__print_tensor(OpaqueMemRefT *, bool);
+void __catalyst__rt__print_string(char *);
+void __catalyst__rt__assert_bool(bool, char *);
+int64_t __catalyst__rt__array_get_size_1d(QirArray *);
+int8_t *__catalyst__rt__array_get_element_ptr_1d(QirArray *, int64_t);
+
+QUBIT *__catalyst__rt__qubit_allocate();
+QirArray *__catalyst__rt__qubit_allocate_array(int64_t);
+void __catalyst__rt__qubit_release(QUBIT *);
+void __catalyst__rt__qubit_release_array(QirArray *);
+
+int64_t __catalyst__rt__num_qubits();
+
+bool __catalyst__rt__result_equal(RESULT *, RESULT *);
+RESULT *__catalyst__rt__result_get_one();
+RESULT *__catalyst__rt__result_get_zero();
+
+// Quantum Gate Set Instructions
+void __catalyst__qis__SetState(MemRefT_CplxT_double_1d *, uint64_t, ...);
+void __catalyst__qis__SetBasisState(MemRefT_int8_1d *, uint64_t, ...);
+void __catalyst__qis__Identity(QUBIT *, const Modifiers *);
+void __catalyst__qis__PauliX(QUBIT *, const Modifiers *);
+void __catalyst__qis__PauliY(QUBIT *, const Modifiers *);
+void __catalyst__qis__PauliZ(QUBIT *, const Modifiers *);
+void __catalyst__qis__Hadamard(QUBIT *, const Modifiers *);
+void __catalyst__qis__S(QUBIT *, const Modifiers *);
+void __catalyst__qis__T(QUBIT *, const Modifiers *);
+void __catalyst__qis__PhaseShift(double, QUBIT *, const Modifiers *);
+void __catalyst__qis__RX(double, QUBIT *, const Modifiers *);
+void __catalyst__qis__RY(double, QUBIT *, const Modifiers *);
+void __catalyst__qis__RZ(double, QUBIT *, const Modifiers *);
+void __catalyst__qis__Rot(double, double, double, QUBIT *, const Modifiers *);
+void __catalyst__qis__CNOT(QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__CY(QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__CZ(QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__SWAP(QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__IsingXX(double, QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__IsingYY(double, QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__IsingXY(double, QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__IsingZZ(double, QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__ControlledPhaseShift(double, QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__CRX(double, QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__CRY(double, QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__CRZ(double, QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__CRot(double, double, double, QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__CSWAP(QUBIT *, QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__Toffoli(QUBIT *, QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__MultiRZ(double, const Modifiers *, int64_t, /*qubits*/...);
+void __catalyst__qis__GlobalPhase(double, const Modifiers *);
+void __catalyst__qis__ISWAP(QUBIT *, QUBIT *, const Modifiers *);
+void __catalyst__qis__PSWAP(double, QUBIT *, QUBIT *, const Modifiers *);
+
+// Struct pointer arguments for these instructions represent real arguments,
+// as passing structs by value is too unreliable / compiler dependent.
+void __catalyst__qis__QubitUnitary(MemRefT_CplxT_double_2d *, const Modifiers *, int64_t,
+                                   /*qubits*/...);
+
+ObsIdType __catalyst__qis__NamedObs(int64_t, QUBIT *);
+ObsIdType __catalyst__qis__HermitianObs(MemRefT_CplxT_double_2d *, int64_t, /*qubits*/...);
+ObsIdType __catalyst__qis__TensorObs(int64_t, /*obsKeys*/...);
+ObsIdType __catalyst__qis__HamiltonianObs(MemRefT_double_1d *, int64_t, /*obsKeys*/...);
+
+// Struct pointer arguments here represent return values.
+RESULT *__catalyst__qis__Measure(QUBIT *, int32_t);
+double __catalyst__qis__Expval(ObsIdType);
+double __catalyst__qis__Variance(ObsIdType);
+void __catalyst__qis__Probs(MemRefT_double_1d *, int64_t, /*qubits*/...);
+void __catalyst__qis__Sample(MemRefT_double_2d *, int64_t, /*qubits*/...);
+void __catalyst__qis__Counts(PairT_MemRefT_double_int64_1d *, int64_t, /*qubits*/...);
+void __catalyst__qis__State(MemRefT_CplxT_double_1d *, int64_t, /*qubits*/...);
+void __catalyst__qis__Gradient(int64_t, /*results*/...);
+void __catalyst__qis__Gradient_params(MemRefT_int64_1d *, int64_t, /*results*/...);
+
+void __catalyst__host__rt__unrecoverable_error();
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/src/qirlightning/catalyst_runtime/include/Types.h b/src/qirlightning/catalyst_runtime/include/Types.h
new file mode 100644
index 0000000..a30a1c2
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/include/Types.h
@@ -0,0 +1,165 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#ifndef TYPES_H
+#define TYPES_H
+
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Qubit, Result and Observable types
+struct QUBIT;
+using QubitIdType = intptr_t;
+
+using RESULT = bool;
+using Result = RESULT *;
+using QirArray = void *;
+
+using ObsIdType = intptr_t;
+
+enum ObsId : int8_t {
+    Identity = 0,
+    PauliX,
+    PauliY,
+    PauliZ,
+    Hadamard,
+    Hermitian,
+};
+
+enum ObsType : int8_t {
+    Basic = 0,
+    TensorProd,
+    Hamiltonian,
+};
+
+// complex<float> type
+struct CplxT_float {
+    float real;
+    float imag;
+};
+
+// complex<double> type
+struct CplxT_double {
+    double real;
+    double imag;
+};
+
+enum NumericType : int8_t {
+    idx = 0,
+    i1,
+    i8,
+    i16,
+    i32,
+    i64,
+    f32,
+    f64,
+    c64,
+    c128,
+};
+
+// MemRefT<datatype, dimension=rank> type
+struct OpaqueMemRefT {
+    int64_t rank;
+    void *descriptor;
+    NumericType datatype;
+};
+
+// MemRefT<complex<double>, dimension=1> type
+struct MemRefT_CplxT_double_1d {
+    CplxT_double *data_allocated;
+    CplxT_double *data_aligned;
+    size_t offset;
+    size_t sizes[1];
+    size_t strides[1];
+};
+
+// MemRefT<complex<double>, dimension=2> type
+struct MemRefT_CplxT_double_2d {
+    CplxT_double *data_allocated;
+    CplxT_double *data_aligned;
+    size_t offset;
+    size_t sizes[2];
+    size_t strides[2];
+};
+
+// MemRefT<double, dimension=1> type
+struct MemRefT_double_1d {
+    double *data_allocated;
+    double *data_aligned;
+    size_t offset;
+    size_t sizes[1];
+    size_t strides[1];
+};
+
+// MemRefT<double, dimension=2> type
+struct MemRefT_double_2d {
+    double *data_allocated;
+    double *data_aligned;
+    size_t offset;
+    size_t sizes[2];
+    size_t strides[2];
+};
+
+// MemRefT<int64_t, dimension=1> type
+struct MemRefT_int64_1d {
+    int64_t *data_allocated;
+    int64_t *data_aligned;
+    size_t offset;
+    size_t sizes[1];
+    size_t strides[1];
+};
+
+// MemRefT<int8_t, dimension=1> type
+struct MemRefT_int8_1d {
+    int8_t *data_allocated;
+    int8_t *data_aligned;
+    size_t offset;
+    size_t sizes[1];
+    size_t strides[1];
+};
+
+// PairT<MemRefT<double, dimension=1>, MemRefT<int64_t, dimension=1>> type
+struct PairT_MemRefT_double_int64_1d {
+    struct MemRefT_double_1d first;
+    struct MemRefT_int64_1d second;
+};
+
+// Quantum operation modifiers
+struct Modifiers {
+    bool adjoint;
+    size_t num_controlled;
+    QUBIT *controlled_wires;
+    bool *controlled_values;
+};
+
+using CplxT_double = struct CplxT_double;
+using MemRefT_CplxT_double_1d = struct MemRefT_CplxT_double_1d;
+using MemRefT_CplxT_double_2d = struct MemRefT_CplxT_double_2d;
+using MemRefT_double_1d = struct MemRefT_double_1d;
+using MemRefT_double_2d = struct MemRefT_double_2d;
+using MemRefT_int64_1d = struct MemRefT_int64_1d;
+using PairT_MemRefT_double_int64_1d = struct PairT_MemRefT_double_int64_1d;
+using Modifiers = struct Modifiers;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt
new file mode 100644
index 0000000..50fd0b0
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(capi)
+add_subdirectory(backend)
+add_subdirectory(registry)
diff --git a/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt
new file mode 100644
index 0000000..45b7ad7
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_subdirectory(null_qubit) # built unconditionally
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/null_qubit/null_qubit.toml" "${CMAKE_CURRENT_BINARY_DIR}/null_qubit.toml")
+if(ENABLE_OPENQASM) # the OpenQASM backend and its Braket device files are optional
+    add_subdirectory(openqasm)
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/openqasm/braket_local_qubit.toml" "${CMAKE_CURRENT_BINARY_DIR}/braket_local_qubit.toml")
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/openqasm/braket_aws_qubit.toml" "${CMAKE_CURRENT_BINARY_DIR}/braket_aws_qubit.toml")
+endif()
diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp
new file mode 100644
index 0000000..0141f33
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp
@@ -0,0 +1,199 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <complex>
+#include <string>
+#include <vector>
+
+#include "Types.h"
+#include "Utils.hpp"
+
+namespace Catalyst::Runtime {
+/**
+ * @brief The CacheManager caches the operations and observables of a
+ * program at runtime.
+ *
+ * One direct use case of this cache is computing the gradient of a
+ * circuit by taking advantage of the gradient methods provided by
+ * simulators.
+ */
+template <typename ComplexT = std::complex<double>> class CacheManager {
+  protected:
+    // Operations Data (parallel vectors: index i describes the i-th cached operation)
+    std::vector<std::string> ops_names_{};
+    std::vector<std::vector<double>> ops_params_{};
+    std::vector<std::vector<size_t>> ops_wires_{};
+    std::vector<bool> ops_inverses_{};
+    std::vector<std::vector<ComplexT>> ops_matrixs_{};
+    std::vector<std::vector<size_t>> ops_controlled_wires_{};
+    std::vector<std::vector<bool>> ops_controlled_values_{};
+
+    // Observables Data (parallel vectors: index i describes the i-th cached observable)
+    std::vector<ObsIdType> obs_keys_{};
+    std::vector<MeasurementsT> obs_callees_{};
+
+    // Total number of parameters summed over all cached operations
+    size_t num_params_{0};
+
+  public:
+    CacheManager() = default;
+    ~CacheManager() = default;
+
+    CacheManager(const CacheManager &) = delete;
+    CacheManager &operator=(const CacheManager &) = delete;
+    CacheManager(CacheManager &&) = delete;
+    CacheManager &operator=(CacheManager &&) = delete;
+
+    /**
+     * @brief Clear all cached operations and observables; zero the parameter count.
+     */
+    void Reset()
+    {
+        ops_names_.clear();
+        ops_params_.clear();
+        ops_wires_.clear();
+        ops_inverses_.clear();
+        ops_matrixs_.clear();
+        ops_controlled_wires_.clear();
+        ops_controlled_values_.clear();
+
+        obs_keys_.clear();
+        obs_callees_.clear();
+
+        num_params_ = 0;
+    }
+
+    /**
+     * @brief Add a new operation to the list of cached operations.
+     *
+     * @param name Name of the given gate
+     * @param params Parameters of the gate; their count is added to the parameter total
+     * @param wires Wires the gate acts on
+     * @param inverse If true, the inverse of the gate is applied
+     * @param matrix Unitary matrix for the 'MatrixOp' operations
+     * @param controlled_wires Control wires
+     * @param controlled_values Control values
+     */
+    void addOperation(const std::string &name, const std::vector<double> &params,
+                      const std::vector<size_t> &wires, bool inverse,
+                      const std::vector<ComplexT> &matrix = {},
+                      const std::vector<size_t> &controlled_wires = {},
+                      const std::vector<bool> &controlled_values = {})
+    {
+        ops_names_.push_back(name);
+        ops_params_.push_back(params);
+        ops_wires_.push_back(wires);
+        ops_inverses_.push_back(inverse);
+        ops_matrixs_.push_back(matrix);
+        ops_controlled_wires_.push_back(controlled_wires);
+        ops_controlled_values_.push_back(controlled_values);
+
+        num_params_ += params.size();
+    }
+
+    /**
+     * @brief Add a new observable to the list of cached observables.
+     *
+     * @param id The observable key created by LObsManager()
+     * @param callee The measurement operation (defaults to MeasurementsT::None)
+     */
+    void addObservable(const ObsIdType id, const MeasurementsT &callee = MeasurementsT::None)
+    {
+        obs_keys_.push_back(id);
+        obs_callees_.push_back(callee);
+    }
+
+    /**
+     * @brief Get a const reference to the cached observable keys.
+     */
+    auto getObservablesKeys() -> const std::vector<ObsIdType> & { return obs_keys_; }
+
+    /**
+     * @brief Get a const reference to the cached observable callees.
+     */
+    auto getObservablesCallees() -> const std::vector<MeasurementsT> & { return obs_callees_; }
+
+    /**
+     * @brief Get a const reference to the cached operation names.
+     */
+    auto getOperationsNames() -> const std::vector<std::string> & { return ops_names_; }
+
+    /**
+     * @brief Get a const reference to the cached operation parameters.
+     */
+    auto getOperationsParameters() -> const std::vector<std::vector<double>> &
+    {
+        return ops_params_;
+    }
+
+    /**
+     * @brief Get a const reference to the cached operation wires.
+     */
+    auto getOperationsWires() -> const std::vector<std::vector<size_t>> & { return ops_wires_; }
+
+    /**
+     * @brief Get a const reference to the cached operation control wires.
+     */
+    auto getOperationsControlledWires() -> const std::vector<std::vector<size_t>> &
+    {
+        return this->ops_controlled_wires_;
+    }
+
+    /**
+     * @brief Get a const reference to the cached operation control values.
+     */
+    auto getOperationsControlledValues() -> const std::vector<std::vector<bool>> &
+    {
+        return this->ops_controlled_values_;
+    }
+
+    /**
+     * @brief Get a const reference to the cached operation inverse flags.
+     */
+    auto getOperationsInverses() -> const std::vector<bool> & { return ops_inverses_; }
+
+    /**
+     * @brief Get a const reference to the cached operation matrices.
+     */
+    auto getOperationsMatrices() -> const std::vector<std::vector<ComplexT>> &
+    {
+        return ops_matrixs_;
+    }
+
+    /**
+     * @brief Get total number of cached gates (operations plus observables).
+     */
+    [[nodiscard]] auto getNumGates() const -> size_t
+    {
+        return ops_names_.size() + obs_keys_.size();
+    }
+
+    /**
+     * @brief Get number of operations.
+     */
+    [[nodiscard]] auto getNumOperations() const -> size_t { return ops_names_.size(); }
+
+    /**
+     * @brief Get number of observables.
+     */
+    [[nodiscard]] auto getNumObservables() const -> size_t { return obs_keys_.size(); }
+
+    /**
+     * @brief Get total number of parameters of the cached operations.
+     */
+    [[nodiscard]] auto getNumParams() const -> size_t { return num_params_; }
+};
+} // namespace Catalyst::Runtime
diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp
new file mode 100644
index 0000000..05dc377
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp
@@ -0,0 +1,146 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <map>
+
+#include "Exception.hpp"
+#include "Types.h"
+#include "Utils.hpp"
+
+namespace Catalyst::Runtime {
+
+/**
+ * Qubit Manager
+ *
+ * @brief Maintains the mapping between simulator (runtime) qubit IDs and
+ * device qubit IDs (e.g., Lightning-Dynamic). When the user allocates a
+ * qubit, the `QubitManager` adds it as an active qubit that operations can
+ * act on. When the user releases a qubit, the `QubitManager` removes it and
+ * decrements the device ID of every qubit with a larger simulator ID.
+ */
+template <typename SimQubitIdType = QubitIdType, typename DevQubitIdType = size_t>
+class QubitManager {
+  private:
+    using LQMapT = std::map<SimQubitIdType, DevQubitIdType>;
+
+    SimQubitIdType next_idx{0};
+    LQMapT qubits_map{};
+
+    template <class OIter = typename LQMapT::iterator>
+    [[nodiscard]] inline OIter _remove_simulator_qubit_id(SimQubitIdType s_idx)
+    {
+        const auto s_idx_iter = this->qubits_map.find(s_idx);
+        RT_FAIL_IF(s_idx_iter == this->qubits_map.end(), "Invalid simulator qubit index");
+        // erase() hands back the iterator that follows the removed entry.
+        return this->qubits_map.erase(s_idx_iter);
+    }
+
+    template <class IIter = typename LQMapT::iterator>
+    inline void _update_qubits_mapfrom(IIter s_idx_iter)
+    {
+        for (; s_idx_iter != this->qubits_map.end(); s_idx_iter++) {
+            s_idx_iter->second--;
+        }
+    }
+
+  public:
+    QubitManager() = default;
+    ~QubitManager() = default;
+
+    QubitManager(const QubitManager &) = delete;
+    QubitManager &operator=(const QubitManager &) = delete;
+    QubitManager(QubitManager &&) = delete;
+    QubitManager &operator=(QubitManager &&) = delete;
+
+    [[nodiscard]] auto isValidQubitId(SimQubitIdType s_idx) const -> bool
+    {
+        return this->qubits_map.contains(s_idx);
+    }
+
+    [[nodiscard]] auto isValidQubitId(const std::vector<SimQubitIdType> &ss_idx) const -> bool
+    {
+        return std::all_of(ss_idx.begin(), ss_idx.end(),
+                           [this](SimQubitIdType s) { return isValidQubitId(s); });
+    }
+
+    [[nodiscard]] auto getAllQubitIds() const -> std::vector<SimQubitIdType>
+    {
+        std::vector<SimQubitIdType> ids;
+        ids.reserve(this->qubits_map.size());
+        for (const auto &it : this->qubits_map) {
+            ids.push_back(it.first);
+        }
+        return ids;
+    }
+
+    [[nodiscard]] auto getDeviceId(SimQubitIdType s_idx) const -> DevQubitIdType
+    {
+        // Single lookup through find(): unlike operator[], it can never insert an entry.
+        const auto s_idx_iter = this->qubits_map.find(s_idx);
+        RT_FAIL_IF(s_idx_iter == this->qubits_map.end(), "Invalid device qubit index");
+        return s_idx_iter->second;
+    }
+
+    auto getDeviceIds(const std::vector<SimQubitIdType> &ss_idx) const
+        -> std::vector<DevQubitIdType>
+    {
+        std::vector<DevQubitIdType> dd_idx;
+        dd_idx.reserve(ss_idx.size());
+        for (const auto &s : ss_idx) {
+            dd_idx.push_back(getDeviceId(s));
+        }
+        return dd_idx;
+    }
+
+    [[nodiscard]] auto getSimulatorId(DevQubitIdType d_idx) const -> SimQubitIdType
+    {
+        auto s_idx = std::find_if(this->qubits_map.begin(), this->qubits_map.end(),
+                                  [&d_idx](auto &&p) { return p.second == d_idx; });
+
+        RT_FAIL_IF(s_idx == this->qubits_map.end(), "Invalid simulator qubit index");
+        return s_idx->first;
+    }
+
+    [[nodiscard]] auto Allocate(DevQubitIdType d_next_idx) -> SimQubitIdType
+    {
+        this->qubits_map[this->next_idx++] = d_next_idx;
+        return this->next_idx - 1;
+    }
+
+    auto AllocateRange(DevQubitIdType start_idx, size_t size) -> std::vector<SimQubitIdType>
+    {
+        std::vector<SimQubitIdType> ids;
+        ids.reserve(size);
+        for (DevQubitIdType i = start_idx; i < start_idx + size; i++) {
+            ids.push_back(this->next_idx);
+            this->qubits_map[this->next_idx++] = i;
+        }
+        return ids;
+    }
+
+    void Release(SimQubitIdType s_idx)
+    {
+        _update_qubits_mapfrom(_remove_simulator_qubit_id(s_idx));
+    }
+
+    void ReleaseAll()
+    {
+        // Release all qubits by clearing the map.
+        this->qubits_map.clear();
+    }
+};
+} // namespace Catalyst::Runtime
diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp
new file mode 100644
index 0000000..0527ac4
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp
@@ -0,0 +1,304 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <optional>
+#include <random>
+#include <sstream>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <unordered_map>
+#include <utility>
+
+#include "Exception.hpp"
+#include "Types.h"
+
+#define QUANTUM_DEVICE_DEL_DECLARATIONS(CLASSNAME)                                                 \
+    CLASSNAME(const CLASSNAME &) = delete;                                                         \
+    CLASSNAME &operator=(const CLASSNAME &) = delete;                                              \
+    CLASSNAME(CLASSNAME &&) = delete;                                                              \
+    CLASSNAME &operator=(CLASSNAME &&) = delete;
+
+#define QUANTUM_DEVICE_RT_DECLARATIONS                                                             \
+    auto AllocateQubit()->QubitIdType override;                                                    \
+    auto AllocateQubits(size_t num_qubits)->std::vector<QubitIdType> override;                     \
+    void ReleaseQubit(QubitIdType q) override;                                                     \
+    void ReleaseAllQubits() override;                                                              \
+    [[nodiscard]] auto GetNumQubits() const->size_t override;                                      \
+    void StartTapeRecording() override;                                                            \
+    void StopTapeRecording() override;                                                             \
+    void SetDeviceShots(size_t shots) override;                                                    \
+    [[nodiscard]] auto GetDeviceShots() const->size_t override;                                    \
+    void PrintState() override;                                                                    \
+    [[nodiscard]] auto Zero() const->Result override;                                              \
+    [[nodiscard]] auto One() const->Result override;
+
+#define QUANTUM_DEVICE_QIS_DECLARATIONS                                                            \
+    void NamedOperation(                                                                           \
+        const std::string &name, const std::vector<double> &params,                                \
+        const std::vector<QubitIdType> &wires, [[maybe_unused]] bool inverse = false,              \
+        [[maybe_unused]] const std::vector<QubitIdType> &controlled_wires = {},                    \
+        [[maybe_unused]] const std::vector<bool> &controlled_values = {}) override;                \
+    using Catalyst::Runtime::QuantumDevice::MatrixOperation;                                       \
+    void MatrixOperation(                                                                          \
+        const std::vector<std::complex<double>> &matrix, const std::vector<QubitIdType> &wires,    \
+        [[maybe_unused]] bool inverse = false,                                                     \
+        [[maybe_unused]] const std::vector<QubitIdType> &controlled_wires = {},                    \
+        [[maybe_unused]] const std::vector<bool> &controlled_values = {}) override;                \
+    auto Observable(ObsId id, const std::vector<std::complex<double>> &matrix,                     \
+                    const std::vector<QubitIdType> &wires)                                         \
+        ->ObsIdType override;                                                                      \
+    auto TensorObservable(const std::vector<ObsIdType> &obs)->ObsIdType override;                  \
+    auto HamiltonianObservable(const std::vector<double> &coeffs,                                  \
+                               const std::vector<ObsIdType> &obs)                                  \
+        ->ObsIdType override;                                                                      \
+    auto Expval(ObsIdType obsKey)->double override;                                                \
+    auto Var(ObsIdType obsKey)->double override;                                                   \
+    void State(DataView<std::complex<double>, 1> &state) override;                                 \
+    void Probs(DataView<double, 1> &probs) override;                                               \
+    void PartialProbs(DataView<double, 1> &probs, const std::vector<QubitIdType> &wires) override; \
+    void Sample(DataView<double, 2> &samples, size_t shots) override;                              \
+    void PartialSample(DataView<double, 2> &samples, const std::vector<QubitIdType> &wires,        \
+                       size_t shots) override;                                                     \
+    void Counts(DataView<double, 1> &eigvals, DataView<int64_t, 1> &counts, size_t shots)          \
+        override;                                                                                  \
+    void PartialCounts(DataView<double, 1> &eigvals, DataView<int64_t, 1> &counts,                 \
+                       const std::vector<QubitIdType> &wires, size_t shots) override;              \
+    auto Measure(QubitIdType wire, std::optional<int32_t> postselect = std::nullopt)               \
+        ->Result override;                                                                         \
+    void Gradient(std::vector<DataView<double, 1>> &gradients,                                     \
+                  const std::vector<size_t> &trainParams) override;
+
+namespace Catalyst::Runtime {
+static inline auto parse_kwargs(std::string kwargs) -> std::unordered_map<std::string, std::string>
+{
+    // Parses a dict-like string "{'k': 'v', ...}" into a map; empty input yields an empty map.
+    if (kwargs.empty()) {
+        return {};
+    }
+
+    std::unordered_map<std::string, std::string> map;
+    size_t s3_pos = kwargs.find("\'s3_destination_folder\'");
+    if (s3_pos != std::string::npos) {
+        auto opening_pos = kwargs.find('(', s3_pos);
+        RT_ASSERT(opening_pos != std::string::npos);
+        auto closing_pos = kwargs.find(')', opening_pos);
+        RT_ASSERT(closing_pos != std::string::npos);
+        map["s3_destination_folder"] = kwargs.substr(opening_pos, closing_pos - opening_pos + 1);
+        kwargs.erase(s3_pos, closing_pos - s3_pos + 1); // keep entries listed after it
+    }
+
+    kwargs.erase(std::remove_if(kwargs.begin(), kwargs.end(),
+                                [](char c) {
+                                    switch (c) {
+                                    case '{':
+                                    case '}':
+                                    case ' ':
+                                    case '\'':
+                                        return true;
+                                    default:
+                                        return false;
+                                    }
+                                }),
+                 kwargs.end());
+
+    std::istringstream iss(kwargs);
+    std::string token;
+    while (std::getline(iss, token, ',')) {
+        if (token.empty()) {
+            continue; // separator left behind by the removed s3 entry
+        }
+        std::istringstream issp(token);
+        std::string pair[2];
+        std::getline(issp, pair[0], ':');
+        std::getline(issp, pair[1]);
+        map[pair[0]] = pair[1];
+    }
+    return map;
+}
+
+enum class MeasurementsT : uint8_t {
+    None, // = 0
+    Expval,
+    Var,
+    Probs,
+    State,
+};
+
+} // namespace Catalyst::Runtime
+
+namespace Catalyst::Runtime::Simulator::Lightning {
+enum class SimulatorGate : uint8_t {
+    // 1-qubit
+    Identity, // = 0
+    PauliX,
+    PauliY,
+    PauliZ,
+    Hadamard,
+    S,
+    T,
+    PhaseShift,
+    RX,
+    RY,
+    RZ,
+    Rot,
+    // 2-qubit
+    CNOT,
+    CY,
+    CZ,
+    SWAP,
+    ISWAP,
+    PSWAP,
+    IsingXX,
+    IsingYY,
+    IsingXY,
+    IsingZZ,
+    ControlledPhaseShift,
+    CRX,
+    CRY,
+    CRZ,
+    CRot,
+    // 3-qubit
+    CSWAP,
+    Toffoli,
+    // n-qubit
+    MultiRZ,
+};
+
+constexpr std::array simulator_observable_support = {
+    // ObsId, ObsName, SimulatorSupport
+    std::tuple<ObsId, std::string_view, bool>{ObsId::Identity, "Identity", true},
+    std::tuple<ObsId, std::string_view, bool>{ObsId::PauliX, "PauliX", true},
+    std::tuple<ObsId, std::string_view, bool>{ObsId::PauliY, "PauliY", true},
+    std::tuple<ObsId, std::string_view, bool>{ObsId::PauliZ, "PauliZ", true},
+    std::tuple<ObsId, std::string_view, bool>{ObsId::Hadamard, "Hadamard", true},
+};
+
+using GateInfoTupleT = std::tuple<SimulatorGate, std::string_view, size_t, size_t>;
+
+constexpr std::array simulator_gate_info = {
+    // 1-qubit
+    GateInfoTupleT{SimulatorGate::Identity, "Identity", 1, 0},
+    GateInfoTupleT{SimulatorGate::PauliX, "PauliX", 1, 0},
+    GateInfoTupleT{SimulatorGate::PauliY, "PauliY", 1, 0},
+    GateInfoTupleT{SimulatorGate::PauliZ, "PauliZ", 1, 0},
+    GateInfoTupleT{SimulatorGate::Hadamard, "Hadamard", 1, 0},
+    GateInfoTupleT{SimulatorGate::S, "S", 1, 0},
+    GateInfoTupleT{SimulatorGate::T, "T", 1, 0},
+    GateInfoTupleT{SimulatorGate::PhaseShift, "PhaseShift", 1, 1},
+    GateInfoTupleT{SimulatorGate::RX, "RX", 1, 1},
+    GateInfoTupleT{SimulatorGate::RY, "RY", 1, 1},
+    GateInfoTupleT{SimulatorGate::RZ, "RZ", 1, 1},
+    GateInfoTupleT{SimulatorGate::Rot, "Rot", 1, 3},
+    // 2-qubit
+    GateInfoTupleT{SimulatorGate::CNOT, "CNOT", 2, 0},
+    GateInfoTupleT{SimulatorGate::CY, "CY", 2, 0},
+    GateInfoTupleT{SimulatorGate::CZ, "CZ", 2, 0},
+    GateInfoTupleT{SimulatorGate::SWAP, "SWAP", 2, 0},
+    GateInfoTupleT{SimulatorGate::ISWAP, "ISWAP", 2, 0},
+    GateInfoTupleT{SimulatorGate::PSWAP, "PSWAP", 2, 1},
+    GateInfoTupleT{SimulatorGate::IsingXX, "IsingXX", 2, 1},
+    GateInfoTupleT{SimulatorGate::IsingYY, "IsingYY", 2, 1},
+    GateInfoTupleT{SimulatorGate::IsingXY, "IsingXY", 2, 1},
+    GateInfoTupleT{SimulatorGate::IsingZZ, "IsingZZ", 2, 1},
+    GateInfoTupleT{SimulatorGate::ControlledPhaseShift, "ControlledPhaseShift", 2, 1},
+    GateInfoTupleT{SimulatorGate::CRX, "CRX", 2, 1},
+    GateInfoTupleT{SimulatorGate::CRY, "CRY", 2, 1},
+    GateInfoTupleT{SimulatorGate::CRZ, "CRZ", 2, 1},
+    GateInfoTupleT{SimulatorGate::CRot, "CRot", 2, 3},
+    // 3-qubit
+    GateInfoTupleT{SimulatorGate::CSWAP, "CSWAP", 3, 0},
+    GateInfoTupleT{SimulatorGate::Toffoli, "Toffoli", 3, 0},
+    // n-qubit
+    GateInfoTupleT{SimulatorGate::MultiRZ, "MultiRZ", 0, 1},
+};
+
+constexpr size_t simulator_gate_info_size = simulator_gate_info.size();
+constexpr size_t simulator_observable_support_size = simulator_observable_support.size();
+
+template <size_t size = simulator_gate_info_size>
+using SimulatorGateInfoDataT = std::array<GateInfoTupleT, size>;
+
+template <size_t size = simulator_observable_support_size>
+constexpr auto lookup_obs(const std::array<std::tuple<ObsId, std::string_view, bool>, size> &arr,
+                          const ObsId key) -> std::string_view
+{
+    // Return the name of the first supported entry whose id equals `key`; throw otherwise.
+    for (const auto &[obs_id, obs_name, is_supported] : arr) {
+        if (is_supported && obs_id == key) {
+            return obs_name;
+        }
+    }
+    throw std::range_error("The given observable is not supported by the simulator");
+}
+
+template <size_t size = simulator_gate_info_size>
+constexpr auto lookup_gates(const SimulatorGateInfoDataT<size> &arr, const std::string &key)
+    -> std::pair<size_t, size_t>
+{
+    // Linear scan by gate name; yields {number of wires, number of parameters} or throws.
+    for (const auto &[gate, gate_name, num_wires, num_params] : arr) {
+        if (gate_name == key) {
+            return {num_wires, num_params};
+        }
+    }
+    throw std::range_error("The given operation is not supported by the simulator");
+}
+
+template <size_t size = simulator_gate_info_size>
+constexpr auto has_gate(const SimulatorGateInfoDataT<size> &arr, const std::string &key) -> bool
+{
+    for (const auto &gate_info : arr) { // scan every table entry for the requested name
+        if (std::get<1>(gate_info) == key) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static inline auto
+simulateDraw(const std::vector<double> &probs, std::optional<int32_t> postselect,
+             std::mt19937 *gen = nullptr) // NOLINT(readability-non-const-parameter)
+    -> bool
+{
+    if (postselect) {
+        auto postselect_value = postselect.value();
+        RT_FAIL_IF(postselect_value < 0 || postselect_value > 1, "Invalid postselect value");
+        RT_FAIL_IF(probs[postselect_value] == 0, "Probability of postselect value is 0");
+        return static_cast<bool>(postselect_value == 1);
+    }
+
+    // Normal flow, no post-selection: draw u uniformly from [0, 1) and report outcome 1
+    // when u exceeds probs[0] (probs is indexed by outcome, as in the postselect branch).
+    std::uniform_real_distribution<> dis(0., 1.);
+    // Keep the draw in double precision: narrowing it to float can flip the comparison.
+    double draw;
+    if (gen != nullptr) {
+        draw = dis(*gen);
+        (*gen)(); // one extra value is discarded, advancing the caller's generator again
+    }
+    else {
+        std::random_device rd;
+        std::mt19937 gen_no_seed(rd());
+        draw = dis(gen_no_seed);
+    }
+
+    return draw > probs[0];
+}
+
+} // namespace Catalyst::Runtime::Simulator::Lightning
diff --git a/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt
new file mode 100644
index 0000000..e05e9bf
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt
@@ -0,0 +1,57 @@
+##################################
+# Object Lib catalyst_qir_qis_obj
+##################################
+
+add_library(catalyst_qir_qis_obj OBJECT RuntimeCAPI.cpp)
+
+# Make the external MLIR runner utilities available; the MLIRRunnerUtils
+# source directory serves as an include root further down.
+FetchContent_MakeAvailable(MLIRRunnerUtils MLIRCRunnerUtils MLIRFloat16Bits)
+
+# System libraries: the platform's dynamic-loading library, pthread and dl.
+target_link_libraries(catalyst_qir_qis_obj
+    ${CMAKE_DL_LIBS}
+    pthread
+    dl
+)
+
+# Include roots: this directory, the runtime headers, the fetched MLIR
+# sources and the MLIR driver sources.
+target_include_directories(catalyst_qir_qis_obj
+    PUBLIC . ${CMAKE_CURRENT_SOURCE_DIR}
+    ${runtime_includes}
+    ${mlirrunnerutils_SOURCE_DIR}/../..  # includes are relative to mlir/ExecutionEngine
+    ${PROJECT_SOURCE_DIR}/../mlir/lib/Driver  # Timer.hpp
+)
+
+# The MLIR Runner Utils raise unused-parameter warnings; disable them for -Werror builds.
+if(RUNTIME_ENABLE_WARNINGS)
+    target_compile_options(catalyst_qir_qis_obj PRIVATE "-Wno-unused-parameter")
+endif()
+
+set_target_properties(catalyst_qir_qis_obj PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+#####################
+# Shared Lib rt_capi
+#####################
+
+add_library(rt_capi SHARED)
+
+target_link_libraries(rt_capi ${CMAKE_DL_LIBS} catalyst_qir_qis_obj)
+# add_dependencies only orders the build: the registry is built before rt_capi.
+add_dependencies(rt_capi catalyst_callback_registry)
+
+target_include_directories(rt_capi
+    PUBLIC . ${CMAKE_CURRENT_SOURCE_DIR}
+    ${runtime_includes}
+    ${capi_utils_includes}
+)
+
+set_target_properties(rt_capi PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH "$<TARGET_FILE_DIR:catalyst_callback_registry>")
+# Also search next to the library itself, using the platform's loader-relative token.
+if(APPLE)
+    set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH @loader_path)
+else()
+    set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH $ORIGIN)
+endif()
diff --git a/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp b/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp
new file mode 100644
index 0000000..9abe8cb
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp
@@ -0,0 +1,367 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <dlfcn.h>
+
+#include <cstdio>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <random>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "Exception.hpp"
+#include "QuantumDevice.hpp"
+#include "Types.h"
+
+extern void callbackCall(int64_t, int64_t, int64_t, va_list);
+
+namespace Catalyst::Runtime {
+
+extern "C" void __catalyst_inactive_callback(int64_t identifier, int64_t argc, int64_t retc, ...);
+
+class MemoryManager // NOLINT(cppcoreguidelines-special-member-functions,
+                    // hicpp-special-member-functions)
+    final {
+  private:
+    std::unordered_set<void *> _impl; // Tracked allocations; each one is freed by the destructor
+    std::mutex mu; // Guards every access to _impl
+
+  public:
+    explicit MemoryManager() { _impl.reserve(1024); };
+
+    ~MemoryManager()
+    {
+        // Lock the mutex while every pointer still tracked in _impl is freed
+        std::lock_guard<std::mutex> lock(mu);
+        for (auto *allocation : _impl) {
+            free(allocation); // NOLINT(cppcoreguidelines-no-malloc, hicpp-no-malloc)
+        }
+    }
+
+    void insert(void *ptr)
+    {
+        // Lock the mutex to protect the _impl update (start tracking ptr)
+        std::lock_guard<std::mutex> lock(mu);
+        _impl.insert(ptr);
+    }
+    void erase(void *ptr)
+    {
+        // Lock the mutex to protect the _impl update (stop tracking ptr; ptr is not freed here)
+        std::lock_guard<std::mutex> lock(mu);
+        _impl.erase(ptr);
+    }
+    bool contains(void *ptr)
+    {
+        // Lock the mutex to protect the _impl lookup
+        std::lock_guard<std::mutex> lock(mu);
+        return _impl.contains(ptr);
+    }
+};
+
+class SharedLibraryManager final {
+  private:
+    void *_handler{nullptr};
+
+  public:
+    SharedLibraryManager() = delete;
+    explicit SharedLibraryManager(const std::string &filename)
+    {
+#ifdef __APPLE__
+        auto rtld_flags = RTLD_LAZY;
+#else
+        // Closing the dynamic library of Lightning simulators with dlclose() where OpenMP
+        // directives (in Lightning simulators) are in use would raise memory segfaults.
+        // Note that we use RTLD_NODELETE as a workaround to fix the issue.
+        auto rtld_flags = RTLD_LAZY | RTLD_NODELETE;
+#endif
+
+        _handler = dlopen(filename.c_str(), rtld_flags);
+        RT_FAIL_IF(!_handler, dlerror());
+    }
+
+    ~SharedLibraryManager()
+    {
+        // dlopen and dlclose increment and decrement reference counters.
+        // Since we have a guaranteed _handler in a valid SharedLibraryManager instance
+        // then we don't really need to worry about dlclose.
+        // In other words, there is a one-to-one correspondence between an instance
+        // of SharedLibraryManager and an increase in the reference count for the dynamic library.
+        // dlclose returns non-zero on error.
+        //
+        // Errors in dlclose are implementation dependent.
+        // There are two possible errors during dlclose in glibc: "shared object not open"
+        // and "cannot create scope list". Look for _dl_signal_error in:
+        //
+        //     https://codebrowser.dev/glibc/glibc/elf/dl-close.c.html
+        //
+        // This means that at the very least, one could trigger an error in the following line by
+        // doing the following: dlopen the same library and close it multiple times in a different
+        // location.
+        //
+        // This would mean that the reference count would be less than the number of instances
+        // of SharedLibraryManager.
+        //
+        // There really is no way to protect against this error, except to always use
+        // SharedLibraryManager to manage shared libraries.
+        //
+        // Exercise for the reader, how could one trigger the "cannot create scope list" error?
+        dlclose(_handler);
+    }
+
+    SharedLibraryManager(const SharedLibraryManager &other) = delete;
+    SharedLibraryManager &operator=(const SharedLibraryManager &other) = delete;
+    SharedLibraryManager(SharedLibraryManager &&other) = delete;
+    SharedLibraryManager &operator=(SharedLibraryManager &&other) = delete;
+
+    void *getSymbol(const std::string &symbol)
+    {
+        void *sym = dlsym(_handler, symbol.c_str());
+        RT_FAIL_IF(!sym, dlerror());
+        return sym;
+    }
+};
+
+/**
+ * This indicates the various stages a device can be in:
+ * - `Active`   : The device is added to the device pool and the `ExecutionContext` device pointer
+ *                (`RTD_PTR`) points to this device instance. The CAPI routines have only access to
+ *                one single active device per thread via `RTD_PTR`.
+ * - `Inactive`  : The device is deactivated meaning `RTD_PTR` does not point to this device.
+ *                 The device is not removed from the pool, allowing the `ExecutionContext` manager
+ *                 to reuse this device in a multi-qnode workflow when another device with identical
+ *                 specifications is requested.
+ */
+enum class RTDeviceStatus : uint8_t {
+    Active = 0,
+    Inactive,
+};
+
+extern "C" Catalyst::Runtime::QuantumDevice *GenericDeviceFactory(const char *kwargs);
+
+/**
+ * Runtime Device data-class.
+ *
+ * This class introduces an interface for constructed devices by the `ExecutionContext`
+ * manager. This includes the device name, library, kwargs, and a unique pointer to the
+ * `QuantumDevice` entry point.
+ */
+class RTDevice {
+  private:
+    std::string rtd_lib;
+    std::string rtd_name;
+    std::string rtd_kwargs;
+
+    std::unique_ptr<SharedLibraryManager> rtd_dylib{nullptr};
+    std::unique_ptr<QuantumDevice> rtd_qdevice{nullptr};
+
+    RTDeviceStatus status{RTDeviceStatus::Inactive};
+
+    static void _complete_dylib_os_extension(std::string &rtd_lib, const std::string &name) noexcept
+    {
+#ifdef __linux__
+        rtd_lib = "librtd_" + name + ".so";
+#elif defined(__APPLE__)
+        rtd_lib = "librtd_" + name + ".dylib";
+#endif
+    }
+
+    static void _pl2runtime_device_info(std::string &rtd_lib, std::string &rtd_name) noexcept
+    {
+        // The following if-elif is required for C++ tests where these backend devices
+        // are linked in the interface library of the runtime. (check runtime/CMakeLists.txt)
+        // Besides, this provides support for runtime device (RTD) libraries added to the system
+        // path. This maintains backward compatibility for specifying a device using its name.
+        // TODO: This support may need to be removed after updating the C++ unit tests.
+        if (rtd_lib == "null.qubit") {
+            rtd_name = "NullQubit";
+            _complete_dylib_os_extension(rtd_lib, "null_qubit");
+        }
+        else if (rtd_lib == "lightning.qubit") {
+            rtd_name = "LightningSimulator";
+            _complete_dylib_os_extension(rtd_lib, "lightning");
+        }
+        else if (rtd_lib == "braket.aws.qubit" || rtd_lib == "braket.local.qubit") {
+            rtd_name = "OpenQasmDevice";
+            _complete_dylib_os_extension(rtd_lib, "openqasm");
+        }
+    }
+
+  public:
+    explicit RTDevice(std::string _rtd_lib, std::string _rtd_name = {},
+                      std::string _rtd_kwargs = {})
+        : rtd_lib(std::move(_rtd_lib)), rtd_name(std::move(_rtd_name)),
+          rtd_kwargs(std::move(_rtd_kwargs))
+    {
+        _pl2runtime_device_info(rtd_lib, rtd_name);
+    }
+
+    explicit RTDevice(std::string_view _rtd_lib, std::string_view _rtd_name,
+                      std::string_view _rtd_kwargs)
+        : rtd_lib(_rtd_lib), rtd_name(_rtd_name), rtd_kwargs(_rtd_kwargs)
+    {
+        _pl2runtime_device_info(rtd_lib, rtd_name);
+    }
+
+    ~RTDevice() = default;
+    RTDevice(const RTDevice &other) = delete;
+    RTDevice &operator=(const RTDevice &other) = delete;
+    RTDevice(RTDevice &&other) = delete;
+    RTDevice &operator=(RTDevice &&other) = delete;
+
+    auto operator==(const RTDevice &other) const -> bool
+    {
+        return (this->rtd_lib == other.rtd_lib && this->rtd_name == other.rtd_name) &&
+               this->rtd_kwargs == other.rtd_kwargs;
+    }
+
+    [[nodiscard]] auto getQuantumDevicePtr() -> const std::unique_ptr<QuantumDevice> &
+    {
+        if (rtd_qdevice) {
+            return rtd_qdevice;
+        }
+
+        rtd_dylib = std::make_unique<SharedLibraryManager>(rtd_lib);
+        std::string factory_name{rtd_name + "Factory"};
+        void *f_ptr = rtd_dylib->getSymbol(factory_name);
+        rtd_qdevice = std::unique_ptr<QuantumDevice>(
+            (f_ptr != nullptr)
+                ? reinterpret_cast<decltype(GenericDeviceFactory) *>(f_ptr)(rtd_kwargs.c_str())
+                : nullptr);
+        return rtd_qdevice;
+    }
+
+    [[nodiscard]] auto getDeviceInfo() const -> std::tuple<std::string, std::string, std::string>
+    {
+        return {rtd_lib, rtd_name, rtd_kwargs};
+    }
+
+    [[nodiscard]] auto getDeviceName() const -> const std::string & { return rtd_name; }
+
+    void setDeviceStatus(RTDeviceStatus new_status) noexcept { status = new_status; }
+
+    [[nodiscard]] auto getDeviceStatus() const -> RTDeviceStatus { return status; }
+
+    friend std::ostream &operator<<(std::ostream &os, const RTDevice &device)
+    {
+        os << "RTD, name: " << device.rtd_name << " lib: " << device.rtd_lib
+           << " kwargs: " << device.rtd_kwargs;
+        return os;
+    }
+};
+
+class ExecutionContext final {
+  private:
+    // Device pool
+    std::vector<std::shared_ptr<RTDevice>> device_pool;
+    std::mutex pool_mu; // To protect device_pool
+
+    bool initial_tape_recorder_status{false};
+
+    // ExecutionContext pointers
+    std::unique_ptr<MemoryManager> memory_man_ptr{nullptr};
+
+    // PRNG
+    uint32_t *seed;
+    std::mt19937 gen;
+
+  public:
+    explicit ExecutionContext(uint32_t *seed = nullptr) : seed(seed)
+    {
+        memory_man_ptr = std::make_unique<MemoryManager>();
+
+        if (this->seed != nullptr) {
+            this->gen = std::mt19937(*seed);
+        }
+    }
+
+    ~ExecutionContext() = default;
+    ExecutionContext(const ExecutionContext &other) = delete;
+    ExecutionContext &operator=(const ExecutionContext &other) = delete;
+    ExecutionContext(ExecutionContext &&other) = delete;
+    ExecutionContext &operator=(ExecutionContext &&other) = delete;
+
+    void setDeviceRecorderStatus(bool status) noexcept { initial_tape_recorder_status = status; }
+
+    [[nodiscard]] auto getDeviceRecorderStatus() const -> bool
+    {
+        return initial_tape_recorder_status;
+    }
+
+    [[nodiscard]] auto getMemoryManager() const -> const std::unique_ptr<MemoryManager> &
+    {
+        return memory_man_ptr;
+    }
+
+    [[nodiscard]] auto getOrCreateDevice(std::string_view rtd_lib, std::string_view rtd_name,
+                                         std::string_view rtd_kwargs)
+        -> const std::shared_ptr<RTDevice> &
+    {
+        std::lock_guard<std::mutex> lock(pool_mu);
+
+        auto device = std::make_shared<RTDevice>(rtd_lib, rtd_name, rtd_kwargs);
+
+        const size_t key = device_pool.size();
+        for (size_t i = 0; i < key; i++) {
+            if (device_pool[i]->getDeviceStatus() == RTDeviceStatus::Inactive &&
+                *device_pool[i] == *device) {
+                device_pool[i]->setDeviceStatus(RTDeviceStatus::Active);
+                return device_pool[i];
+            }
+        }
+
+        RT_ASSERT(device->getQuantumDevicePtr());
+
+        // Add a new device
+        device->setDeviceStatus(RTDeviceStatus::Active);
+        if (this->seed != nullptr) {
+            device->getQuantumDevicePtr()->SetDevicePRNG(&(this->gen));
+        }
+        else {
+            device->getQuantumDevicePtr()->SetDevicePRNG(nullptr);
+        }
+        device_pool.push_back(device);
+
+        return device_pool[key];
+    }
+
+    [[nodiscard]] auto getOrCreateDevice(const std::string &rtd_lib,
+                                         const std::string &rtd_name = {},
+                                         const std::string &rtd_kwargs = {})
+        -> const std::shared_ptr<RTDevice> &
+    {
+        return getOrCreateDevice(std::string_view{rtd_lib}, std::string_view{rtd_name},
+                                 std::string_view{rtd_kwargs});
+    }
+
+    [[nodiscard]] auto getDevice(size_t device_key) -> const std::shared_ptr<RTDevice> &
+    {
+        std::lock_guard<std::mutex> lock(pool_mu);
+        RT_FAIL_IF(device_key >= device_pool.size(), "Invalid device_key");
+        return device_pool[device_key];
+    }
+
+    void deactivateDevice(RTDevice *RTD_PTR)
+    {
+        std::lock_guard<std::mutex> lock(pool_mu);
+        RTD_PTR->setDeviceStatus(RTDeviceStatus::Inactive);
+    }
+};
+} // namespace Catalyst::Runtime
diff --git a/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp b/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp
new file mode 100644
index 0000000..481da78
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp
@@ -0,0 +1,48 @@
+// Copyright 2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstddef>
+
+#include "mlir/ExecutionEngine/RunnerUtils.h"
+
+extern "C" {
+void *_mlir_memref_to_llvm_alloc(size_t size);
+void *_mlir_memref_to_llvm_aligned_alloc(size_t alignment, size_t size);
+bool _mlir_memory_transfer(void *);
+void _mlir_memref_to_llvm_free(void *ptr);
+}
+
+// MemRef type definition
+template <typename T, size_t R> struct MemRefT {
+    T *data_allocated;
+    T *data_aligned;
+    size_t offset;
+    size_t sizes[R];
+    size_t strides[R];
+};
+
+template <typename T>
+inline void printMemref(const UnrankedMemRefType<T> &memref, bool printDescriptor = false)
+{
+    auto m = DynamicMemRefType<T>(memref);
+    if (printDescriptor) {
+        std::cout << "MemRef: ";
+        printMemRefMetaData(std::cout, m);
+        std::cout << " data =" << std::endl;
+    }
+    impl::MemRefDataPrinter<T>::print(std::cout, m.data, m.rank, m.rank, m.offset, m.sizes,
+                                      m.strides);
+}
diff --git a/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp b/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp
new file mode 100644
index 0000000..8c1e019
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp
@@ -0,0 +1,1012 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdarg>
+#include <cstdlib>
+#include <ctime>
+
+#include <bitset>
+#include <stdexcept>
+
+#include <memory>
+#include <ostream>
+#include <string_view>
+
+#include "mlir/ExecutionEngine/CRunnerUtils.h"
+
+#include "Exception.hpp"
+#include "QuantumDevice.hpp"
+
+#include "ExecutionContext.hpp"
+#include "MemRefUtils.hpp"
+#include "Timer.hpp"
+
+#include "RuntimeCAPI.h"
+
+namespace Catalyst::Runtime {
+
+/**
+ * @brief Global quantum device unique pointer.
+ */
+static std::unique_ptr<ExecutionContext> CTX = nullptr;
+
+/**
+ * @brief Thread local device pointer with internal linkage.
+ */
+thread_local static RTDevice *RTD_PTR = nullptr;
+
+bool getModifiersAdjoint(const Modifiers *modifiers)
+{
+    return modifiers != nullptr && modifiers->adjoint; // no modifiers means "not adjoint"
+}
+
+std::vector<QubitIdType> getModifiersControlledWires(const Modifiers *modifiers)
+{
+    return !modifiers ? std::vector<QubitIdType>()
+                      : std::vector<QubitIdType>(
+                            reinterpret_cast<QubitIdType *>(modifiers->controlled_wires),
+                            reinterpret_cast<QubitIdType *>(modifiers->controlled_wires) +
+                                modifiers->num_controlled);
+}
+
+std::vector<bool> getModifiersControlledValues(const Modifiers *modifiers)
+{
+    return !modifiers ? std::vector<bool>()
+                      : std::vector<bool>(modifiers->controlled_values,
+                                          modifiers->controlled_values + modifiers->num_controlled);
+}
+
+#define MODIFIERS_ARGS(mod)                                                                        \
+    getModifiersAdjoint(mod), getModifiersControlledWires(mod), getModifiersControlledValues(mod)
+
+/**
+ * @brief Initialize the device instance and update the value of RTD_PTR
+ * to the new initialized device pointer.
+ */
+[[nodiscard]] bool initRTDevicePtr(std::string_view rtd_lib, std::string_view rtd_name,
+                                   std::string_view rtd_kwargs)
+{
+    const auto &pooled = CTX->getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs);
+    if (!pooled) {
+        return false; // nothing to activate; RTD_PTR is left untouched
+    }
+    RTD_PTR = pooled.get();
+    return RTD_PTR != nullptr;
+}
+
+/**
+ * @brief get the active device.
+ */
+auto getQuantumDevicePtr() -> const std::unique_ptr<QuantumDevice> &
+{
+    return RTD_PTR->getQuantumDevicePtr();
+}
+
+/**
+ * @brief Inactivate the active device instance.
+ */
+void deactivateDevice()
+{
+    CTX->deactivateDevice(RTD_PTR);
+    RTD_PTR = nullptr;
+}
+} // namespace Catalyst::Runtime
+
+extern "C" {
+
+using namespace Catalyst::Runtime;
+using timer = catalyst::utils::Timer;
+
+void __catalyst_inactive_callback(int64_t identifier, int64_t argc, int64_t retc, ...)
+{
+    // LIBREGISTRY is a compile time macro. It is defined based on the output
+    // name of the callback library. And since it is stored in the same location
+    // as this library, it shares the ORIGIN variable. Do a `git grep LIBREGISTRY`
+    // to find its definition in the CMakeFiles.
+    // It is the name of the library that contains the callbackCall implementation.
+    // The reason why this is using dlopen is because we have historically wanted
+    // to avoid a dependency of python in the runtime.
+    // With dlopen, we leave the possibility of linking against the runtime without
+    // linking with LIBREGISTRY which is implemented as a pybind11 module.
+    //
+    // The only restriction is that there should be no calls to pyregistry.
+    //
+    // This function cannot be tested from the runtime tests because there would be no valid python
+    // function to callback...
+    void *handle = dlopen(LIBREGISTRY, RTLD_LAZY);
+    if (!handle) {
+        char *err_msg = dlerror();
+        RT_FAIL(err_msg);
+    }
+
+    void (*callbackCall)(int64_t, int64_t, int64_t, va_list);
+    typedef void (*func_ptr_t)(int64_t, int64_t, int64_t, va_list);
+    callbackCall = (func_ptr_t)dlsym(handle, "callbackCall");
+    if (!callbackCall) {
+        char *err_msg = dlerror();
+        RT_FAIL(err_msg);
+    }
+
+    va_list args;
+    va_start(args, retc);
+    callbackCall(identifier, argc, retc, args);
+    va_end(args);
+    dlclose(handle);
+}
+
+void __catalyst__host__rt__unrecoverable_error()
+{
+    RT_FAIL("Unrecoverable error from asynchronous execution of multiple quantum programs.");
+}
+
+void *_mlir_memref_to_llvm_alloc(size_t size)
+{
+    void *ptr = malloc(size);
+    CTX->getMemoryManager()->insert(ptr);
+    return ptr;
+}
+
+void *_mlir_memref_to_llvm_aligned_alloc(size_t alignment, size_t size)
+{
+    void *ptr = aligned_alloc(alignment, size);
+    CTX->getMemoryManager()->insert(ptr);
+    return ptr;
+}
+
+bool _mlir_memory_transfer(void *ptr)
+{
+    if (!CTX->getMemoryManager()->contains(ptr)) {
+        return false;
+    }
+    CTX->getMemoryManager()->erase(ptr);
+    return true;
+}
+
+void _mlir_memref_to_llvm_free(void *ptr)
+{
+    CTX->getMemoryManager()->erase(ptr);
+    free(ptr);
+}
+
+void __catalyst__rt__print_string(char *string)
+{
+    if (!string) {
+        std::cout << "None" << std::endl;
+        return;
+    }
+    std::cout << string << std::endl;
+}
+
+void __catalyst__rt__assert_bool(bool p, char *s) { RT_FAIL_IF(!p, s); }
+
+void __catalyst__rt__print_tensor(OpaqueMemRefT *c_memref, bool printDescriptor)
+{
+    if (c_memref->datatype == NumericType::idx) {
+        printMemref<impl::index_type>({c_memref->rank, c_memref->descriptor}, printDescriptor);
+    }
+    else if (c_memref->datatype == NumericType::i1) {
+        printMemref<bool>({c_memref->rank, c_memref->descriptor}, printDescriptor);
+    }
+    else if (c_memref->datatype == NumericType::i8) {
+        printMemref<int8_t>({c_memref->rank, c_memref->descriptor}, printDescriptor);
+    }
+    else if (c_memref->datatype == NumericType::i16) {
+        printMemref<int16_t>({c_memref->rank, c_memref->descriptor}, printDescriptor);
+    }
+    else if (c_memref->datatype == NumericType::i32) {
+        printMemref<int32_t>({c_memref->rank, c_memref->descriptor}, printDescriptor);
+    }
+    else if (c_memref->datatype == NumericType::i64) {
+        printMemref<int64_t>({c_memref->rank, c_memref->descriptor}, printDescriptor);
+    }
+    else if (c_memref->datatype == NumericType::f32) {
+        printMemref<float>({c_memref->rank, c_memref->descriptor}, printDescriptor);
+    }
+    else if (c_memref->datatype == NumericType::f64) {
+        printMemref<double>({c_memref->rank, c_memref->descriptor}, printDescriptor);
+    }
+    else if (c_memref->datatype == NumericType::c64) {
+        printMemref<impl::complex32>({c_memref->rank, c_memref->descriptor}, printDescriptor);
+    }
+    else if (c_memref->datatype == NumericType::c128) {
+        printMemref<impl::complex64>({c_memref->rank, c_memref->descriptor}, printDescriptor);
+    }
+    else {
+        RT_FAIL("Unkown numeric type encoding for array printing.");
+    }
+
+    std::cout << std::endl;
+}
+
+void __catalyst__rt__fail_cstr(const char *cstr) { RT_FAIL(cstr); }
+
+void __catalyst__rt__initialize(uint32_t *seed) { CTX = std::make_unique<ExecutionContext>(seed); }
+
+void __catalyst__rt__finalize()
+{
+    RTD_PTR = nullptr;
+    CTX.reset(nullptr);
+}
+
+static int __catalyst__rt__device_init__impl(int8_t *rtd_lib, int8_t *rtd_name, int8_t *rtd_kwargs,
+                                             int64_t shots)
+{
+    // Device library cannot be a nullptr
+    RT_FAIL_IF(!rtd_lib, "Invalid device library");
+    RT_FAIL_IF(!CTX, "Invalid use of the global driver before initialization");
+    RT_FAIL_IF(RTD_PTR, "Cannot re-initialize an ACTIVE device: Consider using "
+                        "__catalyst__rt__device_release before __catalyst__rt__device_init");
+
+    const std::vector<std::string_view> args{
+        reinterpret_cast<char *>(rtd_lib), (rtd_name ? reinterpret_cast<char *>(rtd_name) : ""),
+        (rtd_kwargs ? reinterpret_cast<char *>(rtd_kwargs) : "")};
+    RT_FAIL_IF(!initRTDevicePtr(args[0], args[1], args[2]),
+               "Failed initialization of the backend device");
+    getQuantumDevicePtr()->SetDeviceShots(shots);
+    if (CTX->getDeviceRecorderStatus()) {
+        getQuantumDevicePtr()->StartTapeRecording();
+    }
+    return 0;
+}
+
+void __catalyst__rt__device_init(int8_t *rtd_lib, int8_t *rtd_name, int8_t *rtd_kwargs,
+                                 int64_t shots)
+{
+    timer::timer(__catalyst__rt__device_init__impl, "device_init", /* add_endl */ true, rtd_lib,
+                 rtd_name, rtd_kwargs, shots);
+}
+
+static int __catalyst__rt__device_release__impl() // Deactivates this thread's ACTIVE device.
+{
+    RT_FAIL_IF(!CTX, "Cannot release an ACTIVE device out of scope of the global driver");
+    RT_FAIL_IF(!RTD_PTR, "Cannot release a device that is not ACTIVE"); // avoids a null deref
+    deactivateDevice(); // TODO: This will be used for the async support
+    return 0;
+}
+
+void __catalyst__rt__device_release()
+{
+    timer::timer(__catalyst__rt__device_release__impl, "device_release", /* add_endl */ true);
+}
+
+void __catalyst__rt__print_state() { getQuantumDevicePtr()->PrintState(); }
+
+void __catalyst__rt__toggle_recorder(bool status)
+{
+    CTX->setDeviceRecorderStatus(status);
+    if (!RTD_PTR) {
+        return;
+    }
+
+    if (status) {
+        getQuantumDevicePtr()->StartTapeRecording();
+    }
+    else {
+        getQuantumDevicePtr()->StopTapeRecording();
+    }
+}
+
+static QUBIT *__catalyst__rt__qubit_allocate__impl()
+{
+    RT_ASSERT(getQuantumDevicePtr() != nullptr);
+    RT_ASSERT(CTX->getMemoryManager() != nullptr);
+
+    return reinterpret_cast<QUBIT *>(getQuantumDevicePtr()->AllocateQubit());
+}
+
+QUBIT *__catalyst__rt__qubit_allocate()
+{
+    return timer::timer(__catalyst__rt__qubit_allocate__impl, "qubit_allocate",
+                        /* add_endl */ true);
+}
+
+static QirArray *__catalyst__rt__qubit_allocate_array__impl(int64_t num_qubits)
+{
+    RT_ASSERT(getQuantumDevicePtr() != nullptr);
+    RT_ASSERT(CTX->getMemoryManager() != nullptr);
+    RT_ASSERT(num_qubits >= 0);
+
+    // For first prototype, we just want to make this work.
+    // But ideally, I think the device should determine the representation.
+    // Essentially just forward this to the device library.
+    // And the device library can choose how to handle everything.
+    // The heap vector is built directly from the call result, so a vector returned by value
+    // is moved rather than copied element by element. The caller owns it through the opaque
+    // QirArray handle; it is deleted in __catalyst__rt__qubit_release_array.
+    std::vector<QubitIdType> *qubit_vector_ptr =
+        new std::vector<QubitIdType>(getQuantumDevicePtr()->AllocateQubits(num_qubits));
+
+    // Because this function is interfacing with C
+    // I think we should return a trivial-type
+    //     https://en.cppreference.com/w/cpp/named_req/TrivialType
+    // Why should we return a trivial type?
+    //
+    // Paraphrasing from stackoverflow: https://stackoverflow.com/a/72409589
+    //     extern "C" will avoid name mangling from happening.
+    //     It doesn't prevent a function from returning or accepting a C++ type.
+    //     But the calling language needs to understand the data-layout for the
+    //     type being returned.
+    //     For non-trivial types, this will be difficult to impossible.
+    return (QirArray *)qubit_vector_ptr;
+}
+
+QirArray *__catalyst__rt__qubit_allocate_array(int64_t num_qubits)
+{
+    return timer::timer(__catalyst__rt__qubit_allocate_array__impl, "qubit_allocate_array",
+                        /* add_endl */ true, num_qubits);
+}
+
+static int __catalyst__rt__qubit_release__impl(QUBIT *qubit)
+{
+    getQuantumDevicePtr()->ReleaseQubit(reinterpret_cast<QubitIdType>(qubit));
+    return 0;
+}
+
+void __catalyst__rt__qubit_release(QUBIT *qubit)
+{
+    timer::timer(__catalyst__rt__qubit_release__impl, "qubit_release",
+                 /* add_endl */ true, qubit);
+}
+
+static int __catalyst__rt__qubit_release_array__impl(QirArray *qubit_array)
+{
+    getQuantumDevicePtr()->ReleaseAllQubits();
+    std::vector<QubitIdType> *qubit_array_ptr =
+        reinterpret_cast<std::vector<QubitIdType> *>(qubit_array);
+    delete qubit_array_ptr;
+    return 0;
+}
+
+void __catalyst__rt__qubit_release_array(QirArray *qubit_array)
+{
+    timer::timer(__catalyst__rt__qubit_release_array__impl, "qubit_release_array",
+                 /* add_endl */ true, qubit_array);
+}
+
+int64_t __catalyst__rt__num_qubits()
+{
+    return static_cast<int64_t>(getQuantumDevicePtr()->GetNumQubits());
+}
+
+bool __catalyst__rt__result_equal(RESULT *r0, RESULT *r1) { return (r0 == r1) || (*r0 == *r1); }
+
+RESULT *__catalyst__rt__result_get_one() { return getQuantumDevicePtr()->One(); }
+
+RESULT *__catalyst__rt__result_get_zero() { return getQuantumDevicePtr()->Zero(); }
+
+void __catalyst__qis__Gradient(int64_t numResults, /* results = */...)
+{
+    RT_ASSERT(numResults >= 0);
+    using ResultType = MemRefT<double, 1>;
+
+    std::vector<ResultType *> mem_ptrs;
+    mem_ptrs.reserve(numResults);
+    va_list args;
+    va_start(args, numResults);
+    for (int64_t i = 0; i < numResults; i++) {
+        mem_ptrs.push_back(va_arg(args, ResultType *));
+    }
+    va_end(args);
+
+    std::vector<DataView<double, 1>> mem_views;
+    mem_views.reserve(numResults);
+    for (auto *mr : mem_ptrs) {
+        mem_views.emplace_back(mr->data_aligned, mr->offset, mr->sizes, mr->strides);
+    }
+
+    // num_observables * num_train_params
+    getQuantumDevicePtr()->Gradient(mem_views, {});
+}
+
+void __catalyst__qis__Gradient_params(MemRefT_int64_1d *params, int64_t numResults,
+                                      /* results = */...)
+{
+    RT_ASSERT(numResults >= 0);
+    using ResultType = MemRefT<double, 1>;
+
+    if (params == nullptr || !params->sizes[0]) {
+        RT_FAIL("Invalid number of trainable parameters");
+    }
+
+    const size_t tp_size = params->sizes[0];
+
+    // create a vector of custom trainable parameters
+    std::vector<size_t> train_params;
+    auto *params_data = params->data_aligned;
+    train_params.reserve(tp_size);
+    for (size_t i = 0; i < tp_size; i++) {
+        auto p = params_data[i];
+        RT_FAIL_IF(p < 0, "trainable parameter cannot be a negative integer");
+        train_params.push_back(p);
+    }
+
+    std::vector<ResultType *> mem_ptrs;
+    mem_ptrs.reserve(numResults);
+    va_list args;
+    va_start(args, numResults);
+    for (int64_t i = 0; i < numResults; i++) {
+        mem_ptrs.push_back(va_arg(args, ResultType *));
+    }
+    va_end(args);
+
+    std::vector<DataView<double, 1>> mem_views;
+    mem_views.reserve(numResults);
+    for (auto *mr : mem_ptrs) {
+        mem_views.emplace_back(mr->data_aligned, mr->offset, mr->sizes, mr->strides);
+    }
+
+    // num_observables * num_train_params
+    getQuantumDevicePtr()->Gradient(mem_views, train_params);
+}
+
+void __catalyst__qis__GlobalPhase(double phi, const Modifiers *modifiers)
+{
+    getQuantumDevicePtr()->NamedOperation("GlobalPhase", {phi}, {}, MODIFIERS_ARGS(modifiers));
+}
+
+void __catalyst__qis__SetState(MemRefT_CplxT_double_1d *data, uint64_t numQubits, ...)
+{
+    RT_ASSERT(numQubits > 0);
+
+    va_list args;
+    va_start(args, numQubits);
+    std::vector<QubitIdType> wires(numQubits);
+    for (uint64_t i = 0; i < numQubits; i++) {
+        wires[i] = va_arg(args, QubitIdType);
+    }
+    va_end(args);
+
+    MemRefT<std::complex<double>, 1> *data_p = (MemRefT<std::complex<double>, 1> *)data;
+    DataView<std::complex<double>, 1> data_view(data_p->data_aligned, data_p->offset, data_p->sizes,
+                                                data_p->strides);
+    getQuantumDevicePtr()->SetState(data_view, wires);
+}
+
+// Forwards an int8 1-D memref and a list of wires to the device's SetBasisState.
+// The variadic arguments are `numQubits` wire ids; they must be pairwise distinct and the memref
+// must have exactly `numQubits` entries, otherwise the call fails.
+void __catalyst__qis__SetBasisState(MemRefT_int8_1d *data, uint64_t numQubits, ...)
+{
+    RT_ASSERT(numQubits > 0);
+
+    DataView<int8_t, 1> data_view(data->data_aligned, data->offset, data->sizes, data->strides);
+
+    // Collect the variadic wire ids in call order.
+    std::vector<QubitIdType> wires(numQubits);
+    va_list args;
+    va_start(args, numQubits);
+    for (auto &wire : wires) {
+        wire = va_arg(args, QubitIdType);
+    }
+    va_end(args);
+
+    // A set drops duplicates, so a size mismatch means some wire was repeated.
+    const std::unordered_set<QubitIdType> wire_set(wires.begin(), wires.end());
+    RT_FAIL_IF(wire_set.size() != numQubits, "Wires must be unique");
+    RT_FAIL_IF(data->sizes[0] != numQubits,
+               "BasisState parameter and wires must be of equal length.");
+
+    getQuantumDevicePtr()->SetBasisState(data_view, wires);
+}
+
+// Applies the "Identity" named operation to `qubit`, forwarding the arguments expanded from
+// `modifiers`.
+void __catalyst__qis__Identity(QUBIT *qubit, const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("Identity", {}, {wire}, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "PauliX" named operation to `qubit`, forwarding the arguments expanded from
+// `modifiers`.
+void __catalyst__qis__PauliX(QUBIT *qubit, const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("PauliX", {}, {wire}, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "PauliY" named operation to `qubit`, forwarding the arguments expanded from
+// `modifiers`.
+void __catalyst__qis__PauliY(QUBIT *qubit, const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("PauliY", {}, {wire}, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "PauliZ" named operation to `qubit`, forwarding the arguments expanded from
+// `modifiers`.
+void __catalyst__qis__PauliZ(QUBIT *qubit, const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("PauliZ", {}, {wire}, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "Hadamard" named operation to `qubit`, forwarding the arguments expanded from
+// `modifiers`.
+void __catalyst__qis__Hadamard(QUBIT *qubit, const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {wire}, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "S" named operation to `qubit`, forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__S(QUBIT *qubit, const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("S", {}, {wire}, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "T" named operation to `qubit`, forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__T(QUBIT *qubit, const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("T", {}, {wire}, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "PhaseShift" named operation with parameter `theta` to `qubit`, forwarding the
+// arguments expanded from `modifiers`.
+void __catalyst__qis__PhaseShift(double theta, QUBIT *qubit, const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("PhaseShift", {theta}, {wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "RX" named operation with parameter `theta` to `qubit`, forwarding the arguments
+// expanded from `modifiers`.
+void __catalyst__qis__RX(double theta, QUBIT *qubit, const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("RX", {theta}, {wire}, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "RY" named operation with parameter `theta` to `qubit`, forwarding the arguments
+// expanded from `modifiers`.
+void __catalyst__qis__RY(double theta, QUBIT *qubit, const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("RY", {theta}, {wire}, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "RZ" named operation with parameter `theta` to `qubit`, forwarding the arguments
+// expanded from `modifiers`.
+void __catalyst__qis__RZ(double theta, QUBIT *qubit, const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("RZ", {theta}, {wire}, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "Rot" named operation with parameters {phi, theta, omega} to `qubit`, forwarding
+// the arguments expanded from `modifiers`.
+void __catalyst__qis__Rot(double phi, double theta, double omega, QUBIT *qubit,
+                          const Modifiers *modifiers)
+{
+    const auto wire = reinterpret_cast<QubitIdType>(qubit);
+    getQuantumDevicePtr()->NamedOperation("Rot", {phi, theta, omega}, {wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "CNOT" named operation to wires {control, target}, forwarding the arguments
+// expanded from `modifiers`. Fails if both operands are the same qubit.
+void __catalyst__qis__CNOT(QUBIT *control, QUBIT *target, const Modifiers *modifiers)
+{
+    RT_FAIL_IF(control == target,
+               "Invalid input for CNOT gate. Control and target qubit operands must be distinct.");
+
+    const auto control_wire = reinterpret_cast<QubitIdType>(control);
+    const auto target_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("CNOT", {}, {control_wire, target_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "CY" named operation to wires {control, target}, forwarding the arguments expanded
+// from `modifiers`.
+void __catalyst__qis__CY(QUBIT *control, QUBIT *target, const Modifiers *modifiers)
+{
+    const auto control_wire = reinterpret_cast<QubitIdType>(control);
+    const auto target_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("CY", {}, {control_wire, target_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "CZ" named operation to wires {control, target}, forwarding the arguments expanded
+// from `modifiers`.
+void __catalyst__qis__CZ(QUBIT *control, QUBIT *target, const Modifiers *modifiers)
+{
+    const auto control_wire = reinterpret_cast<QubitIdType>(control);
+    const auto target_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("CZ", {}, {control_wire, target_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "SWAP" named operation to the two wires (named `control` and `target` in this
+// signature), forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__SWAP(QUBIT *control, QUBIT *target, const Modifiers *modifiers)
+{
+    const auto first_wire = reinterpret_cast<QubitIdType>(control);
+    const auto second_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("SWAP", {}, {first_wire, second_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "IsingXX" named operation with parameter `theta` to the two wires (named `control`
+// and `target` in this signature), forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__IsingXX(double theta, QUBIT *control, QUBIT *target,
+                              const Modifiers *modifiers)
+{
+    const auto first_wire = reinterpret_cast<QubitIdType>(control);
+    const auto second_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("IsingXX", {theta}, {first_wire, second_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "IsingYY" named operation with parameter `theta` to the two wires (named `control`
+// and `target` in this signature), forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__IsingYY(double theta, QUBIT *control, QUBIT *target,
+                              const Modifiers *modifiers)
+{
+    const auto first_wire = reinterpret_cast<QubitIdType>(control);
+    const auto second_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("IsingYY", {theta}, {first_wire, second_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "IsingXY" named operation with parameter `theta` to the two wires (named `control`
+// and `target` in this signature), forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__IsingXY(double theta, QUBIT *control, QUBIT *target,
+                              const Modifiers *modifiers)
+{
+    const auto first_wire = reinterpret_cast<QubitIdType>(control);
+    const auto second_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("IsingXY", {theta}, {first_wire, second_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "IsingZZ" named operation with parameter `theta` to the two wires (named `control`
+// and `target` in this signature), forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__IsingZZ(double theta, QUBIT *control, QUBIT *target,
+                              const Modifiers *modifiers)
+{
+    const auto first_wire = reinterpret_cast<QubitIdType>(control);
+    const auto second_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("IsingZZ", {theta}, {first_wire, second_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "ControlledPhaseShift" named operation with parameter `theta` to wires
+// {control, target}, forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__ControlledPhaseShift(double theta, QUBIT *control, QUBIT *target,
+                                           const Modifiers *modifiers)
+{
+    const auto control_wire = reinterpret_cast<QubitIdType>(control);
+    const auto target_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("ControlledPhaseShift", {theta},
+                                          {control_wire, target_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "CRX" named operation with parameter `theta` to wires {control, target},
+// forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__CRX(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers)
+{
+    const auto control_wire = reinterpret_cast<QubitIdType>(control);
+    const auto target_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("CRX", {theta}, {control_wire, target_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "CRY" named operation with parameter `theta` to wires {control, target},
+// forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__CRY(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers)
+{
+    const auto control_wire = reinterpret_cast<QubitIdType>(control);
+    const auto target_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("CRY", {theta}, {control_wire, target_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "CRZ" named operation with parameter `theta` to wires {control, target},
+// forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__CRZ(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers)
+{
+    const auto control_wire = reinterpret_cast<QubitIdType>(control);
+    const auto target_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("CRZ", {theta}, {control_wire, target_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "CRot" named operation with parameters {phi, theta, omega} to wires
+// {control, target}, forwarding the arguments expanded from `modifiers`.
+void __catalyst__qis__CRot(double phi, double theta, double omega, QUBIT *control, QUBIT *target,
+                           const Modifiers *modifiers)
+{
+    const auto control_wire = reinterpret_cast<QubitIdType>(control);
+    const auto target_wire = reinterpret_cast<QubitIdType>(target);
+    getQuantumDevicePtr()->NamedOperation("CRot", {phi, theta, omega},
+                                          {control_wire, target_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "CSWAP" named operation to wires {control, aswap, bswap}, forwarding the arguments
+// expanded from `modifiers`.
+void __catalyst__qis__CSWAP(QUBIT *control, QUBIT *aswap, QUBIT *bswap, const Modifiers *modifiers)
+{
+    const auto control_wire = reinterpret_cast<QubitIdType>(control);
+    const auto a_wire = reinterpret_cast<QubitIdType>(aswap);
+    const auto b_wire = reinterpret_cast<QubitIdType>(bswap);
+    getQuantumDevicePtr()->NamedOperation("CSWAP", {}, {control_wire, a_wire, b_wire},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "Toffoli" named operation to wires {wire0, wire1, wire2}, forwarding the arguments
+// expanded from `modifiers`.
+void __catalyst__qis__Toffoli(QUBIT *wire0, QUBIT *wire1, QUBIT *wire2, const Modifiers *modifiers)
+{
+    const auto id0 = reinterpret_cast<QubitIdType>(wire0);
+    const auto id1 = reinterpret_cast<QubitIdType>(wire1);
+    const auto id2 = reinterpret_cast<QubitIdType>(wire2);
+    getQuantumDevicePtr()->NamedOperation("Toffoli", {}, {id0, id1, id2},
+                                          MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "MultiRZ" named operation with parameter `theta` to a variable number of wires.
+// The variadic arguments are `numQubits` wire ids (QubitIdType), read in call order.
+void __catalyst__qis__MultiRZ(double theta, const Modifiers *modifiers, int64_t numQubits, ...)
+{
+    RT_ASSERT(numQubits >= 0);
+
+    // Collect the variadic wire ids into a pre-sized vector.
+    std::vector<QubitIdType> wires(numQubits);
+    va_list args;
+    va_start(args, numQubits);
+    for (auto &wire : wires) {
+        wire = va_arg(args, QubitIdType);
+    }
+    va_end(args);
+
+    getQuantumDevicePtr()->NamedOperation("MultiRZ", {theta}, wires, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "ISWAP" named operation to wires {wire0, wire1}, forwarding the arguments expanded
+// from `modifiers`.
+void __catalyst__qis__ISWAP(QUBIT *wire0, QUBIT *wire1, const Modifiers *modifiers)
+{
+    const auto id0 = reinterpret_cast<QubitIdType>(wire0);
+    const auto id1 = reinterpret_cast<QubitIdType>(wire1);
+    getQuantumDevicePtr()->NamedOperation("ISWAP", {}, {id0, id1}, MODIFIERS_ARGS(modifiers));
+}
+
+// Applies the "PSWAP" named operation with parameter `phi` to wires {wire0, wire1}, forwarding
+// the arguments expanded from `modifiers`.
+void __catalyst__qis__PSWAP(double phi, QUBIT *wire0, QUBIT *wire1, const Modifiers *modifiers)
+{
+    const auto id0 = reinterpret_cast<QubitIdType>(wire0);
+    const auto id1 = reinterpret_cast<QubitIdType>(wire1);
+    getQuantumDevicePtr()->NamedOperation("PSWAP", {phi}, {id0, id1}, MODIFIERS_ARGS(modifiers));
+}
+
+// Validates a QubitUnitary matrix and unpacks the operation's operands.
+//
+//   matrix    - 2-D complex memref; both dimensions must equal 2^numQubits.
+//   numQubits - number of wire ids to read from `args`.
+//   coeffs    - output; receives sizes[0] * sizes[1] coefficients read linearly from
+//               `matrix->data_aligned`.
+//   wires     - output; receives the `numQubits` wire ids in call order.
+//   args      - variadic list already started by the caller (who also ends it).
+//
+// Fails via RT_FAIL when the matrix dimensions do not match 2^numQubits.
+static void _qubitUnitary_impl(MemRefT_CplxT_double_2d *matrix, int64_t numQubits,
+                               std::vector<std::complex<double>> &coeffs,
+                               std::vector<QubitIdType> &wires, va_list *args)
+{
+    const size_t num_rows = matrix->sizes[0];
+    const size_t num_col = matrix->sizes[1];
+
+    // Compute 2^numQubits with an integer shift rather than std::pow: the floating-point result
+    // cannot be converted to size_t once it leaves the type's range. Wire counts whose power of
+    // two is not representable can never match a real matrix, so they are rejected outright.
+    const bool representable =
+        numQubits >= 0 && numQubits < static_cast<int64_t>(sizeof(size_t) * 8);
+    const size_t expected_size = representable ? (size_t{1} << numQubits) : 0;
+
+    if (!representable || num_rows != expected_size || num_col != expected_size) {
+        RT_FAIL("Invalid given QubitUnitary matrix; "
+                "The size of the matrix must be pow(2, numWires) * pow(2, numWires).");
+    }
+
+    wires.reserve(numQubits);
+    for (int64_t i = 0; i < numQubits; i++) {
+        wires.push_back(va_arg(*args, QubitIdType));
+    }
+
+    // Copy every (real, imag) pair into std::complex values.
+    const size_t matrix_size = num_rows * num_col;
+    coeffs.reserve(matrix_size);
+    for (size_t i = 0; i < matrix_size; i++) {
+        coeffs.emplace_back(matrix->data_aligned[i].real, matrix->data_aligned[i].imag);
+    }
+}
+
+// Applies an arbitrary matrix operation to a variable number of wires.
+// The variadic arguments are `numQubits` wire ids; `matrix` must be non-null and `numQubits` must
+// not exceed the value returned by __catalyst__rt__num_qubits().
+void __catalyst__qis__QubitUnitary(MemRefT_CplxT_double_2d *matrix, const Modifiers *modifiers,
+                                   int64_t numQubits, /*qubits*/...)
+{
+    RT_ASSERT(numQubits >= 0);
+
+    if (matrix == nullptr) {
+        RT_FAIL("The QubitUnitary matrix must be initialized");
+    }
+
+    if (numQubits > __catalyst__rt__num_qubits()) {
+        RT_FAIL("Invalid number of wires");
+    }
+
+    // The helper validates the matrix shape and fills both output vectors.
+    std::vector<QubitIdType> wires;
+    std::vector<std::complex<double>> coeffs;
+    va_list args;
+    va_start(args, numQubits);
+    _qubitUnitary_impl(matrix, numQubits, coeffs, wires, &args);
+    va_end(args);
+
+    getQuantumDevicePtr()->MatrixOperation(coeffs, wires, MODIFIERS_ARGS(modifiers));
+}
+
+// Creates a parameter-less observable of kind `obsId` (cast to ObsId) on a single wire and
+// returns the key produced by the device.
+ObsIdType __catalyst__qis__NamedObs(int64_t obsId, QUBIT *wire)
+{
+    const auto kind = static_cast<ObsId>(obsId);
+    const auto wire_id = reinterpret_cast<QubitIdType>(wire);
+    return getQuantumDevicePtr()->Observable(kind, {}, {wire_id});
+}
+
+// Creates a Hermitian observable from a 2-D complex matrix acting on a variable number of wires
+// and returns the key produced by the device.
+//
+//   matrix    - non-null 2-D complex memref; both dimensions must equal 2^numQubits.
+//   numQubits - number of variadic wire ids that follow; must not exceed the value returned by
+//               __catalyst__rt__num_qubits().
+//   ...       - `numQubits` wire ids (QubitIdType), read in call order.
+ObsIdType __catalyst__qis__HermitianObs(MemRefT_CplxT_double_2d *matrix, int64_t numQubits, ...)
+{
+    RT_ASSERT(numQubits >= 0);
+
+    if (matrix == nullptr) {
+        RT_FAIL("The Hermitian matrix must be initialized");
+    }
+
+    const size_t num_rows = matrix->sizes[0];
+    const size_t num_col = matrix->sizes[1];
+
+    // Compute 2^numQubits with an integer shift rather than std::pow: the floating-point result
+    // cannot be converted to size_t once it leaves the type's range. Wire counts whose power of
+    // two is not representable can never match a real matrix, so they are rejected outright.
+    const bool representable =
+        numQubits >= 0 && numQubits < static_cast<int64_t>(sizeof(size_t) * 8);
+    const size_t expected_size = representable ? (size_t{1} << numQubits) : 0;
+
+    if (!representable || num_rows != expected_size || num_col != expected_size) {
+        RT_FAIL("Invalid given Hermitian matrix; "
+                "The size of the matrix must be pow(2, numWires) * pow(2, numWires).");
+    }
+
+    va_list args;
+    va_start(args, numQubits);
+    std::vector<QubitIdType> wires(numQubits);
+    for (int64_t i = 0; i < numQubits; i++) {
+        wires[i] = va_arg(args, QubitIdType);
+    }
+    va_end(args);
+
+    if (numQubits > __catalyst__rt__num_qubits()) {
+        RT_FAIL("Invalid number of wires");
+    }
+
+    // Copy every (real, imag) pair into std::complex values.
+    const size_t matrix_size = num_rows * num_col;
+    std::vector<std::complex<double>> coeffs;
+    coeffs.reserve(matrix_size);
+    for (size_t i = 0; i < matrix_size; i++) {
+        coeffs.emplace_back(matrix->data_aligned[i].real, matrix->data_aligned[i].imag);
+    }
+
+    return getQuantumDevicePtr()->Observable(ObsId::Hermitian, coeffs, wires);
+}
+
+// Combines `numObs` existing observables into a tensor-product observable and returns the key
+// produced by the device. The variadic arguments are the `numObs` observable keys (ObsIdType).
+ObsIdType __catalyst__qis__TensorObs(int64_t numObs, /*obsKeys*/...)
+{
+    if (numObs < 1) {
+        RT_FAIL("Invalid number of observables to create TensorProdObs");
+    }
+
+    // Read the variadic observable keys in call order.
+    std::vector<ObsIdType> obsKeys(numObs);
+    va_list args;
+    va_start(args, numObs);
+    for (auto &key : obsKeys) {
+        key = va_arg(args, ObsIdType);
+    }
+    va_end(args);
+
+    return getQuantumDevicePtr()->TensorObservable(obsKeys);
+}
+
+// Builds a Hamiltonian observable from `numObs` existing observables and a matching list of
+// coefficients, returning the key produced by the device.
+// `coeffs` must be non-null and hold exactly `numObs` entries; the variadic arguments are the
+// `numObs` observable keys (ObsIdType).
+ObsIdType __catalyst__qis__HamiltonianObs(MemRefT_double_1d *coeffs, int64_t numObs,
+                                          /*obsKeys*/...)
+{
+    RT_ASSERT(numObs >= 0);
+
+    if (coeffs == nullptr) {
+        RT_FAIL("Invalid coefficients for computing Hamiltonian; "
+                "The coefficients list must be initialized.");
+    }
+
+    const size_t coeffs_size = coeffs->sizes[0];
+    if (coeffs_size != static_cast<size_t>(numObs)) {
+        RT_FAIL("Invalid coefficients for computing Hamiltonian; "
+                "The number of coefficients and observables must be equal.");
+    }
+
+    // Read the variadic observable keys in call order.
+    std::vector<ObsIdType> obsKeys(numObs);
+    va_list args;
+    va_start(args, numObs);
+    for (auto &key : obsKeys) {
+        key = va_arg(args, ObsIdType);
+    }
+    va_end(args);
+
+    // Copy the coefficients linearly out of the memref's aligned buffer.
+    const double *first = coeffs->data_aligned;
+    std::vector<double> coeffs_vec(first, first + coeffs_size);
+    return getQuantumDevicePtr()->HamiltonianObservable(coeffs_vec, obsKeys);
+}
+
+// Measures `wire` on the device and returns the device's result.
+// `postselect` requests postselection on outcome 0 or 1; any other value means "no
+// postselection" and is passed to the device as std::nullopt.
+RESULT *__catalyst__qis__Measure(QUBIT *wire, int32_t postselect)
+{
+    const bool has_postselect = (postselect == 0 || postselect == 1);
+
+    std::optional<int32_t> postselectOpt = std::nullopt;
+    if (has_postselect) {
+        postselectOpt = postselect;
+    }
+
+    return getQuantumDevicePtr()->Measure(reinterpret_cast<QubitIdType>(wire), postselectOpt);
+}
+
+// Returns the device's Expval() for the observable identified by `obsKey`.
+double __catalyst__qis__Expval(ObsIdType obsKey)
+{
+    auto device = getQuantumDevicePtr();
+    return device->Expval(obsKey);
+}
+
+// Returns the device's Var() for the observable identified by `obsKey`.
+double __catalyst__qis__Variance(ObsIdType obsKey)
+{
+    auto device = getQuantumDevicePtr();
+    return device->Var(obsKey);
+}
+
+// Asks the device to write its state into the complex 1-D memref `result`.
+// The variadic arguments are `numQubits` wire ids; only the zero-wire form is supported, any
+// non-empty wire list fails.
+void __catalyst__qis__State(MemRefT_CplxT_double_1d *result, int64_t numQubits, ...)
+{
+    RT_ASSERT(numQubits >= 0);
+    auto *result_p = (MemRefT<std::complex<double>, 1> *)result;
+
+    // Collect the variadic wire ids in call order.
+    std::vector<QubitIdType> wires(numQubits);
+    va_list args;
+    va_start(args, numQubits);
+    for (auto &wire : wires) {
+        wire = va_arg(args, QubitIdType);
+    }
+    va_end(args);
+
+    DataView<std::complex<double>, 1> view(result_p->data_aligned, result_p->offset,
+                                           result_p->sizes, result_p->strides);
+
+    if (!wires.empty()) {
+        // A device-side partial-state query is not wired up yet.
+        RT_FAIL("Partial State-Vector not supported yet");
+    }
+    else {
+        getQuantumDevicePtr()->State(view);
+    }
+}
+
+// Asks the device to write probabilities into the 1-D memref `result`.
+// The variadic arguments are `numQubits` wire ids: with none, Probs() is called; otherwise
+// PartialProbs() is called with those wires.
+void __catalyst__qis__Probs(MemRefT_double_1d *result, int64_t numQubits, ...)
+{
+    RT_ASSERT(numQubits >= 0);
+    auto *result_p = (MemRefT<double, 1> *)result;
+
+    // Collect the variadic wire ids in call order.
+    std::vector<QubitIdType> wires(numQubits);
+    va_list args;
+    va_start(args, numQubits);
+    for (auto &wire : wires) {
+        wire = va_arg(args, QubitIdType);
+    }
+    va_end(args);
+
+    DataView<double, 1> view(result_p->data_aligned, result_p->offset, result_p->sizes,
+                             result_p->strides);
+
+    if (!wires.empty()) {
+        getQuantumDevicePtr()->PartialProbs(view, wires);
+    }
+    else {
+        getQuantumDevicePtr()->Probs(view);
+    }
+}
+
+// Asks the device to write samples into the 2-D memref `result`, using the shot count reported
+// by the device. The variadic arguments are `numQubits` wire ids: with none, Sample() is called;
+// otherwise PartialSample() is called with those wires.
+void __catalyst__qis__Sample(MemRefT_double_2d *result, int64_t numQubits, ...)
+{
+    int64_t shots = getQuantumDevicePtr()->GetDeviceShots();
+    RT_ASSERT(shots >= 0);
+    RT_ASSERT(numQubits >= 0);
+    auto *result_p = (MemRefT<double, 2> *)result;
+
+    // Collect the variadic wire ids in call order.
+    std::vector<QubitIdType> wires(numQubits);
+    va_list args;
+    va_start(args, numQubits);
+    for (auto &wire : wires) {
+        wire = va_arg(args, QubitIdType);
+    }
+    va_end(args);
+
+    DataView<double, 2> view(result_p->data_aligned, result_p->offset, result_p->sizes,
+                             result_p->strides);
+
+    if (!wires.empty()) {
+        getQuantumDevicePtr()->PartialSample(view, wires, shots);
+    }
+    else {
+        getQuantumDevicePtr()->Sample(view, shots);
+    }
+}
+
+// Asks the device to fill the (eigenvalues, counts) memref pair in `result`, using the shot count
+// reported by the device. The variadic arguments are `numQubits` wire ids: with none, Counts() is
+// called; otherwise PartialCounts() is called with those wires.
+void __catalyst__qis__Counts(PairT_MemRefT_double_int64_1d *result, int64_t numQubits, ...)
+{
+    int64_t shots = getQuantumDevicePtr()->GetDeviceShots();
+    RT_ASSERT(shots >= 0);
+    RT_ASSERT(numQubits >= 0);
+    auto *eigvals_p = (MemRefT<double, 1> *)&result->first;
+    auto *counts_p = (MemRefT<int64_t, 1> *)&result->second;
+
+    // Collect the variadic wire ids in call order.
+    std::vector<QubitIdType> wires(numQubits);
+    va_list args;
+    va_start(args, numQubits);
+    for (auto &wire : wires) {
+        wire = va_arg(args, QubitIdType);
+    }
+    va_end(args);
+
+    DataView<double, 1> eigvals_view(eigvals_p->data_aligned, eigvals_p->offset, eigvals_p->sizes,
+                                     eigvals_p->strides);
+    DataView<int64_t, 1> counts_view(counts_p->data_aligned, counts_p->offset, counts_p->sizes,
+                                     counts_p->strides);
+
+    if (!wires.empty()) {
+        getQuantumDevicePtr()->PartialCounts(eigvals_view, counts_view, wires, shots);
+    }
+    else {
+        getQuantumDevicePtr()->Counts(eigvals_view, counts_view, shots);
+    }
+}
+
+// Returns the number of qubit ids in the register `ptr`, which is reinterpreted as a
+// std::vector<QubitIdType>.
+int64_t __catalyst__rt__array_get_size_1d(QirArray *ptr)
+{
+    const auto *qubits = reinterpret_cast<std::vector<QubitIdType> *>(ptr);
+    return qubits->size();
+}
+
+// Returns a pointer (typed as int8_t *) to the `idx`-th qubit id stored in the register `ptr`,
+// which is reinterpreted as a std::vector<QubitIdType>.
+// Fails when `idx` is negative or not smaller than the register's size.
+int8_t *__catalyst__rt__array_get_element_ptr_1d(QirArray *ptr, int64_t idx)
+{
+    std::vector<QubitIdType> *qubit_vector_ptr = reinterpret_cast<std::vector<QubitIdType> *>(ptr);
+
+    RT_ASSERT(idx >= 0);
+    if (static_cast<size_t>(idx) >= qubit_vector_ptr->size()) {
+        // Build the message only on the failure path so that successful lookups do not pay for a
+        // string allocation and an integer-to-string conversion on every call.
+        std::string error_msg = "The qubit register does not contain the requested wire: ";
+        error_msg += std::to_string(idx);
+        RT_FAIL(error_msg.c_str());
+    }
+
+    QubitIdType *data = qubit_vector_ptr->data();
+    return (int8_t *)&data[idx];
+}
+}
diff --git a/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt
new file mode 100644
index 0000000..2c19e4a
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt
@@ -0,0 +1,33 @@
+# nanobind suggests including these lines to configure CMake to perform an optimized release build
+# by default unless another build type is specified. Without this addition, binding code may run
+# slowly and produce large binaries.
+# See https://nanobind.readthedocs.io/en/latest/building.html#preliminaries
+if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+# Locate nanobind by asking the Python interpreter for the package's CMake directory.
+# The answer is only adopted when the query succeeds, so a failed query (unknown interpreter,
+# nanobind not importable) no longer overwrites a user-provided nanobind_DIR with an empty value;
+# find_package() then falls back to its regular search.
+execute_process(
+    COMMAND "${Python_EXECUTABLE}" -c "import nanobind; print(nanobind.cmake_dir())"
+    RESULT_VARIABLE nanobind_query_result
+    OUTPUT_VARIABLE nanobind_query_dir OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+if (nanobind_query_result EQUAL 0 AND NOT "${nanobind_query_dir}" STREQUAL "")
+    set(nanobind_DIR "${nanobind_query_dir}")
+else()
+    message(WARNING
+        "Could not query nanobind's CMake directory using Python_EXECUTABLE "
+        "('${Python_EXECUTABLE}'); relying on the default find_package() search.")
+endif()
+find_package(nanobind CONFIG REQUIRED)
+
+# Source file list for the `catalyst_callback_registry` module
+set(REGISTRY_SRC_FILES
+    Registry.cpp
+)
+
+# Create the Python `catalyst_callback_registry` module
+# Target the stable ABI for Python 3.12+, which reduces the number of binary wheels that must be
+# built (`STABLE_ABI` does nothing on older Python versions).
+nanobind_add_module(catalyst_callback_registry STABLE_ABI ${REGISTRY_SRC_FILES})
+
+# Use a consistent suffix ".so" rather than, e.g. ".abi3.so" (when using the Stable ABI) or
+# ".cpython-3xx-darwin.so". Doing so simplifies the process to locate it when calling
+# `dlopen(LIBREGISTRY)` in runtime/lib/capi/RuntimeCAPI.cpp.
+set_target_properties(catalyst_callback_registry PROPERTIES SUFFIX ".so")
+
+target_include_directories(catalyst_callback_registry PUBLIC ${runtime_includes})
+
+# Expose the module's file name to the C API object library (a target defined outside this file)
+# as the LIBREGISTRY macro. No `-D` prefix is needed: target_compile_definitions() adds it.
+target_compile_definitions(catalyst_qir_qis_obj PUBLIC LIBREGISTRY=\"$<TARGET_FILE_NAME:catalyst_callback_registry>\")
diff --git a/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp b/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp
new file mode 100644
index 0000000..fd4715d
--- /dev/null
+++ b/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp
@@ -0,0 +1,179 @@
+// Copyright 2024 Xanadu Quantum Technologies Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <cstdio>
+#include <dlfcn.h>
+#include <string>
+#include <unordered_map>
+
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/string.h>
+
+namespace nb = nanobind;
+
+// From PyBind11's documentation:
+//
+//     Do you have any global variables that are pybind11 objects or invoke pybind11 functions in
+//     either their constructor or destructor? You are generally not allowed to invoke any Python
+//     function in a global static context. We recommend using lazy initialization and then
+//     intentionally leaking at the end of the program.
+//
+// https://pybind11.readthedocs.io/en/stable/advanced/misc.html#common-sources-of-global-interpreter-lock-errors
+// Registry of Python callables keyed by the identifier handed out by registerImpl(). It is
+// allocated in the module initialiser and never deleted in this file, following the
+// lazy-initialise-and-leak advice quoted above.
+std::unordered_map<int64_t, nb::callable> *references;
+
+// Directory prefix used by convertResult() to locate the `libmlir_c_runner_utils` library; set
+// from Python through `set_mlir_lib_path`.
+std::string libmlirpath;
+
+// A rank plus an opaque descriptor pointer, passed as the source and destination of `memrefCopy`.
+// NOTE(review): presumably mirrors MLIR's unranked memref descriptor layout - confirm.
+struct UnrankedMemrefType {
+    int64_t rank;     // number of dimensions of the memref behind `descriptor`
+    void *descriptor; // opaque pointer; convertResult() stores the destination address here
+};
+
+// Owns a dlopen() handle for the lifetime of the object and exposes the library's `memrefCopy`
+// symbol through operator().
+class LibraryManager {
+    void *_handle;
+
+    // Returns the pending dlerror() text, or `fallback` when dlerror() has nothing to report.
+    static const char *lastError(const char *fallback)
+    {
+        const char *message = dlerror();
+        return message ? message : fallback;
+    }
+
+  public:
+    // Opens the shared library at `path` with RTLD_LAZY.
+    // Throws nb::value_error carrying the dlerror() text when the library cannot be opened.
+    LibraryManager(std::string path)
+    {
+        this->_handle = dlopen(path.c_str(), RTLD_LAZY);
+        if (!this->_handle) {
+            throw nb::value_error(lastError("dlopen failed"));
+        }
+    }
+
+    // The handle is owned exclusively: a copy would make two destructors dlclose() the same
+    // handle, so copying is disabled.
+    LibraryManager(const LibraryManager &) = delete;
+    LibraryManager &operator=(const LibraryManager &) = delete;
+
+    // Closes the library handle, if one was opened.
+    ~LibraryManager()
+    {
+        if (this->_handle) {
+            dlclose(this->_handle);
+        }
+    }
+
+    // Looks up `memrefCopy` in the opened library and calls it with (elementSize, src, dst).
+    // Throws nb::value_error when the symbol cannot be resolved.
+    void operator()(long elementSize, UnrankedMemrefType *src, UnrankedMemrefType *dst)
+    {
+        using memrefCopy_t = void (*)(int64_t, void *, void *);
+
+        // Clear any stale error state so that a failure below reports this lookup only.
+        dlerror();
+        void *f_ptr = dlsym(this->_handle, "memrefCopy");
+        if (!f_ptr) {
+            throw nb::value_error(lastError("symbol 'memrefCopy' could not be resolved"));
+        }
+
+        auto memrefCopy = reinterpret_cast<memrefCopy_t>(f_ptr);
+        memrefCopy(elementSize, src, dst);
+    }
+};
+
+// Returns the shared-library file extension for the platform being compiled for: ".dylib" on
+// Apple platforms and ".so" on Linux. Any other platform is rejected at compile time.
+inline const char *ext()
+{
+#if defined(__APPLE__)
+    return ".dylib";
+#elif defined(__linux__)
+    return ".so";
+#else
+#error "Only apple and linux are currently supported";
+#endif
+}
+
+// Appends the platform's shared-library extension (see ext()) to `name`.
+std::string library_name(std::string name)
+{
+    name += ext();
+    return name;
+}
+
+// Copies one callback result into the buffer pre-allocated for it.
+// `tuple` is indexed as ((unranked_memref, element_size), destination): `unranked_memref.value`
+// and `destination` are integers holding addresses. The copy itself is done by `memrefCopy` from
+// the `libmlir_c_runner_utils` library found under `libmlirpath`.
+void convertResult(nb::handle tuple)
+{
+    // Index a Python object through its __getitem__, exactly as Python's `seq[index]` would.
+    const auto item = [](nb::handle seq, int index) -> nb::object {
+        return seq.attr("__getitem__")(index);
+    };
+
+    nb::object source_pair = item(tuple, 0);
+    nb::object source_memref = item(source_pair, 0);
+    nb::object source_element_size = item(source_pair, 1);
+    nb::object source_address = source_memref.attr("value");
+
+    auto *src = reinterpret_cast<UnrankedMemrefType *>(nb::cast<long>(source_address));
+    const long element_size = nb::cast<long>(source_element_size);
+
+    nb::object destination = item(tuple, 1);
+    void *destination_ptr = reinterpret_cast<void *>(nb::cast<long>(destination));
+
+    // The destination shares the source's rank; only the descriptor differs.
+    UnrankedMemrefType destMemref = {src->rank, destination_ptr};
+
+    LibraryManager memrefCopy(libmlirpath + library_name("/libmlir_c_runner_utils"));
+    memrefCopy(element_size, src, &destMemref);
+}
+
+// Pairs each entry of `results` with the entry of `allocated` at the same position (using
+// Python's builtin zip) and copies every pair with convertResult().
+void convertResults(nb::list results, nb::list allocated)
+{
+    auto zip = nb::module_::import_("builtins").attr("zip");
+    auto pairs = zip(results, allocated);
+    for (nb::handle pair : pairs) {
+        convertResult(pair);
+    }
+}
+
+extern "C" {
+// Runs a registered Python callback.
+//
+//   identifier - key previously returned by the module's `register` function.
+//   count      - number of int64_t argument values to read from `args`.
+//   retc       - number of int64_t result-destination values that follow them in `args`.
+//
+// The callable is invoked with the list of argument values; its results are then copied into the
+// destinations with convertResults().
+// Throws std::invalid_argument when the registry has not been created yet or `identifier` is not
+// registered.
+[[gnu::visibility("default")]] void callbackCall(int64_t identifier, int64_t count, int64_t retc,
+                                                 va_list args)
+{
+    nb::gil_scoped_acquire lock;
+
+    // The registry is only allocated when the Python module is initialised; guard against being
+    // called before that instead of dereferencing a null pointer.
+    if (references == nullptr) {
+        throw std::invalid_argument("Callback called before the callback registry was initialized");
+    }
+    auto it = references->find(identifier);
+    if (it == references->end()) {
+        throw std::invalid_argument("Callback called with invalid identifier");
+    }
+    auto lambda = it->second;
+
+    // Loop counters use the same 64-bit type as the counts they are compared against.
+    nb::list flat_args;
+    for (int64_t i = 0; i < count; i++) {
+        int64_t ptr = va_arg(args, int64_t);
+        flat_args.append(ptr);
+    }
+
+    nb::list flat_results = nb::list(lambda(flat_args));
+
+    // We have a flat list of return values.
+    // These returns **may** be array views to
+    // the very same memrefs that we passed as inputs.
+    // As a first prototype, let's copy these values.
+    // I think it is best to always copy them because
+    // of aliasing. Let's just copy them to guarantee
+    // no aliasing issues. We can revisit this as an optimization
+    // and allowing these to alias.
+    nb::list flat_returns_allocated_compiler;
+    for (int64_t i = 0; i < retc; i++) {
+        int64_t ptr = va_arg(args, int64_t);
+        flat_returns_allocated_compiler.append(ptr);
+    }
+    convertResults(flat_results, flat_returns_allocated_compiler);
+}
+}
+
+// Records `path` as the directory prefix under which convertResult() looks for MLIR's runner
+// utilities library.
+void setMLIRLibPath(std::string path)
+{
+    libmlirpath = path;
+}
+
+// Stores `f` in the callback registry and returns its identifier.
+auto registerImpl(nb::callable f)
+{
+    // The address of the underlying Python object doubles as the identifier. Python only reuses
+    // an address after the object has been garbage collected, and the copy of `f` stored in the
+    // map holds a reference that keeps the object alive.
+    const auto key = reinterpret_cast<int64_t>(f.ptr());
+
+    // An already-present key is left untouched rather than overridden; that is fine because an
+    // equal key means the very same Python object.
+    references->emplace(key, f);
+    return key;
+}
+
+// Python module definition: creates the callback registry on first initialisation and exposes
+// `register` and `set_mlir_lib_path`.
+NB_MODULE(catalyst_callback_registry, m)
+{
+    // Allocate the registry once; it is deliberately never freed.
+    if (!references) {
+        references = new std::unordered_map<int64_t, nb::callable>();
+    }
+
+    m.doc() = "Callbacks";
+    m.def("register", &registerImpl, "Call a python function registered in a map.");
+    m.def("set_mlir_lib_path", &setMLIRLibPath, "Set location of mlir's libraries.");
+}

From 40c8fb73579e76765cc5f0baac22ab5bbd918e5f Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Tue, 14 Jan 2025 22:52:33 +0000
Subject: [PATCH 33/64] add simple demo

---
 src/qirlightning/simple_demo/README.md        | 61 +++++++++++++++++++
 .../simple_demo/test_rt_device.cpp            | 40 ++++++++++++
 2 files changed, 101 insertions(+)
 create mode 100644 src/qirlightning/simple_demo/README.md
 create mode 100644 src/qirlightning/simple_demo/test_rt_device.cpp

diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md
new file mode 100644
index 0000000..7a25fe4
--- /dev/null
+++ b/src/qirlightning/simple_demo/README.md
@@ -0,0 +1,61 @@
+# Simple Demo for Catalyst/Lightning runtime
+
+This is a super simple demo for using Catalyst runtime to drive Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present). 
+
+The new files required are in `../catalyst_runtime`, which contains a subset of files from the [Catalyst Runtime](https://github.com/PennyLaneAI/catalyst/tree/main/runtime).
+
+## Installing a lightning simulator
+
+When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulators installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. 
+
+Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
+
+Example:
+```
+$ pip install pennylane-lightning-kokkos
+
+$ pip show pennylane-lightning-kokkos
+Name: PennyLane_Lightning_Kokkos
+Version: 0.39.0
+Summary: PennyLane-Lightning plugin
+Home-page: https://github.com/PennyLaneAI/pennylane-lightning
+Author: 
+Author-email: 
+License: Apache License 2.0
+Location: /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages
+Requires: pennylane, pennylane-lightning
+
+$ ls /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning
+... liblightning_kokkos_catalyst.so ...
+```
+
+You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` to get the lightning.gpu simulator, or for `pennylane-lightning` to get the lightning.qubit simulator.
+
+## Compilation
+
+To compile:
+
+```
+$ clang++ --std=c++20 test_rt_device.cpp -I/home/joseph/work/qiree/catalyst/runtime/include -I/home/joseph/work/qiree/catalyst/runtime/lib/capi -I/home/joseph/work/qiree/catalyst/runtime/lib/backend/common -o test_rt_device.out
+```
+
+To run:
+
+```
+$ ./test_rt_device.out 
+Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set
+  In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads
+  For best performance with OpenMP 3.1 set OMP_PROC_BIND=true
+  For unit testing set OMP_PROC_BIND=false
+
+Num Qubits = 3
+State = 
+*** State-Vector of Size 8 ***
+[(0.707107,0), (0,0), (0,0), (0,0), (0.707107,0), (0,0), (0,0), (0,0)]
+Measure on wire 0 = 0
+```
+
+To run on other devices, e.g. lightning.gpu, you need to change:
+- `pip install custatevec-cu12 pennylane-lightning-gpu` (custatevec is a dependency)
+- replace `RTDLIB` and `RTDNAME` from `kokkos` to `GPU`
+- include `cuquantum` libraries when running, e.g. `LD_LIBRARY_PATH=/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/cuquantum/lib/:$LD_LIBRARY_PATH ./test_rt_device.out`
diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp
new file mode 100644
index 0000000..f70410a
--- /dev/null
+++ b/src/qirlightning/simple_demo/test_rt_device.cpp
@@ -0,0 +1,40 @@
+#include "ExecutionContext.hpp"
+
+// Runtime libraries (kokkos/GPU/qubit etc.)
+#define RTDLIB "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning/liblightning_kokkos_catalyst.so" // change to liblightning_gpu_catalyst.so
+#define RTDNAME "LightningKokkosSimulator" // change to LightningGPUSimulator
+
+using namespace Catalyst::Runtime;
+
+static inline std::shared_ptr<RTDevice> loadRTDevice(const std::string &rtd_lib,
+                                                   const std::string &rtd_name = {},
+                                                   const std::string &rtd_kwargs = {})
+{
+    ExecutionContext context;
+    return context.getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs);
+}
+
+int main() {
+    auto RTDevice = loadRTDevice(RTDLIB, RTDNAME, "");
+
+    // Allocate Qubits
+    RTDevice->getQuantumDevicePtr()->AllocateQubits(3);
+
+    // Get Num Qubits
+    std::cout << "Num Qubits = " << RTDevice->getQuantumDevicePtr()->GetNumQubits() << std::endl;
+
+    // Apply Gate
+    RTDevice->getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {0});
+
+    // Print State
+    std::cout << "State = " << std::endl;
+    RTDevice->getQuantumDevicePtr()->PrintState();
+
+    // Measure
+    QubitIdType wire{0};
+    Result result = RTDevice->getQuantumDevicePtr()->Measure(wire, std::nullopt);
+    std::cout << "Measure on wire 0 = " << *result << std::endl;
+
+
+    return 0;
+}

From a76563c4344f28d32e959444f74c7b070fce43ed Mon Sep 17 00:00:00 2001
From: Joseph Lee <40768758+josephleekl@users.noreply.github.com>
Date: Wed, 15 Jan 2025 09:58:29 -0500
Subject: [PATCH 34/64] Update README.md

---
 src/qirlightning/simple_demo/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md
index 7a25fe4..f514e92 100644
--- a/src/qirlightning/simple_demo/README.md
+++ b/src/qirlightning/simple_demo/README.md
@@ -36,7 +36,7 @@ You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for ligh
 To compile:
 
 ```
-$ clang++ --std=c++20 test_rt_device.cpp -I/home/joseph/work/qiree/catalyst/runtime/include -I/home/joseph/work/qiree/catalyst/runtime/lib/capi -I/home/joseph/work/qiree/catalyst/runtime/lib/backend/common -o test_rt_device.out
+$ clang++ --std=c++20 test_rt_device.cpp -I../catalyst_runtime/lib/capi -I../catalyst_runtime/include -o test_rt_device.out
 ```
 
 To run:

From 611fa66e1ca434fa6728ae6f9c4693454bf665a2 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 15 Jan 2025 19:07:51 +0000
Subject: [PATCH 35/64] remove catalyst runtime deps and update demo

---
 src/qirlightning/catalyst_runtime/.clang-tidy |  232 ----
 src/qirlightning/catalyst_runtime/.gitignore  |    3 -
 .../catalyst_runtime/CMakeLists.txt           |  133 ---
 src/qirlightning/catalyst_runtime/Makefile    |  121 --
 .../catalyst_runtime/lib/CMakeLists.txt       |    3 -
 .../lib/backend/CMakeLists.txt                |    7 -
 .../lib/backend/common/CacheManager.hpp       |  199 ----
 .../lib/backend/common/QubitManager.hpp       |  146 ---
 .../lib/backend/common/Utils.hpp              |  304 -----
 .../catalyst_runtime/lib/capi/CMakeLists.txt  |   57 -
 .../lib/capi/ExecutionContext.hpp             |  367 ------
 .../catalyst_runtime/lib/capi/MemRefUtils.hpp |   48 -
 .../catalyst_runtime/lib/capi/RuntimeCAPI.cpp | 1012 -----------------
 .../lib/registry/CMakeLists.txt               |   33 -
 .../lib/registry/Registry.cpp                 |  179 ---
 src/qirlightning/simple_demo/README.md        |   11 +-
 .../simple_demo/test_rt_device.cpp            |   83 +-
 17 files changed, 64 insertions(+), 2874 deletions(-)
 delete mode 100644 src/qirlightning/catalyst_runtime/.clang-tidy
 delete mode 100644 src/qirlightning/catalyst_runtime/.gitignore
 delete mode 100644 src/qirlightning/catalyst_runtime/CMakeLists.txt
 delete mode 100644 src/qirlightning/catalyst_runtime/Makefile
 delete mode 100644 src/qirlightning/catalyst_runtime/lib/CMakeLists.txt
 delete mode 100644 src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt
 delete mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp
 delete mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp
 delete mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp
 delete mode 100644 src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt
 delete mode 100644 src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp
 delete mode 100644 src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp
 delete mode 100644 src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp
 delete mode 100644 src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt
 delete mode 100644 src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp

diff --git a/src/qirlightning/catalyst_runtime/.clang-tidy b/src/qirlightning/catalyst_runtime/.clang-tidy
deleted file mode 100644
index e7ca11f..0000000
--- a/src/qirlightning/catalyst_runtime/.clang-tidy
+++ /dev/null
@@ -1,232 +0,0 @@
----
-Checks:          '-*,clang-diagnostic-*,clang-analyzer-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-modernize-avoid-c-arrays,-readability-magic-numbers,hicpp-*,-hicpp-no-array-decay,-hicpp-avoid-c-arrays,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions,-readability-identifier-length'
-WarningsAsErrors: '*'
-HeaderFilterRegex: '.*'
-AnalyzeTemporaryDtors: false
-FormatStyle:     none
-InheritParentConfig: true
-User:            mlxd
-CheckOptions:
-  - key:             modernize-replace-auto-ptr.IncludeStyle
-    value:           llvm
-  - key:             performance-move-const-arg.CheckTriviallyCopyableMove
-    value:           'true'
-  - key:             modernize-use-auto.MinTypeNameLength
-    value:           '5'
-  - key:             readability-static-accessed-through-instance.NameSpecifierNestingThreshold
-    value:           '3'
-  - key:             readability-function-size.VariableThreshold
-    value:           '4294967295'
-  - key:             cert-dcl16-c.NewSuffixes
-    value:           'L;LL;LU;LLU'
-  - key:             readability-identifier-naming.GetConfigPerFile
-    value:           'true'
-  - key:             readability-inconsistent-declaration-parameter-name.Strict
-    value:           'false'
-  - key:             readability-magic-numbers.IgnoredIntegerValues
-    value:           '1;2;3;4;'
-  - key:             modernize-use-default-member-init.UseAssignment
-    value:           'false'
-  - key:             readability-function-size.NestingThreshold
-    value:           '4294967295'
-  - key:             modernize-use-override.AllowOverrideAndFinal
-    value:           'false'
-  - key:             readability-function-size.ParameterThreshold
-    value:           '4294967295'
-  - key:             openmp-exception-escape.IgnoredExceptions
-    value:           ''
-  - key:             modernize-pass-by-value.ValuesOnly
-    value:           'false'
-  - key:             modernize-loop-convert.IncludeStyle
-    value:           llvm
-  - key:             cert-str34-c.DiagnoseSignedUnsignedCharComparisons
-    value:           '0'
-  - key:             readability-identifier-naming.AggressiveDependentMemberLookup
-    value:           'false'
-  - key:             readability-redundant-smartptr-get.IgnoreMacros
-    value:           'true'
-  - key:             modernize-use-emplace.TupleTypes
-    value:           '::std::pair;::std::tuple'
-  - key:             modernize-use-emplace.TupleMakeFunctions
-    value:           '::std::make_pair;::std::make_tuple'
-  - key:             modernize-use-nodiscard.ReplacementString
-    value:           '[[nodiscard]]'
-  - key:             modernize-loop-convert.MakeReverseRangeHeader
-    value:           ''
-  - key:             modernize-replace-random-shuffle.IncludeStyle
-    value:           llvm
-  - key:             modernize-use-bool-literals.IgnoreMacros
-    value:           'true'
-  - key:             google-readability-namespace-comments.ShortNamespaceLines
-    value:           '10'
-  - key:             modernize-avoid-bind.PermissiveParameterList
-    value:           'false'
-  - key:             modernize-use-override.FinalSpelling
-    value:           final
-  - key:             performance-move-constructor-init.IncludeStyle
-    value:           llvm
-  - key:             modernize-loop-convert.UseCxx20ReverseRanges
-    value:           'true'
-  - key:             modernize-use-noexcept.ReplacementString
-    value:           ''
-  - key:             modernize-use-using.IgnoreMacros
-    value:           'true'
-  - key:             performance-type-promotion-in-math-fn.IncludeStyle
-    value:           llvm
-  - key:             modernize-loop-convert.NamingStyle
-    value:           CamelCase
-  - key:             modernize-loop-convert.MakeReverseRangeFunction
-    value:           ''
-  - key:             readability-inconsistent-declaration-parameter-name.IgnoreMacros
-    value:           'true'
-  - key:             performance-no-automatic-move.AllowedTypes
-    value:           ''
-  - key:             performance-for-range-copy.WarnOnAllAutoCopies
-    value:           'false'
-  - key:             readability-identifier-naming.IgnoreFailedSplit
-    value:           'false'
-  - key:             modernize-pass-by-value.IncludeStyle
-    value:           llvm
-  - key:             readability-qualified-auto.AddConstToQualified
-    value:           'true'
-  - key:             readability-simplify-boolean-expr.ChainedConditionalReturn
-    value:           'false'
-  - key:             readability-else-after-return.WarnOnConditionVariables
-    value:           'true'
-  - key:             readability-uppercase-literal-suffix.IgnoreMacros
-    value:           'true'
-  - key:             modernize-use-nullptr.NullMacros
-    value:           'NULL'
-  - key:             modernize-make-shared.IgnoreMacros
-    value:           'true'
-  - key:             performance-unnecessary-copy-initialization.AllowedTypes
-    value:           ''
-  - key:             modernize-use-transparent-functors.SafeMode
-    value:           'false'
-  - key:             modernize-make-shared.IgnoreDefaultInitialization
-    value:           'true'
-  - key:             modernize-make-shared.IncludeStyle
-    value:           llvm
-  - key:             readability-simplify-boolean-expr.ChainedConditionalAssignment
-    value:           'false'
-  - key:             cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField
-    value:           '0'
-  - key:             readability-function-size.LineThreshold
-    value:           '4294967295'
-  - key:             performance-inefficient-vector-operation.EnableProto
-    value:           'false'
-  - key:             modernize-use-override.IgnoreDestructors
-    value:           'false'
-  - key:             modernize-loop-convert.MaxCopySize
-    value:           '16'
-  - key:             modernize-make-shared.MakeSmartPtrFunction
-    value:           'std::make_shared'
-  - key:             portability-simd-intrinsics.Suggest
-    value:           'false'
-  - key:             cppcoreguidelines-explicit-virtual-functions.IgnoreDestructors
-    value:           '1'
-  - key:             modernize-make-unique.IgnoreMacros
-    value:           'true'
-  - key:             modernize-make-shared.MakeSmartPtrFunctionHeader
-    value:           '<memory>'
-  - key:             performance-for-range-copy.AllowedTypes
-    value:           ''
-  - key:             readability-redundant-string-init.StringNames
-    value:           '::std::basic_string_view;::std::basic_string'
-  - key:             modernize-make-unique.IgnoreDefaultInitialization
-    value:           'true'
-  - key:             modernize-use-emplace.ContainersWithPushBack
-    value:           '::std::vector;::std::list;::std::deque'
-  - key:             readability-magic-numbers.IgnoreBitFieldsWidths
-    value:           'true'
-  - key:             modernize-make-unique.IncludeStyle
-    value:           llvm
-  - key:             readability-braces-around-statements.ShortStatementLines
-    value:           '0'
-  - key:             modernize-use-override.OverrideSpelling
-    value:           override
-  - key:             readability-magic-numbers.IgnoredFloatingPointValues
-    value:           '1.0;100.0;'
-  - key:             performance-inefficient-string-concatenation.StrictMode
-    value:           'false'
-  - key:             readability-implicit-bool-conversion.AllowPointerConditions
-    value:           'false'
-  - key:             readability-redundant-declaration.IgnoreMacros
-    value:           'true'
-  - key:             google-readability-braces-around-statements.ShortStatementLines
-    value:           '1'
-  - key:             modernize-make-unique.MakeSmartPtrFunction
-    value:           'std::make_unique'
-  - key:             portability-restrict-system-includes.Includes
-    value:           '*'
-  - key:             readability-else-after-return.WarnOnUnfixable
-    value:           'true'
-  - key:             modernize-use-emplace.IgnoreImplicitConstructors
-    value:           'false'
-  - key:             modernize-make-unique.MakeSmartPtrFunctionHeader
-    value:           '<memory>'
-  - key:             modernize-use-equals-delete.IgnoreMacros
-    value:           'true'
-  - key:             readability-magic-numbers.IgnoreAllFloatingPointValues
-    value:           'false'
-  - key:             readability-uppercase-literal-suffix.NewSuffixes
-    value:           ''
-  - key:             modernize-loop-convert.MinConfidence
-    value:           reasonable
-  - key:             performance-unnecessary-value-param.AllowedTypes
-    value:           ''
-  - key:             modernize-use-noexcept.UseNoexceptFalse
-    value:           'true'
-  - key:             google-readability-namespace-comments.SpacesBeforeComments
-    value:           '2'
-  - key:             readability-function-cognitive-complexity.Threshold
-    value:           '100'
-  - key:             readability-function-cognitive-complexity.IgnoreMacros
-    value:           'true'
-  - key:             cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic
-    value:           '1'
-  - key:             performance-faster-string-find.StringLikeClasses
-    value:           '::std::basic_string;::std::basic_string_view'
-  - key:             readability-function-size.BranchThreshold
-    value:           '4294967295'
-  - key:             readability-implicit-bool-conversion.AllowIntegerConditions
-    value:           'false'
-  - key:             readability-function-size.StatementThreshold
-    value:           '800'
-  - key:             modernize-use-default-member-init.IgnoreMacros
-    value:           'true'
-  - key:             llvm-qualified-auto.AddConstToQualified
-    value:           '0'
-  - key:             readability-identifier-naming.IgnoreMainLikeFunctions
-    value:           'false'
-  - key:             google-readability-function-size.StatementThreshold
-    value:           '800'
-  - key:             llvm-else-after-return.WarnOnConditionVariables
-    value:           '0'
-  - key:             modernize-raw-string-literal.DelimiterStem
-    value:           lit
-  - key:             modernize-use-equals-default.IgnoreMacros
-    value:           'true'
-  - key:             modernize-raw-string-literal.ReplaceShorterLiterals
-    value:           'false'
-  - key:             modernize-use-emplace.SmartPointers
-    value:           '::std::shared_ptr;::std::unique_ptr;::std::auto_ptr;::std::weak_ptr'
-  - key:             performance-inefficient-vector-operation.VectorLikeClasses
-    value:           '::std::vector'
-  - key:             modernize-use-auto.RemoveStars
-    value:           'false'
-  - key:             readability-magic-numbers.IgnorePowersOf2IntegerValues
-    value:           'true'
-  - key:             portability-simd-intrinsics.Std
-    value:           ''
-  - key:             readability-redundant-member-init.IgnoreBaseInCopyConstructors
-    value:           'false'
-  - key:             performance-unnecessary-value-param.IncludeStyle
-    value:           llvm
-  - key:             modernize-replace-disallow-copy-and-assign-macro.MacroName
-    value:           DISALLOW_COPY_AND_ASSIGN
-  - key:             llvm-else-after-return.WarnOnUnfixable
-    value:           '0'
-  - key:             readability-simplify-subscript-expr.Types
-    value:           '::std::basic_string;::std::basic_string_view;::std::vector;::std::array'
-...
diff --git a/src/qirlightning/catalyst_runtime/.gitignore b/src/qirlightning/catalyst_runtime/.gitignore
deleted file mode 100644
index 4258b32..0000000
--- a/src/qirlightning/catalyst_runtime/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-build
-build_cov
-bin/__pycache__/
diff --git a/src/qirlightning/catalyst_runtime/CMakeLists.txt b/src/qirlightning/catalyst_runtime/CMakeLists.txt
deleted file mode 100644
index 1651851..0000000
--- a/src/qirlightning/catalyst_runtime/CMakeLists.txt
+++ /dev/null
@@ -1,133 +0,0 @@
-cmake_minimum_required(VERSION 3.26)
-
-project(catalyst_runtime)
-include(FetchContent)
-include(ExternalProject)
-
-set(CMAKE_CXX_STANDARD  20)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-# Compiler options
-option(ENABLE_CODE_COVERAGE "Enable code coverage" OFF)
-option(ENABLE_ADDRESS_SANITIZER "Enable address sanitizer" OFF)
-option(RUNTIME_CLANG_TIDY "Enable Clang Tidy" OFF)
-
-option(ENABLE_OPENQASM "Build OpenQasm backend device" OFF)
-
-set(CMAKE_VERBOSE_MAKEFILE ON)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-
-set(runtime_includes "${PROJECT_SOURCE_DIR}/include")
-set(capi_utils_includes "${PROJECT_SOURCE_DIR}/lib/capi")
-set(backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/common")
-
-
-# Get LLVM hash to target from source tree.
-file(READ ../.dep-versions DEPENDENCY_VERSIONS)
-string(REGEX MATCH "llvm=([0-9a-f]+)" _ ${DEPENDENCY_VERSIONS})
-set(LLVM_HASH ${CMAKE_MATCH_1})
-message(STATUS "Detected LLVM version - ${LLVM_HASH}")
-
-FetchContent_Declare(
-    MLIRRunnerUtils
-    URL                 https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/RunnerUtils.h
-    DOWNLOAD_NO_EXTRACT True
-    SOURCE_DIR        mlir/ExecutionEngine
-)
-
-FetchContent_Declare(
-    MLIRCRunnerUtils
-    URL                 https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h
-    DOWNLOAD_NO_EXTRACT True
-    SOURCE_DIR          mlir/ExecutionEngine
-)
-
-FetchContent_Declare(
-    MLIRFloat16Bits
-    URL                 https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/Float16bits.h
-    DOWNLOAD_NO_EXTRACT True
-    SOURCE_DIR          mlir/ExecutionEngine
-)
-
-# Note on pybind11 vs python discovery order:
-# If Python is looked for first, then we have to look for all the components needed by pybind11.
-# In particular, if pybind11::embed is used, then we need to find both headers (Development.Module)
-# and the shared library (Development.Embed) before pybind11 is discovered.
-# With the other order PyBind will discover everything it needs.
-# Note on flags:
-# - PYTHON_EXECUTABLE is a pybind11 specific flag used by its own (legacy) Python discovery process,
-#   it will not affect find_package(Python) calls.
-# - Python_EXECUTABLE is a cmake flag used in find_package(Python) to guide the discovery.
-# Note that pybind11 can be made to use find_python (instead of its legacy discovery), and thus
-# respect Python_EXECUTABLE), via the PYBIND11_FINDPYTHON flag.
-
-# Here, we look for the desired Python version early to avoid any problems with mismatched packages.
-# The desired Python environment should be specified ahead of time via -DPython_EXECUTABLE=...
-# The optional component is only used for the C++ test suite (to spin up its own interpreter),
-# and requires libpython.so to be available on the system.
-find_package(Python REQUIRED
-    COMPONENTS Interpreter Development.Module
-    OPTIONAL_COMPONENTS Development.Embed Development.SABIModule
-)
-
-if(RUNTIME_ENABLE_WARNINGS)
-    message(STATUS "Building with compiler warnings as errors enabled.")
-    add_compile_options(-Werror -Wall)
-endif()
-
-message(STATUS "ENABLE_OPENQASM is ${ENABLE_OPENQASM}.")
-
-set(devices_list)
-list(APPEND devices_list rtd_null_qubit)
-list(APPEND backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/null_qubit")
-
-if(ENABLE_OPENQASM)
-    list(APPEND backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/openqasm")
-    list(APPEND devices_list rtd_openqasm)
-endif()
-
-add_library(catalyst_qir_runtime INTERFACE)
-
-target_link_libraries(catalyst_qir_runtime INTERFACE ${devices_list} rt_capi)
-
-target_include_directories(catalyst_qir_runtime INTERFACE
-    ${runtime_includes}
-    ${backend_includes}
-)
-
-if(ENABLE_CODE_COVERAGE)
-    message(STATUS "ENABLE_CODE_COVERAGE is ON.")
-    if(APPLE)
-        target_compile_options(catalyst_qir_runtime INTERFACE -fprofile-instr-generate -fcoverage-mapping)
-        target_link_options(catalyst_qir_runtime INTERFACE -fprofile-instr-generate -fcoverage-mapping)
-    else()
-        target_compile_options(catalyst_qir_runtime INTERFACE -fprofile-arcs -ftest-coverage)
-        target_link_libraries(catalyst_qir_runtime INTERFACE gcov)
-    endif()
-endif()
-
-
-if(ENABLE_ADDRESS_SANITIZER)
-    message(STATUS "ENABLE_ADDRESS_SANITIZER is ON.")
-    add_compile_options(-fsanitize=address)
-    add_link_options(-fsanitize=address)
-endif()
-
-add_subdirectory(lib)
-add_subdirectory(tests)
-
-if(APPLE AND (${CMAKE_SYSTEM_PROCESSOR} STREQUAL arm64))
-# Don't rerun external project everytime we configure the runtime build.
-if(NOT EXISTS ${CMAKE_BINARY_DIR}/lib/liblapacke.3.dylib)
-    ExternalProject_Add(lapacke-accelerate
-        GIT_REPOSITORY https://github.com/lepus2589/accelerate-lapacke.git
-        GIT_TAG master
-        PREFIX _lapacke-accelerate
-        CMAKE_ARGS "--preset accelerate-lapacke32"
-                   "-DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/_lapacke-accelerate/install"
-        INSTALL_COMMAND ${CMAKE_COMMAND} --build . --target install
-        COMMAND cp ${CMAKE_BINARY_DIR}/_lapacke-accelerate/install/lib/liblapacke.3.dylib ${CMAKE_BINARY_DIR}/lib
-    )
-    add_dependencies(rt_capi lapacke-accelerate)  # automatically build with the runtime
-endif()
-endif()
diff --git a/src/qirlightning/catalyst_runtime/Makefile b/src/qirlightning/catalyst_runtime/Makefile
deleted file mode 100644
index 55733a4..0000000
--- a/src/qirlightning/catalyst_runtime/Makefile
+++ /dev/null
@@ -1,121 +0,0 @@
-PYTHON?=$(shell which python3)
-PYTHON_PREFIX:=$(shell $(PYTHON) -c "import sys; print(sys.prefix)")
-PYTHON_VERSION:=$(shell $(PYTHON) -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')")
-C_COMPILER?=$(shell which clang)
-CXX_COMPILER?=$(shell which clang++)
-COMPILER_LAUNCHER?=$(shell which ccache)
-NPROC?=$(shell python3 -c "import os; print(os.cpu_count())")
-
-MK_ABSPATH := $(abspath $(lastword $(MAKEFILE_LIST)))
-MK_DIR := $(dir $(MK_ABSPATH))
-RT_BUILD_DIR?=$(MK_DIR)/build
-CODE_COVERAGE?=OFF
-BUILD_TYPE?=RelWithDebInfo
-ENABLE_OPENQASM?=ON
-ENABLE_ASAN?=OFF
-
-BUILD_TARGETS := rt_capi rtd_null_qubit
-TEST_TARGETS := runner_tests_qir_runtime
-
-PLATFORM := $(shell uname -s)
-
-ifeq ($(ENABLE_OPENQASM), ON)
-	BUILD_TARGETS += rtd_openqasm
-	TEST_TARGETS += runner_tests_openqasm
-endif
-
-.PHONY: help
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  all                to build Catalyst Runtime"
-	@echo "  coverage           to generate a coverage report using lcov"
-	@echo "  clean              to delete all temporary, cache, and build files"
-	@echo "  test               to run the Catalyst runtime test suite"
-	@echo "  format [check=1]   to apply C++ formatter; use with 'check=1' to check instead of modify (requires clang-format)"
-	@echo "  format [version=?] to apply C++ formatter; use with 'version={version}' to run clang-format-{version} instead of clang-format"
-	@echo "  check-tidy         to build Catalyst Runtime with RUNTIME_CLANG_TIDY=ON (requires clang-tidy)"
-
-.PHONY: configure
-configure:
-	@echo "Configure Catalyst Runtime"
-
-	cmake -G Ninja -B $(RT_BUILD_DIR) . \
-		-DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \
-		-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=$(RT_BUILD_DIR)/lib \
-		-DCMAKE_C_COMPILER=$(C_COMPILER) \
-		-DCMAKE_CXX_COMPILER=$(CXX_COMPILER) \
-		-DCMAKE_C_COMPILER_LAUNCHER=$(COMPILER_LAUNCHER) \
-		-DCMAKE_CXX_COMPILER_LAUNCHER=$(COMPILER_LAUNCHER) \
-		-DENABLE_OPENQASM=$(ENABLE_OPENQASM) \
-		-DENABLE_CODE_COVERAGE=$(CODE_COVERAGE) \
-        -DPython_EXECUTABLE=$(PYTHON) \
-		-DENABLE_ADDRESS_SANITIZER=$(ENABLE_ASAN)
-
-.PHONY: runtime
-runtime: configure
-	cmake --build $(RT_BUILD_DIR) --target $(BUILD_TARGETS) -j$(NPROC) --verbose
-
-.PHONY: test_runner
-test_runner: configure
-	cmake --build $(RT_BUILD_DIR) --target $(TEST_TARGETS) -j$(NPROC) --verbose
-
-.PHONY: test
-test: CODE_COVERAGE=OFF
-test: BUILD_TYPE?=RelWithDebInfo
-test: test_runner
-	@echo "Catalyst runtime test suite - NullQubit"
-	$(ASAN_COMMAND) $(RT_BUILD_DIR)/tests/runner_tests_qir_runtime
-ifeq ($(ENABLE_OPENQASM), ON)
-	# Test the OpenQasm devices C++ tests
-	$(ASAN_COMMAND) $(RT_BUILD_DIR)/tests/runner_tests_openqasm
-endif
-
-.PHONY: coverage
-coverage: RT_BUILD_DIR := $(RT_BUILD_DIR)_cov
-coverage: CODE_COVERAGE=ON
-coverage: BUILD_TYPE=Debug
-coverage: C_COMPILER=$(shell which gcc)
-coverage: CXX_COMPILER=$(shell which g++)
-coverage: export LLVM_PROFILE_FILE := $(RT_BUILD_DIR)/tests/%m.profraw
-coverage: test_runner
-	@echo "check C++ code coverage"
-	$(RT_BUILD_DIR)/tests/runner_tests_qir_runtime
-ifeq ($(ENABLE_OPENQASM), ON)
-	$(RT_BUILD_DIR)/tests/runner_tests_openqasm
-endif
-ifeq ($(PLATFORM),Linux)
-	lcov --directory $(RT_BUILD_DIR) -b $(MK_DIR)/lib --capture --output-file $(RT_BUILD_DIR)/coverage.info
-	lcov --remove $(RT_BUILD_DIR)/coverage.info '/usr/*' '*/_deps/*' '*/envs/*' '*/mlir/*' --output-file $(RT_BUILD_DIR)/coverage.info
-	genhtml $(RT_BUILD_DIR)/coverage.info --output-directory $(RT_BUILD_DIR)/cov -t "Catalyst Runtime C++ Coverage" --num-spaces 4
-else
-	xcrun llvm-profdata merge $(RT_BUILD_DIR)/tests/*.profraw -o $(RT_BUILD_DIR)/tests/rt_test_coverage.profdata
-	xcrun llvm-cov show -instr-profile $(RT_BUILD_DIR)/tests/rt_test_coverage.profdata \
-		-object $(RT_BUILD_DIR)/tests/runner_tests_openqasm \
-		$(RT_BUILD_DIR)/tests/runner_tests_qir_runtime \
-		-format=html -output-dir=$(RT_BUILD_DIR)/coverage_html \
-		$(MK_DIR)/include $(MK_DIR)/lib $(MK_DIR)/tests
-endif
-
-.PHONY: clean
-clean:
-	@echo "clean build files"
-	rm -rf $(RT_BUILD_DIR) $(RT_BUILD_DIR)_cov cov coverage.info $(MK_DIR)/BuildTidy
-
-.PHONY: format
-format:
-ifdef check
-	$(PYTHON) ../bin/format.py --check $(if $(version:-=),--cfversion $(version)) .
-else
-	$(PYTHON) ../bin/format.py $(if $(version:-=),--cfversion $(version)) .
-endif
-
-.PHONY: check-tidy
-check-tidy:
-	@echo "build Catalyst Runtime with RUNTIME_CLANG_TIDY=ON"
-	cmake -G Ninja -B $(MK_DIR)/BuildTidy . \
-		-DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \
-		-DCMAKE_C_COMPILER=$(C_COMPILER) \
-		-DCMAKE_CXX_COMPILER=$(CXX_COMPILER) \
-		-DRUNTIME_CLANG_TIDY=ON
-
-	cmake --build $(MK_DIR)/BuildTidy --target rt_capi -j$(NPROC)
diff --git a/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt
deleted file mode 100644
index 50fd0b0..0000000
--- a/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-add_subdirectory(capi)
-add_subdirectory(backend)
-add_subdirectory(registry)
diff --git a/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt
deleted file mode 100644
index 45b7ad7..0000000
--- a/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-add_subdirectory(null_qubit)
-configure_file(null_qubit/null_qubit.toml null_qubit.toml)
-if(ENABLE_OPENQASM)
-add_subdirectory(openqasm)
-configure_file(openqasm/braket_local_qubit.toml braket_local_qubit.toml)
-configure_file(openqasm/braket_aws_qubit.toml braket_aws_qubit.toml)
-endif()
diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp
deleted file mode 100644
index 0141f33..0000000
--- a/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp
+++ /dev/null
@@ -1,199 +0,0 @@
-// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <complex>
-#include <string>
-#include <vector>
-
-#include "Types.h"
-#include "Utils.hpp"
-
-namespace Catalyst::Runtime {
-/**
- * @brief The CacheManager caches the entire operations and observables of
- * a program at runtime.
- *
- * One direct use case of this functionality is explored to compute gradient
- * of a circuit with taking advantage of gradient methods provided by
- * simulators.
- */
-template <typename ComplexT = std::complex<double>> class CacheManager {
-  protected:
-    // Operations Data
-    std::vector<std::string> ops_names_{};
-    std::vector<std::vector<double>> ops_params_{};
-    std::vector<std::vector<size_t>> ops_wires_{};
-    std::vector<bool> ops_inverses_{};
-    std::vector<std::vector<ComplexT>> ops_matrixs_{};
-    std::vector<std::vector<size_t>> ops_controlled_wires_{};
-    std::vector<std::vector<bool>> ops_controlled_values_{};
-
-    // Observables Data
-    std::vector<ObsIdType> obs_keys_{};
-    std::vector<MeasurementsT> obs_callees_{};
-
-    // Number of parameters
-    size_t num_params_{0};
-
-  public:
-    CacheManager() = default;
-    ~CacheManager() = default;
-
-    CacheManager(const CacheManager &) = delete;
-    CacheManager &operator=(const CacheManager &) = delete;
-    CacheManager(CacheManager &&) = delete;
-    CacheManager &operator=(CacheManager &&) = delete;
-
-    /**
-     * Reset cached gates
-     */
-    void Reset()
-    {
-        ops_names_.clear();
-        ops_params_.clear();
-        ops_wires_.clear();
-        ops_inverses_.clear();
-        ops_matrixs_.clear();
-        ops_controlled_wires_.clear();
-        ops_controlled_values_.clear();
-
-        obs_keys_.clear();
-        obs_callees_.clear();
-
-        num_params_ = 0;
-    }
-
-    /**
-     * @brief Add a new operation to the list of cached gates.
-     *
-     * @param name Name of the given gate
-     * @param params Parameters of the gate
-     * @param wires Wires the gate acts on
-     * @param inverse If true, inverse of the gate is applied
-     * @param matrix Unitary matrix for the 'MatrixOp' operations
-     * @param controlled_wires Control wires
-     * @param controlled_values Control values
-     */
-    void addOperation(const std::string &name, const std::vector<double> &params,
-                      const std::vector<size_t> &wires, bool inverse,
-                      const std::vector<ComplexT> &matrix = {},
-                      const std::vector<size_t> &controlled_wires = {},
-                      const std::vector<bool> &controlled_values = {})
-    {
-        ops_names_.push_back(name);
-        ops_params_.push_back(params);
-        ops_wires_.push_back(wires);
-        ops_inverses_.push_back(inverse);
-        ops_matrixs_.push_back(matrix);
-        ops_controlled_wires_.push_back(controlled_wires);
-        ops_controlled_values_.push_back(controlled_values);
-
-        num_params_ += params.size();
-    }
-
-    /**
-     * @brief Add a new observable to the list of cached gates.
-     *
-     * @param id The observable key created by LObsManager()
-     * @param callee The measurement operation
-     */
-    void addObservable(const ObsIdType id, const MeasurementsT &callee = MeasurementsT::None)
-    {
-        obs_keys_.push_back(id);
-        obs_callees_.push_back(callee);
-    }
-
-    /**
-     * @brief Get a reference to observables keys.
-     */
-    auto getObservablesKeys() -> const std::vector<ObsIdType> & { return obs_keys_; }
-
-    /**
-     * @brief Get a reference to observables callees.
-     */
-    auto getObservablesCallees() -> const std::vector<MeasurementsT> & { return obs_callees_; }
-
-    /**
-     * @brief Get a reference to operations names.
-     */
-    auto getOperationsNames() -> const std::vector<std::string> & { return ops_names_; }
-
-    /**
-     * @brief Get a reference to operations parameters.
-     */
-    auto getOperationsParameters() -> const std::vector<std::vector<double>> &
-    {
-        return ops_params_;
-    }
-
-    /**
-     * @brief Get a reference to operations wires.
-     */
-    auto getOperationsWires() -> const std::vector<std::vector<size_t>> & { return ops_wires_; }
-
-    /**
-     * @brief Get a reference to operation controlled wires.
-     */
-    auto getOperationsControlledWires() -> const std::vector<std::vector<size_t>> &
-    {
-        return this->ops_controlled_wires_;
-    }
-
-    /**
-     * @brief Get a reference to operation controlled values.
-     */
-    auto getOperationsControlledValues() -> const std::vector<std::vector<bool>> &
-    {
-        return this->ops_controlled_values_;
-    }
-
-    /**
-     * @brief Get a reference to operations inverses.
-     */
-    auto getOperationsInverses() -> const std::vector<bool> & { return ops_inverses_; }
-
-    /**
-     * @brief Get a reference to operations matrices.
-     */
-    auto getOperationsMatrices() -> const std::vector<std::vector<ComplexT>> &
-    {
-        return ops_matrixs_;
-    }
-
-    /**
-     * @brief Get total number of cached gates.
-     */
-    [[nodiscard]] auto getNumGates() const -> size_t
-    {
-        return ops_names_.size() + obs_keys_.size();
-    }
-
-    /**
-     * @brief Get number of operations.
-     */
-    [[nodiscard]] auto getNumOperations() const -> size_t { return ops_names_.size(); }
-
-    /**
-     * @brief Get number of observables.
-     */
-    [[nodiscard]] auto getNumObservables() const -> size_t { return obs_keys_.size(); }
-
-    /**
-     * @brief Get total number of cached gates.
-     */
-    [[nodiscard]] auto getNumParams() const -> size_t { return num_params_; }
-};
-} // namespace Catalyst::Runtime
diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp
deleted file mode 100644
index 05dc377..0000000
--- a/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp
+++ /dev/null
@@ -1,146 +0,0 @@
-// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <map>
-
-#include "Exception.hpp"
-#include "Types.h"
-#include "Utils.hpp"
-
-namespace Catalyst::Runtime {
-
-/**
- * Qubit Manager
- *
- * @brief That maintains mapping of qubit IDs between runtime and device
- * ids (e.g., Lightning-Dynamic). When user allocates a qubit, the
- * `QubitManager` adds the qubit as an active qubit that operations
- * can act on. When user releases a qubit, the `QubitManager` removes
- * that qubit from the list of active wires.
- */
-template <typename SimQubitIdType = QubitIdType, typename DevQubitIdType = size_t>
-class QubitManager {
-  private:
-    using LQMapT = std::map<SimQubitIdType, DevQubitIdType>;
-
-    SimQubitIdType next_idx{0};
-    LQMapT qubits_map{};
-
-    template <class OIter = typename LQMapT::iterator>
-    [[nodiscard]] inline OIter _remove_simulator_qubit_id(SimQubitIdType s_idx)
-    {
-        const auto &&s_idx_iter = this->qubits_map.find(s_idx);
-        RT_FAIL_IF(s_idx_iter == this->qubits_map.end(), "Invalid simulator qubit index");
-
-        return this->qubits_map.erase(s_idx_iter);
-    }
-
-    template <class IIter = typename LQMapT::iterator>
-    inline void _update_qubits_mapfrom(IIter s_idx_iter)
-    {
-        for (; s_idx_iter != this->qubits_map.end(); s_idx_iter++) {
-            s_idx_iter->second--;
-        }
-    }
-
-  public:
-    QubitManager() = default;
-    ~QubitManager() = default;
-
-    QubitManager(const QubitManager &) = delete;
-    QubitManager &operator=(const QubitManager &) = delete;
-    QubitManager(QubitManager &&) = delete;
-    QubitManager &operator=(QubitManager &&) = delete;
-
-    [[nodiscard]] auto isValidQubitId(SimQubitIdType s_idx) -> bool
-    {
-        return this->qubits_map.contains(s_idx);
-    }
-
-    [[nodiscard]] auto isValidQubitId(const std::vector<SimQubitIdType> &ss_idx) -> bool
-    {
-        return std::all_of(ss_idx.begin(), ss_idx.end(),
-                           [this](SimQubitIdType s) { return isValidQubitId(s); });
-    }
-
-    [[nodiscard]] auto getAllQubitIds() -> std::vector<SimQubitIdType>
-    {
-        std::vector<SimQubitIdType> ids;
-        ids.reserve(this->qubits_map.size());
-        for (const auto &it : this->qubits_map) {
-            ids.push_back(it.first);
-        }
-
-        return ids;
-    }
-
-    [[nodiscard]] auto getDeviceId(SimQubitIdType s_idx) -> DevQubitIdType
-    {
-        RT_FAIL_IF(!isValidQubitId(s_idx), "Invalid device qubit index");
-
-        return this->qubits_map[s_idx];
-    }
-
-    auto getDeviceIds(const std::vector<SimQubitIdType> &ss_idx) -> std::vector<DevQubitIdType>
-    {
-        std::vector<DevQubitIdType> dd_idx;
-        dd_idx.reserve(ss_idx.size());
-        for (const auto &s : ss_idx) {
-            dd_idx.push_back(getDeviceId(s));
-        }
-        return dd_idx;
-    }
-
-    [[nodiscard]] auto getSimulatorId(DevQubitIdType d_idx) -> SimQubitIdType
-    {
-        auto s_idx = std::find_if(this->qubits_map.begin(), this->qubits_map.end(),
-                                  [&d_idx](auto &&p) { return p.second == d_idx; });
-
-        RT_FAIL_IF(s_idx == this->qubits_map.end(), "Invalid simulator qubit index");
-
-        return s_idx->first;
-    }
-
-    [[nodiscard]] auto Allocate(DevQubitIdType d_next_idx) -> SimQubitIdType
-    {
-        this->qubits_map[this->next_idx++] = d_next_idx;
-        return this->next_idx - 1;
-    }
-
-    auto AllocateRange(DevQubitIdType start_idx, size_t size) -> std::vector<SimQubitIdType>
-    {
-        std::vector<SimQubitIdType> ids;
-        ids.reserve(size);
-        for (DevQubitIdType i = start_idx; i < start_idx + size; i++) {
-            ids.push_back(this->next_idx);
-            this->qubits_map[this->next_idx++] = i;
-        }
-        return ids;
-    }
-
-    void Release(SimQubitIdType s_idx)
-    {
-        _update_qubits_mapfrom(_remove_simulator_qubit_id(s_idx));
-    }
-
-    void ReleaseAll()
-    {
-        // Release all qubits by clearing the map.
-        this->qubits_map.clear();
-    }
-};
-} // namespace Catalyst::Runtime
diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp
deleted file mode 100644
index 0527ac4..0000000
--- a/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp
+++ /dev/null
@@ -1,304 +0,0 @@
-// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <algorithm>
-#include <array>
-#include <optional>
-#include <random>
-#include <sstream>
-#include <string>
-#include <string_view>
-#include <tuple>
-#include <unordered_map>
-#include <utility>
-
-#include "Exception.hpp"
-#include "Types.h"
-
-#define QUANTUM_DEVICE_DEL_DECLARATIONS(CLASSNAME)                                                 \
-    CLASSNAME(const CLASSNAME &) = delete;                                                         \
-    CLASSNAME &operator=(const CLASSNAME &) = delete;                                              \
-    CLASSNAME(CLASSNAME &&) = delete;                                                              \
-    CLASSNAME &operator=(CLASSNAME &&) = delete;
-
-#define QUANTUM_DEVICE_RT_DECLARATIONS                                                             \
-    auto AllocateQubit()->QubitIdType override;                                                    \
-    auto AllocateQubits(size_t num_qubits)->std::vector<QubitIdType> override;                     \
-    void ReleaseQubit(QubitIdType q) override;                                                     \
-    void ReleaseAllQubits() override;                                                              \
-    [[nodiscard]] auto GetNumQubits() const->size_t override;                                      \
-    void StartTapeRecording() override;                                                            \
-    void StopTapeRecording() override;                                                             \
-    void SetDeviceShots(size_t shots) override;                                                    \
-    [[nodiscard]] auto GetDeviceShots() const->size_t override;                                    \
-    void PrintState() override;                                                                    \
-    [[nodiscard]] auto Zero() const->Result override;                                              \
-    [[nodiscard]] auto One() const->Result override;
-
-#define QUANTUM_DEVICE_QIS_DECLARATIONS                                                            \
-    void NamedOperation(                                                                           \
-        const std::string &name, const std::vector<double> &params,                                \
-        const std::vector<QubitIdType> &wires, [[maybe_unused]] bool inverse = false,              \
-        [[maybe_unused]] const std::vector<QubitIdType> &controlled_wires = {},                    \
-        [[maybe_unused]] const std::vector<bool> &controlled_values = {}) override;                \
-    using Catalyst::Runtime::QuantumDevice::MatrixOperation;                                       \
-    void MatrixOperation(                                                                          \
-        const std::vector<std::complex<double>> &matrix, const std::vector<QubitIdType> &wires,    \
-        [[maybe_unused]] bool inverse = false,                                                     \
-        [[maybe_unused]] const std::vector<QubitIdType> &controlled_wires = {},                    \
-        [[maybe_unused]] const std::vector<bool> &controlled_values = {}) override;                \
-    auto Observable(ObsId id, const std::vector<std::complex<double>> &matrix,                     \
-                    const std::vector<QubitIdType> &wires)                                         \
-        ->ObsIdType override;                                                                      \
-    auto TensorObservable(const std::vector<ObsIdType> &obs)->ObsIdType override;                  \
-    auto HamiltonianObservable(const std::vector<double> &coeffs,                                  \
-                               const std::vector<ObsIdType> &obs)                                  \
-        ->ObsIdType override;                                                                      \
-    auto Expval(ObsIdType obsKey)->double override;                                                \
-    auto Var(ObsIdType obsKey)->double override;                                                   \
-    void State(DataView<std::complex<double>, 1> &state) override;                                 \
-    void Probs(DataView<double, 1> &probs) override;                                               \
-    void PartialProbs(DataView<double, 1> &probs, const std::vector<QubitIdType> &wires) override; \
-    void Sample(DataView<double, 2> &samples, size_t shots) override;                              \
-    void PartialSample(DataView<double, 2> &samples, const std::vector<QubitIdType> &wires,        \
-                       size_t shots) override;                                                     \
-    void Counts(DataView<double, 1> &eigvals, DataView<int64_t, 1> &counts, size_t shots)          \
-        override;                                                                                  \
-    void PartialCounts(DataView<double, 1> &eigvals, DataView<int64_t, 1> &counts,                 \
-                       const std::vector<QubitIdType> &wires, size_t shots) override;              \
-    auto Measure(QubitIdType wire, std::optional<int32_t> postselect = std::nullopt)               \
-        ->Result override;                                                                         \
-    void Gradient(std::vector<DataView<double, 1>> &gradients,                                     \
-                  const std::vector<size_t> &trainParams) override;
-
-namespace Catalyst::Runtime {
-static inline auto parse_kwargs(std::string kwargs) -> std::unordered_map<std::string, std::string>
-{
-    // cleaning kwargs
-    if (kwargs.empty()) {
-        return {};
-    }
-
-    std::unordered_map<std::string, std::string> map;
-    size_t s3_pos = kwargs.find("\'s3_destination_folder\'");
-    if (s3_pos != std::string::npos) {
-        auto opening_pos = kwargs.find('(', s3_pos);
-        RT_ASSERT(opening_pos != std::string::npos);
-        auto closing_pos = kwargs.find(')', opening_pos);
-        RT_ASSERT(closing_pos != std::string::npos);
-        map["s3_destination_folder"] = kwargs.substr(opening_pos, closing_pos - opening_pos + 1);
-    }
-
-    auto kwargs_end_iter = (s3_pos == std::string::npos) ? kwargs.end() : kwargs.begin() + s3_pos;
-
-    kwargs.erase(std::remove_if(kwargs.begin(), kwargs_end_iter,
-                                [](char c) {
-                                    switch (c) {
-                                    case '{':
-                                    case '}':
-                                    case ' ':
-                                    case '\'':
-                                        return true;
-                                    default:
-                                        return false;
-                                    }
-                                }),
-                 kwargs.end());
-
-    // constructing map
-    std::istringstream iss(kwargs);
-    std::string token;
-    while (std::getline(iss, token, ',')) {
-        std::istringstream issp(token);
-        std::string pair[2];
-        std::getline(issp, pair[0], ':');
-        std::getline(issp, pair[1]);
-        map[pair[0]] = pair[1];
-    }
-
-    return map;
-}
-
-enum class MeasurementsT : uint8_t {
-    None, // = 0
-    Expval,
-    Var,
-    Probs,
-    State,
-};
-
-} // namespace Catalyst::Runtime
-
-namespace Catalyst::Runtime::Simulator::Lightning {
-enum class SimulatorGate : uint8_t {
-    // 1-qubit
-    Identity, // = 0
-    PauliX,
-    PauliY,
-    PauliZ,
-    Hadamard,
-    S,
-    T,
-    PhaseShift,
-    RX,
-    RY,
-    RZ,
-    Rot,
-    // 2-qubit
-    CNOT,
-    CY,
-    CZ,
-    SWAP,
-    ISWAP,
-    PSWAP,
-    IsingXX,
-    IsingYY,
-    IsingXY,
-    IsingZZ,
-    ControlledPhaseShift,
-    CRX,
-    CRY,
-    CRZ,
-    CRot,
-    // 3-qubit
-    CSWAP,
-    Toffoli,
-    // n-qubit
-    MultiRZ,
-};
-
-constexpr std::array simulator_observable_support = {
-    // ObsId, ObsName, SimulatorSupport
-    std::tuple<ObsId, std::string_view, bool>{ObsId::Identity, "Identity", true},
-    std::tuple<ObsId, std::string_view, bool>{ObsId::PauliX, "PauliX", true},
-    std::tuple<ObsId, std::string_view, bool>{ObsId::PauliY, "PauliY", true},
-    std::tuple<ObsId, std::string_view, bool>{ObsId::PauliZ, "PauliZ", true},
-    std::tuple<ObsId, std::string_view, bool>{ObsId::Hadamard, "Hadamard", true},
-};
-
-using GateInfoTupleT = std::tuple<SimulatorGate, std::string_view, size_t, size_t>;
-
-constexpr std::array simulator_gate_info = {
-    // 1-qubit
-    GateInfoTupleT{SimulatorGate::Identity, "Identity", 1, 0},
-    GateInfoTupleT{SimulatorGate::PauliX, "PauliX", 1, 0},
-    GateInfoTupleT{SimulatorGate::PauliY, "PauliY", 1, 0},
-    GateInfoTupleT{SimulatorGate::PauliZ, "PauliZ", 1, 0},
-    GateInfoTupleT{SimulatorGate::Hadamard, "Hadamard", 1, 0},
-    GateInfoTupleT{SimulatorGate::S, "S", 1, 0},
-    GateInfoTupleT{SimulatorGate::T, "T", 1, 0},
-    GateInfoTupleT{SimulatorGate::PhaseShift, "PhaseShift", 1, 1},
-    GateInfoTupleT{SimulatorGate::RX, "RX", 1, 1},
-    GateInfoTupleT{SimulatorGate::RY, "RY", 1, 1},
-    GateInfoTupleT{SimulatorGate::RZ, "RZ", 1, 1},
-    GateInfoTupleT{SimulatorGate::Rot, "Rot", 1, 3},
-    // 2-qubit
-    GateInfoTupleT{SimulatorGate::CNOT, "CNOT", 2, 0},
-    GateInfoTupleT{SimulatorGate::CY, "CY", 2, 0},
-    GateInfoTupleT{SimulatorGate::CZ, "CZ", 2, 0},
-    GateInfoTupleT{SimulatorGate::SWAP, "SWAP", 2, 0},
-    GateInfoTupleT{SimulatorGate::ISWAP, "ISWAP", 2, 0},
-    GateInfoTupleT{SimulatorGate::PSWAP, "PSWAP", 2, 1},
-    GateInfoTupleT{SimulatorGate::IsingXX, "IsingXX", 2, 1},
-    GateInfoTupleT{SimulatorGate::IsingYY, "IsingYY", 2, 1},
-    GateInfoTupleT{SimulatorGate::IsingXY, "IsingXY", 2, 1},
-    GateInfoTupleT{SimulatorGate::IsingZZ, "IsingZZ", 2, 1},
-    GateInfoTupleT{SimulatorGate::ControlledPhaseShift, "ControlledPhaseShift", 2, 1},
-    GateInfoTupleT{SimulatorGate::CRX, "CRX", 2, 1},
-    GateInfoTupleT{SimulatorGate::CRY, "CRY", 2, 1},
-    GateInfoTupleT{SimulatorGate::CRZ, "CRZ", 2, 1},
-    GateInfoTupleT{SimulatorGate::CRot, "CRot", 2, 3},
-    // 3-qubit
-    GateInfoTupleT{SimulatorGate::CSWAP, "CSWAP", 3, 0},
-    GateInfoTupleT{SimulatorGate::Toffoli, "Toffoli", 3, 0},
-    // n-qubit
-    GateInfoTupleT{SimulatorGate::MultiRZ, "MultiRZ", 0, 1},
-};
-
-constexpr size_t simulator_gate_info_size = simulator_gate_info.size();
-constexpr size_t simulator_observable_support_size = simulator_observable_support.size();
-
-template <size_t size = simulator_gate_info_size>
-using SimulatorGateInfoDataT = std::array<GateInfoTupleT, size>;
-
-template <size_t size = simulator_observable_support_size>
-constexpr auto lookup_obs(const std::array<std::tuple<ObsId, std::string_view, bool>, size> &arr,
-                          const ObsId key) -> std::string_view
-{
-    for (size_t idx = 0; idx < size; idx++) {
-        auto &&[op_id, op_str, op_support] = arr[idx];
-        if (op_id == key && op_support) {
-            return op_str;
-        }
-    }
-    throw std::range_error("The given observable is not supported by the simulator");
-}
-
-template <size_t size = simulator_gate_info_size>
-constexpr auto lookup_gates(const SimulatorGateInfoDataT<size> &arr, const std::string &key)
-    -> std::pair<size_t, size_t>
-{
-    for (size_t idx = 0; idx < size; idx++) {
-        auto &&[op, op_str, op_num_wires, op_num_params] = arr[idx];
-        if (op_str == key) {
-            return std::make_pair(op_num_wires, op_num_params);
-        }
-    }
-    throw std::range_error("The given operation is not supported by the simulator");
-}
-
-template <size_t size = simulator_gate_info_size>
-constexpr auto has_gate(const SimulatorGateInfoDataT<size> &arr, const std::string &key) -> bool
-{
-    for (size_t idx = 0; idx < size; idx++) {
-        if (std::get<1>(arr[idx]) == key) {
-            return true;
-        }
-    }
-    return false;
-}
-
-static inline auto
-simulateDraw(const std::vector<double> &probs, std::optional<int32_t> postselect,
-             std::mt19937 *gen = nullptr) // NOLINT(readability-non-const-parameter)
-    -> bool
-{
-    if (postselect) {
-        auto postselect_value = postselect.value();
-        RT_FAIL_IF(postselect_value < 0 || postselect_value > 1, "Invalid postselect value");
-        RT_FAIL_IF(probs[postselect_value] == 0, "Probability of postselect value is 0");
-        return static_cast<bool>(postselect_value == 1);
-    }
-
-    // Normal flow, no post-selection
-    // Draw a number according to the given distribution
-    std::uniform_real_distribution<> dis(0., 1.);
-
-    float draw;
-    if (gen != nullptr) {
-        draw = dis(*gen);
-        (*gen)();
-    }
-    else {
-        std::random_device rd;
-        std::mt19937 gen_no_seed(rd());
-        draw = dis(gen_no_seed);
-    }
-
-    return draw > probs[0];
-}
-
-} // namespace Catalyst::Runtime::Simulator::Lightning
diff --git a/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt
deleted file mode 100644
index e05e9bf..0000000
--- a/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-##################################
-# Object Lib catalyst_qir_qis_obj
-##################################
-
-add_library(catalyst_qir_qis_obj OBJECT RuntimeCAPI.cpp)
-
-# include external MLIR runner utils
-FetchContent_MakeAvailable(MLIRRunnerUtils)
-FetchContent_MakeAvailable(MLIRCRunnerUtils)
-FetchContent_MakeAvailable(MLIRFloat16Bits)
-
-# link to rt_backend
-target_link_libraries(catalyst_qir_qis_obj ${CMAKE_DL_LIBS})
-
-target_link_libraries(catalyst_qir_qis_obj
-    pthread
-    dl
-)
-
-target_include_directories(catalyst_qir_qis_obj PUBLIC .
-    ${CMAKE_CURRENT_SOURCE_DIR}
-    ${runtime_includes}
-    ${mlirrunnerutils_SOURCE_DIR}/../..  # includes are relative to mlir/ExecutionEngine
-    ${PROJECT_SOURCE_DIR}/../mlir/lib/Driver  # Timer.hpp
-)
-
-# The MLIR Runner Utils raises this warning so we need to disable it for our -Werror builds.
-if(RUNTIME_ENABLE_WARNINGS)
-    target_compile_options(catalyst_qir_qis_obj PRIVATE "-Wno-unused-parameter")
-endif()
-
-set_property(TARGET catalyst_qir_qis_obj PROPERTY POSITION_INDEPENDENT_CODE ON)
-
-#####################
-# Shared Lib rt_capi
-#####################
-
-add_library(rt_capi SHARED)
-
-target_link_libraries(rt_capi ${CMAKE_DL_LIBS} catalyst_qir_qis_obj)
-add_dependencies(rt_capi catalyst_callback_registry)
-
-
-target_include_directories(rt_capi PUBLIC .
-    ${CMAKE_CURRENT_SOURCE_DIR}
-    ${runtime_includes}
-    ${capi_utils_includes}
-)
-
-set_property(TARGET rt_capi PROPERTY POSITION_INDEPENDENT_CODE ON)
-set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH "$<TARGET_FILE_DIR:catalyst_callback_registry>")
-
-if(NOT APPLE)
-    set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH $ORIGIN)
-else()
-    set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH @loader_path)
-endif()
diff --git a/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp b/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp
deleted file mode 100644
index 9abe8cb..0000000
--- a/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp
+++ /dev/null
@@ -1,367 +0,0 @@
-// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <dlfcn.h>
-
-#include <cstdio>
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <random>
-#include <string>
-#include <string_view>
-#include <tuple>
-#include <unordered_map>
-#include <unordered_set>
-
-#include "Exception.hpp"
-#include "QuantumDevice.hpp"
-#include "Types.h"
-
-extern void callbackCall(int64_t, int64_t, int64_t, va_list);
-
-namespace Catalyst::Runtime {
-
-extern "C" void __catalyst_inactive_callback(int64_t identifier, int64_t argc, int64_t retc, ...);
-
-class MemoryManager // NOLINT(cppcoreguidelines-special-member-functions,
-                    // hicpp-special-member-functions)
-    final {
-  private:
-    std::unordered_set<void *> _impl;
-    std::mutex mu; // To guard the memory manager
-
-  public:
-    explicit MemoryManager() { _impl.reserve(1024); };
-
-    ~MemoryManager()
-    {
-        // Lock the mutex to protect _impl free
-        std::lock_guard<std::mutex> lock(mu);
-        for (auto *allocation : _impl) {
-            free(allocation); // NOLINT(cppcoreguidelines-no-malloc, hicpp-no-malloc)
-        }
-    }
-
-    void insert(void *ptr)
-    {
-        // Lock the mutex to protect _impl update
-        std::lock_guard<std::mutex> lock(mu);
-        _impl.insert(ptr);
-    }
-    void erase(void *ptr)
-    {
-        // Lock the mutex to protect _impl update
-        std::lock_guard<std::mutex> lock(mu);
-        _impl.erase(ptr);
-    }
-    bool contains(void *ptr)
-    {
-        // Lock the mutex to protect _impl update
-        std::lock_guard<std::mutex> lock(mu);
-        return _impl.contains(ptr);
-    }
-};
-
-class SharedLibraryManager final {
-  private:
-    void *_handler{nullptr};
-
-  public:
-    SharedLibraryManager() = delete;
-    explicit SharedLibraryManager(const std::string &filename)
-    {
-#ifdef __APPLE__
-        auto rtld_flags = RTLD_LAZY;
-#else
-        // Closing the dynamic library of Lightning simulators with dlclose() where OpenMP
-        // directives (in Lightning simulators) are in use would raise memory segfaults.
-        // Note that we use RTLD_NODELETE as a workaround to fix the issue.
-        auto rtld_flags = RTLD_LAZY | RTLD_NODELETE;
-#endif
-
-        _handler = dlopen(filename.c_str(), rtld_flags);
-        RT_FAIL_IF(!_handler, dlerror());
-    }
-
-    ~SharedLibraryManager()
-    {
-        // dlopen and dlclose increment and decrement reference counters.
-        // Since we have a guaranteed _handler in a valid SharedLibraryManager instance
-        // then we don't really need to worry about dlclose.
-        // In other words, there is an one to one correspondence between an instance
-        // of SharedLibraryManager and an increase in the reference count for the dynamic library.
-        // dlclose returns non-zero on error.
-        //
-        // Errors in dlclose are implementation dependent.
-        // There are two possible errors during dlclose in glibc: "shared object not open"
-        // and "cannot create scope list". Look for _dl_signal_error in:
-        //
-        //     https://codebrowser.dev/glibc/glibc/elf/dl-close.c.html
-        //
-        // This means that at the very least, one could trigger an error in the following line by
-        // doing the following: dlopen the same library and closing it multiple times in a different
-        // location.
-        //
-        // This would mean that the reference count would be less than the number of instances
-        // of SharedLibraryManager.
-        //
-        // There really is no way to protect against this error, except to always use
-        // SharedLibraryManager to manage shared libraries.
-        //
-        // Exercise for the reader, how could one trigger the "cannot create scope list" error?
-        dlclose(_handler);
-    }
-
-    SharedLibraryManager(const SharedLibraryManager &other) = delete;
-    SharedLibraryManager &operator=(const SharedLibraryManager &other) = delete;
-    SharedLibraryManager(SharedLibraryManager &&other) = delete;
-    SharedLibraryManager &operator=(SharedLibraryManager &&other) = delete;
-
-    void *getSymbol(const std::string &symbol)
-    {
-        void *sym = dlsym(_handler, symbol.c_str());
-        RT_FAIL_IF(!sym, dlerror());
-        return sym;
-    }
-};
-
-/**
- * This indicates the various stages a device can be in:
- * - `Active`   : The device is added to the device pool and the `ExecutionContext` device pointer
- *                (`RTD_PTR`) points to this device instance. The CAPI routines have only access to
- *                one single active device per thread via `RTD_PTR`.
- * - `Inactive`  : The device is deactivated meaning `RTD_PTR` does not point to this device.
- *                 The device is not removed from the pool, allowing the `ExecutionContext` manager
- *                 to reuse this device in a multi-qnode workflow when another device with identical
- *                 specifications is requested.
- */
-enum class RTDeviceStatus : uint8_t {
-    Active = 0,
-    Inactive,
-};
-
-extern "C" Catalyst::Runtime::QuantumDevice *GenericDeviceFactory(const char *kwargs);
-
-/**
- * Runtime Device data-class.
- *
- * This class introduces an interface for constructed devices by the `ExecutionContext`
- * manager. This includes the device name, library, kwargs, and a shared pointer to the
- * `QuantumDevice` entry point.
- */
-class RTDevice {
-  private:
-    std::string rtd_lib;
-    std::string rtd_name;
-    std::string rtd_kwargs;
-
-    std::unique_ptr<SharedLibraryManager> rtd_dylib{nullptr};
-    std::unique_ptr<QuantumDevice> rtd_qdevice{nullptr};
-
-    RTDeviceStatus status{RTDeviceStatus::Inactive};
-
-    static void _complete_dylib_os_extension(std::string &rtd_lib, const std::string &name) noexcept
-    {
-#ifdef __linux__
-        rtd_lib = "librtd_" + name + ".so";
-#elif defined(__APPLE__)
-        rtd_lib = "librtd_" + name + ".dylib";
-#endif
-    }
-
-    static void _pl2runtime_device_info(std::string &rtd_lib, std::string &rtd_name) noexcept
-    {
-        // The following if-elif is required for C++ tests where these backend devices
-        // are linked in the interface library of the runtime. (check runtime/CMakeLists.txt)
-        // Besides, this provides support for runtime device (RTD) libraries added to the system
-        // path. This maintains backward compatibility for specifying a device using its name.
-        // TODO: This support may need to be removed after updating the C++ unit tests.
-        if (rtd_lib == "null.qubit") {
-            rtd_name = "NullQubit";
-            _complete_dylib_os_extension(rtd_lib, "null_qubit");
-        }
-        else if (rtd_lib == "lightning.qubit") {
-            rtd_name = "LightningSimulator";
-            _complete_dylib_os_extension(rtd_lib, "lightning");
-        }
-        else if (rtd_lib == "braket.aws.qubit" || rtd_lib == "braket.local.qubit") {
-            rtd_name = "OpenQasmDevice";
-            _complete_dylib_os_extension(rtd_lib, "openqasm");
-        }
-    }
-
-  public:
-    explicit RTDevice(std::string _rtd_lib, std::string _rtd_name = {},
-                      std::string _rtd_kwargs = {})
-        : rtd_lib(std::move(_rtd_lib)), rtd_name(std::move(_rtd_name)),
-          rtd_kwargs(std::move(_rtd_kwargs))
-    {
-        _pl2runtime_device_info(rtd_lib, rtd_name);
-    }
-
-    explicit RTDevice(std::string_view _rtd_lib, std::string_view _rtd_name,
-                      std::string_view _rtd_kwargs)
-        : rtd_lib(_rtd_lib), rtd_name(_rtd_name), rtd_kwargs(_rtd_kwargs)
-    {
-        _pl2runtime_device_info(rtd_lib, rtd_name);
-    }
-
-    ~RTDevice() = default;
-    RTDevice(const RTDevice &other) = delete;
-    RTDevice &operator=(const RTDevice &other) = delete;
-    RTDevice(RTDevice &&other) = delete;
-    RTDevice &operator=(RTDevice &&other) = delete;
-
-    auto operator==(const RTDevice &other) const -> bool
-    {
-        return (this->rtd_lib == other.rtd_lib && this->rtd_name == other.rtd_name) &&
-               this->rtd_kwargs == other.rtd_kwargs;
-    }
-
-    [[nodiscard]] auto getQuantumDevicePtr() -> const std::unique_ptr<QuantumDevice> &
-    {
-        if (rtd_qdevice) {
-            return rtd_qdevice;
-        }
-
-        rtd_dylib = std::make_unique<SharedLibraryManager>(rtd_lib);
-        std::string factory_name{rtd_name + "Factory"};
-        void *f_ptr = rtd_dylib->getSymbol(factory_name);
-        rtd_qdevice = std::unique_ptr<QuantumDevice>(
-            (f_ptr != nullptr)
-                ? reinterpret_cast<decltype(GenericDeviceFactory) *>(f_ptr)(rtd_kwargs.c_str())
-                : nullptr);
-        return rtd_qdevice;
-    }
-
-    [[nodiscard]] auto getDeviceInfo() const -> std::tuple<std::string, std::string, std::string>
-    {
-        return {rtd_lib, rtd_name, rtd_kwargs};
-    }
-
-    [[nodiscard]] auto getDeviceName() const -> const std::string & { return rtd_name; }
-
-    void setDeviceStatus(RTDeviceStatus new_status) noexcept { status = new_status; }
-
-    [[nodiscard]] auto getDeviceStatus() const -> RTDeviceStatus { return status; }
-
-    friend std::ostream &operator<<(std::ostream &os, const RTDevice &device)
-    {
-        os << "RTD, name: " << device.rtd_name << " lib: " << device.rtd_lib
-           << " kwargs: " << device.rtd_kwargs;
-        return os;
-    }
-};
-
-class ExecutionContext final {
-  private:
-    // Device pool
-    std::vector<std::shared_ptr<RTDevice>> device_pool;
-    std::mutex pool_mu; // To protect device_pool
-
-    bool initial_tape_recorder_status{false};
-
-    // ExecutionContext pointers
-    std::unique_ptr<MemoryManager> memory_man_ptr{nullptr};
-
-    // PRNG
-    uint32_t *seed;
-    std::mt19937 gen;
-
-  public:
-    explicit ExecutionContext(uint32_t *seed = nullptr) : seed(seed)
-    {
-        memory_man_ptr = std::make_unique<MemoryManager>();
-
-        if (this->seed != nullptr) {
-            this->gen = std::mt19937(*seed);
-        }
-    }
-
-    ~ExecutionContext() = default;
-    ExecutionContext(const ExecutionContext &other) = delete;
-    ExecutionContext &operator=(const ExecutionContext &other) = delete;
-    ExecutionContext(ExecutionContext &&other) = delete;
-    ExecutionContext &operator=(ExecutionContext &&other) = delete;
-
-    void setDeviceRecorderStatus(bool status) noexcept { initial_tape_recorder_status = status; }
-
-    [[nodiscard]] auto getDeviceRecorderStatus() const -> bool
-    {
-        return initial_tape_recorder_status;
-    }
-
-    [[nodiscard]] auto getMemoryManager() const -> const std::unique_ptr<MemoryManager> &
-    {
-        return memory_man_ptr;
-    }
-
-    [[nodiscard]] auto getOrCreateDevice(std::string_view rtd_lib, std::string_view rtd_name,
-                                         std::string_view rtd_kwargs)
-        -> const std::shared_ptr<RTDevice> &
-    {
-        std::lock_guard<std::mutex> lock(pool_mu);
-
-        auto device = std::make_shared<RTDevice>(rtd_lib, rtd_name, rtd_kwargs);
-
-        const size_t key = device_pool.size();
-        for (size_t i = 0; i < key; i++) {
-            if (device_pool[i]->getDeviceStatus() == RTDeviceStatus::Inactive &&
-                *device_pool[i] == *device) {
-                device_pool[i]->setDeviceStatus(RTDeviceStatus::Active);
-                return device_pool[i];
-            }
-        }
-
-        RT_ASSERT(device->getQuantumDevicePtr());
-
-        // Add a new device
-        device->setDeviceStatus(RTDeviceStatus::Active);
-        if (this->seed != nullptr) {
-            device->getQuantumDevicePtr()->SetDevicePRNG(&(this->gen));
-        }
-        else {
-            device->getQuantumDevicePtr()->SetDevicePRNG(nullptr);
-        }
-        device_pool.push_back(device);
-
-        return device_pool[key];
-    }
-
-    [[nodiscard]] auto getOrCreateDevice(const std::string &rtd_lib,
-                                         const std::string &rtd_name = {},
-                                         const std::string &rtd_kwargs = {})
-        -> const std::shared_ptr<RTDevice> &
-    {
-        return getOrCreateDevice(std::string_view{rtd_lib}, std::string_view{rtd_name},
-                                 std::string_view{rtd_kwargs});
-    }
-
-    [[nodiscard]] auto getDevice(size_t device_key) -> const std::shared_ptr<RTDevice> &
-    {
-        std::lock_guard<std::mutex> lock(pool_mu);
-        RT_FAIL_IF(device_key >= device_pool.size(), "Invalid device_key");
-        return device_pool[device_key];
-    }
-
-    void deactivateDevice(RTDevice *RTD_PTR)
-    {
-        std::lock_guard<std::mutex> lock(pool_mu);
-        RTD_PTR->setDeviceStatus(RTDeviceStatus::Inactive);
-    }
-};
-} // namespace Catalyst::Runtime
diff --git a/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp b/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp
deleted file mode 100644
index 481da78..0000000
--- a/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright 2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <cstddef>
-
-#include "mlir/ExecutionEngine/RunnerUtils.h"
-
-extern "C" {
-void *_mlir_memref_to_llvm_alloc(size_t size);
-void *_mlir_memref_to_llvm_aligned_alloc(size_t alignment, size_t size);
-bool _mlir_memory_transfer(void *);
-void _mlir_memref_to_llvm_free(void *ptr);
-}
-
-// MemRef type definition
-template <typename T, size_t R> struct MemRefT {
-    T *data_allocated;
-    T *data_aligned;
-    size_t offset;
-    size_t sizes[R];
-    size_t strides[R];
-};
-
-template <typename T>
-inline void printMemref(const UnrankedMemRefType<T> &memref, bool printDescriptor = false)
-{
-    auto m = DynamicMemRefType<T>(memref);
-    if (printDescriptor) {
-        std::cout << "MemRef: ";
-        printMemRefMetaData(std::cout, m);
-        std::cout << " data =" << std::endl;
-    }
-    impl::MemRefDataPrinter<T>::print(std::cout, m.data, m.rank, m.rank, m.offset, m.sizes,
-                                      m.strides);
-}
diff --git a/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp b/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp
deleted file mode 100644
index 8c1e019..0000000
--- a/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp
+++ /dev/null
@@ -1,1012 +0,0 @@
-// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cstdarg>
-#include <cstdlib>
-#include <ctime>
-
-#include <bitset>
-#include <stdexcept>
-
-#include <memory>
-#include <ostream>
-#include <string_view>
-
-#include "mlir/ExecutionEngine/CRunnerUtils.h"
-
-#include "Exception.hpp"
-#include "QuantumDevice.hpp"
-
-#include "ExecutionContext.hpp"
-#include "MemRefUtils.hpp"
-#include "Timer.hpp"
-
-#include "RuntimeCAPI.h"
-
-namespace Catalyst::Runtime {
-
-/**
- * @brief Global quantum device unique pointer.
- */
-static std::unique_ptr<ExecutionContext> CTX = nullptr;
-
-/**
- * @brief Thread local device pointer with internal linkage.
- */
-thread_local static RTDevice *RTD_PTR = nullptr;
-
-bool getModifiersAdjoint(const Modifiers *modifiers)
-{
-    return !modifiers ? false : modifiers->adjoint;
-}
-
-std::vector<QubitIdType> getModifiersControlledWires(const Modifiers *modifiers)
-{
-    return !modifiers ? std::vector<QubitIdType>()
-                      : std::vector<QubitIdType>(
-                            reinterpret_cast<QubitIdType *>(modifiers->controlled_wires),
-                            reinterpret_cast<QubitIdType *>(modifiers->controlled_wires) +
-                                modifiers->num_controlled);
-}
-
-std::vector<bool> getModifiersControlledValues(const Modifiers *modifiers)
-{
-    return !modifiers ? std::vector<bool>()
-                      : std::vector<bool>(modifiers->controlled_values,
-                                          modifiers->controlled_values + modifiers->num_controlled);
-}
-
-#define MODIFIERS_ARGS(mod)                                                                        \
-    getModifiersAdjoint(mod), getModifiersControlledWires(mod), getModifiersControlledValues(mod)
-
-/**
- * @brief Initialize the device instance and update the value of RTD_PTR
- * to the new initialized device pointer.
- */
-[[nodiscard]] bool initRTDevicePtr(std::string_view rtd_lib, std::string_view rtd_name,
-                                   std::string_view rtd_kwargs)
-{
-    auto &&device = CTX->getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs);
-    if (device) {
-        RTD_PTR = device.get();
-        return RTD_PTR ? true : false;
-    }
-    return false;
-}
-
-/**
- * @brief get the active device.
- */
-auto getQuantumDevicePtr() -> const std::unique_ptr<QuantumDevice> &
-{
-    return RTD_PTR->getQuantumDevicePtr();
-}
-
-/**
- * @brief Inactivate the active device instance.
- */
-void deactivateDevice()
-{
-    CTX->deactivateDevice(RTD_PTR);
-    RTD_PTR = nullptr;
-}
-} // namespace Catalyst::Runtime
-
-extern "C" {
-
-using namespace Catalyst::Runtime;
-using timer = catalyst::utils::Timer;
-
-void __catalyst_inactive_callback(int64_t identifier, int64_t argc, int64_t retc, ...)
-{
-    // LIBREGISTRY is a compile time macro. It is defined based on the output
-    // name of the callback library. And since it is stored in the same location
-    // as this library, it shares the ORIGIN variable. Do a `git grep LIBREGISTRY`
-    // to find its definition in the CMakeFiles.
-    // It is the name of the library that contains the callbackCall implementation.
-    // The reason why this is using dlopen is because we have historically wanted
-    // to avoid a dependency of python in the runtime.
-    // With dlopen, we leave the possibility of linking against the runtime without
-    // linking with LIBREGISTRY which is implemented as a pybind11 module.
-    //
-    // The only restriction is that there should be no calls to pyregsitry.
-    //
-    // This function cannot be tested from the runtime tests because there would be no valid python
-    // function to callback...
-    void *handle = dlopen(LIBREGISTRY, RTLD_LAZY);
-    if (!handle) {
-        char *err_msg = dlerror();
-        RT_FAIL(err_msg);
-    }
-
-    void (*callbackCall)(int64_t, int64_t, int64_t, va_list);
-    typedef void (*func_ptr_t)(int64_t, int64_t, int64_t, va_list);
-    callbackCall = (func_ptr_t)dlsym(handle, "callbackCall");
-    if (!callbackCall) {
-        char *err_msg = dlerror();
-        RT_FAIL(err_msg);
-    }
-
-    va_list args;
-    va_start(args, retc);
-    callbackCall(identifier, argc, retc, args);
-    va_end(args);
-    dlclose(handle);
-}
-
-void __catalyst__host__rt__unrecoverable_error()
-{
-    RT_FAIL("Unrecoverable error from asynchronous execution of multiple quantum programs.");
-}
-
-void *_mlir_memref_to_llvm_alloc(size_t size)
-{
-    void *ptr = malloc(size);
-    CTX->getMemoryManager()->insert(ptr);
-    return ptr;
-}
-
-void *_mlir_memref_to_llvm_aligned_alloc(size_t alignment, size_t size)
-{
-    void *ptr = aligned_alloc(alignment, size);
-    CTX->getMemoryManager()->insert(ptr);
-    return ptr;
-}
-
-bool _mlir_memory_transfer(void *ptr)
-{
-    if (!CTX->getMemoryManager()->contains(ptr)) {
-        return false;
-    }
-    CTX->getMemoryManager()->erase(ptr);
-    return true;
-}
-
-void _mlir_memref_to_llvm_free(void *ptr)
-{
-    CTX->getMemoryManager()->erase(ptr);
-    free(ptr);
-}
-
-void __catalyst__rt__print_string(char *string)
-{
-    if (!string) {
-        std::cout << "None" << std::endl;
-        return;
-    }
-    std::cout << string << std::endl;
-}
-
-void __catalyst__rt__assert_bool(bool p, char *s) { RT_FAIL_IF(!p, s); }
-
-void __catalyst__rt__print_tensor(OpaqueMemRefT *c_memref, bool printDescriptor)
-{
-    if (c_memref->datatype == NumericType::idx) {
-        printMemref<impl::index_type>({c_memref->rank, c_memref->descriptor}, printDescriptor);
-    }
-    else if (c_memref->datatype == NumericType::i1) {
-        printMemref<bool>({c_memref->rank, c_memref->descriptor}, printDescriptor);
-    }
-    else if (c_memref->datatype == NumericType::i8) {
-        printMemref<int8_t>({c_memref->rank, c_memref->descriptor}, printDescriptor);
-    }
-    else if (c_memref->datatype == NumericType::i16) {
-        printMemref<int16_t>({c_memref->rank, c_memref->descriptor}, printDescriptor);
-    }
-    else if (c_memref->datatype == NumericType::i32) {
-        printMemref<int32_t>({c_memref->rank, c_memref->descriptor}, printDescriptor);
-    }
-    else if (c_memref->datatype == NumericType::i64) {
-        printMemref<int64_t>({c_memref->rank, c_memref->descriptor}, printDescriptor);
-    }
-    else if (c_memref->datatype == NumericType::f32) {
-        printMemref<float>({c_memref->rank, c_memref->descriptor}, printDescriptor);
-    }
-    else if (c_memref->datatype == NumericType::f64) {
-        printMemref<double>({c_memref->rank, c_memref->descriptor}, printDescriptor);
-    }
-    else if (c_memref->datatype == NumericType::c64) {
-        printMemref<impl::complex32>({c_memref->rank, c_memref->descriptor}, printDescriptor);
-    }
-    else if (c_memref->datatype == NumericType::c128) {
-        printMemref<impl::complex64>({c_memref->rank, c_memref->descriptor}, printDescriptor);
-    }
-    else {
-        RT_FAIL("Unkown numeric type encoding for array printing.");
-    }
-
-    std::cout << std::endl;
-}
-
-void __catalyst__rt__fail_cstr(const char *cstr) { RT_FAIL(cstr); }
-
-void __catalyst__rt__initialize(uint32_t *seed) { CTX = std::make_unique<ExecutionContext>(seed); }
-
-void __catalyst__rt__finalize()
-{
-    RTD_PTR = nullptr;
-    CTX.reset(nullptr);
-}
-
-static int __catalyst__rt__device_init__impl(int8_t *rtd_lib, int8_t *rtd_name, int8_t *rtd_kwargs,
-                                             int64_t shots)
-{
-    // Device library cannot be a nullptr
-    RT_FAIL_IF(!rtd_lib, "Invalid device library");
-    RT_FAIL_IF(!CTX, "Invalid use of the global driver before initialization");
-    RT_FAIL_IF(RTD_PTR, "Cannot re-initialize an ACTIVE device: Consider using "
-                        "__catalyst__rt__device_release before __catalyst__rt__device_init");
-
-    const std::vector<std::string_view> args{
-        reinterpret_cast<char *>(rtd_lib), (rtd_name ? reinterpret_cast<char *>(rtd_name) : ""),
-        (rtd_kwargs ? reinterpret_cast<char *>(rtd_kwargs) : "")};
-    RT_FAIL_IF(!initRTDevicePtr(args[0], args[1], args[2]),
-               "Failed initialization of the backend device");
-    getQuantumDevicePtr()->SetDeviceShots(shots);
-    if (CTX->getDeviceRecorderStatus()) {
-        getQuantumDevicePtr()->StartTapeRecording();
-    }
-    return 0;
-}
-
-void __catalyst__rt__device_init(int8_t *rtd_lib, int8_t *rtd_name, int8_t *rtd_kwargs,
-                                 int64_t shots)
-{
-    timer::timer(__catalyst__rt__device_init__impl, "device_init", /* add_endl */ true, rtd_lib,
-                 rtd_name, rtd_kwargs, shots);
-}
-
-static int __catalyst__rt__device_release__impl()
-{
-    RT_FAIL_IF(!CTX, "Cannot release an ACTIVE device out of scope of the global driver");
-    // TODO: This will be used for the async support
-    deactivateDevice();
-    return 0;
-}
-
-void __catalyst__rt__device_release()
-{
-    timer::timer(__catalyst__rt__device_release__impl, "device_release", /* add_endl */ true);
-}
-
-void __catalyst__rt__print_state() { getQuantumDevicePtr()->PrintState(); }
-
-void __catalyst__rt__toggle_recorder(bool status)
-{
-    CTX->setDeviceRecorderStatus(status);
-    if (!RTD_PTR) {
-        return;
-    }
-
-    if (status) {
-        getQuantumDevicePtr()->StartTapeRecording();
-    }
-    else {
-        getQuantumDevicePtr()->StopTapeRecording();
-    }
-}
-
-static QUBIT *__catalyst__rt__qubit_allocate__impl()
-{
-    RT_ASSERT(getQuantumDevicePtr() != nullptr);
-    RT_ASSERT(CTX->getMemoryManager() != nullptr);
-
-    return reinterpret_cast<QUBIT *>(getQuantumDevicePtr()->AllocateQubit());
-}
-
-QUBIT *__catalyst__rt__qubit_allocate()
-{
-    return timer::timer(__catalyst__rt__qubit_allocate__impl, "qubit_allocate",
-                        /* add_endl */ true);
-}
-
-static QirArray *__catalyst__rt__qubit_allocate_array__impl(int64_t num_qubits)
-{
-    RT_ASSERT(getQuantumDevicePtr() != nullptr);
-    RT_ASSERT(CTX->getMemoryManager() != nullptr);
-    RT_ASSERT(num_qubits >= 0);
-
-    // For first prototype, we just want to make this work.
-    // But ideally, I think the device should determine the representation.
-    // Essentially just forward this to the device library.
-    // And the device library can choose how to handle everything.
-    std::vector<QubitIdType> qubit_vector = getQuantumDevicePtr()->AllocateQubits(num_qubits);
-
-    // I don't like this copying.
-    std::vector<QubitIdType> *qubit_vector_ptr =
-        new std::vector<QubitIdType>(qubit_vector.begin(), qubit_vector.end());
-
-    // Because this function is interfacing with C
-    // I think we should return a trivial-type
-    //     https://en.cppreference.com/w/cpp/named_req/TrivialType
-    // Why should we return a trivial type?
-    //
-    // Paraphrasing from stackoverflow: https://stackoverflow.com/a/72409589
-    //     extern "C" will avoid name mangling from happening.
-    //     It doesn't prevent a function from returning or accepting a C++ type.
-    //     But the calling language needs to understand the data-layout for the
-    //     type being returned.
-    //     For non-trivial types, this will be difficult to impossible.
-    return (QirArray *)qubit_vector_ptr;
-}
-
-QirArray *__catalyst__rt__qubit_allocate_array(int64_t num_qubits)
-{
-    return timer::timer(__catalyst__rt__qubit_allocate_array__impl, "qubit_allocate_array",
-                        /* add_endl */ true, num_qubits);
-}
-
-static int __catalyst__rt__qubit_release__impl(QUBIT *qubit)
-{
-    getQuantumDevicePtr()->ReleaseQubit(reinterpret_cast<QubitIdType>(qubit));
-    return 0;
-}
-
-void __catalyst__rt__qubit_release(QUBIT *qubit)
-{
-    timer::timer(__catalyst__rt__qubit_release__impl, "qubit_release",
-                 /* add_endl */ true, qubit);
-}
-
-static int __catalyst__rt__qubit_release_array__impl(QirArray *qubit_array)
-{
-    getQuantumDevicePtr()->ReleaseAllQubits();
-    std::vector<QubitIdType> *qubit_array_ptr =
-        reinterpret_cast<std::vector<QubitIdType> *>(qubit_array);
-    delete qubit_array_ptr;
-    return 0;
-}
-
-void __catalyst__rt__qubit_release_array(QirArray *qubit_array)
-{
-    timer::timer(__catalyst__rt__qubit_release_array__impl, "qubit_release_array",
-                 /* add_endl */ true, qubit_array);
-}
-
-int64_t __catalyst__rt__num_qubits()
-{
-    return static_cast<int64_t>(getQuantumDevicePtr()->GetNumQubits());
-}
-
-bool __catalyst__rt__result_equal(RESULT *r0, RESULT *r1) { return (r0 == r1) || (*r0 == *r1); }
-
-RESULT *__catalyst__rt__result_get_one() { return getQuantumDevicePtr()->One(); }
-
-RESULT *__catalyst__rt__result_get_zero() { return getQuantumDevicePtr()->Zero(); }
-
-void __catalyst__qis__Gradient(int64_t numResults, /* results = */...)
-{
-    RT_ASSERT(numResults >= 0);
-    using ResultType = MemRefT<double, 1>;
-
-    std::vector<ResultType *> mem_ptrs;
-    mem_ptrs.reserve(numResults);
-    va_list args;
-    va_start(args, numResults);
-    for (int64_t i = 0; i < numResults; i++) {
-        mem_ptrs.push_back(va_arg(args, ResultType *));
-    }
-    va_end(args);
-
-    std::vector<DataView<double, 1>> mem_views;
-    mem_views.reserve(numResults);
-    for (auto *mr : mem_ptrs) {
-        mem_views.emplace_back(mr->data_aligned, mr->offset, mr->sizes, mr->strides);
-    }
-
-    // num_observables * num_train_params
-    getQuantumDevicePtr()->Gradient(mem_views, {});
-}
-
-void __catalyst__qis__Gradient_params(MemRefT_int64_1d *params, int64_t numResults,
-                                      /* results = */...)
-{
-    RT_ASSERT(numResults >= 0);
-    using ResultType = MemRefT<double, 1>;
-
-    if (params == nullptr || !params->sizes[0]) {
-        RT_FAIL("Invalid number of trainable parameters");
-    }
-
-    const size_t tp_size = params->sizes[0];
-
-    // create a vector of custom trainable parameters
-    std::vector<size_t> train_params;
-    auto *params_data = params->data_aligned;
-    train_params.reserve(tp_size);
-    for (size_t i = 0; i < tp_size; i++) {
-        auto p = params_data[i];
-        RT_FAIL_IF(p < 0, "trainable parameter cannot be a negative integer");
-        train_params.push_back(p);
-    }
-
-    std::vector<ResultType *> mem_ptrs;
-    mem_ptrs.reserve(numResults);
-    va_list args;
-    va_start(args, numResults);
-    for (int64_t i = 0; i < numResults; i++) {
-        mem_ptrs.push_back(va_arg(args, ResultType *));
-    }
-    va_end(args);
-
-    std::vector<DataView<double, 1>> mem_views;
-    mem_views.reserve(numResults);
-    for (auto *mr : mem_ptrs) {
-        mem_views.emplace_back(mr->data_aligned, mr->offset, mr->sizes, mr->strides);
-    }
-
-    // num_observables * num_train_params
-    getQuantumDevicePtr()->Gradient(mem_views, train_params);
-}
-
-void __catalyst__qis__GlobalPhase(double phi, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("GlobalPhase", {phi}, {}, MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__SetState(MemRefT_CplxT_double_1d *data, uint64_t numQubits, ...)
-{
-    RT_ASSERT(numQubits > 0);
-
-    va_list args;
-    va_start(args, numQubits);
-    std::vector<QubitIdType> wires(numQubits);
-    for (uint64_t i = 0; i < numQubits; i++) {
-        wires[i] = va_arg(args, QubitIdType);
-    }
-    va_end(args);
-
-    MemRefT<std::complex<double>, 1> *data_p = (MemRefT<std::complex<double>, 1> *)data;
-    DataView<std::complex<double>, 1> data_view(data_p->data_aligned, data_p->offset, data_p->sizes,
-                                                data_p->strides);
-    getQuantumDevicePtr()->SetState(data_view, wires);
-}
-
-void __catalyst__qis__SetBasisState(MemRefT_int8_1d *data, uint64_t numQubits, ...)
-{
-    RT_ASSERT(numQubits > 0);
-
-    DataView<int8_t, 1> data_view(data->data_aligned, data->offset, data->sizes, data->strides);
-
-    va_list args;
-    va_start(args, numQubits);
-    std::vector<QubitIdType> wires(numQubits);
-    for (uint64_t i = 0; i < numQubits; i++) {
-        wires[i] = va_arg(args, QubitIdType);
-    }
-    va_end(args);
-    std::unordered_set<QubitIdType> wire_set(wires.begin(), wires.end());
-    RT_FAIL_IF(wire_set.size() != numQubits, "Wires must be unique");
-    RT_FAIL_IF(data->sizes[0] != numQubits,
-               "BasisState parameter and wires must be of equal length.");
-
-    getQuantumDevicePtr()->SetBasisState(data_view, wires);
-}
-
-void __catalyst__qis__Identity(QUBIT *qubit, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("Identity", {}, {reinterpret_cast<QubitIdType>(qubit)},
-                                          MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__PauliX(QUBIT *qubit, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("PauliX", {}, {reinterpret_cast<QubitIdType>(qubit)},
-                                          MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__PauliY(QUBIT *qubit, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("PauliY", {}, {reinterpret_cast<QubitIdType>(qubit)},
-                                          MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__PauliZ(QUBIT *qubit, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("PauliZ", {}, {reinterpret_cast<QubitIdType>(qubit)},
-                                          MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__Hadamard(QUBIT *qubit, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {reinterpret_cast<QubitIdType>(qubit)},
-                                          MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__S(QUBIT *qubit, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("S", {}, {reinterpret_cast<QubitIdType>(qubit)},
-                                          MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__T(QUBIT *qubit, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("T", {}, {reinterpret_cast<QubitIdType>(qubit)},
-                                          MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__PhaseShift(double theta, QUBIT *qubit, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation(
-        "PhaseShift", {theta}, {reinterpret_cast<QubitIdType>(qubit)}, MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__RX(double theta, QUBIT *qubit, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("RX", {theta}, {reinterpret_cast<QubitIdType>(qubit)},
-                                          MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__RY(double theta, QUBIT *qubit, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("RY", {theta}, {reinterpret_cast<QubitIdType>(qubit)},
-                                          MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__RZ(double theta, QUBIT *qubit, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("RZ", {theta}, {reinterpret_cast<QubitIdType>(qubit)},
-                                          MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__Rot(double phi, double theta, double omega, QUBIT *qubit,
-                          const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("Rot", {phi, theta, omega},
-                                          {reinterpret_cast<QubitIdType>(qubit)},
-                                          MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__CNOT(QUBIT *control, QUBIT *target, const Modifiers *modifiers)
-{
-    RT_FAIL_IF(control == target,
-               "Invalid input for CNOT gate. Control and target qubit operands must be distinct.");
-    getQuantumDevicePtr()->NamedOperation("CNOT", {},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__CY(QUBIT *control, QUBIT *target, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("CY", {},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__CZ(QUBIT *control, QUBIT *target, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("CZ", {},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__SWAP(QUBIT *control, QUBIT *target, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("SWAP", {},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__IsingXX(double theta, QUBIT *control, QUBIT *target,
-                              const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("IsingXX", {theta},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__IsingYY(double theta, QUBIT *control, QUBIT *target,
-                              const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("IsingYY", {theta},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__IsingXY(double theta, QUBIT *control, QUBIT *target,
-                              const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("IsingXY", {theta},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__IsingZZ(double theta, QUBIT *control, QUBIT *target,
-                              const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("IsingZZ", {theta},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__ControlledPhaseShift(double theta, QUBIT *control, QUBIT *target,
-                                           const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("ControlledPhaseShift", {theta},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__CRX(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("CRX", {theta},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__CRY(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("CRY", {theta},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__CRZ(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("CRZ", {theta},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__CRot(double phi, double theta, double omega, QUBIT *control, QUBIT *target,
-                           const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("CRot", {phi, theta, omega},
-                                          {/* control = */ reinterpret_cast<QubitIdType>(control),
-                                           /* target = */ reinterpret_cast<QubitIdType>(target)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__CSWAP(QUBIT *control, QUBIT *aswap, QUBIT *bswap, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("CSWAP", {},
-                                          {reinterpret_cast<QubitIdType>(control),
-                                           reinterpret_cast<QubitIdType>(aswap),
-                                           reinterpret_cast<QubitIdType>(bswap)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__Toffoli(QUBIT *wire0, QUBIT *wire1, QUBIT *wire2, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation("Toffoli", {},
-                                          {reinterpret_cast<QubitIdType>(wire0),
-                                           reinterpret_cast<QubitIdType>(wire1),
-                                           reinterpret_cast<QubitIdType>(wire2)},
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__MultiRZ(double theta, const Modifiers *modifiers, int64_t numQubits, ...)
-{
-    RT_ASSERT(numQubits >= 0);
-
-    va_list args;
-    va_start(args, numQubits);
-    std::vector<QubitIdType> wires(numQubits);
-    for (int64_t i = 0; i < numQubits; i++) {
-        wires[i] = va_arg(args, QubitIdType);
-    }
-    va_end(args);
-
-    getQuantumDevicePtr()->NamedOperation("MultiRZ", {theta}, wires,
-                                          /* modifiers */ MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__ISWAP(QUBIT *wire0, QUBIT *wire1, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation(
-        "ISWAP", {}, {reinterpret_cast<QubitIdType>(wire0), reinterpret_cast<QubitIdType>(wire1)},
-        MODIFIERS_ARGS(modifiers));
-}
-
-void __catalyst__qis__PSWAP(double phi, QUBIT *wire0, QUBIT *wire1, const Modifiers *modifiers)
-{
-    getQuantumDevicePtr()->NamedOperation(
-        "PSWAP", {phi},
-        {reinterpret_cast<QubitIdType>(wire0), reinterpret_cast<QubitIdType>(wire1)},
-        MODIFIERS_ARGS(modifiers));
-}
-
-static void _qubitUnitary_impl(MemRefT_CplxT_double_2d *matrix, int64_t numQubits,
-                               std::vector<std::complex<double>> &coeffs,
-                               std::vector<QubitIdType> &wires, va_list *args)
-{
-    const size_t num_rows = matrix->sizes[0];
-    const size_t num_col = matrix->sizes[1];
-    const size_t expected_size = std::pow(2, numQubits);
-
-    if (num_rows != expected_size || num_col != expected_size) {
-        RT_FAIL("Invalid given QubitUnitary matrix; "
-                "The size of the matrix must be pow(2, numWires) * pow(2, numWires).");
-    }
-
-    wires.reserve(numQubits);
-    for (int64_t i = 0; i < numQubits; i++) {
-        wires.push_back(va_arg(*args, QubitIdType));
-    }
-
-    const size_t matrix_size = num_rows * num_col;
-    coeffs.reserve(matrix_size);
-    for (size_t i = 0; i < matrix_size; i++) {
-        coeffs.emplace_back(matrix->data_aligned[i].real, matrix->data_aligned[i].imag);
-    }
-}
-
-void __catalyst__qis__QubitUnitary(MemRefT_CplxT_double_2d *matrix, const Modifiers *modifiers,
-                                   int64_t numQubits, /*qubits*/...)
-{
-    RT_ASSERT(numQubits >= 0);
-
-    if (matrix == nullptr) {
-        RT_FAIL("The QubitUnitary matrix must be initialized");
-    }
-
-    if (numQubits > __catalyst__rt__num_qubits()) {
-        RT_FAIL("Invalid number of wires");
-    }
-
-    va_list args;
-    std::vector<std::complex<double>> coeffs;
-    std::vector<QubitIdType> wires;
-    va_start(args, numQubits);
-    _qubitUnitary_impl(matrix, numQubits, coeffs, wires, &args);
-    va_end(args);
-    return getQuantumDevicePtr()->MatrixOperation(coeffs, wires, MODIFIERS_ARGS(modifiers));
-}
-
-ObsIdType __catalyst__qis__NamedObs(int64_t obsId, QUBIT *wire)
-{
-    return getQuantumDevicePtr()->Observable(static_cast<ObsId>(obsId), {},
-                                             {reinterpret_cast<QubitIdType>(wire)});
-}
-
-ObsIdType __catalyst__qis__HermitianObs(MemRefT_CplxT_double_2d *matrix, int64_t numQubits, ...)
-{
-    RT_ASSERT(numQubits >= 0);
-
-    if (matrix == nullptr) {
-        RT_FAIL("The Hermitian matrix must be initialized");
-    }
-
-    const size_t num_rows = matrix->sizes[0];
-    const size_t num_col = matrix->sizes[1];
-    const size_t expected_size = std::pow(2, numQubits);
-
-    if (num_rows != expected_size || num_col != expected_size) {
-        RT_FAIL("Invalid given Hermitian matrix; "
-                "The size of the matrix must be pow(2, numWires) * pow(2, numWires).");
-    }
-
-    va_list args;
-    va_start(args, numQubits);
-    std::vector<QubitIdType> wires(numQubits);
-    for (int64_t i = 0; i < numQubits; i++) {
-        wires[i] = va_arg(args, QubitIdType);
-    }
-    va_end(args);
-
-    if (numQubits > __catalyst__rt__num_qubits()) {
-        RT_FAIL("Invalid number of wires");
-    }
-
-    const size_t matrix_size = num_rows * num_col;
-    std::vector<std::complex<double>> coeffs;
-    coeffs.reserve(matrix_size);
-    for (size_t i = 0; i < matrix_size; i++) {
-        coeffs.emplace_back(matrix->data_aligned[i].real, matrix->data_aligned[i].imag);
-    }
-
-    return getQuantumDevicePtr()->Observable(ObsId::Hermitian, coeffs, wires);
-}
-
-ObsIdType __catalyst__qis__TensorObs(int64_t numObs, /*obsKeys*/...)
-{
-    if (numObs < 1) {
-        RT_FAIL("Invalid number of observables to create TensorProdObs");
-    }
-
-    va_list args;
-    va_start(args, numObs);
-    std::vector<ObsIdType> obsKeys;
-    obsKeys.reserve(numObs);
-    for (int64_t i = 0; i < numObs; i++) {
-        obsKeys.push_back(va_arg(args, ObsIdType));
-    }
-    va_end(args);
-
-    return getQuantumDevicePtr()->TensorObservable(obsKeys);
-}
-
-ObsIdType __catalyst__qis__HamiltonianObs(MemRefT_double_1d *coeffs, int64_t numObs,
-                                          /*obsKeys*/...)
-{
-    RT_ASSERT(numObs >= 0);
-
-    if (coeffs == nullptr) {
-        RT_FAIL("Invalid coefficients for computing Hamiltonian; "
-                "The coefficients list must be initialized.");
-    }
-
-    const size_t coeffs_size = coeffs->sizes[0];
-
-    if (static_cast<size_t>(numObs) != coeffs_size) {
-        RT_FAIL("Invalid coefficients for computing Hamiltonian; "
-                "The number of coefficients and observables must be equal.");
-    }
-
-    va_list args;
-    va_start(args, numObs);
-    std::vector<ObsIdType> obsKeys;
-    obsKeys.reserve(numObs);
-    for (int64_t i = 0; i < numObs; i++) {
-        obsKeys.push_back(va_arg(args, ObsIdType));
-    }
-    va_end(args);
-
-    std::vector<double> coeffs_vec(coeffs->data_aligned, coeffs->data_aligned + coeffs_size);
-    return getQuantumDevicePtr()->HamiltonianObservable(coeffs_vec, obsKeys);
-}
-
-RESULT *__catalyst__qis__Measure(QUBIT *wire, int32_t postselect)
-{
-    std::optional<int32_t> postselectOpt{postselect};
-
-    // Any value different to 0 or 1 denotes absence of postselect, and it is hence turned into
-    // std::nullopt at the C++ interface
-    if (postselect != 0 && postselect != 1) {
-        postselectOpt = std::nullopt;
-    }
-
-    return getQuantumDevicePtr()->Measure(reinterpret_cast<QubitIdType>(wire), postselectOpt);
-}
-
-double __catalyst__qis__Expval(ObsIdType obsKey) { return getQuantumDevicePtr()->Expval(obsKey); }
-
-double __catalyst__qis__Variance(ObsIdType obsKey) { return getQuantumDevicePtr()->Var(obsKey); }
-
-void __catalyst__qis__State(MemRefT_CplxT_double_1d *result, int64_t numQubits, ...)
-{
-    RT_ASSERT(numQubits >= 0);
-    MemRefT<std::complex<double>, 1> *result_p = (MemRefT<std::complex<double>, 1> *)result;
-
-    va_list args;
-    va_start(args, numQubits);
-    std::vector<QubitIdType> wires(numQubits);
-    for (int64_t i = 0; i < numQubits; i++) {
-        wires[i] = va_arg(args, QubitIdType);
-    }
-    va_end(args);
-
-    DataView<std::complex<double>, 1> view(result_p->data_aligned, result_p->offset,
-                                           result_p->sizes, result_p->strides);
-
-    if (wires.empty()) {
-        getQuantumDevicePtr()->State(view);
-    }
-    else {
-        RT_FAIL("Partial State-Vector not supported yet");
-        // getQuantumDevicePtr()->PartialState(stateVec,
-        // numElements, wires);
-    }
-}
-
-void __catalyst__qis__Probs(MemRefT_double_1d *result, int64_t numQubits, ...)
-{
-    RT_ASSERT(numQubits >= 0);
-    MemRefT<double, 1> *result_p = (MemRefT<double, 1> *)result;
-
-    va_list args;
-    va_start(args, numQubits);
-    std::vector<QubitIdType> wires(numQubits);
-    for (int64_t i = 0; i < numQubits; i++) {
-        wires[i] = va_arg(args, QubitIdType);
-    }
-    va_end(args);
-
-    DataView<double, 1> view(result_p->data_aligned, result_p->offset, result_p->sizes,
-                             result_p->strides);
-
-    if (wires.empty()) {
-        getQuantumDevicePtr()->Probs(view);
-    }
-    else {
-        getQuantumDevicePtr()->PartialProbs(view, wires);
-    }
-}
-
-void __catalyst__qis__Sample(MemRefT_double_2d *result, int64_t numQubits, ...)
-{
-    int64_t shots = getQuantumDevicePtr()->GetDeviceShots();
-    RT_ASSERT(shots >= 0);
-    RT_ASSERT(numQubits >= 0);
-    MemRefT<double, 2> *result_p = (MemRefT<double, 2> *)result;
-
-    va_list args;
-    va_start(args, numQubits);
-    std::vector<QubitIdType> wires(numQubits);
-    for (int64_t i = 0; i < numQubits; i++) {
-        wires[i] = va_arg(args, QubitIdType);
-    }
-    va_end(args);
-
-    DataView<double, 2> view(result_p->data_aligned, result_p->offset, result_p->sizes,
-                             result_p->strides);
-
-    if (wires.empty()) {
-        getQuantumDevicePtr()->Sample(view, shots);
-    }
-    else {
-        getQuantumDevicePtr()->PartialSample(view, wires, shots);
-    }
-}
-
-void __catalyst__qis__Counts(PairT_MemRefT_double_int64_1d *result, int64_t numQubits, ...)
-{
-    int64_t shots = getQuantumDevicePtr()->GetDeviceShots();
-    RT_ASSERT(shots >= 0);
-    RT_ASSERT(numQubits >= 0);
-    MemRefT<double, 1> *result_eigvals_p = (MemRefT<double, 1> *)&result->first;
-    MemRefT<int64_t, 1> *result_counts_p = (MemRefT<int64_t, 1> *)&result->second;
-
-    va_list args;
-    va_start(args, numQubits);
-    std::vector<QubitIdType> wires(numQubits);
-    for (int64_t i = 0; i < numQubits; i++) {
-        wires[i] = va_arg(args, QubitIdType);
-    }
-    va_end(args);
-
-    DataView<double, 1> eigvals_view(result_eigvals_p->data_aligned, result_eigvals_p->offset,
-                                     result_eigvals_p->sizes, result_eigvals_p->strides);
-    DataView<int64_t, 1> counts_view(result_counts_p->data_aligned, result_counts_p->offset,
-                                     result_counts_p->sizes, result_counts_p->strides);
-
-    if (wires.empty()) {
-        getQuantumDevicePtr()->Counts(eigvals_view, counts_view, shots);
-    }
-    else {
-        getQuantumDevicePtr()->PartialCounts(eigvals_view, counts_view, wires, shots);
-    }
-}
-
-int64_t __catalyst__rt__array_get_size_1d(QirArray *ptr)
-{
-    std::vector<QubitIdType> *qubit_vector_ptr = reinterpret_cast<std::vector<QubitIdType> *>(ptr);
-    return qubit_vector_ptr->size();
-}
-
-int8_t *__catalyst__rt__array_get_element_ptr_1d(QirArray *ptr, int64_t idx)
-{
-    std::vector<QubitIdType> *qubit_vector_ptr = reinterpret_cast<std::vector<QubitIdType> *>(ptr);
-
-    RT_ASSERT(idx >= 0);
-    std::string error_msg = "The qubit register does not contain the requested wire: ";
-    error_msg += std::to_string(idx);
-    RT_FAIL_IF(static_cast<size_t>(idx) >= qubit_vector_ptr->size(), error_msg.c_str());
-
-    QubitIdType *data = qubit_vector_ptr->data();
-    return (int8_t *)&data[idx];
-}
-}
diff --git a/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt
deleted file mode 100644
index 2c19e4a..0000000
--- a/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-# nanobind suggests including these lines to configure CMake to perform an optimized release build
-# by default unless another build type is specified. Without this addition, binding code may run
-# slowly and produce large binaries.
-# See https://nanobind.readthedocs.io/en/latest/building.html#preliminaries
-if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
-    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
-endif()
-
-# Locate nanobind
-execute_process(
-    COMMAND "${Python_EXECUTABLE}" -c "import nanobind; print(nanobind.cmake_dir())"
-    OUTPUT_VARIABLE nanobind_DIR OUTPUT_STRIP_TRAILING_WHITESPACE
-)
-find_package(nanobind CONFIG REQUIRED)
-
-# Source file list for `wrapper` module
-set(REGISTRY_SRC_FILES
-    Registry.cpp
-)
-
-# Create the Python `catalyst_callback_registry` module
-# Target the stable ABI for Python 3.12+, which reduces the number of binary wheels that must be
-# built (`STABLE_ABI` does nothing on older Python versions).
-nanobind_add_module(catalyst_callback_registry STABLE_ABI ${REGISTRY_SRC_FILES})
-
-# Use a consistant suffix ".so" rather than, e.g. ".abi3.so" (when using the Stable ABI) or
-# ".cpython-3xx-darwin.so". Doing so simplifies the process to locate it when calling
-# `dlopen(LIBREGISTRY)` in runtime/lib/capi/RuntimeCAPI.cpp.
-set_target_properties(catalyst_callback_registry PROPERTIES SUFFIX ".so")
-
-target_include_directories(catalyst_callback_registry PUBLIC ${runtime_includes})
-target_compile_definitions(catalyst_qir_qis_obj PUBLIC -DLIBREGISTRY=\"$<TARGET_FILE_NAME:catalyst_callback_registry>\")
diff --git a/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp b/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp
deleted file mode 100644
index fd4715d..0000000
--- a/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-// Copyright 2024 Xanadu Quantum Technologies Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cstdint>
-#include <cstdio>
-#include <dlfcn.h>
-#include <string>
-#include <unordered_map>
-
-#include <nanobind/nanobind.h>
-#include <nanobind/stl/string.h>
-
-namespace nb = nanobind;
-
-// From PyBind11's documentation:
-//
-//     Do you have any global variables that are pybind11 objects or invoke pybind11 functions in
-//     either their constructor or destructor? You are generally not allowed to invoke any Python
-//     function in a global static context. We recommend using lazy initialization and then
-//     intentionally leaking at the end of the program.
-//
-// https://pybind11.readthedocs.io/en/stable/advanced/misc.html#common-sources-of-global-interpreter-lock-errors
-std::unordered_map<int64_t, nb::callable> *references;
-
-std::string libmlirpath;
-
-struct UnrankedMemrefType {
-    int64_t rank;
-    void *descriptor;
-};
-
-class LibraryManager {
-    void *_handle;
-
-  public:
-    LibraryManager(std::string path)
-    {
-        this->_handle = dlopen(path.c_str(), RTLD_LAZY);
-        if (!this->_handle) {
-            throw nb::value_error(dlerror());
-        }
-    }
-
-    ~LibraryManager()
-    {
-        if (this->_handle) {
-            dlclose(this->_handle);
-        }
-    }
-
-    void operator()(long elementSize, UnrankedMemrefType *src, UnrankedMemrefType *dst)
-    {
-        void *f_ptr = dlsym(this->_handle, "memrefCopy");
-        if (!f_ptr) {
-            throw nb::value_error(dlerror());
-        }
-        typedef void (*memrefCopy_t)(int64_t, void *, void *);
-        void (*memrefCopy)(int64_t, void *, void *);
-        memrefCopy = (memrefCopy_t)(f_ptr);
-        return memrefCopy(elementSize, src, dst);
-    }
-};
-
-inline const char *ext()
-{
-#ifdef __APPLE__
-    return ".dylib";
-#elif __linux__
-    return ".so";
-#else
-#error "Only apple and linux are currently supported";
-#endif
-}
-
-std::string library_name(std::string name) { return name + ext(); }
-
-void convertResult(nb::handle tuple)
-{
-    nb::object unrankedMemrefPtrSizeTuple = tuple.attr("__getitem__")(0);
-
-    nb::object unranked_memref = unrankedMemrefPtrSizeTuple.attr("__getitem__")(0);
-    nb::object element_size = unrankedMemrefPtrSizeTuple.attr("__getitem__")(1);
-    nb::object unranked_memref_ptr_int = unranked_memref.attr("value");
-
-    void *unranked_memref_ptr = reinterpret_cast<void *>(nb::cast<long>(unranked_memref_ptr_int));
-    long e_size = nb::cast<long>(element_size);
-
-    nb::object dest = tuple.attr("__getitem__")(1);
-
-    long destAsLong = nb::cast<long>(dest);
-    void *destAsPtr = (void *)(destAsLong);
-
-    UnrankedMemrefType *src = (UnrankedMemrefType *)unranked_memref_ptr;
-    UnrankedMemrefType destMemref = {src->rank, destAsPtr};
-
-    std::string libpath = libmlirpath + library_name("/libmlir_c_runner_utils");
-    LibraryManager memrefCopy(libpath);
-    memrefCopy(e_size, src, &destMemref);
-}
-
-void convertResults(nb::list results, nb::list allocated)
-{
-    auto builtins = nb::module_::import_("builtins");
-    auto zip = builtins.attr("zip");
-    for (nb::handle obj : zip(results, allocated)) {
-        convertResult(obj);
-    }
-}
-
-extern "C" {
-[[gnu::visibility("default")]] void callbackCall(int64_t identifier, int64_t count, int64_t retc,
-                                                 va_list args)
-{
-    nb::gil_scoped_acquire lock;
-    auto it = references->find(identifier);
-    if (it == references->end()) {
-        throw std::invalid_argument("Callback called with invalid identifier");
-    }
-    auto lambda = it->second;
-
-    nb::list flat_args;
-    for (int i = 0; i < count; i++) {
-        int64_t ptr = va_arg(args, int64_t);
-        flat_args.append(ptr);
-    }
-
-    nb::list flat_results = nb::list(lambda(flat_args));
-
-    // We have a flat list of return values.
-    // These returns **may** be array views to
-    // the very same memrefs that we passed as inputs.
-    // As a first prototype, let's copy these values.
-    // I think it is best to always copy them because
-    // of aliasing. Let's just copy them to guarantee
-    // no aliasing issues. We can revisit this as an optimization
-    // and allowing these to alias.
-    nb::list flat_returns_allocated_compiler;
-    for (int i = 0; i < retc; i++) {
-        int64_t ptr = va_arg(args, int64_t);
-        flat_returns_allocated_compiler.append(ptr);
-    }
-    convertResults(flat_results, flat_returns_allocated_compiler);
-}
-}
-
-void setMLIRLibPath(std::string path) { libmlirpath = path; }
-
-auto registerImpl(nb::callable f)
-{
-    // Do we need to see if it is already present or can we just override it? Just override is fine.
-    // Does python reuse id's? Yes.
-    // But only after they have been garbaged collected.
-    // So as long as we maintain a reference to it, then they won't be garbage collected.
-    // Inserting the function into the unordered map increases the reference by one.
-    int64_t id = reinterpret_cast<int64_t>(f.ptr());
-    references->insert({id, f});
-    return id;
-}
-
-NB_MODULE(catalyst_callback_registry, m)
-{
-    if (references == nullptr) {
-        references = new std::unordered_map<int64_t, nb::callable>();
-    }
-    m.doc() = "Callbacks";
-    m.def("register", &registerImpl, "Call a python function registered in a map.");
-    m.def("set_mlir_lib_path", &setMLIRLibPath, "Set location of mlir's libraries.");
-}
diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md
index 7a25fe4..02c6f3c 100644
--- a/src/qirlightning/simple_demo/README.md
+++ b/src/qirlightning/simple_demo/README.md
@@ -1,8 +1,8 @@
 # Simple Demo for Catalyst/Lightning runtime
 
-This is a super simple demo for using Catalyst runtime to drive Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present). 
+This is a super simple demo for driving Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. `lightning.gpu` (if an NVIDIA GPU is present).
 
-The new files required are in `../catalyst_runtime`, which contains a subset of files from the [Catalyst Runtime](https://github.com/PennyLaneAI/catalyst/tree/main/runtime).
+The only extra header files required are in `../catalyst_runtime/include`, which contains the include files from the [Catalyst Runtime](https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include) (for the `QuantumDevice` interface).
 
 ## Installing a lightning simulator
 
@@ -36,7 +36,7 @@ You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for ligh
 To compile:
 
 ```
-$ clang++ --std=c++20 test_rt_device.cpp -I/home/joseph/work/qiree/catalyst/runtime/include -I/home/joseph/work/qiree/catalyst/runtime/lib/capi -I/home/joseph/work/qiree/catalyst/runtime/lib/backend/common -o test_rt_device.out
+$ clang++ --std=c++20 test_rt_device.cpp -I../catalyst_runtime/include -o test_rt_device.out
 ```
 
 To run:
@@ -57,5 +57,6 @@ Measure on wire 0 = 0
 
 To run on other devices, e.g. lightning.gpu, you need to change:
 - `pip install custatevec-cu12 pennylane-lightning-gpu` (custatevec is a dependency)
-- replace `RTDLIB` and `RTDNAME` from `kokkos` to `GPU`
-- include `cuquantum` libraries when running, e.g. `LD_LIBRARY_PATH=/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/cuquantum/lib/:$LD_LIBRARY_PATH ./test_rt_device.out`
+- replace `RTDLIB` from `kokkos` to `gpu`
+- replace `RTDDEVICE` from `Kokkos` to `GPU`
+- install `cuquantum` via `pip install custatevec-cu12`, then include `cuquantum` libraries when running, e.g. `LD_LIBRARY_PATH=/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/cuquantum/lib/:$LD_LIBRARY_PATH ./test_rt_device.out`
diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp
index f70410a..721ad1d 100644
--- a/src/qirlightning/simple_demo/test_rt_device.cpp
+++ b/src/qirlightning/simple_demo/test_rt_device.cpp
@@ -1,40 +1,73 @@
-#include "ExecutionContext.hpp"
+#include <dlfcn.h>
+
+#include "QuantumDevice.hpp"
 
 // Runtime libraries (kokkos/GPU/qubit etc.)
-#define RTDLIB "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning/liblightning_kokkos_catalyst.so" // change to liblightning_gpu_catalyst.so
-#define RTDNAME "LightningKokkosSimulator" // change to LightningGPUSimulator
+#define RTDLIB                                                         \
+    "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \
+    "pennylane_lightning/liblightning_kokkos_catalyst.so";
+#define RTDDEVICE "LightningKokkosSimulator";
+
+extern "C" Catalyst::Runtime::QuantumDevice*
+GenericDeviceFactory(char const* kwargs);
 
 using namespace Catalyst::Runtime;
 
-static inline std::shared_ptr<RTDevice> loadRTDevice(const std::string &rtd_lib,
-                                                   const std::string &rtd_name = {},
-                                                   const std::string &rtd_kwargs = {})
+int main()
 {
-    ExecutionContext context;
-    return context.getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs);
-}
+    try
+    {
+        // Load lightning simulation library
+        std::string rtd_lib = RTDLIB;
+        std::string rtd_device = RTDDEVICE;
+        std::string kwargs = {};
+        auto rtld_flags = RTLD_LAZY | RTLD_NODELETE;
+        auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags);
+
+        if (!rtd_dylib_handler)
+        {
+            throw std::runtime_error("Failed to load library: " + rtd_lib);
+        }
 
-int main() {
-    auto RTDevice = loadRTDevice(RTDLIB, RTDNAME, "");
+        // Find device factory
+        std::string factory_name = rtd_device + "Factory";
+        void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str());
 
-    // Allocate Qubits
-    RTDevice->getQuantumDevicePtr()->AllocateQubits(3);
+        if (!f_ptr)
+        {
+            dlclose(rtd_dylib_handler);
+            throw std::runtime_error("Failed to find factory function: "
+                                     + factory_name);
+        }
+        std::string rtd_kwargs = {};
+        auto rtd_qdevice = std::unique_ptr<QuantumDevice>(
+            reinterpret_cast<decltype(GenericDeviceFactory)*>(f_ptr)(
+                rtd_kwargs.c_str()));
 
-    // Get Num Qubits
-    std::cout << "Num Qubits = " << RTDevice->getQuantumDevicePtr()->GetNumQubits() << std::endl;
+        // Allocate Qubits
+        rtd_qdevice->AllocateQubits(3);
 
-    // Apply Gate
-    RTDevice->getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {0});
+        // Get Num Qubits
+        std::cout << "Num Qubits = " << rtd_qdevice->GetNumQubits()
+                  << std::endl;
 
-    // Print State
-    std::cout << "State = " << std::endl;
-    RTDevice->getQuantumDevicePtr()->PrintState();
+        // Apply Gate
+        rtd_qdevice->NamedOperation("Hadamard", {}, {0});
 
-    // Measure
-    QubitIdType wire{0};
-    Result result = RTDevice->getQuantumDevicePtr()->Measure(wire, std::nullopt);
-    std::cout << "Measure on wire 0 = " << *result << std::endl;
+        // Print State
+        std::cout << "State = " << std::endl;
+        rtd_qdevice->PrintState();
 
+        // Measure
+        QubitIdType wire{0};
+        Result result = rtd_qdevice->Measure(wire, std::nullopt);
+        std::cout << "Measure on wire 0 = " << *result << std::endl;
+    }
+    catch (std::exception const& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
 
-    return 0;
+    return EXIT_SUCCESS;
 }

From 6c729079aacfba68f5a1605b078031c4fa1296d7 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 15 Jan 2025 19:35:11 +0000
Subject: [PATCH 36/64] remove redundant file

---
 src/qirlightning/simple_demo/LightningDevice.cpp | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 src/qirlightning/simple_demo/LightningDevice.cpp

diff --git a/src/qirlightning/simple_demo/LightningDevice.cpp b/src/qirlightning/simple_demo/LightningDevice.cpp
deleted file mode 100644
index e69de29..0000000

From 6da98d30b251314a8309b93b47c9e82139db8f68 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 15 Jan 2025 20:07:23 +0000
Subject: [PATCH 37/64] update lightningquantum

---
 src/qirlightning/LightningQuantum.cc | 127 ++++++++++++++-------------
 src/qirlightning/LightningQuantum.hh |  18 +---
 2 files changed, 71 insertions(+), 74 deletions(-)

diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
index 3f39825..fe8b07e 100644
--- a/src/qirlightning/LightningQuantum.cc
+++ b/src/qirlightning/LightningQuantum.cc
@@ -18,28 +18,48 @@
 #include "qiree/Assert.hh"
 
 // Lightning
-#include "catalyst_runtime/lib/capi/ExecutionContext.hpp"
+#include "QuantumDevice.hpp"
+
+#define RTDLIB                                                         \
+    "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \
+    "pennylane_lightning/liblightning_kokkos_catalyst.so";
+#define RTDDEVICE "LightningKokkosSimulator";
 
 namespace qiree
 {
 using namespace Catalyst::Runtime;
 
-static inline std::shared_ptr<RTDevice> loadRTDevice(const std::string &rtd_lib,
-                                                   const std::string &rtd_name = {},
-                                                   const std::string &rtd_kwargs = {})
-{
-    ExecutionContext context;
-    return context.getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs);
-}
-
 //---------------------------------------------------------------------------//
 /*!
  * Initialize the Lightning simulator
  */
-LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed)
-{
-    auto RTDevice = loadDevice("/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning/liblightning_gpu_catalyst.so", "LightningGPUSimulator", "");
-
+LightningQuantum::LightningQuantum(std::ostream& os) : output_(os)
+{
+    std::string rtd_lib = RTDLIB;
+    std::string rtd_device = RTDDEVICE;
+    std::string kwargs = {};
+    auto rtld_flags = RTLD_LAZY | RTLD_NODELETE;
+    auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags);
+
+    if (!rtd_dylib_handler)
+    {
+        throw std::runtime_error("Failed to load library: " + rtd_lib);
+    }
+
+    // Find device factory
+    std::string factory_name = rtd_device + "Factory";
+    void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str());
+
+    if (!f_ptr)
+    {
+        dlclose(rtd_dylib_handler);
+        throw std::runtime_error("Failed to find factory function: "
+                                 + factory_name);
+    }
+    std::string rtd_kwargs = {};
+    rtd_qdevice = std::unique_ptr<QuantumDevice>(
+        reinterpret_cast<decltype(GenericDeviceFactory)*>(f_ptr)(
+            rtd_kwargs.c_str()));
 }
 
 //---------------------------------------------------------------------------//
@@ -54,11 +74,10 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs)
 {
     QIREE_VALIDATE(attrs.required_num_qubits > 0,
                    << "input is not a quantum program");
-    
-    num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
 
-    RTDevice->getQuantumDevicePtr()->AllocateQubits(num_qubits_);
+    num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
 
+    rtd_qdevice->AllocateQubits(num_qubits_);
 }
 
 //---------------------------------------------------------------------------//
@@ -67,8 +86,10 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs)
  */
 void LightningQuantum::tear_down()
 {
-    context->deactivateDevice(RTDevice);
-    RTDevice = nullptr;
+    if (rtd_dylib_handler)
+    {
+        dlclose(rtd_dylib_handler);
+    }
 }
 
 //---------------------------------------------------------------------------//
@@ -77,7 +98,7 @@ void LightningQuantum::tear_down()
  */
 void LightningQuantum::reset(Qubit q)
 {
-    q.value = 0;
+    rtd_qdevice->SetState({{0, 0}}, {q.value});    
 }
 
 //----------------------------------------------------------------------------//
@@ -86,8 +107,7 @@ void LightningQuantum::reset(Qubit q)
  */
 QState LightningQuantum::read_result(Result r)
 {
-
-    return static_cast<QState>(meas_results[0].bitstring[0]);
+    return results_[r.value]
 }
 
 //---------------------------------------------------------------------------//
@@ -98,19 +118,19 @@ QState LightningQuantum::read_result(Result r)
  * qsim)
  */
 void LightningQuantum::mz(Qubit q, Result r)
-{  // we don't classical register yet.
-    /* QIREE_EXPECT(q.value < this->num_qubits()); */  // TODO: q must be in the set
-                                                 // of qubits, e.g., what
-                                                 // happens if q=5 and qubits
-                                                 // are {2,3,4,5}, q is less
-                                                 // than num_qubits but not it
-                                                 // is in the set of qubits.
+{ 
+    QIREE_EXPECT(q.value < this->num_qubits());  // TODO: q must be in
+                                                       // the set of qubits,
+                                                       // e.g., what happens if
+                                                       // q=5 and qubits are
+                                                       // {2,3,4,5}, q is less
+                                                       // than num_qubits but
+                                                       // not it is in the set
+                                                       // of qubits.
     // TODO: maybe not what we want long term
     QIREE_EXPECT(q.value == r.value);
     // Add measurement instruction
-    Measure(q.value, std::nullopt);
-    // RETURN MEASURE RESULT??
-
+    results_[r.value] = rtd_qdevice->Measure(q.value, std::nullopt);
 }
 
 //---------------------------------------------------------------------------//
@@ -121,75 +141,62 @@ void LightningQuantum::mz(Qubit q, Result r)
 // 1. Entangling gates
 void LightningQuantum::cx(Qubit q1, Qubit q2)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("CNOT", {}, {q1.value, q2.value});
+    rtd_qdevice->NamedOperation(
+        "CNOT", {}, {q1.value, q2.value});
 }
 void LightningQuantum::cnot(Qubit q1, Qubit q2)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("CNOT", {}, {q1.value, q2.value});
+    rtd_qdevice->NamedOperation(
+        "CNOT", {}, {q1.value, q2.value});
 }
 void LightningQuantum::cz(Qubit q1, Qubit q2)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("CZ", {}, {q1.value, q2.value});
+    rtd_qdevice->NamedOperation(
+        "CZ", {}, {q1.value, q2.value});
 }
 // 2. Local gates
 void LightningQuantum::h(Qubit q)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {q.value});
+    rtd_qdevice->NamedOperation("Hadamard", {}, {q.value});
 }
 void LightningQuantum::s(Qubit q)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("S", {}, {q.value});
+    rtd_qdevice->NamedOperation("S", {}, {q.value});
 }
 void LightningQuantum::t(Qubit q)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("T", {}, {q.value});
+    rtd_qdevice->NamedOperation("T", {}, {q.value});
 }
 // 2.1 Pauli gates
 void LightningQuantum::x(Qubit q)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("PauliX", {}, {q.value});
+    rtd_qdevice->NamedOperation("PauliX", {}, {q.value});
 }
 void LightningQuantum::y(Qubit q)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("PauliY", {}, {q.value});
+    rtd_qdevice->NamedOperation("PauliY", {}, {q.value});
 }
 void LightningQuantum::z(Qubit q)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("PauliZ", {}, {q.value});
+    rtd_qdevice->NamedOperation("PauliZ", {}, {q.value});
 }
 // 2.2 rotation gates
 void LightningQuantum::rx(double theta, Qubit q)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("RX", {theta}, {q.value});
+    rtd_qdevice->NamedOperation("RX", {theta}, {q.value});
 }
 void LightningQuantum::ry(double theta, Qubit q)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("RY", {theta}, {q.value});
+    rtd_qdevice->NamedOperation("RY", {theta}, {q.value});
 }
 void LightningQuantum::rz(double theta, Qubit q)
 {
-    RTDevice->getQuantumDevicePtr()->NamedOperation("RZ", {theta}, {q.value});
+    rtd_qdevice->NamedOperation("RZ", {theta}, {q.value});
 }
 
 Qubit LightningQuantum::result_to_qubit(Result r)
 {
-    // TODO: This function is not working. Giving 0 every time. Maybe not
-    // needed.
-    /* QIREE_EXPECT(r.value < this->num_results()); */
-    return result_to_qubit_[r.value];  // just copied this from the qirxacc, I
-                                       // have no idea if we need to do
-                                       // something else here
-}
-
-void LightningQuantum::print_accelbuf()
-{
-    // TODO: to be implemented, we can create a buffer class to store the
-    // results
-}
-
-void LightningQuantum::execute_if_needed()
-{
-    /* QIREE_EXPECT(false); */
+    return result_to_qubit_[r.value];  
 }
 
 }  // namespace qiree
diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh
index e9b8bb2..36e35e8 100644
--- a/src/qirlightning/LightningQuantum.hh
+++ b/src/qirlightning/LightningQuantum.hh
@@ -27,7 +27,7 @@ class LightningQuantum final : virtual public QuantumNotImpl
 {
   public:
     // Construct with number of shots
-    LightningQuantum(std::ostream& os, unsigned long int shots);
+    LightningQuantum(std::ostream& os);
     ~LightningQuantum();
 
     QIREE_DELETE_COPY_MOVE(LightningQuantum);  // Delete copy and move constructors
@@ -89,24 +89,14 @@ class LightningQuantum final : virtual public QuantumNotImpl
     void z(Qubit) final;
     //!@}
 
-    // Update the buffer
-    Buffer manager;
-
   private:
-
-    //// TYPES ////
-
-    struct Factory;
-    struct State;
-
+  
     //// DATA ////
 
     std::ostream& output_;
-    unsigned long int seed_{};
-    std::unique_ptr<State> state_;
+    std::unique_ptr<QuantumDevice> rtd_qdevice;
+    std::vector<bool> results_;
 
-    unsigned num_threads_{};  // Number of threads to use
-    size_t gate_index_;  // when the quantum operation will be executed
     size_type num_qubits_{};
     std::vector<Qubit> result_to_qubit_;
 };

From 7713cca513ab94a8f2ae3da760352e7d872a7f6f Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Mon, 10 Mar 2025 17:55:12 +0000
Subject: [PATCH 38/64] update

---
 src/qirlightning/simple_demo/README.md          | 10 +++++++---
 src/qirlightning/simple_demo/test_rt_device.cpp |  4 ++--
 src/qirqsim/QsimDefaultRuntime.hh               |  1 +
 src/qirqsim/QsimQuantum.hh                      |  4 ++++
 4 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md
index 02c6f3c..7aecc9d 100644
--- a/src/qirlightning/simple_demo/README.md
+++ b/src/qirlightning/simple_demo/README.md
@@ -16,7 +16,7 @@ $ pip install pennylane-lightning-kokkos
 
 $ pip show pennylane-lightning-kokkos
 Name: PennyLane_Lightning_Kokkos
-Version: 0.39.0
+Version: 0.40.0
 Summary: PennyLane-Lightning plugin
 Home-page: https://github.com/PennyLaneAI/pennylane-lightning
 Author: 
@@ -39,6 +39,8 @@ To compile:
 $ clang++ --std=c++20 test_rt_device.cpp -I../catalyst_runtime/include -o test_rt_device.out
 ```
 
+## Running the example
+
 To run:
 
 ```
@@ -55,8 +57,10 @@ State =
 Measure on wire 0 = 0
 ```
 
+## Running on other devices
+
 To run on other devices, e.g. lightning.gpu, you need to change:
-- `pip install custatevec-cu12 pennylane-lightning-gpu` (custatevec is a dependency)
+- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu`
 - replace `RTDLIB` from `kokkos` to `gpu`
 - replace `RTDDEVICE` from `Kokkos` to `GPU`
-- install `cuquantum` via `pip install custatevec-cu12`, then include `cuquantum` libraries when running, e.g. `LD_LIBRARY_PATH=/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/cuquantum/lib/:$LD_LIBRARY_PATH ./test_rt_device.out`
+- Include the `cuquantum` libraries (which were installed as a dependency) when running, i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`
diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp
index 721ad1d..4d2736f 100644
--- a/src/qirlightning/simple_demo/test_rt_device.cpp
+++ b/src/qirlightning/simple_demo/test_rt_device.cpp
@@ -5,8 +5,8 @@
 // Runtime libraries (kokkos/GPU/qubit etc.)
 #define RTDLIB                                                         \
     "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \
-    "pennylane_lightning/liblightning_kokkos_catalyst.so";
-#define RTDDEVICE "LightningKokkosSimulator";
+    "pennylane_lightning/liblightning_gpu_catalyst.so";
+#define RTDDEVICE "LightningGPUSimulator";
 
 extern "C" Catalyst::Runtime::QuantumDevice*
 GenericDeviceFactory(char const* kwargs);
diff --git a/src/qirqsim/QsimDefaultRuntime.hh b/src/qirqsim/QsimDefaultRuntime.hh
index daff34a..a271d84 100644
--- a/src/qirqsim/QsimDefaultRuntime.hh
+++ b/src/qirqsim/QsimDefaultRuntime.hh
@@ -42,6 +42,7 @@ class QsimDefaultRuntime final : virtual public RuntimeInterface
 
     //!@{
     //! \name Runtime interface
+    
     // Initialize the execution environment, resetting qubits
     void initialize(OptionalCString env) override;
 
diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
index 1b04bf0..ddeea67 100644
--- a/src/qirqsim/QsimQuantum.hh
+++ b/src/qirqsim/QsimQuantum.hh
@@ -98,6 +98,10 @@ class QsimQuantum final : virtual public QuantumNotImpl
     unsigned long int seed_{};
     std::unique_ptr<State> state_;
     std::vector<bool> results_;
+    
+    unsigned num_threads_{};  // Number of threads to use
+    size_t gate_index_;  // when the quantum operation will be executed
+    size_type num_qubits_{};
     std::vector<Qubit> result_to_qubit_;
 
     //// HELPER FUNCTIONS ////

From e777b9cc32dbb3a7eac53a8b414a269dbccfd7e0 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Mon, 10 Mar 2025 17:56:31 +0000
Subject: [PATCH 39/64] format

---
 src/qirqsim/QsimDefaultRuntime.hh | 2 +-
 src/qirqsim/QsimQuantum.hh        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/qirqsim/QsimDefaultRuntime.hh b/src/qirqsim/QsimDefaultRuntime.hh
index a271d84..f4e8e4d 100644
--- a/src/qirqsim/QsimDefaultRuntime.hh
+++ b/src/qirqsim/QsimDefaultRuntime.hh
@@ -42,7 +42,7 @@ class QsimDefaultRuntime final : virtual public RuntimeInterface
 
     //!@{
     //! \name Runtime interface
-    
+
     // Initialize the execution environment, resetting qubits
     void initialize(OptionalCString env) override;
 
diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh
index ddeea67..7f2b2bb 100644
--- a/src/qirqsim/QsimQuantum.hh
+++ b/src/qirqsim/QsimQuantum.hh
@@ -98,7 +98,7 @@ class QsimQuantum final : virtual public QuantumNotImpl
     unsigned long int seed_{};
     std::unique_ptr<State> state_;
     std::vector<bool> results_;
-    
+
     unsigned num_threads_{};  // Number of threads to use
     size_t gate_index_;  // when the quantum operation will be executed
     size_type num_qubits_{};

From 6caa66768fe2a365a6ff2c4a30b9e714d11e634f Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Mon, 10 Mar 2025 20:03:58 +0000
Subject: [PATCH 40/64] support lightning

---
 CMakeLists.txt                              | 13 +++-
 app/CMakeLists.txt                          | 17 +++++
 app/qir-lightning.cc                        | 74 +++++++++++++++++++++
 src/CMakeLists.txt                          |  4 ++
 src/qirlightning/CMakeLists.txt             | 19 +++---
 src/qirlightning/LightningDefaultRuntime.cc | 44 ------------
 src/qirlightning/LightningDefaultRuntime.hh | 23 +++++--
 src/qirlightning/LightningQuantum.cc        | 60 +++++++----------
 src/qirlightning/LightningQuantum.hh        | 46 ++++++++-----
 9 files changed, 192 insertions(+), 108 deletions(-)
 create mode 100644 app/qir-lightning.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3eb2675..b0a21a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,7 +38,8 @@ option(QIREE_BUILD_DOCS "Build QIR-EE documentation" OFF)
 option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" ON)
 option(QIREE_BUILD_EXAMPLES "Build QIR-EE examples" OFF)
 option(QIREE_USE_QSIM "Download and build Google qsim backend" OFF)
-option(QIREE_USE_XACC "Build XACC interface" ON)
+option(QIREE_USE_XACC "Build XACC interface" OFF)
+option(QIREE_USE_LIGHTNING "Build Pennylane Lightning backend" ON)
 
 qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS})
 
@@ -138,6 +139,16 @@ if(QIREE_USE_QSIM)
   )
 endif()
 
+if(QIREE_USE_LIGHTNING)
+  # Header-only (INTERFACE) target that exposes the "external" include tree
+  qiree_add_library(qiree_lightning INTERFACE)
+  add_library(QIREE::lightning ALIAS qiree_lightning)
+  target_include_directories(qiree_lightning SYSTEM INTERFACE
+    "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/external>"
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/external>"
+  )
+endif()
+
 if(QIREE_USE_XACC)
   find_package(XACC REQUIRED)
 endif()
diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
index 4bf7330..58e63f2 100644
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -28,6 +28,23 @@ if(QIREE_USE_QSIM)
   )
 endif()
 
+#-----------------------------------------------------------------------------#
+# LIGHTNING FRONT END
+#-----------------------------------------------------------------------------#
+
+if(QIREE_USE_LIGHTNING)
+  qiree_add_executable(qir-lightning
+    qir-lightning.cc
+  )
+  # Catalyst runtime headers: scoped to this target, not the whole directory
+  target_include_directories(qir-lightning PRIVATE
+    "${CMAKE_CURRENT_SOURCE_DIR}/../src/qirlightning/catalyst_runtime/include")
+  target_link_libraries(qir-lightning
+    PUBLIC QIREE::qiree QIREE::qirlightning
+    PRIVATE CLI11::CLI11
+  )
+endif()
+
 #-----------------------------------------------------------------------------#
 # XACC FRONT END
 #-----------------------------------------------------------------------------#
diff --git a/app/qir-lightning.cc b/app/qir-lightning.cc
new file mode 100644
index 0000000..2244c97
--- /dev/null
+++ b/app/qir-lightning.cc
@@ -0,0 +1,74 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file app/qir-lightning.cc
+//---------------------------------------------------------------------------//
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <CLI/CLI.hpp>
+
+#include "qiree/Executor.hh"
+#include "qiree/Module.hh"
+#include "qiree/ResultDistribution.hh"
+#include "qirlightning/LightningDefaultRuntime.hh"
+#include "qirlightning/LightningQuantum.hh"
+
+using namespace std::string_view_literals;
+
+namespace qiree
+{
+namespace app
+{
+//---------------------------------------------------------------------------//
+void run(std::string const& filename, int num_shots)
+{
+    // Load the QIR module from `filename` and wrap it in an executor
+    Executor execute{Module{filename}};
+
+    // Set up the Lightning simulator (seed argument 0) and its runtime
+    LightningQuantum sim(std::cout, 0);
+    LightningDefaultRuntime rt(std::cout, sim);
+    ResultDistribution distribution;
+
+    // Execute once per shot (CLI default: 1), accumulating each recorded result
+    for (int i = 0; i < num_shots; i++)
+    {
+        execute(sim, rt);
+        distribution.accumulate(rt.result());
+    }
+
+    std::cout << distribution.to_json() << std::endl;
+}
+
+//---------------------------------------------------------------------------//
+}  // namespace app
+}  // namespace qiree
+
+//---------------------------------------------------------------------------//
+/*!
+ * Parse the command line and run the QIR input; the shot count must be > 0.
+ */
+int main(int argc, char* argv[])
+{
+    int num_shots{1};
+    std::string filename;
+
+    CLI::App app;
+
+    auto* filename_opt
+        = app.add_option("--input,-i,input", filename, "QIR input file");
+    filename_opt->required();
+
+    auto* nshot_opt
+        = app.add_option("-s,--shots", num_shots, "Number of shots");
+    nshot_opt->capture_default_str()->check(CLI::PositiveNumber);
+
+    CLI11_PARSE(app, argc, argv);
+
+    qiree::app::run(filename, num_shots);
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b01bf2f..b3d81fd 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -21,4 +21,8 @@ if(QIREE_USE_QSIM)
   add_subdirectory(qirqsim)
 endif()
 
+if(QIREE_USE_LIGHTNING)
+  add_subdirectory(qirlightning)
+endif()
+
 #---------------------------------------------------------------------------##
diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt
index 0d81dec..0a3eab5 100644
--- a/src/qirlightning/CMakeLists.txt
+++ b/src/qirlightning/CMakeLists.txt
@@ -4,16 +4,19 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #----------------------------------------------------------------------------#
 
-# Adding qsim as a library to qiree
-qiree_add_library(qirqsim
-  QsimQuantum.cc
-  QsimDefaultRuntime.cc
+# Include directories for catalyst_runtime
+include_directories("${CMAKE_CURRENT_SOURCE_DIR}/catalyst_runtime/include")
+
+# Adding lightning as a library to qiree
+qiree_add_library(qirlightning
+  LightningQuantum.cc
+  LightningDefaultRuntime.cc
 )
 
-#Link the qsim library to qiree and any other relevant libraries
-target_link_libraries(qirqsim
+#Link the lightning library to qiree and any other relevant libraries
+target_link_libraries(qirlightning
   PUBLIC QIREE::qiree  # Link to qiree
-  PRIVATE QIREE::qsim
+  PRIVATE QIREE::lightning
 )
 
 #----------------------------------------------------------------------------#
@@ -22,7 +25,7 @@ target_link_libraries(qirqsim
 
 # Install headers, matching the relevant .hh files for qsim integration
 install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
-  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/qirqsim"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/qirlightning"
   COMPONENT development
   FILES_MATCHING REGEX ".*\\.hh?$"
 )
diff --git a/src/qirlightning/LightningDefaultRuntime.cc b/src/qirlightning/LightningDefaultRuntime.cc
index 2440ee0..7e5da3e 100644
--- a/src/qirlightning/LightningDefaultRuntime.cc
+++ b/src/qirlightning/LightningDefaultRuntime.cc
@@ -26,48 +26,4 @@ void LightningDefaultRuntime::initialize(OptionalCString env)
     }
 }
 
-//---------------------------------------------------------------------------//
-/*!
- * Execute circuit and mark the following N results as being part of an array
- * named tag
- */
-
-void LightningDefaultRuntime::array_record_output(size_type s, OptionalCString tag)
-{
-    // this->execute_if_needed();
-    // output_ << "array " << (tag ? tag : "<null>") << " length " << s
-    //         << std::endl;
-}
-
-//---------------------------------------------------------------------------//
-/*!
- * Execute circuit and mark the following N results as being part of a tuple
- * named tag
- */
-
-void LightningDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag)
-{
-    // this->execute_if_needed();
-    // output_ << "tuple " << (tag ? tag : "<null>") << " length " << s
-    //         << std::endl;
-}
-
-//---------------------------------------------------------------------------//
-/*!
- * Execute circuit and report a single measurement result
- */
-void LightningDefaultRuntime::result_record_output(Result r, OptionalCString tag)
-{
-    // Access values through the getter
-    // This prints results every time result_record_output is called
-    // Can comment out if only want to see final results
-
-    if (auto value = sim_.manager.getBufferValue("q" + std::to_string(r.value));
-        value.has_value())
-    {
-        std::cout << "q" << std::to_string(r.value) << " : " << value.value()
-                  << "\n";
-    }
-}
-
 }  // namespace qiree
diff --git a/src/qirlightning/LightningDefaultRuntime.hh b/src/qirlightning/LightningDefaultRuntime.hh
index cac9c1e..2d3e9fa 100644
--- a/src/qirlightning/LightningDefaultRuntime.hh
+++ b/src/qirlightning/LightningDefaultRuntime.hh
@@ -8,6 +8,7 @@
 #pragma once
 
 #include "LightningQuantum.hh"
+#include "qiree/RecordedResult.hh"
 
 namespace qiree
 {
@@ -41,22 +42,36 @@ class LightningDefaultRuntime final : virtual public RuntimeInterface
 
     //!@{
     //! \name Runtime interface
+
     // Initialize the execution environment, resetting qubits
     void initialize(OptionalCString env) override;
 
     //! Mark the following N results as being part of an array named tag
-    void array_record_output(size_type, OptionalCString tag) final;
+    void array_record_output(size_type size, OptionalCString tag) final
+    {
+        result_ = RecordedResult(size, tag);
+    }
 
     //! Mark the following N results as being part of a tuple named tag
-    void tuple_record_output(size_type, OptionalCString) final;
+    void tuple_record_output(size_type size, OptionalCString tag) final
+    {
+        result_ = RecordedResult(size, tag);
+    }
 
-    // Save one result
-    void result_record_output(Result result, OptionalCString tag) final;
+    //! Save one result
+    void result_record_output(Result result, OptionalCString tag) final
+    {
+        result_.push_back(sim_.get_result(result), tag);
+    }
     //!@}
 
+    RecordedResult const& result() const { return result_; }
+
+
   private:
     std::ostream& output_;
     LightningQuantum& sim_;
+    RecordedResult result_;
 };
 
 }  // namespace qiree
diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
index fe8b07e..4ec6abd 100644
--- a/src/qirlightning/LightningQuantum.cc
+++ b/src/qirlightning/LightningQuantum.cc
@@ -14,17 +14,16 @@
 #include <stdexcept>
 #include <thread>
 #include <utility>
+#include <dlfcn.h>
 
 #include "qiree/Assert.hh"
 
-// Lightning
-#include "QuantumDevice.hpp"
-
 #define RTDLIB                                                         \
     "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \
     "pennylane_lightning/liblightning_kokkos_catalyst.so";
 #define RTDDEVICE "LightningKokkosSimulator";
-
+extern "C" Catalyst::Runtime::QuantumDevice*
+GenericDeviceFactory(char const* kwargs);
 namespace qiree
 {
 using namespace Catalyst::Runtime;
@@ -33,13 +32,13 @@ using namespace Catalyst::Runtime;
 /*!
  * Initialize the Lightning simulator
  */
-LightningQuantum::LightningQuantum(std::ostream& os) : output_(os)
+LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os)
 {
     std::string rtd_lib = RTDLIB;
     std::string rtd_device = RTDDEVICE;
     std::string kwargs = {};
     auto rtld_flags = RTLD_LAZY | RTLD_NODELETE;
-    auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags);
+    rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags);
 
     if (!rtd_dylib_handler)
     {
@@ -76,6 +75,7 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs)
                    << "input is not a quantum program");
 
     num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
+    results_.resize(attrs.required_num_results);
 
     rtd_qdevice->AllocateQubits(num_qubits_);
 }
@@ -98,7 +98,9 @@ void LightningQuantum::tear_down()
  */
 void LightningQuantum::reset(Qubit q)
 {
-    rtd_qdevice->SetState({{0, 0}}, {q.value});    
+    std::vector<int8_t> data = {0}; 
+    DataView<int8_t, 1> state(data);
+    std::vector<QubitIdType> wires = {static_cast<intptr_t>(q.value)};    rtd_qdevice->SetBasisState(state, wires);    
 }
 
 //----------------------------------------------------------------------------//
@@ -107,7 +109,7 @@ void LightningQuantum::reset(Qubit q)
  */
 QState LightningQuantum::read_result(Result r)
 {
-    return results_[r.value]
+    return this->get_result(r);
 }
 
 //---------------------------------------------------------------------------//
@@ -119,17 +121,8 @@ QState LightningQuantum::read_result(Result r)
  */
 void LightningQuantum::mz(Qubit q, Result r)
 { 
-    QIREE_EXPECT(q.value < this->num_qubits());  // TODO: q must be in
-                                                       // the set of qubits,
-                                                       // e.g., what happens if
-                                                       // q=5 and qubits are
-                                                       // {2,3,4,5}, q is less
-                                                       // than num_qubits but
-                                                       // not it is in the set
-                                                       // of qubits.
-    // TODO: maybe not what we want long term
-    QIREE_EXPECT(q.value == r.value);
-    // Add measurement instruction
+    QIREE_EXPECT(q.value < this->num_qubits());  
+    QIREE_EXPECT(r.value < this->num_results());
     results_[r.value] = rtd_qdevice->Measure(q.value, std::nullopt);
 }
 
@@ -142,61 +135,56 @@ void LightningQuantum::mz(Qubit q, Result r)
 void LightningQuantum::cx(Qubit q1, Qubit q2)
 {
     rtd_qdevice->NamedOperation(
-        "CNOT", {}, {q1.value, q2.value});
+        "CNOT", {}, {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
 }
 void LightningQuantum::cnot(Qubit q1, Qubit q2)
 {
     rtd_qdevice->NamedOperation(
-        "CNOT", {}, {q1.value, q2.value});
+        "CNOT", {}, {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
 }
 void LightningQuantum::cz(Qubit q1, Qubit q2)
 {
     rtd_qdevice->NamedOperation(
-        "CZ", {}, {q1.value, q2.value});
+        "CZ", {}, {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
 }
 // 2. Local gates
 void LightningQuantum::h(Qubit q)
 {
-    rtd_qdevice->NamedOperation("Hadamard", {}, {q.value});
+    rtd_qdevice->NamedOperation("Hadamard", {}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::s(Qubit q)
 {
-    rtd_qdevice->NamedOperation("S", {}, {q.value});
+    rtd_qdevice->NamedOperation("S", {}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::t(Qubit q)
 {
-    rtd_qdevice->NamedOperation("T", {}, {q.value});
+    rtd_qdevice->NamedOperation("T", {}, {static_cast<intptr_t>(q.value)});
 }
 // 2.1 Pauli gates
 void LightningQuantum::x(Qubit q)
 {
-    rtd_qdevice->NamedOperation("PauliX", {}, {q.value});
+    rtd_qdevice->NamedOperation("PauliX", {}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::y(Qubit q)
 {
-    rtd_qdevice->NamedOperation("PauliY", {}, {q.value});
+    rtd_qdevice->NamedOperation("PauliY", {}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::z(Qubit q)
 {
-    rtd_qdevice->NamedOperation("PauliZ", {}, {q.value});
+    rtd_qdevice->NamedOperation("PauliZ", {}, {static_cast<intptr_t>(q.value)});
 }
 // 2.2 rotation gates
 void LightningQuantum::rx(double theta, Qubit q)
 {
-    rtd_qdevice->NamedOperation("RX", {theta}, {q.value});
+    rtd_qdevice->NamedOperation("RX", {theta}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::ry(double theta, Qubit q)
 {
-    rtd_qdevice->NamedOperation("RY", {theta}, {q.value});
+    rtd_qdevice->NamedOperation("RY", {theta}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::rz(double theta, Qubit q)
 {
-    rtd_qdevice->NamedOperation("RZ", {theta}, {q.value});
-}
-
-Qubit LightningQuantum::result_to_qubit(Result r)
-{
-    return result_to_qubit_[r.value];  
+    rtd_qdevice->NamedOperation("RZ", {theta}, {static_cast<intptr_t>(q.value)});
 }
 
 }  // namespace qiree
diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh
index 36e35e8..b3856b0 100644
--- a/src/qirlightning/LightningQuantum.hh
+++ b/src/qirlightning/LightningQuantum.hh
@@ -11,11 +11,14 @@
 #include <ostream>
 #include <vector>
 
+#include "qiree/Assert.hh"
 #include "qiree/Macros.hh"
 #include "qiree/QuantumNotImpl.hh"
 #include "qiree/RuntimeInterface.hh"
 #include "qiree/Types.hh"
-#include "qiree/OutputDistribution.hh"
+
+// Lightning
+#include "QuantumDevice.hpp"
 
 namespace qiree
 {
@@ -27,15 +30,23 @@ class LightningQuantum final : virtual public QuantumNotImpl
 {
   public:
     // Construct with number of shots
-    LightningQuantum(std::ostream& os);
+    LightningQuantum(std::ostream& os, unsigned long int seed);
     ~LightningQuantum();
 
     QIREE_DELETE_COPY_MOVE(LightningQuantum);  // Delete copy and move constructors
 
+
     //!@{
     //! \name Accessors
-    size_type num_results() const { return result_to_qubit_.size(); }
+
+    //! Number of qubits in the circuit
     size_type num_qubits() const { return num_qubits_; }
+
+    //! Number of classical result registers
+    size_type num_results() const { return results_.size(); }
+
+    // Get the result from a classical register
+    inline QState get_result(Result r) const;
     //!@}
 
     //!@{
@@ -53,17 +64,6 @@ class LightningQuantum final : virtual public QuantumNotImpl
     QState read_result(Result) final;
     //!@}
 
-    //!@{
-    //! \name Utilities for runtime
-    // Get runtime qubit corresponding to a runtime result
-    Qubit result_to_qubit(Result);
-
-    // Run the circuit on the accelerator if we have not already. Returns true
-    // if the circuit was executed.
-    void execute_if_needed();
-
-    void print_accelbuf();
-    //!@}
 
     //!@{
     //! \name Circuit construction
@@ -90,15 +90,31 @@ class LightningQuantum final : virtual public QuantumNotImpl
     //!@}
 
   private:
+    //// TYPES ////
+
+    struct Factory;
+    struct State;
   
     //// DATA ////
 
     std::ostream& output_;
-    std::unique_ptr<QuantumDevice> rtd_qdevice;
+    void* rtd_dylib_handler;
+    std::unique_ptr<Catalyst::Runtime::QuantumDevice> rtd_qdevice;
     std::vector<bool> results_;
 
     size_type num_qubits_{};
     std::vector<Qubit> result_to_qubit_;
 };
 
+//---------------------------------------------------------------------------//
+/*!
+ * Get the result from a classical register.
+ */
+QState LightningQuantum::get_result(Result r) const
+{
+    QIREE_EXPECT(r.value < results_.size());
+    auto result_bool = static_cast<bool>(results_[r.value]);
+    return static_cast<QState>(result_bool);
+}
+
 }  // namespace qiree

From 422b3e76340cd475e11a1fc2cc25ea76e38236c3 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Tue, 11 Mar 2025 18:52:54 +0000
Subject: [PATCH 41/64] add seeding

---
 src/qirlightning/LightningQuantum.cc | 6 +++++-
 src/qirlightning/LightningQuantum.hh | 1 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
index 4ec6abd..f99d117 100644
--- a/src/qirlightning/LightningQuantum.cc
+++ b/src/qirlightning/LightningQuantum.cc
@@ -15,6 +15,7 @@
 #include <thread>
 #include <utility>
 #include <dlfcn.h>
+#include <random>
 
 #include "qiree/Assert.hh"
 
@@ -32,7 +33,7 @@ using namespace Catalyst::Runtime;
 /*!
  * Initialize the Lightning simulator
  */
-LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os)
+LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed)
 {
     std::string rtd_lib = RTDLIB;
     std::string rtd_device = RTDDEVICE;
@@ -123,6 +124,9 @@ void LightningQuantum::mz(Qubit q, Result r)
 { 
     QIREE_EXPECT(q.value < this->num_qubits());  
     QIREE_EXPECT(r.value < this->num_results());
+    std::mt19937 gen(seed_);
+    seed_++;
+    rtd_qdevice->SetDevicePRNG(&gen);
     results_[r.value] = rtd_qdevice->Measure(q.value, std::nullopt);
 }
 
diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh
index b3856b0..24ccd2f 100644
--- a/src/qirlightning/LightningQuantum.hh
+++ b/src/qirlightning/LightningQuantum.hh
@@ -98,6 +98,7 @@ class LightningQuantum final : virtual public QuantumNotImpl
     //// DATA ////
 
     std::ostream& output_;
+    unsigned long int seed_{};
     void* rtd_dylib_handler;
     std::unique_ptr<Catalyst::Runtime::QuantumDevice> rtd_qdevice;
     std::vector<bool> results_;

From 7f6598a8b76f32741c09e22e77f5844851ba8b01 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Tue, 11 Mar 2025 18:57:32 +0000
Subject: [PATCH 42/64] update

---
 src/qirlightning/LightningQuantum.cc |  3 --
 src/qirlightning/README.md           | 63 ++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 3 deletions(-)
 create mode 100644 src/qirlightning/README.md

diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
index f99d117..ee3691f 100644
--- a/src/qirlightning/LightningQuantum.cc
+++ b/src/qirlightning/LightningQuantum.cc
@@ -116,9 +116,6 @@ QState LightningQuantum::read_result(Result r)
 //---------------------------------------------------------------------------//
 /*!
  * Map a qubit to a result index.
- *
- * (TODO: find how to link the classical register to the quantum register in
- * qsim)
  */
 void LightningQuantum::mz(Qubit q, Result r)
 { 
diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md
new file mode 100644
index 0000000..6b42581
--- /dev/null
+++ b/src/qirlightning/README.md
@@ -0,0 +1,63 @@
+# Lightning backend
+
+## Installing a lightning simulator
+
+When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. 
+
+Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
+
+Example:
+```
+$ pip install pennylane-lightning-kokkos
+
+$ pip show pennylane-lightning-kokkos
+Name: PennyLane_Lightning_Kokkos
+Version: 0.40.0
+Summary: PennyLane-Lightning plugin
+Home-page: https://github.com/PennyLaneAI/pennylane-lightning
+Author: 
+Author-email: 
+License: Apache License 2.0
+Location: /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages
+Requires: pennylane, pennylane-lightning
+
+$ ls /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning
+... liblightning_kokkos_catalyst.so ...
+```
+
+You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.qubit simulators.
+
+## Compilation
+
+Turn on `QIREE_USE_LIGHTNING` in CMakeLists.txt
+
+To compile:
+
+```
+mkdir build; cd build
+cmake ..
+make
+
+```
+
+## Running the example
+
+To run:
+
+```
+$ ./bin/qir-lightning ../examples/bell.ll -s 1
+(Extra debug output:
+NamedOperation: Hadamard
+NamedOperation: CNOT
+Measure
+Measure)
+{"11":1}
+```
+
+## Running on other devices
+
+To run on other devices, e.g. lightning.gpu, you need to change:
+- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu`
+- replace `RTDLIB` from `kokkos` to `gpu`
+- replace `RTDDEVICE` from `Kokkos` to `GPU`
+- Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`

From 62460106e75c33f5821fd3ccdf19b9651d3d27af Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 12 Mar 2025 21:51:05 +0000
Subject: [PATCH 43/64] update

---
 CMakeLists.txt                                |   2 +-
 src/qirlightning/CMakeLists.txt               |   8 ++
 src/qirlightning/LightningQuantum.cc          |  51 ++++----
 src/qirlightning/LightningQuantum.hh          |   2 +-
 src/qirlightning/README.md                    |  36 ++++--
 .../include/DynamicLibraryLoader.hpp          |  79 ------------
 .../catalyst_runtime/include/RuntimeCAPI.h    | 112 ------------------
 .../simple_demo/test_rt_device.cpp            |   1 +
 8 files changed, 62 insertions(+), 229 deletions(-)
 delete mode 100644 src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp
 delete mode 100644 src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b0a21a9..5ecc4a8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,7 +37,7 @@ endmacro()
 option(QIREE_BUILD_DOCS "Build QIR-EE documentation" OFF)
 option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" ON)
 option(QIREE_BUILD_EXAMPLES "Build QIR-EE examples" OFF)
-option(QIREE_USE_QSIM "Download and build Google qsim backend" OFF)
+option(QIREE_USE_QSIM "Download and build Google qsim backend" ON)
 option(QIREE_USE_XACC "Build XACC interface" OFF)
 option(QIREE_USE_LIGHTNING "Build Pennylane Lightning backend" ON)
 
diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt
index 0a3eab5..31cfd8c 100644
--- a/src/qirlightning/CMakeLists.txt
+++ b/src/qirlightning/CMakeLists.txt
@@ -7,12 +7,20 @@
 # Include directories for catalyst_runtime
 include_directories("${CMAKE_CURRENT_SOURCE_DIR}/catalyst_runtime/include")
 
+# Simulator library/device baked into qirlightning; override with -DRTDLIB_PATH=... -DRTDDEVICE_NAME=...
+set(RTDLIB_PATH "/home/joseph/work/qiree/pennylane-lightning/build_lightning_qubit/liblightning_qubit_catalyst.so" CACHE FILEPATH "Path to the Lightning Catalyst simulator shared library")
+set(RTDDEVICE_NAME "LightningSimulator" CACHE STRING "Lightning device class name: LightningSimulator, LightningKokkosSimulator or LightningGPUSimulator")
 # Adding lightning as a library to qiree
 qiree_add_library(qirlightning
   LightningQuantum.cc
   LightningDefaultRuntime.cc
 )
 
+target_compile_definitions(qirlightning PRIVATE
+    RTDLIB="${RTDLIB_PATH}"
+    RTDDEVICE="${RTDDEVICE_NAME}"
+)
+
 #Link the lightning library to qiree and any other relevant libraries
 target_link_libraries(qirlightning
   PUBLIC QIREE::qiree  # Link to qiree
diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
index ee3691f..b8bc4ad 100644
--- a/src/qirlightning/LightningQuantum.cc
+++ b/src/qirlightning/LightningQuantum.cc
@@ -19,10 +19,6 @@
 
 #include "qiree/Assert.hh"
 
-#define RTDLIB                                                         \
-    "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \
-    "pennylane_lightning/liblightning_kokkos_catalyst.so";
-#define RTDDEVICE "LightningKokkosSimulator";
 extern "C" Catalyst::Runtime::QuantumDevice*
 GenericDeviceFactory(char const* kwargs);
 namespace qiree
@@ -35,6 +31,27 @@ using namespace Catalyst::Runtime;
  */
 LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed)
 {
+}
+
+//---------------------------------------------------------------------------//
+//! Default destructor
+LightningQuantum::~LightningQuantum() = default;
+
+//---------------------------------------------------------------------------//
+/*!
+ * Prepare to build a quantum circuit for an entry point
+ */
+void LightningQuantum::set_up(EntryPointAttrs const& attrs)
+{
+    QIREE_VALIDATE(attrs.required_num_qubits > 0,
+                   << "input is not a quantum program");
+    num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
+    results_.resize(attrs.required_num_results);
+
+
+    // We load the library every time because we currently have an issue 
+    // with releasing qubits in Catalyst.
+    // Once that is fixed, this can go to the constructor to execute once
     std::string rtd_lib = RTDLIB;
     std::string rtd_device = RTDDEVICE;
     std::string kwargs = {};
@@ -60,23 +77,6 @@ LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : o
     rtd_qdevice = std::unique_ptr<QuantumDevice>(
         reinterpret_cast<decltype(GenericDeviceFactory)*>(f_ptr)(
             rtd_kwargs.c_str()));
-}
-
-//---------------------------------------------------------------------------//
-//! Default destructor
-LightningQuantum::~LightningQuantum() = default;
-
-//---------------------------------------------------------------------------//
-/*!
- * Prepare to build a quantum circuit for an entry point
- */
-void LightningQuantum::set_up(EntryPointAttrs const& attrs)
-{
-    QIREE_VALIDATE(attrs.required_num_qubits > 0,
-                   << "input is not a quantum program");
-
-    num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
-    results_.resize(attrs.required_num_results);
 
     rtd_qdevice->AllocateQubits(num_qubits_);
 }
@@ -87,10 +87,12 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs)
  */
 void LightningQuantum::tear_down()
 {
+    // This should go to the destructor once we fix the issue with releasing qubits
     if (rtd_dylib_handler)
     {
         dlclose(rtd_dylib_handler);
+        rtd_dylib_handler = nullptr;  // a second tear_down() must not dlclose a stale handle
     }
 }
 
 //---------------------------------------------------------------------------//
@@ -106,7 +108,7 @@ void LightningQuantum::reset(Qubit q)
 
 //----------------------------------------------------------------------------//
 /*!
- * Read the value of a result. This utilizes the new BufferManager.
+ * Read the value of a result. 
  */
 QState LightningQuantum::read_result(Result r)
 {
@@ -124,7 +126,8 @@ void LightningQuantum::mz(Qubit q, Result r)
     std::mt19937 gen(seed_);
     seed_++;
     rtd_qdevice->SetDevicePRNG(&gen);
-    results_[r.value] = rtd_qdevice->Measure(q.value, std::nullopt);
+    auto result = rtd_qdevice->Measure(static_cast<intptr_t>(q.value), std::nullopt);
+    results_[r.value] = *result;
 }
 
 //---------------------------------------------------------------------------//
diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh
index 24ccd2f..2c2728b 100644
--- a/src/qirlightning/LightningQuantum.hh
+++ b/src/qirlightning/LightningQuantum.hh
@@ -68,7 +68,7 @@ class LightningQuantum final : virtual public QuantumNotImpl
     //!@{
     //! \name Circuit construction
     // void ccx(Qubit, Qubit) final;
-    void ccnot(Qubit, Qubit, Qubit);  // TODO: not in examples or qir runner
+    void ccnot(Qubit, Qubit, Qubit); 
     void cnot(Qubit, Qubit) final;
     void cx(Qubit, Qubit) final;
     // void cy(Qubit, Qubit) final;
diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md
index 6b42581..e3498ff 100644
--- a/src/qirlightning/README.md
+++ b/src/qirlightning/README.md
@@ -2,11 +2,11 @@
 
 ## Installing a lightning simulator
 
-When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. 
+More information on installing Pennylane Lightning simulators can be found in the [Lightning repository](https://github.com/PennyLaneAI/pennylane-lightning).
 
-Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
+### Quick start
+The easiest way to get started is to install a Lightning simulator from PyPI via pip:
 
-Example:
 ```
 $ pip install pennylane-lightning-kokkos
 
@@ -20,20 +20,37 @@ Author-email:
 License: Apache License 2.0
 Location: /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages
 Requires: pennylane, pennylane-lightning
+```
+Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
+
+When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulators installed. These are named `liblightning_qubit_catalyst.so`/`liblightning_kokkos_catalyst.so`/`liblightning_gpu_catalyst.so` respectively.
 
+Example:
+```
 $ ls /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning
 ... liblightning_kokkos_catalyst.so ...
 ```
 
 You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.qubit simulators.
 
+### Compiling Lightning from Source
+
+The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightning) contains information on how to install Lightning simulators from source. This will be necessary for e.g. Kokkos with HIP backend.
+
 ## Compilation
 
-Turn on `QIREE_USE_LIGHTNING` in CMakeLists.txt
+- Set `QIREE_USE_LIGHTNING` to `ON` in `qiree/CMakeLists.txt`
+- Specify the simulator path and name in `qiree/src/qirlightning/CMakeLists`, set:
+    - `RTDLIB_PATH` to the path of the simulator `.so`
+    - `RTDDEVICE_NAME` to `LightningSimulator`/`LightningKokkosSimulator`/`LightningGPUSimulator`
+These could also be set in cmake using the variables `-D...`
+
+Note: when running on GPU, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`
 
 To compile:
 
 ```
+cd qiree/
 mkdir build; cd build
 cmake ..
 make
@@ -45,16 +62,11 @@ make
 To run:
 
 ```
-$ ./bin/qir-lightning ../examples/bell.ll -s 1
-(Extra debug output:
-NamedOperation: Hadamard
-NamedOperation: CNOT
-Measure
-Measure)
-{"11":1}
+$ ./bin/qir-lightning ../examples/bell.ll -s 100
+{"00":43,"11":57}
 ```
 
-## Running on other devices
+## Running on GPU
 
 To run on other devices, e.g. lightning.gpu, you need to change:
 - Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu`
diff --git a/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp b/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp
deleted file mode 100644
index 1c25ab8..0000000
--- a/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright 2024 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <dlfcn.h>
-#include <string_view>
-
-#include "Exception.hpp"
-
-/**
- * @brief A utility struct to handle opening, closing and retrieving symbols
- *        from dynamic shared objects.
- */
-struct DynamicLibraryLoader {
-    void *handle;
-
-    DynamicLibraryLoader(std::string_view library_name, int mode = RTLD_LAZY | RTLD_NODELETE)
-    {
-        // Load the shared library
-        handle = dlopen(library_name.data(), mode);
-        if (!handle) {
-            const char *err_msg = dlerror();
-            RT_FAIL(err_msg);
-        }
-    }
-
-    ~DynamicLibraryLoader()
-    {
-        if (handle) {
-            // TODO: This is non-sensical.
-            // We are using RTLD_NODELETE, why would calling dlclose have a side-effect?
-            // Worst of all, the side-effect is not in our code.
-            // When we have dlclose, everything works well the first time.
-            // However, when trying to compile a second time, we will find that jaxlib will now
-            // raise a StopIteration exception. This doesn't really make any sense.
-            // My guess is that somehow dlclosing here will unload a the StopIteration symbol (?)
-            // rebind it with another equivalent (but with different id?)
-            // and then the MLIR python bindings are unable to catch it and stop the iteration and
-            // it gets propagated upwards.
-            //
-            // Is not calling dlclose bad?
-            // A little bit, although dlclose implies intent and does not create any requirements
-            // upon the implementation. See here:
-            // https://pubs.opengroup.org/onlinepubs/000095399/functions/dlclose.html
-            // https://github.com/pybind/pybind11/blob/75e48c5f959b4f0a49d8c664e059b6fb4b497102/include/pybind11/detail/internals.h#L108-L113
-            //
-#ifndef __APPLE__
-            dlclose(handle);
-#endif
-        }
-    }
-
-    // Get symbol from library
-    template <typename T> T getSymbol(std::string_view symbol_name)
-    {
-        // Clear any existing errors
-        dlerror();
-
-        // Retrieve symbol
-        T symbol = reinterpret_cast<T>(dlsym(handle, symbol_name.data()));
-        const char *err_msg = dlerror();
-        if (err_msg != nullptr) {
-            RT_FAIL(err_msg);
-        }
-        return symbol;
-    }
-};
diff --git a/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h b/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h
deleted file mode 100644
index b0f63ca..0000000
--- a/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h
+++ /dev/null
@@ -1,112 +0,0 @@
-// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#ifndef RUNTIMECAPI_H
-#define RUNTIMECAPI_H
-
-#include "Types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Quantum Runtime Instructions
-void __catalyst__rt__fail_cstr(const char *);
-void __catalyst__rt__initialize(uint32_t *seed);
-void __catalyst__rt__device_init(int8_t *, int8_t *, int8_t *, int64_t shots);
-void __catalyst__rt__device_release();
-void __catalyst__rt__finalize();
-void __catalyst__rt__toggle_recorder(bool);
-void __catalyst__rt__print_state();
-void __catalyst__rt__print_tensor(OpaqueMemRefT *, bool);
-void __catalyst__rt__print_string(char *);
-void __catalyst__rt__assert_bool(bool, char *);
-int64_t __catalyst__rt__array_get_size_1d(QirArray *);
-int8_t *__catalyst__rt__array_get_element_ptr_1d(QirArray *, int64_t);
-
-QUBIT *__catalyst__rt__qubit_allocate();
-QirArray *__catalyst__rt__qubit_allocate_array(int64_t);
-void __catalyst__rt__qubit_release(QUBIT *);
-void __catalyst__rt__qubit_release_array(QirArray *);
-
-int64_t __catalyst__rt__num_qubits();
-
-bool __catalyst__rt__result_equal(RESULT *, RESULT *);
-RESULT *__catalyst__rt__result_get_one();
-RESULT *__catalyst__rt__result_get_zero();
-
-// Quantum Gate Set Instructions
-void __catalyst__qis__SetState(MemRefT_CplxT_double_1d *, uint64_t, ...);
-void __catalyst__qis__SetBasisState(MemRefT_int8_1d *, uint64_t, ...);
-void __catalyst__qis__Identity(QUBIT *, const Modifiers *);
-void __catalyst__qis__PauliX(QUBIT *, const Modifiers *);
-void __catalyst__qis__PauliY(QUBIT *, const Modifiers *);
-void __catalyst__qis__PauliZ(QUBIT *, const Modifiers *);
-void __catalyst__qis__Hadamard(QUBIT *, const Modifiers *);
-void __catalyst__qis__S(QUBIT *, const Modifiers *);
-void __catalyst__qis__T(QUBIT *, const Modifiers *);
-void __catalyst__qis__PhaseShift(double, QUBIT *, const Modifiers *);
-void __catalyst__qis__RX(double, QUBIT *, const Modifiers *);
-void __catalyst__qis__RY(double, QUBIT *, const Modifiers *);
-void __catalyst__qis__RZ(double, QUBIT *, const Modifiers *);
-void __catalyst__qis__Rot(double, double, double, QUBIT *, const Modifiers *);
-void __catalyst__qis__CNOT(QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__CY(QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__CZ(QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__SWAP(QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__IsingXX(double, QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__IsingYY(double, QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__IsingXY(double, QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__IsingZZ(double, QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__ControlledPhaseShift(double, QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__CRX(double, QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__CRY(double, QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__CRZ(double, QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__CRot(double, double, double, QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__CSWAP(QUBIT *, QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__Toffoli(QUBIT *, QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__MultiRZ(double, const Modifiers *, int64_t, /*qubits*/...);
-void __catalyst__qis__GlobalPhase(double, const Modifiers *);
-void __catalyst__qis__ISWAP(QUBIT *, QUBIT *, const Modifiers *);
-void __catalyst__qis__PSWAP(double, QUBIT *, QUBIT *, const Modifiers *);
-
-// Struct pointer arguments for these instructions represent real arguments,
-// as passing structs by value is too unreliable / compiler dependant.
-void __catalyst__qis__QubitUnitary(MemRefT_CplxT_double_2d *, const Modifiers *, int64_t,
-                                   /*qubits*/...);
-
-ObsIdType __catalyst__qis__NamedObs(int64_t, QUBIT *);
-ObsIdType __catalyst__qis__HermitianObs(MemRefT_CplxT_double_2d *, int64_t, /*qubits*/...);
-ObsIdType __catalyst__qis__TensorObs(int64_t, /*obsKeys*/...);
-ObsIdType __catalyst__qis__HamiltonianObs(MemRefT_double_1d *, int64_t, /*obsKeys*/...);
-
-// Struct pointers arguments here represent return values.
-RESULT *__catalyst__qis__Measure(QUBIT *, int32_t);
-double __catalyst__qis__Expval(ObsIdType);
-double __catalyst__qis__Variance(ObsIdType);
-void __catalyst__qis__Probs(MemRefT_double_1d *, int64_t, /*qubits*/...);
-void __catalyst__qis__Sample(MemRefT_double_2d *, int64_t, /*qubits*/...);
-void __catalyst__qis__Counts(PairT_MemRefT_double_int64_1d *, int64_t, /*qubits*/...);
-void __catalyst__qis__State(MemRefT_CplxT_double_1d *, int64_t, /*qubits*/...);
-void __catalyst__qis__Gradient(int64_t, /*results*/...);
-void __catalyst__qis__Gradient_params(MemRefT_int64_1d *, int64_t, /*results*/...);
-
-void __catalyst__host__rt__unrecoverable_error();
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif
diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp
index 4d2736f..80ee38c 100644
--- a/src/qirlightning/simple_demo/test_rt_device.cpp
+++ b/src/qirlightning/simple_demo/test_rt_device.cpp
@@ -3,6 +3,7 @@
 #include "QuantumDevice.hpp"
 
 // Runtime libraries (kokkos/GPU/qubit etc.)
+// Update these paths to point to the correct library
 #define RTDLIB                                                         \
     "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \
     "pennylane_lightning/liblightning_gpu_catalyst.so";

From 88dbe30a3e22bcec0af471d690d3608442a45e25 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 12 Mar 2025 21:52:28 +0000
Subject: [PATCH 44/64] update readme

---
 src/qirlightning/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md
index e3498ff..5b5bb39 100644
--- a/src/qirlightning/README.md
+++ b/src/qirlightning/README.md
@@ -1,4 +1,4 @@
-# Lightning backend
+# QIR-EE with Lightning simulator backend
 
 ## Installing a lightning simulator
 
@@ -37,7 +37,7 @@ You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for ligh
 
 The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightning) contains information on how to install Lightning simulators from source. This will necessary for e.g. Kokkos with HIP backend.
 
-## Compilation
+## Compile QIR-EE with Lightning backend
 
 - Set `QIREE_USE_LIGHTNING` to `ON` in `qiree/CMakeLists.txt`
 - Specify the simulator path and name in `qiree/src/qirlightning/CMakeLists`, set:

From 99800fa6c769e13c016cab5b80537a5a715ef5a5 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 12 Mar 2025 21:58:14 +0000
Subject: [PATCH 45/64] update readme

---
 src/qirlightning/README.md | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md
index 5b5bb39..12b4644 100644
--- a/src/qirlightning/README.md
+++ b/src/qirlightning/README.md
@@ -66,10 +66,3 @@ $ ./bin/qir-lightning ../examples/bell.ll -s 100
 {"00":43,"11":57}
 ```
 
-## Running on GPU
-
-To run on other devices, e.g. lightning.gpu, you need to change:
-- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu`
-- replace `RTDLIB` from `kokkos` to `gpu`
-- replace `RTDDEVICE` from `Kokkos` to `GPU`
-- Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`

From 3eb300210562dba6fe921e2c59a6c54b3156812f Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Thu, 13 Mar 2025 19:06:22 +0000
Subject: [PATCH 46/64] update

---
 app/CMakeLists.txt                            |  3 -
 src/qirlightning/CMakeLists.txt               | 43 +++++++++--
 src/qirlightning/LightningQuantum.cc          | 58 +++++++--------
 src/qirlightning/LightningQuantum.hh          |  1 +
 src/qirlightning/README.md                    | 11 +--
 src/qirlightning/simple_demo/README.md        |  7 +-
 .../snapshot_catalyst_runtime}/README.rst     |  0
 .../include/DataView.hpp                      |  0
 .../include/Exception.hpp                     |  0
 .../include/QuantumDevice.hpp                 |  0
 .../include/Types.h                           |  0
 .../simple_demo/test_rt_device.cpp            |  4 +-
 src/qirlightning/support_catalyst.cmake       | 74 +++++++++++++++++++
 13 files changed, 152 insertions(+), 49 deletions(-)
 rename src/qirlightning/{catalyst_runtime => simple_demo/snapshot_catalyst_runtime}/README.rst (100%)
 rename src/qirlightning/{catalyst_runtime => simple_demo/snapshot_catalyst_runtime}/include/DataView.hpp (100%)
 rename src/qirlightning/{catalyst_runtime => simple_demo/snapshot_catalyst_runtime}/include/Exception.hpp (100%)
 rename src/qirlightning/{catalyst_runtime => simple_demo/snapshot_catalyst_runtime}/include/QuantumDevice.hpp (100%)
 rename src/qirlightning/{catalyst_runtime => simple_demo/snapshot_catalyst_runtime}/include/Types.h (100%)
 create mode 100644 src/qirlightning/support_catalyst.cmake

diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt
index 32bef70..3529ba3 100644
--- a/app/CMakeLists.txt
+++ b/app/CMakeLists.txt
@@ -40,9 +40,6 @@ endif()
 #-----------------------------------------------------------------------------#
 
 if(QIREE_USE_LIGHTNING)
-
-  # Include directories for catalyst_runtime
-  include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../src/qirlightning/catalyst_runtime/include")
   qiree_add_executable(qir-lightning
     qir-lightning.cc
   )
diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt
index 31cfd8c..925f33c 100644
--- a/src/qirlightning/CMakeLists.txt
+++ b/src/qirlightning/CMakeLists.txt
@@ -4,16 +4,47 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #----------------------------------------------------------------------------#
 
-# Include directories for catalyst_runtime
-include_directories("${CMAKE_CURRENT_SOURCE_DIR}/catalyst_runtime/include")
+# Fetch Catalyst runtime include files
+include(FetchContent)
+
+include("${CMAKE_CURRENT_SOURCE_DIR}/support_catalyst.cmake")
+FindCatalyst(qirlightning)
+
+# Set the path to the lightning simulator shared library
+if(DEFINED ENV{LIGHTNING_SIM_PATH})
+  set(RTDLIB_PATH "$ENV{LIGHTNING_SIM_PATH}")
+  message(STATUS "RTDLIB_PATH set from environment variable LIGHTNING_SIM_PATH: ${RTDLIB_PATH}")
+else()
+  # Update hard coded path is not found in environment
+  set(RTDLIB_PATH "/home/joseph/work/qiree/pennylane-lightning/build_lightning_kokkos/liblightning_kokkos_catalyst.so")
+  message(STATUS "RTDLIB_PATH set to default value: ${RTDLIB_PATH}")
+endif()
+
+# Set the device name for the lightning simulator
+execute_process(
+    COMMAND nm -DC "${RTDLIB_PATH}" | grep " Factory"
+    OUTPUT_VARIABLE GREP_OUTPUT
+    ERROR_QUIET
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+  )
+
+if(GREP_OUTPUT)
+  string(REGEX MATCH "T (.*)Factory" SYMBOL_MATCH "${GREP_OUTPUT}")
+  if(SYMBOL_MATCH)
+    string(REGEX REPLACE "T (.*)Factory" "\\1" RTDDEVICE_NAME "${SYMBOL_MATCH}")
+    message(STATUS "Found Lightning Simulator. Extracted RTDDEVICE_NAME: ${RTDDEVICE_NAME}")
+  else()
+    message(WARNING "Symbol 'Factory' found, but regex failed to extract.")
+  endif()
+else()
+  message(WARNING "Symbol 'Factory' not found in ${RTDLIB_PATH}")
+endif()
 
-set(RTDLIB_PATH "/home/joseph/work/qiree/pennylane-lightning/build_lightning_qubit/liblightning_qubit_catalyst.so")
-set(RTDDEVICE_NAME "LightningSimulator")
 
 # Adding lightning as a library to qiree
 qiree_add_library(qirlightning
-  LightningQuantum.cc
-  LightningDefaultRuntime.cc
+LightningQuantum.cc
+LightningDefaultRuntime.cc
 )
 
 target_compile_definitions(qirlightning PRIVATE
diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
index b8bc4ad..b4597b7 100644
--- a/src/qirlightning/LightningQuantum.cc
+++ b/src/qirlightning/LightningQuantum.cc
@@ -31,27 +31,6 @@ using namespace Catalyst::Runtime;
  */
 LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed)
 {
-}
-
-//---------------------------------------------------------------------------//
-//! Default destructor
-LightningQuantum::~LightningQuantum() = default;
-
-//---------------------------------------------------------------------------//
-/*!
- * Prepare to build a quantum circuit for an entry point
- */
-void LightningQuantum::set_up(EntryPointAttrs const& attrs)
-{
-    QIREE_VALIDATE(attrs.required_num_qubits > 0,
-                   << "input is not a quantum program");
-    num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
-    results_.resize(attrs.required_num_results);
-
-
-    // We load the library every time because we currently have an issue 
-    // with releasing qubits in Catalyst.
-    // Once that is fixed, this can go to the constructor to execute once
     std::string rtd_lib = RTDLIB;
     std::string rtd_device = RTDDEVICE;
     std::string kwargs = {};
@@ -65,17 +44,40 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs)
 
     // Find device factory
     std::string factory_name = rtd_device + "Factory";
-    void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str());
+    factory_f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str());
 
-    if (!f_ptr)
+    if (!factory_f_ptr)
     {
         dlclose(rtd_dylib_handler);
         throw std::runtime_error("Failed to find factory function: "
                                  + factory_name);
     }
+}
+
+//---------------------------------------------------------------------------//
+//! Default destructor
+LightningQuantum::~LightningQuantum() {
+
+    if (rtd_dylib_handler)
+    {
+        dlclose(rtd_dylib_handler);
+    };
+};
+
+//---------------------------------------------------------------------------//
+/*!
+ * Prepare to build a quantum circuit for an entry point
+ */
+void LightningQuantum::set_up(EntryPointAttrs const& attrs)
+{
+    QIREE_VALIDATE(attrs.required_num_qubits > 0,
+                   << "input is not a quantum program");
+    num_qubits_ = attrs.required_num_qubits;  // Set the number of qubits
+    results_.resize(attrs.required_num_results);
+
     std::string rtd_kwargs = {};
     rtd_qdevice = std::unique_ptr<QuantumDevice>(
-        reinterpret_cast<decltype(GenericDeviceFactory)*>(f_ptr)(
+        reinterpret_cast<decltype(GenericDeviceFactory)*>(factory_f_ptr)(
             rtd_kwargs.c_str()));
 
     rtd_qdevice->AllocateQubits(num_qubits_);
@@ -87,11 +89,6 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs)
  */
 void LightningQuantum::tear_down()
 {
-    // This should go to the destructor once we fix the issue with releasing qubits
-    if (rtd_dylib_handler)
-    {
-        dlclose(rtd_dylib_handler);
-    };
   
 }
 
@@ -103,7 +100,8 @@ void LightningQuantum::reset(Qubit q)
 {
     std::vector<int8_t> data = {0}; 
     DataView<int8_t, 1> state(data);
-    std::vector<QubitIdType> wires = {static_cast<intptr_t>(q.value)};    rtd_qdevice->SetBasisState(state, wires);    
+    std::vector<QubitIdType> wires = {static_cast<intptr_t>(q.value)};    
+    rtd_qdevice->SetBasisState(state, wires);    
 }
 
 //----------------------------------------------------------------------------//
diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh
index 2c2728b..96d49d5 100644
--- a/src/qirlightning/LightningQuantum.hh
+++ b/src/qirlightning/LightningQuantum.hh
@@ -100,6 +100,7 @@ class LightningQuantum final : virtual public QuantumNotImpl
     std::ostream& output_;
     unsigned long int seed_{};
     void* rtd_dylib_handler;
+    void* factory_f_ptr;
     std::unique_ptr<Catalyst::Runtime::QuantumDevice> rtd_qdevice;
     std::vector<bool> results_;
 
diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md
index 12b4644..633aeea 100644
--- a/src/qirlightning/README.md
+++ b/src/qirlightning/README.md
@@ -40,10 +40,11 @@ The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightni
 ## Compile QIR-EE with Lightning backend
 
 - Set `QIREE_USE_LIGHTNING` to `ON` in `qiree/CMakeLists.txt`
-- Specify the simulator path and name in `qiree/src/qirlightning/CMakeLists`, set:
-    - `RTDLIB_PATH` to the path of the simulator `.so`
-    - `RTDDEVICE_NAME` to `LightningSimulator`/`LightningKokkosSimulator`/`LightningGPUSimulator`
-These could also be set in cmake using the variables `-D...`
+- Set the environment variable `LIGHTNING_SIM_PATH` to the shared object of the Lightning Simulator, e.g.
+
+```
+export LIGHTNING_SIM_PATH=/home/joseph/work/qiree/pennylane-lightning/build_lightning_qubit/liblightning_qubit_catalyst.so
+```
 
 Note: when running on GPU, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`
 
@@ -59,7 +60,7 @@ make
 
 ## Running the example
 
-To run:
+To run (in the `build` directory):
 
 ```
 $ ./bin/qir-lightning ../examples/bell.ll -s 100
diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md
index 7aecc9d..2e328b1 100644
--- a/src/qirlightning/simple_demo/README.md
+++ b/src/qirlightning/simple_demo/README.md
@@ -2,13 +2,13 @@
 
 This is a super simple demo for driving Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present). 
 
-The only extra header files required are the `../catalyst_runtime/include`, which contains the include files from the [Catalyst Runtime ](https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include) (for the QuantumDevice interface).
+Some Catalyst include files are copied here for convenience - they are in `./snapshot_catalyst_runtime/include`. These are required for the QuantumDevice interface. For the qiree source, these files are fetched automatically during CMake, and these are not used.
 
 ## Installing a lightning simulator
 
 When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. 
 
-Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
+To get started, run `pip install pennylane` or `pip install pennylane-lightning` - this will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
 
 Example:
 ```
@@ -36,7 +36,7 @@ You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for ligh
 To compile:
 
 ```
-$ clang++ --std=c++20 test_rt_device.cpp -I../catalyst_runtime/include -o test_rt_device.out
+$ clang++ --std=c++20 test_rt_device.cpp -I./snapshot_catalyst_runtime/include -o test_rt_device.out
 ```
 
 ## Running the example
@@ -61,6 +61,7 @@ Measure on wire 0 = 0
 
 To run on other devices, e.g. lightning.gpu, you need to change:
 - Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu`
+In the c++ file:
 - replace `RTDLIB` from `kokkos` to `gpu`
 - replace `RTDDEVICE` from `Kokkos` to `GPU`
 - Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`
diff --git a/src/qirlightning/catalyst_runtime/README.rst b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst
similarity index 100%
rename from src/qirlightning/catalyst_runtime/README.rst
rename to src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst
diff --git a/src/qirlightning/catalyst_runtime/include/DataView.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp
similarity index 100%
rename from src/qirlightning/catalyst_runtime/include/DataView.hpp
rename to src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp
diff --git a/src/qirlightning/catalyst_runtime/include/Exception.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp
similarity index 100%
rename from src/qirlightning/catalyst_runtime/include/Exception.hpp
rename to src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp
diff --git a/src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp
similarity index 100%
rename from src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp
rename to src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp
diff --git a/src/qirlightning/catalyst_runtime/include/Types.h b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h
similarity index 100%
rename from src/qirlightning/catalyst_runtime/include/Types.h
rename to src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h
diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp
index 80ee38c..091b461 100644
--- a/src/qirlightning/simple_demo/test_rt_device.cpp
+++ b/src/qirlightning/simple_demo/test_rt_device.cpp
@@ -6,8 +6,8 @@
 // Update these paths to point to the correct library
 #define RTDLIB                                                         \
     "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \
-    "pennylane_lightning/liblightning_gpu_catalyst.so";
-#define RTDDEVICE "LightningGPUSimulator";
+    "pennylane_lightning/liblightning_kokkos_catalyst.so";
+#define RTDDEVICE "LightningKokkosSimulator";
 
 extern "C" Catalyst::Runtime::QuantumDevice*
 GenericDeviceFactory(char const* kwargs);
diff --git a/src/qirlightning/support_catalyst.cmake b/src/qirlightning/support_catalyst.cmake
new file mode 100644
index 0000000..5932f06
--- /dev/null
+++ b/src/qirlightning/support_catalyst.cmake
@@ -0,0 +1,74 @@
+###############################################################################################
+# This file provides macros to process Catalyst.
+###############################################################################################
+
+# Include this only once
+include_guard()
+
+# FindCatalyst(<target_name>): provide the Catalyst runtime headers, either from a local
+# checkout (LIGHTNING_CATALYST_SRC_PATH, absolute) or by downloading them at CATALYST_GIT_TAG.
+macro(FindCatalyst target_name)
+    if(LIGHTNING_CATALYST_SRC_PATH)
+        if(NOT IS_ABSOLUTE "${LIGHTNING_CATALYST_SRC_PATH}")
+            message(FATAL_ERROR " LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH} must be set to an absolute path")
+        endif()
+        if(CATALYST_GIT_TAG)
+            message(WARNING "Setting `LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH}` overrides `CATALYST_GIT_TAG=${CATALYST_GIT_TAG}`")
+        endif()
+
+        # Acquire local git hash and use for CATALYST_GIT_TAG (newline from git stripped)
+        execute_process(COMMAND git rev-parse --short HEAD
+            WORKING_DIRECTORY "${LIGHTNING_CATALYST_SRC_PATH}"
+            OUTPUT_VARIABLE CATALYST_GIT_TAG
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+        message(INFO " Building against local Catalyst - path: ${LIGHTNING_CATALYST_SRC_PATH} - GIT TAG: ${CATALYST_GIT_TAG}")
+
+        target_include_directories(${target_name} PUBLIC "${LIGHTNING_CATALYST_SRC_PATH}/runtime/lib/backend/common")
+        target_include_directories(${target_name} PUBLIC "${LIGHTNING_CATALYST_SRC_PATH}/runtime/include")
+
+    else()
+        if(NOT CATALYST_GIT_TAG)
+            set(CATALYST_GIT_TAG "main" CACHE STRING "GIT_TAG value to build Catalyst")
+        endif()
+        message(INFO " Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}")
+
+        # Fetching /lib/backend/common hpp headers
+        set(LIB_BACKEND_COMMON_HEADERS  CacheManager.hpp
+                                    QubitManager.hpp
+                                    Utils.hpp
+        )
+
+        foreach(HEADER ${LIB_BACKEND_COMMON_HEADERS})
+            string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME ${HEADER})  # file name without extension
+            FetchContent_Declare(
+                ${HEADER_NAME}
+                URL                 https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/lib/backend/common/${HEADER}
+                DOWNLOAD_NO_EXTRACT True
+                SOURCE_DIR          ../../include
+            )
+            FetchContent_MakeAvailable(${HEADER_NAME})
+        endforeach()
+
+        # Fetching include hpp headers
+        set(INCLUDE_HEADERS DataView.hpp
+                        Exception.hpp
+                        QuantumDevice.hpp
+                        RuntimeCAPI.h
+                        Types.h
+        )
+
+        foreach(HEADER ${INCLUDE_HEADERS})
+            string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME ${HEADER})  # file name without extension
+            FetchContent_Declare(
+                ${HEADER_NAME}
+                URL                 https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/include/${HEADER}
+                DOWNLOAD_NO_EXTRACT True
+                SOURCE_DIR          ../../include
+            )
+            FetchContent_MakeAvailable(${HEADER_NAME})
+        endforeach()
+
+        # NOTE: this branch adds no include directory to the target; target_name is unused here.
+    endif()
+endmacro()

From b8ec0711c0e1dee26e4b62d4f7b11b31c6a1c7e6 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Thu, 13 Mar 2025 19:12:05 +0000
Subject: [PATCH 47/64] remove paths

---
 src/qirlightning/README.md                      | 6 +++---
 src/qirlightning/simple_demo/README.md          | 4 ++--
 src/qirlightning/simple_demo/test_rt_device.cpp | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md
index 633aeea..45d47ae 100644
--- a/src/qirlightning/README.md
+++ b/src/qirlightning/README.md
@@ -18,7 +18,7 @@ Home-page: https://github.com/PennyLaneAI/pennylane-lightning
 Author: 
 Author-email: 
 License: Apache License 2.0
-Location: /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages
+Location: <site packages path>
 Requires: pennylane, pennylane-lightning
 ```
 Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
@@ -27,7 +27,7 @@ When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-l
 
 Example:
 ```
-$ ls /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning
+$ ls <site packages path>
 ... liblightning_kokkos_catalyst.so ...
 ```
 
@@ -43,7 +43,7 @@ The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightni
 - Set the environment variable `LIGHTNING_SIM_PATH` to the shared object of the Lightning Simulator, e.g.
 
 ```
-export LIGHTNING_SIM_PATH=/home/joseph/work/qiree/pennylane-lightning/build_lightning_qubit/liblightning_qubit_catalyst.so
+export LIGHTNING_SIM_PATH=<site packages path>/pennylane_lightning/liblightning_qubit_catalyst.so
 ```
 
 Note: when running on GPU, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`
diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md
index 2e328b1..1d3e41f 100644
--- a/src/qirlightning/simple_demo/README.md
+++ b/src/qirlightning/simple_demo/README.md
@@ -22,10 +22,10 @@ Home-page: https://github.com/PennyLaneAI/pennylane-lightning
 Author: 
 Author-email: 
 License: Apache License 2.0
-Location: /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages
+Location: <site packages path>
 Requires: pennylane, pennylane-lightning
 
-$ ls /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning
+$ ls <site packages path>/pennylane_lightning
 ... liblightning_kokkos_catalyst.so ...
 ```
 
diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp
index 091b461..4ba8f75 100644
--- a/src/qirlightning/simple_demo/test_rt_device.cpp
+++ b/src/qirlightning/simple_demo/test_rt_device.cpp
@@ -5,7 +5,7 @@
 // Runtime libraries (kokkos/GPU/qubit etc.)
 // Update these paths to point to the correct library
 #define RTDLIB                                                         \
-    "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \
+    "<UPDATE: site packages path>" \
     "pennylane_lightning/liblightning_kokkos_catalyst.so";
 #define RTDDEVICE "LightningKokkosSimulator";
 

From ce5d7642f76983fc38cbc004af261c2cc1d77f4b Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Thu, 13 Mar 2025 19:17:50 +0000
Subject: [PATCH 48/64] update

---
 src/qirlightning/simple_demo/README.md          | 2 ++
 src/qirlightning/simple_demo/test_rt_device.cpp | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md
index 1d3e41f..bbd7d27 100644
--- a/src/qirlightning/simple_demo/README.md
+++ b/src/qirlightning/simple_demo/README.md
@@ -33,6 +33,8 @@ You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for ligh
 
 ## Compilation
 
+First update the `RTDLIB` in `test_rt_device.cpp` to the local path where lightning is installed (i.e. `<site packages path>` from above).
+
 To compile:
 
 ```
diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp
index 4ba8f75..c50ab92 100644
--- a/src/qirlightning/simple_demo/test_rt_device.cpp
+++ b/src/qirlightning/simple_demo/test_rt_device.cpp
@@ -5,7 +5,7 @@
 // Runtime libraries (kokkos/GPU/qubit etc.)
 // Update these paths to point to the correct library
 #define RTDLIB                                                         \
-    "<UPDATE: site packages path>" \
+    "<UPDATE: site packages path>/" \
     "pennylane_lightning/liblightning_kokkos_catalyst.so";
 #define RTDDEVICE "LightningKokkosSimulator";
 

From b7b6aeeb6d453fc463db9583de7607d9da548eb7 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Mon, 5 May 2025 15:39:10 +0000
Subject: [PATCH 49/64] update installation instructions

---
 src/qirlightning/CMakeLists.txt         | 9 ++++-----
 src/qirlightning/README.md              | 6 ++++--
 src/qirlightning/support_catalyst.cmake | 4 +++-
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt
index 925f33c..2edbdf9 100644
--- a/src/qirlightning/CMakeLists.txt
+++ b/src/qirlightning/CMakeLists.txt
@@ -15,9 +15,8 @@ if(DEFINED ENV{LIGHTNING_SIM_PATH})
   set(RTDLIB_PATH "$ENV{LIGHTNING_SIM_PATH}")
   message(STATUS "RTDLIB_PATH set from environment variable LIGHTNING_SIM_PATH: ${RTDLIB_PATH}")
 else()
-  # Update hard coded path is not found in environment
-  set(RTDLIB_PATH "/home/joseph/work/qiree/pennylane-lightning/build_lightning_kokkos/liblightning_kokkos_catalyst.so")
-  message(STATUS "RTDLIB_PATH set to default value: ${RTDLIB_PATH}")
+  # Throw an error if the environment variable is not defined
+  message(FATAL_ERROR "Environment variable LIGHTNING_SIM_PATH is not defined. Please set it to the path of the Lightning simulator shared library.")
 endif()
 
 # Set the device name for the lightning simulator
@@ -34,10 +33,10 @@ if(GREP_OUTPUT)
     string(REGEX REPLACE "T (.*)Factory" "\\1" RTDDEVICE_NAME "${SYMBOL_MATCH}")
     message(STATUS "Found Lightning Simulator. Extracted RTDDEVICE_NAME: ${RTDDEVICE_NAME}")
   else()
-    message(WARNING "Symbol 'Factory' found, but regex failed to extract.")
+    message(FATAL_ERROR "Symbol 'Factory' found, but regex failed to extract.")
   endif()
 else()
-  message(WARNING "Symbol 'Factory' not found in ${RTDLIB_PATH}")
+  message(FATAL_ERROR "Symbol 'Factory' not found in ${RTDLIB_PATH}. Please ensure LIGHTNING_SIM_PATH is set correctly.")
 endif()
 
 
diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md
index 45d47ae..7719b7e 100644
--- a/src/qirlightning/README.md
+++ b/src/qirlightning/README.md
@@ -43,10 +43,12 @@ The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightni
 - Set the environment variable `LIGHTNING_SIM_PATH` to the shared object of the Lightning Simulator, e.g.
 
 ```
-export LIGHTNING_SIM_PATH=<site packages path>/pennylane_lightning/liblightning_qubit_catalyst.so
+export LIGHTNING_SIM_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')")/liblightning_kokkos_catalyst.so
 ```
 
-Note: when running on GPU, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`
+Note: 
+- replace `liblightning_kokkos_catalyst.so` with `liblightning_qubit_catalyst.so` or `liblightning_gpu_catalyst.so` if required.
+- when running on `GPU`, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`
 
 To compile:
 
diff --git a/src/qirlightning/support_catalyst.cmake b/src/qirlightning/support_catalyst.cmake
index 5932f06..f46363c 100644
--- a/src/qirlightning/support_catalyst.cmake
+++ b/src/qirlightning/support_catalyst.cmake
@@ -26,7 +26,9 @@ macro(FindCatalyst target_name)
 
     else()
         if(NOT CATALYST_GIT_TAG)
-            set(CATALYST_GIT_TAG "main" CACHE STRING "GIT_TAG value to build Catalyst")
+            # v0.41 of Lightning requires v0.11.0 of Catalyst
+            # If using latest Lightning, use main branch of Catalyst
+            set(CATALYST_GIT_TAG "v0.11.0" CACHE STRING "GIT_TAG value to build Catalyst")
         endif()
         message(INFO " Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}")
 

From d15106b7002d2d9537d9cacf46a0c2a893940da7 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Fri, 30 May 2025 18:48:11 +0000
Subject: [PATCH 50/64] update tests and GH workflow

---
 .github/workflows/build-lightning.yml      |  98 +++++++++++++++++
 CMakeLists.txt                             |   2 +-
 test/CMakeLists.txt                        |  10 ++
 test/qirlightning/LightningQuantum.test.cc | 118 +++++++++++++++++++++
 4 files changed, 227 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/build-lightning.yml
 create mode 100644 test/qirlightning/LightningQuantum.test.cc

diff --git a/.github/workflows/build-lightning.yml b/.github/workflows/build-lightning.yml
new file mode 100644
index 0000000..052830c
--- /dev/null
+++ b/.github/workflows/build-lightning.yml
@@ -0,0 +1,98 @@
+# Build directly on the GitHub runner with caching
+name: build-lightning
+on:
+  workflow_dispatch:
+  workflow_call:
+
+concurrency:
+  group: build-lightning-${{github.ref}}-${{github.event.pull_request.number || github.run_number}}-${{github.workflow}}
+  cancel-in-progress: true
+
+jobs:
+  linux:
+    name: ${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-llvm${{matrix.llvm}}
+    strategy:
+      matrix:
+        include:
+          - runner: jammy
+            compiler: gcc
+            version: 12
+            llvm: 14
+          - runner: jammy
+            compiler: clang
+            version: 15
+            llvm: 15
+    runs-on: >-
+      ${{  matrix.runner == 'focal' && 'ubuntu-20.04'
+        || matrix.runner == 'jammy' && 'ubuntu-22.04'
+        || null
+      }}
+    env:
+      CCACHE_DIR: "${{github.workspace}}/.ccache"
+      CCACHE_MAXSIZE: "10G"
+      CC: ${{matrix.compiler}}-${{matrix.version}}
+      CXX: ${{matrix.compiler == 'gcc' && 'g++' || 'clang++'}}-${{matrix.version}}
+    steps:
+      - uses: actions/setup-python@v5
+        name: Install Python
+        with:
+          python-version: '3.10'
+      - name: Install dependencies
+        run: |
+          sudo apt-get -q -y update
+          sudo apt-get -q -y install \
+            ccache cmake ninja-build libgtest-dev \
+            llvm-${{matrix.llvm}}-dev \
+            ${{matrix.compiler}}-${{matrix.version}} \
+            ${{matrix.compiler == 'gcc' && format('g++-{0}', matrix.version) || ''}}
+          echo "Installed toolchain:"
+          ld --version | head -1
+          $CC --version | head -1
+          $CXX --version | head -1
+          llvm-config-${{matrix.llvm}} --version | head -1
+          python -m pip install pennylane-lightning
+      - name: Check out
+        uses: actions/checkout@v4
+      - name: Set up ccache
+        uses: actions/cache@v4
+        with:
+          path: ${{env.CCACHE_DIR}}
+          key: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-${{github.run_id}}
+          restore-keys: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}
+      - name: Zero ccache stats
+        run: |
+          ccache -z
+      - name: Configure
+        run: |
+          export LIGHTNING_SIM_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')")/liblightning_qubit_catalyst.so
+          mkdir build && cd build
+          cmake -GNinja \
+            -DQIREE_GIT_DESCRIBE="${{github.event.pull_request
+              && format(';-pr.{0};', github.event.pull_request.number)
+              || format(';-{0};', github.ref_name)}}" \
+            -DQIREE_BUILD_TESTS:BOOL=ON \
+            -DQIREE_DEBUG:BOOL=ON \
+            -DQIREE_USE_XACC:BOOL=OFF \
+            -DQIREE_USE_LIGHTNING:BOOL=ON \
+            -DCMAKE_BUILD_TYPE="Release" \
+            -DCMAKE_INSTALL_PREFIX="${{github.workspace}}/install" \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+            -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic" \
+            ..
+      - name: Build all
+        working-directory: build
+        run: |
+          ninja
+      - name: Run tests
+        working-directory: build
+        run: |
+          ctest --parallel 2 --timeout 15 --output-on-failure
+      - name: Install
+        working-directory: build
+        run: |
+          ninja install
+      - name: Show ccache stats
+        run: |
+          ccache -s
+
+# vim: set nowrap tw=100:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5ecc4a8..07254bd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,7 +39,7 @@ option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" ON)
 option(QIREE_BUILD_EXAMPLES "Build QIR-EE examples" OFF)
 option(QIREE_USE_QSIM "Download and build Google qsim backend" ON)
 option(QIREE_USE_XACC "Build XACC interface" OFF)
-option(QIREE_USE_LIGHTNING "Build Pennylane Lightning backend" ON)
+option(QIREE_USE_LIGHTNING "Build Pennylane Lightning backend" OFF)
 
 qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS})
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index b224fc8..73d0b81 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -66,3 +66,13 @@ if(QIREE_USE_QSIM)
 endif()
 
 #---------------------------------------------------------------------------##
+
+#---------------------------------------------------------------------------##
+# QIRLIGHTNING TESTS
+#---------------------------------------------------------------------------##
+
+if(QIREE_USE_LIGHTNING)
+  qiree_add_test(qirlightning LightningQuantum)
+endif()
+
+#---------------------------------------------------------------------------##
diff --git a/test/qirlightning/LightningQuantum.test.cc b/test/qirlightning/LightningQuantum.test.cc
new file mode 100644
index 0000000..1d74b29
--- /dev/null
+++ b/test/qirlightning/LightningQuantum.test.cc
@@ -0,0 +1,118 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirlightning/LightningQuantum.test.cc
+//---------------------------------------------------------------------------//
+#include "qirlightning/LightningQuantum.hh"
+
+#include <regex>
+
+#include "qiree/Types.hh"
+#include "qiree_test.hh"
+#include "qirlightning/LightningDefaultRuntime.hh"
+
+namespace qiree
+{
+namespace test
+{
+//---------------------------------------------------------------------------//
+
+class LightningQuantumTest : public ::qiree::test::Test
+{
+  protected:
+    void SetUp() override {}
+
+    static std::string clean_output(std::string&& s)
+    {
+        std::string result = std::move(s);
+        static std::regex const subs_ptr("0x[0-9a-f]+");
+        result = std::regex_replace(result, subs_ptr, "0x0");
+        return result;
+    }
+};
+
+TEST_F(LightningQuantumTest, sim_dynamicbv)
+{
+    using Q = Qubit;
+    using R = Result;
+
+    std::ostringstream os;
+    os << '\n';
+
+    // Create a simulator that will write to the string stream
+    LightningQuantum lightning_sim{os, 0};
+    LightningDefaultRuntime lightning_rt{os, lightning_sim};
+    // Call functions in the same sequence that dynamicbv.ll would
+    lightning_sim.set_up([] {
+        EntryPointAttrs attrs;
+        attrs.required_num_qubits = 2;
+        attrs.required_num_results = 2;
+        return attrs;
+    }());
+    ASSERT_EQ(2, lightning_sim.num_qubits());
+    ASSERT_EQ(2, lightning_sim.num_results());
+
+    lightning_sim.h(Q{0});
+    lightning_sim.x(Q{1});
+    lightning_sim.h(Q{1});
+    lightning_sim.cnot(Q{0}, Q{1});
+    lightning_sim.h(Q{0});
+    lightning_sim.mz(Q{0}, R{0});
+    lightning_sim.read_result(R{0});
+    lightning_sim.mz(Q{1}, R{1});
+    lightning_sim.read_result(R{1});
+    lightning_rt.array_record_output(2, "");
+    lightning_rt.result_record_output(R{0}, "");
+    lightning_rt.result_record_output(R{1}, "");
+    //EXPECT_EQ(QState::one, lightning_sim.get_result(R{0}));
+    //EXPECT_EQ(QState::one, lightning_sim.get_result(R{1}));
+
+
+    lightning_sim.tear_down();
+}
+
+TEST_F(LightningQuantumTest, result_order)
+{
+    using Q = Qubit;
+    using R = Result;
+
+    std::ostringstream os;
+    os << '\n';
+
+    // Create a simulator that will write to the string stream
+    LightningQuantum qis{os, 0};
+    LightningDefaultRuntime rt{os, qis};
+
+    // Call functions in the same sequence that dynamicbv.ll would
+    qis.set_up([] {
+        EntryPointAttrs attrs;
+        attrs.required_num_qubits = 4;
+        attrs.required_num_results = 3;
+        return attrs;
+    }());
+    qis.mz(Q{0}, R{2});
+    qis.mz(Q{1}, R{1});
+    qis.mz(Q{2}, R{0});
+    std::vector<bool> expected;
+    expected.push_back(static_cast<bool>(qis.get_result(R{2})));
+    expected.push_back(static_cast<bool>(qis.get_result(R{0})));
+    expected.push_back(static_cast<bool>(qis.get_result(R{1})));
+    // So the internal result "buffer" is now {true, false, true}
+    rt.array_record_output(3, "array");
+    rt.result_record_output(R{2}, "foo");  // pushes true
+    rt.result_record_output(R{0}, "bar");  // pushes true
+    rt.result_record_output(R{1}, "baz");  // pushes false
+
+    auto const& result = rt.result();
+    EXPECT_EQ("array", result.container_label());
+    EXPECT_EQ(expected, result.bits());
+    EXPECT_EQ((std::vector<std::string>{"foo", "bar", "baz"}),
+              result.entry_labels());
+
+    qis.tear_down();
+}
+//---------------------------------------------------------------------------//
+}  // namespace test
+}  // namespace qiree

From 227f5c66c463ad6ff2125e28097814dba55264ed Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Tue, 17 Jun 2025 21:20:21 +0000
Subject: [PATCH 51/64] update reset

---
 src/qirlightning/LightningQuantum.cc | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
index b4597b7..a774543 100644
--- a/src/qirlightning/LightningQuantum.cc
+++ b/src/qirlightning/LightningQuantum.cc
@@ -98,10 +98,7 @@ void LightningQuantum::tear_down()
  */
 void LightningQuantum::reset(Qubit q)
 {
-    std::vector<int8_t> data = {0}; 
-    DataView<int8_t, 1> state(data);
-    std::vector<QubitIdType> wires = {static_cast<intptr_t>(q.value)};    
-    rtd_qdevice->SetBasisState(state, wires);    
+    q.value = 0;
 }
 
 //----------------------------------------------------------------------------//

From cf66bdb0e0e1ff482d48363ef28d134ba5d27e1f Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Tue, 24 Jun 2025 13:50:45 +0000
Subject: [PATCH 52/64] single-result update

---
 app/qir-lightning.cc                          |  4 +-
 src/qirlightning/CMakeLists.txt               |  2 +-
 src/qirlightning/LightningDefaultRuntime.hh   | 77 -------------------
 src/qirlightning/LightningQuantum.cc          |  6 +-
 src/qirlightning/LightningQuantum.hh          | 15 +---
 ...gDefaultRuntime.cc => LightningRuntime.cc} | 16 +++-
 src/qirlightning/LightningRuntime.hh          | 37 +++++++++
 test/qirlightning/LightningQuantum.test.cc    | 16 ++--
 8 files changed, 65 insertions(+), 108 deletions(-)
 delete mode 100644 src/qirlightning/LightningDefaultRuntime.hh
 rename src/qirlightning/{LightningDefaultRuntime.cc => LightningRuntime.cc} (62%)
 create mode 100644 src/qirlightning/LightningRuntime.hh

diff --git a/app/qir-lightning.cc b/app/qir-lightning.cc
index 2244c97..cff96f2 100644
--- a/app/qir-lightning.cc
+++ b/app/qir-lightning.cc
@@ -13,8 +13,8 @@
 #include "qiree/Executor.hh"
 #include "qiree/Module.hh"
 #include "qiree/ResultDistribution.hh"
-#include "qirlightning/LightningDefaultRuntime.hh"
 #include "qirlightning/LightningQuantum.hh"
+#include "qirlightning/LightningRuntime.hh"
 
 using namespace std::string_view_literals;
 
@@ -30,7 +30,7 @@ void run(std::string const& filename, int num_shots)
 
     // Set up qsim
     LightningQuantum sim(std::cout, 0);
-    LightningDefaultRuntime rt(std::cout, sim);
+    LightningRuntime rt(std::cout, sim);
     ResultDistribution distribution;
 
     // Run several time = shots (default 1)
diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt
index 2edbdf9..b02e572 100644
--- a/src/qirlightning/CMakeLists.txt
+++ b/src/qirlightning/CMakeLists.txt
@@ -43,7 +43,7 @@ endif()
 # Adding lightning as a library to qiree
 qiree_add_library(qirlightning
 LightningQuantum.cc
-LightningDefaultRuntime.cc
+LightningRuntime.cc
 )
 
 target_compile_definitions(qirlightning PRIVATE
diff --git a/src/qirlightning/LightningDefaultRuntime.hh b/src/qirlightning/LightningDefaultRuntime.hh
deleted file mode 100644
index 2d3e9fa..0000000
--- a/src/qirlightning/LightningDefaultRuntime.hh
+++ /dev/null
@@ -1,77 +0,0 @@
-//----------------------------------*-C++-*----------------------------------//
-// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
-// See the top-level COPYRIGHT file for details.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//---------------------------------------------------------------------------//
-//! \file qirlightning/LightningDefaultRuntime.hh
-//---------------------------------------------------------------------------//
-#pragma once
-
-#include "LightningQuantum.hh"
-#include "qiree/RecordedResult.hh"
-
-namespace qiree
-{
-
-/*!
- * Print per-qubit measurement statistics.
- *
- * Example for three qubits:
- * \code
- * Measurement output:
- * -------------------
- * Number of shots: 1024
- * Number of qubits: 3
- * q0 {0: 542, 1: 482}
- * q1 {0: 521, 1: 503}
- * q2 {0: 0, 1: 1024}
- *
- * \endcode
- */
-
-class LightningDefaultRuntime final : virtual public RuntimeInterface
-{
-  public:
-    /*!
-     * Construct \c LightningDefaultRuntime.
-     */
-    LightningDefaultRuntime(std::ostream& output, LightningQuantum& sim)
-        : output_(output), sim_(sim)
-    {
-    }
-
-    //!@{
-    //! \name Runtime interface
-
-    // Initialize the execution environment, resetting qubits
-    void initialize(OptionalCString env) override;
-
-    //! Mark the following N results as being part of an array named tag
-    void array_record_output(size_type size, OptionalCString tag) final
-    {
-        result_ = RecordedResult(size, tag);
-    }
-
-    //! Mark the following N results as being part of a tuple named tag
-    void tuple_record_output(size_type size, OptionalCString tag) final
-    {
-        result_ = RecordedResult(size, tag);
-    }
-
-    //! Save one result
-    void result_record_output(Result result, OptionalCString tag) final
-    {
-        result_.push_back(sim_.get_result(result), tag);
-    }
-    //!@}
-
-    RecordedResult const& result() const { return result_; }
-
-
-  private:
-    std::ostream& output_;
-    LightningQuantum& sim_;
-    RecordedResult result_;
-};
-
-}  // namespace qiree
diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
index a774543..e584a3b 100644
--- a/src/qirlightning/LightningQuantum.cc
+++ b/src/qirlightning/LightningQuantum.cc
@@ -105,9 +105,11 @@ void LightningQuantum::reset(Qubit q)
 /*!
  * Read the value of a result. 
  */
-QState LightningQuantum::read_result(Result r)
+QState LightningQuantum::read_result(Result r) const
 {
-    return this->get_result(r);
+    QIREE_EXPECT(r.value < results_.size());
+    auto result_bool = static_cast<bool>(results_[r.value]);
+    return static_cast<QState>(result_bool);
 }
 
 //---------------------------------------------------------------------------//
diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh
index 96d49d5..125d3eb 100644
--- a/src/qirlightning/LightningQuantum.hh
+++ b/src/qirlightning/LightningQuantum.hh
@@ -45,8 +45,6 @@ class LightningQuantum final : virtual public QuantumNotImpl
     //! Number of classical result registers
     size_type num_results() const { return results_.size(); }
 
-    // Get the result from a classical register
-    inline QState get_result(Result r) const;
     //!@}
 
     //!@{
@@ -61,7 +59,7 @@ class LightningQuantum final : virtual public QuantumNotImpl
     void mz(Qubit, Result) final;
 
     // Read the value of a result.
-    QState read_result(Result) final;
+    QState read_result(Result) const final;
     //!@}
 
 
@@ -108,15 +106,4 @@ class LightningQuantum final : virtual public QuantumNotImpl
     std::vector<Qubit> result_to_qubit_;
 };
 
-//---------------------------------------------------------------------------//
-/*!
- * Get the result from a classical register.
- */
-QState LightningQuantum::get_result(Result r) const
-{
-    QIREE_EXPECT(r.value < results_.size());
-    auto result_bool = static_cast<bool>(results_[r.value]);
-    return static_cast<QState>(result_bool);
-}
-
 }  // namespace qiree
diff --git a/src/qirlightning/LightningDefaultRuntime.cc b/src/qirlightning/LightningRuntime.cc
similarity index 62%
rename from src/qirlightning/LightningDefaultRuntime.cc
rename to src/qirlightning/LightningRuntime.cc
index 7e5da3e..89bce1a 100644
--- a/src/qirlightning/LightningDefaultRuntime.cc
+++ b/src/qirlightning/LightningRuntime.cc
@@ -3,22 +3,32 @@
 // See the top-level COPYRIGHT file for details.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //---------------------------------------------------------------------------//
-//! \file qirlightning/LightningDefaultRuntime.cc
+//! \file qirlightning/LightningRuntime.cc
 //---------------------------------------------------------------------------//
-#include "LightningDefaultRuntime.hh"
+#include "LightningRuntime.hh"
 
 #include <iostream>
 
+#include "LightningQuantum.hh"
 #include "qiree/Assert.hh"
 
 namespace qiree
 {
+//---------------------------------------------------------------------------//
+/*!
+ * Construct with quantum reference to access classical registers.
+ */
+LightningRuntime::LightningRuntime(std::ostream& output, LightningQuantum const& sim)
+    : SingleResultRuntime{sim}, output_(output)
+{
+}
+
 //---------------------------------------------------------------------------//
 /*!
  * Initialize the execution environment, resetting qubits.
  */
 
-void LightningDefaultRuntime::initialize(OptionalCString env)
+void LightningRuntime::initialize(OptionalCString env)
 {
     if (env)
     {
diff --git a/src/qirlightning/LightningRuntime.hh b/src/qirlightning/LightningRuntime.hh
new file mode 100644
index 0000000..0623d16
--- /dev/null
+++ b/src/qirlightning/LightningRuntime.hh
@@ -0,0 +1,37 @@
+//----------------------------------*-C++-*----------------------------------//
+// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers.
+// See the top-level COPYRIGHT file for details.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//---------------------------------------------------------------------------//
+//! \file qirlightning/LightningRuntime.hh
+//---------------------------------------------------------------------------//
+#pragma once
+
+#include "qiree/SingleResultRuntime.hh"
+
+namespace qiree
+{
+//---------------------------------------------------------------------------//
+class LightningQuantum;
+
+//---------------------------------------------------------------------------//
+
+class LightningRuntime final : virtual public SingleResultRuntime
+{
+  public:
+    // Construct with quantum reference to access classical registers
+    LightningRuntime(std::ostream& output, LightningQuantum const& sim);
+
+    //!@{
+    //! \name Runtime interface
+
+    // Initialize the execution environment, resetting qubits
+    void initialize(OptionalCString env) override;
+
+    //!@}
+
+  private:
+    std::ostream& output_;
+};
+
+}  // namespace qiree
diff --git a/test/qirlightning/LightningQuantum.test.cc b/test/qirlightning/LightningQuantum.test.cc
index 1d74b29..237792b 100644
--- a/test/qirlightning/LightningQuantum.test.cc
+++ b/test/qirlightning/LightningQuantum.test.cc
@@ -11,7 +11,7 @@
 
 #include "qiree/Types.hh"
 #include "qiree_test.hh"
-#include "qirlightning/LightningDefaultRuntime.hh"
+#include "qirlightning/LightningRuntime.hh"
 
 namespace qiree
 {
@@ -43,7 +43,7 @@ TEST_F(LightningQuantumTest, sim_dynamicbv)
 
     // Create a simulator that will write to the string stream
     LightningQuantum lightning_sim{os, 0};
-    LightningDefaultRuntime lightning_rt{os, lightning_sim};
+    LightningRuntime lightning_rt{os, lightning_sim};
     // Call functions in the same sequence that dynamicbv.ll would
     lightning_sim.set_up([] {
         EntryPointAttrs attrs;
@@ -66,9 +66,7 @@ TEST_F(LightningQuantumTest, sim_dynamicbv)
     lightning_rt.array_record_output(2, "");
     lightning_rt.result_record_output(R{0}, "");
     lightning_rt.result_record_output(R{1}, "");
-    //EXPECT_EQ(QState::one, lightning_sim.get_result(R{0}));
-    //EXPECT_EQ(QState::one, lightning_sim.get_result(R{1}));
-
+    EXPECT_EQ(QState::one, lightning_sim.read_result(R{0}));
 
     lightning_sim.tear_down();
 }
@@ -83,7 +81,7 @@ TEST_F(LightningQuantumTest, result_order)
 
     // Create a simulator that will write to the string stream
     LightningQuantum qis{os, 0};
-    LightningDefaultRuntime rt{os, qis};
+    LightningRuntime rt{os, qis};
 
     // Call functions in the same sequence that dynamicbv.ll would
     qis.set_up([] {
@@ -96,9 +94,9 @@ TEST_F(LightningQuantumTest, result_order)
     qis.mz(Q{1}, R{1});
     qis.mz(Q{2}, R{0});
     std::vector<bool> expected;
-    expected.push_back(static_cast<bool>(qis.get_result(R{2})));
-    expected.push_back(static_cast<bool>(qis.get_result(R{0})));
-    expected.push_back(static_cast<bool>(qis.get_result(R{1})));
+    expected.push_back(static_cast<bool>(qis.read_result(R{2})));
+    expected.push_back(static_cast<bool>(qis.read_result(R{0})));
+    expected.push_back(static_cast<bool>(qis.read_result(R{1})));
     // So the internal result "buffer" is now {true, false, true}
     rt.array_record_output(3, "array");
     rt.result_record_output(R{2}, "foo");  // pushes true

From f9b76aece71cfc70fdfadb6101526e5a1fd728e3 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Fri, 18 Jul 2025 15:53:46 +0000
Subject: [PATCH 53/64] update lightning installation instructions

---
 src/qirlightning/README.md              | 2 +-
 src/qirlightning/support_catalyst.cmake | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md
index 7719b7e..2777869 100644
--- a/src/qirlightning/README.md
+++ b/src/qirlightning/README.md
@@ -55,7 +55,7 @@ To compile:
 ```
 cd qiree/
 mkdir build; cd build
-cmake ..
+cmake -DQIREE_USE_LIGHTNING=ON ..
 make
 
 ```
diff --git a/src/qirlightning/support_catalyst.cmake b/src/qirlightning/support_catalyst.cmake
index f46363c..5932f06 100644
--- a/src/qirlightning/support_catalyst.cmake
+++ b/src/qirlightning/support_catalyst.cmake
@@ -26,9 +26,7 @@ macro(FindCatalyst target_name)
 
     else()
         if(NOT CATALYST_GIT_TAG)
-            # v0.41 of Lightning requires v0.11.0 of Catalyst
-            # If using latest Lightning, use main branch of Catalyst
-            set(CATALYST_GIT_TAG "v0.11.0" CACHE STRING "GIT_TAG value to build Catalyst")
+            set(CATALYST_GIT_TAG "main" CACHE STRING "GIT_TAG value to build Catalyst")
         endif()
         message(INFO " Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}")
 

From 5574e46f368b30e7b319bb35e53a7c43667d27f9 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Sat, 19 Jul 2025 00:24:56 +0000
Subject: [PATCH 54/64] update shots to seed

---
 src/qirlightning/LightningQuantum.hh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh
index 125d3eb..e68bf66 100644
--- a/src/qirlightning/LightningQuantum.hh
+++ b/src/qirlightning/LightningQuantum.hh
@@ -30,7 +30,7 @@ class LightningQuantum final : virtual public QuantumNotImpl
 {
   public:
     // Construct with number of shots
-    LightningQuantum(std::ostream& os, unsigned long int shots);
+    LightningQuantum(std::ostream& os, unsigned long int seed);
     ~LightningQuantum();
 
     QIREE_DELETE_COPY_MOVE(LightningQuantum);  // Delete copy and move constructors

From 00080eb1d7df5fa9cdc8f8a2d834bddd73fcb39d Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Tue, 22 Jul 2025 02:47:00 +0000
Subject: [PATCH 55/64] update github workflow to run build-lightning

---
 .github/workflows/pr.yml   | 3 +++
 .github/workflows/push.yml | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 52f8404..df9d110 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -17,11 +17,14 @@ concurrency:
 jobs:
   build-fast:
     uses: ./.github/workflows/build-fast.yml
+  build-lightning:
+    uses: ./.github/workflows/build-lightning.yml
   # Specifying a dependent job allows us to select a single "requires" check in the project GitHub settings
   all:
     if: ${{ always() }}
     needs:
     - build-fast
+    - build-lightning
     runs-on: ubuntu-latest
     steps:
     - name: Decide whether the needed jobs succeeded or failed
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 4601abc..5e79f80 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -12,9 +12,12 @@ concurrency:
 jobs:
   build-fast:
     uses: ./.github/workflows/build-fast.yml
+  build-lightning:
+    uses: ./.github/workflows/build-lightning.yml
   all:
     needs:
       - build-fast
+      - build-lightning
     runs-on: ubuntu-latest
     steps:
     - name: Success

From aa254204a6e0cda078719055ba4235f8092ec4fc Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 30 Jul 2025 13:02:56 +0000
Subject: [PATCH 56/64] remove demo and pin catalyst

---
 src/qirlightning/simple_demo/README.md        |  69 ----
 .../snapshot_catalyst_runtime/README.rst      | 118 ------
 .../include/DataView.hpp                      | 148 -------
 .../include/Exception.hpp                     |  87 -----
 .../include/QuantumDevice.hpp                 | 364 ------------------
 .../snapshot_catalyst_runtime/include/Types.h | 165 --------
 .../simple_demo/test_rt_device.cpp            |  74 ----
 src/qirlightning/support_catalyst.cmake       |   2 +-
 8 files changed, 1 insertion(+), 1026 deletions(-)
 delete mode 100644 src/qirlightning/simple_demo/README.md
 delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst
 delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp
 delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp
 delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp
 delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h
 delete mode 100644 src/qirlightning/simple_demo/test_rt_device.cpp

diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md
deleted file mode 100644
index bbd7d27..0000000
--- a/src/qirlightning/simple_demo/README.md
+++ /dev/null
@@ -1,69 +0,0 @@
-# Simple Demo for Catalyst/Lightning runtime
-
-This is a super simple demo for driving Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present). 
-
-Some Catalyst include files are copied here for convenience - they are in `./snapshot_catalyst_runtime/include`. These are required for the QuantumDevice interface. For the qiree source, these files are fetched automatically during CMake, and these are not used.
-
-## Installing a lightning simulator
-
-When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. 
-
-To get started, run `pip install pennylane` or `pip install pennylane-lightning` - this will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
-
-Example:
-```
-$ pip install pennylane-lightning-kokkos
-
-$ pip show pennylane-lightning-kokkos
-Name: PennyLane_Lightning_Kokkos
-Version: 0.40.0
-Summary: PennyLane-Lightning plugin
-Home-page: https://github.com/PennyLaneAI/pennylane-lightning
-Author: 
-Author-email: 
-License: Apache License 2.0
-Location: <site packages path>
-Requires: pennylane, pennylane-lightning
-
-$ ls <site packages path>/pennylane_lightning
-... liblightning_kokkos_catalyst.so ...
-```
-
-You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.gpu simulators.
-
-## Compilation
-
-First update the `RTDLIB` in `test_rt_device.cpp` to the local path where lightning is installed (i.e. `<site packages path>` from above).
-
-To compile:
-
-```
-$ clang++ --std=c++20 test_rt_device.cpp -I./snapshot_catalyst_runtime/include -o test_rt_device.out
-```
-
-## Running the example
-
-To run:
-
-```
-$ ./test_rt_device.out 
-Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set
-  In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads
-  For best performance with OpenMP 3.1 set OMP_PROC_BIND=true
-  For unit testing set OMP_PROC_BIND=false
-
-Num Qubits = 3
-State = 
-*** State-Vector of Size 8 ***
-[(0.707107,0), (0,0), (0,0), (0,0), (0.707107,0), (0,0), (0,0), (0,0)]
-Measure on wire 0 = 0
-```
-
-## Running on other devices
-
-To run on other devices, e.g. lightning.gpu, you need to change:
-- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu`
-In the c++ file:
-- replace `RTDLIB` from `kokkos` to `gpu`
-- replace `RTDDEVICE` from `Kokkos` to `GPU`
-- Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst
deleted file mode 100644
index 8a881e5..0000000
--- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst
+++ /dev/null
@@ -1,118 +0,0 @@
-.. runtime-start-inclusion-marker-do-not-remove
-
-Catalyst Quantum Runtime
-########################
-
-The Catalyst Runtime is a C++ QIR runtime that enables the execution of Catalyst-compiled
-quantum programs, and is currently backed by `PennyLane-Lightning <https://docs.pennylane.ai/projects/lightning/en/stable>`_
-state-vector simulators, and `Amazon Braket <https://amazon-braket-pennylane-plugin-python.readthedocs.io>`__
-devices. Additional hardware support, including QPUs, to come.
-
-The runtime employs the `QuantumDevice <https://docs.pennylane.ai/projects/catalyst/en/stable/api/structCatalyst_1_1Runtime_1_1QuantumDevice.html#exhale-struct-structcatalyst-1-1runtime-1-1quantumdevice>`_
-public interface to support an extensible list of backend devices. This interface comprises two collections of abstract methods:
-
-- The Qubit management, device shot noise, and quantum tape recording methods are utilized for the implementation of Quantum Runtime (QR) instructions.
-
-- The quantum operations, observables, measurements, and gradient methods are used to implement Quantum Instruction Set (QIS) instructions.
-
-A complete list of instructions supported by the runtime can be found in
-`RuntimeCAPI.h <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include/RuntimeCAPI.h>`_.
-
-Contents
-========
-
-The directory is structured as follows:
-
-- `include <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include>`_:
-    This contains the public header files of the runtime including the ``QuantumDevice`` API
-    for backend quantum devices and the runtime CAPI.
-
-- `lib <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib>`_:
-    The core modules of the runtime are structured into ``lib/capi`` and ``lib/backend``.
-    `lib/capi <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib/capi>`_  implements the semantics for
-    QIR instructions lowered to our custom runtime. `lib/backend <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib/backend>`_
-    contains implementations of the ``QuantumDevice`` API for backend simulators.
-
-- `tests <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/tests>`_:
-    A collection of C++ tests for modules and methods in the runtime.
-
-Backend Devices
-===============
-
-New device backends for the runtime can be realized by implementing the quantum device interface.
-The following table shows the available devices along with supported features:
-
-.. list-table::
-   :widths: 25 25 25 25
-   :header-rows: 0
-
-   * - **Features**
-     - **PennyLane-Lightning-Qubit**
-     - **PennyLane-Lightning-Kokkos** and **PennyLane-Lightning-GPU**
-     - **Amazon-Braket-OpenQasm**
-   * - Qubit Management
-     - Dynamic allocation/deallocation
-     - Static allocation/deallocation
-     - Static allocation/deallocation
-   * - Gate Operations
-     - `Lightning operations <https://github.com/PennyLaneAI/pennylane-lightning/blob/master/pennylane_lightning/core/src/gates/GateOperation.hpp>`_
-     - `Lightning operations <https://github.com/PennyLaneAI/pennylane-lightning/blob/master/pennylane_lightning/core/src/gates/GateOperation.hpp>`_ without controlled gates support
-     - `Braket operations <https://github.com/PennyLaneAI/catalyst/blob/e812afbadbd777209862d5c76f394e3f0c43ffb6/runtime/lib/backend/openqasm/OpenQasmBuilder.hpp#L49>`_
-   * - Quantum Observables
-     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables
-     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables
-     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, and Tensor Product of Observables
-   * - Expectation Value
-     - All observables; Finite-shots supported
-     - All observables; Finite-shots supported
-     - All observables; Finite-shots supported
-   * - Variance
-     - All observables; Finite-shots supported
-     - All observables; Finite-shots supported
-     - All observables; Finite-shots supported
-   * - Probability
-     - Only for the computational basis on the supplied qubits; Finite-shots supported
-     - Only for the computational basis on the supplied qubits; Finite-shots supported
-     - The computational basis on all active qubits; Finite-shots supported
-   * - Sampling
-     - Only for the computational basis on the supplied qubits
-     - Only for the computational basis on the supplied qubits
-     - The computational basis on all active qubits; Finite-shots supported
-   * - Mid-Circuit Measurement
-     - Only for the computational basis on the supplied qubit
-     - Only for the computational basis on the supplied qubit
-     - Not supported
-   * - Gradient
-     - The Adjoint-Jacobian method for expectation values on all observables
-     - The Adjoint-Jacobian method for expectation values on all observables
-     - Not supported
-
-Requirements
-============
-
-To build the runtime from source, it is required to have an up to date version of a C/C++ compiler such as gcc or clang
-with support for the C++20 standard library.
-
-Installation
-============
-
-By default, the runtime builds all supported backend devices.
-You can build the runtime with custom devices from the list of Backend Devices.
-
-You can use ``ENABLE_OPENQASM=OFF`` to disable building the runtime with `Amazon-Braket-OpenQasm <https://aws.amazon.com/braket/>`_:
-
-.. code-block:: console
-
-    make runtime ENABLE_OPENQASM=OFF
-
-This device currently offers generators for the `OpenQasm3 <https://openqasm.com/versions/3.0/index.html>`_ specification and
-`Amazon Braket <https://docs.aws.amazon.com/braket/latest/developerguide/braket-openqasm-supported-features.html>`__ assembly extension.
-Moreover, the generated assembly can be executed on Amazon Braket devices leveraging `amazon-braket-sdk-python <https://github.com/aws/amazon-braket-sdk-python>`_.
-
-To check the runtime test suite from the root directory:
-
-.. code-block:: console
-
-    make test-runtime
-
-.. runtime-end-inclusion-marker-do-not-remove
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp
deleted file mode 100644
index 6cf50f2..0000000
--- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright 2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <Exception.hpp>
-
-/**
- * A multi-dimensional view for MemRef-like and std::vector<T> types.
- *
- * @tparam T The underlying data type
- * @tparam R The Rank (R > 0)
- *
- * @note A forward iterator is implemented in this view for traversing over the entire
- * elements of MemRef types rank-by-rank starting from the last dimension (R-1). For example,
- * The DataView iterator for MemRef<T, 2> starts from index (0, 0) and traverses elements
- * in the following order:
- * (0, 0), ..., (0, sizes[1]-1), (1, 0), ..., (1, sizes[1]-1), ... (sizes[0]-1, sizes[1]-1).
- */
-template <typename T, size_t R> class DataView {
-  private:
-    T *data_aligned;
-    size_t offset;
-    size_t sizes[R] = {0};
-    size_t strides[R] = {0};
-
-  public:
-    class iterator {
-      private:
-        const DataView<T, R> &view;
-
-        int64_t loc; // physical index
-        size_t indices[R] = {0};
-
-      public:
-        using iterator_category = std::forward_iterator_tag; // LCOV_EXCL_LINE
-        using value_type = T;                                // LCOV_EXCL_LINE
-        using difference_type = std::ptrdiff_t;              // LCOV_EXCL_LINE
-        using pointer = T *;                                 // LCOV_EXCL_LINE
-        using reference = T &;                               // LCOV_EXCL_LINE
-
-        iterator(const DataView<T, R> &_view, int64_t begin_idx) : view(_view), loc(begin_idx) {}
-        pointer operator->() const { return &view.data_aligned[loc]; }
-        reference operator*() const { return view.data_aligned[loc]; }
-        iterator &operator++()
-        {
-            int64_t next_axis = -1;
-            int64_t idx;
-            for (int64_t i = R; i > 0; --i) {
-                idx = i - 1;
-                if (indices[idx]++ < view.sizes[idx] - 1) {
-                    next_axis = idx;
-                    break;
-                }
-                indices[idx] = 0;
-                loc -= (view.sizes[idx] - 1) * view.strides[idx];
-            }
-
-            loc = next_axis == -1 ? -1 : loc + view.strides[next_axis];
-            return *this;
-        }
-        iterator operator++(int)
-        {
-            auto tmp = *this;
-            int64_t next_axis = -1;
-            int64_t idx;
-            for (int64_t i = R; i > 0; --i) {
-                idx = i - 1;
-                if (indices[idx]++ < view.sizes[idx] - 1) {
-                    next_axis = idx;
-                    break;
-                }
-                indices[idx] = 0;
-                loc -= (view.sizes[idx] - 1) * view.strides[idx];
-            }
-
-            loc = next_axis == -1 ? -1 : loc + view.strides[next_axis];
-            return tmp;
-        }
-        bool operator==(const iterator &other) const
-        {
-            return (loc == other.loc && view.data_aligned == other.view.data_aligned);
-        }
-        bool operator!=(const iterator &other) const { return !(*this == other); }
-    };
-
-    explicit DataView(std::vector<T> &buffer) : data_aligned(buffer.data()), offset(0)
-    {
-        static_assert(R == 1, "[Class: DataView] Assertion: R == 1");
-        sizes[0] = buffer.size();
-        strides[0] = 1;
-    }
-
-    explicit DataView(T *_data_aligned, size_t _offset, const size_t *_sizes,
-                      const size_t *_strides)
-        : data_aligned(_data_aligned), offset(_offset)
-    {
-        static_assert(R > 0, "[Class: DataView] Assertion: R > 0");
-        if (_sizes != nullptr && _strides != nullptr) {
-            for (size_t i = 0; i < R; i++) {
-                sizes[i] = _sizes[i];
-                strides[i] = _strides[i];
-            }
-        } // else sizes = {0}, strides = {0}
-    }
-
-    [[nodiscard]] auto size() const -> size_t
-    {
-        if (!data_aligned) {
-            return 0;
-        }
-
-        size_t tsize = 1;
-        for (size_t i = 0; i < R; i++) {
-            tsize *= sizes[i];
-        }
-        return tsize;
-    }
-
-    template <typename... I> T &operator()(I... idxs) const
-    {
-        static_assert(sizeof...(idxs) == R,
-                      "[Class: DataView] Error in Catalyst Runtime: Wrong number of indices");
-        size_t indices[] = {static_cast<size_t>(idxs)...};
-
-        size_t loc = offset;
-        for (size_t axis = 0; axis < R; axis++) {
-            RT_ASSERT(indices[axis] < sizes[axis]);
-            loc += indices[axis] * strides[axis];
-        }
-        return data_aligned[loc];
-    }
-
-    iterator begin() { return iterator{*this, static_cast<int64_t>(offset)}; }
-
-    iterator end() { return iterator{*this, -1}; }
-};
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp
deleted file mode 100644
index a76da14..0000000
--- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp
+++ /dev/null
@@ -1,87 +0,0 @@
-// Copyright 2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <exception>
-#include <iostream>
-
-#include <sstream>
-#include <string>
-#include <type_traits>
-#include <utility>
-
-/**
- * @brief Macro that throws `RuntimeException` with given message.
- */
-#define RT_FAIL(message) Catalyst::Runtime::_abort((message), __FILE__, __LINE__, __func__)
-
-/**
- * @brief Macro that throws `RuntimeException` if expression evaluates
- * to true.
- */
-#define RT_FAIL_IF(expression, message)                                                            \
-    if ((expression)) {                                                                            \
-        RT_FAIL(message);                                                                          \
-    }
-
-/**
- * @brief Macro that throws `RuntimeException` with the given expression
- * and source location if expression evaluates to false.
- */
-#define RT_ASSERT(expression) RT_FAIL_IF(!(expression), "Assertion: " #expression)
-
-namespace Catalyst::Runtime {
-
-/**
- * @brief This is the general exception thrown by Catalyst for runtime errors
- * that is derived from `std::exception`.
- */
-class RuntimeException : public std::exception {
-  private:
-    const std::string err_msg;
-
-  public:
-    explicit RuntimeException(std::string msg) noexcept
-        : err_msg{std::move(msg)} {}        // LCOV_EXCL_LINE
-    ~RuntimeException() override = default; // LCOV_EXCL_LINE
-
-    RuntimeException(const RuntimeException &) = default;
-    RuntimeException(RuntimeException &&) noexcept = default;
-
-    RuntimeException &operator=(const RuntimeException &) = delete;
-    RuntimeException &operator=(RuntimeException &&) = delete;
-
-    [[nodiscard]] auto what() const noexcept -> const char * override
-    {
-        return err_msg.c_str();
-    } // LCOV_EXCL_LINE
-};
-
-/**
- * @brief Throws a `RuntimeException` with the given error message.
- *
- * @note This is not supposed to be called directly.
- */
-[[noreturn]] inline void _abort(const char *message, const char *file_name, size_t line,
-                                const char *function_name)
-{
-    std::stringstream sstream;
-    sstream << "[" << file_name << "][Line:" << line << "][Function:" << function_name
-            << "] Error in Catalyst Runtime: " << message;
-
-    throw RuntimeException(sstream.str());
-} // LCOV_EXCL_LINE
-
-} // namespace Catalyst::Runtime
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp
deleted file mode 100644
index ccdb606..0000000
--- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp
+++ /dev/null
@@ -1,364 +0,0 @@
-// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <complex>
-#include <memory>
-#include <optional>
-#include <random>
-#include <vector>
-
-#include "DataView.hpp"
-#include "Types.h"
-
-// A helper template macro to generate the <IDENTIFIER>Factory method by
-// calling <CONSTRUCTOR>(kwargs). Check the Custom Devices guideline for details:
-// https://docs.pennylane.ai/projects/catalyst/en/stable/dev/custom_devices.html
-#define GENERATE_DEVICE_FACTORY(IDENTIFIER, CONSTRUCTOR)                                           \
-    extern "C" Catalyst::Runtime::QuantumDevice *IDENTIFIER##Factory(const char *kwargs)           \
-    {                                                                                              \
-        return new CONSTRUCTOR(std::string(kwargs));                                               \
-    }
-
-namespace Catalyst::Runtime {
-
-/**
- * @brief struct API for backend quantum devices.
- *
- * This device API contains,
- * - a set of methods to manage qubit allocations and deallocations, device shot
- *   noise, and quantum tape recording as well as reference values for the result
- *   data-type; these are used to implement Quantum Runtime (QR) instructions.
- *
- * - a set of methods for quantum operations, observables, measurements, and gradient
- *   of the device; these are used to implement Quantum Instruction Set (QIS) instructions.
- *
- */
-struct QuantumDevice {
-    QuantumDevice() = default;          // LCOV_EXCL_LINE
-    virtual ~QuantumDevice() = default; // LCOV_EXCL_LINE
-
-    QuantumDevice &operator=(const QuantumDevice &) = delete;
-    QuantumDevice(const QuantumDevice &) = delete;
-    QuantumDevice(QuantumDevice &&) = delete;
-    QuantumDevice &operator=(QuantumDevice &&) = delete;
-
-    /**
-     * @brief Allocate a qubit.
-     *
-     * @return `QubitIdType`
-     */
-    virtual auto AllocateQubit() -> QubitIdType = 0;
-
-    /**
-     * @brief Allocate a vector of qubits.
-     *
-     * @param num_qubits The number of qubits to allocate.
-     *
-     * @return `std::vector<QubitIdType>`
-     */
-    virtual auto AllocateQubits(size_t num_qubits) -> std::vector<QubitIdType> = 0;
-
-    /**
-     * @brief Release a qubit.
-     *
-     * @param qubit The id of the qubit
-     */
-    virtual void ReleaseQubit(QubitIdType qubit) = 0;
-
-    /**
-     * @brief Release all qubits.
-     */
-    virtual void ReleaseAllQubits() = 0;
-
-    /**
-     * @brief Get the number of allocated qubits.
-     *
-     * @return `size_t`
-     */
-    [[nodiscard]] virtual auto GetNumQubits() const -> size_t = 0;
-
-    /**
-     * @brief Set the number of device shots.
-     *
-     * @param shots The number of noise shots
-     */
-    virtual void SetDeviceShots(size_t shots) = 0;
-
-    /**
-     * @brief Get the number of device shots.
-     *
-     * @return `size_t`
-     */
-    [[nodiscard]] virtual auto GetDeviceShots() const -> size_t = 0;
-
-    /**
-     * @brief Set the PRNG of the device.
-     *
-     * The Catalyst runtime enables seeded program execution on non-hardware devices.
-     * A random number generator instance is managed by the runtime to predictably
-     * generate results for non-deterministic programs, such as those involving `Measure`
-     * calls.
-     * Devices implementing support for this feature do not need to use the provided
-     * PRNG instance as their sole source of random numbers, but it is expected that the
-     * the same instance state will predictable and reproducibly generate the same
-     * program results. It is also expected that the provided PRNG state is evolved
-     * sufficiently so that two device executions sharing the same instance do not produce
-     * identical results.
-     * The provided PRNG instance is not thread-locked, and devices wishing to share it
-     * across threads will need to provide their own thread-safety.
-     *
-     * @param gen The std::mt19937 PRNG object.
-     */
-    virtual void SetDevicePRNG([[maybe_unused]] std::mt19937 *gen){};
-
-    /**
-     * @brief Start recording a quantum tape if provided.
-     *
-     * @note This is backed by the `Catalyst::Runtime::CacheManager<ComplexT>` property in
-     * the device implementation.
-     */
-    virtual void StartTapeRecording() = 0;
-
-    /**
-     * @brief Stop recording a quantum tape if provided.
-     *
-     * @note This is backed by the `Catalyst::Runtime::CacheManager<ComplexT>` property in
-     * the device implementation.
-     */
-    virtual void StopTapeRecording() = 0;
-
-    /**
-     * @brief Result value for "Zero" used in the measurement process.
-     *
-     * @return `Result`
-     */
-    [[nodiscard]] virtual auto Zero() const -> Result = 0;
-
-    /**
-     * @brief Result value for "One"  used in the measurement process.
-     *
-     * @return `Result`
-     */
-    [[nodiscard]] virtual auto One() const -> Result = 0;
-
-    /**
-     * @brief A helper method to print the state vector of a device.
-     */
-    virtual void PrintState() = 0;
-
-    /**
-     * @brief Prepare subsystems using the given ket vector in the computational basis.
-     *
-     * @param state A state vector of size 2**len(wires)
-     * @param wires The wire(s) the operation acts on
-     */
-    virtual void SetState([[maybe_unused]] DataView<std::complex<double>, 1> &state,
-                          [[maybe_unused]] std::vector<QubitIdType> &wires)
-    {
-        RT_FAIL("Unsupported functionality");
-    }
-
-    /**
-     * @brief Prepares a single computational basis state.
-     *
-     * @param n Prepares the basis state |n>, where n is an array of integers from the set {0, 1}
-     * @param wires The wire(s) the operation acts on
-     */
-    virtual void SetBasisState([[maybe_unused]] DataView<int8_t, 1> &n,
-                               [[maybe_unused]] std::vector<QubitIdType> &wires)
-    {
-        RT_FAIL("Unsupported functionality");
-    }
-
-    /**
-     * @brief Apply a single gate to the state vector of a device with its name if this is
-     * supported.
-     *
-     * @param name The name of the gate to apply
-     * @param params Optional parameter list for parametric gates
-     * @param wires Wires to apply gate to
-     * @param inverse Indicates whether to use inverse of gate
-     * @param controlled_wires Optional controlled wires applied to the operation
-     * @param controlled_values Optional controlled values applied to the operation
-     */
-    virtual void
-    NamedOperation(const std::string &name, const std::vector<double> &params,
-                   const std::vector<QubitIdType> &wires, [[maybe_unused]] bool inverse = false,
-                   [[maybe_unused]] const std::vector<QubitIdType> &controlled_wires = {},
-                   [[maybe_unused]] const std::vector<bool> &controlled_values = {}) = 0;
-
-    /**
-     * @brief Apply a given matrix directly to the state vector of a device.
-     *
-     * @param matrix The matrix of data in row-major format
-     * @param wires Wires to apply gate to
-     * @param inverse Indicates whether to use inverse of gate
-     * @param controlled_wires Controlled wires applied to the operation
-     * @param controlled_values Controlled values applied to the operation
-     */
-    virtual void
-    MatrixOperation(const std::vector<std::complex<double>> &matrix,
-                    const std::vector<QubitIdType> &wires, [[maybe_unused]] bool inverse = false,
-                    [[maybe_unused]] const std::vector<QubitIdType> &controlled_wires = {},
-                    [[maybe_unused]] const std::vector<bool> &controlled_values = {}) = 0;
-
-    /**
-     * @brief Construct a named (Identity, PauliX, PauliY, PauliZ, and Hadamard)
-     * or Hermitian observable.
-     *
-     * @param id The type of the observable
-     * @param matrix The matrix of data to construct a hermitian observable
-     * @param wires Wires to apply observable to
-     *
-     * @return `ObsIdType` Index of the constructed observable
-     */
-    virtual auto Observable(ObsId id, const std::vector<std::complex<double>> &matrix,
-                            const std::vector<QubitIdType> &wires) -> ObsIdType = 0;
-
-    /**
-     * @brief Construct a tensor product of observables.
-     *
-     * @param obs The vector of observables indices of type ObsIdType
-     *
-     * @return `ObsIdType` Index of the constructed observable
-     */
-    virtual auto TensorObservable(const std::vector<ObsIdType> &obs) -> ObsIdType = 0;
-
-    /**
-     * @brief Construct a Hamiltonian observable.
-     *
-     * @param coeffs The vector of coefficients
-     * @param obs The vector of observables indices of size `coeffs`
-     *
-     * @return `ObsIdType` Index of the constructed observable
-     */
-    virtual auto HamiltonianObservable(const std::vector<double> &coeffs,
-                                       const std::vector<ObsIdType> &obs) -> ObsIdType = 0;
-
-    /**
-     * @brief Compute the expected value of an observable.
-     *
-     * @param obsKey The index of the constructed observable
-     *
-     * @return `double` The expected value
-     */
-    virtual auto Expval(ObsIdType obsKey) -> double = 0;
-
-    /**
-     * @brief Compute the variance of an observable.
-     *
-     * @param obsKey The index of the constructed observable
-     *
-     * @return `double` The variance
-     */
-    virtual auto Var(ObsIdType obsKey) -> double = 0;
-
-    /**
-     * @brief Get the state-vector of a device.
-     *
-     * @param state The pre-allocated `DataView<complex<double>, 1>`
-     */
-    virtual void State(DataView<std::complex<double>, 1> &state) = 0;
-
-    /**
-     * @brief Compute the probabilities of each computational basis state.
-
-     * @param probs The pre-allocated `DataView<double, 1>`
-     */
-    virtual void Probs(DataView<double, 1> &probs) = 0;
-
-    /**
-     * @brief Compute the probabilities for a subset of the full system.
-     *
-     * @param probs The pre-allocated `DataView<double, 1>`
-     * @param wires Wires will restrict probabilities to a subset of the full system
-     */
-    virtual void PartialProbs(DataView<double, 1> &probs,
-                              const std::vector<QubitIdType> &wires) = 0;
-
-    /**
-     * @brief Compute samples with the number of shots on the entire wires,
-     * returing raw samples.
-     *
-     * @param samples The pre-allocated `DataView<double, 2>`representing a matrix of
-     * shape `shots * numQubits`. The built-in iterator in `DataView<double, 2>`
-     * iterates over all elements of `samples` row-wise.
-     * @param shots The number of shots
-     */
-    virtual void Sample(DataView<double, 2> &samples, size_t shots) = 0;
-
-    /**
-     * @brief Compute partial samples with the number of shots on `wires`,
-     * returing raw samples.
-     *
-     * @param samples The pre-allocated `DataView<double, 2>`representing a matrix of
-     * shape `shots * numWires`. The built-in iterator in `DataView<double, 2>`
-     * iterates over all elements of `samples` row-wise.
-     * @param wires Wires to compute samples on
-     * @param shots The number of shots
-     */
-    virtual void PartialSample(DataView<double, 2> &samples, const std::vector<QubitIdType> &wires,
-                               size_t shots) = 0;
-
-    /**
-     * @brief Sample with the number of shots on the entire wires, returning the
-     * number of counts for each sample.
-     *
-     * @param eigvals The pre-allocated `DataView<double, 1>`
-     * @param counts The pre-allocated `DataView<int64_t, 1>`
-     * @param shots The number of shots
-     */
-    virtual void Counts(DataView<double, 1> &eigvals, DataView<int64_t, 1> &counts,
-                        size_t shots) = 0;
-
-    /**
-     * @brief Partial sample with the number of shots on `wires`, returning the
-     * number of counts for each sample.
-     *
-     * @param eigvals The pre-allocated `DataView<double, 1>`
-     * @param counts The pre-allocated `DataView<int64_t, 1>`
-     * @param wires Wires to compute samples on
-     * @param shots The number of shots
-     */
-    virtual void PartialCounts(DataView<double, 1> &eigvals, DataView<int64_t, 1> &counts,
-                               const std::vector<QubitIdType> &wires, size_t shots) = 0;
-
-    /**
-     * @brief A general measurement method that acts on a single wire.
-     *
-     * @param wire The wire to compute Measure on
-     * @param postselect Which basis state to postselect after a mid-circuit measurement (-1 denotes
-     no post-selection)
-
-     * @return `Result` The measurement result
-     */
-    virtual auto Measure(QubitIdType wire, std::optional<int32_t> postselect) -> Result = 0;
-
-    /**
-     * @brief Compute the gradient of a quantum tape, that is cached using
-     * `Catalyst::Runtime::Simulator::CacheManager`, for a specific set of trainable
-     * parameters.
-     *
-     * @param gradients The vector of pre-allocated `DataView<double, 1>*`
-     * to store gradients resutls for the list of cached observables.
-     * @param trainParams The vector of trainable parameters; if none, all parameters
-     * would be assumed trainable
-     *
-     */
-    virtual void Gradient(std::vector<DataView<double, 1>> &gradients,
-                          const std::vector<size_t> &trainParams) = 0;
-};
-} // namespace Catalyst::Runtime
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h
deleted file mode 100644
index a30a1c2..0000000
--- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#ifndef TYPES_H
-#define TYPES_H
-
-#include <cmath>
-#include <cstdint>
-#include <limits>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Qubit, Result and Observable types
-struct QUBIT;
-using QubitIdType = intptr_t;
-
-using RESULT = bool;
-using Result = RESULT *;
-using QirArray = void *;
-
-using ObsIdType = intptr_t;
-
-enum ObsId : int8_t {
-    Identity = 0,
-    PauliX,
-    PauliY,
-    PauliZ,
-    Hadamard,
-    Hermitian,
-};
-
-enum ObsType : int8_t {
-    Basic = 0,
-    TensorProd,
-    Hamiltonian,
-};
-
-// complex<float> type
-struct CplxT_float {
-    float real;
-    float imag;
-};
-
-// complex<double> type
-struct CplxT_double {
-    double real;
-    double imag;
-};
-
-enum NumericType : int8_t {
-    idx = 0,
-    i1,
-    i8,
-    i16,
-    i32,
-    i64,
-    f32,
-    f64,
-    c64,
-    c128,
-};
-
-// MemRefT<datatype, dimension=rank> type
-struct OpaqueMemRefT {
-    int64_t rank;
-    void *descriptor;
-    NumericType datatype;
-};
-
-// MemRefT<complex<double>, dimension=1> type
-struct MemRefT_CplxT_double_1d {
-    CplxT_double *data_allocated;
-    CplxT_double *data_aligned;
-    size_t offset;
-    size_t sizes[1];
-    size_t strides[1];
-};
-
-// MemRefT<complex<double>, dimension=2> type
-struct MemRefT_CplxT_double_2d {
-    CplxT_double *data_allocated;
-    CplxT_double *data_aligned;
-    size_t offset;
-    size_t sizes[2];
-    size_t strides[2];
-};
-
-// MemRefT<double, dimension=1> type
-struct MemRefT_double_1d {
-    double *data_allocated;
-    double *data_aligned;
-    size_t offset;
-    size_t sizes[1];
-    size_t strides[1];
-};
-
-// MemRefT<double, dimension=2> type
-struct MemRefT_double_2d {
-    double *data_allocated;
-    double *data_aligned;
-    size_t offset;
-    size_t sizes[2];
-    size_t strides[2];
-};
-
-// MemRefT<int64_t, dimension=1> type
-struct MemRefT_int64_1d {
-    int64_t *data_allocated;
-    int64_t *data_aligned;
-    size_t offset;
-    size_t sizes[1];
-    size_t strides[1];
-};
-
-// MemRefT<int64_t, dimension=1> type
-struct MemRefT_int8_1d {
-    int8_t *data_allocated;
-    int8_t *data_aligned;
-    size_t offset;
-    size_t sizes[1];
-    size_t strides[1];
-};
-
-// PairT<MemRefT<double, dimension=1>, MemRefT<int64, dimension=2>> type
-struct PairT_MemRefT_double_int64_1d {
-    struct MemRefT_double_1d first;
-    struct MemRefT_int64_1d second;
-};
-
-// Quantum operation modifiers
-struct Modifiers {
-    bool adjoint;
-    size_t num_controlled;
-    QUBIT *controlled_wires;
-    bool *controlled_values;
-};
-
-using CplxT_double = struct CplxT_double;
-using MemRefT_CplxT_double_1d = struct MemRefT_CplxT_double_1d;
-using MemRefT_CplxT_double_2d = struct MemRefT_CplxT_double_2d;
-using MemRefT_double_1d = struct MemRefT_double_1d;
-using MemRefT_double_2d = struct MemRefT_double_2d;
-using MemRefT_int64_1d = struct MemRefT_int64_1d;
-using PairT_MemRefT_double_int64_1d = struct PairT_MemRefT_double_int64_1d;
-using Modifiers = struct Modifiers;
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif
diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp
deleted file mode 100644
index c50ab92..0000000
--- a/src/qirlightning/simple_demo/test_rt_device.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-#include <dlfcn.h>
-
-#include "QuantumDevice.hpp"
-
-// Runtime libraries (kokkos/GPU/qubit etc.)
-// Update these paths to point to the correct library
-#define RTDLIB                                                         \
-    "<UPDATE: site packages path>/" \
-    "pennylane_lightning/liblightning_kokkos_catalyst.so";
-#define RTDDEVICE "LightningKokkosSimulator";
-
-extern "C" Catalyst::Runtime::QuantumDevice*
-GenericDeviceFactory(char const* kwargs);
-
-using namespace Catalyst::Runtime;
-
-int main()
-{
-    try
-    {
-        // Load lightning simulation library
-        std::string rtd_lib = RTDLIB;
-        std::string rtd_device = RTDDEVICE;
-        std::string kwargs = {};
-        auto rtld_flags = RTLD_LAZY | RTLD_NODELETE;
-        auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags);
-
-        if (!rtd_dylib_handler)
-        {
-            throw std::runtime_error("Failed to load library: " + rtd_lib);
-        }
-
-        // Find device factory
-        std::string factory_name = rtd_device + "Factory";
-        void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str());
-
-        if (!f_ptr)
-        {
-            dlclose(rtd_dylib_handler);
-            throw std::runtime_error("Failed to find factory function: "
-                                     + factory_name);
-        }
-        std::string rtd_kwargs = {};
-        auto rtd_qdevice = std::unique_ptr<QuantumDevice>(
-            reinterpret_cast<decltype(GenericDeviceFactory)*>(f_ptr)(
-                rtd_kwargs.c_str()));
-
-        // Allocate Qubits
-        rtd_qdevice->AllocateQubits(3);
-
-        // Get Num Qubits
-        std::cout << "Num Qubits = " << rtd_qdevice->GetNumQubits()
-                  << std::endl;
-
-        // Apply Gate
-        rtd_qdevice->NamedOperation("Hadamard", {}, {0});
-
-        // Print State
-        std::cout << "State = " << std::endl;
-        rtd_qdevice->PrintState();
-
-        // Measure
-        QubitIdType wire{0};
-        Result result = rtd_qdevice->Measure(wire, std::nullopt);
-        std::cout << "Measure on wire 0 = " << *result << std::endl;
-    }
-    catch (std::exception const& e)
-    {
-        std::cerr << "Error: " << e.what() << std::endl;
-        return EXIT_FAILURE;
-    }
-
-    return EXIT_SUCCESS;
-}
diff --git a/src/qirlightning/support_catalyst.cmake b/src/qirlightning/support_catalyst.cmake
index 5932f06..ca7df76 100644
--- a/src/qirlightning/support_catalyst.cmake
+++ b/src/qirlightning/support_catalyst.cmake
@@ -26,7 +26,7 @@ macro(FindCatalyst target_name)
 
     else()
         if(NOT CATALYST_GIT_TAG)
-            set(CATALYST_GIT_TAG "main" CACHE STRING "GIT_TAG value to build Catalyst")
+            set(CATALYST_GIT_TAG "v0.12.0" CACHE STRING "GIT_TAG value to build Catalyst")
         endif()
         message(INFO " Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}")
 

From a93414ef23792b385520f78db0b5d5246b6f2d7d Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 8 Oct 2025 18:43:01 +0000
Subject: [PATCH 57/64] update

---
 src/qirlightning/LightningQuantum.cc | 47 +++++++++++++---------
 src/qirlightning/LightningQuantum.hh |  9 ++---
 src/qirlightning/LightningRuntime.cc |  3 +-
 src/qirlightning/README.md           | 59 +++++++++++++++-------------
 4 files changed, 66 insertions(+), 52 deletions(-)

diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
index e584a3b..e540799 100644
--- a/src/qirlightning/LightningQuantum.cc
+++ b/src/qirlightning/LightningQuantum.cc
@@ -11,11 +11,11 @@
 #include <algorithm>
 #include <iostream>
 #include <optional>
+#include <random>
 #include <stdexcept>
 #include <thread>
 #include <utility>
 #include <dlfcn.h>
-#include <random>
 
 #include "qiree/Assert.hh"
 
@@ -29,7 +29,8 @@ using namespace Catalyst::Runtime;
 /*!
  * Initialize the Lightning simulator
  */
-LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed)
+LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed)
+    : output_(os), seed_(seed)
 {
     std::string rtd_lib = RTDLIB;
     std::string rtd_device = RTDDEVICE;
@@ -56,8 +57,8 @@ LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : o
 
 //---------------------------------------------------------------------------//
 //! Default destructor
-LightningQuantum::~LightningQuantum() {
-
+LightningQuantum::~LightningQuantum()
+{
     if (rtd_dylib_handler)
     {
         dlclose(rtd_dylib_handler);
@@ -87,10 +88,7 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs)
 /*!
  * Complete an execution
  */
-void LightningQuantum::tear_down()
-{
-  
-}
+void LightningQuantum::tear_down() {}
 
 //---------------------------------------------------------------------------//
 /*!
@@ -103,7 +101,7 @@ void LightningQuantum::reset(Qubit q)
 
 //----------------------------------------------------------------------------//
 /*!
- * Read the value of a result. 
+ * Read the value of a result.
  */
 QState LightningQuantum::read_result(Result r) const
 {
@@ -117,13 +115,14 @@ QState LightningQuantum::read_result(Result r) const
  * Map a qubit to a result index.
  */
 void LightningQuantum::mz(Qubit q, Result r)
-{ 
-    QIREE_EXPECT(q.value < this->num_qubits());  
+{
+    QIREE_EXPECT(q.value < this->num_qubits());
     QIREE_EXPECT(r.value < this->num_results());
     std::mt19937 gen(seed_);
     seed_++;
     rtd_qdevice->SetDevicePRNG(&gen);
-    auto result = rtd_qdevice->Measure(static_cast<intptr_t>(q.value), std::nullopt);
+    auto result
+        = rtd_qdevice->Measure(static_cast<intptr_t>(q.value), std::nullopt);
     results_[r.value] = *result;
 }
 
@@ -136,22 +135,29 @@ void LightningQuantum::mz(Qubit q, Result r)
 void LightningQuantum::cx(Qubit q1, Qubit q2)
 {
     rtd_qdevice->NamedOperation(
-        "CNOT", {}, {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
+        "CNOT",
+        {},
+        {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
 }
 void LightningQuantum::cnot(Qubit q1, Qubit q2)
 {
     rtd_qdevice->NamedOperation(
-        "CNOT", {}, {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
+        "CNOT",
+        {},
+        {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
 }
 void LightningQuantum::cz(Qubit q1, Qubit q2)
 {
     rtd_qdevice->NamedOperation(
-        "CZ", {}, {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
+        "CZ",
+        {},
+        {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
 }
 // 2. Local gates
 void LightningQuantum::h(Qubit q)
 {
-    rtd_qdevice->NamedOperation("Hadamard", {}, {static_cast<intptr_t>(q.value)});
+    rtd_qdevice->NamedOperation(
+        "Hadamard", {}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::s(Qubit q)
 {
@@ -177,15 +183,18 @@ void LightningQuantum::z(Qubit q)
 // 2.2 rotation gates
 void LightningQuantum::rx(double theta, Qubit q)
 {
-    rtd_qdevice->NamedOperation("RX", {theta}, {static_cast<intptr_t>(q.value)});
+    rtd_qdevice->NamedOperation(
+        "RX", {theta}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::ry(double theta, Qubit q)
 {
-    rtd_qdevice->NamedOperation("RY", {theta}, {static_cast<intptr_t>(q.value)});
+    rtd_qdevice->NamedOperation(
+        "RY", {theta}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::rz(double theta, Qubit q)
 {
-    rtd_qdevice->NamedOperation("RZ", {theta}, {static_cast<intptr_t>(q.value)});
+    rtd_qdevice->NamedOperation(
+        "RZ", {theta}, {static_cast<intptr_t>(q.value)});
 }
 
 }  // namespace qiree
diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh
index e68bf66..f62a692 100644
--- a/src/qirlightning/LightningQuantum.hh
+++ b/src/qirlightning/LightningQuantum.hh
@@ -33,8 +33,8 @@ class LightningQuantum final : virtual public QuantumNotImpl
     LightningQuantum(std::ostream& os, unsigned long int seed);
     ~LightningQuantum();
 
-    QIREE_DELETE_COPY_MOVE(LightningQuantum);  // Delete copy and move constructors
-
+    QIREE_DELETE_COPY_MOVE(LightningQuantum);  // Delete copy and move
+                                               // constructors
 
     //!@{
     //! \name Accessors
@@ -62,11 +62,10 @@ class LightningQuantum final : virtual public QuantumNotImpl
     QState read_result(Result) const final;
     //!@}
 
-
     //!@{
     //! \name Circuit construction
     // void ccx(Qubit, Qubit) final;
-    void ccnot(Qubit, Qubit, Qubit); 
+    void ccnot(Qubit, Qubit, Qubit);
     void cnot(Qubit, Qubit) final;
     void cx(Qubit, Qubit) final;
     // void cy(Qubit, Qubit) final;
@@ -92,7 +91,7 @@ class LightningQuantum final : virtual public QuantumNotImpl
 
     struct Factory;
     struct State;
-  
+
     //// DATA ////
 
     std::ostream& output_;
diff --git a/src/qirlightning/LightningRuntime.cc b/src/qirlightning/LightningRuntime.cc
index 89bce1a..fb91b2a 100644
--- a/src/qirlightning/LightningRuntime.cc
+++ b/src/qirlightning/LightningRuntime.cc
@@ -18,7 +18,8 @@ namespace qiree
 /*!
  * Construct with quantum reference to access classical registers.
  */
-LightningRuntime::LightningRuntime(std::ostream& output, LightningQuantum const& sim)
+LightningRuntime::LightningRuntime(std::ostream& output,
+                                   LightningQuantum const& sim)
     : SingleResultRuntime{sim}, output_(output)
 {
 }
diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md
index 2777869..dbbc46f 100644
--- a/src/qirlightning/README.md
+++ b/src/qirlightning/README.md
@@ -1,58 +1,55 @@
 # QIR-EE with Lightning simulator backend
 
-## Installing a lightning simulator
+The [PennyLane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) plugins are high-performance quantum simulators, which are part of the [PennyLane](https://github.com/PennyLaneAI/pennylane) ecosystem. The simulators include the following backends (which can be used with QIREE):
+- `lightning.qubit`: a fast state-vector simulator with optional OpenMP additions and parallelized gate-level SIMD kernels.
+- `lightning.gpu`: a state-vector simulator based on the NVIDIA cuQuantum SDK.
+- `lightning.kokkos`: a state-vector simulator written with Kokkos. It can exploit the inherent parallelism of modern processing units supporting the OpenMP, CUDA or HIP programming models.
 
-More information on installing Pennylane Lightning simulators can be found in [lightning repository](https://github.com/PennyLaneAI/pennylane-lightning).
+## Installing a Lightning simulator
+
+More information on installing Pennylane Lightning simulators from source, please visit the [Lightning installation page](https://docs.pennylane.ai/projects/lightning/en/latest/dev/installation.html). Note: QIREE is tested to work with PennyLane Lightning simulators v0.42.
 
 ### Quick start
-The easiest way to get started is install a Lightning simulator from PyPI via pip:
+
+The easiest way to get started is to install a Lightning simulator (`pennylane-lightning`/`pennylane-lightning-gpu`/`pennylane-lightning-kokkos`) from PyPI via pip:
 
 ```
-$ pip install pennylane-lightning-kokkos
+$ pip install pennylane-lightning-kokkos==0.42.0
 
 $ pip show pennylane-lightning-kokkos
 Name: PennyLane_Lightning_Kokkos
-Version: 0.40.0
+Version: 0.42.0
 Summary: PennyLane-Lightning plugin
 Home-page: https://github.com/PennyLaneAI/pennylane-lightning
-Author: 
-Author-email: 
+Author:
+Author-email:
 License: Apache License 2.0
 Location: <site packages path>
 Requires: pennylane, pennylane-lightning
 ```
-Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
+Running `pip install pennylane` or `pip install pennylane-lightning` will automatically install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. Note: by default, the pre-built `lightning.kokkos` wheels from pip are built with Kokkos OpenMP enabled for CPU. To build Kokkos for other devices (e.g. CUDA or HIP GPUs), please install from source. Instruction can be found [here](https://docs.pennylane.ai/projects/lightning/en/latest/lightning_kokkos/installation.html).
 
-When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. These are named `liblightning_qubit_catalyst.so`/`liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` respectively.
+When installing Pennylane-Lightning from pip or from source, you will have the shared libraries for each of the simulator installed. These are named `liblightning_qubit_catalyst.so`/`liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` respectively.
 
-Example:
+To obtain the path to the library:
 ```
-$ ls <site packages path>
-... liblightning_kokkos_catalyst.so ...
+$ export PL_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')")
+
+$ ls $PL_PATH
+... liblightning_qubit_catalyst.so  liblightning_kokkos_catalyst.so ...
 ```
 
 You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.gpu simulators.
 
-### Compiling Lightning from Source
-
-The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightning) contains information on how to install Lightning simulators from source. This will necessary for e.g. Kokkos with HIP backend.
-
 ## Compile QIR-EE with Lightning backend
 
-- Set `QIREE_USE_LIGHTNING` to `ON` in `qiree/CMakeLists.txt`
-- Set the environment variable `LIGHTNING_SIM_PATH` to the shared object of the Lightning Simulator, e.g.
+To compile QIR-EE with lightning backend:
 
 ```
+# Set the path for the lightning simulator shared library
 export LIGHTNING_SIM_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')")/liblightning_kokkos_catalyst.so
-```
-
-Note: 
-- replace `libligghtning_kokkos_catalyst.so` with `liblightning_qubit_catalyst.so` or `liblightning_GPU_catalyst.so` if required.
-- when running on `GPU`, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`
-
-To compile:
 
-```
+# Proceed with usual build instructions, but with `-DQIREE_USE_LIGHTNING=ON` cmake flag
 cd qiree/
 mkdir build; cd build
 cmake -DQIREE_USE_LIGHTNING=ON ..
@@ -60,6 +57,15 @@ make
 
 ```
 
+Note:
+- replace `libligghtning_kokkos_catalyst.so` with `liblightning_qubit_catalyst.so` or `liblightning_GPU_catalyst.so` if required.
+- when running with `lightning.gpu` simulator for Nvidia GPUs, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e.
+
+```
+LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out
+```
+
+
 ## Running the example
 
 To run (in the `build` directory):
@@ -68,4 +74,3 @@ To run (in the `build` directory):
 $ ./bin/qir-lightning ../examples/bell.ll -s 100
 {"00":43,"11":57}
 ```
-

From f8e43c5e049cac2b702f786b3c9a13a09fb7bce6 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 8 Oct 2025 18:49:55 +0000
Subject: [PATCH 58/64] update docs

---
 src/qirlightning/README.md | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md
index dbbc46f..4058002 100644
--- a/src/qirlightning/README.md
+++ b/src/qirlightning/README.md
@@ -7,7 +7,7 @@ The [PennyLane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) pl
 
 ## Installing a Lightning simulator
 
-More information on installing Pennylane Lightning simulators from source, please visit the [Lightning installation page](https://docs.pennylane.ai/projects/lightning/en/latest/dev/installation.html). Note: QIREE is tested to work with PennyLane Lightning simulators v0.42.
+For more information on installing Pennylane Lightning simulators from source, please visit the [Lightning installation page](https://docs.pennylane.ai/projects/lightning/en/latest/dev/installation.html). Note: QIREE is tested to work with PennyLane Lightning simulators v0.42.
 
 ### Quick start
 
@@ -27,7 +27,12 @@ License: Apache License 2.0
 Location: <site packages path>
 Requires: pennylane, pennylane-lightning
 ```
-Running `pip install pennylane` or `pip install pennylane-lightning` will automatically install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. Note: by default, the pre-built `lightning.kokkos` wheels from pip are built with Kokkos OpenMP enabled for CPU. To build Kokkos for other devices (e.g. CUDA or HIP GPUs), please install from source. Instruction can be found [here](https://docs.pennylane.ai/projects/lightning/en/latest/lightning_kokkos/installation.html).
+
+**Note:** PennyLane and PennyLane Lightning support Python 3.11-3.13.
+
+Running `pip install pennylane` or `pip install pennylane-lightning` will automatically install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
+
+**Note:** By default, the pre-built `lightning.kokkos` wheels from pip are built with Kokkos OpenMP enabled for CPU. To build Kokkos for other devices (e.g. CUDA or HIP GPUs), please install from source. Instructions can be found [here](https://docs.pennylane.ai/projects/lightning/en/latest/lightning_kokkos/installation.html).
 
 When installing Pennylane-Lightning from pip or from source, you will have the shared libraries for each of the simulator installed. These are named `liblightning_qubit_catalyst.so`/`liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` respectively.
 
@@ -39,8 +44,6 @@ $ ls $PL_PATH
 ... liblightning_qubit_catalyst.so  liblightning_kokkos_catalyst.so ...
 ```
 
-You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.gpu simulators.
-
 ## Compile QIR-EE with Lightning backend
 
 To compile QIR-EE with lightning backend:
@@ -57,12 +60,12 @@ make
 
 ```
 
-Note:
+**Note:**
 - replace `libligghtning_kokkos_catalyst.so` with `liblightning_qubit_catalyst.so` or `liblightning_GPU_catalyst.so` if required.
 - when running with `lightning.gpu` simulator for Nvidia GPUs, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e.
 
 ```
-LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out
+export LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH
 ```
 
 

From 2602033d9bb41181e260d222e715b384ba5972de Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Mon, 20 Oct 2025 22:21:26 +0000
Subject: [PATCH 59/64] update private member names

---
 src/qirlightning/LightningQuantum.cc | 67 +++++++++++++---------------
 src/qirlightning/LightningQuantum.hh |  6 +--
 2 files changed, 35 insertions(+), 38 deletions(-)

diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
index e540799..d3538a2 100644
--- a/src/qirlightning/LightningQuantum.cc
+++ b/src/qirlightning/LightningQuantum.cc
@@ -32,37 +32,31 @@ using namespace Catalyst::Runtime;
 LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed)
     : output_(os), seed_(seed)
 {
-    std::string rtd_lib = RTDLIB;
-    std::string rtd_device = RTDDEVICE;
-    std::string kwargs = {};
     auto rtld_flags = RTLD_LAZY | RTLD_NODELETE;
-    rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags);
+    rtd_dylib_handler_ = dlopen(RTDLIB, rtld_flags);
 
-    if (!rtd_dylib_handler)
-    {
-        throw std::runtime_error("Failed to load library: " + rtd_lib);
-    }
+    QIREE_VALIDATE(rtd_dylib_handler_,
+                   << "failed to load Lightning runtime library '" << RTDLIB
+                   << "'");
 
     // Find device factory
+    std::string rtd_device = RTDDEVICE;
     std::string factory_name = rtd_device + "Factory";
-    factory_f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str());
+    factory_f_ptr_ = dlsym(rtd_dylib_handler_, factory_name.c_str());
 
-    if (!factory_f_ptr)
-    {
-        dlclose(rtd_dylib_handler);
-        throw std::runtime_error("Failed to find factory function: "
-                                 + factory_name);
-    }
+    QIREE_VALIDATE(factory_f_ptr_,
+                   << "failed to find device factory function '"
+                   << factory_name << "'");
 }
 
 //---------------------------------------------------------------------------//
 //! Default destructor
 LightningQuantum::~LightningQuantum()
 {
-    if (rtd_dylib_handler)
+    if (rtd_dylib_handler_)
     {
-        dlclose(rtd_dylib_handler);
-    };
+        dlclose(rtd_dylib_handler_);
+    }
 };
 
 //---------------------------------------------------------------------------//
@@ -77,11 +71,11 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs)
     results_.resize(attrs.required_num_results);
 
     std::string rtd_kwargs = {};
-    rtd_qdevice = std::unique_ptr<QuantumDevice>(
-        reinterpret_cast<decltype(GenericDeviceFactory)*>(factory_f_ptr)(
+    rtd_qdevice_ = std::unique_ptr<QuantumDevice>(
+        reinterpret_cast<decltype(GenericDeviceFactory)*>(factory_f_ptr_)(
             rtd_kwargs.c_str()));
 
-    rtd_qdevice->AllocateQubits(num_qubits_);
+    rtd_qdevice_->AllocateQubits(num_qubits_);
 }
 
 //---------------------------------------------------------------------------//
@@ -120,9 +114,9 @@ void LightningQuantum::mz(Qubit q, Result r)
     QIREE_EXPECT(r.value < this->num_results());
     std::mt19937 gen(seed_);
     seed_++;
-    rtd_qdevice->SetDevicePRNG(&gen);
+    rtd_qdevice_->SetDevicePRNG(&gen);
     auto result
-        = rtd_qdevice->Measure(static_cast<intptr_t>(q.value), std::nullopt);
+        = rtd_qdevice_->Measure(static_cast<intptr_t>(q.value), std::nullopt);
     results_[r.value] = *result;
 }
 
@@ -134,21 +128,21 @@ void LightningQuantum::mz(Qubit q, Result r)
 // 1. Entangling gates
 void LightningQuantum::cx(Qubit q1, Qubit q2)
 {
-    rtd_qdevice->NamedOperation(
+    rtd_qdevice_->NamedOperation(
         "CNOT",
         {},
         {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
 }
 void LightningQuantum::cnot(Qubit q1, Qubit q2)
 {
-    rtd_qdevice->NamedOperation(
+    rtd_qdevice_->NamedOperation(
         "CNOT",
         {},
         {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
 }
 void LightningQuantum::cz(Qubit q1, Qubit q2)
 {
-    rtd_qdevice->NamedOperation(
+    rtd_qdevice_->NamedOperation(
         "CZ",
         {},
         {static_cast<intptr_t>(q1.value), static_cast<intptr_t>(q2.value)});
@@ -156,44 +150,47 @@ void LightningQuantum::cz(Qubit q1, Qubit q2)
 // 2. Local gates
 void LightningQuantum::h(Qubit q)
 {
-    rtd_qdevice->NamedOperation(
+    rtd_qdevice_->NamedOperation(
         "Hadamard", {}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::s(Qubit q)
 {
-    rtd_qdevice->NamedOperation("S", {}, {static_cast<intptr_t>(q.value)});
+    rtd_qdevice_->NamedOperation("S", {}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::t(Qubit q)
 {
-    rtd_qdevice->NamedOperation("T", {}, {static_cast<intptr_t>(q.value)});
+    rtd_qdevice_->NamedOperation("T", {}, {static_cast<intptr_t>(q.value)});
 }
 // 2.1 Pauli gates
 void LightningQuantum::x(Qubit q)
 {
-    rtd_qdevice->NamedOperation("PauliX", {}, {static_cast<intptr_t>(q.value)});
+    rtd_qdevice_->NamedOperation(
+        "PauliX", {}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::y(Qubit q)
 {
-    rtd_qdevice->NamedOperation("PauliY", {}, {static_cast<intptr_t>(q.value)});
+    rtd_qdevice_->NamedOperation(
+        "PauliY", {}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::z(Qubit q)
 {
-    rtd_qdevice->NamedOperation("PauliZ", {}, {static_cast<intptr_t>(q.value)});
+    rtd_qdevice_->NamedOperation(
+        "PauliZ", {}, {static_cast<intptr_t>(q.value)});
 }
 // 2.2 rotation gates
 void LightningQuantum::rx(double theta, Qubit q)
 {
-    rtd_qdevice->NamedOperation(
+    rtd_qdevice_->NamedOperation(
         "RX", {theta}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::ry(double theta, Qubit q)
 {
-    rtd_qdevice->NamedOperation(
+    rtd_qdevice_->NamedOperation(
         "RY", {theta}, {static_cast<intptr_t>(q.value)});
 }
 void LightningQuantum::rz(double theta, Qubit q)
 {
-    rtd_qdevice->NamedOperation(
+    rtd_qdevice_->NamedOperation(
         "RZ", {theta}, {static_cast<intptr_t>(q.value)});
 }
 
diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh
index f62a692..ebadd71 100644
--- a/src/qirlightning/LightningQuantum.hh
+++ b/src/qirlightning/LightningQuantum.hh
@@ -96,9 +96,9 @@ class LightningQuantum final : virtual public QuantumNotImpl
 
     std::ostream& output_;
     unsigned long int seed_{};
-    void* rtd_dylib_handler;
-    void* factory_f_ptr;
-    std::unique_ptr<Catalyst::Runtime::QuantumDevice> rtd_qdevice;
+    void* rtd_dylib_handler_;
+    void* factory_f_ptr_;
+    std::unique_ptr<Catalyst::Runtime::QuantumDevice> rtd_qdevice_;
     std::vector<bool> results_;
 
     size_type num_qubits_{};

From c8c69176175f670857328fef10dcdd65181eadd9 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 22 Oct 2025 21:35:32 +0000
Subject: [PATCH 60/64] seth comments

---
 .github/workflows/build-fast.yml              |  53 ++-
 .github/workflows/build-lightning.yml         |  98 -----
 CMakeLists.txt                                |  23 +-
 cmake/support_catalyst.cmake                  |  80 ++++
 scripts/lightning-path.sh                     |  61 +++
 src/qirlightning/CMakeLists.txt               |  47 +--
 src/qirlightning/LightningQuantum.cc          |  26 +-
 src/qirlightning/README.md                    |  27 +-
 src/qirlightning/simple_demo/README.md        |  69 +++
 .../snapshot_catalyst_runtime/README.rst      | 118 ++++++
 .../include/DataView.hpp                      | 173 ++++++++
 .../include/Exception.hpp                     |  96 +++++
 .../include/QuantumDevice.hpp                 | 399 ++++++++++++++++++
 .../snapshot_catalyst_runtime/include/Types.h | 179 ++++++++
 .../simple_demo/test_rt_device.cpp            |  74 ++++
 src/qirlightning/support_catalyst.cmake       |  74 ----
 16 files changed, 1345 insertions(+), 252 deletions(-)
 delete mode 100644 .github/workflows/build-lightning.yml
 create mode 100644 cmake/support_catalyst.cmake
 create mode 100755 scripts/lightning-path.sh
 create mode 100644 src/qirlightning/simple_demo/README.md
 create mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst
 create mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp
 create mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp
 create mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp
 create mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h
 create mode 100644 src/qirlightning/simple_demo/test_rt_device.cpp
 delete mode 100644 src/qirlightning/support_catalyst.cmake

diff --git a/.github/workflows/build-fast.yml b/.github/workflows/build-fast.yml
index d8c1fa7..e824947 100644
--- a/.github/workflows/build-fast.yml
+++ b/.github/workflows/build-fast.yml
@@ -1,4 +1,3 @@
-# Build directly on the GitHub runner with caching
 name: build-fast
 on:
   workflow_dispatch:
@@ -10,10 +9,10 @@ concurrency:
 
 jobs:
   linux:
-    name: ${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-llvm${{matrix.llvm}}
+    name: ${{matrix.build-config.runner}}-${{matrix.build-config.compiler}}-${{matrix.build-config.version}}-llvm${{matrix.build-config.llvm}}${{ matrix.use-lightning == true && '-lightning' || '' }}
     strategy:
       matrix:
-        include:
+        build-config:
           - runner: jammy
             compiler: gcc
             version: 12
@@ -22,43 +21,65 @@ jobs:
             compiler: clang
             version: 15
             llvm: 15
+        use-lightning: [false, true]
     runs-on: >-
-      ${{  matrix.runner == 'focal' && 'ubuntu-20.04'
-        || matrix.runner == 'jammy' && 'ubuntu-22.04'
+      ${{  matrix.build-config.runner == 'focal' && 'ubuntu-20.04'
+        || matrix.build-config.runner == 'jammy' && 'ubuntu-22.04'
         || null
       }}
     env:
       CCACHE_DIR: "${{github.workspace}}/.ccache"
       CCACHE_MAXSIZE: "10G"
-      CC: ${{matrix.compiler}}-${{matrix.version}}
-      CXX: ${{matrix.compiler == 'gcc' && 'g++' || 'clang++'}}-${{matrix.version}}
+      CC: ${{matrix.build-config.compiler}}-${{matrix.build-config.version}}
+      CXX: ${{matrix.build-config.compiler == 'gcc' && 'g++' || 'clang++'}}-${{matrix.build-config.version}}
     steps:
+      - name: Install Python (if building for Lightning)
+        if: matrix.use-lightning == true
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
       - name: Install dependencies
         run: |
           sudo apt-get -q -y update
           sudo apt-get -q -y install \
             ccache cmake ninja-build libgtest-dev \
-            llvm-${{matrix.llvm}}-dev \
-            ${{matrix.compiler}}-${{matrix.version}} \
-            ${{matrix.compiler == 'gcc' && format('g++-{0}', matrix.version) || ''}}
+            llvm-${{matrix.build-config.llvm}}-dev \
+            ${{matrix.build-config.compiler}}-${{matrix.build-config.version}} \
+            ${{matrix.build-config.compiler == 'gcc' && format('g++-{0}', matrix.build-config.version) || ''}}
+
+          if [[ "${{ matrix.use-lightning }}" == "true" ]]; then
+            echo "Installing Lightning Python dependencies..."
+            python -m pip install pennylane-lightning==0.43.0
+          fi
+
           echo "Installed toolchain:"
           ld --version | head -1
           $CC --version | head -1
           $CXX --version | head -1
-          llvm-config-${{matrix.llvm}} --version | head -1
+          llvm-config-${{matrix.build-config.llvm}} --version | head -1
+
       - name: Check out
         uses: actions/checkout@v4
+
       - name: Set up ccache
         uses: actions/cache@v4
         with:
           path: ${{env.CCACHE_DIR}}
-          key: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-${{github.run_id}}
-          restore-keys: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}
+          key: ccache-${{matrix.build-config.runner}}-${{matrix.build-config.compiler}}-${{matrix.build-config.version}}-${{ matrix.use-lightning == true && 'lightning' || 'base' }}-${{github.run_id}}  # variant is part of the key so the two use-lightning legs do not race to save the same cache entry
+          restore-keys: ccache-${{matrix.build-config.runner}}-${{matrix.build-config.compiler}}-${{matrix.build-config.version}}-${{ matrix.use-lightning == true && 'lightning' || 'base' }}-  # trailing dash keeps the prefix match within one variant
+
       - name: Zero ccache stats
         run: |
           ccache -z
+
       - name: Configure
         run: |
+          if [[ "${{ matrix.use-lightning }}" == "true" ]]; then
+            export LIGHTNING_PATH=$(bash ./scripts/lightning-path.sh qubit)
+          else
+            export LIGHTNING_PATH=""
+          fi
           mkdir build && cd build
           cmake -GNinja \
             -DQIREE_GIT_DESCRIBE="${{github.event.pull_request
@@ -67,23 +88,29 @@ jobs:
             -DQIREE_BUILD_TESTS:BOOL=ON \
             -DQIREE_DEBUG:BOOL=ON \
             -DQIREE_USE_XACC:BOOL=OFF \
+            -DQIREE_USE_LIGHTNING:BOOL=${{ matrix.use-lightning == true && 'ON' || 'OFF' }} \
+            -DQIREE_LIGHTNING_SIM_PATH="$LIGHTNING_PATH" \
             -DCMAKE_BUILD_TYPE="Release" \
             -DCMAKE_INSTALL_PREFIX="${{github.workspace}}/install" \
             -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
             -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic" \
             ..
+
       - name: Build all
         working-directory: build
         run: |
           ninja
+
       - name: Run tests
         working-directory: build
         run: |
           ctest --parallel 2 --timeout 15 --output-on-failure
+
       - name: Install
         working-directory: build
         run: |
           ninja install
+
       - name: Show ccache stats
         run: |
           ccache -s
diff --git a/.github/workflows/build-lightning.yml b/.github/workflows/build-lightning.yml
deleted file mode 100644
index 052830c..0000000
--- a/.github/workflows/build-lightning.yml
+++ /dev/null
@@ -1,98 +0,0 @@
-# Build directly on the GitHub runner with caching
-name: build-lightning
-on:
-  workflow_dispatch:
-  workflow_call:
-
-concurrency:
-  group: build-lightning-${{github.ref}}-${{github.event.pull_request.number || github.run_number}}-${{github.workflow}}
-  cancel-in-progress: true
-
-jobs:
-  linux:
-    name: ${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-llvm${{matrix.llvm}}
-    strategy:
-      matrix:
-        include:
-          - runner: jammy
-            compiler: gcc
-            version: 12
-            llvm: 14
-          - runner: jammy
-            compiler: clang
-            version: 15
-            llvm: 15
-    runs-on: >-
-      ${{  matrix.runner == 'focal' && 'ubuntu-20.04'
-        || matrix.runner == 'jammy' && 'ubuntu-22.04'
-        || null
-      }}
-    env:
-      CCACHE_DIR: "${{github.workspace}}/.ccache"
-      CCACHE_MAXSIZE: "10G"
-      CC: ${{matrix.compiler}}-${{matrix.version}}
-      CXX: ${{matrix.compiler == 'gcc' && 'g++' || 'clang++'}}-${{matrix.version}}
-    steps:
-      - uses: actions/setup-python@v5
-        name: Install Python
-        with:
-          python-version: '3.10'
-      - name: Install dependencies
-        run: |
-          sudo apt-get -q -y update
-          sudo apt-get -q -y install \
-            ccache cmake ninja-build libgtest-dev \
-            llvm-${{matrix.llvm}}-dev \
-            ${{matrix.compiler}}-${{matrix.version}} \
-            ${{matrix.compiler == 'gcc' && format('g++-{0}', matrix.version) || ''}}
-          echo "Installed toolchain:"
-          ld --version | head -1
-          $CC --version | head -1
-          $CXX --version | head -1
-          llvm-config-${{matrix.llvm}} --version | head -1
-          python -m pip install pennylane-lightning
-      - name: Check out
-        uses: actions/checkout@v4
-      - name: Set up ccache
-        uses: actions/cache@v4
-        with:
-          path: ${{env.CCACHE_DIR}}
-          key: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-${{github.run_id}}
-          restore-keys: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}
-      - name: Zero ccache stats
-        run: |
-          ccache -z
-      - name: Configure
-        run: |
-          export LIGHTNING_SIM_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')")/liblightning_qubit_catalyst.so
-          mkdir build && cd build
-          cmake -GNinja \
-            -DQIREE_GIT_DESCRIBE="${{github.event.pull_request
-              && format(';-pr.{0};', github.event.pull_request.number)
-              || format(';-{0};', github.ref_name)}}" \
-            -DQIREE_BUILD_TESTS:BOOL=ON \
-            -DQIREE_DEBUG:BOOL=ON \
-            -DQIREE_USE_XACC:BOOL=OFF \
-            -DQIREE_USE_LIGHTNING:BOOL=ON \
-            -DCMAKE_BUILD_TYPE="Release" \
-            -DCMAKE_INSTALL_PREFIX="${{github.workspace}}/install" \
-            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-            -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic" \
-            ..
-      - name: Build all
-        working-directory: build
-        run: |
-          ninja
-      - name: Run tests
-        working-directory: build
-        run: |
-          ctest --parallel 2 --timeout 15 --output-on-failure
-      - name: Install
-        working-directory: build
-        run: |
-          ninja install
-      - name: Show ccache stats
-        run: |
-          ccache -s
-
-# vim: set nowrap tw=100:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2ba1193..a4704ef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -139,12 +139,23 @@ if(QIREE_USE_QSIM)
 endif()
 
 if(QIREE_USE_LIGHTNING)
-qiree_add_library(qiree_lightning INTERFACE)
-add_library(QIREE::lightning ALIAS qiree_lightning)
-target_include_directories(qiree_lightning SYSTEM INTERFACE
-  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/external>"
-  "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/external>"
-)
+  qiree_add_library(qiree_lightning INTERFACE)
+  add_library(QIREE::lightning ALIAS qiree_lightning)
+  # Fetch Catalyst runtime include files
+  include("${CMAKE_CURRENT_LIST_DIR}/cmake/support_catalyst.cmake")
+  FindCatalyst(qiree_lightning)
+  target_include_directories(qiree_lightning SYSTEM INTERFACE
+    "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/external>"
+    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/external>"
+  )
+  set(QIREE_LIGHTNING_SIM_PATH "" CACHE FILEPATH "Path to the Lightning simulator shared library")
+  if(NOT QIREE_LIGHTNING_SIM_PATH)
+    message(FATAL_ERROR "QIREE_LIGHTNING_SIM_PATH is not set. Please specify the path using: -DQIREE_LIGHTNING_SIM_PATH=/path/to/lib")
+  elseif(NOT EXISTS "${QIREE_LIGHTNING_SIM_PATH}")  # warn only: the path is compiled in and opened when the program runs
+    message(WARNING "QIREE_LIGHTNING_SIM_PATH=${QIREE_LIGHTNING_SIM_PATH} does not exist; the Lightning backend will fail to load it at run time")
+  endif()
+  message(STATUS "Using Lightning simulator shared library: ${QIREE_LIGHTNING_SIM_PATH}")
+  target_compile_definitions(qiree_lightning INTERFACE QIREE_LIGHTNING_RTDLIB="${QIREE_LIGHTNING_SIM_PATH}")
 endif()
 
 
diff --git a/cmake/support_catalyst.cmake b/cmake/support_catalyst.cmake
new file mode 100644
index 0000000..95c7f73
--- /dev/null
+++ b/cmake/support_catalyst.cmake
@@ -0,0 +1,80 @@
+###############################################################################################
+# This file provides macros to process Catalyst.
+###############################################################################################
+
+# Include this only once
+include_guard()
+
+# FindCatalyst(<target_name>): expose the Catalyst runtime headers on <target_name> as
+# INTERFACE include directories. With LIGHTNING_CATALYST_SRC_PATH set, the headers come from
+# that local checkout; otherwise they are downloaded one by one from GitHub at
+# CATALYST_GIT_TAG (default "v0.13.0"). Being a macro, it sets variables in the caller's scope.
+macro(FindCatalyst target_name)
+  if(LIGHTNING_CATALYST_SRC_PATH)
+    if(NOT IS_ABSOLUTE "${LIGHTNING_CATALYST_SRC_PATH}")
+      message(FATAL_ERROR " LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH} must be set to an absolute path")
+    endif()
+    if(CATALYST_GIT_TAG)
+      message(WARNING "Setting `LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH}` overrides `CATALYST_GIT_TAG=${CATALYST_GIT_TAG}`")
+    endif()
+
+    # Acquire local git hash and use for CATALYST_GIT_TAG. git ends its output with a
+    # newline, which is stripped so the tag is clean when it is logged below.
+    execute_process(COMMAND git rev-parse --short HEAD
+      WORKING_DIRECTORY "${LIGHTNING_CATALYST_SRC_PATH}"
+      OUTPUT_VARIABLE CATALYST_GIT_TAG
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    message(STATUS "Building against local Catalyst - path: ${LIGHTNING_CATALYST_SRC_PATH} - GIT TAG: ${CATALYST_GIT_TAG}")
+
+    target_include_directories(${target_name} INTERFACE
+      $<BUILD_INTERFACE:${LIGHTNING_CATALYST_SRC_PATH}/runtime/lib/backend/common>
+      $<BUILD_INTERFACE:${LIGHTNING_CATALYST_SRC_PATH}/runtime/include>
+    )
+
+  else()
+    if(NOT CATALYST_GIT_TAG)
+      set(CATALYST_GIT_TAG "v0.13.0" CACHE STRING "GIT_TAG value to build Catalyst")
+    endif()
+    message(STATUS "Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}")
+
+    # FetchContent_* is used below. Load the module here (including it again is harmless)
+    # instead of relying on another file having included it first.
+    include(FetchContent)
+    set(CATALYST_DOWNLOAD_INCLUDE_DIR "${PROJECT_BINARY_DIR}/catalyst-headers")
+
+    # Fetching /lib/backend/common hpp headers
+    set(LIB_BACKEND_COMMON_HEADERS CacheManager.hpp QubitManager.hpp Utils.hpp)
+    foreach(HEADER ${LIB_BACKEND_COMMON_HEADERS})
+      # The FetchContent name is the file name with its extension removed
+      string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME ${HEADER})
+      FetchContent_Declare(
+        ${HEADER_NAME}
+        URL         https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/lib/backend/common/${HEADER}
+        DOWNLOAD_NO_EXTRACT True
+        SOURCE_DIR    ${CATALYST_DOWNLOAD_INCLUDE_DIR}
+      )
+
+      FetchContent_MakeAvailable(${HEADER_NAME})
+    endforeach()
+
+    # Fetching include hpp headers
+    set(INCLUDE_HEADERS DataView.hpp Exception.hpp QuantumDevice.hpp RuntimeCAPI.h Types.h)
+    foreach(HEADER ${INCLUDE_HEADERS})
+      string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME ${HEADER})
+      FetchContent_Declare(
+        ${HEADER_NAME}
+        URL         https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/include/${HEADER}
+        DOWNLOAD_NO_EXTRACT True
+        SOURCE_DIR    ${CATALYST_DOWNLOAD_INCLUDE_DIR}
+      )
+
+      FetchContent_MakeAvailable(${HEADER_NAME})
+    endforeach()
+
+    # Every header above was saved into one directory; expose it to users of the target
+    target_include_directories(${target_name} INTERFACE
+      $<BUILD_INTERFACE:${CATALYST_DOWNLOAD_INCLUDE_DIR}>
+    )
+  endif()
+endmacro()
diff --git a/scripts/lightning-path.sh b/scripts/lightning-path.sh
new file mode 100755
index 0000000..b459b5f
--- /dev/null
+++ b/scripts/lightning-path.sh
@@ -0,0 +1,61 @@
+#!/bin/bash -e
+#
+# This script determines the absolute path to a PennyLane-Lightning simulator
+# library to be used with QIREE's Lightning backend.
+#
+# Usage:
+#   ./scripts/lightning-path.sh <simulator_type>
+#
+# Example:
+#   ./scripts/lightning-path.sh qubit
+
+if [ -z "$1" ]; then
+  echo "Error: Missing argument. Usage: $0 <simulator_type>" >&2
+  echo "Example: $0 qubit" >&2
+  exit 1
+fi
+
+# Validate simulator type
+case "$1" in
+  qubit|gpu|kokkos)
+    ;;
+  *)
+    echo "Error: Invalid simulator type '$1'. Must be one of 'qubit', 'gpu', or 'kokkos'." >&2
+    exit 1
+    ;;
+esac
+
+SIM_TYPE="$1"
+
+# Determine OS-specific library suffix
+UNAME_S=$(uname -s)
+case "$UNAME_S" in
+  Linux*)   LIB_SUFFIX=".so";;
+  Darwin*)  LIB_SUFFIX=".dylib";;
+  *)
+    echo "Error: Unsupported platform '$UNAME_S'. QIREE with PennyLane-Lightning only supports Linux and macOS." >&2
+    exit 1
+    ;;
+esac
+
+# Find the base PennyLane-Lightning installation directory using Python.
+# The lookup is tested inside `if` because this script runs under `bash -e`: a bare failing
+# assignment would abort the script before the error message below could be printed.
+if ! BASE_PATH=$(python -c "import site; print(f'{site.getsitepackages()[0]}/pennylane_lightning')") || [ ! -d "$BASE_PATH" ]; then
+  echo "Error: Could not determine pennylane_lightning path via Python." >&2
+  echo "Is pennylane-lightning installed in your current Python environment?" >&2
+  exit 1
+fi
+
+# Construct the full library path
+LIB_NAME="liblightning_${SIM_TYPE}_catalyst${LIB_SUFFIX}"
+FULL_PATH="${BASE_PATH}/${LIB_NAME}"
+
+# Check if the file actually exists
+if [ ! -f "$FULL_PATH" ]; then
+  echo "Error: Simulator library not found at: $FULL_PATH" >&2
+  echo "Ensure you have installed the correct simulator type ('$SIM_TYPE')" >&2
+  exit 1
+fi
+
+echo "$FULL_PATH"
diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt
index b02e572..086e2e8 100644
--- a/src/qirlightning/CMakeLists.txt
+++ b/src/qirlightning/CMakeLists.txt
@@ -4,57 +4,16 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #----------------------------------------------------------------------------#
 
-# Fetch Catalyst runtime include files
-include(FetchContent)
-
-include("${CMAKE_CURRENT_SOURCE_DIR}/support_catalyst.cmake")
-FindCatalyst(qirlightning)
-
-# Set the path to the lightning simulator shared library
-if(DEFINED ENV{LIGHTNING_SIM_PATH})
-  set(RTDLIB_PATH "$ENV{LIGHTNING_SIM_PATH}")
-  message(STATUS "RTDLIB_PATH set from environment variable LIGHTNING_SIM_PATH: ${RTDLIB_PATH}")
-else()
-  # Throw an error if the environment variable is not defined
-  message(FATAL_ERROR "Environment variable LIGHTNING_SIM_PATH is not defined. Please set it to the path of the Lightning simulator shared library.")
-endif()
-
-# Set the device name for the lightning simulator
-execute_process(
-    COMMAND nm -DC "${RTDLIB_PATH}" | grep " Factory"
-    OUTPUT_VARIABLE GREP_OUTPUT
-    ERROR_QUIET
-    OUTPUT_STRIP_TRAILING_WHITESPACE
-  )
-
-if(GREP_OUTPUT)
-  string(REGEX MATCH "T (.*)Factory" SYMBOL_MATCH "${GREP_OUTPUT}")
-  if(SYMBOL_MATCH)
-    string(REGEX REPLACE "T (.*)Factory" "\\1" RTDDEVICE_NAME "${SYMBOL_MATCH}")
-    message(STATUS "Found Lightning Simulator. Extracted RTDDEVICE_NAME: ${RTDDEVICE_NAME}")
-  else()
-    message(FATAL_ERROR "Symbol 'Factory' found, but regex failed to extract.")
-  endif()
-else()
-  message(FATAL_ERROR "Symbol 'Factory' not found in ${RTDLIB_PATH}. Please ensure LIGHTNING_SIM_PATH is set correctly.")
-endif()
-
-
 # Adding lightning as a library to qiree
 qiree_add_library(qirlightning
-LightningQuantum.cc
-LightningRuntime.cc
-)
-
-target_compile_definitions(qirlightning PRIVATE
-    RTDLIB="${RTDLIB_PATH}"
-    RTDDEVICE="${RTDDEVICE_NAME}"
+  LightningQuantum.cc
+  LightningRuntime.cc
 )
 
 #Link the lightning library to qiree and any other relevant libraries
 target_link_libraries(qirlightning
   PUBLIC QIREE::qiree  # Link to qiree
-  PRIVATE QIREE::lightning
+  PUBLIC QIREE::lightning
 )
 
 #----------------------------------------------------------------------------#
diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc
index d3538a2..47c4545 100644
--- a/src/qirlightning/LightningQuantum.cc
+++ b/src/qirlightning/LightningQuantum.cc
@@ -33,20 +33,30 @@ LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed)
     : output_(os), seed_(seed)
 {
     auto rtld_flags = RTLD_LAZY | RTLD_NODELETE;
-    rtd_dylib_handler_ = dlopen(RTDLIB, rtld_flags);
+    rtd_dylib_handler_ = dlopen(QIREE_LIGHTNING_RTDLIB, rtld_flags);
 
     QIREE_VALIDATE(rtd_dylib_handler_,
-                   << "failed to load Lightning runtime library '" << RTDLIB
-                   << "'");
+                   << "failed to load Lightning runtime library '"
+                   << QIREE_LIGHTNING_RTDLIB << "'");
 
     // Find device factory
-    std::string rtd_device = RTDDEVICE;
-    std::string factory_name = rtd_device + "Factory";
-    factory_f_ptr_ = dlsym(rtd_dylib_handler_, factory_name.c_str());
+    // Probe the known factory symbol names in order and keep the first one
+    // that the loaded library provides.
+    char const* const candidate_symbols[] = {"LightningSimulatorFactory",
+                                             "LightningKokkosSimulatorFactory",
+                                             "LightningGPUSimulatorFactory"};
+    for (char const* symbol : candidate_symbols)
+    {
+        dlerror();  // discard any earlier dl error before this lookup
+        factory_f_ptr_ = dlsym(rtd_dylib_handler_, symbol);
+        if (factory_f_ptr_ != nullptr)
+        {
+            break;
+        }
+    }
 
     QIREE_VALIDATE(factory_f_ptr_,
-                   << "failed to find device factory function '"
-                   << factory_name << "'");
+                   << "failed to find valid device factory function");
 }
 
 //---------------------------------------------------------------------------//
diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md
index 4058002..cc03c38 100644
--- a/src/qirlightning/README.md
+++ b/src/qirlightning/README.md
@@ -7,18 +7,20 @@ The [PennyLane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) pl
 
 ## Installing a Lightning simulator
 
-For more information on installing Pennylane Lightning simulators from source, please visit the [Lightning installation page](https://docs.pennylane.ai/projects/lightning/en/latest/dev/installation.html). Note: QIREE is tested to work with PennyLane Lightning simulators v0.42.
+For more information on installing Pennylane Lightning simulators from source, please visit the [Lightning installation page](https://docs.pennylane.ai/projects/lightning/en/latest/dev/installation.html).
+
+**Note:** QIREE is tested to work with PennyLane Lightning simulators v0.43.
 
 ### Quick start
 
 The easiest way to get started is to install a Lightning simulator (`pennylane-lightning`/`pennylane-lightning-gpu`/`pennylane-lightning-kokkos`) from PyPI via pip:
 
 ```
-$ pip install pennylane-lightning-kokkos==0.42.0
+$ pip install pennylane-lightning-kokkos==0.43.0
 
 $ pip show pennylane-lightning-kokkos
 Name: PennyLane_Lightning_Kokkos
-Version: 0.42.0
+Version: 0.43.0
 Summary: PennyLane-Lightning plugin
 Home-page: https://github.com/PennyLaneAI/pennylane-lightning
 Author:
@@ -44,24 +46,31 @@ $ ls $PL_PATH
 ... liblightning_qubit_catalyst.so  liblightning_kokkos_catalyst.so ...
 ```
 
+The helper script `qiree/scripts/lightning-path.sh <device>` can be used to obtain the absolute path of the shared library.
+
 ## Compile QIR-EE with Lightning backend
 
 To compile QIR-EE with lightning backend:
 
 ```
-# Set the path for the lightning simulator shared library
-export LIGHTNING_SIM_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')")/liblightning_kokkos_catalyst.so
-
-# Proceed with usual build instructions, but with `-DQIREE_USE_LIGHTNING=ON` cmake flag
 cd qiree/
+
+# Set the path for the lightning simulator shared library using the
+# helper script. Update <device> to qubit / gpu / kokkos as required.
+
+export LIGHTNING_SIM_PATH=$(bash ./scripts/lightning-path.sh <device>)
+
+# Proceed with usual build instructions
+# but with the extra `-DQIREE_USE_LIGHTNING=ON` and
+# `-DQIREE_LIGHTNING_SIM_PATH` cmake flags
+
 mkdir build; cd build
-cmake -DQIREE_USE_LIGHTNING=ON ..
+cmake -DQIREE_USE_LIGHTNING=ON -DQIREE_LIGHTNING_SIM_PATH=$LIGHTNING_SIM_PATH ..
 make
 
 ```
 
 **Note:**
-- replace `libligghtning_kokkos_catalyst.so` with `liblightning_qubit_catalyst.so` or `liblightning_GPU_catalyst.so` if required.
 - when running with `lightning.gpu` simulator for Nvidia GPUs, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e.
 
 ```
diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md
new file mode 100644
index 0000000..ef08d55
--- /dev/null
+++ b/src/qirlightning/simple_demo/README.md
@@ -0,0 +1,69 @@
+# Simple Demo for Catalyst/Lightning runtime
+
+This is a super simple demo for driving Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present).
+
+Some Catalyst include files are copied here for convenience - they are in `./snapshot_catalyst_runtime/include`. These are required for the QuantumDevice interface. The main QIR-EE build does not use these copies; it fetches the headers automatically during CMake configuration.
+
+## Installing a lightning simulator
+
+When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the installed simulators. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc.
+
+To get started, run `pip install pennylane` or `pip install pennylane-lightning` - this will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
+
+Example:
+```
+$ pip install pennylane-lightning-kokkos
+
+$ pip show pennylane-lightning-kokkos
+Name: PennyLane_Lightning_Kokkos
+Version: 0.40.0
+Summary: PennyLane-Lightning plugin
+Home-page: https://github.com/PennyLaneAI/pennylane-lightning
+Author:
+Author-email:
+License: Apache License 2.0
+Location: <site packages path>
+Requires: pennylane, pennylane-lightning
+
+$ ls <site packages path>/pennylane_lightning
+... liblightning_kokkos_catalyst.so ...
+```
+
+You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` to get the lightning.gpu simulator, or for `pennylane-lightning` to get the lightning.qubit simulator.
+
+## Compilation
+
+First update the `RTDLIB` in `test_rt_device.cpp` to the local path where lightning is installed (i.e. `<site packages path>` from above).
+
+To compile:
+
+```
+$ clang++ --std=c++20 test_rt_device.cpp -I./snapshot_catalyst_runtime/include -o test_rt_device.out
+```
+
+## Running the example
+
+To run:
+
+```
+$ ./test_rt_device.out
+Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set
+  In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads
+  For best performance with OpenMP 3.1 set OMP_PROC_BIND=true
+  For unit testing set OMP_PROC_BIND=false
+
+Num Qubits = 3
+State =
+*** State-Vector of Size 8 ***
+[(0.707107,0), (0,0), (0,0), (0,0), (0.707107,0), (0,0), (0,0), (0,0)]
+Measure on wire 0 = 0
+```
+
+## Running on other devices
+
+To run on other devices, e.g. lightning.gpu, you need to change:
+- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu`
+In the c++ file:
+- replace `RTDLIB` from `kokkos` to `gpu`
+- replace `RTDDEVICE` from `Kokkos` to `GPU`
+- Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst
new file mode 100644
index 0000000..8a881e5
--- /dev/null
+++ b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst
@@ -0,0 +1,118 @@
+.. runtime-start-inclusion-marker-do-not-remove
+
+Catalyst Quantum Runtime
+########################
+
+The Catalyst Runtime is a C++ QIR runtime that enables the execution of Catalyst-compiled
+quantum programs, and is currently backed by `PennyLane-Lightning <https://docs.pennylane.ai/projects/lightning/en/stable>`_
+state-vector simulators, and `Amazon Braket <https://amazon-braket-pennylane-plugin-python.readthedocs.io>`__
+devices. Additional hardware support, including QPUs, to come.
+
+The runtime employs the `QuantumDevice <https://docs.pennylane.ai/projects/catalyst/en/stable/api/structCatalyst_1_1Runtime_1_1QuantumDevice.html#exhale-struct-structcatalyst-1-1runtime-1-1quantumdevice>`_
+public interface to support an extensible list of backend devices. This interface comprises two collections of abstract methods:
+
+- The Qubit management, device shot noise, and quantum tape recording methods are utilized for the implementation of Quantum Runtime (QR) instructions.
+
+- The quantum operations, observables, measurements, and gradient methods are used to implement Quantum Instruction Set (QIS) instructions.
+
+A complete list of instructions supported by the runtime can be found in
+`RuntimeCAPI.h <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include/RuntimeCAPI.h>`_.
+
+Contents
+========
+
+The directory is structured as follows:
+
+- `include <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include>`_:
+    This contains the public header files of the runtime including the ``QuantumDevice`` API
+    for backend quantum devices and the runtime CAPI.
+
+- `lib <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib>`_:
+    The core modules of the runtime are structured into ``lib/capi`` and ``lib/backend``.
+    `lib/capi <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib/capi>`_  implements the semantics for
+    QIR instructions lowered to our custom runtime. `lib/backend <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib/backend>`_
+    contains implementations of the ``QuantumDevice`` API for backend simulators.
+
+- `tests <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/tests>`_:
+    A collection of C++ tests for modules and methods in the runtime.
+
+Backend Devices
+===============
+
+New device backends for the runtime can be realized by implementing the quantum device interface.
+The following table shows the available devices along with supported features:
+
+.. list-table::
+   :widths: 25 25 25 25
+   :header-rows: 0
+
+   * - **Features**
+     - **PennyLane-Lightning-Qubit**
+     - **PennyLane-Lightning-Kokkos** and **PennyLane-Lightning-GPU**
+     - **Amazon-Braket-OpenQasm**
+   * - Qubit Management
+     - Dynamic allocation/deallocation
+     - Static allocation/deallocation
+     - Static allocation/deallocation
+   * - Gate Operations
+     - `Lightning operations <https://github.com/PennyLaneAI/pennylane-lightning/blob/master/pennylane_lightning/core/src/gates/GateOperation.hpp>`_
+     - `Lightning operations <https://github.com/PennyLaneAI/pennylane-lightning/blob/master/pennylane_lightning/core/src/gates/GateOperation.hpp>`_ without controlled gates support
+     - `Braket operations <https://github.com/PennyLaneAI/catalyst/blob/e812afbadbd777209862d5c76f394e3f0c43ffb6/runtime/lib/backend/openqasm/OpenQasmBuilder.hpp#L49>`_
+   * - Quantum Observables
+     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables
+     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables
+     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, and Tensor Product of Observables
+   * - Expectation Value
+     - All observables; Finite-shots supported
+     - All observables; Finite-shots supported
+     - All observables; Finite-shots supported
+   * - Variance
+     - All observables; Finite-shots supported
+     - All observables; Finite-shots supported
+     - All observables; Finite-shots supported
+   * - Probability
+     - Only for the computational basis on the supplied qubits; Finite-shots supported
+     - Only for the computational basis on the supplied qubits; Finite-shots supported
+     - The computational basis on all active qubits; Finite-shots supported
+   * - Sampling
+     - Only for the computational basis on the supplied qubits
+     - Only for the computational basis on the supplied qubits
+     - The computational basis on all active qubits; Finite-shots supported
+   * - Mid-Circuit Measurement
+     - Only for the computational basis on the supplied qubit
+     - Only for the computational basis on the supplied qubit
+     - Not supported
+   * - Gradient
+     - The Adjoint-Jacobian method for expectation values on all observables
+     - The Adjoint-Jacobian method for expectation values on all observables
+     - Not supported
+
+Requirements
+============
+
+To build the runtime from source, it is required to have an up to date version of a C/C++ compiler such as gcc or clang
+with support for the C++20 standard library.
+
+Installation
+============
+
+By default, the runtime builds all supported backend devices.
+You can build the runtime with custom devices from the list of Backend Devices.
+
+You can use ``ENABLE_OPENQASM=OFF`` to disable building the runtime with `Amazon-Braket-OpenQasm <https://aws.amazon.com/braket/>`_:
+
+.. code-block:: console
+
+    make runtime ENABLE_OPENQASM=OFF
+
+This device currently offers generators for the `OpenQasm3 <https://openqasm.com/versions/3.0/index.html>`_ specification and
+`Amazon Braket <https://docs.aws.amazon.com/braket/latest/developerguide/braket-openqasm-supported-features.html>`__ assembly extension.
+Moreover, the generated assembly can be executed on Amazon Braket devices leveraging `amazon-braket-sdk-python <https://github.com/aws/amazon-braket-sdk-python>`_.
+
+To check the runtime test suite from the root directory:
+
+.. code-block:: console
+
+    make test-runtime
+
+.. runtime-end-inclusion-marker-do-not-remove
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp
new file mode 100644
index 0000000..616b9dc
--- /dev/null
+++ b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp
@@ -0,0 +1,173 @@
+// Copyright 2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <Exception.hpp>
+
+/**
+ * A multi-dimensional view for MemRef-like and std::vector<T> types.
+ *
+ * @tparam T The underlying data type
+ * @tparam R The Rank (R > 0)
+ *
+ * @note A forward iterator is implemented in this view for traversing over the
+ * entire elements of MemRef types rank-by-rank starting from the last
+ * dimension (R-1). For example, The DataView iterator for MemRef<T, 2> starts
+ * from index (0, 0) and traverses elements in the following order: (0, 0),
+ * ..., (0, sizes[1]-1), (1, 0), ..., (1, sizes[1]-1), ... (sizes[0]-1,
+ * sizes[1]-1).
+ */
+template<typename T, size_t R>
+class DataView
+{
+  private:
+    T* data_aligned;
+    size_t offset;
+    size_t sizes[R] = {0};
+    size_t strides[R] = {0};
+
+  public:
+    class iterator
+    {
+      private:
+        DataView<T, R> const& view;
+
+        int64_t loc;  // physical index
+        size_t indices[R] = {0};
+
+      public:
+        using iterator_category = std::forward_iterator_tag;  // LCOV_EXCL_LINE
+        using value_type = T;  // LCOV_EXCL_LINE
+        using difference_type = std::ptrdiff_t;  // LCOV_EXCL_LINE
+        using pointer = T*;  // LCOV_EXCL_LINE
+        using reference = T&;  // LCOV_EXCL_LINE
+
+        iterator(DataView<T, R> const& _view, int64_t begin_idx)
+            : view(_view), loc(begin_idx)
+        {
+        }
+        pointer operator->() const { return &view.data_aligned[loc]; }
+        reference operator*() const { return view.data_aligned[loc]; }
+        iterator& operator++()
+        {
+            int64_t next_axis = -1;
+            int64_t idx;
+            for (int64_t i = R; i > 0; --i)
+            {
+                idx = i - 1;
+                if (indices[idx]++ < view.sizes[idx] - 1)
+                {
+                    next_axis = idx;
+                    break;
+                }
+                indices[idx] = 0;
+                loc -= (view.sizes[idx] - 1) * view.strides[idx];
+            }
+
+            loc = next_axis == -1 ? -1 : loc + view.strides[next_axis];
+            return *this;
+        }
+        iterator operator++(int)
+        {
+            auto tmp = *this;
+            int64_t next_axis = -1;
+            int64_t idx;
+            for (int64_t i = R; i > 0; --i)
+            {
+                idx = i - 1;
+                if (indices[idx]++ < view.sizes[idx] - 1)
+                {
+                    next_axis = idx;
+                    break;
+                }
+                indices[idx] = 0;
+                loc -= (view.sizes[idx] - 1) * view.strides[idx];
+            }
+
+            loc = next_axis == -1 ? -1 : loc + view.strides[next_axis];
+            return tmp;
+        }
+        bool operator==(iterator const& other) const
+        {
+            return (loc == other.loc
+                    && view.data_aligned == other.view.data_aligned);
+        }
+        bool operator!=(iterator const& other) const
+        {
+            return !(*this == other);
+        }
+    };
+
+    explicit DataView(std::vector<T>& buffer)
+        : data_aligned(buffer.data()), offset(0)
+    {
+        static_assert(R == 1, "[Class: DataView] Assertion: R == 1");
+        sizes[0] = buffer.size();
+        strides[0] = 1;
+    }
+
+    explicit DataView(T* _data_aligned,
+                      size_t _offset,
+                      size_t const* _sizes,
+                      size_t const* _strides)
+        : data_aligned(_data_aligned), offset(_offset)
+    {
+        static_assert(R > 0, "[Class: DataView] Assertion: R > 0");
+        if (_sizes != nullptr && _strides != nullptr)
+        {
+            for (size_t i = 0; i < R; i++)
+            {
+                sizes[i] = _sizes[i];
+                strides[i] = _strides[i];
+            }
+        }  // else sizes = {0}, strides = {0}
+    }
+
+    [[nodiscard]] auto size() const -> size_t
+    {
+        if (!data_aligned)
+        {
+            return 0;
+        }
+
+        size_t tsize = 1;
+        for (size_t i = 0; i < R; i++)
+        {
+            tsize *= sizes[i];
+        }
+        return tsize;
+    }
+
+    template<typename... I>
+    T& operator()(I... idxs) const
+    {
+        static_assert(sizeof...(idxs) == R,
+                      "[Class: DataView] Error in Catalyst Runtime: Wrong "
+                      "number of indices");
+        size_t indices[] = {static_cast<size_t>(idxs)...};
+
+        size_t loc = offset;
+        for (size_t axis = 0; axis < R; axis++)
+        {
+            RT_ASSERT(indices[axis] < sizes[axis]);
+            loc += indices[axis] * strides[axis];
+        }
+        return data_aligned[loc];
+    }
+
+    iterator begin() { return iterator{*this, static_cast<int64_t>(offset)}; }
+
+    iterator end() { return iterator{*this, -1}; }
+};
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp
new file mode 100644
index 0000000..4e8272d
--- /dev/null
+++ b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp
@@ -0,0 +1,96 @@
+// Copyright 2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <exception>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+/**
+ * @brief Macro that throws `RuntimeException` with given message.
+ */
+#define RT_FAIL(message) \
+    Catalyst::Runtime::_abort((message), __FILE__, __LINE__, __func__)
+
+/**
+ * @brief Macro that throws `RuntimeException` if expression evaluates
+ * to true.
+ */
+#define RT_FAIL_IF(expression, message) \
+    if ((expression))                   \
+    {                                   \
+        RT_FAIL(message);               \
+    }
+
+/**
+ * @brief Macro that throws `RuntimeException` with the given expression
+ * and source location if expression evaluates to false.
+ */
+#define RT_ASSERT(expression) \
+    RT_FAIL_IF(!(expression), "Assertion: " #expression)
+
+namespace Catalyst::Runtime
+{
+
+/**
+ * @brief This is the general exception thrown by Catalyst for runtime errors
+ * that is derived from `std::exception`.
+ */
+class RuntimeException : public std::exception
+{
+  private:
+    std::string const err_msg;
+
+  public:
+    explicit RuntimeException(std::string msg) noexcept
+        : err_msg{std::move(msg)}
+    {
+    }  // LCOV_EXCL_LINE
+    ~RuntimeException() override = default;  // LCOV_EXCL_LINE
+
+    RuntimeException(RuntimeException const&) = default;
+    RuntimeException(RuntimeException&&) noexcept = default;
+
+    RuntimeException& operator=(RuntimeException const&) = delete;
+    RuntimeException& operator=(RuntimeException&&) = delete;
+
+    [[nodiscard]] auto what() const noexcept -> char const* override
+    {
+        return err_msg.c_str();
+    }  // LCOV_EXCL_LINE
+};
+
+/**
+ * @brief Throws a `RuntimeException` with the given error message.
+ *
+ * @note This is not supposed to be called directly.
+ */
+[[noreturn]] inline void _abort(char const* message,
+                                char const* file_name,
+                                size_t line,
+                                char const* function_name)
+{
+    std::stringstream sstream;
+    sstream << "[" << file_name << "][Line:" << line
+            << "][Function:" << function_name
+            << "] Error in Catalyst Runtime: " << message;
+
+    throw RuntimeException(sstream.str());
+}  // LCOV_EXCL_LINE
+
+}  // namespace Catalyst::Runtime
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp
new file mode 100644
index 0000000..6794033
--- /dev/null
+++ b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp
@@ -0,0 +1,399 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <complex>
+#include <memory>
+#include <optional>
+#include <random>
+#include <vector>
+
+#include "DataView.hpp"
+#include "Types.h"
+
+// Helper macro that defines an extern "C" <IDENTIFIER>Factory(kwargs) function
+// returning `new <CONSTRUCTOR>(std::string(kwargs))`; kwargs must be non-null.
+// Check the Custom Devices guideline for details:
+// https://docs.pennylane.ai/projects/catalyst/en/stable/dev/custom_devices.html
+#define GENERATE_DEVICE_FACTORY(IDENTIFIER, CONSTRUCTOR)              \
+    extern "C" Catalyst::Runtime::QuantumDevice* IDENTIFIER##Factory( \
+        const char* kwargs)                                           \
+    {                                                                 \
+        return new CONSTRUCTOR(std::string(kwargs));                  \
+    }
+
+namespace Catalyst::Runtime
+{
+
+/**
+ * @brief struct API for backend quantum devices.
+ *
+ * This device API contains,
+ * - a set of methods to manage qubit allocations and deallocations, device
+ * shot noise, and quantum tape recording as well as reference values for the
+ * result data-type; these are used to implement Quantum Runtime (QR)
+ * instructions.
+ *
+ * - a set of methods for quantum operations, observables, measurements, and
+ * gradient of the device; these are used to implement Quantum Instruction Set
+ * (QIS) instructions.
+ *
+ */
+struct QuantumDevice
+{
+    QuantumDevice() = default;  // LCOV_EXCL_LINE
+    virtual ~QuantumDevice() = default;  // LCOV_EXCL_LINE
+    // Devices are neither copyable nor movable.
+    QuantumDevice& operator=(QuantumDevice const&) = delete;
+    QuantumDevice(QuantumDevice const&) = delete;
+    QuantumDevice(QuantumDevice&&) = delete;
+    QuantumDevice& operator=(QuantumDevice&&) = delete;
+
+    /**
+     * @brief Allocate a qubit.
+     *
+     * @return `QubitIdType`
+     */
+    virtual auto AllocateQubit() -> QubitIdType = 0;
+
+    /**
+     * @brief Allocate a vector of qubits.
+     *
+     * @param num_qubits The number of qubits to allocate.
+     *
+     * @return `std::vector<QubitIdType>`
+     */
+    virtual auto AllocateQubits(size_t num_qubits) -> std::vector<QubitIdType>
+        = 0;
+
+    /**
+     * @brief Release a qubit.
+     *
+     * @param qubit The id of the qubit
+     */
+    virtual void ReleaseQubit(QubitIdType qubit) = 0;
+
+    /**
+     * @brief Release all qubits.
+     */
+    virtual void ReleaseAllQubits() = 0;
+
+    /**
+     * @brief Get the number of allocated qubits.
+     *
+     * @return `size_t`
+     */
+    [[nodiscard]] virtual auto GetNumQubits() const -> size_t = 0;
+
+    /**
+     * @brief Set the number of device shots.
+     *
+     * @param shots The number of device shots
+     */
+    virtual void SetDeviceShots(size_t shots) = 0;
+
+    /**
+     * @brief Get the number of device shots.
+     *
+     * @return `size_t`
+     */
+    [[nodiscard]] virtual auto GetDeviceShots() const -> size_t = 0;
+
+    /**
+     * @brief Set the PRNG of the device.
+     *
+     * The Catalyst runtime enables seeded program execution on non-hardware
+     * devices. A random number generator instance is managed by the runtime to
+     * predictably generate results for non-deterministic programs, such as
+     * those involving `Measure` calls. Devices implementing support for this
+     * feature do not need to use the provided PRNG instance as their sole
+     * source of random numbers, but it is expected that the same instance
+     * state will predictably and reproducibly generate the same program
+     * results. It is also expected that the provided PRNG state is evolved
+     * sufficiently so that two device executions sharing the same instance do
+     * not produce identical results. The provided PRNG instance is not
+     * thread-locked, and devices wishing to share it across threads will need
+     * to provide their own thread-safety.
+     * The default implementation does nothing.
+     * @param gen The std::mt19937 PRNG object.
+     */
+    virtual void SetDevicePRNG([[maybe_unused]] std::mt19937* gen) {};
+
+    /**
+     * @brief Start recording a quantum tape if provided.
+     *
+     * @note This is backed by the `Catalyst::Runtime::CacheManager<ComplexT>`
+     * property in the device implementation.
+     */
+    virtual void StartTapeRecording() = 0;
+
+    /**
+     * @brief Stop recording a quantum tape if provided.
+     *
+     * @note This is backed by the `Catalyst::Runtime::CacheManager<ComplexT>`
+     * property in the device implementation.
+     */
+    virtual void StopTapeRecording() = 0;
+
+    /**
+     * @brief Result value for "Zero" used in the measurement process.
+     *
+     * @return `Result`
+     */
+    [[nodiscard]] virtual auto Zero() const -> Result = 0;
+
+    /**
+     * @brief Result value for "One" used in the measurement process.
+     *
+     * @return `Result`
+     */
+    [[nodiscard]] virtual auto One() const -> Result = 0;
+
+    /**
+     * @brief A helper method to print the state vector of a device.
+     */
+    virtual void PrintState() = 0;
+
+    /**
+     * @brief Prepare subsystems using the given ket vector in the
+     * computational basis.
+     *
+     * @param state A state vector of size 2**len(wires)
+     * @param wires The wire(s) the operation acts on
+     */
+    virtual void
+    SetState([[maybe_unused]] DataView<std::complex<double>, 1>& state,
+             [[maybe_unused]] std::vector<QubitIdType>& wires)
+    {
+        RT_FAIL("Unsupported functionality");  // optional: default body fails
+    }
+
+    /**
+     * @brief Prepares a single computational basis state.
+     *
+     * @param n Prepares the basis state |n>, where n is an array of integers
+     * from the set {0, 1}
+     * @param wires The wire(s) the operation acts on
+     */
+    virtual void SetBasisState([[maybe_unused]] DataView<int8_t, 1>& n,
+                               [[maybe_unused]] std::vector<QubitIdType>& wires)
+    {
+        RT_FAIL("Unsupported functionality");  // optional: default body fails
+    }
+
+    /**
+     * @brief Apply a single gate to the state vector of a device with its name
+     * if this is supported.
+     *
+     * @param name The name of the gate to apply
+     * @param params Optional parameter list for parametric gates
+     * @param wires Wires to apply gate to
+     * @param inverse Indicates whether to use inverse of gate
+     * @param controlled_wires Optional controlled wires applied to the
+     * operation
+     * @param controlled_values Optional controlled values applied to the
+     * operation
+     */
+    virtual void NamedOperation(
+        std::string const& name,
+        std::vector<double> const& params,
+        std::vector<QubitIdType> const& wires,
+        [[maybe_unused]] bool inverse = false,
+        [[maybe_unused]] std::vector<QubitIdType> const& controlled_wires = {},
+        [[maybe_unused]] std::vector<bool> const& controlled_values = {})
+        = 0;
+
+    /**
+     * @brief Apply a given matrix directly to the state vector of a device.
+     *
+     * @param matrix The matrix of data in row-major format
+     * @param wires Wires to apply gate to
+     * @param inverse Indicates whether to use inverse of gate
+     * @param controlled_wires Controlled wires applied to the operation
+     * @param controlled_values Controlled values applied to the operation
+     */
+    virtual void MatrixOperation(
+        std::vector<std::complex<double>> const& matrix,
+        std::vector<QubitIdType> const& wires,
+        [[maybe_unused]] bool inverse = false,
+        [[maybe_unused]] std::vector<QubitIdType> const& controlled_wires = {},
+        [[maybe_unused]] std::vector<bool> const& controlled_values = {})
+        = 0;
+
+    /**
+     * @brief Construct a named (Identity, PauliX, PauliY, PauliZ, and
+     * Hadamard) or Hermitian observable.
+     *
+     * @param id The type of the observable
+     * @param matrix The matrix of data to construct a hermitian observable
+     * @param wires Wires to apply observable to
+     *
+     * @return `ObsIdType` Index of the constructed observable
+     */
+    virtual auto Observable(ObsId id,
+                            std::vector<std::complex<double>> const& matrix,
+                            std::vector<QubitIdType> const& wires) -> ObsIdType
+        = 0;
+
+    /**
+     * @brief Construct a tensor product of observables.
+     *
+     * @param obs The vector of observables indices of type ObsIdType
+     *
+     * @return `ObsIdType` Index of the constructed observable
+     */
+    virtual auto TensorObservable(std::vector<ObsIdType> const& obs)
+        -> ObsIdType
+        = 0;
+
+    /**
+     * @brief Construct a Hamiltonian observable.
+     *
+     * @param coeffs The vector of coefficients
+     * @param obs The vector of observables indices of size `coeffs`
+     *
+     * @return `ObsIdType` Index of the constructed observable
+     */
+    virtual auto HamiltonianObservable(std::vector<double> const& coeffs,
+                                       std::vector<ObsIdType> const& obs)
+        -> ObsIdType
+        = 0;
+
+    /**
+     * @brief Compute the expected value of an observable.
+     *
+     * @param obsKey The index of the constructed observable
+     *
+     * @return `double` The expected value
+     */
+    virtual auto Expval(ObsIdType obsKey) -> double = 0;
+
+    /**
+     * @brief Compute the variance of an observable.
+     *
+     * @param obsKey The index of the constructed observable
+     *
+     * @return `double` The variance
+     */
+    virtual auto Var(ObsIdType obsKey) -> double = 0;
+
+    /**
+     * @brief Get the state-vector of a device.
+     *
+     * @param state The pre-allocated `DataView<complex<double>, 1>`
+     */
+    virtual void State(DataView<std::complex<double>, 1>& state) = 0;
+
+    /**
+     * @brief Compute the probabilities of each computational basis state.
+     *
+     * @param probs The pre-allocated `DataView<double, 1>`
+     */
+    virtual void Probs(DataView<double, 1>& probs) = 0;
+
+    /**
+     * @brief Compute the probabilities for a subset of the full system.
+     *
+     * @param probs The pre-allocated `DataView<double, 1>`
+     * @param wires Wires will restrict probabilities to a subset of the full
+     * system
+     */
+    virtual void PartialProbs(DataView<double, 1>& probs,
+                              std::vector<QubitIdType> const& wires)
+        = 0;
+
+    /**
+     * @brief Compute samples with the number of shots on the entire wires,
+     * returning raw samples.
+     *
+     * @param samples The pre-allocated `DataView<double, 2>` representing a
+     * matrix of shape `shots * numQubits`. The built-in iterator in
+     * `DataView<double, 2>` iterates over all elements of `samples` row-wise.
+     * @param shots The number of shots
+     */
+    virtual void Sample(DataView<double, 2>& samples, size_t shots) = 0;
+
+    /**
+     * @brief Compute partial samples with the number of shots on `wires`,
+     * returning raw samples.
+     *
+     * @param samples The pre-allocated `DataView<double, 2>` representing a
+     * matrix of shape `shots * numWires`. The built-in iterator in
+     * `DataView<double, 2>` iterates over all elements of `samples` row-wise.
+     * @param wires Wires to compute samples on
+     * @param shots The number of shots
+     */
+    virtual void PartialSample(DataView<double, 2>& samples,
+                               std::vector<QubitIdType> const& wires,
+                               size_t shots)
+        = 0;
+
+    /**
+     * @brief Sample with the number of shots on the entire wires, returning
+     * the number of counts for each sample.
+     *
+     * @param eigvals The pre-allocated `DataView<double, 1>`
+     * @param counts The pre-allocated `DataView<int64_t, 1>`
+     * @param shots The number of shots
+     */
+    virtual void Counts(DataView<double, 1>& eigvals,
+                        DataView<int64_t, 1>& counts,
+                        size_t shots)
+        = 0;
+
+    /**
+     * @brief Partial sample with the number of shots on `wires`, returning the
+     * number of counts for each sample.
+     *
+     * @param eigvals The pre-allocated `DataView<double, 1>`
+     * @param counts The pre-allocated `DataView<int64_t, 1>`
+     * @param wires Wires to compute samples on
+     * @param shots The number of shots
+     */
+    virtual void PartialCounts(DataView<double, 1>& eigvals,
+                               DataView<int64_t, 1>& counts,
+                               std::vector<QubitIdType> const& wires,
+                               size_t shots)
+        = 0;
+
+    /**
+     * @brief A general measurement method that acts on a single wire.
+     *
+     * @param wire The wire to compute Measure on
+     * @param postselect Which basis state to postselect after a mid-circuit
+     * measurement (-1 denotes no post-selection). NOTE(review): the type is
+     * `std::optional`, so `std::nullopt` presumably also means none - confirm.
+     * @return `Result` The measurement result
+     */
+    virtual auto Measure(QubitIdType wire, std::optional<int32_t> postselect)
+        -> Result
+        = 0;
+
+    /**
+     * @brief Compute the gradient of a quantum tape, that is cached using
+     * `Catalyst::Runtime::Simulator::CacheManager`, for a specific set of
+     * trainable parameters.
+     *
+     * @param gradients The vector of pre-allocated `DataView<double, 1>*`
+     * to store gradients results for the list of cached observables.
+     * @param trainParams The vector of trainable parameters; if none, all
+     * parameters would be assumed trainable
+     *
+     */
+    virtual void Gradient(std::vector<DataView<double, 1>>& gradients,
+                          std::vector<size_t> const& trainParams)
+        = 0;
+};
+}  // namespace Catalyst::Runtime
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h
new file mode 100644
index 0000000..a90f69d
--- /dev/null
+++ b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h
@@ -0,0 +1,179 @@
+// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#ifndef TYPES_H
+#    define TYPES_H
+
+#    include <cmath>
+#    include <cstdint>
+#    include <limits>
+
+#    ifdef __cplusplus
+extern "C" {
+#    endif
+
+// Qubit, Result and Observable types
+struct QUBIT;  // opaque; only referenced through pointers in this header
+using QubitIdType = intptr_t;
+
+using RESULT = bool;
+using Result = RESULT*;
+using QirArray = void*;
+
+using ObsIdType = intptr_t;
+
+enum ObsId : int8_t
+{
+    Identity = 0,
+    PauliX,
+    PauliY,
+    PauliZ,
+    Hadamard,
+    Hermitian,
+};
+
+enum ObsType : int8_t
+{
+    Basic = 0,
+    TensorProd,
+    Hamiltonian,
+};
+
+// complex<float> type
+struct CplxT_float
+{
+    float real;
+    float imag;
+};
+
+// complex<double> type
+struct CplxT_double
+{
+    double real;
+    double imag;
+};
+
+enum NumericType : int8_t
+{
+    idx = 0,
+    i1,
+    i8,
+    i16,
+    i32,
+    i64,
+    f32,
+    f64,
+    c64,
+    c128,
+};
+
+// Type-erased MemRefT: rank, untyped descriptor pointer, and element datatype
+struct OpaqueMemRefT
+{
+    int64_t rank;
+    void* descriptor;
+    NumericType datatype;
+};
+
+// MemRefT<complex<double>, dimension=1> type
+struct MemRefT_CplxT_double_1d
+{
+    CplxT_double* data_allocated;
+    CplxT_double* data_aligned;
+    size_t offset;
+    size_t sizes[1];
+    size_t strides[1];
+};
+
+// MemRefT<complex<double>, dimension=2> type
+struct MemRefT_CplxT_double_2d
+{
+    CplxT_double* data_allocated;
+    CplxT_double* data_aligned;
+    size_t offset;
+    size_t sizes[2];
+    size_t strides[2];
+};
+
+// MemRefT<double, dimension=1> type
+struct MemRefT_double_1d
+{
+    double* data_allocated;
+    double* data_aligned;
+    size_t offset;
+    size_t sizes[1];
+    size_t strides[1];
+};
+
+// MemRefT<double, dimension=2> type
+struct MemRefT_double_2d
+{
+    double* data_allocated;
+    double* data_aligned;
+    size_t offset;
+    size_t sizes[2];
+    size_t strides[2];
+};
+
+// MemRefT<int64_t, dimension=1> type
+struct MemRefT_int64_1d
+{
+    int64_t* data_allocated;
+    int64_t* data_aligned;
+    size_t offset;
+    size_t sizes[1];
+    size_t strides[1];
+};
+
+// MemRefT<int8_t, dimension=1> type
+struct MemRefT_int8_1d
+{
+    int8_t* data_allocated;
+    int8_t* data_aligned;
+    size_t offset;
+    size_t sizes[1];
+    size_t strides[1];
+};
+
+// PairT<MemRefT<double, dimension=1>, MemRefT<int64_t, dimension=1>> type
+struct PairT_MemRefT_double_int64_1d
+{
+    struct MemRefT_double_1d first;
+    struct MemRefT_int64_1d second;
+};
+
+// Quantum operation modifiers
+struct Modifiers
+{
+    bool adjoint;
+    size_t num_controlled;
+    QUBIT* controlled_wires;
+    bool* controlled_values;
+};
+
+using CplxT_double = struct CplxT_double;
+using MemRefT_CplxT_double_1d = struct MemRefT_CplxT_double_1d;
+using MemRefT_CplxT_double_2d = struct MemRefT_CplxT_double_2d;
+using MemRefT_double_1d = struct MemRefT_double_1d;
+using MemRefT_double_2d = struct MemRefT_double_2d;
+using MemRefT_int64_1d = struct MemRefT_int64_1d;
+using PairT_MemRefT_double_int64_1d = struct PairT_MemRefT_double_int64_1d;
+using Modifiers = struct Modifiers;
+
+#    ifdef __cplusplus
+}  // extern "C"
+#    endif
+
+#endif
diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp
new file mode 100644
index 0000000..47897f2
--- /dev/null
+++ b/src/qirlightning/simple_demo/test_rt_device.cpp
@@ -0,0 +1,74 @@
+#include <dlfcn.h>
+
+#include "QuantumDevice.hpp"
+
+// Runtime library (kokkos/GPU/qubit etc.) and the device name whose
+// "<name>Factory" symbol is looked up in it; update RTDLIB to the local path.
+#define RTDLIB                      \
+    "<UPDATE: site packages path>/" \
+    "pennylane_lightning/liblightning_kokkos_catalyst.so"
+#define RTDDEVICE "LightningKokkosSimulator"
+
+extern "C" Catalyst::Runtime::QuantumDevice*
+GenericDeviceFactory(char const* kwargs);
+
+using namespace Catalyst::Runtime;
+
+int main()
+{
+    try
+    {
+        // Load lightning simulation library
+        std::string rtd_lib = RTDLIB;
+        std::string rtd_device = RTDDEVICE;
+        // RTLD_NODELETE keeps the library mapped even if it is dlclose()d
+        auto rtld_flags = RTLD_LAZY | RTLD_NODELETE;
+        auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags);
+
+        if (!rtd_dylib_handler)
+        {
+            throw std::runtime_error("Failed to load library: " + rtd_lib);
+        }
+
+        // Find device factory
+        std::string factory_name = rtd_device + "Factory";
+        void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str());
+
+        if (!f_ptr)
+        {
+            dlclose(rtd_dylib_handler);
+            throw std::runtime_error("Failed to find factory function: "
+                                     + factory_name);
+        }
+        std::string rtd_kwargs = {};
+        auto rtd_qdevice = std::unique_ptr<QuantumDevice>(
+            reinterpret_cast<decltype(GenericDeviceFactory)*>(f_ptr)(
+                rtd_kwargs.c_str()));
+
+        // Allocate Qubits
+        rtd_qdevice->AllocateQubits(3);
+
+        // Get Num Qubits
+        std::cout << "Num Qubits = " << rtd_qdevice->GetNumQubits()
+                  << std::endl;
+
+        // Apply Gate
+        rtd_qdevice->NamedOperation("Hadamard", {}, {0});
+
+        // Print State
+        std::cout << "State = " << std::endl;
+        rtd_qdevice->PrintState();
+
+        // Measure
+        QubitIdType wire{0};
+        Result result = rtd_qdevice->Measure(wire, std::nullopt);
+        std::cout << "Measure on wire 0 = " << *result << std::endl;
+    }
+    catch (std::exception const& e)
+    {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/qirlightning/support_catalyst.cmake b/src/qirlightning/support_catalyst.cmake
deleted file mode 100644
index ca7df76..0000000
--- a/src/qirlightning/support_catalyst.cmake
+++ /dev/null
@@ -1,74 +0,0 @@
-###############################################################################################
-# This file provides macros to process Catalyst.
-###############################################################################################
-
-# Include this only once
-include_guard()
-
-macro(FindCatalyst target_name)
-    if(LIGHTNING_CATALYST_SRC_PATH)
-        if(NOT IS_ABSOLUTE ${LIGHTNING_CATALYST_SRC_PATH})
-            message(FATAL_ERROR " LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH} must be set to an absolute path")
-        endif()
-        if(CATALYST_GIT_TAG)
-            message(WARN " Setting `LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH}` overrides `CATALYST_GIT_TAG=${CATALYST_GIT_TAG}`")
-        endif()
-
-        # Acquire local git hash and use for CATALYST_GIT_TAG
-        execute_process(COMMAND git rev-parse --short HEAD
-            WORKING_DIRECTORY ${LIGHTNING_CATALYST_SRC_PATH}
-            OUTPUT_VARIABLE CATALYST_GIT_TAG
-        )
-        message(INFO " Building against local Catalyst - path: ${LIGHTNING_CATALYST_SRC_PATH} - GIT TAG: ${CATALYST_GIT_TAG}")
-
-        target_include_directories(${target_name} PUBLIC ${LIGHTNING_CATALYST_SRC_PATH}/runtime/lib/backend/common)
-        target_include_directories(${target_name} PUBLIC ${LIGHTNING_CATALYST_SRC_PATH}/runtime/include)
-
-    else()
-        if(NOT CATALYST_GIT_TAG)
-            set(CATALYST_GIT_TAG "v0.12.0" CACHE STRING "GIT_TAG value to build Catalyst")
-        endif()
-        message(INFO " Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}")
-
-        # Fetching /lib/backend/common hpp headers
-        set(LIB_BACKEND_COMMON_HEADERS  CacheManager.hpp
-                                    QubitManager.hpp
-                                    Utils.hpp
-        )
-
-        foreach(HEADER ${LIB_BACKEND_COMMON_HEADERS})
-            string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME ${HEADER})
-            FetchContent_Declare(
-                ${HEADER_NAME}
-                URL                 https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/lib/backend/common/${HEADER}
-                DOWNLOAD_NO_EXTRACT True
-                SOURCE_DIR          ../../include
-            )
-
-            FetchContent_MakeAvailable(${HEADER_NAME})
-        endforeach()
-
-        # Fetching include hpp headers
-        set(INCLUDE_HEADERS DataView.hpp
-                        Exception.hpp
-                        QuantumDevice.hpp
-                        RuntimeCAPI.h
-                        Types.h
-        )
-
-        foreach(HEADER ${INCLUDE_HEADERS})
-            string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME ${HEADER})
-            FetchContent_Declare(
-                ${HEADER_NAME}
-                URL                 https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/include/${HEADER}
-                DOWNLOAD_NO_EXTRACT True
-                SOURCE_DIR          ../../include
-            )
-
-            FetchContent_MakeAvailable(${HEADER_NAME})
-        endforeach()
-
-        #target_include_directories(${target_name} PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/../../include)
-
-    endif()
-endmacro()

From 67373005e90b66d805dba847d74aa18700a8d0ae Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 22 Oct 2025 21:41:07 +0000
Subject: [PATCH 61/64] remove simple_demo

---
 src/qirlightning/simple_demo/README.md        |  69 ---
 .../snapshot_catalyst_runtime/README.rst      | 118 ------
 .../include/DataView.hpp                      | 173 --------
 .../include/Exception.hpp                     |  96 -----
 .../include/QuantumDevice.hpp                 | 399 ------------------
 .../snapshot_catalyst_runtime/include/Types.h | 179 --------
 .../simple_demo/test_rt_device.cpp            |  74 ----
 7 files changed, 1108 deletions(-)
 delete mode 100644 src/qirlightning/simple_demo/README.md
 delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst
 delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp
 delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp
 delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp
 delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h
 delete mode 100644 src/qirlightning/simple_demo/test_rt_device.cpp

diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md
deleted file mode 100644
index ef08d55..0000000
--- a/src/qirlightning/simple_demo/README.md
+++ /dev/null
@@ -1,69 +0,0 @@
-# Simple Demo for Catalyst/Lightning runtime
-
-This is a super simple demo for driving Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present).
-
-Some Catalyst include files are copied here for convenience - they are in `./snapshot_catalyst_runtime/include`. These are required for the QuantumDevice interface. For the qiree source, these files are fetched automatically during CMake, and these are not used.
-
-## Installing a lightning simulator
-
-When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc.
-
-To get started, run `pip install pennylane` or `pip install pennylane-lightning` - this will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`.
-
-Example:
-```
-$ pip install pennylane-lightning-kokkos
-
-$ pip show pennylane-lightning-kokkos
-Name: PennyLane_Lightning_Kokkos
-Version: 0.40.0
-Summary: PennyLane-Lightning plugin
-Home-page: https://github.com/PennyLaneAI/pennylane-lightning
-Author:
-Author-email:
-License: Apache License 2.0
-Location: <site packages path>
-Requires: pennylane, pennylane-lightning
-
-$ ls <site packages path>/pennylane_lightning
-... liblightning_kokkos_catalyst.so ...
-```
-
-You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.gpu simulators.
-
-## Compilation
-
-First update the `RTDLIB` in `test_rt_device.cpp` to the local path where lightning is installed (i.e. `<site packages path>` from above).
-
-To compile:
-
-```
-$ clang++ --std=c++20 test_rt_device.cpp -I./snapshot_catalyst_runtime/include -o test_rt_device.out
-```
-
-## Running the example
-
-To run:
-
-```
-$ ./test_rt_device.out
-Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set
-  In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads
-  For best performance with OpenMP 3.1 set OMP_PROC_BIND=true
-  For unit testing set OMP_PROC_BIND=false
-
-Num Qubits = 3
-State =
-*** State-Vector of Size 8 ***
-[(0.707107,0), (0,0), (0,0), (0,0), (0.707107,0), (0,0), (0,0), (0,0)]
-Measure on wire 0 = 0
-```
-
-## Running on other devices
-
-To run on other devices, e.g. lightning.gpu, you need to change:
-- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu`
-In the c++ file:
-- replace `RTDLIB` from `kokkos` to `gpu`
-- replace `RTDDEVICE` from `Kokkos` to `GPU`
-- Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out`
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst
deleted file mode 100644
index 8a881e5..0000000
--- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst
+++ /dev/null
@@ -1,118 +0,0 @@
-.. runtime-start-inclusion-marker-do-not-remove
-
-Catalyst Quantum Runtime
-########################
-
-The Catalyst Runtime is a C++ QIR runtime that enables the execution of Catalyst-compiled
-quantum programs, and is currently backed by `PennyLane-Lightning <https://docs.pennylane.ai/projects/lightning/en/stable>`_
-state-vector simulators, and `Amazon Braket <https://amazon-braket-pennylane-plugin-python.readthedocs.io>`__
-devices. Additional hardware support, including QPUs, to come.
-
-The runtime employs the `QuantumDevice <https://docs.pennylane.ai/projects/catalyst/en/stable/api/structCatalyst_1_1Runtime_1_1QuantumDevice.html#exhale-struct-structcatalyst-1-1runtime-1-1quantumdevice>`_
-public interface to support an extensible list of backend devices. This interface comprises two collections of abstract methods:
-
-- The Qubit management, device shot noise, and quantum tape recording methods are utilized for the implementation of Quantum Runtime (QR) instructions.
-
-- The quantum operations, observables, measurements, and gradient methods are used to implement Quantum Instruction Set (QIS) instructions.
-
-A complete list of instructions supported by the runtime can be found in
-`RuntimeCAPI.h <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include/RuntimeCAPI.h>`_.
-
-Contents
-========
-
-The directory is structured as follows:
-
-- `include <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include>`_:
-    This contains the public header files of the runtime including the ``QuantumDevice`` API
-    for backend quantum devices and the runtime CAPI.
-
-- `lib <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib>`_:
-    The core modules of the runtime are structured into ``lib/capi`` and ``lib/backend``.
-    `lib/capi <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib/capi>`_  implements the semantics for
-    QIR instructions lowered to our custom runtime. `lib/backend <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/lib/backend>`_
-    contains implementations of the ``QuantumDevice`` API for backend simulators.
-
-- `tests <https://github.com/PennyLaneAI/catalyst/tree/main/runtime/tests>`_:
-    A collection of C++ tests for modules and methods in the runtime.
-
-Backend Devices
-===============
-
-New device backends for the runtime can be realized by implementing the quantum device interface.
-The following table shows the available devices along with supported features:
-
-.. list-table::
-   :widths: 25 25 25 25
-   :header-rows: 0
-
-   * - **Features**
-     - **PennyLane-Lightning-Qubit**
-     - **PennyLane-Lightning-Kokkos** and **PennyLane-Lightning-GPU**
-     - **Amazon-Braket-OpenQasm**
-   * - Qubit Management
-     - Dynamic allocation/deallocation
-     - Static allocation/deallocation
-     - Static allocation/deallocation
-   * - Gate Operations
-     - `Lightning operations <https://github.com/PennyLaneAI/pennylane-lightning/blob/master/pennylane_lightning/core/src/gates/GateOperation.hpp>`_
-     - `Lightning operations <https://github.com/PennyLaneAI/pennylane-lightning/blob/master/pennylane_lightning/core/src/gates/GateOperation.hpp>`_ without controlled gates support
-     - `Braket operations <https://github.com/PennyLaneAI/catalyst/blob/e812afbadbd777209862d5c76f394e3f0c43ffb6/runtime/lib/backend/openqasm/OpenQasmBuilder.hpp#L49>`_
-   * - Quantum Observables
-     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables
-     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables
-     - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, and Tensor Product of Observables
-   * - Expectation Value
-     - All observables; Finite-shots supported
-     - All observables; Finite-shots supported
-     - All observables; Finite-shots supported
-   * - Variance
-     - All observables; Finite-shots supported
-     - All observables; Finite-shots supported
-     - All observables; Finite-shots supported
-   * - Probability
-     - Only for the computational basis on the supplied qubits; Finite-shots supported
-     - Only for the computational basis on the supplied qubits; Finite-shots supported
-     - The computational basis on all active qubits; Finite-shots supported
-   * - Sampling
-     - Only for the computational basis on the supplied qubits
-     - Only for the computational basis on the supplied qubits
-     - The computational basis on all active qubits; Finite-shots supported
-   * - Mid-Circuit Measurement
-     - Only for the computational basis on the supplied qubit
-     - Only for the computational basis on the supplied qubit
-     - Not supported
-   * - Gradient
-     - The Adjoint-Jacobian method for expectation values on all observables
-     - The Adjoint-Jacobian method for expectation values on all observables
-     - Not supported
-
-Requirements
-============
-
-To build the runtime from source, it is required to have an up to date version of a C/C++ compiler such as gcc or clang
-with support for the C++20 standard library.
-
-Installation
-============
-
-By default, the runtime builds all supported backend devices.
-You can build the runtime with custom devices from the list of Backend Devices.
-
-You can use ``ENABLE_OPENQASM=OFF`` to disable building the runtime with `Amazon-Braket-OpenQasm <https://aws.amazon.com/braket/>`_:
-
-.. code-block:: console
-
-    make runtime ENABLE_OPENQASM=OFF
-
-This device currently offers generators for the `OpenQasm3 <https://openqasm.com/versions/3.0/index.html>`_ specification and
-`Amazon Braket <https://docs.aws.amazon.com/braket/latest/developerguide/braket-openqasm-supported-features.html>`__ assembly extension.
-Moreover, the generated assembly can be executed on Amazon Braket devices leveraging `amazon-braket-sdk-python <https://github.com/aws/amazon-braket-sdk-python>`_.
-
-To check the runtime test suite from the root directory:
-
-.. code-block:: console
-
-    make test-runtime
-
-.. runtime-end-inclusion-marker-do-not-remove
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp
deleted file mode 100644
index 616b9dc..0000000
--- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp
+++ /dev/null
@@ -1,173 +0,0 @@
-// Copyright 2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <Exception.hpp>
-
-/**
- * A multi-dimensional view for MemRef-like and std::vector<T> types.
- *
- * @tparam T The underlying data type
- * @tparam R The Rank (R > 0)
- *
- * @note A forward iterator is implemented in this view for traversing over the
- * entire elements of MemRef types rank-by-rank starting from the last
- * dimension (R-1). For example, The DataView iterator for MemRef<T, 2> starts
- * from index (0, 0) and traverses elements in the following order: (0, 0),
- * ..., (0, sizes[1]-1), (1, 0), ..., (1, sizes[1]-1), ... (sizes[0]-1,
- * sizes[1]-1).
- */
-template<typename T, size_t R>
-class DataView
-{
-  private:
-    T* data_aligned;
-    size_t offset;
-    size_t sizes[R] = {0};
-    size_t strides[R] = {0};
-
-  public:
-    class iterator
-    {
-      private:
-        DataView<T, R> const& view;
-
-        int64_t loc;  // physical index
-        size_t indices[R] = {0};
-
-      public:
-        using iterator_category = std::forward_iterator_tag;  // LCOV_EXCL_LINE
-        using value_type = T;  // LCOV_EXCL_LINE
-        using difference_type = std::ptrdiff_t;  // LCOV_EXCL_LINE
-        using pointer = T*;  // LCOV_EXCL_LINE
-        using reference = T&;  // LCOV_EXCL_LINE
-
-        iterator(DataView<T, R> const& _view, int64_t begin_idx)
-            : view(_view), loc(begin_idx)
-        {
-        }
-        pointer operator->() const { return &view.data_aligned[loc]; }
-        reference operator*() const { return view.data_aligned[loc]; }
-        iterator& operator++()
-        {
-            int64_t next_axis = -1;
-            int64_t idx;
-            for (int64_t i = R; i > 0; --i)
-            {
-                idx = i - 1;
-                if (indices[idx]++ < view.sizes[idx] - 1)
-                {
-                    next_axis = idx;
-                    break;
-                }
-                indices[idx] = 0;
-                loc -= (view.sizes[idx] - 1) * view.strides[idx];
-            }
-
-            loc = next_axis == -1 ? -1 : loc + view.strides[next_axis];
-            return *this;
-        }
-        iterator operator++(int)
-        {
-            auto tmp = *this;
-            int64_t next_axis = -1;
-            int64_t idx;
-            for (int64_t i = R; i > 0; --i)
-            {
-                idx = i - 1;
-                if (indices[idx]++ < view.sizes[idx] - 1)
-                {
-                    next_axis = idx;
-                    break;
-                }
-                indices[idx] = 0;
-                loc -= (view.sizes[idx] - 1) * view.strides[idx];
-            }
-
-            loc = next_axis == -1 ? -1 : loc + view.strides[next_axis];
-            return tmp;
-        }
-        bool operator==(iterator const& other) const
-        {
-            return (loc == other.loc
-                    && view.data_aligned == other.view.data_aligned);
-        }
-        bool operator!=(iterator const& other) const
-        {
-            return !(*this == other);
-        }
-    };
-
-    explicit DataView(std::vector<T>& buffer)
-        : data_aligned(buffer.data()), offset(0)
-    {
-        static_assert(R == 1, "[Class: DataView] Assertion: R == 1");
-        sizes[0] = buffer.size();
-        strides[0] = 1;
-    }
-
-    explicit DataView(T* _data_aligned,
-                      size_t _offset,
-                      size_t const* _sizes,
-                      size_t const* _strides)
-        : data_aligned(_data_aligned), offset(_offset)
-    {
-        static_assert(R > 0, "[Class: DataView] Assertion: R > 0");
-        if (_sizes != nullptr && _strides != nullptr)
-        {
-            for (size_t i = 0; i < R; i++)
-            {
-                sizes[i] = _sizes[i];
-                strides[i] = _strides[i];
-            }
-        }  // else sizes = {0}, strides = {0}
-    }
-
-    [[nodiscard]] auto size() const -> size_t
-    {
-        if (!data_aligned)
-        {
-            return 0;
-        }
-
-        size_t tsize = 1;
-        for (size_t i = 0; i < R; i++)
-        {
-            tsize *= sizes[i];
-        }
-        return tsize;
-    }
-
-    template<typename... I>
-    T& operator()(I... idxs) const
-    {
-        static_assert(sizeof...(idxs) == R,
-                      "[Class: DataView] Error in Catalyst Runtime: Wrong "
-                      "number of indices");
-        size_t indices[] = {static_cast<size_t>(idxs)...};
-
-        size_t loc = offset;
-        for (size_t axis = 0; axis < R; axis++)
-        {
-            RT_ASSERT(indices[axis] < sizes[axis]);
-            loc += indices[axis] * strides[axis];
-        }
-        return data_aligned[loc];
-    }
-
-    iterator begin() { return iterator{*this, static_cast<int64_t>(offset)}; }
-
-    iterator end() { return iterator{*this, -1}; }
-};
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp
deleted file mode 100644
index 4e8272d..0000000
--- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright 2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <exception>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <type_traits>
-#include <utility>
-
-/**
- * @brief Macro that throws `RuntimeException` with given message.
- */
-#define RT_FAIL(message) \
-    Catalyst::Runtime::_abort((message), __FILE__, __LINE__, __func__)
-
-/**
- * @brief Macro that throws `RuntimeException` if expression evaluates
- * to true.
- */
-#define RT_FAIL_IF(expression, message) \
-    if ((expression))                   \
-    {                                   \
-        RT_FAIL(message);               \
-    }
-
-/**
- * @brief Macro that throws `RuntimeException` with the given expression
- * and source location if expression evaluates to false.
- */
-#define RT_ASSERT(expression) \
-    RT_FAIL_IF(!(expression), "Assertion: " #expression)
-
-namespace Catalyst::Runtime
-{
-
-/**
- * @brief This is the general exception thrown by Catalyst for runtime errors
- * that is derived from `std::exception`.
- */
-class RuntimeException : public std::exception
-{
-  private:
-    std::string const err_msg;
-
-  public:
-    explicit RuntimeException(std::string msg) noexcept
-        : err_msg{std::move(msg)}
-    {
-    }  // LCOV_EXCL_LINE
-    ~RuntimeException() override = default;  // LCOV_EXCL_LINE
-
-    RuntimeException(RuntimeException const&) = default;
-    RuntimeException(RuntimeException&&) noexcept = default;
-
-    RuntimeException& operator=(RuntimeException const&) = delete;
-    RuntimeException& operator=(RuntimeException&&) = delete;
-
-    [[nodiscard]] auto what() const noexcept -> char const* override
-    {
-        return err_msg.c_str();
-    }  // LCOV_EXCL_LINE
-};
-
-/**
- * @brief Throws a `RuntimeException` with the given error message.
- *
- * @note This is not supposed to be called directly.
- */
-[[noreturn]] inline void _abort(char const* message,
-                                char const* file_name,
-                                size_t line,
-                                char const* function_name)
-{
-    std::stringstream sstream;
-    sstream << "[" << file_name << "][Line:" << line
-            << "][Function:" << function_name
-            << "] Error in Catalyst Runtime: " << message;
-
-    throw RuntimeException(sstream.str());
-}  // LCOV_EXCL_LINE
-
-}  // namespace Catalyst::Runtime
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp
deleted file mode 100644
index 6794033..0000000
--- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp
+++ /dev/null
@@ -1,399 +0,0 @@
-// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <complex>
-#include <memory>
-#include <optional>
-#include <random>
-#include <vector>
-
-#include "DataView.hpp"
-#include "Types.h"
-
-// A helper template macro to generate the <IDENTIFIER>Factory method by
-// calling <CONSTRUCTOR>(kwargs). Check the Custom Devices guideline for
-// details:
-// https://docs.pennylane.ai/projects/catalyst/en/stable/dev/custom_devices.html
-#define GENERATE_DEVICE_FACTORY(IDENTIFIER, CONSTRUCTOR)              \
-    extern "C" Catalyst::Runtime::QuantumDevice* IDENTIFIER##Factory( \
-        const char* kwargs)                                           \
-    {                                                                 \
-        return new CONSTRUCTOR(std::string(kwargs));                  \
-    }
-
-namespace Catalyst::Runtime
-{
-
-/**
- * @brief struct API for backend quantum devices.
- *
- * This device API contains,
- * - a set of methods to manage qubit allocations and deallocations, device
- * shot noise, and quantum tape recording as well as reference values for the
- * result data-type; these are used to implement Quantum Runtime (QR)
- * instructions.
- *
- * - a set of methods for quantum operations, observables, measurements, and
- * gradient of the device; these are used to implement Quantum Instruction Set
- * (QIS) instructions.
- *
- */
-struct QuantumDevice
-{
-    QuantumDevice() = default;  // LCOV_EXCL_LINE
-    virtual ~QuantumDevice() = default;  // LCOV_EXCL_LINE
-
-    QuantumDevice& operator=(QuantumDevice const&) = delete;
-    QuantumDevice(QuantumDevice const&) = delete;
-    QuantumDevice(QuantumDevice&&) = delete;
-    QuantumDevice& operator=(QuantumDevice&&) = delete;
-
-    /**
-     * @brief Allocate a qubit.
-     *
-     * @return `QubitIdType`
-     */
-    virtual auto AllocateQubit() -> QubitIdType = 0;
-
-    /**
-     * @brief Allocate a vector of qubits.
-     *
-     * @param num_qubits The number of qubits to allocate.
-     *
-     * @return `std::vector<QubitIdType>`
-     */
-    virtual auto AllocateQubits(size_t num_qubits) -> std::vector<QubitIdType>
-        = 0;
-
-    /**
-     * @brief Release a qubit.
-     *
-     * @param qubit The id of the qubit
-     */
-    virtual void ReleaseQubit(QubitIdType qubit) = 0;
-
-    /**
-     * @brief Release all qubits.
-     */
-    virtual void ReleaseAllQubits() = 0;
-
-    /**
-     * @brief Get the number of allocated qubits.
-     *
-     * @return `size_t`
-     */
-    [[nodiscard]] virtual auto GetNumQubits() const -> size_t = 0;
-
-    /**
-     * @brief Set the number of device shots.
-     *
-     * @param shots The number of noise shots
-     */
-    virtual void SetDeviceShots(size_t shots) = 0;
-
-    /**
-     * @brief Get the number of device shots.
-     *
-     * @return `size_t`
-     */
-    [[nodiscard]] virtual auto GetDeviceShots() const -> size_t = 0;
-
-    /**
-     * @brief Set the PRNG of the device.
-     *
-     * The Catalyst runtime enables seeded program execution on non-hardware
-     * devices. A random number generator instance is managed by the runtime to
-     * predictably generate results for non-deterministic programs, such as
-     * those involving `Measure` calls. Devices implementing support for this
-     * feature do not need to use the provided PRNG instance as their sole
-     * source of random numbers, but it is expected that the the same instance
-     * state will predictable and reproducibly generate the same program
-     * results. It is also expected that the provided PRNG state is evolved
-     * sufficiently so that two device executions sharing the same instance do
-     * not produce identical results. The provided PRNG instance is not
-     * thread-locked, and devices wishing to share it across threads will need
-     * to provide their own thread-safety.
-     *
-     * @param gen The std::mt19937 PRNG object.
-     */
-    virtual void SetDevicePRNG([[maybe_unused]] std::mt19937* gen) {};
-
-    /**
-     * @brief Start recording a quantum tape if provided.
-     *
-     * @note This is backed by the `Catalyst::Runtime::CacheManager<ComplexT>`
-     * property in the device implementation.
-     */
-    virtual void StartTapeRecording() = 0;
-
-    /**
-     * @brief Stop recording a quantum tape if provided.
-     *
-     * @note This is backed by the `Catalyst::Runtime::CacheManager<ComplexT>`
-     * property in the device implementation.
-     */
-    virtual void StopTapeRecording() = 0;
-
-    /**
-     * @brief Result value for "Zero" used in the measurement process.
-     *
-     * @return `Result`
-     */
-    [[nodiscard]] virtual auto Zero() const -> Result = 0;
-
-    /**
-     * @brief Result value for "One"  used in the measurement process.
-     *
-     * @return `Result`
-     */
-    [[nodiscard]] virtual auto One() const -> Result = 0;
-
-    /**
-     * @brief A helper method to print the state vector of a device.
-     */
-    virtual void PrintState() = 0;
-
-    /**
-     * @brief Prepare subsystems using the given ket vector in the
-     * computational basis.
-     *
-     * @param state A state vector of size 2**len(wires)
-     * @param wires The wire(s) the operation acts on
-     */
-    virtual void
-    SetState([[maybe_unused]] DataView<std::complex<double>, 1>& state,
-             [[maybe_unused]] std::vector<QubitIdType>& wires)
-    {
-        RT_FAIL("Unsupported functionality");
-    }
-
-    /**
-     * @brief Prepares a single computational basis state.
-     *
-     * @param n Prepares the basis state |n>, where n is an array of integers
-     * from the set {0, 1}
-     * @param wires The wire(s) the operation acts on
-     */
-    virtual void SetBasisState([[maybe_unused]] DataView<int8_t, 1>& n,
-                               [[maybe_unused]] std::vector<QubitIdType>& wires)
-    {
-        RT_FAIL("Unsupported functionality");
-    }
-
-    /**
-     * @brief Apply a single gate to the state vector of a device with its name
-     * if this is supported.
-     *
-     * @param name The name of the gate to apply
-     * @param params Optional parameter list for parametric gates
-     * @param wires Wires to apply gate to
-     * @param inverse Indicates whether to use inverse of gate
-     * @param controlled_wires Optional controlled wires applied to the
-     * operation
-     * @param controlled_values Optional controlled values applied to the
-     * operation
-     */
-    virtual void NamedOperation(
-        std::string const& name,
-        std::vector<double> const& params,
-        std::vector<QubitIdType> const& wires,
-        [[maybe_unused]] bool inverse = false,
-        [[maybe_unused]] std::vector<QubitIdType> const& controlled_wires = {},
-        [[maybe_unused]] std::vector<bool> const& controlled_values = {})
-        = 0;
-
-    /**
-     * @brief Apply a given matrix directly to the state vector of a device.
-     *
-     * @param matrix The matrix of data in row-major format
-     * @param wires Wires to apply gate to
-     * @param inverse Indicates whether to use inverse of gate
-     * @param controlled_wires Controlled wires applied to the operation
-     * @param controlled_values Controlled values applied to the operation
-     */
-    virtual void MatrixOperation(
-        std::vector<std::complex<double>> const& matrix,
-        std::vector<QubitIdType> const& wires,
-        [[maybe_unused]] bool inverse = false,
-        [[maybe_unused]] std::vector<QubitIdType> const& controlled_wires = {},
-        [[maybe_unused]] std::vector<bool> const& controlled_values = {})
-        = 0;
-
-    /**
-     * @brief Construct a named (Identity, PauliX, PauliY, PauliZ, and
-     * Hadamard) or Hermitian observable.
-     *
-     * @param id The type of the observable
-     * @param matrix The matrix of data to construct a hermitian observable
-     * @param wires Wires to apply observable to
-     *
-     * @return `ObsIdType` Index of the constructed observable
-     */
-    virtual auto Observable(ObsId id,
-                            std::vector<std::complex<double>> const& matrix,
-                            std::vector<QubitIdType> const& wires) -> ObsIdType
-        = 0;
-
-    /**
-     * @brief Construct a tensor product of observables.
-     *
-     * @param obs The vector of observables indices of type ObsIdType
-     *
-     * @return `ObsIdType` Index of the constructed observable
-     */
-    virtual auto TensorObservable(std::vector<ObsIdType> const& obs)
-        -> ObsIdType
-        = 0;
-
-    /**
-     * @brief Construct a Hamiltonian observable.
-     *
-     * @param coeffs The vector of coefficients
-     * @param obs The vector of observables indices of size `coeffs`
-     *
-     * @return `ObsIdType` Index of the constructed observable
-     */
-    virtual auto HamiltonianObservable(std::vector<double> const& coeffs,
-                                       std::vector<ObsIdType> const& obs)
-        -> ObsIdType
-        = 0;
-
-    /**
-     * @brief Compute the expected value of an observable.
-     *
-     * @param obsKey The index of the constructed observable
-     *
-     * @return `double` The expected value
-     */
-    virtual auto Expval(ObsIdType obsKey) -> double = 0;
-
-    /**
-     * @brief Compute the variance of an observable.
-     *
-     * @param obsKey The index of the constructed observable
-     *
-     * @return `double` The variance
-     */
-    virtual auto Var(ObsIdType obsKey) -> double = 0;
-
-    /**
-     * @brief Get the state-vector of a device.
-     *
-     * @param state The pre-allocated `DataView<complex<double>, 1>`
-     */
-    virtual void State(DataView<std::complex<double>, 1>& state) = 0;
-
-    /**
-     * @brief Compute the probabilities of each computational basis state.
-
-     * @param probs The pre-allocated `DataView<double, 1>`
-     */
-    virtual void Probs(DataView<double, 1>& probs) = 0;
-
-    /**
-     * @brief Compute the probabilities for a subset of the full system.
-     *
-     * @param probs The pre-allocated `DataView<double, 1>`
-     * @param wires Wires will restrict probabilities to a subset of the full
-     * system
-     */
-    virtual void PartialProbs(DataView<double, 1>& probs,
-                              std::vector<QubitIdType> const& wires)
-        = 0;
-
-    /**
-     * @brief Compute samples with the number of shots on the entire wires,
-     * returing raw samples.
-     *
-     * @param samples The pre-allocated `DataView<double, 2>`representing a
-     * matrix of shape `shots * numQubits`. The built-in iterator in
-     * `DataView<double, 2>` iterates over all elements of `samples` row-wise.
-     * @param shots The number of shots
-     */
-    virtual void Sample(DataView<double, 2>& samples, size_t shots) = 0;
-
-    /**
-     * @brief Compute partial samples with the number of shots on `wires`,
-     * returing raw samples.
-     *
-     * @param samples The pre-allocated `DataView<double, 2>`representing a
-     * matrix of shape `shots * numWires`. The built-in iterator in
-     * `DataView<double, 2>` iterates over all elements of `samples` row-wise.
-     * @param wires Wires to compute samples on
-     * @param shots The number of shots
-     */
-    virtual void PartialSample(DataView<double, 2>& samples,
-                               std::vector<QubitIdType> const& wires,
-                               size_t shots)
-        = 0;
-
-    /**
-     * @brief Sample with the number of shots on the entire wires, returning
-     * the number of counts for each sample.
-     *
-     * @param eigvals The pre-allocated `DataView<double, 1>`
-     * @param counts The pre-allocated `DataView<int64_t, 1>`
-     * @param shots The number of shots
-     */
-    virtual void Counts(DataView<double, 1>& eigvals,
-                        DataView<int64_t, 1>& counts,
-                        size_t shots)
-        = 0;
-
-    /**
-     * @brief Partial sample with the number of shots on `wires`, returning the
-     * number of counts for each sample.
-     *
-     * @param eigvals The pre-allocated `DataView<double, 1>`
-     * @param counts The pre-allocated `DataView<int64_t, 1>`
-     * @param wires Wires to compute samples on
-     * @param shots The number of shots
-     */
-    virtual void PartialCounts(DataView<double, 1>& eigvals,
-                               DataView<int64_t, 1>& counts,
-                               std::vector<QubitIdType> const& wires,
-                               size_t shots)
-        = 0;
-
-    /**
-     * @brief A general measurement method that acts on a single wire.
-     *
-     * @param wire The wire to compute Measure on
-     * @param postselect Which basis state to postselect after a mid-circuit
-     measurement (-1 denotes no post-selection)
-
-     * @return `Result` The measurement result
-     */
-    virtual auto Measure(QubitIdType wire, std::optional<int32_t> postselect)
-        -> Result
-        = 0;
-
-    /**
-     * @brief Compute the gradient of a quantum tape, that is cached using
-     * `Catalyst::Runtime::Simulator::CacheManager`, for a specific set of
-     * trainable parameters.
-     *
-     * @param gradients The vector of pre-allocated `DataView<double, 1>*`
-     * to store gradients resutls for the list of cached observables.
-     * @param trainParams The vector of trainable parameters; if none, all
-     * parameters would be assumed trainable
-     *
-     */
-    virtual void Gradient(std::vector<DataView<double, 1>>& gradients,
-                          std::vector<size_t> const& trainParams)
-        = 0;
-};
-}  // namespace Catalyst::Runtime
diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h
deleted file mode 100644
index a90f69d..0000000
--- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h
+++ /dev/null
@@ -1,179 +0,0 @@
-// Copyright 2022-2023 Xanadu Quantum Technologies Inc.
-
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-
-//     http://www.apache.org/licenses/LICENSE-2.0
-
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#ifndef TYPES_H
-#    define TYPES_H
-
-#    include <cmath>
-#    include <cstdint>
-#    include <limits>
-
-#    ifdef __cplusplus
-extern "C" {
-#    endif
-
-// Qubit, Result and Observable types
-struct QUBIT;
-using QubitIdType = intptr_t;
-
-using RESULT = bool;
-using Result = RESULT*;
-using QirArray = void*;
-
-using ObsIdType = intptr_t;
-
-enum ObsId : int8_t
-{
-    Identity = 0,
-    PauliX,
-    PauliY,
-    PauliZ,
-    Hadamard,
-    Hermitian,
-};
-
-enum ObsType : int8_t
-{
-    Basic = 0,
-    TensorProd,
-    Hamiltonian,
-};
-
-// complex<float> type
-struct CplxT_float
-{
-    float real;
-    float imag;
-};
-
-// complex<double> type
-struct CplxT_double
-{
-    double real;
-    double imag;
-};
-
-enum NumericType : int8_t
-{
-    idx = 0,
-    i1,
-    i8,
-    i16,
-    i32,
-    i64,
-    f32,
-    f64,
-    c64,
-    c128,
-};
-
-// MemRefT<datatype, dimension=rank> type
-struct OpaqueMemRefT
-{
-    int64_t rank;
-    void* descriptor;
-    NumericType datatype;
-};
-
-// MemRefT<complex<double>, dimension=1> type
-struct MemRefT_CplxT_double_1d
-{
-    CplxT_double* data_allocated;
-    CplxT_double* data_aligned;
-    size_t offset;
-    size_t sizes[1];
-    size_t strides[1];
-};
-
-// MemRefT<complex<double>, dimension=2> type
-struct MemRefT_CplxT_double_2d
-{
-    CplxT_double* data_allocated;
-    CplxT_double* data_aligned;
-    size_t offset;
-    size_t sizes[2];
-    size_t strides[2];
-};
-
-// MemRefT<double, dimension=1> type
-struct MemRefT_double_1d
-{
-    double* data_allocated;
-    double* data_aligned;
-    size_t offset;
-    size_t sizes[1];
-    size_t strides[1];
-};
-
-// MemRefT<double, dimension=2> type
-struct MemRefT_double_2d
-{
-    double* data_allocated;
-    double* data_aligned;
-    size_t offset;
-    size_t sizes[2];
-    size_t strides[2];
-};
-
-// MemRefT<int64_t, dimension=1> type
-struct MemRefT_int64_1d
-{
-    int64_t* data_allocated;
-    int64_t* data_aligned;
-    size_t offset;
-    size_t sizes[1];
-    size_t strides[1];
-};
-
-// MemRefT<int64_t, dimension=1> type
-struct MemRefT_int8_1d
-{
-    int8_t* data_allocated;
-    int8_t* data_aligned;
-    size_t offset;
-    size_t sizes[1];
-    size_t strides[1];
-};
-
-// PairT<MemRefT<double, dimension=1>, MemRefT<int64, dimension=2>> type
-struct PairT_MemRefT_double_int64_1d
-{
-    struct MemRefT_double_1d first;
-    struct MemRefT_int64_1d second;
-};
-
-// Quantum operation modifiers
-struct Modifiers
-{
-    bool adjoint;
-    size_t num_controlled;
-    QUBIT* controlled_wires;
-    bool* controlled_values;
-};
-
-using CplxT_double = struct CplxT_double;
-using MemRefT_CplxT_double_1d = struct MemRefT_CplxT_double_1d;
-using MemRefT_CplxT_double_2d = struct MemRefT_CplxT_double_2d;
-using MemRefT_double_1d = struct MemRefT_double_1d;
-using MemRefT_double_2d = struct MemRefT_double_2d;
-using MemRefT_int64_1d = struct MemRefT_int64_1d;
-using PairT_MemRefT_double_int64_1d = struct PairT_MemRefT_double_int64_1d;
-using Modifiers = struct Modifiers;
-
-#    ifdef __cplusplus
-}  // extern "C"
-#    endif
-
-#endif
diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp
deleted file mode 100644
index 47897f2..0000000
--- a/src/qirlightning/simple_demo/test_rt_device.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-#include <dlfcn.h>
-
-#include "QuantumDevice.hpp"
-
-// Runtime libraries (kokkos/GPU/qubit etc.)
-// Update these paths to point to the correct library
-#define RTDLIB                      \
-    "<UPDATE: site packages path>/" \
-    "pennylane_lightning/liblightning_kokkos_catalyst.so";
-#define RTDDEVICE "LightningKokkosSimulator";
-
-extern "C" Catalyst::Runtime::QuantumDevice*
-GenericDeviceFactory(char const* kwargs);
-
-using namespace Catalyst::Runtime;
-
-int main()
-{
-    try
-    {
-        // Load lightning simulation library
-        std::string rtd_lib = RTDLIB;
-        std::string rtd_device = RTDDEVICE;
-        std::string kwargs = {};
-        auto rtld_flags = RTLD_LAZY | RTLD_NODELETE;
-        auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags);
-
-        if (!rtd_dylib_handler)
-        {
-            throw std::runtime_error("Failed to load library: " + rtd_lib);
-        }
-
-        // Find device factory
-        std::string factory_name = rtd_device + "Factory";
-        void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str());
-
-        if (!f_ptr)
-        {
-            dlclose(rtd_dylib_handler);
-            throw std::runtime_error("Failed to find factory function: "
-                                     + factory_name);
-        }
-        std::string rtd_kwargs = {};
-        auto rtd_qdevice = std::unique_ptr<QuantumDevice>(
-            reinterpret_cast<decltype(GenericDeviceFactory)*>(f_ptr)(
-                rtd_kwargs.c_str()));
-
-        // Allocate Qubits
-        rtd_qdevice->AllocateQubits(3);
-
-        // Get Num Qubits
-        std::cout << "Num Qubits = " << rtd_qdevice->GetNumQubits()
-                  << std::endl;
-
-        // Apply Gate
-        rtd_qdevice->NamedOperation("Hadamard", {}, {0});
-
-        // Print State
-        std::cout << "State = " << std::endl;
-        rtd_qdevice->PrintState();
-
-        // Measure
-        QubitIdType wire{0};
-        Result result = rtd_qdevice->Measure(wire, std::nullopt);
-        std::cout << "Measure on wire 0 = " << *result << std::endl;
-    }
-    catch (std::exception const& e)
-    {
-        std::cerr << "Error: " << e.what() << std::endl;
-        return EXIT_FAILURE;
-    }
-
-    return EXIT_SUCCESS;
-}

From b1344f7c13684bd36b1a97bbd58a773df4bbdd59 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 22 Oct 2025 21:42:28 +0000
Subject: [PATCH 62/64] remove build-lightning workflow

---
 .github/workflows/pr.yml   | 2 --
 .github/workflows/push.yml | 2 --
 2 files changed, 4 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index df9d110..02c418a 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -17,8 +17,6 @@ concurrency:
 jobs:
   build-fast:
     uses: ./.github/workflows/build-fast.yml
-  build-lightning:
-    uses: ./.github/workflows/build-lightning.yml
   # Specifying a dependent job allows us to select a single "requires" check in the project GitHub settings
   all:
     if: ${{ always() }}
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 5e79f80..6ac89fc 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -12,8 +12,6 @@ concurrency:
 jobs:
   build-fast:
     uses: ./.github/workflows/build-fast.yml
-  build-lightning:
-    uses: ./.github/workflows/build-lightning.yml
   all:
     needs:
       - build-fast

From 290a7e8c83ab1ef43efee5e7ce924bfeac14e355 Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 22 Oct 2025 21:43:00 +0000
Subject: [PATCH 63/64] remove build-lightning workflow

---
 .github/workflows/pr.yml   | 1 -
 .github/workflows/push.yml | 1 -
 2 files changed, 2 deletions(-)

diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
index 02c418a..52f8404 100644
--- a/.github/workflows/pr.yml
+++ b/.github/workflows/pr.yml
@@ -22,7 +22,6 @@ jobs:
     if: ${{ always() }}
     needs:
     - build-fast
-    - build-lightning
     runs-on: ubuntu-latest
     steps:
     - name: Decide whether the needed jobs succeeded or failed
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 6ac89fc..4601abc 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -15,7 +15,6 @@ jobs:
   all:
     needs:
       - build-fast
-      - build-lightning
     runs-on: ubuntu-latest
     steps:
     - name: Success

From d1d1c5cea2694367a323ec6222e8533d2fc1d07f Mon Sep 17 00:00:00 2001
From: Joseph Lee <joseph.lee@xanadu.ai>
Date: Wed, 22 Oct 2025 21:44:21 +0000
Subject: [PATCH 64/64] improve formatting for cmake/support_catalyst.cmake

---
 cmake/support_catalyst.cmake | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cmake/support_catalyst.cmake b/cmake/support_catalyst.cmake
index 95c7f73..07ab613 100644
--- a/cmake/support_catalyst.cmake
+++ b/cmake/support_catalyst.cmake
@@ -36,8 +36,8 @@ macro(FindCatalyst target_name)
 
     # Fetching /lib/backend/common hpp headers
     set(LIB_BACKEND_COMMON_HEADERS  CacheManager.hpp
-                    QubitManager.hpp
-                    Utils.hpp
+                                    QubitManager.hpp
+                                    Utils.hpp
     )
 
     foreach(HEADER ${LIB_BACKEND_COMMON_HEADERS})
@@ -54,10 +54,10 @@ macro(FindCatalyst target_name)
 
     # Fetching include hpp headers
     set(INCLUDE_HEADERS DataView.hpp
-              Exception.hpp
-              QuantumDevice.hpp
-              RuntimeCAPI.h
-              Types.h
+                        Exception.hpp
+                        QuantumDevice.hpp
+                        RuntimeCAPI.h
+                        Types.h
     )
 
     foreach(HEADER ${INCLUDE_HEADERS})