From 15192861a17bb07ce71f320f5f08bfe8dea461f0 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Tue, 5 Nov 2024 21:04:10 -0500 Subject: [PATCH 01/64] Implement qir-qsim app for dynamic measurement handling --- CMakeLists.txt | 38 +- app/CMakeLists.txt | 50 +- app/qir-qsim.cc | 94 ++ cmake/FindLLVM.cmake | 8 + examples/teleport.ll | 2 + qsim/bits.h | 106 ++ qsim/bitstring.h | 97 ++ qsim/channel.h | 149 +++ qsim/channels_cirq.h | 471 +++++++ qsim/channels_qsim.h | 117 ++ qsim/circuit.h | 36 + qsim/circuit_noisy.h | 108 ++ qsim/circuit_qsim_parser.h | 442 +++++++ qsim/cuda2hip.h | 61 + qsim/expect.h | 148 +++ qsim/formux.h | 30 + qsim/fuser.h | 225 ++++ qsim/fuser_basic.h | 411 +++++++ qsim/fuser_mqubit.h | 1095 +++++++++++++++++ qsim/gate.h | 216 ++++ qsim/gate_appl.h | 231 ++++ qsim/gates_cirq.h | 1640 +++++++++++++++++++++++++ qsim/gates_qsim.h | 661 ++++++++++ qsim/hybrid.h | 612 +++++++++ qsim/io.h | 44 + qsim/io_file.h | 71 ++ qsim/matrix.h | 296 +++++ qsim/mps_simulator.h | 246 ++++ qsim/mps_statespace.h | 597 +++++++++ qsim/parfor.h | 123 ++ qsim/qtrajectory.h | 435 +++++++ qsim/run_qsim.h | 262 ++++ qsim/run_qsimh.h | 120 ++ qsim/seqfor.h | 68 + qsim/simmux.h | 44 + qsim/simmux_gpu.h | 30 + qsim/simulator.h | 516 ++++++++ qsim/simulator_avx.h | 1363 ++++++++++++++++++++ qsim/simulator_avx512.h | 846 +++++++++++++ qsim/simulator_basic.h | 349 ++++++ qsim/simulator_cuda.h | 923 ++++++++++++++ qsim/simulator_cuda_kernels.h | 683 ++++++++++ qsim/simulator_custatevec.h | 209 ++++ qsim/simulator_sse.h | 864 +++++++++++++ qsim/statespace.h | 145 +++ qsim/statespace_avx.h | 497 ++++++++ qsim/statespace_avx512.h | 448 +++++++ qsim/statespace_basic.h | 300 +++++ qsim/statespace_cuda.h | 470 +++++++ qsim/statespace_cuda_kernels.h | 355 ++++++ qsim/statespace_custatevec.h | 376 ++++++ qsim/statespace_sse.h | 462 +++++++ qsim/umux.h | 52 + qsim/unitary_calculator_avx.h | 1028 ++++++++++++++++ qsim/unitary_calculator_avx512.h | 644 
++++++++++ qsim/unitary_calculator_basic.h | 259 ++++ qsim/unitary_calculator_sse.h | 639 ++++++++++ qsim/unitaryspace.h | 65 + qsim/unitaryspace_avx.h | 112 ++ qsim/unitaryspace_avx512.h | 112 ++ qsim/unitaryspace_basic.h | 103 ++ qsim/unitaryspace_sse.h | 112 ++ qsim/util.h | 89 ++ qsim/util_cpu.h | 43 + qsim/util_cuda.h | 128 ++ qsim/util_custatevec.h | 44 + qsim/vectorspace.h | 185 +++ qsim/vectorspace_cuda.h | 172 +++ src/CMakeLists.txt | 5 + src/qirqsim/BufferManager.cc | 33 + src/qirqsim/BufferManager.hh | 45 + src/qirqsim/CMakeLists.txt | 29 + src/qirqsim/qsimDefaultRuntime.cc | 71 ++ src/qirqsim/qsimDefaultRuntime.hh | 61 + src/qirqsim/qsimQuantum.cc | 218 ++++ src/qirqsim/qsimQuantum.hh | 175 +++ src/qirqsim/qsimTupleRuntime.cc | 123 ++ src/qirqsim/qsimTupleRuntime.hh | 93 ++ tpls/qsim/bits.h | 106 ++ tpls/qsim/bitstring.h | 97 ++ tpls/qsim/channel.h | 149 +++ tpls/qsim/channels_cirq.h | 471 +++++++ tpls/qsim/channels_qsim.h | 117 ++ tpls/qsim/circuit.h | 36 + tpls/qsim/circuit_noisy.h | 108 ++ tpls/qsim/circuit_qsim_parser.h | 442 +++++++ tpls/qsim/cuda2hip.h | 61 + tpls/qsim/expect.h | 148 +++ tpls/qsim/formux.h | 30 + tpls/qsim/fuser.h | 225 ++++ tpls/qsim/fuser_basic.h | 411 +++++++ tpls/qsim/fuser_mqubit.h | 1095 +++++++++++++++++ tpls/qsim/gate.h | 216 ++++ tpls/qsim/gate_appl.h | 231 ++++ tpls/qsim/gates_cirq.h | 1640 +++++++++++++++++++++++++ tpls/qsim/gates_qsim.h | 661 ++++++++++ tpls/qsim/hybrid.h | 612 +++++++++ tpls/qsim/io.h | 44 + tpls/qsim/io_file.h | 71 ++ tpls/qsim/matrix.h | 296 +++++ tpls/qsim/mps_simulator.h | 246 ++++ tpls/qsim/mps_statespace.h | 597 +++++++++ tpls/qsim/parfor.h | 123 ++ tpls/qsim/qtrajectory.h | 435 +++++++ tpls/qsim/run_qsim.h | 262 ++++ tpls/qsim/run_qsimh.h | 120 ++ tpls/qsim/seqfor.h | 68 + tpls/qsim/simmux.h | 44 + tpls/qsim/simmux_gpu.h | 30 + tpls/qsim/simulator.h | 516 ++++++++ tpls/qsim/simulator_avx.h | 1363 ++++++++++++++++++++ tpls/qsim/simulator_avx512.h | 846 +++++++++++++ tpls/qsim/simulator_basic.h | 
349 ++++++ tpls/qsim/simulator_cuda.h | 923 ++++++++++++++ tpls/qsim/simulator_cuda_kernels.h | 683 ++++++++++ tpls/qsim/simulator_custatevec.h | 209 ++++ tpls/qsim/simulator_sse.h | 864 +++++++++++++ tpls/qsim/statespace.h | 145 +++ tpls/qsim/statespace_avx.h | 497 ++++++++ tpls/qsim/statespace_avx512.h | 448 +++++++ tpls/qsim/statespace_basic.h | 300 +++++ tpls/qsim/statespace_cuda.h | 470 +++++++ tpls/qsim/statespace_cuda_kernels.h | 355 ++++++ tpls/qsim/statespace_custatevec.h | 376 ++++++ tpls/qsim/statespace_sse.h | 462 +++++++ tpls/qsim/umux.h | 52 + tpls/qsim/unitary_calculator_avx.h | 1028 ++++++++++++++++ tpls/qsim/unitary_calculator_avx512.h | 644 ++++++++++ tpls/qsim/unitary_calculator_basic.h | 259 ++++ tpls/qsim/unitary_calculator_sse.h | 639 ++++++++++ tpls/qsim/unitaryspace.h | 65 + tpls/qsim/unitaryspace_avx.h | 112 ++ tpls/qsim/unitaryspace_avx512.h | 112 ++ tpls/qsim/unitaryspace_basic.h | 103 ++ tpls/qsim/unitaryspace_sse.h | 112 ++ tpls/qsim/util.h | 89 ++ tpls/qsim/util_cpu.h | 43 + tpls/qsim/util_cuda.h | 128 ++ tpls/qsim/util_custatevec.h | 44 + tpls/qsim/vectorspace.h | 185 +++ tpls/qsim/vectorspace_cuda.h | 172 +++ 141 files changed, 44613 insertions(+), 2 deletions(-) create mode 100644 app/qir-qsim.cc create mode 100644 qsim/bits.h create mode 100644 qsim/bitstring.h create mode 100644 qsim/channel.h create mode 100644 qsim/channels_cirq.h create mode 100644 qsim/channels_qsim.h create mode 100644 qsim/circuit.h create mode 100644 qsim/circuit_noisy.h create mode 100644 qsim/circuit_qsim_parser.h create mode 100644 qsim/cuda2hip.h create mode 100644 qsim/expect.h create mode 100644 qsim/formux.h create mode 100644 qsim/fuser.h create mode 100644 qsim/fuser_basic.h create mode 100644 qsim/fuser_mqubit.h create mode 100644 qsim/gate.h create mode 100644 qsim/gate_appl.h create mode 100644 qsim/gates_cirq.h create mode 100644 qsim/gates_qsim.h create mode 100644 qsim/hybrid.h create mode 100644 qsim/io.h create mode 100644 qsim/io_file.h 
create mode 100644 qsim/matrix.h create mode 100644 qsim/mps_simulator.h create mode 100644 qsim/mps_statespace.h create mode 100644 qsim/parfor.h create mode 100644 qsim/qtrajectory.h create mode 100644 qsim/run_qsim.h create mode 100644 qsim/run_qsimh.h create mode 100644 qsim/seqfor.h create mode 100644 qsim/simmux.h create mode 100644 qsim/simmux_gpu.h create mode 100644 qsim/simulator.h create mode 100644 qsim/simulator_avx.h create mode 100644 qsim/simulator_avx512.h create mode 100644 qsim/simulator_basic.h create mode 100644 qsim/simulator_cuda.h create mode 100644 qsim/simulator_cuda_kernels.h create mode 100644 qsim/simulator_custatevec.h create mode 100644 qsim/simulator_sse.h create mode 100644 qsim/statespace.h create mode 100644 qsim/statespace_avx.h create mode 100644 qsim/statespace_avx512.h create mode 100644 qsim/statespace_basic.h create mode 100644 qsim/statespace_cuda.h create mode 100644 qsim/statespace_cuda_kernels.h create mode 100644 qsim/statespace_custatevec.h create mode 100644 qsim/statespace_sse.h create mode 100644 qsim/umux.h create mode 100644 qsim/unitary_calculator_avx.h create mode 100644 qsim/unitary_calculator_avx512.h create mode 100644 qsim/unitary_calculator_basic.h create mode 100644 qsim/unitary_calculator_sse.h create mode 100644 qsim/unitaryspace.h create mode 100644 qsim/unitaryspace_avx.h create mode 100644 qsim/unitaryspace_avx512.h create mode 100644 qsim/unitaryspace_basic.h create mode 100644 qsim/unitaryspace_sse.h create mode 100644 qsim/util.h create mode 100644 qsim/util_cpu.h create mode 100644 qsim/util_cuda.h create mode 100644 qsim/util_custatevec.h create mode 100644 qsim/vectorspace.h create mode 100644 qsim/vectorspace_cuda.h create mode 100644 src/qirqsim/BufferManager.cc create mode 100644 src/qirqsim/BufferManager.hh create mode 100644 src/qirqsim/CMakeLists.txt create mode 100644 src/qirqsim/qsimDefaultRuntime.cc create mode 100644 src/qirqsim/qsimDefaultRuntime.hh create mode 100644 
src/qirqsim/qsimQuantum.cc create mode 100644 src/qirqsim/qsimQuantum.hh create mode 100644 src/qirqsim/qsimTupleRuntime.cc create mode 100644 src/qirqsim/qsimTupleRuntime.hh create mode 100644 tpls/qsim/bits.h create mode 100644 tpls/qsim/bitstring.h create mode 100644 tpls/qsim/channel.h create mode 100644 tpls/qsim/channels_cirq.h create mode 100644 tpls/qsim/channels_qsim.h create mode 100644 tpls/qsim/circuit.h create mode 100644 tpls/qsim/circuit_noisy.h create mode 100644 tpls/qsim/circuit_qsim_parser.h create mode 100644 tpls/qsim/cuda2hip.h create mode 100644 tpls/qsim/expect.h create mode 100644 tpls/qsim/formux.h create mode 100644 tpls/qsim/fuser.h create mode 100644 tpls/qsim/fuser_basic.h create mode 100644 tpls/qsim/fuser_mqubit.h create mode 100644 tpls/qsim/gate.h create mode 100644 tpls/qsim/gate_appl.h create mode 100644 tpls/qsim/gates_cirq.h create mode 100644 tpls/qsim/gates_qsim.h create mode 100644 tpls/qsim/hybrid.h create mode 100644 tpls/qsim/io.h create mode 100644 tpls/qsim/io_file.h create mode 100644 tpls/qsim/matrix.h create mode 100644 tpls/qsim/mps_simulator.h create mode 100644 tpls/qsim/mps_statespace.h create mode 100644 tpls/qsim/parfor.h create mode 100644 tpls/qsim/qtrajectory.h create mode 100644 tpls/qsim/run_qsim.h create mode 100644 tpls/qsim/run_qsimh.h create mode 100644 tpls/qsim/seqfor.h create mode 100644 tpls/qsim/simmux.h create mode 100644 tpls/qsim/simmux_gpu.h create mode 100644 tpls/qsim/simulator.h create mode 100644 tpls/qsim/simulator_avx.h create mode 100644 tpls/qsim/simulator_avx512.h create mode 100644 tpls/qsim/simulator_basic.h create mode 100644 tpls/qsim/simulator_cuda.h create mode 100644 tpls/qsim/simulator_cuda_kernels.h create mode 100644 tpls/qsim/simulator_custatevec.h create mode 100644 tpls/qsim/simulator_sse.h create mode 100644 tpls/qsim/statespace.h create mode 100644 tpls/qsim/statespace_avx.h create mode 100644 tpls/qsim/statespace_avx512.h create mode 100644 tpls/qsim/statespace_basic.h 
create mode 100644 tpls/qsim/statespace_cuda.h create mode 100644 tpls/qsim/statespace_cuda_kernels.h create mode 100644 tpls/qsim/statespace_custatevec.h create mode 100644 tpls/qsim/statespace_sse.h create mode 100644 tpls/qsim/umux.h create mode 100644 tpls/qsim/unitary_calculator_avx.h create mode 100644 tpls/qsim/unitary_calculator_avx512.h create mode 100644 tpls/qsim/unitary_calculator_basic.h create mode 100644 tpls/qsim/unitary_calculator_sse.h create mode 100644 tpls/qsim/unitaryspace.h create mode 100644 tpls/qsim/unitaryspace_avx.h create mode 100644 tpls/qsim/unitaryspace_avx512.h create mode 100644 tpls/qsim/unitaryspace_basic.h create mode 100644 tpls/qsim/unitaryspace_sse.h create mode 100644 tpls/qsim/util.h create mode 100644 tpls/qsim/util_cpu.h create mode 100644 tpls/qsim/util_cuda.h create mode 100644 tpls/qsim/util_custatevec.h create mode 100644 tpls/qsim/vectorspace.h create mode 100644 tpls/qsim/vectorspace_cuda.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 05f3e7d..a536e86 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,12 +36,23 @@ endmacro() option(QIREE_BUILD_DOCS "Build QIR-EE documentation" OFF) option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" OFF) option(QIREE_BUILD_EXAMPLES "Build QIR-EE examples" OFF) -option(QIREE_USE_XACC "Build XACC interface" ON) +option(QIREE_USE_XACC "Build XACC interface" OFF) +option(QIREE_USE_QSIM "Build qsim interface" OFF) qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS}) # Assertion handling option(QIREE_DEBUG "Enable runtime assertions" ON) +# Enforce mutual exclusivity +if(QIREE_USE_XACC) + set(QIREE_USE_QSIM OFF CACHE BOOL "Build qsim interface" FORCE) + message(STATUS "QIREE_USE_XACC is ON, setting QIREE_USE_QSIM to OFF.") +elseif(QIREE_USE_QSIM) + set(QIREE_USE_XACC OFF CACHE BOOL "Build XACC interface" FORCE) + message(STATUS "QIREE_USE_QSIM is ON, setting QIREE_USE_XACC to OFF.") +endif() + + #----------------------------------------------------------------------------# # 
CMAKE INTRINSIC OPTIONS # @@ -174,6 +185,31 @@ if(QIREE_BUILD_TESTS) add_subdirectory(test) endif() +#----------------------------------------------------------------------------# +# OPENMP +#----------------------------------------------------------------------------# + +# Manually set OpenMP flags for macOS with libomp +if(APPLE) + set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/include") + set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/include") + set(OpenMP_C_LIB_NAMES "omp") + set(OpenMP_CXX_LIB_NAMES "omp") + set(OpenMP_omp_LIBRARY "/opt/homebrew/lib/libomp.dylib") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + link_directories("/opt/homebrew/lib") +endif() + +# Now try to find OpenMP +find_package(OpenMP REQUIRED) + +if(OpenMP_FOUND) + message(STATUS "OpenMP found") +else() + message(FATAL_ERROR "OpenMP support is required but was not found.") +endif() + #----------------------------------------------------------------------------# # APPLICATIONS AND BINARIES #----------------------------------------------------------------------------# diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index 5b1939b..fb78caa 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -8,12 +8,14 @@ include(FetchContent) FetchContent_Declare( cli11_proj QUIET - GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git + GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git # Command Line Parser for C++ programs GIT_TAG f4d0731cebb123ff0ace712c099dffbcd2c58e5a # v2.4.1 ) FetchContent_MakeAvailable(cli11_proj) +# Conditionally add XACC-based executable + if(QIREE_USE_XACC) qiree_add_executable(qir-xacc qir-xacc.cc @@ -24,4 +26,50 @@ if(QIREE_USE_XACC) ) endif() +# Conditionally download and configure qsim library + +if(QIREE_USE_QSIM) + FetchContent_Declare( + qsim_lib + GIT_REPOSITORY https://github.com/quantumlib/qsim.git + GIT_TAG master # Use a specific commit/tag if needed 
+ ) + + FetchContent_GetProperties(qsim_lib) + + if(NOT qsim_lib_POPULATED) + FetchContent_MakeAvailable(qsim_lib) + + # Copy header files to tpls/qsim + file(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/tpls/qsim) + message(STATUS "Copying qsim headers to ${CMAKE_SOURCE_DIR}/tpls/qsim") + file(GLOB qsim_headers "${qsim_lib_SOURCE_DIR}/lib/*.h") + file(COPY ${qsim_headers} DESTINATION ${CMAKE_SOURCE_DIR}/tpls/qsim) + endif() + + find_package(OpenMP REQUIRED) + + if(OpenMP_CXX_FOUND) + target_link_libraries(qirqsim PUBLIC OpenMP::OpenMP_CXX) + endif() + # Collect source files for the qsim library + #file(GLOB SRC "${CMAKE_SOURCE_DIR}/src/qirqsim/*.cc") + + # Add qsim library with the correct include directories + + #add_library(qsim SHARED ${SRC}) + #target_include_directories(qsim + # PUBLIC + # ${CMAKE_SOURCE_DIR}/tpls/qsim # qsim headers + # ${CMAKE_SOURCE_DIR}/tpls/qsim/lib # Additional qsim headers if needed + # ) + + # Add the qir-qsim executable and link it with qsim + qiree_add_executable(qir-qsim qir-qsim.cc) + target_link_libraries(qir-qsim + PUBLIC QIREE::qiree QIREE::qirqsim + PRIVATE CLI11::CLI11 + ) +endif() + #-----------------------------------------------------------------------------# diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc new file mode 100644 index 0000000..809b686 --- /dev/null +++ b/app/qir-qsim.cc @@ -0,0 +1,94 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! 
\file qir-qsim/qir-qsim.cc +//---------------------------------------------------------------------------// +#include +#include +#include +#include +#include + +#include "qiree_version.h" +#include "qiree/Executor.hh" +#include "qiree/Module.hh" +#include "qiree/QuantumNotImpl.hh" + +#include "qiree/Executor.hh" +#include "qiree/Module.hh" +#include "qiree/QuantumNotImpl.hh" + +#include "qirqsim/qsimDefaultRuntime.hh" +#include "qirqsim/qsimQuantum.hh" +#include "qirqsim/qsimTupleRuntime.hh" + +using namespace std::string_view_literals; + +namespace qiree +{ +namespace app +{ +void run(std::string const& filename, + int num_shots) + // bool group_tuples = false) +{ + // Load the input + Executor execute{Module{filename}}; + + // Set up qsim + qsimQuantum sim(std::cout, num_shots); + + // Collect the statistics + std::unique_ptr rt; + //if (group_tuples){ + // rt = std::make_unique( + // std::cout, sim); + //} else { + rt = std::make_unique( + std::cout, sim); + //} + + // Execute the module once per shot (num_shots times; default 1) + for (int i = 0; i < num_shots; i++){ + execute(sim, *rt); + } +} + +//---------------------------------------------------------------------------// +} // namespace app +} // namespace qiree + +//---------------------------------------------------------------------------// +/*! + * Execute and run.
+ */ +int main(int argc, char* argv[]) +{ + int num_shots{1}; + std::string filename; + //bool group_tuples{false}; + + CLI::App app; + + auto* filename_opt + = app.add_option("--input,-i,input", filename, "QIR input file"); + filename_opt->required(); + + auto* nshot_opt + = app.add_option("-s,--shots", num_shots, "Number of shots"); + nshot_opt->capture_default_str(); + + //app.add_flag("--group-tuples,!--no-group-tuples", + // group_tuples, + // "Print per-tuple measurement statistics rather than " + // "per-qubit"); + + CLI11_PARSE(app, argc, argv); + + //qiree::app::run(filename, num_shots, group_tuples); + qiree::app::run(filename, num_shots); + + return EXIT_SUCCESS; +} diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake index f363f67..1994269 100644 --- a/cmake/FindLLVM.cmake +++ b/cmake/FindLLVM.cmake @@ -28,6 +28,12 @@ This module will set the following variables if found: include(FindPackageHandleStandardArgs) +# Check if the system is macOS +if(APPLE) + # Set LLVM_DIR to the Homebrew location if using macOS + set(LLVM_DIR "/opt/homebrew/opt/llvm/lib/cmake/llvm" CACHE PATH "Path to LLVM on macOS") +endif() + find_package(LLVM QUIET CONFIG) find_package_handle_standard_args(LLVM CONFIG_MODE) @@ -36,6 +42,8 @@ if(LLVM_FOUND) target_include_directories(LLVM::headers SYSTEM INTERFACE "${LLVM_INCLUDE_DIRS}" ) +else() + message(WARNING "Could not find LLVM. 
Make sure LLVM is installed and LLVM_DIR is set.") endif() #-----------------------------------------------------------------------------# diff --git a/examples/teleport.ll b/examples/teleport.ll index 3bf36e9..184359f 100644 --- a/examples/teleport.ll +++ b/examples/teleport.ll @@ -6,6 +6,7 @@ source_filename = "teleport" define void @main() #0 { entry: + call void @__quantum__qis__x__body(%Qubit* null) call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) call void @__quantum__qis__cnot__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Qubit* inttoptr (i64 2 to %Qubit*)) call void @__quantum__qis__cnot__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*)) @@ -37,6 +38,7 @@ else2: ; preds = %continue continue3: ; preds = %else2, %then1 call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) + %2 = call i2 @__quantum__qis__read_result__body(%Result* inttoptr (i64 2 to %Result*)) call void @__quantum__rt__array_record_output(i64 3, i8* null) call void @__quantum__rt__result_record_output(%Result* null, i8* null) call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) diff --git a/qsim/bits.h b/qsim/bits.h new file mode 100644 index 0000000..080c866 --- /dev/null +++ b/qsim/bits.h @@ -0,0 +1,106 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef BITS_H_ +#define BITS_H_ + +#include + +#ifdef __BMI2__ + +#include + +#include + +namespace qsim { +namespace bits { + +inline uint32_t ExpandBits(uint32_t bits, unsigned n, uint32_t mask) { + return _pdep_u32(bits, mask); +} + +inline uint64_t ExpandBits(uint64_t bits, unsigned n, uint64_t mask) { + return _pdep_u64(bits, mask); +} + +inline uint32_t CompressBits(uint32_t bits, unsigned n, uint32_t mask) { + return _pext_u32(bits, mask); +} + +inline uint64_t CompressBits(uint64_t bits, unsigned n, uint64_t mask) { + return _pext_u64(bits, mask); +} + +} // namespace bits +} // namespace qsim + +#else // __BMI2__ + +namespace qsim { +namespace bits { + +template +inline Integer ExpandBits(Integer bits, unsigned n, Integer mask) { + Integer ebits = 0; + unsigned k = 0; + + for (unsigned i = 0; i < n; ++i) { + if ((mask >> i) & 1) { + ebits |= ((bits >> k) & 1) << i; + ++k; + } + } + + return ebits; +} + +template +inline Integer CompressBits(Integer bits, unsigned n, Integer mask) { + Integer sbits = 0; + unsigned k = 0; + + for (unsigned i = 0; i < n; ++i) { + if ((mask >> i) & 1) { + sbits |= ((bits >> i) & 1) << k; + ++k; + } + } + + return sbits; +} + +} // namespace bits +} // namespace qsim + +#endif // __BMI2__ + +namespace qsim { +namespace bits { + +template +inline Integer PermuteBits( + Integer bits, unsigned n, const std::vector& perm) { + Integer pbits = 0; + + for (unsigned i = 0; i < n; ++i) { + pbits |= ((bits >> i) & 1) << perm[i]; + } + + return pbits; +} + +} // namespace bits +} // namespace qsim + +#endif // BITS_H_ diff --git a/qsim/bitstring.h b/qsim/bitstring.h new file mode 100644 index 0000000..b95584b --- /dev/null +++ b/qsim/bitstring.h @@ -0,0 +1,97 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef BITSTRING_H_ +#define BITSTRING_H_ + +#include +#include +#include +#include + +namespace qsim { + +using Bitstring = uint64_t; + +/** + * Reads bitstrings (representing initialized or measured states of qubits) + * from a provided stream object and stores them in a vector. + * @param num_qubits Number of qubits represented in each bitstring. + * @param provider Source of bitstrings; only used for error reporting. + * @param fs The stream to read bitstrings from. + * @param bitstrings Output vector of bitstrings. On success, this will contain + * all bitstrings read in from 'fs'. + * @return True if reading succeeded; false otherwise. + */ +template +bool BitstringsFromStream(unsigned num_qubits, const std::string& provider, + Stream& fs, std::vector& bitstrings) { + bitstrings.resize(0); + bitstrings.reserve(100000); + + // Bitstrings are in text format. One bitstring per line. + + do { + char buf[128]; + fs.getline(buf, 128); + + if (fs) { + Bitstring b{0}; + + unsigned p = 0; + while (p < 128 && (buf[p] == '0' || buf[p] == '1')) { + b |= uint64_t(buf[p] - '0') << p; + ++p; + } + + if (p != num_qubits) { + IO::errorf("wrong bitstring length in %s: " + "got %u; should be %u.\n", provider.c_str(), p, num_qubits); + bitstrings.resize(0); + return false; + } + + bitstrings.push_back(b); + } + } while (fs); + + return true; +} + +/** + * Reads bitstrings (representing initialized or measured states of qubits) + * from the given file and stores them in a vector. + * @param num_qubits Number of qubits represented in each bitstring. 
+ * @param file The name of the file to read bitstrings from. + * @param bitstrings Output vector of bitstrings. On success, this will contain + * all bitstrings read in from 'file'. + * @return True if reading succeeded; false otherwise. + */ +template +inline bool BitstringsFromFile(unsigned num_qubits, const std::string& file, + std::vector& bitstrings) { + auto fs = IO::StreamFromFile(file); + + if (!fs) { + return false; + } else { + bool rc = BitstringsFromStream(num_qubits, file, fs, bitstrings); + IO::CloseStream(fs); + return rc; + } +} + +} // namespace qsim + +#endif // BITSTRING_H_ diff --git a/qsim/channel.h b/qsim/channel.h new file mode 100644 index 0000000..372a174 --- /dev/null +++ b/qsim/channel.h @@ -0,0 +1,149 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CHANNEL_H_ +#define CHANNEL_H_ + +#include +#include + +#include "gate.h" +#include "matrix.h" + +namespace qsim { + +/** + * Kraus operator. + */ +template +struct KrausOperator { + using fp_type = typename Gate::fp_type; + + enum Kind { + kNormal = 0, + kMeasurement = gate::kMeasurement, + }; + + /** + * Kraus operator type; + */ + Kind kind; + + /** + * If true, the Kraus operator is a unitary operator times a constant. + */ + bool unitary; + + /** + * Lower bound on Kraus operator probability. + */ + double prob; + + /** + * Sequence of operations that represent the Kraus operator. 
This can be just + * one operation. + */ + std::vector ops; + + /** + * Product of K^\dagger and K. This can be empty if unitary = true. + */ + Matrix kd_k; + + /** + * Qubits kd_k acts on. This can be empty if unitary = true. + */ + std::vector qubits; + + /** + * Calculates the product of "K^\dagger K". Sets qubits "K^\dagger K" acts on. + */ + void CalculateKdKMatrix() { + if (ops.size() == 1) { + kd_k = ops[0].matrix; + MatrixDaggerMultiply(ops[0].qubits.size(), ops[0].matrix, kd_k); + qubits = ops[0].qubits; + } else if (ops.size() > 1) { + std::set qubit_map; + + for (const auto& op : ops) { + for (unsigned q : op.qubits) { + qubit_map.insert(q); + } + } + + unsigned num_qubits = qubit_map.size(); + + qubits.resize(0); + qubits.reserve(num_qubits); + + for (auto it = qubit_map.begin(); it != qubit_map.end(); ++it) { + qubits.push_back(*it); + } + + MatrixIdentity(unsigned{1} << num_qubits, kd_k); + + for (const auto& op : ops) { + if (op.qubits.size() == num_qubits) { + MatrixMultiply(num_qubits, op.matrix, kd_k); + } else { + unsigned mask = 0; + + for (auto q : op.qubits) { + for (unsigned i = 0; i < num_qubits; ++i) { + if (q == qubits[i]) { + mask |= unsigned{1} << i; + break; + } + } + } + + MatrixMultiply(mask, op.qubits.size(), op.matrix, num_qubits, kd_k); + } + } + + auto m = kd_k; + MatrixDaggerMultiply(num_qubits, m, kd_k); + } + } +}; + +/** + * Quantum channel. + */ +template +using Channel = std::vector>; + +/** + * Makes a channel from the gate. + * @param time The time to place the channel at. + * @param gate The input gate. + * @return The output channel. + */ +template +Channel MakeChannelFromGate(unsigned time, const Gate& gate) { + auto normal = KrausOperator::kNormal; + auto measurement = KrausOperator::kMeasurement; + + auto kind = gate.kind == gate::kMeasurement ? 
measurement : normal; + + Channel channel = {{kind, true, 1, {gate}}}; + channel[0].ops[0].time = time; + + return channel; +} + +} // namespace qsim + +#endif // CHANNEL_H_ diff --git a/qsim/channels_cirq.h b/qsim/channels_cirq.h new file mode 100644 index 0000000..69f1df9 --- /dev/null +++ b/qsim/channels_cirq.h @@ -0,0 +1,471 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CHANNELS_CIRQ_H_ +#define CHANNELS_CIRQ_H_ + +#include +#include +#include + +#include "channel.h" +#include "gates_cirq.h" + +namespace qsim { + +namespace Cirq { + +template +using Channel = qsim::Channel>; + +/** + * Asymmetric depolarizing channel factory. 
+ */ +template +struct AsymmetricDepolarizingChannel { + static constexpr char name[] = "asymmetric_depolarize"; + + AsymmetricDepolarizingChannel(double p_x, double p_y, double p_z) + : p_x(p_x), p_y(p_y), p_z(p_z) {} + + static Channel Create(unsigned time, unsigned q, + double p_x, double p_y, double p_z) { + double p1 = 1 - p_x - p_y - p_z; + + auto normal = KrausOperator>::kNormal; + + return {{normal, 1, p1, {}}, + {normal, 1, p_x, {X::Create(time, q)}}, + {normal, 1, p_y, {Y::Create(time, q)}}, + {normal, 1, p_z, {Z::Create(time, q)}}}; + } + + static Channel Create(unsigned time, + const std::vector& qubits, + double p_x, double p_y, double p_z) { + double p1 = 1 - p_x - p_y - p_z; + + auto normal = KrausOperator>::kNormal; + + uint64_t size = uint64_t{1} << (2 * qubits.size()); + + Channel channel; + channel.reserve(size); + + for (uint64_t i = 0; i < size; ++i) { + channel.push_back({normal, 1, 0, {}}); + auto& kop = channel.back(); + + kop.ops.reserve(qubits.size()); + + double prob = 1; + + for (unsigned q = 0; q < qubits.size(); ++q) { + unsigned pauli_index = (i >> (2 * q)) & 3; + + switch (pauli_index) { + case 0: + prob *= p1; + break; + case 1: + prob *= p_x; + kop.ops.push_back(X::Create(time, q)); + break; + case 2: + prob *= p_y; + kop.ops.push_back(Y::Create(time, q)); + break; + case 3: + prob *= p_z; + kop.ops.push_back(Z::Create(time, q)); + break; + } + } + + kop.prob = prob; + } + + return channel; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, p_x, p_y, p_z); + } + + Channel Create( + unsigned time, const std::vector& qubits) const { + return Create(time, qubits, p_x, p_y, p_z); + } + + double p_x = 0; + double p_y = 0; + double p_z = 0; +}; + +/** + * Returns an asymmetric depolarizing channel factory object. 
+ */ +template +inline AsymmetricDepolarizingChannel asymmetric_depolarize( + double p_x, double p_y, double p_z) { + return AsymmetricDepolarizingChannel(p_x, p_y, p_z); +} + +/** + * Depolarizing channel factory. + */ +template +struct DepolarizingChannel { + static constexpr char name[] = "depolarize"; + + DepolarizingChannel(double p) : p(p) {} + + static Channel Create(unsigned time, unsigned q, double p) { + double p1 = 1 - p; + double p2 = p / 3; + + auto normal = KrausOperator>::kNormal; + + return {{normal, 1, p1, {}}, + {normal, 1, p2, {X::Create(time, q)}}, + {normal, 1, p2, {Y::Create(time, q)}}, + {normal, 1, p2, {Z::Create(time, q)}}}; + } + + static Channel Create( + unsigned time, const std::vector& qubits, double p) { + double p1 = 1 - p; + double p2 = p / 3; + + auto normal = KrausOperator>::kNormal; + + uint64_t size = uint64_t{1} << (2 * qubits.size()); + + Channel channel; + channel.reserve(size); + + for (uint64_t i = 0; i < size; ++i) { + channel.push_back({normal, 1, 0, {}}); + auto& kop = channel.back(); + + kop.ops.reserve(qubits.size()); + + double prob = 1; + + for (unsigned q = 0; q < qubits.size(); ++q) { + unsigned pauli_index = (i >> (2 * q)) & 3; + + switch (pauli_index) { + case 0: + prob *= p1; + break; + case 1: + prob *= p2; + kop.ops.push_back(X::Create(time, q)); + break; + case 2: + prob *= p2; + kop.ops.push_back(Y::Create(time, q)); + break; + case 3: + prob *= p2; + kop.ops.push_back(Z::Create(time, q)); + break; + } + } + + kop.prob = prob; + } + + return channel; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, p); + } + + Channel Create( + unsigned time, const std::vector& qubits) const { + return Create(time, qubits, p); + } + + double p = 0; +}; + +/** + * Returns a depolarizing channel factory object. + */ +template +inline DepolarizingChannel depolarize(double p) { + return DepolarizingChannel(p); +} + +/** + * Generalized amplitude damping channel factory. 
+ */ +template +struct GeneralizedAmplitudeDampingChannel { + static constexpr char name[] = "generalized_amplitude_damp"; + + GeneralizedAmplitudeDampingChannel(double p, double gamma) + : p(p), gamma(gamma) {} + + static Channel Create( + unsigned time, unsigned q, double p, double gamma) { + double p1 = p * (1 - gamma); + double p2 = (1 - p) * (1 - gamma); + double p3 = 0; + + fp_type t1 = std::sqrt(p); + fp_type r1 = std::sqrt(p * (1 - gamma)); + fp_type s1 = std::sqrt(p * gamma); + fp_type t2 = std::sqrt(1 - p); + fp_type r2 = std::sqrt((1 - p) * (1 - gamma)); + fp_type s2 = std::sqrt((1 - p) * gamma); + + using M = Cirq::MatrixGate1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {t1, 0, 0, 0, 0, 0, r1, 0})}, + {t1 * t1, 0, 0, 0, 0, 0, r1 * r1, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {r2, 0, 0, 0, 0, 0, t2, 0})}, + {r2 * r2, 0, 0, 0, 0, 0, t2 * t2, 0}, {q}, + }, + {normal, 0, p3, + {M::Create(time, q, {0, 0, s1, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, s1 * s1, 0}, {q}, + }, + {normal, 0, p3, + {M::Create(time, q, {0, 0, 0, 0, s2, 0, 0, 0})}, + {s2 * s2, 0, 0, 0, 0, 0, 0, 0}, {q}, + }, + }; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, p, gamma); + } + + double p = 1; + double gamma = 0; +}; + +/** + * Returns a generalized amplitude damping channel factory object. + */ +template +inline GeneralizedAmplitudeDampingChannel generalized_amplitude_damp( + double p, double gamma) { + return GeneralizedAmplitudeDampingChannel(p, gamma); +} + +/** + * Amplitude damping channel factory. 
+ */ +template +struct AmplitudeDampingChannel { + static constexpr char name[] = "amplitude_damp"; + + AmplitudeDampingChannel(double gamma) : gamma(gamma) {} + + static Channel Create(unsigned time, unsigned q, double gamma) { + double p1 = 1 - gamma; + double p2 = 0; + + fp_type r = std::sqrt(p1); + fp_type s = std::sqrt(gamma); + + using M = Cirq::MatrixGate1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, + }; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, gamma); + } + + double gamma = 0; +}; + +/** + * Returns an amplitude damping channel factory object. + */ +template +inline AmplitudeDampingChannel amplitude_damp(double gamma) { + return AmplitudeDampingChannel(gamma); +} + +/** + * Phase damping channel factory. + */ +template +struct PhaseDampingChannel { + static constexpr char name[] = "phase_dump"; + + PhaseDampingChannel(double gamma) : gamma(gamma) {} + + static Channel Create(unsigned time, unsigned q, double gamma) { + double p1 = 1 - gamma; + double p2 = 0; + + fp_type r = std::sqrt(p1); + fp_type s = std::sqrt(gamma); + + using M = Cirq::MatrixGate1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, + }; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, gamma); + } + + double gamma = 0; +}; + +/** + * Returns a phase damping channel factory object. + */ +template +inline PhaseDampingChannel phase_damp(double gamma) { + return PhaseDampingChannel(gamma); +} + +/** + * Reset channel factory. 
+ */ +template +struct ResetChannel { + static constexpr char name[] = "reset"; + + static Channel Create(unsigned time, unsigned q) { + using M = Cirq::MatrixGate1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, 0, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, 0, 0})}, + {1, 0, 0, 0, 0, 0, 0, 0}, {q}, + }, + {normal, 0, 0, + {M::Create(time, q, {0, 0, 1, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, 1, 0}, {q}, + }, + }; + } +}; + +/** + * Returns a reset channel factory object. + */ +template +inline ResetChannel reset() { + return ResetChannel(); +} + +/** + * Phase flip channel factory. + */ +template +struct PhaseFlipChannel { + static constexpr char name[] = "phase_flip"; + + PhaseFlipChannel(double p) : p(p) {} + + static Channel Create(unsigned time, unsigned q, double p) { + double p1 = 1 - p; + double p2 = p; + + auto normal = KrausOperator>::kNormal; + + return {{normal, 1, p1, {}}, + {normal, 1, p2, {Z::Create(time, q)}} + }; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, p); + } + + double p = 0; +}; + +/** + * Returns a phase flip channel factory object. + */ +template +inline PhaseFlipChannel phase_flip(double p) { + return PhaseFlipChannel(p); +} + +/** + * Bit flip channel factory. + */ +template +struct BitFlipChannel { + static constexpr char name[] = "bit_flip"; + + BitFlipChannel(double p) : p(p) {} + + static Channel Create(unsigned time, unsigned q, double p) { + double p1 = 1 - p; + double p2 = p; + + auto normal = KrausOperator>::kNormal; + + return {{normal, 1, p1, {}}, + {normal, 1, p2, {X::Create(time, q)}} + }; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, p); + } + + double p = 0; +}; + +/** + * Returns a bit flip channel factory object. 
+ */ +template +inline BitFlipChannel bit_flip(double p) { + return BitFlipChannel(p); +} + +} // namesapce Cirq + +} // namespace qsim + +#endif // CHANNELS_CIRQ_H_ diff --git a/qsim/channels_qsim.h b/qsim/channels_qsim.h new file mode 100644 index 0000000..5c07bcc --- /dev/null +++ b/qsim/channels_qsim.h @@ -0,0 +1,117 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CHANNELS_QSIM_H_ +#define CHANNELS_QSIM_H_ + +#include +#include +#include + +#include "channel.h" +#include "gates_qsim.h" + +namespace qsim { + +/** + * Amplitude damping channel factory. + */ +template +struct AmplitudeDampingChannel { + AmplitudeDampingChannel(double gamma) : gamma(gamma) {} + + static Channel> Create( + unsigned time, unsigned q, double gamma) { + double p1 = 1 - gamma; + double p2 = 0; + + fp_type r = std::sqrt(p1); + fp_type s = std::sqrt(gamma); + + using M = GateMatrix1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, + }; + } + + Channel> Create(unsigned time, unsigned q) const { + return Create(time, q, gamma); + } + + double gamma = 0; +}; + +/** + * Returns an amplitude damping channel factory object. 
+ */ +template +inline AmplitudeDampingChannel amplitude_damp(double gamma) { + return AmplitudeDampingChannel(gamma); +} + +/** + * Phase damping channel factory. + */ +template +struct PhaseDampingChannel { + PhaseDampingChannel(double gamma) : gamma(gamma) {} + + static Channel> Create( + unsigned time, unsigned q, double gamma) { + double p1 = 1 - gamma; + double p2 = 0; + + fp_type r = std::sqrt(p1); + fp_type s = std::sqrt(gamma); + + using M = GateMatrix1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, + }; + } + + Channel> Create(unsigned time, unsigned q) const { + return Create(time, q, gamma); + } + + double gamma = 0; +}; + +/** + * Returns a phase damping channel factory object. + */ +template +inline PhaseDampingChannel phase_damp(double gamma) { + return PhaseDampingChannel(gamma); +} + +} // namespace qsim + +#endif // CHANNELS_QSIM_H_ diff --git a/qsim/circuit.h b/qsim/circuit.h new file mode 100644 index 0000000..59018ee --- /dev/null +++ b/qsim/circuit.h @@ -0,0 +1,36 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CIRCUIT_H_ +#define CIRCUIT_H_ + +#include + +namespace qsim { + +/** + * A collection of gates. 
This object is consumed by `QSim[h]Runner.Run()`. + */ +template +struct Circuit { + unsigned num_qubits; + /** + * The set of gates to be run. Gate times should be ordered. + */ + std::vector gates; +}; + +} // namespace qsim + +#endif // CIRCUIT_H_ diff --git a/qsim/circuit_noisy.h b/qsim/circuit_noisy.h new file mode 100644 index 0000000..40a228d --- /dev/null +++ b/qsim/circuit_noisy.h @@ -0,0 +1,108 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CIRCUIT_NOISY_H_ +#define CIRCUIT_NOISY_H_ + +#include + +#include "circuit.h" +#include "channel.h" + +namespace qsim { + +/** + * Noisy circuit. + */ +template +struct NoisyCircuit { + unsigned num_qubits; + std::vector> channels; +}; + +template +using ncircuit_iterator = typename std::vector>::const_iterator; + +/** + * Makes a noisy circuit from the clean circuit. + * Channels are added after each qubit of each gate of the clean cicuit. + * Roughly equivalent to cirq.Circuit.with_noise. + * @param num_qubits The number of circuit qubits. + * @param gbeg, gend The iterator range [gbeg, gend) of circuit gates. + * @param A channel factory to construct channels. + * @return The output noisy circuit. 
+ */ +template +inline NoisyCircuit MakeNoisy( + unsigned num_qubits, + typename std::vector::const_iterator gbeg, + typename std::vector::const_iterator gend, + const ChannelFactory& channel_factory) { + NoisyCircuit ncircuit; + + ncircuit.num_qubits = num_qubits; + ncircuit.channels.reserve(4 * std::size_t(gend - gbeg)); + + for (auto it = gbeg; it != gend; ++it) { + const auto& gate = *it; + + ncircuit.channels.push_back(MakeChannelFromGate(2 * gate.time, gate)); + + for (auto q : gate.qubits) { + ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q)); + } + + for (auto q : gate.controlled_by) { + ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q)); + } + } + + return ncircuit; +} + +/** + * Makes a noisy circuit from the clean circuit. + * Channels are added after each qubit of each gate of the clean cicuit. + * Roughly equivalent to cirq.Circuit.with_noise. + * @param num_qubits The number of circuit qubits. + * @param gates The circuit gates. + * @param A channel factory to construct channels. + * @return The output noisy circuit. + */ +template +inline NoisyCircuit MakeNoisy(unsigned num_qubits, + const std::vector& gates, + const ChannelFactory& channel_factory) { + return + MakeNoisy(num_qubits, gates.begin(), gates.end(), channel_factory); +} + +/** + * Makes a noisy circuit from the clean circuit. + * Channels are added after each qubit of each gate of the clean cicuit. + * Roughly equivalent to cirq.Circuit.with_noise. + * @param circuit The input cicuit. + * @param A channel factory to construct channels. + * @return The output noisy circuit. 
+ */ +template +inline NoisyCircuit MakeNoisy(const Circuit& circuit, + const ChannelFactory& channel_factory) { + return MakeNoisy(circuit.num_qubits, circuit.gates.begin(), + circuit.gates.end(), channel_factory); +} + +} // namespace qsim + +#endif // CIRCUIT_NOISY_H_ diff --git a/qsim/circuit_qsim_parser.h b/qsim/circuit_qsim_parser.h new file mode 100644 index 0000000..de7bd89 --- /dev/null +++ b/qsim/circuit_qsim_parser.h @@ -0,0 +1,442 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CIRCUIT_QSIM_PARSER_H_ +#define CIRCUIT_QSIM_PARSER_H_ + +#include +#include +#include +#include +#include + +#include "circuit.h" +#include "gates_qsim.h" + +namespace qsim { + +/** + * Parser for the (deprecated) qsim file input format. + * The primary supported interface for designing circuits to simulate with qsim + * is Cirq, which relies on + * the Python-based qsimcirq interface. For C++ applications, Cirq gates can be + * explicitly constructed in code. + */ +template +class CircuitQsimParser final { + public: + /** + * Parses the given input stream into a Circuit object, following the rules + * defined in "docs/input_format.md". + * @param maxtime Maximum gate "time" to read operations for (inclusive). + * @param provider Circuit source; only used for error reporting. + * @param fs The stream to read the circuit from. + * @param circuit Output circuit object. 
If parsing is successful, this will + * contain the circuit defined in 'fs'. + * @return True if parsing succeeds; false otherwise. + */ + template + static bool FromStream(unsigned maxtime, const std::string& provider, + Stream& fs, Circuit>& circuit) { + circuit.num_qubits = 0; + + circuit.gates.resize(0); + circuit.gates.reserve(1024); + + unsigned k = 0; + + std::string line; + line.reserve(128); + + unsigned time; + std::string gate_name; + gate_name.reserve(16); + + unsigned max_time = 0; + unsigned prev_mea_time = 0; + + std::vector last_times; + + while (std::getline(fs, line)) { + ++k; + + if (line.size() == 0 || line[0] == '#') continue; + + std::stringstream ss(line); + + if (circuit.num_qubits == 0) { + ss >> circuit.num_qubits; + if (circuit.num_qubits == 0) { + IO::errorf("invalid number of qubits in %s in line %u.\n", + provider.c_str(), k); + return false; + } + + last_times.resize(circuit.num_qubits, unsigned(-1)); + + continue; + } + + ss >> time >> gate_name; + + if (!ss) { + InvalidGateError(provider, k); + return false; + } + + if (time > maxtime) { + break; + } + + if (gate_name == "c") { + if (!ParseControlledGate(ss, time, + circuit.num_qubits, circuit.gates)) { + InvalidGateError(provider, k); + return false; + } + } else if (!ParseGate(ss, time, circuit.num_qubits, + gate_name, circuit.gates)) { + InvalidGateError(provider, k); + return false; + } + + const auto& gate = circuit.gates.back(); + + if (time < prev_mea_time + || (gate.kind == gate::kMeasurement && time < max_time)) { + IO::errorf("gate crosses the time boundary set by measurement " + "gates in line %u in %s.\n", k, provider.c_str()); + return false; + } + + if (gate.kind == gate::kMeasurement) { + prev_mea_time = time; + } + + if (GateIsOutOfOrder(time, gate.qubits, last_times) + || GateIsOutOfOrder(time, gate.controlled_by, last_times)) { + IO::errorf("gate is out of time order in line %u in %s.\n", + k, provider.c_str()); + return false; + } + + if (time > max_time) { + 
max_time = time; + } + } + + return true; + } + + /** + * Parses the given file into a Circuit object, following the rules defined + * in "docs/input_format.md". + * @param maxtime Maximum gate "time" to read operations for (inclusive). + * @param file The name of the file to read the circuit from. + * @param circuit Output circuit object. If parsing is successful, this will + * contain the circuit defined in 'file'. + * @return True if parsing succeeds; false otherwise. + */ + template + static bool FromFile(unsigned maxtime, const std::string& file, + Circuit>& circuit) { + auto fs = IO::StreamFromFile(file); + + if (!fs) { + return false; + } else { + bool rc = FromStream(maxtime, file, fs, circuit); + IO::CloseStream(fs); + return rc; + } + } + + private: + static void InvalidGateError(const std::string& provider, unsigned line) { + IO::errorf("invalid gate in %s in line %u.\n", provider.c_str(), line); + } + + /** + * Checks formatting for a zero-qubit gate parsed from 'ss'. + * @param ss Input stream containing the gate specification. + */ + static bool ValidateGate(std::stringstream& ss) { + return ss && ss.peek() == std::stringstream::traits_type::eof(); + } + + /** + * Checks formatting for a single-qubit gate parsed from 'ss'. + * @param ss Input stream containing the gate specification. + * @param num_qubits Number of qubits, as defined at the start of the file. + * @param q0 Index of the affected qubit. + */ + static bool ValidateGate(std::stringstream& ss, + unsigned num_qubits, unsigned q0) { + return ss && ss.peek() == std::stringstream::traits_type::eof() + && q0 < num_qubits; + } + + /** + * Checks formatting for a two-qubit gate parsed from 'ss'. + * @param ss Input stream containing the gate specification. + * @param num_qubits Number of qubits, as defined at the start of the file. + * @param q0 Index of the first affected qubit. + * @param q1 Index of the second affected qubit. 
+ */ + static bool ValidateGate(std::stringstream& ss, + unsigned num_qubits, unsigned q0, unsigned q1) { + return ss && ss.peek() == std::stringstream::traits_type::eof() + && q0 < num_qubits && q1 < num_qubits && q0 != q1; + } + + /** + * Checks formatting for a multiqubit gate parsed from 'ss'. + * @param ss Input stream containing the gate specification. + * @param num_qubits Number of qubits, as defined at the start of the file. + * @param qubits Indices of affected qubits. + */ + static bool ValidateGate(std::stringstream& ss, unsigned num_qubits, + const std::vector& qubits) { + return ss && ValidateQubits(num_qubits, qubits); + } + + static bool ValidateControlledGate( + unsigned num_qubits, const std::vector& qubits, + const std::vector& controlled_by) { + if (!ValidateQubits(num_qubits, controlled_by)) return false; + + std::size_t i = 0, j = 0; + + while (i < qubits.size() && j < controlled_by.size()) { + if (qubits[i] == controlled_by[j]) { + return false; + } else if (qubits[i] < controlled_by[j]) { + ++i; + } else { + ++j; + } + } + + return true; + } + + static bool ValidateQubits(unsigned num_qubits, + const std::vector& qubits) { + if (qubits.size() == 0 || qubits[0] >= num_qubits) return false; + + // qubits should be sorted. 
+ + for (std::size_t i = 1; i < qubits.size(); ++i) { + if (qubits[i] >= num_qubits || qubits[i] == qubits[i - 1]) { + return false; + } + } + + return true; + } + + static bool GateIsOutOfOrder(unsigned time, + const std::vector& qubits, + std::vector& last_times) { + for (auto q : qubits) { + if (last_times[q] != unsigned(-1) && time <= last_times[q]) { + return true; + } + + last_times[q] = time; + } + + return false; + } + + template + static bool ParseGate(Stream& ss, unsigned time, unsigned num_qubits, + const std::string& gate_name, + std::vector& gates) { + unsigned q0, q1; + fp_type phi, theta; + + if (gate_name == "p") { + ss >> phi; + if (!ValidateGate(ss)) return false; + gates.push_back(GateGPh::Create(time, phi)); + } else if (gate_name == "id1") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateId1::Create(time, q0)); + } else if (gate_name == "h") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateHd::Create(time, q0)); + } else if (gate_name == "t") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateT::Create(time, q0)); + } else if (gate_name == "x") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateX::Create(time, q0)); + } else if (gate_name == "y") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateY::Create(time, q0)); + } else if (gate_name == "z") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateZ::Create(time, q0)); + } else if (gate_name == "x_1_2") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateX2::Create(time, q0)); + } else if (gate_name == "y_1_2") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateY2::Create(time, q0)); + } else if (gate_name == "rx") { + ss >> q0 >> phi; + if (!ValidateGate(ss, num_qubits, q0)) 
return false; + gates.push_back(GateRX::Create(time, q0, phi)); + } else if (gate_name == "ry") { + ss >> q0 >> phi; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateRY::Create(time, q0, phi)); + } else if (gate_name == "rz") { + ss >> q0 >> phi; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateRZ::Create(time, q0, phi)); + } else if (gate_name == "rxy") { + ss >> q0 >> theta >> phi; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateRXY::Create(time, q0, theta, phi)); + } else if (gate_name == "hz_1_2") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateHZ2::Create(time, q0)); + } else if (gate_name == "s") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateS::Create(time, q0)); + } else if (gate_name == "id2") { + ss >> q0 >> q1; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateId2::Create(time, q0, q1)); + } else if (gate_name == "cz") { + ss >> q0 >> q1; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateCZ::Create(time, q0, q1)); + } else if (gate_name == "cnot" || gate_name == "cx") { + ss >> q0 >> q1; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateCNot::Create(time, q0, q1)); + } else if (gate_name == "sw") { + ss >> q0 >> q1; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateSwap::Create(time, q0, q1)); + } else if (gate_name == "is") { + ss >> q0 >> q1; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateIS::Create(time, q0, q1)); + } else if (gate_name == "fs") { + ss >> q0 >> q1 >> theta >> phi; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateFS::Create(time, q0, q1, theta, phi)); + } else if (gate_name == "cp") { + ss >> q0 >> q1 >> phi; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + 
gates.push_back(GateCP::Create(time, q0, q1, phi)); + } else if (gate_name == "m") { + std::vector qubits; + qubits.reserve(num_qubits); + + while (ss.good()) { + ss >> q0; + if (ss) { + qubits.push_back(q0); + } else { + return false; + } + } + + gates.push_back(gate::Measurement>::Create( + time, std::move(qubits))); + + if (!ValidateQubits(num_qubits, gates.back().qubits)) return false; + } else { + return false; + } + + return true; + } + + template + static bool ParseControlledGate(Stream& ss, unsigned time, + unsigned num_qubits, + std::vector& gates) { + std::vector controlled_by; + controlled_by.reserve(64); + + std::string gate_name; + gate_name.reserve(16); + + while (1) { + while (ss.good()) { + if (!std::isblank(ss.get())) { + ss.unget(); + break; + } + } + + if (!ss.good()) { + return false; + } + + if (!std::isdigit(ss.peek())) { + break; + } else { + unsigned q; + ss >> q; + + if (!ss.good() || !std::isblank(ss.get())) { + return false; + } + + controlled_by.push_back(q); + } + } + + if (controlled_by.size() == 0) { + return false; + } + + ss >> gate_name; + + if (!ss.good() || !ParseGate(ss, time, + num_qubits, gate_name, gates)) { + return false; + } + + gates.back().ControlledBy(std::move(controlled_by)); + + if (!ValidateControlledGate(num_qubits, gates.back().qubits, + gates.back().controlled_by)) { + return false; + } + + return true; + } +}; + +} // namespace qsim + +#endif // CIRCUIT_QSIM_PARSER_H_ diff --git a/qsim/cuda2hip.h b/qsim/cuda2hip.h new file mode 100644 index 0000000..da2d074 --- /dev/null +++ b/qsim/cuda2hip.h @@ -0,0 +1,61 @@ +// Copyright 2023 Advanced Micro Devices, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_CUDA2HIP_H_ +#define SIMULATOR_CUDA2HIP_H_ + +#define cublasCaxpy hipblasCaxpy +#define cublasCdotc hipblasCdotc +#define cublasCreate hipblasCreate +#define cublasCscal hipblasCscal +#define cublasCsscal hipblasCsscal +#define cublasDestroy hipblasDestroy +#define cublasDznrm2 hipblasDznrm2 +#define cublasHandle_t hipblasHandle_t +#define cublasScnrm2 hipblasScnrm2 +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define cublasStatus_t hipblasStatus_t +#define cublasZaxpy hipblasZaxpy +#define cublasZdotc hipblasZdotc +#define cublasZdscal hipblasZdscal +#define cublasZscal hipblasZscal +#define cuCimagf hipCimagf +#define cuCimag hipCimag +#define cuComplex hipComplex +#define cuCrealf hipCrealf +#define cuCreal hipCreal +#define CUDA_C_32F HIPBLAS_C_32F +#define CUDA_C_64F HIPBLAS_C_64F +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaFree hipFree +#define cudaGetErrorString hipGetErrorString +#define cudaMalloc hipMalloc +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpy hipMemcpy +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemset hipMemset +#define cudaPeekAtLastError hipPeekAtLastError +#define cudaSuccess hipSuccess +#define cuDoubleComplex hipDoubleComplex + +template +__device__ __forceinline__ T __shfl_down_sync( + unsigned mask, T var, unsigned int delta, int width = warpSize) { + return __shfl_down(var, delta, width); +} 
+ +#endif // SIMULATOR_CUDA2HIP_H_ diff --git a/qsim/expect.h b/qsim/expect.h new file mode 100644 index 0000000..518d516 --- /dev/null +++ b/qsim/expect.h @@ -0,0 +1,148 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EXPECT_H_ +#define EXPECT_H_ + +#include + +#include "fuser.h" +#include "gate_appl.h" + +namespace qsim { + +template +struct OpString { + std::complex weight; + std::vector ops; +}; + +/** + * Computes the expectation value of the sum of operator strings (operator + * sequences). Operators can act on any qubits and they can be any supported + * gates. This function uses a temporary state vector. + * @param param Options for gate fusion. + * @param strings Operator strings. + * @param ss StateSpace object required to copy the state vector and compute + * inner products. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param state The state vector of the system. + * @param ket Temporary state vector. + * @return The computed expectation value. 
+ */ +template +std::complex ExpectationValue( + const typename Fuser::Parameter& param, + const std::vector>& strings, + const typename Simulator::StateSpace& state_space, + const Simulator& simulator, const typename Simulator::State& state, + typename Simulator::State& ket) { + std::complex eval = 0; + + if (state_space.IsNull(ket) || ket.num_qubits() < state.num_qubits()) { + ket = state_space.Create(state.num_qubits()); + if (state_space.IsNull(ket)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return eval; + } + } + + for (const auto& str : strings) { + if (str.ops.size() == 0) { + eval += str.weight; + continue; + } + + state_space.Copy(state, ket); + + if (str.ops.size() == 1) { + const auto& op = str.ops[0]; + simulator.ApplyGate(op.qubits, op.matrix.data(), ket); + } else { + auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops); + if (fused_gates.size() == 0) { + eval = 0; + break; + } + + for (const auto& fgate : fused_gates) { + ApplyFusedGate(simulator, fgate, ket); + } + } + + eval += str.weight * state_space.InnerProduct(state, ket); + } + + return eval; +} + +/** + * Computes the expectation value of the sum of operator strings (operator + * sequences). Operators can act on any qubits and they can be any supported + * gates except for user-defined controlled gates. Computation is performed + * in place. No additional memory is allocated. The operator strings should + * act on no more than six qubits and they should be fusible into one gate. + * @param strings Operator strings. + * @param simulator Simulator object. Provides specific implementations for + * computing expectation values. + * @param state The state of the system. + * @return The computed expectation value. 
+ */ +template +std::complex ExpectationValue( + const std::vector>& strings, + const Simulator& simulator, const typename Simulator::State& state) { + std::complex eval = 0; + + typename Fuser::Parameter param; + param.max_fused_size = 6; + for (const auto& str : strings) { + if (str.ops.size() == 0) { + eval += str.weight; + } else if (str.ops.size() == 1) { + const auto& op = str.ops[0]; + auto r = simulator.ExpectationValue(op.qubits, op.matrix.data(), state); + eval += str.weight * r; + } else { + auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops); + + if (fused_gates.size() != 1) { + IO::errorf("too many fused gates; " + "cannot compute the expectation value.\n"); + eval = 0; + break; + } + + const auto& fgate = fused_gates[0]; + + if (fgate.qubits.size() > 6) { + IO::errorf("operator string acts on too many qubits; " + "cannot compute the expectation value.\n"); + eval = 0; + break; + } + + auto r = simulator.ExpectationValue( + fgate.qubits, fgate.matrix.data(), state); + eval += str.weight * r; + } + } + + return eval; +} + +} // namespace qsim + +#endif // EXPECT_H_ diff --git a/qsim/formux.h b/qsim/formux.h new file mode 100644 index 0000000..4401e9b --- /dev/null +++ b/qsim/formux.h @@ -0,0 +1,30 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef FORMUX_H_ +#define FORMUX_H_ + +#ifdef _OPENMP +# include "parfor.h" + namespace qsim { + using For = ParallelFor; + } +#else +# include "seqfor.h" + namespace qsim { + using For = SequentialFor; + } +#endif + +#endif // FORMUX_H_ diff --git a/qsim/fuser.h b/qsim/fuser.h new file mode 100644 index 0000000..e4f3c3b --- /dev/null +++ b/qsim/fuser.h @@ -0,0 +1,225 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef FUSER_H_ +#define FUSER_H_ + +#include +#include + +#include "gate.h" +#include "matrix.h" + +namespace qsim { + +/** + * A collection of "fused" gates which can be multiplied together before being + * applied to the state vector. + */ +template +struct GateFused { + /** + * Kind of the first ("parent") gate. + */ + typename Gate::GateKind kind; + /** + * The time index of the first ("parent") gate. + */ + unsigned time; + /** + * A list of qubits these gates act upon. Control qubits for + * explicitly-controlled gates are excluded from this list. + */ + std::vector qubits; + /** + * Pointer to the first ("parent") gate. + */ + const Gate* parent; + /** + * Ordered list of component gates. + */ + std::vector gates; + /** + * Fused gate matrix. + */ + Matrix matrix; +}; + +/** + * A base class for fuser classes with some common functions. 
+ */ +template +class Fuser { + protected: + using RGate = typename std::remove_pointer::type; + + static const RGate& GateToConstRef(const RGate& gate) { + return gate; + } + + static const RGate& GateToConstRef(const RGate* gate) { + return *gate; + } + + static std::vector MergeWithMeasurementTimes( + typename std::vector::const_iterator gfirst, + typename std::vector::const_iterator glast, + const std::vector& times) { + std::vector epochs; + epochs.reserve(glast - gfirst + times.size()); + + std::size_t last = 0; + unsigned max_time = 0; + + for (auto gate_it = gfirst; gate_it < glast; ++gate_it) { + const auto& gate = GateToConstRef(*gate_it); + + if (gate.time > max_time) { + max_time = gate.time; + } + + if (epochs.size() > 0 && gate.time < epochs.back()) { + IO::errorf("gate crosses the time boundary.\n"); + epochs.resize(0); + return epochs; + } + + if (gate.kind == gate::kMeasurement) { + if (epochs.size() == 0 || epochs.back() < gate.time) { + if (!AddBoundary(gate.time, max_time, epochs)) { + epochs.resize(0); + return epochs; + } + } + } + + while (last < times.size() && times[last] <= gate.time) { + unsigned prev = times[last++]; + epochs.push_back(prev); + if (!AddBoundary(prev, max_time, epochs)) { + epochs.resize(0); + return epochs; + } + while (last < times.size() && times[last] <= prev) ++last; + } + } + + if (epochs.size() == 0 || epochs.back() < max_time) { + epochs.push_back(max_time); + } + + return epochs; + } + + template + static void FuseZeroQubitGates(const GateSeq0& gate_seq0, + Parent parent, std::size_t first, + std::vector& fused_gates) { + GateFused* fuse_to = nullptr; + + for (std::size_t i = first; i < fused_gates.size(); ++i) { + auto& fgate = fused_gates[i]; + + if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp + && fgate.parent->controlled_by.size() == 0 + && !fgate.parent->unfusible) { + fuse_to = &fgate; + break; + } + } + + if (fuse_to != nullptr) { + // Fuse zero-qubit gates with the first available 
fused gate. + for (const auto& g : gate_seq0) { + fuse_to->gates.push_back(parent(g)); + } + } else { + auto g0 = parent(gate_seq0[0]); + fused_gates.push_back({g0->kind, g0->time, {}, g0, {g0}, {}}); + + for (std::size_t i = 1; i < gate_seq0.size(); ++i) { + fused_gates.back().gates.push_back(parent(gate_seq0[i])); + } + } + } + + private: + static bool AddBoundary(unsigned time, unsigned max_time, + std::vector& boundaries) { + if (max_time > time) { + IO::errorf("gate crosses the time boundary.\n"); + return false; + } + + boundaries.push_back(time); + return true; + } +}; + +/** + * Multiplies component gate matrices of a fused gate. + * @param gate Fused gate. + */ +template +inline void CalculateFusedMatrix(FusedGate& gate) { + MatrixIdentity(unsigned{1} << gate.qubits.size(), gate.matrix); + + for (auto pgate : gate.gates) { + if (pgate->qubits.size() == 0) { + MatrixScalarMultiply(pgate->matrix[0], pgate->matrix[1], gate.matrix); + } else if (gate.qubits.size() == pgate->qubits.size()) { + MatrixMultiply(gate.qubits.size(), pgate->matrix, gate.matrix); + } else { + unsigned mask = 0; + + for (auto q : pgate->qubits) { + for (std::size_t i = 0; i < gate.qubits.size(); ++i) { + if (q == gate.qubits[i]) { + mask |= unsigned{1} << i; + break; + } + } + } + + MatrixMultiply(mask, pgate->qubits.size(), pgate->matrix, + gate.qubits.size(), gate.matrix); + } + } +} + +/** + * Multiplies component gate matrices for a range of fused gates. + * @param gbeg, gend The iterator range [gbeg, gend) of fused gates. + */ +template +inline void CalculateFusedMatrices(Iterator gbeg, Iterator gend) { + for (auto g = gbeg; g != gend; ++g) { + if (g->kind != gate::kMeasurement) { + CalculateFusedMatrix(*g); + } + } +} + +/** + * Multiplies component gate matrices for a vector of fused gates. + * @param gates The vector of fused gates. 
+ */ +template +inline void CalculateFusedMatrices(std::vector& gates) { + CalculateFusedMatrices(gates.begin(), gates.end()); +} + +} // namespace qsim + +#endif // FUSER_H_ diff --git a/qsim/fuser_basic.h b/qsim/fuser_basic.h new file mode 100644 index 0000000..3191bd2 --- /dev/null +++ b/qsim/fuser_basic.h @@ -0,0 +1,411 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef FUSER_BASIC_H_ +#define FUSER_BASIC_H_ + +#include +#include +#include +#include + +#include "gate.h" +#include "fuser.h" + +namespace qsim { + +/** + * Stateless object with methods for aggregating `Gate`s into `GateFused`. + * Measurement gates with equal times are fused together. + * User-defined controlled gates (controlled_by.size() > 0) and gates acting on + * more than two qubits are not fused. + * The template parameter Gate can be Gate type or a pointer to Gate type. + * This class is deprecated. It is recommended to use MultiQubitGateFuser + * from fuser_mqubit.h. + */ +template +class BasicGateFuser final : public Fuser { + private: + using Base = Fuser; + using RGate = typename Base::RGate; + + public: + using GateFused = qsim::GateFused; + + /** + * User-specified parameters for gate fusion. + * BasicGateFuser does not use any parameters. + */ + struct Parameter { + unsigned verbosity = 0; + }; + + /** + * Stores sets of gates that can be applied together. Only one- and + * two-qubit gates will get fused. 
To respect specific time boundaries while + * fusing gates, use the other version of this method below. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gates The gates (or pointers to the gates) to be fused. + * Gate times of the gates that act on the same qubits should be ordered. + * Gates that are out of time order should not cross the time boundaries + * set by measurement gates. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates(const Parameter& param, + unsigned max_qubit1, + const std::vector& gates, + bool fuse_matrix = true) { + return FuseGates( + param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. Only one- and + * two-qubit gates will get fused. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gates The gates (or pointers to the gates) to be fused. + * Gate times of the gates that act on the same qubits should be ordered. + * Gates that are out of time order should not cross the time boundaries + * set by `times_to_split_at` or by measurement gates. + * @param times_to_split_at Ordered list of time steps (boundaries) at which + * to separate fused gates. Each element of the output will contain gates + * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. 
+ */ + static std::vector FuseGates( + const Parameter& param, + unsigned max_qubit1, const std::vector& gates, + const std::vector& times_to_split_at, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(), + times_to_split_at, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. Only one- and + * two-qubit gates will get fused. To respect specific time boundaries while + * fusing gates, use the other version of this method below. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates + * (or pointers to gates) in. Gate times of the gates that act on the same + * qubits should be ordered. Gates that are out of time order should not + * cross the time boundaries set by measurement gates. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates( + const Parameter& param, unsigned max_qubit1, + typename std::vector::const_iterator gfirst, + typename std::vector::const_iterator glast, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. Only one- and + * two-qubit gates will get fused. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates + * (or pointers to gates) in. Gate times of the gates that act on the same + * qubits should be ordered. Gates that are out of time order should not + * cross the time boundaries set by `times_to_split_at` or by measurement + * gates. 
+ * @param times_to_split_at Ordered list of time steps (boundaries) at which + * to separate fused gates. Each element of the output will contain gates + * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates( + const Parameter& param, unsigned max_qubit1, + typename std::vector::const_iterator gfirst, + typename std::vector::const_iterator glast, + const std::vector& times_to_split_at, + bool fuse_matrix = true) { + std::vector gates_fused; + + if (gfirst >= glast) return gates_fused; + + std::size_t num_gates = glast - gfirst; + + gates_fused.reserve(num_gates); + + // Merge with measurement gate times to separate fused gates at. + auto times = + Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at); + + // Map to keep track of measurement gates with equal times. + std::map> measurement_gates; + + // Sequence of top level gates the other gates get fused to. + std::vector gates_seq; + + // Sequence of zero-qubit gates. + std::vector gates_seq0; + + // Lattice of gates: qubits "hyperplane" and time direction. + std::vector> gates_lat(max_qubit1); + + // Current unfused gate. + auto gate_it = gfirst; + + std::size_t last_fused_gate_index = 0; + + for (std::size_t l = 0; l < times.size(); ++l) { + gates_seq.resize(0); + gates_seq.reserve(num_gates); + + gates_seq0.resize(0); + gates_seq0.reserve(num_gates); + + for (unsigned k = 0; k < max_qubit1; ++k) { + gates_lat[k].resize(0); + gates_lat[k].reserve(128); + } + + // Fill gates_seq and gates_lat in. 
+ for (; gate_it < glast; ++gate_it) { + const auto& gate = Base::GateToConstRef(*gate_it); + + if (gate.time > times[l]) break; + + if (!ValidateGate(gate, max_qubit1, gates_lat)) { + gates_fused.resize(0); + return gates_fused; + } + + if (gate.kind == gate::kMeasurement) { + auto& mea_gates_at_time = measurement_gates[gate.time]; + if (mea_gates_at_time.size() == 0) { + gates_seq.push_back(&gate); + mea_gates_at_time.reserve(max_qubit1); + } + + mea_gates_at_time.push_back(&gate); + } else if (gate.controlled_by.size() > 0 || gate.qubits.size() > 2) { + for (auto q : gate.qubits) { + gates_lat[q].push_back(&gate); + } + for (auto q : gate.controlled_by) { + gates_lat[q].push_back(&gate); + } + gates_seq.push_back(&gate); + } else if (gate.qubits.size() == 1) { + gates_lat[gate.qubits[0]].push_back(&gate); + if (gate.unfusible) { + gates_seq.push_back(&gate); + } + } else if (gate.qubits.size() == 2) { + gates_lat[gate.qubits[0]].push_back(&gate); + gates_lat[gate.qubits[1]].push_back(&gate); + gates_seq.push_back(&gate); + } else { + gates_seq0.push_back(&gate); + } + } + + std::vector last(max_qubit1, 0); + + const RGate* delayed_measurement_gate = nullptr; + + // Fuse gates. + for (auto pgate : gates_seq) { + if (pgate->kind == gate::kMeasurement) { + delayed_measurement_gate = pgate; + } else if (pgate->qubits.size() > 2 + || pgate->controlled_by.size() > 0) { + // Multi-qubit or controlled gate. 
+ + for (auto q : pgate->qubits) { + unsigned l = last[q]; + if (gates_lat[q][l] != pgate) { + last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused); + } + ++last[q]; + } + + for (auto q : pgate->controlled_by) { + unsigned l = last[q]; + if (gates_lat[q][l] != pgate) { + last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused); + } + ++last[q]; + } + + gates_fused.push_back({pgate->kind, pgate->time, pgate->qubits, + pgate, {pgate}, {}}); + } else if (pgate->qubits.size() == 1) { + unsigned q0 = pgate->qubits[0]; + + GateFused gate_f = {pgate->kind, pgate->time, {q0}, pgate, {}, {}}; + + last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates); + gate_f.gates.push_back(gates_lat[q0][last[q0]]); + last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates); + + gates_fused.push_back(std::move(gate_f)); + } else if (pgate->qubits.size() == 2) { + unsigned q0 = pgate->qubits[0]; + unsigned q1 = pgate->qubits[1]; + + if (Done(last[q0], pgate->time, gates_lat[q0])) continue; + + GateFused gate_f = + {pgate->kind, pgate->time, {q0, q1}, pgate, {}, {}}; + + do { + last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates); + last[q1] = Advance(last[q1], gates_lat[q1], gate_f.gates); + // Here gates_lat[q0][last[q0]] == gates_lat[q1][last[q1]]. + + gate_f.gates.push_back(gates_lat[q0][last[q0]]); + + last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates); + last[q1] = Advance(last[q1] + 1, gates_lat[q1], gate_f.gates); + } while (NextGate(last[q0], gates_lat[q0], last[q1], gates_lat[q1])); + + gates_fused.push_back(std::move(gate_f)); + } + } + + for (unsigned q = 0; q < max_qubit1; ++q) { + auto l = last[q]; + if (l == gates_lat[q].size()) continue; + + // Orphaned qubit. 
+ AddOrphanedQubit(q, l, gates_lat, gates_fused); + } + + if (delayed_measurement_gate != nullptr) { + auto pgate = delayed_measurement_gate; + + const auto& mea_gates_at_time = measurement_gates[pgate->time]; + + GateFused gate_f = {pgate->kind, pgate->time, {}, pgate, {}, {}}; + gate_f.gates.reserve(mea_gates_at_time.size()); + + // Fuse measurement gates with equal times. + + for (const auto* pgate : mea_gates_at_time) { + gate_f.qubits.insert(gate_f.qubits.end(), + pgate->qubits.begin(), pgate->qubits.end()); + gate_f.gates.push_back(pgate); + } + + gates_fused.push_back(std::move(gate_f)); + } + + if (gates_seq0.size() != 0) { + Base::FuseZeroQubitGates(gates_seq0, [](const RGate* g) { return g; }, + last_fused_gate_index, gates_fused); + } + + if (gate_it == glast) break; + + last_fused_gate_index = gates_fused.size(); + } + + if (fuse_matrix) { + for (auto& gate_f : gates_fused) { + if (gate_f.kind != gate::kMeasurement && gate_f.kind != gate::kDecomp) { + CalculateFusedMatrix(gate_f); + } + } + } + + return gates_fused; + } + + private: + static unsigned Advance(unsigned k, const std::vector& wl, + std::vector& gates) { + while (k < wl.size() && wl[k]->qubits.size() == 1 + && wl[k]->controlled_by.size() == 0 && !wl[k]->unfusible) { + gates.push_back(wl[k++]); + } + + return k; + } + + static bool Done( + unsigned k, unsigned t, const std::vector& wl) { + return k >= wl.size() || wl[k]->time > t; + } + + static bool NextGate(unsigned k1, const std::vector& wl1, + unsigned k2, const std::vector& wl2) { + return k1 < wl1.size() && k2 < wl2.size() && wl1[k1] == wl2[k2] + && wl1[k1]->qubits.size() < 3 && wl1[k1]->controlled_by.size() == 0; + } + + template + static unsigned AddOrphanedQubit(unsigned q, unsigned k, + const GatesLat& gates_lat, + std::vector& gates_fused) { + auto pgate = gates_lat[q][k]; + + GateFused gate_f = {pgate->kind, pgate->time, {q}, pgate, {}, {}}; + gate_f.gates.push_back(pgate); + + k = Advance(k + 1, gates_lat[q], gate_f.gates); + + 
gates_fused.push_back(std::move(gate_f)); + + return k; + } + + template + static bool ValidateGate(const Gate2& gate, unsigned max_qubit1, + const GatesLat& gates_lat) { + for (unsigned q : gate.qubits) { + if (q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return false; + } + if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; + } + } + + for (unsigned q : gate.controlled_by) { + if (q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return false; + } + if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; + } + } + + return true; + } +}; + +} // namespace qsim + +#endif // FUSER_BASIC_H_ diff --git a/qsim/fuser_mqubit.h b/qsim/fuser_mqubit.h new file mode 100644 index 0000000..c75b1a0 --- /dev/null +++ b/qsim/fuser_mqubit.h @@ -0,0 +1,1095 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef FUSER_MQUBIT_H_ +#define FUSER_MQUBIT_H_ + +#include +#include +#include +#include +#include +#include + +#include "gate.h" +#include "fuser.h" + +namespace qsim { + +/** + * Multi-qubit gate fuser. 
+ * Measurement gates with equal times are fused together. + * User-defined controlled gates (controlled_by.size() > 0) are not fused. + * The template parameter Gate can be Gate type or a pointer to Gate type. + */ +template +class MultiQubitGateFuser final : public Fuser { + private: + using Base = Fuser; + using RGate = typename Base::RGate; + + // Auxillary classes and structs. + + // Manages doubly-linked lists. + template + class LinkManagerT { + public: + struct Link { + T val; + Link* next; + Link* prev; + }; + + explicit LinkManagerT(uint64_t size) { + links_.reserve(size); + } + + Link* AddBack(const T& t, Link* link) { + if (link == nullptr) { + links_.push_back({t, nullptr, nullptr}); + } else { + links_.push_back({t, link->next, link}); + link->next = &links_.back(); + } + + return &links_.back(); + } + + static void Delete(const Link* link) { + if (link->prev != nullptr) { + link->prev->next = link->next; + } + if (link->next != nullptr) { + link->next->prev = link->prev; + } + } + + private: + std::vector links_; + }; + + struct GateF; + + using LinkManager = LinkManagerT; + using Link = typename LinkManager::Link; + + // Intermediate representation of a fused gate. + struct GateF { + const RGate* parent; + std::vector qubits; + std::vector gates; // Gates that get fused to this gate. + std::vector links; // Gate "lattice" links. + uint64_t mask; // Qubit mask. + unsigned visited; + }; + + // Possible values for visited in GateF. + // Note that MakeGateSequence assignes values from kSecond to the number of + // gates in the sequence plus one, see below. + enum Visited { + kZero = 0, // Start value for "normal" gates. + kFirst = 1, // Value after the first pass for partially fused + // "normal" gates. + kSecond = 2, // Start value to assign values in MakeGateSequence. + kCompress = 99999997, // Used to compress links. + kMeaCnt = 99999998, // Start value for controlled or measurement gates. 
+ kFinal = 99999999, // Value after the second pass for fused "normal" + // gates or for controlled and measurement gates. + }; + + struct Stat { + unsigned num_mea_gates = 0; + unsigned num_fused_mea_gates = 0; + unsigned num_fused_gates = 0; + unsigned num_controlled_gates = 0; + std::vector num_gates; + }; + + // Gate that is added to a sequence of gates to fuse together. + struct GateA { + GateF* gate; + std::vector qubits; // Added qubits. + std::vector links; // Added lattice links. + }; + + struct Scratch { + std::vector data; + std::vector prev1; + std::vector prev2; + std::vector next1; + std::vector next2; + std::vector longest_seq; + std::vector stack; + std::vector gates; + unsigned count = 0; + }; + + public: + using GateFused = qsim::GateFused; + + /** + * User-specified parameters for gate fusion. + */ + struct Parameter { + /** + * Maximum number of qubits in a fused gate. It can take values from 2 to + * 6 (0 and 1 are equivalent to 2). It is not recommended to use 5 or 6 as + * that might degrade performance for not very fast machines. + */ + unsigned max_fused_size = 2; + unsigned verbosity = 0; + }; + + /** + * Stores sets of gates that can be applied together. To respect specific + * time boundaries while fusing gates, use the other version of this method + * below. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gates The gates (or pointers to the gates) to be fused. + * Gate times of the gates that act on the same qubits should be ordered. + * Gates that are out of time order should not cross the time boundaries + * set by measurement gates. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. 
+ */ + static std::vector FuseGates(const Parameter& param, + unsigned max_qubit1, + const std::vector& gates, + bool fuse_matrix = true) { + return FuseGates( + param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gates The gates (or pointers to the gates) to be fused. + * Gate times of the gates that act on the same qubits should be ordered. + * Gates that are out of time order should not cross the time boundaries + * set by `times_to_split_at` or by measurement gates. + * @param times_to_split_at Ordered list of time steps (boundaries) at which + * to separate fused gates. Each element of the output will contain gates + * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates( + const Parameter& param, + unsigned max_qubit1, const std::vector& gates, + const std::vector& times_to_split_at, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(), + times_to_split_at, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. To respect specific + * time boundaries while fusing gates, use the other version of this method + * below. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates + * (or pointers to gates) in. Gate times of the gates that act on the same + * qubits should be ordered. Gates that are out of time order should not + * cross the time boundaries set by measurement gates. 
+ * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates( + const Parameter& param, unsigned max_qubit1, + typename std::vector::const_iterator gfirst, + typename std::vector::const_iterator glast, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates + * (or pointers to gates) in. Gate times of the gates that act on the same + * qubits should be ordered. Gates that are out of time order should not + * cross the time boundaries set by `times_to_split_at` or by measurement + * gates. + * @param times_to_split_at Ordered list of time steps (boundaries) at which + * to separate fused gates. Each element of the output will contain gates + * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates( + const Parameter& param, unsigned max_qubit1, + typename std::vector::const_iterator gfirst, + typename std::vector::const_iterator glast, + const std::vector& times_to_split_at, + bool fuse_matrix = true) { + std::vector fused_gates; + + if (gfirst >= glast) return fused_gates; + + std::size_t num_gates = glast - gfirst; + + fused_gates.reserve(num_gates); + + // Merge with measurement gate times to separate fused gates at. 
+ auto epochs = + Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at); + + LinkManager link_manager(max_qubit1 * num_gates); + + // Auxillary data structures. + // Sequence of intermediate fused gates. + std::vector gates_seq; + // Gate "lattice". + std::vector gates_lat; + // Sequences of intermediate fused gates ordered by gate size. + std::vector> fgates(max_qubit1 + 1); + + gates_seq.reserve(num_gates); + gates_lat.reserve(max_qubit1); + + Scratch scratch; + + scratch.data.reserve(1024); + scratch.prev1.reserve(32); + scratch.prev2.reserve(32); + scratch.next1.reserve(32); + scratch.next2.reserve(32); + scratch.longest_seq.reserve(8); + scratch.stack.reserve(8); + + Stat stat; + stat.num_gates.resize(max_qubit1 + 1, 0); + + unsigned max_fused_size = std::min(unsigned{6}, param.max_fused_size); + max_fused_size = std::min(max_fused_size, max_qubit1); + + std::size_t last_fused_gate_index = 0; + auto gate_it = gfirst; + + // Iterate over epochs. + for (std::size_t l = 0; l < epochs.size(); ++l) { + gates_seq.resize(0); + gates_lat.resize(0); + gates_lat.resize(max_qubit1, nullptr); + + for (unsigned i = 0; i <= max_qubit1; ++i) { + fgates[i].resize(0); + } + + uint64_t max_gate_size = 0; + GateF* last_mea_gate = nullptr; + + // Iterate over input gates. + for (; gate_it < glast; ++gate_it) { + const auto& gate = Base::GateToConstRef(*gate_it); + + if (gate.time > epochs[l]) break; + + if (!ValidateGate(gate, max_qubit1, gates_lat)) { + fused_gates.resize(0); + return fused_gates; + } + + // Fill in auxillary data structures. + + if (gate.kind == gate::kMeasurement) { + // Measurement gate. 
+ + if (last_mea_gate == nullptr + || last_mea_gate->parent->time != gate.time) { + gates_seq.push_back({&gate, {}, {}, {}, 0, kMeaCnt}); + last_mea_gate = &gates_seq.back(); + + last_mea_gate->qubits.reserve(max_qubit1); + last_mea_gate->links.reserve(max_qubit1); + + ++stat.num_fused_mea_gates; + } + + for (auto q : gate.qubits) { + last_mea_gate->qubits.push_back(q); + last_mea_gate->mask |= uint64_t{1} << q; + gates_lat[q] = link_manager.AddBack(last_mea_gate, gates_lat[q]); + last_mea_gate->links.push_back(gates_lat[q]); + } + + last_mea_gate->gates.push_back(&gate); + + ++stat.num_mea_gates; + } else { + gates_seq.push_back({&gate, {}, {}, {}, 0, kZero}); + auto& fgate = gates_seq.back(); + + if (gate.controlled_by.size() == 0) { + if (max_gate_size < gate.qubits.size()) { + max_gate_size = gate.qubits.size(); + } + + unsigned num_gate_qubits = gate.qubits.size(); + unsigned size = std::max(max_fused_size, num_gate_qubits); + + fgate.qubits.reserve(size); + fgate.links.reserve(size); + fgate.gates.reserve(4 * size); + fgate.links.reserve(size); + + if (fgates[num_gate_qubits].empty()) { + fgates[num_gate_qubits].reserve(num_gates); + } + fgates[num_gate_qubits].push_back(&fgate); + + ++stat.num_gates[num_gate_qubits]; + } else { + // Controlled gate. + // Controlled gates are not fused with other gates. 
+ + uint64_t size = gate.qubits.size() + gate.controlled_by.size(); + + fgate.qubits.reserve(gate.qubits.size()); + fgate.links.reserve(size); + + fgate.visited = kMeaCnt; + fgate.gates.push_back(&gate); + + ++stat.num_controlled_gates; + } + + for (auto q : gate.qubits) { + fgate.qubits.push_back(q); + fgate.mask |= uint64_t{1} << q; + gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]); + fgate.links.push_back(gates_lat[q]); + } + + for (auto q : gate.controlled_by) { + fgate.mask |= uint64_t{1} << q; + gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]); + fgate.links.push_back(gates_lat[q]); + } + } + } + + // Fuse large gates with smaller gates. + FuseGates(max_gate_size, fgates); + + if (max_fused_size > 2) { + FuseGateSequences( + max_fused_size, max_qubit1, scratch, gates_seq, stat, fused_gates); + } else { + unsigned prev_time = 0; + + std::vector orphaned_gates; + orphaned_gates.reserve(max_qubit1); + + for (auto& fgate : gates_seq) { + if (fgate.gates.size() == 0) continue; + + if (prev_time != fgate.parent->time) { + if (orphaned_gates.size() > 0) { + FuseOrphanedGates( + max_fused_size, stat, orphaned_gates, fused_gates); + orphaned_gates.resize(0); + } + + prev_time = fgate.parent->time; + } + + if (fgate.qubits.size() == 1 && max_fused_size > 1 + && fgate.visited != kMeaCnt && !fgate.parent->unfusible) { + orphaned_gates.push_back(&fgate); + continue; + } + + // Assume fgate.qubits (gate.qubits) are sorted. 
+ fused_gates.push_back({fgate.parent->kind, fgate.parent->time, + std::move(fgate.qubits), fgate.parent, + std::move(fgate.gates), {}}); + + if (fgate.visited != kMeaCnt) { + ++stat.num_fused_gates; + } + } + + if (orphaned_gates.size() > 0) { + FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); + } + } + + if (fgates[0].size() != 0) { + Base::FuseZeroQubitGates(fgates[0], + [](const GateF* g) { return g->parent; }, + last_fused_gate_index, fused_gates); + } + + last_fused_gate_index = fused_gates.size(); + } + + if (fuse_matrix) { + for (auto& fgate : fused_gates) { + if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp) { + CalculateFusedMatrix(fgate); + } + } + } + + PrintStat(param.verbosity, stat, fused_gates); + + return fused_gates; + } + + private: + // Fuse large gates with smaller gates. + static void FuseGates(uint64_t max_gate_size, + std::vector>& fgates) { + // Traverse gates in order of decreasing size. + for (uint64_t i = 0; i < max_gate_size; ++i) { + std::size_t pos = 0; + + for (auto fgate : fgates[max_gate_size - i]) { + if (fgate->visited > kZero) continue; + + fgates[max_gate_size - i][pos++] = fgate; + + fgate->visited = kFirst; + + FusePrev(0, *fgate); + fgate->gates.push_back(fgate->parent); + FuseNext(0, *fgate); + } + + fgates[max_gate_size - i].resize(pos); + } + } + + // Try to fuse gate sequences as follows. Gate time goes from bottom to top. + // Gates are fused either from left to right or from right to left. 
+ // + // max_fused_size = 3: _- or -_ + // + // max_fused_size = 4: _-_ + // + // max_fused_size = 5: _-_- or -_-_ + // + // max_fused_size = 6: _-_-_ + static void FuseGateSequences(unsigned max_fused_size, + unsigned max_qubit1, Scratch& scratch, + std::vector& gates_seq, Stat& stat, + std::vector& fused_gates) { + unsigned prev_time = 0; + + std::vector orphaned_gates; + orphaned_gates.reserve(max_qubit1); + + for (auto& fgate : gates_seq) { + if (prev_time != fgate.parent->time) { + if (orphaned_gates.size() > 0) { + FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); + orphaned_gates.resize(0); + } + + prev_time = fgate.parent->time; + } + + if (fgate.visited == kFinal || fgate.gates.size() == 0) continue; + + if (fgate.visited == kMeaCnt || fgate.qubits.size() >= max_fused_size + || fgate.parent->unfusible) { + if (fgate.visited != kMeaCnt) { + ++stat.num_fused_gates; + } + + fgate.visited = kFinal; + + fused_gates.push_back({fgate.parent->kind, fgate.parent->time, + std::move(fgate.qubits), fgate.parent, + std::move(fgate.gates), {}}); + + continue; + } + + + if (fgate.qubits.size() == 1 && max_fused_size > 1) { + orphaned_gates.push_back(&fgate); + continue; + } + + scratch.data.resize(0); + scratch.gates.resize(0); + scratch.count = 0; + + MakeGateSequence(max_fused_size, scratch, fgate); + + if (scratch.gates.size() == 0) { + orphaned_gates.push_back(&fgate); + } else { + for (auto fgate : scratch.gates) { + std::sort(fgate->qubits.begin(), fgate->qubits.end()); + + fused_gates.push_back({fgate->parent->kind, fgate->parent->time, + std::move(fgate->qubits), fgate->parent, + std::move(fgate->gates), {}}); + + ++stat.num_fused_gates; + } + } + } + + if (orphaned_gates.size() > 0) { + FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); + } + } + + static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat, + std::vector& orphaned_gates, + std::vector& fused_gates) { + for (std::size_t i = 0; i < 
orphaned_gates.size(); ++i) { + auto ogate1 = orphaned_gates[i]; + + if (ogate1->visited == kFinal) continue; + + ogate1->visited = kFinal; + + for (std::size_t j = i + 1; j < orphaned_gates.size(); ++j) { + auto ogate2 = orphaned_gates[j]; + + if (ogate2->visited == kFinal) continue; + + unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size(); + + if (cur_size <= max_fused_size) { + ogate2->visited = kFinal; + + for (auto q : ogate2->qubits) { + ogate1->qubits.push_back(q); + ogate1->mask |= uint64_t{1} << q; + } + + for (auto l : ogate2->links) { + ogate1->links.push_back(l); + } + + for (auto gate : ogate2->gates) { + ogate1->gates.push_back(gate); + } + } + + if (cur_size == max_fused_size) { + break; + } + } + + FuseNext(1, *ogate1); + + std::sort(ogate1->qubits.begin(), ogate1->qubits.end()); + + fused_gates.push_back({ogate1->parent->kind, ogate1->parent->time, + std::move(ogate1->qubits), ogate1->parent, + std::move(ogate1->gates), {}}); + + ++stat.num_fused_gates; + } + } + + static void MakeGateSequence( + unsigned max_fused_size, Scratch& scratch, GateF& fgate) { + unsigned level = kSecond + scratch.count; + + FindLongestGateSequence(max_fused_size, level, scratch, fgate); + + auto longest_seq = scratch.longest_seq; + + if (longest_seq.size() == 1 && scratch.count == 0) { + fgate.visited = kFirst; + return; + } + + ++scratch.count; + + for (auto p : longest_seq) { + p->gate->visited = kCompress; + + for (auto q : p->qubits) { + fgate.qubits.push_back(q); + fgate.mask |= uint64_t{1} << q; + } + + for (auto l : p->links) { + fgate.links.push_back(l); + } + } + + // Compress links. 
+ for (auto& link : fgate.links) { + while (link->prev != nullptr && link->prev->val->visited == kCompress) { + link = link->prev; + } + + while (link->next != nullptr && link->next->val->visited == kCompress) { + LinkManager::Delete(link->next); + } + } + + for (auto p : longest_seq) { + p->gate->visited = level; + } + + if (longest_seq.size() >= 3) { + AddGatesFromNext(longest_seq[2]->gate->gates, fgate); + } + + if (longest_seq.size() >= 5) { + AddGatesFromNext(longest_seq[4]->gate->gates, fgate); + } + + if (longest_seq.size() >= 2) { + // May call MakeGateSequence recursively. + AddGatesFromPrev(max_fused_size, *longest_seq[1]->gate, scratch, fgate); + } + + if (longest_seq.size() >= 4) { + // May call MakeGateSequence recursively. + AddGatesFromPrev(max_fused_size, *longest_seq[3]->gate, scratch, fgate); + } + + for (auto p : longest_seq) { + p->gate->visited = kFinal; + } + + FuseNext(1, fgate); + + scratch.gates.push_back(&fgate); + } + + static void AddGatesFromNext(std::vector& gates, GateF& fgate) { + for (auto gate : gates) { + fgate.gates.push_back(gate); + } + } + + static void AddGatesFromPrev(unsigned max_fused_size, const GateF& pfgate, + Scratch& scratch, GateF& fgate) { + for (auto gate : pfgate.gates) { + fgate.gates.push_back(gate); + } + + for (auto link : pfgate.links) { + if (link->prev == nullptr) continue; + + auto pgate = link->prev->val; + + if (pgate->visited == kFirst) { + MakeGateSequence(max_fused_size, scratch, *pgate); + } + } + } + + static void FindLongestGateSequence( + unsigned max_fused_size, unsigned level, Scratch& scratch, GateF& fgate) { + scratch.data.push_back({&fgate, {}, {}}); + + scratch.longest_seq.resize(0); + scratch.longest_seq.push_back(&scratch.data.back()); + + scratch.stack.resize(0); + scratch.stack.push_back(&scratch.data.back()); + + unsigned cur_size = fgate.qubits.size(); + fgate.visited = level; + + unsigned max_size = cur_size; + + GetNextAvailableGates(max_fused_size, cur_size, fgate, nullptr, + 
scratch.data, scratch.next1); + + for (auto n1 : scratch.next1) { + unsigned cur_size2 = cur_size + n1->qubits.size(); + if (cur_size2 > max_fused_size) continue; + + bool feasible = GetPrevAvailableGates(max_fused_size, cur_size, + level, *n1->gate, nullptr, + scratch.data, scratch.prev1); + + if (!feasible) continue; + + if (scratch.prev1.size() == 0 && max_fused_size > 3) continue; + + if (cur_size2 == max_fused_size) { + std::swap(scratch.longest_seq, scratch.stack); + scratch.longest_seq.push_back(n1); + return; + } + + Push(level, cur_size2, cur_size, max_size, scratch, n1); + + for (auto p1 : scratch.prev1) { + unsigned cur_size2 = cur_size + p1->qubits.size(); + + if (cur_size2 > max_fused_size) { + continue; + } else if (cur_size2 == max_fused_size) { + std::swap(scratch.longest_seq, scratch.stack); + scratch.longest_seq.push_back(p1); + return; + } + + Push(level, cur_size2, cur_size, max_size, scratch, p1); + + GetNextAvailableGates(max_fused_size, cur_size, *p1->gate, &fgate, + scratch.data, scratch.next2); + + for (auto n2 : scratch.next2) { + unsigned cur_size2 = cur_size + n2->qubits.size(); + if (cur_size2 > max_fused_size) continue; + + bool feasible = GetPrevAvailableGates(max_fused_size, cur_size, + level, *n2->gate, n1->gate, + scratch.data, scratch.prev2); + + if (!feasible) continue; + + if (cur_size2 == max_fused_size) { + std::swap(scratch.longest_seq, scratch.stack); + scratch.longest_seq.push_back(n2); + return; + } + + Push(level, cur_size2, cur_size, max_size, scratch, n2); + + for (auto p2 : scratch.prev2) { + unsigned cur_size2 = cur_size + p2->qubits.size(); + + if (cur_size2 > max_fused_size) { + continue; + } else if (cur_size2 == max_fused_size) { + std::swap(scratch.longest_seq, scratch.stack); + scratch.longest_seq.push_back(p2); + return; + } + + if (cur_size2 > max_size) { + scratch.stack.push_back(p2); + scratch.longest_seq = scratch.stack; + scratch.stack.pop_back(); + max_size = cur_size2; + } + } + + Pop(cur_size, scratch, 
n2); + } + + Pop(cur_size, scratch, p1); + } + + Pop(cur_size, scratch, n1); + } + } + + static void Push(unsigned level, unsigned cur_size2, unsigned& cur_size, + unsigned& max_size, Scratch& scratch, GateA* agate) { + agate->gate->visited = level; + cur_size = cur_size2; + scratch.stack.push_back(agate); + + if (cur_size > max_size) { + scratch.longest_seq = scratch.stack; + max_size = cur_size; + } + } + + static void Pop(unsigned& cur_size, Scratch& scratch, GateA* agate) { + agate->gate->visited = kFirst; + cur_size -= agate->qubits.size(); + scratch.stack.pop_back(); + } + + static void GetNextAvailableGates(unsigned max_fused_size, unsigned cur_size, + const GateF& pgate1, const GateF* pgate2, + std::vector& scratch, + std::vector& next_gates) { + next_gates.resize(0); + + for (auto link : pgate1.links) { + if (link->next == nullptr) continue; + + auto ngate = link->next->val; + + if (ngate->visited > kFirst || ngate->parent->unfusible) continue; + + GateA next = {ngate, {}, {}}; + next.qubits.reserve(8); + next.links.reserve(8); + + GetAddedQubits(pgate1, pgate2, *ngate, next); + + if (cur_size + next.qubits.size() > max_fused_size) continue; + + scratch.push_back(std::move(next)); + next_gates.push_back(&scratch.back()); + } + } + + static bool GetPrevAvailableGates(unsigned max_fused_size, + unsigned cur_size, unsigned level, + const GateF& ngate1, const GateF* ngate2, + std::vector& scratch, + std::vector& prev_gates) { + prev_gates.resize(0); + + for (auto link : ngate1.links) { + if (link->prev == nullptr) continue; + + auto pgate = link->prev->val; + + if (pgate->visited == kFinal || pgate->visited == level) continue; + + if (pgate->visited > kFirst || pgate->parent->unfusible) { + prev_gates.resize(0); + return false; + } + + GateA prev = {pgate, {}, {}}; + prev.qubits.reserve(8); + prev.links.reserve(8); + + GetAddedQubits(ngate1, ngate2, *pgate, prev); + + bool all_prev_visited = true; + + for (auto link : pgate->links) { + if (link->prev == 
nullptr) continue; + + if (link->prev->val->visited <= kMeaCnt) { + all_prev_visited = false; + break; + } + } + + if (!all_prev_visited) { + prev_gates.resize(0); + return false; + } + + if (cur_size + prev.qubits.size() > max_fused_size) continue; + + if (all_prev_visited) { + scratch.push_back(std::move(prev)); + prev_gates.push_back(&scratch.back()); + } + } + + return true; + } + + static void GetAddedQubits(const GateF& fgate0, const GateF* fgate1, + const GateF& fgate2, GateA& added) { + for (std::size_t i = 0; i < fgate2.qubits.size(); ++i) { + unsigned q2 = fgate2.qubits[i]; + + if (std::find(fgate0.qubits.begin(), fgate0.qubits.end(), q2) + != fgate0.qubits.end()) continue; + + if (fgate1 != nullptr + && std::find(fgate1->qubits.begin(), fgate1->qubits.end(), q2) + != fgate1->qubits.end()) continue; + + added.qubits.push_back(q2); + added.links.push_back(fgate2.links[i]); + } + } + + // Fuse smaller gates with fgate back in gate time. + static void FusePrev(unsigned pass, GateF& fgate) { + std::vector gates; + gates.reserve(fgate.gates.capacity()); + + auto neighbor = [](const Link* link) -> const Link* { + return link->prev; + }; + + FusePrevOrNext>(pass, neighbor, fgate, gates); + + for (auto it = gates.rbegin(); it != gates.rend(); ++it) { + fgate.gates.push_back(*it); + } + } + + // Fuse smaller gates with fgate forward in gate time. 
+ static void FuseNext(unsigned pass, GateF& fgate) { + auto neighbor = [](const Link* link) -> const Link* { + return link->next; + }; + + FusePrevOrNext>(pass, neighbor, fgate, fgate.gates); + } + + template + static void FusePrevOrNext(unsigned pass, Neighbor neighb, GateF& fgate, + std::vector& gates) { + uint64_t bad_mask = 0; + auto links = fgate.links; + + bool may_have_gates_to_fuse = true; + + while (may_have_gates_to_fuse) { + may_have_gates_to_fuse = false; + + std::sort(links.begin(), links.end(), + [&neighb](const Link* l, const Link* r) -> bool { + auto ln = neighb(l); + auto rn = neighb(r); + + if (ln != nullptr && rn != nullptr) { + return R()(ln->val->parent->time, rn->val->parent->time); + } else { + // nullptrs are larger than everything else and + // equivalent among each other. + return ln != nullptr; + } + }); + + for (auto link : links) { + auto n = neighb(link); + + if (n == nullptr) continue; + + auto g = n->val; + + if (!QubitsAreIn(fgate.mask, g->mask) || (g->mask & bad_mask) != 0 + || g->visited > pass || g->parent->unfusible) { + bad_mask |= g->mask; + } else { + g->visited = pass == 0 ? 
kFirst : kFinal; + + if (pass == 0) { + gates.push_back(g->parent); + } else { + for (auto gate : g->gates) { + gates.push_back(gate); + } + } + + for (auto link : g->links) { + LinkManager::Delete(link); + } + + may_have_gates_to_fuse = true; + break; + } + } + } + } + + static bool QubitsAreIn(uint64_t mask0, uint64_t mask) { + return ((mask0 | mask) ^ mask0) == 0; + } + + static void PrintStat(unsigned verbosity, const Stat& stat, + const std::vector& fused_gates) { + if (verbosity < 3) return; + + if (stat.num_controlled_gates > 0) { + IO::messagef("%lu controlled gates\n", stat.num_controlled_gates); + } + + if (stat.num_mea_gates > 0) { + IO::messagef("%lu measurement gates", stat.num_mea_gates); + if (stat.num_fused_mea_gates == stat.num_mea_gates) { + IO::messagef("\n"); + } else { + IO::messagef(" are fused into %lu gates\n", stat.num_fused_mea_gates); + } + } + + bool first = true; + for (unsigned i = 1; i < stat.num_gates.size(); ++i) { + if (stat.num_gates[i] > 0) { + if (first) { + first = false; + } else { + IO::messagef(", "); + } + IO::messagef("%u %u-qubit", stat.num_gates[i], i); + } + } + + IO::messagef(" gates are fused into %lu gates\n", stat.num_fused_gates); + + if (verbosity < 5) return; + + IO::messagef("fused gate qubits:\n"); + for (const auto& g : fused_gates) { + IO::messagef("%6u ", g.parent->time); + if (g.parent->kind == gate::kMeasurement) { + IO::messagef("m"); + } else if (g.parent->controlled_by.size() > 0) { + IO::messagef("c"); + for (auto q : g.parent->controlled_by) { + IO::messagef("%3u", q); + } + IO::messagef(" t"); + } else { + IO::messagef(" "); + } + + for (auto q : g.qubits) { + IO::messagef("%3u", q); + } + IO::messagef("\n"); + } + } + + template + static bool ValidateGate(const Gate2& gate, unsigned max_qubit1, + const GatesLat& gates_lat) { + for (unsigned q : gate.qubits) { + if (q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return 
false; + } + if (gates_lat[q] != nullptr + && gate.time <= gates_lat[q]->val->parent->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; + } + } + + for (unsigned q : gate.controlled_by) { + if (q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return false; + } + if (gates_lat[q] != nullptr + && gate.time <= gates_lat[q]->val->parent->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; + } + } + + return true; + } +}; + +} // namespace qsim + +#endif // FUSER_MQUBIT_H_ diff --git a/qsim/gate.h b/qsim/gate.h new file mode 100644 index 0000000..a457acb --- /dev/null +++ b/qsim/gate.h @@ -0,0 +1,216 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef GATE_H_ +#define GATE_H_ + +#include +#include +#include +#include + +#include "matrix.h" + +namespace qsim { + +namespace detail { + +template +inline void SortQubits(Gate& gate) { + for (std::size_t i = 1; i < gate.qubits.size(); ++i) { + if (gate.qubits[i - 1] > gate.qubits[i]) { + if (!GateDef::symmetric) { + auto perm = NormalToGateOrderPermutation(gate.qubits); + MatrixShuffle(perm, gate.qubits.size(), gate.matrix); + } + + gate.swapped = true; + std::sort(gate.qubits.begin(), gate.qubits.end()); + break; + } + } +} + +} // namespace detail + +template , typename Gate> +inline Gate& MakeControlledGate(Qubits&& controlled_by, Gate& gate) { + gate.controlled_by = std::forward(controlled_by); + gate.cmask = (uint64_t{1} << gate.controlled_by.size()) - 1; + + std::sort(gate.controlled_by.begin(), gate.controlled_by.end()); + + return gate; +} + +template , typename Gate> +inline Gate& MakeControlledGate(Qubits&& controlled_by, + const std::vector& control_values, + Gate& gate) { + // Assume controlled_by.size() == control_values.size(). + + bool sorted = true; + + for (std::size_t i = 1; i < controlled_by.size(); ++i) { + if (controlled_by[i - 1] > controlled_by[i]) { + sorted = false; + break; + } + } + + if (sorted) { + gate.controlled_by = std::forward(controlled_by); + gate.cmask = 0; + + for (std::size_t i = 0; i < control_values.size(); ++i) { + gate.cmask |= (control_values[i] & 1) << i; + } + } else { + struct ControlPair { + unsigned q; + unsigned v; + }; + + std::vector cpairs; + cpairs.reserve(controlled_by.size()); + + for (std::size_t i = 0; i < controlled_by.size(); ++i) { + cpairs.push_back({controlled_by[i], control_values[i]}); + } + + // Sort control qubits and control values. 
+ std::sort(cpairs.begin(), cpairs.end(), + [](const ControlPair& l, const ControlPair& r) -> bool { + return l.q < r.q; + }); + + gate.cmask = 0; + gate.controlled_by.reserve(controlled_by.size()); + + for (std::size_t i = 0; i < cpairs.size(); ++i) { + gate.cmask |= (cpairs[i].v & 1) << i; + gate.controlled_by.push_back(cpairs[i].q); + } + } + + return gate; +} + +namespace gate { + +constexpr int kDecomp = 100001; // gate from Schmidt decomposition +constexpr int kMeasurement = 100002; // measurement gate + +} // namespace gate + +enum GateAnyKind { + kGateAny = -1, +}; + +/** + * A generic gate to make it easier to use qsim with external gate sets. + */ +template +struct Gate { + using fp_type = FP; + using GateKind = GK; + + GateKind kind; + unsigned time; + std::vector qubits; + std::vector controlled_by; + uint64_t cmask; + std::vector params; + Matrix matrix; + bool unfusible; // If true, the gate is fused as a parent. + bool swapped; // If true, the gate qubits are swapped to make qubits + // ordered in ascending order. This does not apply to + // control qubits of explicitly-controlled gates. 
+ + template > + Gate&& ControlledBy(Qubits&& controlled_by) { + MakeControlledGate(std::forward(controlled_by), *this); + return std::move(*this); + } + + template > + Gate&& ControlledBy(Qubits&& controlled_by, + const std::vector& control_values) { + MakeControlledGate( + std::forward(controlled_by), control_values, *this); + return std::move(*this); + } +}; + +template , + typename M = Matrix> +inline Gate CreateGate(unsigned time, Qubits&& qubits, M&& matrix = {}, + std::vector&& params = {}) { + Gate gate = {GateDef::kind, time, std::forward(qubits), {}, 0, + std::move(params), std::forward(matrix), false, false}; + + if (GateDef::kind != gate::kMeasurement) { + switch (gate.qubits.size()) { + case 1: + break; + case 2: + if (gate.qubits[0] > gate.qubits[1]) { + gate.swapped = true; + std::swap(gate.qubits[0], gate.qubits[1]); + if (!GateDef::symmetric) { + MatrixShuffle({1, 0}, 2, gate.matrix); + } + } + break; + default: + detail::SortQubits(gate); + } + } + + return gate; +} + +namespace gate { + +/** + * A gate that simulates measurement of one or more qubits, collapsing the + * state vector and storing the measured results. + */ +template +struct Measurement { + using GateKind = typename Gate::GateKind; + + static constexpr GateKind kind = GateKind::kMeasurement; + static constexpr char name[] = "m"; + static constexpr bool symmetric = false; + + template > + static Gate Create(unsigned time, Qubits&& qubits) { + return CreateGate(time, std::forward(qubits)); + } +}; + +} // namespace gate + +template +using schmidt_decomp_type = std::vector>>; + +template +schmidt_decomp_type GetSchmidtDecomp( + GateKind kind, const std::vector& params); + +} // namespace qsim + +#endif // GATE_H_ diff --git a/qsim/gate_appl.h b/qsim/gate_appl.h new file mode 100644 index 0000000..8601e6f --- /dev/null +++ b/qsim/gate_appl.h @@ -0,0 +1,231 @@ +// Copyright 2019 Google LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GATE_APPL_H_ +#define GATE_APPL_H_ + +#include +#include + +#include "fuser.h" +#include "gate.h" +#include "matrix.h" + +namespace qsim { + +/** + * Applies the given gate to the simulator state. Ignores measurement gates. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param state The state of the system, to be updated by this method. + */ +template +inline void ApplyGate(const Simulator& simulator, const Gate& gate, + typename Simulator::State& state) { + if (gate.kind != gate::kMeasurement) { + if (gate.controlled_by.size() == 0) { + simulator.ApplyGate(gate.qubits, gate.matrix.data(), state); + } else { + simulator.ApplyControlledGate(gate.qubits, gate.controlled_by, + gate.cmask, gate.matrix.data(), state); + } + } +} + +/** + * Applies the given gate dagger to the simulator state. If the gate matrix is + * unitary then this is equivalent to applying the inverse gate. Ignores + * measurement gates. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ +template +inline void ApplyGateDagger(const Simulator& simulator, const Gate& gate, + typename Simulator::State& state) { + if (gate.kind != gate::kMeasurement) { + auto matrix = gate.matrix; + MatrixDagger(unsigned{1} << gate.qubits.size(), matrix); + + if (gate.controlled_by.size() == 0) { + simulator.ApplyGate(gate.qubits, matrix.data(), state); + } else { + simulator.ApplyControlledGate(gate.qubits, gate.controlled_by, + gate.cmask, matrix.data(), state); + } + } +} + +/** + * Applies the given gate to the simulator state. + * @param state_space StateSpace object required to perform measurements. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param rgen Random number generator to perform measurements. + * @param state The state of the system, to be updated by this method. + * @param mresults As an input parameter, this can be empty or this can + * contain the results of the previous measurements. If gate is a measurement + * gate then after a successful run, the measurement result will be added to + * this. + * @return True if the measurement performed successfully; false otherwise. + */ +template +inline bool ApplyGate( + const typename Simulator::StateSpace& state_space, + const Simulator& simulator, const Gate& gate, Rgen& rgen, + typename Simulator::State& state, + std::vector& mresults) { + if (gate.kind == gate::kMeasurement) { + auto measure_result = state_space.Measure(gate.qubits, rgen, state); + if (measure_result.valid) { + mresults.push_back(std::move(measure_result)); + } else { + return false; + } + } else { + ApplyGate(simulator, gate, state); + } + + return true; +} + +/** + * Applies the given gate to the simulator state, discarding measurement + * results. + * @param state_space StateSpace object required to perform measurements. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. 
+ * @param gate The gate to be applied. + * @param rgen Random number generator to perform measurements. + * @param state The state of the system, to be updated by this method. + * @return True if the measurement performed successfully; false otherwise. + */ +template +inline bool ApplyGate(const typename Simulator::StateSpace& state_space, + const Simulator& simulator, const Gate& gate, Rgen& rgen, + typename Simulator::State& state) { + using MeasurementResult = typename Simulator::StateSpace::MeasurementResult; + std::vector discarded_results; + return + ApplyGate(state_space, simulator, gate, rgen, state, discarded_results); +} + +/** + * Applies the given fused gate to the simulator state. Ignores measurement + * gates. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param state The state of the system, to be updated by this method. + */ +template +inline void ApplyFusedGate(const Simulator& simulator, const Gate& gate, + typename Simulator::State& state) { + if (gate.kind != gate::kMeasurement) { + if (gate.parent->controlled_by.size() == 0) { + simulator.ApplyGate(gate.qubits, gate.matrix.data(), state); + } else { + simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by, + gate.parent->cmask, gate.matrix.data(), + state); + } + } +} + +/** + * Applies the given fused gate dagger to the simulator state. If the gate + * matrix is unitary then this is equivalent to applying the inverse gate. + * Ignores measurement gates. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ +template +inline void ApplyFusedGateDagger(const Simulator& simulator, const Gate& gate, + typename Simulator::State& state) { + if (gate.kind != gate::kMeasurement) { + auto matrix = gate.matrix; + MatrixDagger(unsigned{1} << gate.qubits.size(), matrix); + + if (gate.parent->controlled_by.size() == 0) { + simulator.ApplyGate(gate.qubits, matrix.data(), state); + } else { + simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by, + gate.parent->cmask, matrix.data(), state); + } + } +} + +/** + * Applies the given fused gate to the simulator state. + * @param state_space StateSpace object required to perform measurements. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param rgen Random number generator to perform measurements. + * @param state The state of the system, to be updated by this method. + * @param mresults As an input parameter, this can be empty or this can + * contain the results of the previous measurements. If gate is a measurement + * gate then after a successful run, the measurement result will be added to + * this. + * @return True if the measurement performed successfully; false otherwise. + */ +template +inline bool ApplyFusedGate( + const typename Simulator::StateSpace& state_space, + const Simulator& simulator, const Gate& gate, Rgen& rgen, + typename Simulator::State& state, + std::vector& mresults) { + if (gate.kind == gate::kMeasurement) { + auto measure_result = state_space.Measure(gate.qubits, rgen, state); + if (measure_result.valid) { + mresults.push_back(std::move(measure_result)); + } else { + return false; + } + } else { + ApplyFusedGate(simulator, gate, state); + } + + return true; +} + +/** + * Applies the given fused gate to the simulator state, discarding measurement + * results. + * @param state_space StateSpace object required to perform measurements. + * @param simulator Simulator object. 
Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param rgen Random number generator to perform measurements. + * @param state The state of the system, to be updated by this method. + * @return True if the measurement performed successfully; false otherwise. + */ +template +inline bool ApplyFusedGate(const typename Simulator::StateSpace& state_space, + const Simulator& simulator, const Gate& gate, + Rgen& rgen, typename Simulator::State& state) { + using MeasurementResult = typename Simulator::StateSpace::MeasurementResult; + std::vector discarded_results; + return ApplyFusedGate( + state_space, simulator, gate, rgen, state, discarded_results); +} + +} // namespace qsim + +#endif // GATE_APPL_H_ diff --git a/qsim/gates_cirq.h b/qsim/gates_cirq.h new file mode 100644 index 0000000..d767959 --- /dev/null +++ b/qsim/gates_cirq.h @@ -0,0 +1,1640 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GATES_CIRQ_H_ +#define GATES_CIRQ_H_ + +#include +#include +#include +#include + +#include "gate.h" +#include "matrix.h" + +namespace qsim { + +namespace Cirq { + +enum GateKind { + kI1 = 0, // One-qubit identity gate. + kI2, // Two-qubit identity gate. + kI, // Multi-qubit identity gate. 
+ kXPowGate, + kYPowGate, + kZPowGate, + kHPowGate, + kCZPowGate, + kCXPowGate, + krx, + kry, + krz, + kH, + kS, + kCZ, + kCX, + kT, + kX, + kY, + kZ, + kPhasedXPowGate, + kPhasedXZGate, + kXXPowGate, + kYYPowGate, + kZZPowGate, + kXX, + kYY, + kZZ, + kSwapPowGate, + kISwapPowGate, + kriswap, + kSWAP, + kISWAP, + kPhasedISwapPowGate, + kgivens, + kFSimGate, + kTwoQubitDiagonalGate, + kThreeQubitDiagonalGate, + kCCZPowGate, + kCCXPowGate, + kCSwapGate, + kCCZ, + kCCX, + kMatrixGate1, // One-qubit matrix gate. + kMatrixGate2, // Two-qubit matrix gate. + kMatrixGate, // Multi-qubit matrix gate. + kGlobalPhaseGate, + kDecomp = gate::kDecomp, + kMeasurement = gate::kMeasurement, +}; + +template +using GateCirq = Gate; + +constexpr double h_double = 0.5; +constexpr double pi_double = 3.14159265358979323846264338327950288; +constexpr double is2_double = 0.7071067811865475; + +// Gates from cirq/ops/global_phase_op.py: + +/** + * The global phase gate. + */ +template +struct GlobalPhaseGate { + static constexpr GateKind kind = kGlobalPhaseGate; + static constexpr char name[] = "GlobalPhaseGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, fp_type phi) { + return Create(time, std::cos(phi), std::sin(phi)); + } + + static GateCirq Create(unsigned time, fp_type cp, fp_type sp) { + return CreateGate, GlobalPhaseGate>( + time, {}, {cp, sp}, {cp, sp}); + } +}; + +template +using global_phase_operation = GlobalPhaseGate; + +// Gates from cirq/ops/identity.py: + +/** + * A one-qubit identity gate. + */ +template +struct I1 { + static constexpr GateKind kind = kI1; + static constexpr char name[] = "I1"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, I1>( + time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0}); + } +}; + +/** + * A two-qubit identity gate. 
+ */ +template +struct I2 { + static constexpr GateKind kind = kI2; + static constexpr char name[] = "I2"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, I2>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, + }; + } +}; + +/** + * A multi-qubit identity gate. + */ +template +struct I { + static constexpr GateKind kind = kI; + static constexpr char name[] = "I"; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, + const std::vector& qubits) { + Matrix matrix; + MatrixIdentity(1 << qubits.size(), matrix); + return CreateGate, I>(time, qubits, std::move(matrix)); + } +}; + +// Gates form cirq/ops/common_gates.py: + +/** + * A gate that rotates around the X axis of the Bloch sphere. + * This is a generalization of the X gate. + */ +template +struct XPowGate { + static constexpr GateKind kind = kXPowGate; + static constexpr char name[] = "XPowGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); + + return CreateGate, XPowGate>( + time, {q0}, {c * gc, c * gs, s * gs, -s * gc, + s * gs, -s * gc, c * gc, c * gs}, + {exponent, global_shift}); + } +}; + +/** + * A gate that rotates around the Y axis of the Bloch sphere. + * This is a generalization of the Y gate. 
+ */ +template +struct YPowGate { + static constexpr GateKind kind = kYPowGate; + static constexpr char name[] = "YPowGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); + + return CreateGate, YPowGate>( + time, {q0}, {c * gc, c * gs, -s * gc, -s * gs, + s * gc, s * gs, c * gc, c * gs}, {exponent, global_shift}); + } +}; + +/** + * A gate that rotates around the Z axis of the Bloch sphere. + * This is a generalization of the Z gate. + */ +template +struct ZPowGate { + static constexpr GateKind kind = kZPowGate; + static constexpr char name[] = "ZPowGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + + return CreateGate, ZPowGate>( + time, {q0}, {gc, gs, 0, 0, 0, 0, c * gc - s * gs, c * gs + s * gc}, + {exponent, global_shift}); + } +}; + +/** + * A gate that rotates around the X+Z axis of the Bloch sphere. + * This is a generalization of the Hadamard gate. 
+ */ +template +struct HPowGate { + static constexpr GateKind kind = kHPowGate; + static constexpr char name[] = "HPowGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); + + fp_type a = s * gs * is2; + fp_type b = s * gc * is2; + + return CreateGate, HPowGate>( + time, {q0}, {c * gc + a, c * gs - b, a, -b, + a, -b, c * gc - a, c * gs + b}, {exponent, global_shift}); + } +}; + +/** + * A gate that applies a phase to the |11⟩ state of two qubits. + * This is a generalization of the CZ gate. + */ +template +struct CZPowGate { + static constexpr GateKind kind = kCZPowGate; + static constexpr char name[] = "CZPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type ec = std::cos(pi * exponent * (1 + global_shift)); + fp_type es = std::sin(pi * exponent * (1 + global_shift)); + + return CreateGate, CZPowGate>( + time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, gc, gs, 0, 0, 0, 0, + 0, 0, 0, 0, gc, gs, 0, 0, + 0, 0, 0, 0, 0, 0, ec, es}, {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type ec 
= std::cos(pi * exponent * (1 + global_shift)); + fp_type es = std::sin(pi * exponent * (1 + global_shift)); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {gc, gs, 0, 0, 0, 0, ec, es}}, + }; + } +}; + +/** + * A gate that applies a controlled power of an X gate. + * This is a generalization of the CX (or CNOT) gate. + */ +template +struct CXPowGate { + static constexpr GateKind kind = kCXPowGate; + static constexpr char name[] = "CXPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); + + // Matrix is in this form because the simulator uses inverse qubit order. 
+ return CreateGate, CXPowGate>( + time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, c * ec, c * es, 0, 0, s * es, -s * ec, + 0, 0, 0, 0, gc, gs, 0, 0, + 0, 0, s * es, -s * ec, 0, 0, c * ec, c * es}, + {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {c * ec, c * es, s * es, -s * ec, + s * es, -s * ec, c * ec, c * es}}, + }; + } +}; + +/** + * The `(exponent = phi/pi, global_shift = -0.5)` instance of XPowGate. + * This is a generalization of the X gate with a fixed global phase. + * This is a function in Cirq. + */ +template +struct rx { + static constexpr GateKind kind = krx; + static constexpr char name[] = "rx"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { + fp_type c = std::cos(-0.5 * phi); + fp_type s = std::sin(-0.5 * phi); + + return CreateGate, rx>( + time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi}); + } +}; + +/** + * The `(exponent = phi/pi, global_shift = -0.5)` instance of YPowGate. + * This is a generalization of the Y gate with a fixed global phase. + * This is a function in Cirq. 
+ */ +template +struct ry { + static constexpr GateKind kind = kry; + static constexpr char name[] = "ry"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { + fp_type c = std::cos(-0.5 * phi); + fp_type s = std::sin(-0.5 * phi); + + return CreateGate, ry>( + time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi}); + } +}; + +/** + * The `(exponent = phi/pi, global_shift = -0.5)` instance of ZPowGate. + * This is a generalization of the Z gate with a fixed global phase. + * This is a function in Cirq. + */ +template +struct rz { + static constexpr GateKind kind = krz; + static constexpr char name[] = "rz"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { + fp_type c = std::cos(-0.5 * phi); + fp_type s = std::sin(-0.5 * phi); + + return CreateGate, rz>( + time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of HPowGate. + * This is the canonical Hadamard (or H) gate. + */ +template +struct H { + static constexpr GateKind kind = kH; + static constexpr char name[] = "H"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, H>( + time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0}); + } +}; + +/** + * The `(exponent = 0.5, global_shift = 0)` instance of ZPowGate. + * This is the canonical S gate. 
+ */ +template +struct S { + static constexpr GateKind kind = kS; + static constexpr char name[] = "S"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, S>( + time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1}); + } +}; + +/** + * The `(exponent = 0.25, global_shift = 0)` instance of ZPowGate. + * This is the canonical T gate. + */ +template +struct T { + static constexpr GateKind kind = kT; + static constexpr char name[] = "T"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, T>( + time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of CZPowGate. + * This is the canonical CZ gate. + */ +template +struct CZ { + static constexpr GateKind kind = kCZ; + static constexpr char name[] = "CZ"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, CZ>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, -1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}}, + }; + } +}; + +template +using CNotPowGate = CXPowGate; + +/** + * The `(exponent = 1, global_shift = 0)` instance of CXPowGate. + * This is the canonical CX (or CNOT) gate. 
+ */ +template +struct CX { + static constexpr GateKind kind = kCX; + static constexpr char name[] = "kCX"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + // Matrix is in this form because the simulator uses inverse qubit order. + return CreateGate, CX>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}}, + }; + } +}; + +template +using CNOT = CX; + +// Gates from cirq/ops/pauli_gates.py: + +/** + * The `(exponent = 1, global_shift = 0)` instance of XPowGate. + * This is the canonical Pauli X gate. + */ +template +struct X : public XPowGate { + static constexpr GateKind kind = kX; + static constexpr char name[] = "X"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, X>( + time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of YPowGate. + * This is the canonical Pauli Y gate. + */ +template +struct Y : public YPowGate { + static constexpr GateKind kind = kY; + static constexpr char name[] = "Y"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, Y>( + time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of ZPowGate. + * This is the canonical Pauli Z gate. 
+ */ +template +struct Z : public ZPowGate { + static constexpr GateKind kind = kZ; + static constexpr char name[] = "Z"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, Z>( + time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0}); + } +}; + +// Gates from cirq/ops/phased_x_gate.py: + +/** + * An XPowGate conjugated by ZPowGate%s. + * Equivalent to the circuit `───Z^-p───X^t───Z^p───`. + */ +template +struct PhasedXPowGate { + static constexpr GateKind kind = kPhasedXPowGate; + static constexpr char name[] = "PhasedXPowGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type phase_exponent, fp_type exponent = 1, + fp_type global_shift = 0) { + fp_type pc = std::cos(pi * phase_exponent); + fp_type ps = std::sin(pi * phase_exponent); + fp_type ec = std::cos(pi * exponent); + fp_type es = std::sin(pi * exponent); + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + + fp_type ar = 0.5 * ((1 + ec) * gc - es * gs); + fp_type ai = 0.5 * ((1 + ec) * gs + es * gc); + fp_type br = -0.5 * ((-1 + ec) * gc - es * gs); + fp_type bi = -0.5 * ((-1 + ec) * gs + es * gc); + + return CreateGate, PhasedXPowGate>( + time, {q0}, {ar, ai, pc * br + ps * bi, pc * bi - ps * br, + pc * br - ps * bi, pc * bi + ps * br, ar, ai}, + {phase_exponent, exponent, global_shift}); + } +}; + +// Gates from cirq/ops/phased_x_z_gate.py: + +/** + * A PhasedXPowGate followed by a ZPowGate. + * Equivalent to the circuit `───Z^(-a)──X^x──Z^a───Z^z───`. 
+ */ +template +struct PhasedXZGate { + static constexpr GateKind kind = kPhasedXZGate; + static constexpr char name[] = "PhasedXZGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type x_exponent, fp_type z_exponent, + fp_type axis_phase_exponent) { + fp_type xc = std::cos(pi * x_exponent); + fp_type xs = std::sin(pi * x_exponent); + fp_type zc = std::cos(pi * z_exponent); + fp_type zs = std::sin(pi * z_exponent); + fp_type ac = std::cos(pi * axis_phase_exponent); + fp_type as = std::sin(pi * axis_phase_exponent); + + fp_type br = 0.5 * (1 + xc); + fp_type bi = 0.5 * xs; + fp_type cr = -0.5 * (-1 + xc); + fp_type ci = -0.5 * xs; + fp_type dr = ac * zc - as * zs; + fp_type di = ac * zs + as * zc; + + return CreateGate, PhasedXZGate>( + time, {q0}, {br, bi, ac * cr + as * ci, ac * ci - as * cr, + dr * cr - di * ci, dr * ci + di * cr, + zc * br - zs * bi, zc * bi + zs * br}, + {x_exponent, z_exponent, axis_phase_exponent}); + } +}; + +// Gates from cirq/ops/parity_gates.py: + +/** + * The tensor product of two X gates, possibly raised to an exponent. 
+ */ +template +struct XXPowGate { + static constexpr GateKind kind = kXXPowGate; + static constexpr char name[] = "XXPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type ic = 0.5 * ((1 + c) * gc - s * gs); + fp_type is = 0.5 * ((1 + c) * gs + s * gc); + fp_type xc = 0.5 * ((1 - c) * gc + s * gs); + fp_type xs = 0.5 * ((1 - c) * gs - s * gc); + + return CreateGate, XXPowGate>( + time, {q0, q1}, {ic, is, 0, 0, 0, 0, xc, xs, + 0, 0, ic, is, xc, xs, 0, 0, + 0, 0, xc, xs, ic, is, 0, 0, + xc, xs, 0, 0, 0, 0, ic, is}, {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type ic = 0.5 * ((1 + c) * gc - s * gs); + fp_type is = 0.5 * ((1 + c) * gs + s * gc); + fp_type xc = 0.5 * ((1 - c) * gc + s * gs); + fp_type xs = 0.5 * ((1 - c) * gs - s * gc); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, + {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, xc, xs, xc, xs, 0, 0}}, + }; + } +}; + +/** + * The tensor product of two Y gates, possibly raised to an exponent. 
+ */ +template +struct YYPowGate { + static constexpr GateKind kind = kYYPowGate; + static constexpr char name[] = "YYPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type ic = 0.5 * ((1 + c) * gc - s * gs); + fp_type is = 0.5 * ((1 + c) * gs + s * gc); + fp_type yc = 0.5 * ((1 - c) * gc + s * gs); + fp_type ys = 0.5 * ((1 - c) * gs - s * gc); + + return CreateGate, YYPowGate>( + time, {q0, q1}, {ic, is, 0, 0, 0, 0, -yc, -ys, + 0, 0, ic, is, yc, ys, 0, 0, + 0, 0, yc, ys, ic, is, 0, 0, + -yc, -ys, 0, 0, 0, 0, ic, is}, + {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type ic = 0.5 * ((1 + c) * gc - s * gs); + fp_type is = 0.5 * ((1 + c) * gs + s * gc); + fp_type yc = 0.5 * ((1 - c) * gc + s * gs); + fp_type ys = 0.5 * ((1 - c) * gs - s * gc); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, + {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, ys, -yc, -ys, yc, 0, 0}}, + }; + } +}; + +/** + * The tensor product of two Z gates, possibly raised to an exponent. 
+ */ +template +struct ZZPowGate { + static constexpr GateKind kind = kZZPowGate; + static constexpr char name[] = "ZZPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type zc = std::cos(pi * exponent * (1 + global_shift)); + fp_type zs = std::sin(pi * exponent * (1 + global_shift)); + + return CreateGate, ZZPowGate>( + time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, zc, zs, 0, 0, 0, 0, + 0, 0, 0, 0, zc, zs, 0, 0, + 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type ic = 0.5 * ((1 + c) * gc - s * gs); + fp_type is = 0.5 * ((1 + c) * gs + s * gc); + fp_type zc = 0.5 * ((1 - c) * gc + s * gs); + fp_type zs = 0.5 * ((1 - c) * gs - s * gc); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, + {{1, 0, 0, 0, 0, 0, -1, 0}, {zc, zs, 0, 0, 0, 0, -zc, -zs}}, + }; + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of XXPowGate. + * This is the tensor product of two X gates. 
+ */ +template +struct XX { + static constexpr GateKind kind = kXX; + static constexpr char name[] = "XX"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, XX>( + time, {q0, q1}, {0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, 1, 0, 1, 0, 0, 0}}, + }; + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of YYPowGate. + * This is the tensor product of two Y gates. + */ +template +struct YY { + static constexpr GateKind kind = kYY; + static constexpr char name[] = "YY"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, YY>( + time, {q0, q1}, {0, 0, 0, 0, 0, 0, -1, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + -1, 0, 0, 0, 0, 0, 0, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, 0, -1, 0, 1, 0, 0}}, + }; + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of ZZPowGate. + * This is the tensor product of two Z gates. 
+ */ +template +struct ZZ { + static constexpr GateKind kind = kZZ; + static constexpr char name[] = "ZZ"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, ZZ>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, -1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, -1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, -1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}}, + }; + } +}; + +// Gates from cirq/ops/swap_gates.py: + +/** + * The SWAP gate, possibly raised to a power. Exchanges qubits. + */ +template +struct SwapPowGate { + static constexpr GateKind kind = kSwapPowGate; + static constexpr char name[] = "SwapPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + static constexpr fp_type h = static_cast(h_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); + + return CreateGate, SwapPowGate>( + time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, c * ec, c * es, s * es, -s * ec, 0, 0, + 0, 0, s * es, -s * ec, c * ec, c * es, 0, 0, + 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type ec = 
std::cos(pi * exponent * (0.5 + global_shift)); + fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); + + return schmidt_decomp_type{ + {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * ec, gs + c * es, 0, 0, + 0, 0, gc + c * ec, gs + c * es}}, + {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * es, -s * ec, + s * es, -s * ec, 0, 0}}, + {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, -s * ec, -s * es, + s * ec, s * es, 0, 0}}, + {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * ec, gs - c * es, 0, 0, + 0, 0, -gc + c * ec, -gs + c * es}}, + }; + } +}; + +/** + * Rotates the |01⟩ vs |10⟩ subspace of two qubits around its Bloch X-axis. + * This is a generalization of the ISWAP gate. + */ +template +struct ISwapPowGate { + static constexpr GateKind kind = kISwapPowGate; + static constexpr char name[] = "ISwapPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + static constexpr fp_type h = static_cast(h_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + + return CreateGate, ISwapPowGate>( + time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, c * gc, c * gs, -s * gs, s * gc, 0, 0, + 0, 0, -s * gs, s * gc, c * gc, c * gs, 0, 0, + 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + + return schmidt_decomp_type{ + {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * gc, gs + c * gs, 0, 0, + 0, 0, gc + c * gc, gs + c * gs}}, + {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, -s * 
gs, s * gc, + -s * gs, s * gc, 0, 0}}, + {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * gc, s * gs, + -s * gc, -s * gs, 0, 0}}, + {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * gc, gs - c * gs, 0, 0, + 0, 0, -gc + c * gc, -gs + c * gs}}, + }; + } +}; + +/** + * The `(exponent = 2*phi/pi, global_shift = 0)` instance of ISwapPowGate. + * This is a generalization of the ISWAP gate with a fixed global phase of zero. + * This is a function in Cirq. + */ +template +struct riswap { + static constexpr GateKind kind = kriswap; + static constexpr char name[] = "riswap"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + static constexpr fp_type h = static_cast(h_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type phi) { + fp_type c = std::cos(phi); + fp_type s = std::sin(phi); + + return CreateGate, riswap>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, c, 0, 0, s, 0, 0, + 0, 0, 0, s, c, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}, {phi}); + } + + static schmidt_decomp_type SchmidtDecomp(fp_type phi) { + fp_type c = std::cos(phi); + fp_type s = std::sin(phi); + + return schmidt_decomp_type{ + {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}}, + {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, 0, s, 0, s, 0, 0}}, + {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}}, + {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}}, + }; + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of SwapPowGate. + * This is the canonical SWAP gate. 
+ */ +template +struct SWAP { + static constexpr GateKind kind = kSWAP; + static constexpr char name[] = "SWAP"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, SWAP>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, + {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}}, + {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}}, + {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, + }; + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of ISwapPowGate. + * This is the canonical ISWAP gate. + */ +template +struct ISWAP { + static constexpr GateKind kind = kISWAP; + static constexpr char name[] = "ISWAP"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type h = static_cast(h_double); + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, ISWAP>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, + {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}}, + {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}}, + {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, + }; + } +}; + +// Gates from cirq/ops/phased_iswap_gate.py: + +/** + * An ISwapPowGate conjugated by ZPowGate%s. 
+ * Equivalent to the composition `(Z^-p ⊗ Z^p) ISWAP^t (Z^p ⊗ Z^-p)`. + */ +template +struct PhasedISwapPowGate { + static constexpr GateKind kind = kPhasedISwapPowGate; + static constexpr char name[] = "PhasedISwapPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + static constexpr fp_type h = static_cast(h_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type phase_exponent = 0.25, + fp_type exponent = 1.0) { + fp_type fc = std::cos(2 * pi * phase_exponent); + fp_type fs = std::sin(2 * pi * phase_exponent); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + + // Matrix is in this form because the simulator uses inverse qubit order. + return CreateGate, PhasedISwapPowGate>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, c, 0, s * fs, s * fc, 0, 0, + 0, 0, -s * fs, s * fc, c, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}, {phase_exponent, exponent}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type phase_exponent, fp_type exponent) { + fp_type fc = std::cos(2 * pi * phase_exponent); + fp_type fs = std::sin(2 * pi * phase_exponent); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + + return schmidt_decomp_type{ + {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}}, + {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * fs, s * fc, -s * fs, s * fc, 0, 0}}, + {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * fc, -s * fs, + -s * fc, -s * fs, 0, 0}}, + {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}}, + }; + } +}; + +/** + * The `(phase_exponent = 0.25, exponent = 2*phi/pi)` instance of + * PhasedISwapPowGate. + * This is the "Givens rotation" from numerical linear algebra. + * This is a function in Cirq. 
+ */
+template <typename fp_type>
+struct givens {
+  static constexpr GateKind kind = kgivens;
+  static constexpr char name[] = "givens";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  /**
+   * Creates the gate on qubits q0 and q1.
+   * @param time Forwarded unchanged to CreateGate.
+   * @param q0 First qubit index.
+   * @param q1 Second qubit index.
+   * @param phi Rotation angle passed directly to std::cos / std::sin.
+   * @return Gate holding the 4x4 matrix, with {phi} recorded as its
+   *   parameter.
+   */
+  static GateCirq<fp_type> Create(unsigned time, unsigned q0, unsigned q1,
+                                  fp_type phi) {
+    fp_type c = std::cos(phi);
+    fp_type s = std::sin(phi);
+
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    // Each row lists four entries as (real, imaginary) pairs.
+    return CreateGate<GateCirq<fp_type>, givens<fp_type>>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, c, 0, s, 0, 0, 0,
+                         0, 0, -s, 0, c, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0}, {phi});
+  }
+
+  /**
+   * Returns the four-term Schmidt decomposition for angle phi.
+   * Each term is a pair of eight-number (2x2) single-qubit matrices.
+   */
+  static schmidt_decomp_type<fp_type> SchmidtDecomp(fp_type phi) {
+    fp_type c = std::cos(phi);
+    fp_type s = std::sin(phi);
+
+    return schmidt_decomp_type<fp_type>{
+      {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}},
+      {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}},
+      {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, 0, -s, 0, -s, 0, 0}},
+      {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}},
+    };
+  }
+};
+
+// Gates from cirq/ops/fsim_gate.py:
+
+/**
+ * The fermionic simulation gate family. Contains all two-qubit interactions
+ * that preserve excitations, up to single-qubit rotations and global phase.
+ */ +template +struct FSimGate { + static constexpr GateKind kind = kFSimGate; + static constexpr char name[] = "FSimGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create( + unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) { + if (phi < 0) { + phi += 2 * 3.141592653589793; + } + + fp_type ct = std::cos(theta); + fp_type st = std::sin(theta); + fp_type cp = std::cos(phi); + fp_type sp = std::sin(phi); + + return CreateGate, FSimGate>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ct, 0, 0, -st, 0, 0, + 0, 0, 0, -st, ct, 0, 0, 0, + 0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type theta, fp_type phi) { + fp_type ct = std::cos(theta); + fp_type st = std::sin(theta); + + fp_type cp2 = std::cos(0.5 * phi); + fp_type sp2 = std::sin(0.5 * phi); + fp_type cp4 = std::cos(0.25 * phi); + fp_type sp4 = std::sin(0.25 * phi); + + fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct)); + fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct)); + + fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct); + fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct); + + fp_type c0 = is2 * a0 * std::cos(p0); + fp_type s0 = is2 * a0 * std::sin(p0); + + fp_type c1 = is2 * a1 * std::cos(p1); + fp_type s1 = is2 * a1 * std::sin(p1); + + fp_type st2 = 0.5 * std::sqrt(st); + + fp_type a = cp4 * c0 - sp4 * s0; + fp_type b = cp4 * s0 + sp4 * c0; + fp_type c = cp4 * c0 + sp4 * s0; + fp_type d = cp4 * s0 - sp4 * c0; + + fp_type e = cp4 * c1 - sp4 * s1; + fp_type f = cp4 * s1 + sp4 * c1; + fp_type g = -(cp4 * c1 + sp4 * s1); + fp_type h = -(cp4 * s1 - sp4 * c1); + + return schmidt_decomp_type{ + {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}}, + {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}}, + {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}}, + 
{{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}}, + }; + } +}; + +// Gates from cirq/ops/two_qubit_diagonal_gate.py: + +/** + * A two-qubit diagonal gate. + */ +template +struct TwoQubitDiagonalGate { + static constexpr GateKind kind = kTwoQubitDiagonalGate; + static constexpr char name[] = "TwoQubitDiagonalGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, + const std::vector& angles) { + std::vector cs; + std::vector ss; + cs.reserve(4); + ss.reserve(4); + + for (std::size_t i = 0; i < angles.size(); ++i) { + cs.push_back(std::cos(angles[i])); + ss.push_back(std::sin(angles[i])); + } + + for (std::size_t i = angles.size(); i < 4; ++i) { + cs.push_back(1); + ss.push_back(0); + } + + // Matrix is in this form because the simulator uses inverse qubit order. + return CreateGate, TwoQubitDiagonalGate>( + time, {q0, q1}, {cs[0], ss[0], 0, 0, 0, 0, 0, 0, + 0, 0, cs[2], ss[2], 0, 0, 0, 0, + 0, 0, 0, 0, cs[1], ss[1], 0, 0, + 0, 0, 0, 0, 0, 0, cs[3], ss[3]}); + } +}; + +// Gates from cirq/ops/three_qubit_gates.py: + +/** + * A three-qubit diagonal gate. 
+ */
+template <typename fp_type>
+struct ThreeQubitDiagonalGate {
+  static constexpr GateKind kind = kThreeQubitDiagonalGate;
+  static constexpr char name[] = "ThreeQubitDiagonalGate";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = false;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  /**
+   * Creates the gate on qubits q0, q1 and q2.
+   * @param time Forwarded unchanged to CreateGate.
+   * @param q0 First qubit index.
+   * @param q1 Second qubit index.
+   * @param q2 Third qubit index.
+   * @param angles Phase angles passed directly to std::cos / std::sin.
+   *   If fewer than 8 are given, the missing ones are padded with
+   *   (cos, sin) = (1, 0). Only the first 8 entries are used.
+   * @return Gate holding the 8x8 diagonal matrix. No parameters are
+   *   recorded on the gate.
+   */
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2,
+                                  const std::vector<fp_type>& angles) {
+    std::vector<fp_type> cs;
+    std::vector<fp_type> ss;
+    cs.reserve(8);
+    ss.reserve(8);
+
+    for (std::size_t i = 0; i < angles.size(); ++i) {
+      cs.push_back(std::cos(angles[i]));
+      ss.push_back(std::sin(angles[i]));
+    }
+
+    // Pad so that cs[0..7] and ss[0..7] are always valid below.
+    for (std::size_t i = angles.size(); i < 8; ++i) {
+      cs.push_back(1);
+      ss.push_back(0);
+    }
+
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    // The diagonal therefore reads angles in the order 0, 4, 2, 6, 1, 5, 3, 7.
+    return CreateGate<GateCirq<fp_type>, ThreeQubitDiagonalGate<fp_type>>(
+        time, {q0, q1, q2},
+        {cs[0], ss[0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, cs[4], ss[4], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, cs[2], ss[2], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, cs[6], ss[6], 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, cs[1], ss[1], 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[5], ss[5], 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[3], ss[3], 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[7], ss[7]});
+  }
+};
+
+/**
+ * A gate that applies a phase to the |111⟩ state of three qubits.
+ * This is a generalization of the CCZ gate.
+ */
+template <typename fp_type>
+struct CCZPowGate {
+  static constexpr GateKind kind = kCCZPowGate;
+  static constexpr char name[] = "CCZPowGate";
+  static constexpr unsigned num_qubits = 3;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type pi = static_cast<fp_type>(pi_double);
+
+  /**
+   * Creates the gate on qubits q0, q1 and q2.
+   * @param time Forwarded unchanged to CreateGate.
+   * @param q0 First qubit index.
+   * @param q1 Second qubit index.
+   * @param q2 Third qubit index.
+   * @param exponent Power of the gate. The last diagonal entry has phase
+   *   pi * exponent * (1 + global_shift).
+   * @param global_shift The other seven diagonal entries have phase
+   *   pi * exponent * global_shift.
+   * @return Gate holding the 8x8 diagonal matrix, with
+   *   {exponent, global_shift} recorded as its parameters.
+   */
+  static GateCirq<fp_type> Create(unsigned time,
+                                  unsigned q0, unsigned q1, unsigned q2,
+                                  fp_type exponent, fp_type global_shift = 0) {
+    fp_type gc = std::cos(pi * exponent * global_shift);
+    fp_type gs = std::sin(pi * exponent * global_shift);
+    fp_type ec = std::cos(pi * exponent * (1 + global_shift));
+    fp_type es = std::sin(pi * exponent * (1 + global_shift));
+
+    // Each row lists eight entries as (real, imaginary) pairs.
+    return CreateGate<GateCirq<fp_type>, CCZPowGate<fp_type>>(
+        time, {q0, q1, q2},
+        {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0,
+         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ec, es},
+        {exponent, global_shift});
+  }
+};
+
+/**
+ * A gate that applies a doubly-controlled power of an X gate.
+ * This is a generalization of the CCX (or CCNOT) gate.
+ */ +template +struct CCXPowGate { + static constexpr GateKind kind = kCCXPowGate; + static constexpr char name[] = "CCXPowGate"; + static constexpr unsigned num_qubits = 3; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, unsigned q2, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); + + // Matrix is in this form because the simulator uses inverse qubit order. + return CreateGate, CCXPowGate>( + time, {q0, q1, q2}, + {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, c * ec, c * es, 0, 0, 0, 0, 0, 0, s * es, -s * ec, + 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, + 0, 0, 0, 0, 0, 0, s * es, -s * ec, 0, 0, 0, 0, 0, 0, c * ec, c * es}, + {exponent, global_shift}); + } +}; + +/** + * A controlled swap gate (the Fredkin gate). + */ +template +struct CSwapGate { + static constexpr GateKind kind = kCSwapGate; + static constexpr char name[] = "CSwapGate"; + static constexpr unsigned num_qubits = 3; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, unsigned q2) { + // Matrix is in this form because the simulator uses inverse qubit order. 
+ return CreateGate, CSwapGate>( + time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of CCZPowGate. + * This is the canonical doubly-controlled Z gate. + */ +template +struct CCZ { + static constexpr GateKind kind = kCCZ; + static constexpr char name[] = "CCZ"; + static constexpr unsigned num_qubits = 3; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, unsigned q2) { + return CreateGate, CCZ>( + time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of CCXPowGate. + * This is the canonical doubly-controlled X gate (the TOFFOLI gate). + */ +template +struct CCX { + static constexpr GateKind kind = kCCX; + static constexpr char name[] = "CCX"; + static constexpr unsigned num_qubits = 3; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, unsigned q2) { + // Matrix is in this form because the simulator uses inverse qubit order. 
+ return CreateGate, CCX>( + time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + } +}; + +template +using CCNotPowGate = CCXPowGate; + +template +using TOFFOLI = CCX; + +template +using CCNOT = CCX; + +template +using CSWAP = CSwapGate; + +template +using FREDKIN = CSwapGate; + +// Gates from cirq/ops/matrix_gates.py: + +/** + * A one-qubit gate defined entirely by its matrix. + */ +template +struct MatrixGate1 { + static constexpr GateKind kind = kMatrixGate1; + static constexpr char name[] = "MatrixGate1"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, + const Matrix& m) { + auto m2 = m; + return + CreateGate, MatrixGate1>(time, {q0}, std::move(m2)); + } +}; + +/** + * A two-qubit gate defined entirely by its matrix. + */ +template +struct MatrixGate2 { + static constexpr GateKind kind = kMatrixGate2; + static constexpr char name[] = "MatrixGate2"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + template > + static GateCirq Create( + unsigned time, unsigned q0, unsigned q1, M&& m) { + return CreateGate, MatrixGate2>(time, {q1, q0}, + std::forward(m)); + } +}; + +/** + * A multi-qubit gate defined entirely by its matrix. 
+ */
+template <typename fp_type>
+struct MatrixGate {
+  static constexpr GateKind kind = kMatrixGate;
+  static constexpr char name[] = "MatrixGate";
+  static constexpr bool symmetric = false;
+
+  // The qubit list is taken by value and reversed before being handed to
+  // CreateGate. The matrix is perfectly forwarded.
+  template <typename M = Matrix<fp_type>>
+  static GateCirq<fp_type> Create(unsigned time,
+                                  std::vector<unsigned> qubits, M&& m) {
+    std::reverse(qubits.begin(), qubits.end());
+    return CreateGate<GateCirq<fp_type>, MatrixGate<fp_type>>(
+        time, std::move(qubits), std::forward<M>(m));
+  }
+};
+
+}  // namespace Cirq
+
+// Returns the Schmidt decomposition of the two-qubit Cirq gate `kind`.
+// `params` must hold as many values as that gate's SchmidtDecomp takes
+// (zero, one or two); they are read unchecked as params[0] and params[1].
+// Any other kind yields an empty decomposition.
+template <typename fp_type>
+inline schmidt_decomp_type<fp_type> GetSchmidtDecomp(
+    Cirq::GateKind kind, const std::vector<fp_type>& params) {
+  switch (kind) {
+  case Cirq::kI2:
+    return Cirq::I2<fp_type>::SchmidtDecomp();
+  case Cirq::kCZPowGate:
+    return Cirq::CZPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kCXPowGate:
+    return Cirq::CXPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kCZ:
+    return Cirq::CZ<fp_type>::SchmidtDecomp();
+  case Cirq::kCX:
+    return Cirq::CX<fp_type>::SchmidtDecomp();
+  case Cirq::kXXPowGate:
+    return Cirq::XXPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kYYPowGate:
+    return Cirq::YYPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kZZPowGate:
+    return Cirq::ZZPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kXX:
+    return Cirq::XX<fp_type>::SchmidtDecomp();
+  case Cirq::kYY:
+    return Cirq::YY<fp_type>::SchmidtDecomp();
+  case Cirq::kZZ:
+    return Cirq::ZZ<fp_type>::SchmidtDecomp();
+  case Cirq::kSwapPowGate:
+    return Cirq::SwapPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kISwapPowGate:
+    return Cirq::ISwapPowGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  case Cirq::kriswap:
+    return Cirq::riswap<fp_type>::SchmidtDecomp(params[0]);
+  case Cirq::kSWAP:
+    return Cirq::SWAP<fp_type>::SchmidtDecomp();
+  case Cirq::kISWAP:
+    return Cirq::ISWAP<fp_type>::SchmidtDecomp();
+  case Cirq::kPhasedISwapPowGate:
+    return Cirq::PhasedISwapPowGate<fp_type>::SchmidtDecomp(
+        params[0], params[1]);
+  case Cirq::kgivens:
+    return Cirq::givens<fp_type>::SchmidtDecomp(params[0]);
+  case Cirq::kFSimGate:
+    return Cirq::FSimGate<fp_type>::SchmidtDecomp(params[0], params[1]);
+  default:
+    // Single qubit gates or gates with unimplemented Schmidt decomposition.
+    return schmidt_decomp_type<fp_type>{};
+  }
+}
+
+}  // namespace qsim
+
+#endif  // GATES_CIRQ_H_
diff --git a/qsim/gates_qsim.h b/qsim/gates_qsim.h
new file mode 100644
index 0000000..366c4f1
--- /dev/null
+++ b/qsim/gates_qsim.h
@@ -0,0 +1,661 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GATES_QSIM_H_
+#define GATES_QSIM_H_
+
+#include <array>
+#include <cmath>
+#include <vector>
+
+#include "gate.h"
+
+namespace qsim {
+
+// Gate set implemented in qsim contains the following gates.
+enum GateKind {
+  kGateId1 = 0, // one-qubit Id
+  kGateHd,      // Hadamard
+  kGateT,       // T
+  kGateX,       // X
+  kGateY,       // Y
+  kGateZ,       // Z
+  kGateX2,      // sqrt(X)
+  kGateY2,      // sqrt(Y)
+  kGateRX,      // X-rotation
+  kGateRY,      // Y-rotation
+  kGateRZ,      // Z-rotation
+  kGateRXY,     // XY-rotation (rotation around arbitrary axis in the XY plane)
+  kGateHZ2,     // pi / 2 rotation around the X + Y axis
+  kGateS,       // S
+  kGateId2,     // two-qubit Id
+  kGateCZ,      // CZ
+  kGateCNot,    // CNOT (CX)
+  kGateSwap,    // swap
+  kGateIS,      // iSwap
+  kGateFS,      // fSim
+  kGateCP,      // control phase
+  kGateMatrix1, // one-qubit matrix gate
+  kGateMatrix2, // two-qubit matrix gate
+  kGateGPh,     // global phase gate
+  kDecomp = gate::kDecomp,
+  kMeasurement = gate::kMeasurement,
+};
+
+// Specialization of Gate (defined in gate.h) for the qsim gate set.
+template <typename fp_type>
+using GateQSim = Gate<fp_type, GateKind>;
+
+// 1/2 and 1/sqrt(2), kept in double precision and narrowed per gate.
+constexpr double h_double = 0.5;
+constexpr double is2_double = 0.7071067811865475;
+
+// Zero-qubit gates:
+
+/**
+ * The global phase gate.
+ */
+template <typename fp_type>
+struct GateGPh {
+  static constexpr GateKind kind = kGateGPh;
+  static constexpr char name[] = "p";
+  // NOTE(review): declared as 1 although Create passes an empty qubit list;
+  // confirm what readers of num_qubits expect before changing it.
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  // Builds the gate from the phase angle phi.
+  static GateQSim<fp_type> Create(unsigned time, fp_type phi) {
+    return Create(time, std::cos(phi), std::sin(phi));
+  }
+
+  // Builds the gate from a precomputed cosine/sine pair. The pair is stored
+  // both as the matrix and as the gate parameters.
+  static GateQSim<fp_type> Create(unsigned time, fp_type cp, fp_type sp) {
+    return CreateGate<GateQSim<fp_type>, GateGPh<fp_type>>(
+        time, {}, {cp, sp}, {cp, sp});
+  }
+};
+
+// One-qubit gates:
+
+/**
+ * The one-qubit identity gate.
+ */
+template <typename fp_type>
+struct GateId1 {
+  static constexpr GateKind kind = kGateId1;
+  static constexpr char name[] = "id1";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  // Matrix literals list the 2x2 entries row by row as (real, imaginary)
+  // pairs.
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateId1<fp_type>>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0});
+  }
+};
+
+/**
+ * The Hadamard gate.
+ */
+template <typename fp_type>
+struct GateHd {
+  static constexpr GateKind kind = kGateHd;
+  static constexpr char name[] = "h";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateHd<fp_type>>(
+        time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0});
+  }
+};
+
+/**
+ * The T gate, equivalent to `Z ^ 0.25`.
+ */
+template <typename fp_type>
+struct GateT {
+  static constexpr GateKind kind = kGateT;
+  static constexpr char name[] = "t";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateT<fp_type>>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2});
+  }
+};
+
+/**
+ * The Pauli X (or "NOT") gate.
+ */
+template <typename fp_type>
+struct GateX {
+  static constexpr GateKind kind = kGateX;
+  static constexpr char name[] = "x";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  // Matrix literals list the 2x2 entries row by row as (real, imaginary)
+  // pairs.
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateX<fp_type>>(
+        time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0});
+  }
+};
+
+/**
+ * The Pauli Y gate.
+ */
+template <typename fp_type>
+struct GateY {
+  static constexpr GateKind kind = kGateY;
+  static constexpr char name[] = "y";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateY<fp_type>>(
+        time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0});
+  }
+};
+
+/**
+ * The Pauli Z gate.
+ */
+template <typename fp_type>
+struct GateZ {
+  static constexpr GateKind kind = kGateZ;
+  static constexpr char name[] = "z";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateZ<fp_type>>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0});
+  }
+};
+
+/**
+ * The "square root of X" gate.
+ */
+template <typename fp_type>
+struct GateX2 {
+  static constexpr GateKind kind = kGateX2;
+  static constexpr char name[] = "x_1_2";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateX2<fp_type>>(
+        time, {q0}, {h, h, h, -h, h, -h, h, h});
+  }
+};
+
+/**
+ * The "square root of Y" gate.
+ */
+template <typename fp_type>
+struct GateY2 {
+  static constexpr GateKind kind = kGateY2;
+  static constexpr char name[] = "y_1_2";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  // Matrix literals list the 2x2 entries row by row as (real, imaginary)
+  // pairs.
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateY2<fp_type>>(
+        time, {q0}, {h, h, -h, -h, h, h, h, h});
+  }
+};
+
+/**
+ * A gate that rotates around the X axis of the Bloch sphere.
+ * This is a generalization of the X gate.
+ */
+template <typename fp_type>
+struct GateRX {
+  static constexpr GateKind kind = kGateRX;
+  static constexpr char name[] = "rx";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  // phi is the rotation angle; the matrix is built from cos and sin of
+  // -phi / 2, and phi itself is recorded as the gate parameter.
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    fp_type phi2 = -0.5 * phi;
+    fp_type c = std::cos(phi2);
+    fp_type s = std::sin(phi2);
+
+    return CreateGate<GateQSim<fp_type>, GateRX<fp_type>>(
+        time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi});
+  }
+};
+
+/**
+ * A gate that rotates around the Y axis of the Bloch sphere.
+ * This is a generalization of the Y gate.
+ */
+template <typename fp_type>
+struct GateRY {
+  static constexpr GateKind kind = kGateRY;
+  static constexpr char name[] = "ry";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  // phi is the rotation angle; the matrix is built from cos and sin of
+  // -phi / 2, and phi itself is recorded as the gate parameter.
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    fp_type phi2 = -0.5 * phi;
+    fp_type c = std::cos(phi2);
+    fp_type s = std::sin(phi2);
+
+    return CreateGate<GateQSim<fp_type>, GateRY<fp_type>>(
+        time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi});
+  }
+};
+
+/**
+ * A gate that rotates around the Z axis of the Bloch sphere.
+ * This is a generalization of the Z gate.
+ */
+template <typename fp_type>
+struct GateRZ {
+  static constexpr GateKind kind = kGateRZ;
+  static constexpr char name[] = "rz";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  // phi is the rotation angle; the matrix is built from cos and sin of
+  // -phi / 2, and phi itself is recorded as the gate parameter.
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, fp_type phi) {
+    fp_type phi2 = -0.5 * phi;
+    fp_type c = std::cos(phi2);
+    fp_type s = std::sin(phi2);
+
+    return CreateGate<GateQSim<fp_type>, GateRZ<fp_type>>(
+        time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi});
+  }
+};
+
+/**
+ * A gate that rotates around an arbitrary axis in the XY-plane.
+ */
+template <typename fp_type>
+struct GateRXY {
+  static constexpr GateKind kind = kGateRXY;
+  static constexpr char name[] = "rxy";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  // theta enters through cos(theta) and sin(theta), each scaled by
+  // sin(-phi / 2); phi additionally sets the diagonal via cos(-phi / 2).
+  // {theta, phi} are recorded as the gate parameters.
+  static GateQSim<fp_type> Create(
+      unsigned time, unsigned q0, fp_type theta, fp_type phi) {
+    fp_type phi2 = -0.5 * phi;
+    fp_type cp = std::cos(phi2);
+    fp_type sp = std::sin(phi2);
+    fp_type ct = std::cos(theta) * sp;
+    fp_type st = std::sin(theta) * sp;
+
+    return CreateGate<GateQSim<fp_type>, GateRXY<fp_type>>(
+        time, {q0}, {cp, 0, st, ct, -st, ct, cp, 0}, {theta, phi});
+  }
+};
+
+/**
+ * A pi / 2 rotation around the X + Y axis.
+ */
+template <typename fp_type>
+struct GateHZ2 {
+  static constexpr GateKind kind = kGateHZ2;
+  static constexpr char name[] = "hz_1_2";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateHZ2<fp_type>>(
+        time, {q0}, {h, h, 0, -is2, is2, 0, h, h});
+  }
+};
+
+/**
+ * The S gate, equivalent to "square root of Z".
+ */
+template <typename fp_type>
+struct GateS {
+  static constexpr GateKind kind = kGateS;
+  static constexpr char name[] = "s";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  // Matrix literals list the entries row by row as (real, imaginary) pairs.
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0) {
+    return CreateGate<GateQSim<fp_type>, GateS<fp_type>>(
+        time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1});
+  }
+};
+
+/**
+ * A one-qubit gate defined entirely by its matrix.
+ */
+template <typename fp_type>
+struct GateMatrix1 {
+  static constexpr GateKind kind = kGateMatrix1;
+  static constexpr char name[] = "mat1";
+  static constexpr unsigned num_qubits = 1;
+  static constexpr bool symmetric = true;
+
+  // The caller's matrix is copied, and the copy is moved into the gate.
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0,
+                                  const Matrix<fp_type>& m) {
+    auto m2 = m;
+    return
+        CreateGate<GateQSim<fp_type>, GateMatrix1<fp_type>>(time, {q0},
+                                                             std::move(m2));
+  }
+};
+
+// Two-qubit gates:
+
+/**
+ * The two-qubit identity gate.
+ */
+template <typename fp_type>
+struct GateId2 {
+  static constexpr GateKind kind = kGateId2;
+  static constexpr char name[] = "id2";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateId2<fp_type>>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  // Single-term decomposition: a pair of 2x2 identity matrices.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+    };
+  }
+};
+
+/**
+ * The controlled-Z (CZ) gate.
+ */
+template <typename fp_type>
+struct GateCZ {
+  static constexpr GateKind kind = kGateCZ;
+  static constexpr char name[] = "cz";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  // Matrix literals list the 4x4 entries row by row as (real, imaginary)
+  // pairs.
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateCZ<fp_type>>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, -1, 0});
+  }
+
+  // Two-term decomposition; each term is a pair of 2x2 matrices.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}},
+    };
+  }
+};
+
+/**
+ * The controlled-X (CX or CNOT) gate.
+ */
+template <typename fp_type>
+struct GateCNot {
+  static constexpr GateKind kind = kGateCNot;
+  static constexpr char name[] = "cnot";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = false;
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    // Matrix is in this form because the simulator uses inverse qubit order.
+    return CreateGate<GateQSim<fp_type>, GateCNot<fp_type>>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0});
+  }
+
+  // Two-term decomposition; each term is a pair of 2x2 matrices.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}},
+      {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}},
+    };
+  }
+};
+
+/**
+ * The SWAP gate. Exchanges two qubits.
+ */
+template <typename fp_type>
+struct GateSwap {
+  static constexpr GateKind kind = kGateSwap;
+  static constexpr char name[] = "sw";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  // Matrix literals list the 4x4 entries row by row as (real, imaginary)
+  // pairs.
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateSwap<fp_type>>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 1, 0, 0, 0,
+                         0, 0, 1, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  // Four-term decomposition; each term is a pair of 2x2 matrices.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
+      {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}},
+      {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}},
+      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
+    };
+  }
+};
+
+/**
+ * The ISWAP gate.
+ */
+template <typename fp_type>
+struct GateIS {
+  static constexpr GateKind kind = kGateIS;
+  static constexpr char name[] = "is";
+  static constexpr unsigned num_qubits = 2;
+  static constexpr bool symmetric = true;
+
+  static constexpr fp_type h = static_cast<fp_type>(h_double);
+  static constexpr fp_type is2 = static_cast<fp_type>(is2_double);
+
+  static GateQSim<fp_type> Create(unsigned time, unsigned q0, unsigned q1) {
+    return CreateGate<GateQSim<fp_type>, GateIS<fp_type>>(
+        time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 1, 0, 0,
+                         0, 0, 0, 1, 0, 0, 0, 0,
+                         0, 0, 0, 0, 0, 0, 1, 0});
+  }
+
+  // Four-term decomposition; each term is a pair of 2x2 matrices.
+  static schmidt_decomp_type<fp_type> SchmidtDecomp() {
+    return schmidt_decomp_type<fp_type>{
+      {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}},
+      {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}},
+      {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}},
+      {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}},
+    };
+  }
+};
+
+/**
+ * The fermionic simulation (FSim) gate family. Contains all two-qubit
+ * interactions that preserve excitations, up to single-qubit rotations and
+ * global phase.
+ */ +template +struct GateFS { + static constexpr GateKind kind = kGateFS; + static constexpr char name[] = "fs"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateQSim Create( + unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) { + if (phi < 0) { + phi += 2 * 3.141592653589793; + } + + fp_type ct = std::cos(theta); + fp_type st = std::sin(theta); + fp_type cp = std::cos(phi); + fp_type sp = std::sin(phi); + + return CreateGate, GateFS>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ct, 0, 0, -st, 0, 0, + 0, 0, 0, -st, ct, 0, 0, 0, + 0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type theta, fp_type phi) { + fp_type ct = std::cos(theta); + fp_type st = std::sin(theta); + + fp_type cp2 = std::cos(0.5 * phi); + fp_type sp2 = std::sin(0.5 * phi); + fp_type cp4 = std::cos(0.25 * phi); + fp_type sp4 = std::sin(0.25 * phi); + + fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct)); + fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct)); + + fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct); + fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct); + + fp_type c0 = is2 * a0 * std::cos(p0); + fp_type s0 = is2 * a0 * std::sin(p0); + + fp_type c1 = is2 * a1 * std::cos(p1); + fp_type s1 = is2 * a1 * std::sin(p1); + + fp_type st2 = 0.5 * std::sqrt(st); + + fp_type a = cp4 * c0 - sp4 * s0; + fp_type b = cp4 * s0 + sp4 * c0; + fp_type c = cp4 * c0 + sp4 * s0; + fp_type d = cp4 * s0 - sp4 * c0; + + fp_type e = cp4 * c1 - sp4 * s1; + fp_type f = cp4 * s1 + sp4 * c1; + fp_type g = -(cp4 * c1 + sp4 * s1); + fp_type h = -(cp4 * s1 - sp4 * c1); + + return schmidt_decomp_type{ + {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}}, + {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}}, + {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}}, + {{e, f, 0, 0, 
0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}}, + }; + } +}; + +/** + * The controlled phase gate. A generalized version of GateCZ. + */ +template +struct GateCP { + static constexpr GateKind kind = kGateCP; + static constexpr char name[] = "cp"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateQSim Create( + unsigned time, unsigned q0, unsigned q1, fp_type phi) { + fp_type cp = std::cos(phi); + fp_type sp = std::sin(phi); + + return CreateGate, GateCP>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, cp, -sp}, {phi}); + } + + static schmidt_decomp_type SchmidtDecomp(fp_type phi) { + fp_type cp = std::cos(phi); + fp_type sp = std::sin(phi); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, cp, -sp}}, + }; + } +}; + +/** + * A two-qubit gate defined entirely by its matrix. + */ +template +struct GateMatrix2 { + static constexpr GateKind kind = kGateMatrix2; + static constexpr char name[] = "mat2"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + template > + static GateQSim Create( + unsigned time, unsigned q0, unsigned q1, M&& m) { + return CreateGate, GateMatrix2>(time, {q1, q0}, + std::forward(m)); + } + + static schmidt_decomp_type SchmidtDecomp(fp_type phi) { + // Not implemented. 
+ return schmidt_decomp_type{}; + } +}; + +template +inline schmidt_decomp_type GetSchmidtDecomp( + GateKind kind, const std::vector& params) { + switch (kind) { + case kGateId2: + return GateId2::SchmidtDecomp(); + case kGateCZ: + return GateCZ::SchmidtDecomp(); + case kGateCNot: + return GateCNot::SchmidtDecomp(); + case kGateSwap: + return GateSwap::SchmidtDecomp(); + case kGateIS: + return GateIS::SchmidtDecomp(); + case kGateFS: + return GateFS::SchmidtDecomp(params[0], params[1]); + case kGateCP: + return GateCP::SchmidtDecomp(params[0]); + default: + // Single qubit gates: empty Schmidt decomposition. + return schmidt_decomp_type{}; + } +} + +} // namespace qsim + +#endif // GATES_QSIM_H_ diff --git a/qsim/hybrid.h b/qsim/hybrid.h new file mode 100644 index 0000000..44fad5b --- /dev/null +++ b/qsim/hybrid.h @@ -0,0 +1,612 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HYBRID_H_ +#define HYBRID_H_ + +#include +#include +#include +#include + +#include "gate.h" +#include "gate_appl.h" + +namespace qsim { + +/** + * Hybrid Feynman-Schrodinger simulator. + */ +template class FuserT, typename For> +struct HybridSimulator final { + public: + using Gate = GateT; + using GateKind = typename Gate::GateKind; + using fp_type = typename Gate::fp_type; + + private: + // Note that one can use "struct GateHybrid : public Gate {" in C++17. 
+ struct GateHybrid { + using GateKind = HybridSimulator::GateKind; + using fp_type = HybridSimulator::fp_type; + + GateKind kind; + unsigned time; + std::vector qubits; + std::vector controlled_by; + uint64_t cmask; + std::vector params; + Matrix matrix; + bool unfusible; + bool swapped; + + const Gate* parent; + unsigned id; + }; + + struct GateX { + GateHybrid* decomposed0; + GateHybrid* decomposed1; + schmidt_decomp_type schmidt_decomp; + unsigned schmidt_bits; + unsigned swapped; + }; + + public: + using Fuser = FuserT; + using GateFused = typename Fuser::GateFused; + + /** + * Contextual data for hybrid simulation. + */ + struct HybridData { + /** + * List of gates on the "0" side of the cut. + */ + std::vector gates0; + /** + * List of gates on the "1" side of the cut. + */ + std::vector gates1; + /** + * List of gates on the cut. + */ + std::vector gatexs; + /** + * Global qubit index to local qubit index map. + */ + std::vector qubit_map; + /** + * Number of qubits on the "0" side of the cut. + */ + unsigned num_qubits0; + /** + * Number of qubits on the "1" side of the cut. + */ + unsigned num_qubits1; + /** + * Number of gates on the cut. + */ + unsigned num_gatexs; + }; + + /** + * User-specified parameters for gate fusion and hybrid simulation. + */ + struct Parameter : public Fuser::Parameter { + /** + * Fixed bitstring indicating values to assign to Schmidt decomposition + * indices of prefix gates. + */ + uint64_t prefix; + /** + * Number of gates on the cut that are part of the prefix. Indices of these + * gates are assigned the value indicated by `prefix`. + */ + unsigned num_prefix_gatexs; + /** + * Number of gates on the cut that are part of the root. All gates that are + * not part of the prefix or root are part of the suffix. + */ + unsigned num_root_gatexs; + unsigned num_threads; + }; + + template + explicit HybridSimulator(Args&&... args) : for_(args...) 
{} + + /** + * Splits the lattice into two parts, using Schmidt decomposition for gates + * on the cut. + * @param parts Part (0 or 1) of each qubit, indexed by global qubit index. + * @param gates List of all gates in the circuit. + * @param hd Output data with split parts. + * @return True if the splitting was done successfully; false otherwise. + */ + static bool SplitLattice(const std::vector& parts, + const std::vector& gates, HybridData& hd) { + hd.num_gatexs = 0; + hd.num_qubits0 = 0; + hd.num_qubits1 = 0; + + hd.gates0.reserve(gates.size()); + hd.gates1.reserve(gates.size()); + hd.qubit_map.reserve(parts.size()); + + unsigned count0 = 0; + unsigned count1 = 0; + + // Build the global qubit index to local qubit index map. + for (std::size_t i = 0; i < parts.size(); ++i) { + parts[i] == 0 ? ++hd.num_qubits0 : ++hd.num_qubits1; + hd.qubit_map.push_back(parts[i] == 0 ? count0++ : count1++); + } + + // Split the lattice. + for (const auto& gate : gates) { + if (gate.kind == gate::kMeasurement) { + IO::errorf("measurement gates are not suported by qsimh.\n"); + return false; + } + + if (gate.controlled_by.size() > 0) { + IO::errorf("controlled gates are not suported by qsimh.\n"); + return false; + } + + switch (gate.qubits.size()) { + case 1: // Single qubit gates. + switch (parts[gate.qubits[0]]) { + case 0: + hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time, + {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix, + false, false, nullptr, 0}); + break; + case 1: + hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time, + {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix, + false, false, nullptr, 0}); + break; + } + break; + case 2: // Two qubit gates. + { + switch ((parts[gate.qubits[1]] << 1) | parts[gate.qubits[0]]) { + case 0: // Both qubits in part 0.
+ hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time, + {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]}, + {}, 0, gate.params, gate.matrix, false, gate.swapped, + nullptr, 0}); + break; + case 1: // Gate on the cut, qubit 0 in part 1, qubit 1 in part 0. + hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, + {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {}, + true, gate.swapped, &gate, hd.num_gatexs}); + hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, + {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {}, + true, gate.swapped, &gate, hd.num_gatexs}); + + ++hd.num_gatexs; + break; + case 2: // Gate on the cut, qubit 0 in part 0, qubit 1 in part 1. + hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, + {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {}, + true, gate.swapped, &gate, hd.num_gatexs}); + hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, + {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {}, + true, gate.swapped, &gate, hd.num_gatexs}); + + ++hd.num_gatexs; + break; + case 3: // Both qubits in part 1. + hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time, + {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]}, + {}, 0, gate.params, gate.matrix, false, gate.swapped, + nullptr, 0}); + break; + } + } + break; + default: + IO::errorf("multi-qubit gates are not suported by qsimh.\n"); + return false; + } + } + + auto compare = [](const GateHybrid& l, const GateHybrid& r) -> bool { + return l.time < r.time || (l.time == r.time && + (l.parent < r.parent || (l.parent == r.parent && l.id < r.id))); + }; + + // Sort gates. + std::sort(hd.gates0.begin(), hd.gates0.end(), compare); + std::sort(hd.gates1.begin(), hd.gates1.end(), compare); + + hd.gatexs.reserve(hd.num_gatexs); + + // Get Schmidt matrices. 
+ for (auto& gate0 : hd.gates0) { + if (gate0.parent != nullptr) { + auto d = GetSchmidtDecomp(gate0.parent->kind, gate0.parent->params); + if (d.size() == 0) { + IO::errorf("no Schmidt decomposition for gate kind %u.\n", + gate0.parent->kind); + return false; + } + + unsigned schmidt_bits = SchmidtBits(d.size()); + if (schmidt_bits > 2) { + IO::errorf("Schmidt rank is too large for gate kind %u.\n", + gate0.parent->kind); + return false; + } + + unsigned swapped = parts[gate0.parent->qubits[0]]; + if (gate0.parent->swapped) swapped = 1 - swapped; + hd.gatexs.emplace_back(GateX{&gate0, nullptr, std::move(d), + schmidt_bits, swapped}); + } + } + + unsigned count = 0; + for (auto& gate1 : hd.gates1) { + if (gate1.parent != nullptr) { + hd.gatexs[count++].decomposed1 = &gate1; + } + } + + for (auto& gatex : hd.gatexs) { + if (gatex.schmidt_decomp.size() == 1) { + FillSchmidtMatrices(0, gatex); + } + } + + return true; + } + + /** + * Runs the hybrid simulator on a sectioned lattice. + * @param param Options for parallelism and logging. Also specifies the size + * of the 'prefix' and 'root' sections of the lattice. + * @param factory Object to create simulators and state spaces. + * @param hd Container object for gates on the boundary between lattice + * sections. + * @param parts Lattice sections to be simulated. + * @param fgates0 List of gates from one section of the lattice. + * @param fgates1 List of gates from the other section of the lattice. + * @param bitstrings List of output states to simulate, as bitstrings. + * @param results Output vector of amplitudes. After a successful run, this + * will be populated with amplitudes for each state in 'bitstrings'. + * @return True if the simulation completed successfully; false otherwise. 
+ */ + template + bool Run(const Parameter& param, const Factory& factory, + HybridData& hd, const std::vector& parts, + const std::vector& fgates0, + const std::vector& fgates1, + const std::vector& bitstrings, Results& results) const { + using Simulator = typename Factory::Simulator; + using StateSpace = typename Simulator::StateSpace; + using State = typename StateSpace::State; + + unsigned num_p_gates = param.num_prefix_gatexs; + unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; + + auto bits = CountSchmidtBits(param, hd.gatexs); + + uint64_t rmax = uint64_t{1} << bits.num_r_bits; + uint64_t smax = uint64_t{1} << bits.num_s_bits; + + auto loc0 = CheckpointLocations(param, fgates0); + auto loc1 = CheckpointLocations(param, fgates1); + + struct Index { + unsigned i0; + unsigned i1; + }; + + std::vector indices; + indices.reserve(bitstrings.size()); + + // Bitstring indices for part 0 and part 1. TODO: optimize. + for (const auto& bitstring : bitstrings) { + Index index{0, 0}; + + for (uint64_t i = 0; i < hd.qubit_map.size(); ++i) { + unsigned m = ((bitstring >> i) & 1) << hd.qubit_map[i]; + parts[i] ? index.i1 |= m : index.i0 |= m; + } + + indices.push_back(index); + } + + StateSpace state_space = factory.CreateStateSpace(); + + State* rstate0; + State* rstate1; + + State state0p = state_space.Null(); + State state1p = state_space.Null(); + State state0r = state_space.Null(); + State state1r = state_space.Null(); + State state0s = state_space.Null(); + State state1s = state_space.Null(); + + // Create states. 
+ + if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, true, + state0p, state1p, rstate0, rstate1)) { + return false; + } + + if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, rmax > 1, + state0r, state1r, rstate0, rstate1)) { + return false; + } + + if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, smax > 1, + state0s, state1s, rstate0, rstate1)) { + return false; + } + + state_space.SetStateZero(state0p); + state_space.SetStateZero(state1p); + + Simulator simulator = factory.CreateSimulator(); + + std::vector prev(hd.num_gatexs, unsigned(-1)); + + // param.prefix encodes the prefix path. + unsigned gatex_index = SetSchmidtMatrices( + 0, num_p_gates, param.prefix, prev, hd.gatexs); + + if (gatex_index == 0) { + // Apply gates before the first checkpoint. + ApplyGates(fgates0, 0, loc0[0], simulator, state0p); + ApplyGates(fgates1, 0, loc1[0], simulator, state1p); + } else { + IO::errorf("invalid prefix %lu for prefix gate index %u.\n", + param.prefix, gatex_index - 1); + return false; + } + + // Branch over root gates on the cut. r encodes the root path. + for (uint64_t r = 0; r < rmax; ++r) { + if (rmax > 1) { + state_space.Copy(state0p, state0r); + state_space.Copy(state1p, state1r); + } + + if (SetSchmidtMatrices(num_p_gates, num_pr_gates, + r, prev, hd.gatexs) == 0) { + // Apply gates before the second checkpoint. + ApplyGates(fgates0, loc0[0], loc0[1], simulator, state0r); + ApplyGates(fgates1, loc1[0], loc1[1], simulator, state1r); + } else { + continue; + } + + // Branch over suffix gates on the cut. s encodes the suffix path. + for (uint64_t s = 0; s < smax; ++s) { + if (smax > 1) { + state_space.Copy(rmax > 1 ? state0r : state0p, state0s); + state_space.Copy(rmax > 1 ? state1r : state1p, state1s); + } + + if (SetSchmidtMatrices(num_pr_gates, hd.num_gatexs, + s, prev, hd.gatexs) == 0) { + // Apply the rest of the gates. 
+ ApplyGates(fgates0, loc0[1], fgates0.size(), simulator, state0s); + ApplyGates(fgates1, loc1[1], fgates1.size(), simulator, state1s); + } else { + continue; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const StateSpace& state_space, + const State& state0, const State& state1, + const std::vector& indices, Results& results) { + // TODO: make it faster for the CUDA state space. + auto a0 = state_space.GetAmpl(state0, indices[i].i0); + auto a1 = state_space.GetAmpl(state1, indices[i].i1); + results[i] += a0 * a1; + }; + + // Collect results. + for_.Run(results.size(), f, + state_space, *rstate0, *rstate1, indices, results); + } + } + + return true; + } + + private: + /** + * Identifies when to save "checkpoints" of the simulation state. These allow + * runs with different cut-index values to reuse parts of the simulation. + * @param param Options for parallelism and logging. Also specifies the size + * of the 'prefix' and 'root' sections of the lattice. + * @param fgates Set of gates for which to find checkpoint locations. + * @return A pair of numbers specifying how many gates to apply before the + * first and second checkpoints, respectively. + */ + static std::array CheckpointLocations( + const Parameter& param, const std::vector& fgates) { + std::array loc{0, 0}; + + unsigned num_decomposed = 0; + unsigned num_p_gates = param.num_prefix_gatexs; + unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; + + for (std::size_t i = 0; i < fgates.size(); ++i) { + for (auto gate: fgates[i].gates) { + if (gate->parent != nullptr) { + ++num_decomposed; + // There should be only one decomposed gate in fused gate. 
+ break; + } + } + + if (num_decomposed <= num_p_gates) { + loc[0] = i + 1; + } + + if (num_decomposed <= num_pr_gates) { + loc[1] = i + 1; + } + } + + return loc; + } + + struct Bits { + unsigned num_p_bits; + unsigned num_r_bits; + unsigned num_s_bits; + }; + + static Bits CountSchmidtBits( + const Parameter& param, const std::vector& gatexs) { + Bits bits{0, 0, 0}; + + unsigned num_p_gates = param.num_prefix_gatexs; + unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; + + for (std::size_t i = 0; i < gatexs.size(); ++i) { + const auto& gatex = gatexs[i]; + if (i < num_p_gates) { + bits.num_p_bits += gatex.schmidt_bits; + } else if (i < num_pr_gates) { + bits.num_r_bits += gatex.schmidt_bits; + } else { + bits.num_s_bits += gatex.schmidt_bits; + } + } + + return bits; + } + + static unsigned SetSchmidtMatrices(std::size_t i0, std::size_t i1, + uint64_t path, + std::vector& prev_k, + std::vector& gatexs) { + unsigned shift_length = 0; + + for (std::size_t i = i0; i < i1; ++i) { + const auto& gatex = gatexs[i]; + + if (gatex.schmidt_bits == 0) { + // Continue if gatex has Schmidt rank 1. + continue; + } + + unsigned k = (path >> shift_length) & ((1 << gatex.schmidt_bits) - 1); + shift_length += gatex.schmidt_bits; + + if (k != prev_k[i]) { + if (k >= gatex.schmidt_decomp.size()) { + // Invalid path. Returns gatex index plus one to report error in case + // of invalid prefix. 
+ return i + 1; + } + + FillSchmidtMatrices(k, gatex); + + prev_k[i] = k; + } + } + + return 0; + } + + static void FillSchmidtMatrices(unsigned k, const GateX& gatex) { + unsigned part0 = gatex.swapped; + unsigned part1 = 1 - part0; + { + gatex.decomposed0->matrix.resize(gatex.schmidt_decomp[k][part0].size()); + auto begin = gatex.schmidt_decomp[k][part0].begin(); + auto end = gatex.schmidt_decomp[k][part0].end(); + std::copy(begin, end, gatex.decomposed0->matrix.begin()); + } + { + gatex.decomposed1->matrix.resize(gatex.schmidt_decomp[k][part1].size()); + auto begin = gatex.schmidt_decomp[k][part1].begin(); + auto end = gatex.schmidt_decomp[k][part1].end(); + std::copy(begin, end, gatex.decomposed1->matrix.begin()); + } + } + + template + static void ApplyGates(const std::vector& gates, + std::size_t i0, std::size_t i1, + const Simulator& simulator, + typename Simulator::State& state) { + for (std::size_t i = i0; i < i1; ++i) { + if (gates[i].matrix.size() > 0) { + ApplyFusedGate(simulator, gates[i], state); + } else { + auto gate = gates[i]; + CalculateFusedMatrix(gate); + ApplyFusedGate(simulator, gate, state); + } + } + } + + static unsigned SchmidtBits(unsigned size) { + switch (size) { + case 1: + return 0; + case 2: + return 1; + case 3: + return 2; + case 4: + return 2; + default: + // Not supported. 
+ return 42; + } + } + + template + static bool CreateStates(unsigned num_qubits0,unsigned num_qubits1, + const StateSpace& state_space, bool create, + typename StateSpace::State& state0, + typename StateSpace::State& state1, + typename StateSpace::State* (&rstate0), + typename StateSpace::State* (&rstate1)) { + if (create) { + state0 = state_space.Create(num_qubits0); + state1 = state_space.Create(num_qubits1); + + if (state_space.IsNull(state0) || state_space.IsNull(state1)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return false; + } + + rstate0 = &state0; + rstate1 = &state1; + } + + return true; + } + + For for_; +}; + +} // namespace qsim + +#endif // HYBRID_H_ diff --git a/qsim/io.h b/qsim/io.h new file mode 100644 index 0000000..3b26c7c --- /dev/null +++ b/qsim/io.h @@ -0,0 +1,44 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef IO_H_ +#define IO_H_ + +#include +#include + +namespace qsim { + +/** + * Controller for output logs. + */ +struct IO { + static void errorf(const char* format, ...) { + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + } + + static void messagef(const char* format, ...) 
{ + va_list args; + va_start(args, format); + vprintf(format, args); + va_end(args); + } +}; + +} // namespace qsim + +#endif // IO_H_ diff --git a/qsim/io_file.h b/qsim/io_file.h new file mode 100644 index 0000000..3cfac12 --- /dev/null +++ b/qsim/io_file.h @@ -0,0 +1,71 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef IO_FILE_H_ +#define IO_FILE_H_ + +#include +#include +#include + +#include "io.h" + +namespace qsim { + +/** + * Controller for output logs with methods for writing to file. 
+ */ +struct IOFile : public IO { + static std::ifstream StreamFromFile(const std::string& file) { + std::ifstream fs; + fs.open(file); + if (!fs) { + errorf("cannot open %s for reading.\n", file.c_str()); + } + return fs; + } + + static void CloseStream(std::ifstream& fs) { + fs.close(); + } + + static bool WriteToFile( + const std::string& file, const std::string& content) { + return WriteToFile(file, content.data(), content.size()); + } + + static bool WriteToFile( + const std::string& file, const void* data, uint64_t size) { + auto fs = std::fstream(file, std::ios::out | std::ios::binary); + + if (!fs) { + errorf("cannot open %s for writing.\n", file.c_str()); + return false; + } else { + fs.write((const char*) data, size); + if (!fs) { + errorf("cannot write to %s.\n", file.c_str()); + return false; + } + + fs.close(); + } + + return true; + } +}; + +} // namespace qsim + +#endif // IO_FILE_H_ diff --git a/qsim/matrix.h b/qsim/matrix.h new file mode 100644 index 0000000..a3c2640 --- /dev/null +++ b/qsim/matrix.h @@ -0,0 +1,296 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MATRIX_H_ +#define MATRIX_H_ + +#include +#include +#include + +#include "bits.h" + +namespace qsim { + +/** + * Gate matrix type. Matrices are stored as vectors. 
The matrix elements are + * accessed as real(m[i][j]) <- vector[2 * (n * i + j)] and + * imag(m[i][j]) <- vector[2 * (n * i + j) + 1], where n is the number of rows + * or columns (n = 2^q, where q is the number of gate qubits). + */ +template +using Matrix = std::vector; + +/** + * Sets all matrix elements to zero. + * @param m Matrix to be cleared. + */ +template +inline void MatrixClear(Matrix& m) { + for (unsigned i = 0; i < m.size(); ++i) { + m[i] = 0; + } +} + +/** + * Sets an identity matrix. + * @param n Number of matrix rows (columns). + * @param m Output identity matrix; resized to hold 2 * n * n values. + */ +template +inline void MatrixIdentity(unsigned n, Matrix& m) { + m.resize(2 * n * n); + + MatrixClear(m); + + for (unsigned i = 0; i < n; ++i) { + m[2 * (n * i + i)] = 1; + } +} + +/** + * Multiplies two gate matrices of equal size: m2 = m1 m2. + * @param q Number of gate qubits. The number of matrix rows (columns) is 2^q. + * @param m1 Matrix m1. + * @param m2 Input matrix m2. Output product of matrices m2 = m1 m2. + */ +template +inline void MatrixMultiply( + unsigned q, const Matrix& m1, Matrix& m2) { + Matrix mt = m2; + unsigned n = unsigned{1} << q; + + for (unsigned i = 0; i < n; ++i) { + for (unsigned j = 0; j < n; ++j) { + fp_type2 re = 0; + fp_type2 im = 0; + + for (unsigned k = 0; k < n; ++k) { + fp_type2 r1 = m1[2 * (n * i + k)]; + fp_type2 i1 = m1[2 * (n * i + k) + 1]; + fp_type2 r2 = mt[2 * (n * k + j)]; + fp_type2 i2 = mt[2 * (n * k + j) + 1]; + + re += r1 * r2 - i1 * i2; + im += r1 * i2 + i1 * r2; + } + + m2[2 * (n * i + j)] = re; + m2[2 * (n * i + j) + 1] = im; + } + } +} + +/** + * Multiplies two gate matrices of equal size: m2 = m1^\dagger m2. + * @param q Number of gate qubits. The number of matrix rows (columns) is 2^q. + * @param m1 Matrix m1. + * @param m2 Input matrix m2. Output product of matrices m2 = m1^\dagger m2.
+ */ +template +inline void MatrixDaggerMultiply( + unsigned q, const Matrix& m1, Matrix& m2) { + Matrix mt = m2; + unsigned n = unsigned{1} << q; + + for (unsigned i = 0; i < n; ++i) { + for (unsigned j = 0; j < n; ++j) { + fp_type2 re = 0; + fp_type2 im = 0; + + for (unsigned k = 0; k < n; ++k) { + fp_type2 r1 = m1[2 * (n * k + i)]; + fp_type2 i1 = m1[2 * (n * k + i) + 1]; + fp_type2 r2 = mt[2 * (n * k + j)]; + fp_type2 i2 = mt[2 * (n * k + j) + 1]; + + re += r1 * r2 + i1 * i2; + im += r1 * i2 - i1 * r2; + } + + m2[2 * (n * i + j)] = re; + m2[2 * (n * i + j) + 1] = im; + } + } +} + +/** + * Multiplies two gate matrices: m2 = m1 m2. The size of m1 should not exceed + * the size of m2. + * @param mask1 Qubit mask that specifies the subset of qubits m1 acts on. + * @param q1 Number of qubits of m1. The number of matrix rows (columns) is 2^q1. + * @param m1 Matrix m1. + * @param q2 Number of qubits of m2. The number of matrix rows (columns) is 2^q2. + * @param m2 Input matrix m2. Output product of matrices m2 = m1 m2. + */ +template +inline void MatrixMultiply(unsigned mask1, + unsigned q1, const Matrix& m1, + unsigned q2, Matrix& m2) { + if (q1 == q2) { + MatrixMultiply(q1, m1, m2); + } else { + Matrix mt = m2; + unsigned n1 = unsigned{1} << q1; + unsigned n2 = unsigned{1} << q2; + + for (unsigned i = 0; i < n2; ++i) { + unsigned si = bits::CompressBits(i, q2, mask1); + + for (unsigned j = 0; j < n2; ++j) { + fp_type2 re = 0; + fp_type2 im = 0; + + for (unsigned k = 0; k < n1; ++k) { + unsigned ek = bits::ExpandBits(k, q2, mask1) + (i & ~mask1); + + fp_type2 r1 = m1[2 * (n1 * si + k)]; + fp_type2 i1 = m1[2 * (n1 * si + k) + 1]; + fp_type2 r2 = mt[2 * (n2 * ek + j)]; + fp_type2 i2 = mt[2 * (n2 * ek + j) + 1]; + + re += r1 * r2 - i1 * i2; + im += r1 * i2 + i1 * r2; + } + + m2[2 * (n2 * i + j)] = re; + m2[2 * (n2 * i + j) + 1] = im; + } + } + } +} + +/** + * Multiplies a matrix by a real scalar value. + * @param c Scalar value. + * @param m Input matrix to be multiplied. Output matrix.
+ */ +template +inline void MatrixScalarMultiply(fp_type1 c, Matrix& m) { + for (unsigned i = 0; i < m.size(); ++i) { + m[i] *= c; + } +} + +/** + * Multiplies a matrix by a complex scalar value. + * @param re Real part of scalar value. + * @param im Imaginary part of scalar value. + * @param m Input matrix to be multiplied. Output matrix. + */ +template +inline void MatrixScalarMultiply( + fp_type1 re, fp_type1 im, Matrix& m) { + for (unsigned i = 0; i < m.size() / 2; ++i) { + fp_type2 re0 = m[2 * i + 0]; + fp_type2 im0 = m[2 * i + 1]; + m[2 * i + 0] = re * re0 - im * im0; + m[2 * i + 1] = re * im0 + im * re0; + } +} + +/** + * Replaces a matrix by its conjugate transpose (dagger) in place. + * @param n Number of matrix rows (columns). + * @param m Input matrix. Output matrix. + */ +template +inline void MatrixDagger(unsigned n, Matrix& m) { + for (unsigned i = 0; i < n; ++i) { + m[2 * (n * i + i) + 1] = -m[2 * (n * i + i) + 1]; + + for (unsigned j = i + 1; j < n; ++j) { + std::swap(m[2 * (n * i + j)], m[2 * (n * j + i)]); + fp_type t = m[2 * (n * i + j) + 1]; + m[2 * (n * i + j) + 1] = -m[2 * (n * j + i) + 1]; + m[2 * (n * j + i) + 1] = -t; + } + } +} + +/** + * Gets a permutation to rearrange qubits from "normal" order to "gate" + * order. Qubits are ordered in increasing order for "normal" order. + * Qubits are ordered arbitrarily for "gate" order. Returns an empty vector + * if the qubits are in "normal" order. + * @param qubits Qubit indices in "gate" order. + * @return Permutation as a vector.
+ */ +inline std::vector NormalToGateOrderPermutation( + const std::vector& qubits) { + std::vector perm; + + bool normal_order = true; + + for (std::size_t i = 1; i < qubits.size(); ++i) { + if (qubits[i] < qubits[i - 1]) { + normal_order = false; + break; + } + } + + if (!normal_order) { + struct QI { + unsigned q; + unsigned index; + }; + + std::vector qis; + qis.reserve(qubits.size()); + + for (std::size_t i = 0; i < qubits.size(); ++i) { + qis.push_back({qubits[i], unsigned(i)}); + } + + std::sort(qis.begin(), qis.end(), [](const QI& l, const QI& r) { + return l.q < r.q; + }); + + perm.reserve(qubits.size()); + + for (std::size_t i = 0; i < qubits.size(); ++i) { + perm.push_back(qis[i].index); + } + } + + return perm; +} + +/** + * Shuffles the gate matrix elements to get the matrix that acts on qubits + * that are in "normal" order (in increasing order). + * @param perm Permutation to rearrange qubits from "normal" order to "gate" order. + * @param q Number of gate qubits. The number of matrix rows (columns) is 2^q. + * @param m Input matrix. Output shuffled matrix. + */ +template +inline void MatrixShuffle(const std::vector& perm, + unsigned q, Matrix& m) { + Matrix mt = m; + unsigned n = unsigned{1} << q; + + for (unsigned i = 0; i < n; ++i) { + unsigned pi = bits::PermuteBits(i, q, perm); + for (unsigned j = 0; j < n; ++j) { + unsigned pj = bits::PermuteBits(j, q, perm); + + m[2 * (n * i + j)] = mt[2 * (n * pi + pj)]; + m[2 * (n * i + j) + 1] = mt[2 * (n * pi + pj) + 1]; + } + } +} + +} // namespace qsim + +#endif // MATRIX_H_ diff --git a/qsim/mps_simulator.h b/qsim/mps_simulator.h new file mode 100644 index 0000000..8fbcbae --- /dev/null +++ b/qsim/mps_simulator.h @@ -0,0 +1,246 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MPS_SIMULATOR_H_ +#define MPS_SIMULATOR_H_ + +// For templates will take care of parallelization. +#define EIGEN_DONT_PARALLELIZE 1 + +#include +#include +#include +#include +#include + +#include "../eigen/Eigen/Dense" +#include "../eigen/Eigen/SVD" +#include "mps_statespace.h" + +namespace qsim { + +namespace mps { + +/** + * Truncated Matrix Product State (MPS) circuit simulator w/ vectorization. + */ +template +class MPSSimulator final { + public: + using MPSStateSpace_ = MPSStateSpace; + using State = typename MPSStateSpace_::MPS; + using fp_type = typename MPSStateSpace_::fp_type; + + using Complex = std::complex; + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using MatrixMap = Eigen::Map; + + using OneQubitMatrix = Eigen::Matrix; + using ConstOneQubitMap = Eigen::Map; + + // Note: ForArgs are currently unused. + template + explicit MPSSimulator(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using non-vectorized instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, const fp_type* matrix, + State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 1: + ApplyGate1(qs, matrix, state); + break; + case 2: + ApplyGate2(qs, matrix, state); + break; + // case 3: + // ApplyGate3(qs, matrix, state); + // break; + // case 4: + // ApplyGate4(qs, matrix, state); + // break; + // case 5: + // ApplyGate5(qs, matrix, state); + // break; + // case 6: + // ApplyGate6(qs, matrix, state); + // break; + default: + // Not implemented: gates on other numbers of qubits are ignored. + break; + } + } + + /** + * Applies a controlled gate. Not implemented yet: currently a no-op. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cmask Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cmask, + const fp_type* matrix, State& state) const { + // TODO: implement; the state is currently left unchanged. + } + + /** + * Computes the expectation value of an operator using eigen3 operations + * w/ vectorized instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The expectation value. Not implemented yet: a placeholder value of (-10, -10) is returned. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // TODO.
+ return std::complex(-10., -10.); + } + + private: + void ApplyGate1(const std::vector& qs, const fp_type* matrix, + State& state) const { + if (qs[0] == state.num_qubits() - 1) { + Apply1Right(qs, matrix, state); + } else { + Apply1LeftOrInterior(qs, matrix, state); + } + } + + void Apply1LeftOrInterior(const std::vector& qs, + const fp_type* matrix, State& state) const { + fp_type* raw_state = state.get(); + const auto bond_dim = state.bond_dim(); + const auto l_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); + const auto r_offset = MPSStateSpace_::GetBlockOffset(state, qs[0] + 1); + const auto end = MPSStateSpace_::Size(state); + ConstOneQubitMap gate_matrix((Complex*) matrix); + MatrixMap scratch_block((Complex*)(raw_state + end), 2, bond_dim); + + for (unsigned block_sep = l_offset; block_sep < r_offset; + block_sep += 4 * bond_dim) { + fp_type* cur_block = raw_state + block_sep; + ConstMatrixMap mps_block((Complex*) cur_block, 2, bond_dim); + scratch_block.noalias() = gate_matrix * mps_block; + memcpy(cur_block, raw_state + end, sizeof(fp_type) * bond_dim * 4); + } + } + + void Apply1Right(const std::vector& qs, const fp_type* matrix, + State& state) const { + fp_type* raw_state = state.get(); + const auto bond_dim = state.bond_dim(); + const auto offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); + const auto end = MPSStateSpace_::Size(state); + ConstOneQubitMap gate_matrix((Complex*) matrix); + ConstMatrixMap mps_block((Complex*)(raw_state + offset), bond_dim, 2); + MatrixMap scratch_block((Complex*)(raw_state + end), bond_dim, 2); + scratch_block.noalias() = mps_block * gate_matrix.transpose(); + memcpy(raw_state + offset, raw_state + end, sizeof(fp_type) * bond_dim * 4); + } + + void ApplyGate2(const std::vector& qs, const fp_type* matrix, + State& state) const { + // TODO: micro-benchmark this function and improve performance. 
+ const auto bond_dim = state.bond_dim(); + const auto num_qubits = state.num_qubits(); + fp_type* raw_state = state.get(); + + const auto i_dim = (qs[0] == 0) ? 1 : bond_dim; + const auto j_dim = 2; + const auto k_dim = bond_dim; + const auto l_dim = 2; + const auto m_dim = (qs[1] == num_qubits - 1) ? 1 : bond_dim; + + const auto b_0_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); + const auto b_1_offset = MPSStateSpace_::GetBlockOffset(state, qs[1]); + const auto end = MPSStateSpace_::Size(state); + + MatrixMap block_0((Complex*)(raw_state + b_0_offset), i_dim * j_dim, k_dim); + MatrixMap block_1((Complex*)(raw_state + b_1_offset), k_dim, l_dim * m_dim); + + // Merge both blocks into scratch space. + MatrixMap scratch_c((Complex*)(raw_state + end), i_dim * j_dim, l_dim * m_dim); + scratch_c.noalias() = block_0 * block_1; + + // Transpose inner dims in-place. + MatrixMap scratch_c_t((Complex*)(raw_state + end), i_dim * j_dim * l_dim, m_dim); + for (unsigned i = 0; i < i_dim * j_dim * l_dim; i += 4) { + scratch_c_t.row(i + 1).swap(scratch_c_t.row(i + 2)); + } + + // Transpose gate matrix and place in 3rd (last) scratch block. + const auto scratch3_offset = end + 8 * bond_dim * bond_dim; + ConstMatrixMap gate_matrix((Complex*) matrix, 4, 4); + MatrixMap gate_matrix_transpose((Complex*)(raw_state + scratch3_offset), 4, 4); + gate_matrix_transpose = gate_matrix.transpose(); + gate_matrix_transpose.col(1).swap(gate_matrix_transpose.col(2)); + + // Contract gate and merged block tensors, placing result in B0B1. + for (unsigned i = 0; i < i_dim; ++i) { + fp_type* src_block = raw_state + end + i * 8 * m_dim; + fp_type* dest_block = raw_state + b_0_offset + i * 8 * m_dim; + MatrixMap block_b0b1((Complex*) dest_block, 4, m_dim); + ConstMatrixMap scratch_c_i((Complex*) src_block, 4, m_dim); + // [i, np, m] = [np, lj] * [i, lj, m] + block_b0b1.noalias() = gate_matrix_transpose * scratch_c_i; + } + + // SVD B0B1. 
+ MatrixMap full_b0b1((Complex*)(raw_state + b_0_offset), 2 * i_dim, 2 * m_dim); + Eigen::BDCSVD svd(full_b0b1, Eigen::ComputeThinU | Eigen::ComputeThinV); + const auto p = std::min(2 * i_dim, 2 * m_dim); + + // Place U in scratch to truncate and then B0. + MatrixMap svd_u((Complex*)(raw_state + end), 2 * i_dim, p); + svd_u.noalias() = svd.matrixU(); + block_0.fill(Complex(0, 0)); + const auto keep_cols = (svd_u.cols() > bond_dim) ? bond_dim : svd_u.cols(); + block_0.block(0, 0, svd_u.rows(), keep_cols).noalias() = + svd_u(Eigen::indexing::all, Eigen::seq(0, keep_cols - 1)); + + // Place row product of S V into scratch to truncate and then B1. + MatrixMap svd_v((Complex*)(raw_state + end), p, 2 * m_dim); + MatrixMap s_vector((Complex*)(raw_state + end + 8 * bond_dim * bond_dim), p, 1); + svd_v.noalias() = svd.matrixV().adjoint(); + s_vector.noalias() = svd.singularValues(); + block_1.fill(Complex(0, 0)); + const auto keep_rows = (svd_v.rows() > bond_dim) ? bond_dim : svd_v.rows(); + const auto row_seq = Eigen::seq(0, keep_rows - 1); + for (unsigned i = 0; i < keep_rows; ++i) { + svd_v.row(i) *= s_vector(i); + } + block_1.block(0, 0, keep_rows, svd_v.cols()).noalias() = + svd_v(row_seq, Eigen::indexing::all); + } + + For for_; +}; + +} // namespace mps +} // namespace qsim + +#endif // MPS_SIMULATOR_H_ diff --git a/qsim/mps_statespace.h b/qsim/mps_statespace.h new file mode 100644 index 0000000..9b3acf3 --- /dev/null +++ b/qsim/mps_statespace.h @@ -0,0 +1,597 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MPS_STATESPACE_H_ +#define MPS_STATESPACE_H_ + +// For templates will take care of parallelization. +#define EIGEN_DONT_PARALLELIZE 1 + +#ifdef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include + +#include "../eigen/Eigen/Dense" +#include "../eigen/unsupported/Eigen/CXX11/Tensor" + +namespace qsim { + +namespace mps { + +namespace detail { + +inline void do_not_free(void*) {} + +inline void free(void* ptr) { +#ifdef _WIN32 + _aligned_free(ptr); +#else + ::free(ptr); +#endif +} + +} // namespace detail + +/** + * Class containing context and routines for fixed bond dimension + * truncated Matrix Product State (MPS) simulation. + */ +template +class MPSStateSpace { + private: + public: + using fp_type = FP; + using Pointer = std::unique_ptr; + + using Complex = std::complex; + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using MatrixMap = Eigen::Map; + + // Store MPS tensors with the following shape: + // [2, bond_dim], [bond_dim, 2, bond_dim], ... , [bond_dim, 2]. + class MPS { + public: + MPS() = delete; + + MPS(Pointer&& ptr, unsigned num_qubits, unsigned bond_dim) + : ptr_(std::move(ptr)), num_qubits_(num_qubits), bond_dim_(bond_dim) {} + + fp_type* get() { return ptr_.get(); } + + const fp_type* get() const { return ptr_.get(); } + + fp_type* release() { + num_qubits_ = 0; + return ptr_.release(); + } + + unsigned num_qubits() const { return num_qubits_; } + + unsigned bond_dim() const { return bond_dim_; } + + private: + Pointer ptr_; + unsigned num_qubits_; + unsigned bond_dim_; + }; + + // Note: ForArgs are currently unused. + template + MPSStateSpace(ForArgs&&... args) : for_(args...) {} + + // Requires num_qubits >= 2 and bond_dim >= 2. 
+ static MPS Create(unsigned num_qubits, unsigned bond_dim) { + // Byte counts lead with sizeof(fp_type) so that every product is + // evaluated in size_t. Multiplying the unsigned operands first could + // wrap around for large bond dimensions and under-allocate the buffer. + // The two end tensors take 2 * 4 * bond_dim values. The (num_qubits + 1) + // blocks of 4 * bond_dim * bond_dim values cover the num_qubits - 2 + // interior tensors plus three extra "internal style" blocks past the end + // of the working allocation, used as scratch space for gate application. + const auto end_bytes = sizeof(fp_type) * 2 * 4 * bond_dim; + const auto internal_bytes = + sizeof(fp_type) * 4 * bond_dim * bond_dim * (num_qubits + 1); + const auto size = end_bytes + internal_bytes; + +#ifdef _WIN32 + Pointer ptr{(fp_type*)_aligned_malloc(size, 64), &detail::free}; + // On allocation failure return an MPS with zero qubits and bond dimension. + const bool is_valid = ptr.get() != nullptr; + return MPS{std::move(ptr), is_valid ? num_qubits : 0, + is_valid ? bond_dim : 0}; +#else + void* p = nullptr; + if (posix_memalign(&p, 64, size) == 0) { + return MPS{Pointer{(fp_type*)p, &detail::free}, num_qubits, bond_dim}; + } else { + // On allocation failure return an MPS with zero qubits and bond dimension. + return MPS{Pointer{nullptr, &detail::free}, 0, 0}; + } +#endif + } + + // Number of fp_type values in the working (non-scratch) part of the state: + // two end tensors plus num_qubits - 2 interior tensors. The subtraction is + // unsigned, so it wraps for num_qubits < 2. + static unsigned Size(const MPS& state) { + auto end_sizes = 2 * 4 * state.bond_dim(); + auto internal_sizes = 4 * state.bond_dim() * state.bond_dim(); + return end_sizes + internal_sizes * (state.num_qubits() - 2); + } + + // Size of the working part of the state in bytes. + static unsigned RawSize(const MPS& state) { + return sizeof(fp_type) * Size(state); + } + + // Get the pointer offset to the beginning of an MPS block. + // The offset counts fp_type values. Block 0 starts at 0, block 1 after the + // 4 * bond_dim values of the first tensor, and each later block another + // 4 * bond_dim * bond_dim values further on. + static unsigned GetBlockOffset(const MPS& state, unsigned i) { + if (i == 0) { + return 0; + } + return 4 * state.bond_dim() * (1 + state.bond_dim() * (i - 1)); + } + + // Copies the state contents of one MPS to another. + // Ignores scratch data. + // Returns false, without copying, if the qubit counts or bond dimensions + // of src and dest differ; returns true otherwise. + static bool Copy(const MPS& src, MPS& dest) { + if ((src.num_qubits() != dest.num_qubits()) || + src.bond_dim() != dest.bond_dim()) { + return false; + } + auto size = RawSize(src); + memcpy(dest.get(), src.get(), size); + return true; + } + + // Set the MPS to the |0> state. 
+ static void SetStateZero(MPS& state) { + auto size = Size(state); + memset(state.get(), 0, sizeof(fp_type) * size); + auto block_size = 4 * state.bond_dim() * state.bond_dim(); + state.get()[0] = 1.0; + for (unsigned i = 4 * state.bond_dim(); i < size; i += block_size) { + state.get()[i] = 1.0; + } + } + + // Computes Re{} for two equal sized MPS. + // Requires: state1.bond_dim() == state2.bond_dim() && + // state1.num_qubits() == state2.num_qubits() + static fp_type RealInnerProduct(MPS& state1, MPS& state2) { + return InnerProduct(state1, state2).real(); + } + + // Computes for two equal sized MPS. + // Requires: state1.bond_dim() == state2.bond_dim() && + // state1.num_qubits() == state2.num_qubits() + static std::complex InnerProduct(MPS& state1, MPS& state2) { + const auto num_qubits = state1.num_qubits(); + const auto bond_dim = state1.bond_dim(); + const auto end = Size(state1); + auto offset = 0; + fp_type* state1_raw = state1.get(); + fp_type* state2_raw = state2.get(); + + // Contract leftmost blocks together, store result in state1 scratch. + ConstMatrixMap top((Complex*)state2_raw, 2, bond_dim); + ConstMatrixMap bot((Complex*)state1_raw, 2, bond_dim); + MatrixMap partial_contract((Complex*)(state1_raw + end), bond_dim, + bond_dim); + MatrixMap partial_contract2( + (Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), bond_dim, + 2 * bond_dim); + partial_contract.noalias() = top.adjoint() * bot; + + // Contract all internal blocks together. + for (unsigned i = 1; i < num_qubits - 1; ++i) { + offset = GetBlockOffset(state1, i); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), + bond_dim, 2 * bond_dim); + + // Merge bot into left boundary merged tensor. 
+ new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, + 2 * bond_dim); + partial_contract2.noalias() = partial_contract * bot; + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), + 2 * bond_dim, bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract.noalias() = top.adjoint() * partial_contract2; + } + + // Contract rightmost bottom block. + offset = GetBlockOffset(state1, num_qubits - 1); + new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, 2); + new (&partial_contract2) MatrixMap( + (Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), bond_dim, 2); + partial_contract2.noalias() = partial_contract * bot; + + // Contract rightmost top block. + new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, 1); + new (&partial_contract) MatrixMap((Complex*)(state1_raw + end), 1, 1); + new (&partial_contract2) + MatrixMap((Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), + 2 * bond_dim, 1); + partial_contract.noalias() = top.adjoint() * partial_contract2; + + return partial_contract(0, 0); + } + + // Compute the 2x2 1-RDM of state on index. Result written to rdm. + // Requires: scratch and rdm to be allocated. + static void ReduceDensityMatrix(MPS& state, MPS& scratch, int index, + fp_type* rdm) { + const auto num_qubits = state.num_qubits(); + const auto bond_dim = state.bond_dim(); + const auto end = Size(state); + const bool last_index = (index == num_qubits - 1); + const auto right_dim = (last_index ? 
1 : bond_dim); + auto offset = 0; + fp_type* state_raw = state.get(); + fp_type* scratch_raw = scratch.get(); + fp_type* state_raw_workspace = state_raw + end + 2 * bond_dim * bond_dim; + fp_type* scratch_raw_workspace = + scratch_raw + end + 2 * bond_dim * bond_dim; + + Copy(state, scratch); + + // Contract leftmost blocks together, store result in state scratch. + ConstMatrixMap top((Complex*)scratch_raw, 2, bond_dim); + ConstMatrixMap bot((Complex*)state_raw, 2, bond_dim); + MatrixMap partial_contract((Complex*)(state_raw + end), bond_dim, bond_dim); + MatrixMap partial_contract2((Complex*)(state_raw_workspace), bond_dim, + 2 * bond_dim); + + partial_contract.setZero(); + partial_contract(0, 0) = 1; + if (index > 0) { + partial_contract.noalias() = top.adjoint() * bot; + } + + // Contract all internal blocks together. + for (unsigned i = 1; i < index; ++i) { + offset = GetBlockOffset(state, i); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw_workspace), bond_dim, 2 * bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, + 2 * bond_dim); + partial_contract2.noalias() = partial_contract * bot; + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw_workspace), 2 * bond_dim, bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract.noalias() = top.adjoint() * partial_contract2; + } + + // The [bond_dim, bond_dim] block in state_raw now contains the contraction + // up to, but not including index. + // Contract rightmost blocks. 
+ offset = GetBlockOffset(state, num_qubits - 1); + new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, 2); + new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2); + new (&partial_contract) + MatrixMap((Complex*)(scratch_raw + end), bond_dim, bond_dim); + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim); + + partial_contract.setZero(); + partial_contract(0, 0) = 1; + if (index < num_qubits - 1) { + partial_contract.noalias() = top * bot.adjoint(); + } + + for (unsigned i = num_qubits - 2; i > index; --i) { + offset = GetBlockOffset(state, i); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), 2 * bond_dim, bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract2.noalias() = bot * partial_contract.adjoint(); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, + 2 * bond_dim); + // [bd, bd] = [bd, 2bd] @ [bd, 2bd] + partial_contract.noalias() = top * partial_contract2.adjoint(); + } + + // The [bond_dim, bond_dim] block in scratch_raw now contains the + // contraction down from the end, but not including the index. Begin final + // contraction steps. + + // Get leftmost [bd, bd] contraction and contract with top. + + offset = GetBlockOffset(state, index); + new (&partial_contract) + MatrixMap((Complex*)(state_raw + end), bond_dim, bond_dim); + new (&top) + ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2 * right_dim); + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * right_dim); + partial_contract2.noalias() = partial_contract * top.conjugate(); + // copy the bottom contraction scratch_raw to state_raw to save space. 
+ memcpy(state_raw + end, scratch_raw + end, + bond_dim * bond_dim * 2 * sizeof(fp_type)); + + // Contract top again for correct shape. + fp_type* contract3_target = (last_index ? rdm : scratch_raw); + MatrixMap partial_contract3((Complex*)contract3_target, 2 * right_dim, + 2 * right_dim); + partial_contract3.noalias() = top.transpose() * partial_contract2; + + // If we are contracting the last index, all the needed transforms are done. + if (last_index) { + return; + } + + // Conduct final tensor contraction operations. Cannot be easily compiled to + // matmul. + const Eigen::TensorMap> + t_4d((Complex*)scratch_raw, 2, bond_dim, 2, bond_dim); + const Eigen::TensorMap> + t_2d((Complex*)(state_raw + end), bond_dim, bond_dim); + + const Eigen::array, 2> product_dims = { + Eigen::IndexPair(1, 0), + Eigen::IndexPair(3, 1), + }; + Eigen::TensorMap> out( + (Complex*)rdm, 2, 2); + out = t_4d.contract(t_2d, product_dims); + } + + // Draw a single bitstring sample from state using scratch and scratch2 + // as working space. + // All randomness comes from *random_gen. One bit per qubit is appended to + // *sample with push_back, so bits already present in *sample are kept. + // state is copied into both scratch and scratch2 before sampling starts. + static void SampleOnce(MPS& state, MPS& scratch, MPS& scratch2, + std::mt19937* random_gen, std::vector* sample) { + // TODO: carefully profile with perf and optimize temp storage + // locations for cache friendliness. + const auto bond_dim = state.bond_dim(); + const auto num_qubits = state.num_qubits(); + const auto end = Size(state); + const auto left_frontier_offset = GetBlockOffset(state, num_qubits + 1); + fp_type* state_raw = state.get(); + fp_type* scratch_raw = scratch.get(); + fp_type* scratch2_raw = scratch2.get(); + fp_type rdm[8]; + + sample->reserve(num_qubits); + Copy(state, scratch); + Copy(state, scratch2); + + // Store prefix contractions in scratch2. 
+ auto offset = GetBlockOffset(state, num_qubits - 1); + ConstMatrixMap top((Complex*)(state_raw + offset), bond_dim, 2); + ConstMatrixMap bot((Complex*)(scratch_raw + offset), bond_dim, 2); + MatrixMap partial_contract((Complex*)(scratch2_raw + offset), bond_dim, + bond_dim); + MatrixMap partial_contract2((Complex*)(scratch_raw + end), bond_dim, + 2 * bond_dim); + partial_contract.noalias() = top * bot.adjoint(); + + for (unsigned i = num_qubits - 2; i > 0; --i) { + offset = GetBlockOffset(state, i); + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw + end), 2 * bond_dim, bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract2.noalias() = bot * partial_contract.adjoint(); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw + end), bond_dim, 2 * bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, + 2 * bond_dim); + + // merge into partial_contract -> scracth2_raw. + new (&partial_contract) + MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); + partial_contract.noalias() = top * partial_contract2.adjoint(); + } + + // Compute RDM-0 and draw first sample. 
+ offset = GetBlockOffset(state, 1); + new (&top) ConstMatrixMap((Complex*)state_raw, 2, bond_dim); + new (&bot) ConstMatrixMap((Complex*)scratch_raw, 2, bond_dim); + new (&partial_contract) + MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw + end), 2, bond_dim); + + partial_contract2.noalias() = bot * partial_contract.adjoint(); + + new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2); + partial_contract.noalias() = top * partial_contract2.adjoint(); + auto p0 = rdm[0] / (rdm[0] + rdm[6]); + std::bernoulli_distribution distribution(1 - p0); + auto bit_val = distribution(*random_gen); + sample->push_back(bit_val); + + // collapse state. + new (&partial_contract) MatrixMap((Complex*)scratch_raw, 2, bond_dim); + partial_contract.row(!bit_val).setZero(); + + // Prepare left contraction frontier. + new (&partial_contract2) MatrixMap( + (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); + partial_contract2.noalias() = + partial_contract.transpose() * partial_contract.conjugate(); + + // Compute RDM-i and draw internal tensor samples. + for (unsigned i = 1; i < num_qubits - 1; i++) { + // Get leftmost [bd, bd] contraction and contract with top. + offset = GetBlockOffset(state, i); + new (&partial_contract) MatrixMap( + (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); + new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, + 2 * bond_dim); + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim); + partial_contract2.noalias() = partial_contract * top.conjugate(); + + // Contract top again for correct shape. + MatrixMap partial_contract3((Complex*)(scratch_raw + end), 2 * bond_dim, + 2 * bond_dim); + partial_contract3.noalias() = top.transpose() * partial_contract2; + + // Conduct final tensor contraction operations. Cannot be easily compiled + // to matmul. 
Perf reports shows only ~6% of runtime spent here on large + // systems. + offset = GetBlockOffset(state, i + 1); + const Eigen::TensorMap> + t_4d((Complex*)(scratch_raw + end), 2, bond_dim, 2, bond_dim); + const Eigen::TensorMap> + t_2d((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); + + const Eigen::array, 2> product_dims = { + Eigen::IndexPair(1, 0), + Eigen::IndexPair(3, 1), + }; + Eigen::TensorMap> out( + (Complex*)rdm, 2, 2); + out = t_4d.contract(t_2d, product_dims); + + // Sample bit and collapse state. + p0 = rdm[0] / (rdm[0] + rdm[6]); + distribution = std::bernoulli_distribution(1 - p0); + bit_val = distribution(*random_gen); + + sample->push_back(bit_val); + offset = GetBlockOffset(state, i); + new (&partial_contract) + MatrixMap((Complex*)(scratch_raw + offset), bond_dim * 2, bond_dim); + for (unsigned j = !bit_val; j < 2 * bond_dim; j += 2) { + partial_contract.row(j).setZero(); + } + + // Update left frontier. + new (&partial_contract) MatrixMap( + (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, + 2 * bond_dim); + partial_contract2.noalias() = partial_contract * bot.conjugate(); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), 2 * bond_dim, bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract.noalias() = top.transpose() * partial_contract2; + } + + // Compute RDM-(n-1) and sample. 
+ offset = GetBlockOffset(state, num_qubits - 1); + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), bond_dim, 2); + + new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2); + partial_contract2.noalias() = partial_contract * top.conjugate(); + new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2); + partial_contract.noalias() = top.transpose() * partial_contract2; + + p0 = rdm[0] / (rdm[0] + rdm[6]); + distribution = std::bernoulli_distribution(1 - p0); + bit_val = distribution(*random_gen); + sample->push_back(bit_val); + } + + // Draw num_samples bitstring samples from state and store the result + // bit vectors in results. Uses scratch and scratch2 as workspace. + // A single std::mt19937 seeded with seed drives all draws. results is + // grown to at least num_samples entries and never shrunk; sample i is + // written through (*results)[i] by SampleOnce. + static void Sample(MPS& state, MPS& scratch, MPS& scratch2, + unsigned num_samples, unsigned seed, + std::vector>* results) { + std::mt19937 rand_source(seed); + // reserve() only allocates capacity and creates no elements, so indexing + // (*results)[i] on a shorter vector would be out of bounds. Make sure + // the elements exist, without discarding entries the caller pre-sized. + if (results->size() < num_samples) { + results->resize(num_samples); + } + for (unsigned i = 0; i < num_samples; i++) { + SampleOnce(state, scratch, scratch2, &rand_source, &(*results)[i]); + } + } + + // Testing only. Convert the MPS to a wavefunction under "normal" ordering. + // Requires: wf be allocated beforehand with bond_dim * 2 ^ num_qubits -1 + // memory. + static void ToWaveFunction(MPS& state, fp_type* wf) { + const auto bond_dim = state.bond_dim(); + const auto num_qubits = state.num_qubits(); + fp_type* raw_state = state.get(); + + ConstMatrixMap accum = ConstMatrixMap((Complex*)(raw_state), 2, bond_dim); + ConstMatrixMap next_block = ConstMatrixMap(nullptr, 0, 0); + MatrixMap result2 = MatrixMap(nullptr, 0, 0); + auto offset = 0; + auto result2_size = 2; + + for (unsigned i = 1; i < num_qubits - 1; i++) { + offset = GetBlockOffset(state, i); + // use of new does not trigger any expensive operations. + new (&next_block) ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, + 2 * bond_dim); + new (&result2) MatrixMap((Complex*)(wf), result2_size, 2 * bond_dim); + + // temp variable used since result2 and accum point to same memory. 
+ result2 = accum * next_block; + result2_size *= 2; + new (&accum) ConstMatrixMap((Complex*)(wf), result2_size, bond_dim); + } + offset = GetBlockOffset(state, num_qubits - 1); + new (&next_block) + ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, 2); + new (&result2) MatrixMap((Complex*)(wf), result2_size, 2); + result2 = accum * next_block; + } + + protected: + For for_; +}; + +} // namespace mps +} // namespace qsim + +#endif // MPS_STATESPACE_H_ diff --git a/qsim/parfor.h b/qsim/parfor.h new file mode 100644 index 0000000..8a3a4d6 --- /dev/null +++ b/qsim/parfor.h @@ -0,0 +1,123 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PARFOR_H_ +#define PARFOR_H_ + +#include + +#include +#include +#include + +namespace qsim { + +/** + * Helper struct for executing for-loops in parallel across multiple threads. + */ +template +struct ParallelForT { + explicit ParallelForT(unsigned num_threads) : num_threads(num_threads) {} + + // GetIndex0 and GetIndex1 are useful when we need to know how work was + // divided between threads, for instance, for reusing partial sums obtained + // by RunReduceP. + uint64_t GetIndex0(uint64_t size, unsigned thread_id) const { + return size >= MIN_SIZE ? size * thread_id / num_threads : 0; + } + + uint64_t GetIndex1(uint64_t size, unsigned thread_id) const { + return size >= MIN_SIZE ? 
size * (thread_id + 1) / num_threads : size; + } + + // Calls func(n, m, i, args...) for every i in [0, size). With more than + // one thread and size >= MIN_SIZE the loop runs inside an OpenMP parallel + // region, where n is the team size and m the calling thread's id and each + // thread handles [GetIndex0(size, m), GetIndex1(size, m)). Otherwise the + // loop runs serially with n = 1 and m = 0. + // NOTE(review): the index ranges are derived from the num_threads member, + // not from the team size n. This presumably relies on OpenMP granting the + // full requested team; with a smaller team part of [0, size) would be + // skipped - TODO confirm. + template + void Run(uint64_t size, Function&& func, Args&&... args) const { + if (num_threads > 1 && size >= MIN_SIZE) { + #pragma omp parallel num_threads(num_threads) + { + unsigned n = omp_get_num_threads(); + unsigned m = omp_get_thread_num(); + + uint64_t i0 = GetIndex0(size, m); + uint64_t i1 = GetIndex1(size, m); + + for (uint64_t i = i0; i < i1; ++i) { + func(n, m, i, args...); + } + } + } else { + for (uint64_t i = 0; i < size; ++i) { + func(1, 0, i, args...); + } + } + } + + // Folds func(n, m, i, args...) over i in [0, size) with op, starting each + // fold from 0, and returns the partial results. The parallel path returns + // num_threads entries, entry m being the fold over thread m's index range. + // The serial path returns one entry, or an empty vector when num_threads + // is 0. op is only invoked here, never moved from. + template + std::vector RunReduceP( + uint64_t size, Function&& func, Op&& op, Args&&... args) const { + std::vector partial_results; + + if (num_threads > 1 && size >= MIN_SIZE) { + partial_results.resize(num_threads, 0); + + #pragma omp parallel num_threads(num_threads) + { + unsigned n = omp_get_num_threads(); + unsigned m = omp_get_thread_num(); + + uint64_t i0 = GetIndex0(size, m); + uint64_t i1 = GetIndex1(size, m); + + typename Op::result_type partial_result = 0; + + for (uint64_t i = i0; i < i1; ++i) { + partial_result = op(partial_result, func(n, m, i, args...)); + } + + // Each thread writes only its own slot. + partial_results[m] = partial_result; + } + } else if (num_threads > 0) { + typename Op::result_type result = 0; + for (uint64_t i = 0; i < size; ++i) { + result = op(result, func(1, 0, i, args...)); + } + + partial_results.resize(1, result); + } + + return partial_results; + } + + // Folds func over [0, size) with op and returns the single combined value. + template + typename Op::result_type RunReduce(uint64_t size, Function&& func, + Op&& op, Args&&... 
args) const { + // std::move(op) only casts op to an rvalue for the call below. RunReduceP + // invokes op but does not move from it, so op is still usable when the + // partial results are combined afterwards. + auto partial_results = RunReduceP(size, func, std::move(op), args...); + + typename Op::result_type result = 0; + + // Combine the partial results with the same op, starting from 0. + for (auto partial_result : partial_results) { + result = op(result, partial_result); + } + + return result; + } + + unsigned num_threads; +}; + +using ParallelFor = ParallelForT<1024>; + +} // namespace qsim + +#endif // PARFOR_H_ diff --git a/qsim/qtrajectory.h b/qsim/qtrajectory.h new file mode 100644 index 0000000..1da6692 --- /dev/null +++ b/qsim/qtrajectory.h @@ -0,0 +1,435 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef QTRAJECTORY_H_ +#define QTRAJECTORY_H_ + +#include +#include +#include +#include +#include + +#include "circuit_noisy.h" +#include "gate.h" +#include "gate_appl.h" + +namespace qsim { + +/** + * Quantum trajectory simulator. + */ +template class FuserT, typename Simulator, + typename RGen = std::mt19937> +class QuantumTrajectorySimulator { + public: + using Fuser = FuserT; + using StateSpace = typename Simulator::StateSpace; + using State = typename Simulator::State; + using MeasurementResult = typename StateSpace::MeasurementResult; + + /** + * User-specified parameters for the simulator. + */ + struct Parameter : public Fuser::Parameter { + /** + * If true, collect statistics of sampled Kraus operator indices. + */ + bool collect_kop_stat = false; + /** + * If true, collect statistics of measured bitstrings. 
+ */ + bool collect_mea_stat = false; + /** + * If true, normalize the state vector before performing measurements. + */ + bool normalize_before_mea_gates = true; + /** + * If false, do not apply deferred operators after the main loop for + * the "primary" noise trajectory, that is the trajectory in which + * the primary (the first operators in their respective channels) Kraus + * operators are sampled for each channel and there are no measurements + * in the computational basis. This can be used to speed up simulations + * of circuits with weak noise and without measurements by reusing + * the primary trajectory results. There is an additional condition for + * RunBatch. In this case, the deferred operators after the main loop are + * still applied for the first occurrence of the primary trajectory. + * The primary Kraus operators should have the highest sampling + * probabilities to achieve the highest speedup. + * + * It is the client's responsibility to collect the primary trajectory + * results and to reuse them. + */ + bool apply_last_deferred_ops = true; + }; + + /** + * Struct with statistics populated by the RunBatch and RunOnce methods. + */ + struct Stat { + /** + * Sampled Kraus operator indices and/or measured bitstrings. + */ + std::vector samples; + /** + * True if the "primary" noise trajectory is sampled, false otherwise. + * Defaults to false so that a freshly constructed Stat never holds an + * indeterminate value. + */ + bool primary = false; + }; + + /** + * Runs the given noisy circuit performing repetitions. Each repetition is + * seeded by repetition ID. + * @param param Options for the quantum trajectory simulator. + * @param circuit The noisy circuit to be simulated. + * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions. + * @param state_space StateSpace object required to manipulate state vector. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param measure Function that performs measurements (in the sense of + * computing expectation values, etc). 
This function should have three + * required parameters [repetition ID (uint64_t), final state vector + * (const State&), statistics of sampled Kraus operator indices and/or + * measured bitstrings (const Stat&)] and any number of optional parameters. + * @param args Optional arguments for the 'measure' function. + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool RunBatch(const Parameter& param, + const NoisyCircuit& circuit, + uint64_t r0, uint64_t r1, const StateSpace& state_space, + const Simulator& simulator, MeasurementFunc&& measure, + Args&&... args) { + return RunBatch(param, circuit.num_qubits, circuit.channels.begin(), + circuit.channels.end(), r0, r1, state_space, simulator, + measure, args...); + } + + /** + * Runs the given noisy circuit performing repetitions. Each repetition is + * seeded by repetition ID. + * @param param Options for the quantum trajectory simulator. + * @param num_qubits The number of qubits acted on by the circuit. + * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit. + * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions. + * @param state_space StateSpace object required to manipulate state vector. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param measure Function that performs measurements (in the sense of + * computing expectation values, etc). This function should have three + * required parameters [repetition ID (uint64_t), final state vector + * (const State&), statistics of sampled Kraus operator indices and/or + * measured bitstrings (const Stat&)] and any number of optional parameters. + * @param args Optional arguments for the 'measure' function. + * @return True if the simulation completed successfully; false otherwise. 
+ */ + template + static bool RunBatch(const Parameter& param, unsigned num_qubits, + ncircuit_iterator cbeg, + ncircuit_iterator cend, + uint64_t r0, uint64_t r1, const StateSpace& state_space, + const Simulator& simulator, MeasurementFunc&& measure, + Args&&... args) { + std::vector gates; + gates.reserve(4 * std::size_t(cend - cbeg)); + + State state = state_space.Null(); + + Stat stat; + bool had_primary_realization = false; + + for (uint64_t r = r0; r < r1; ++r) { + if (!state_space.IsNull(state)) { + state_space.SetStateZero(state); + } + + bool apply_last_deferred_ops = + param.apply_last_deferred_ops || !had_primary_realization; + + if (!RunIteration(param, apply_last_deferred_ops, num_qubits, cbeg, cend, + r, state_space, simulator, gates, state, stat)) { + return false; + } + + if (stat.primary && !had_primary_realization) { + had_primary_realization = true; + } + + measure(r, state, stat, args...); + } + + return true; + } + + /** + * Runs the given noisy circuit one time. + * @param param Options for the quantum trajectory simulator. + * @param circuit The noisy circuit to be simulated. + * @param r The repetition ID. The random number generator is seeded by 'r'. + * @param state_space StateSpace object required to manipulate state vector. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param state The state of the system, to be updated by this method. + * @param stat Statistics of sampled Kraus operator indices and/or measured + * bitstrings, to be populated by this method. + * @return True if the simulation completed successfully; false otherwise. 
+ */ + static bool RunOnce(const Parameter& param, + const NoisyCircuit& circuit, uint64_t r, + const StateSpace& state_space, const Simulator& simulator, + State& state, Stat& stat) { + return RunOnce(param, circuit.num_qubits, circuit.channels.begin(), + circuit.channels.end(), r, state_space, simulator, + state, stat); + } + + /** + * Runs the given noisy circuit one time. + * @param param Options for the quantum trajectory simulator. + * @param num_qubits The number of qubits acted on by the circuit. + * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit. + * @param circuit The noisy circuit to be simulated. + * @param r The repetition ID. The random number generator is seeded by 'r'. + * @param state_space StateSpace object required to manipulate state vector. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param state The state of the system, to be updated by this method. + * @param stat Statistics of sampled Kraus operator indices and/or measured + * bitstrings, to be populated by this method. + * @return True if the simulation completed successfully; false otherwise. 
+ */ + static bool RunOnce(const Parameter& param, unsigned num_qubits, + ncircuit_iterator cbeg, + ncircuit_iterator cend, + uint64_t r, const StateSpace& state_space, + const Simulator& simulator, State& state, Stat& stat) { + std::vector gates; + gates.reserve(4 * std::size_t(cend - cbeg)); + + if (!RunIteration(param, param.apply_last_deferred_ops, num_qubits, cbeg, + cend, r, state_space, simulator, gates, state, stat)) { + return false; + } + + return true; + } + + private: + static bool RunIteration(const Parameter& param, + bool apply_last_deferred_ops, unsigned num_qubits, + ncircuit_iterator cbeg, + ncircuit_iterator cend, + uint64_t rep, const StateSpace& state_space, + const Simulator& simulator, + std::vector& gates, + State& state, Stat& stat) { + if (param.collect_kop_stat || param.collect_mea_stat) { + stat.samples.reserve(std::size_t(cend - cbeg)); + stat.samples.resize(0); + } + + if (state_space.IsNull(state)) { + state = CreateState(num_qubits, state_space); + if (state_space.IsNull(state)) { + return false; + } + + state_space.SetStateZero(state); + } + + gates.resize(0); + + RGen rgen(rep); + std::uniform_real_distribution distr(0.0, 1.0); + + bool unitary = true; + stat.primary = true; + + for (auto it = cbeg; it != cend; ++it) { + const auto& channel = *it; + + if (channel.size() == 0) continue; + + if (channel[0].kind == gate::kMeasurement) { + // Measurement channel. + + if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { + return false; + } + + bool normalize = !unitary && param.normalize_before_mea_gates; + NormalizeState(normalize, state_space, unitary, state); + + auto mresult = ApplyMeasurementGate(state_space, channel[0].ops[0], + rgen, state); + + if (!mresult.valid) { + return false; + } + + CollectStat(param.collect_mea_stat, mresult.bits, stat); + + stat.primary = false; + + continue; + } + + // "Normal" channel. 
+ + double r = distr(rgen); + double cp = 0; + + // Perform sampling of Kraus operators using probability bounds. + for (std::size_t i = 0; i < channel.size(); ++i) { + const auto& kop = channel[i]; + + cp += kop.prob; + + if (r < cp) { + DeferOps(kop.ops, gates); + CollectStat(param.collect_kop_stat, i, stat); + + unitary = unitary && kop.unitary; + + break; + } + } + + if (r < cp) continue; + + if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { + return false; + } + + NormalizeState(!unitary, state_space, unitary, state); + + double max_prob = 0; + std::size_t max_prob_index = 0; + + // Perform sampling of Kraus operators using norms of updated states. + for (std::size_t i = 0; i < channel.size(); ++i) { + const auto& kop = channel[i]; + + if (kop.unitary) continue; + + double prob = std::real( + simulator.ExpectationValue(kop.qubits, kop.kd_k.data(), state)); + + if (prob > max_prob) { + max_prob = prob; + max_prob_index = i; + } + + cp += prob - kop.prob; + + if (r < cp || i == channel.size() - 1) { + // Sample ith Kraus operator if r < cp + // Sample the highest probability Kraus operator if r is greater + // than the sum of all probabilities due to round-off errors. + uint64_t k = r < cp ?
i : max_prob_index; + + DeferOps(channel[k].ops, gates); + CollectStat(param.collect_kop_stat, k, stat); + + unitary = false; + + break; + } + } + } + + if (apply_last_deferred_ops || !stat.primary) { + if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { + return false; + } + + NormalizeState(!unitary, state_space, unitary, state); + } + + return true; + } + + static State CreateState(unsigned num_qubits, const StateSpace& state_space) { + auto state = state_space.Create(num_qubits); + if (state_space.IsNull(state)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return state_space.Null(); + } + + return state; + } + + static bool ApplyDeferredOps( + const Parameter& param, unsigned num_qubits, const Simulator& simulator, + std::vector& gates, State& state) { + if (gates.size() > 0) { + auto fgates = Fuser::FuseGates(param, num_qubits, gates); + + gates.resize(0); + + if (fgates.size() == 0) { + return false; + } + + for (const auto& fgate : fgates) { + ApplyFusedGate(simulator, fgate, state); + } + } + + return true; + } + + static MeasurementResult ApplyMeasurementGate( + const StateSpace& state_space, const Gate& gate, + RGen& rgen, State& state) { + auto result = state_space.Measure(gate.qubits, rgen, state); + + if (!result.valid) { + IO::errorf("measurement failed.\n"); + } + + return result; + } + + static void DeferOps( + const std::vector& ops, std::vector& gates) { + for (const auto& op : ops) { + gates.push_back(&op); + } + } + + static void CollectStat(bool collect_stat, uint64_t i, Stat& stat) { + if (collect_stat) { + stat.samples.push_back(i); + } + + if (i != 0) { + stat.primary = false; + } + } + + static void NormalizeState(bool normalize, const StateSpace& state_space, + bool& flag, State& state) { + if (normalize) { + double a = 1.0 / std::sqrt(state_space.Norm(state)); + state_space.Multiply(a, state); + flag = true; + } + } +}; + +} // namespace qsim + +#endif // QTRAJECTORY_H_ diff --git 
a/qsim/run_qsim.h b/qsim/run_qsim.h new file mode 100644 index 0000000..3752915 --- /dev/null +++ b/qsim/run_qsim.h @@ -0,0 +1,262 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef RUN_QSIM_H_ +#define RUN_QSIM_H_ + +#include +#include +#include + +#include "gate.h" +#include "gate_appl.h" +#include "util.h" + +namespace qsim { + +/** + * Helper struct for running qsim. + */ +template +struct QSimRunner final { + public: + using Simulator = typename Factory::Simulator; + using StateSpace = typename Simulator::StateSpace; + using State = typename StateSpace::State; + using MeasurementResult = typename StateSpace::MeasurementResult; + + /** + * User-specified parameters for gate fusion and simulation. + */ + struct Parameter : public Fuser::Parameter { + /** + * Random number generator seed to apply measurement gates. + */ + uint64_t seed; + }; + + /** + * Runs the given circuit, only measuring at the end. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param circuit The circuit to be simulated. + * @param measure Function that performs measurements (in the sense of + * computing expectation values, etc). + * @return True if the simulation completed successfully; false otherwise. 
+ */ + template + static bool Run(const Parameter& param, const Factory& factory, + const Circuit& circuit, MeasurementFunc measure) { + return Run(param, factory, {circuit.gates.back().time}, circuit, measure); + } + + /** + * Runs the given circuit, measuring at user-specified times. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param times_to_measure_at Time steps at which to perform measurements. + * @param circuit The circuit to be simulated. + * @param measure Function that performs measurements (in the sense of + * computing expectation values, etc). + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Factory& factory, + const std::vector& times_to_measure_at, + const Circuit& circuit, MeasurementFunc measure) { + double t0 = 0.0; + double t1 = 0.0; + + if (param.verbosity > 1) { + t0 = GetTime(); + } + + RGen rgen(param.seed); + + StateSpace state_space = factory.CreateStateSpace(); + + auto state = state_space.Create(circuit.num_qubits); + if (state_space.IsNull(state)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return false; + } + + state_space.SetStateZero(state); + Simulator simulator = factory.CreateSimulator(); + + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("init time is %g seconds.\n", t1 - t0); + t0 = GetTime(); + } + + auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits, + circuit.gates, times_to_measure_at); + + if (fused_gates.size() == 0 && circuit.gates.size() > 0) { + return false; + } + + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("fuse time is %g seconds.\n", t1 - t0); + } + + if (param.verbosity > 0) { + t0 = GetTime(); + } + + unsigned cur_time_index = 0; + + // Apply fused gates. 
+ for (std::size_t i = 0; i < fused_gates.size(); ++i) { + if (param.verbosity > 3) { + t1 = GetTime(); + } + + if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, + state)) { + IO::errorf("measurement failed.\n"); + return false; + } + + if (param.verbosity > 3) { + state_space.DeviceSync(); + double t2 = GetTime(); + IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); + } + + unsigned t = times_to_measure_at[cur_time_index]; + + if (i == fused_gates.size() - 1 || t < fused_gates[i + 1].time) { + // Call back to perform measurements. + measure(cur_time_index, state_space, state); + ++cur_time_index; + } + } + + if (param.verbosity > 0) { + state_space.DeviceSync(); + double t2 = GetTime(); + IO::messagef("time is %g seconds.\n", t2 - t0); + } + + return true; + } + + /** + * Runs the given circuit and make the final state available to the caller, + * recording the result of any intermediate measurements in the circuit. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param circuit The circuit to be simulated. + * @param state As an input parameter, this should contain the initial state + * of the system. After a successful run, it will be populated with the + * final state of the system. + * @param measure_results As an input parameter, this should be empty. + * After a successful run, this will contain all measurements results from + * the run, ordered by time and qubit index. + * @return True if the simulation completed successfully; false otherwise. 
+ */ + template + static bool Run(const Parameter& param, const Factory& factory, + const Circuit& circuit, State& state, + std::vector& measure_results) { + double t0 = 0.0; + double t1 = 0.0; + + if (param.verbosity > 1) { + t0 = GetTime(); + } + + RGen rgen(param.seed); + + StateSpace state_space = factory.CreateStateSpace(); + Simulator simulator = factory.CreateSimulator(); + + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("init time is %g seconds.\n", t1 - t0); + t0 = GetTime(); + } + + auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits, + circuit.gates); + + if (fused_gates.size() == 0 && circuit.gates.size() > 0) { + return false; + } + + measure_results.reserve(fused_gates.size()); + + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("fuse time is %g seconds.\n", t1 - t0); + } + + if (param.verbosity > 0) { + t0 = GetTime(); + } + + // Apply fused gates. + for (std::size_t i = 0; i < fused_gates.size(); ++i) { + if (param.verbosity > 3) { + t1 = GetTime(); + } + + if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, state, + measure_results)) { + IO::errorf("measurement failed.\n"); + return false; + } + + if (param.verbosity > 3) { + state_space.DeviceSync(); + double t2 = GetTime(); + IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); + } + } + + if (param.verbosity > 0) { + state_space.DeviceSync(); + double t2 = GetTime(); + IO::messagef("simu time is %g seconds.\n", t2 - t0); + } + + return true; + } + + /** + * Runs the given circuit and make the final state available to the caller, + * discarding the result of any intermediate measurements in the circuit. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param circuit The circuit to be simulated. + * @param state As an input parameter, this should contain the initial state + * of the system. 
After a successful run, it will be populated with the + * final state of the system. + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Factory& factory, + const Circuit& circuit, State& state) { + std::vector discarded_results; + return Run(param, factory, circuit, state, discarded_results); + } +}; + +} // namespace qsim + +#endif // RUN_QSIM_H_ diff --git a/qsim/run_qsimh.h b/qsim/run_qsimh.h new file mode 100644 index 0000000..c1534d3 --- /dev/null +++ b/qsim/run_qsimh.h @@ -0,0 +1,120 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef RUN_QSIMH_H_ +#define RUN_QSIMH_H_ + +#include +#include + +#include "hybrid.h" +#include "util.h" + +namespace qsim { + +/** + * Helper struct for running qsimh. + */ +template +struct QSimHRunner final { + using Gate = typename HybridSimulator::Gate; + using fp_type = typename HybridSimulator::fp_type; + + using Parameter = typename HybridSimulator::Parameter; + using HybridData = typename HybridSimulator::HybridData; + using Fuser = typename HybridSimulator::Fuser; + + /** + * Evaluates the amplitudes for a given circuit and set of output states. + * @param param Options for gate fusion, parallelism and logging. Also + * specifies the size of the 'prefix' and 'root' sections of the lattice. + * @param factory Object to create simulators and state spaces. 
+ * @param circuit The circuit to be simulated. + * @param parts Lattice sections to be simulated. + * @param bitstrings List of output states to simulate, as bitstrings. + * @param results Output vector of amplitudes. After a successful run, this + * will be populated with amplitudes for each state in 'bitstrings'. + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Factory& factory, + const Circuit& circuit, const std::vector& parts, + const std::vector& bitstrings, + std::vector>& results) { + if (circuit.num_qubits != parts.size()) { + IO::errorf("parts size is not equal to the number of qubits."); + return false; + } + + double t0 = 0.0; + + if (param.verbosity > 0) { + t0 = GetTime(); + } + + HybridData hd; + bool rc = HybridSimulator::SplitLattice(parts, circuit.gates, hd); + + if (!rc) { + return false; + } + + if (hd.num_gatexs < param.num_prefix_gatexs + param.num_root_gatexs) { + IO::errorf("error: num_prefix_gates (%u) plus num_root gates (%u) is " + "greater than num_gates_on_the_cut (%u).\n", + param.num_prefix_gatexs, param.num_root_gatexs, + hd.num_gatexs); + return false; + } + + if (param.verbosity > 0) { + PrintInfo(param, hd); + } + + auto fgates0 = Fuser::FuseGates(param, hd.num_qubits0, hd.gates0); + if (fgates0.size() == 0 && hd.gates0.size() > 0) { + return false; + } + + auto fgates1 = Fuser::FuseGates(param, hd.num_qubits1, hd.gates1); + if (fgates1.size() == 0 && hd.gates1.size() > 0) { + return false; + } + + rc = HybridSimulator(param.num_threads).Run( + param, factory, hd, parts, fgates0, fgates1, bitstrings, results); + + if (rc && param.verbosity > 0) { + double t1 = GetTime(); + IO::messagef("time elapsed %g seconds.\n", t1 - t0); + } + + return rc; + } + + private: + static void PrintInfo(const Parameter& param, const HybridData& hd) { + unsigned num_suffix_gates = + hd.num_gatexs - param.num_prefix_gatexs - param.num_root_gatexs; + + 
IO::messagef("part 0: %u, part 1: %u\n", hd.num_qubits0, hd.num_qubits1); + IO::messagef("%u gates on the cut\n", hd.num_gatexs); + IO::messagef("breakup: %up+%ur+%us\n", param.num_prefix_gatexs, + param.num_root_gatexs, num_suffix_gates); + } +}; + +} // namespace qsim + +#endif // RUN_QSIMH_H_ diff --git a/qsim/seqfor.h b/qsim/seqfor.h new file mode 100644 index 0000000..3ebf07c --- /dev/null +++ b/qsim/seqfor.h @@ -0,0 +1,68 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SEQFOR_H_ +#define SEQFOR_H_ + +#include +#include +#include + +namespace qsim { + +/** + * Helper struct for executing for loops in series. + */ +struct SequentialFor { + explicit SequentialFor(unsigned num_threads) {} + + // SequentialFor does not have any state. So all its methods can be static. + + static uint64_t GetIndex0(uint64_t size, unsigned thread_id) { + return 0; + } + + static uint64_t GetIndex1(uint64_t size, unsigned thread_id) { + return size; + } + + template + static void Run(uint64_t size, Function&& func, Args&&... args) { + for (uint64_t i = 0; i < size; ++i) { + func(1, 0, i, args...); + } + } + + template + static std::vector RunReduceP( + uint64_t size, Function&& func, Op&& op, Args&&...
args) { + typename Op::result_type result = 0; + + for (uint64_t i = 0; i < size; ++i) { + result = op(result, func(1, 0, i, args...)); + } + + return std::vector(1, result); + } + + template + static typename Op::result_type RunReduce(uint64_t size, Function&& func, + Op&& op, Args&&... args) { + return RunReduceP(size, func, std::move(op), args...)[0]; + } +}; + +} // namespace qsim + +#endif // SEQFOR_H_ diff --git a/qsim/simmux.h b/qsim/simmux.h new file mode 100644 index 0000000..d3c4074 --- /dev/null +++ b/qsim/simmux.h @@ -0,0 +1,44 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMMUX_H_ +#define SIMMUX_H_ + +#ifdef __AVX512F__ +# include "simulator_avx512.h" + namespace qsim { + template + using Simulator = SimulatorAVX512; + } +#elif __AVX2__ +# include "simulator_avx.h" + namespace qsim { + template + using Simulator = SimulatorAVX; + } +#elif __SSE4_1__ +# include "simulator_sse.h" + namespace qsim { + template + using Simulator = SimulatorSSE; + } +#else +# include "simulator_basic.h" + namespace qsim { + template + using Simulator = SimulatorBasic; + } +#endif + +#endif // SIMMUX_H_ diff --git a/qsim/simmux_gpu.h b/qsim/simmux_gpu.h new file mode 100644 index 0000000..1f0bb59 --- /dev/null +++ b/qsim/simmux_gpu.h @@ -0,0 +1,30 @@ +// Copyright 2023 Google LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMMUX_GPU_H_ +#define SIMMUX_GPU_H_ + +#ifdef __CUSTATEVEC__ +# include "simulator_custatevec.h" + namespace qsim { + using SimulatorGpu = SimulatorCuStateVec<>; + } +#else +# include "simulator_cuda.h" + namespace qsim { + using SimulatorGpu = SimulatorCUDA<>; + } +#endif + +#endif // SIMMUX_GPU_H_ diff --git a/qsim/simulator.h b/qsim/simulator.h new file mode 100644 index 0000000..eff5441 --- /dev/null +++ b/qsim/simulator.h @@ -0,0 +1,516 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_H_ +#define SIMULATOR_H_ + +#include + +#include "bits.h" + +namespace qsim { + +/** + * Base class for simulator classes. + */ +class SimulatorBase { + protected: + // The following template parameters are used for functions below. + // H - the number of high (target) qubits. + // L - the number of low (target) qubits.
+ // R - SIMD register width in floats. + + // Fills the table of masks (ms) that is used to calculate base state indices + // and the table of offset indices (xss) that is used to access the state + // vector entries in matrix-vector multiplication functions. This function is + // used in simulator_basic.h, simulator_sse.h and simulator_avx.h (no bmi2 + // version). + template + static void FillIndices(unsigned num_qubits, const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + constexpr unsigned hsize = 1 << H; + + if (H == 0) { + ms[0] = uint64_t(-1); + xss[0] = 0; + } else { + uint64_t xs[H + 1]; + + xs[0] = uint64_t{1} << (qs[L] + 1); + ms[0] = (uint64_t{1} << qs[L]) - 1; + for (unsigned i = 1; i < H; ++i) { + xs[i] = uint64_t{1} << (qs[L + i] + 1); + ms[i] = ((uint64_t{1} << qs[L + i]) - 1) ^ (xs[i - 1] - 1); + } + ms[H] = ((uint64_t{1} << num_qubits) - 1) ^ (xs[H - 1] - 1); + + for (unsigned i = 0; i < hsize; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < H; ++k) { + a += xs[k] * ((i >> k) & 1); + } + xss[i] = a; + } + } + } + + // Fills gate matrix entries for gates with low qubits. + template + static void FillMatrix(unsigned qmaskl, const fp_type* matrix, fp_type* w) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + constexpr unsigned rsize = 1 << R; + + unsigned s = 0; + + for (unsigned i = 0; i < hsize; ++i) { + for (unsigned j = 0; j < gsize; ++j) { + unsigned p0 = 2 * i * lsize * gsize + 2 * lsize * (j / lsize); + + for (unsigned k = 0; k < rsize; ++k) { + unsigned l = bits::CompressBits(k, R, qmaskl); + unsigned p = p0 + 2 * (gsize * l + (j + l) % lsize); + + w[s + 0] = matrix[p]; + w[s + rsize] = matrix[p + 1]; + + ++s; + } + + s += rsize; + } + } + } + + // Fills gate matrix entries for controlled gates with high target qubits + // and low control qubits. 
+ template + static void FillControlledMatrixH(uint64_t cvalsl, uint64_t cmaskl, + const fp_type* matrix, fp_type* w) { + constexpr unsigned hsize = 1 << H; + constexpr unsigned rsize = 1 << R; + + unsigned s = 0; + + for (unsigned i = 0; i < hsize; ++i) { + for (unsigned j = 0; j < hsize; ++j) { + unsigned p = hsize * i + j; + fp_type v = i == j ? 1 : 0; + + for (unsigned k = 0; k < rsize; ++k) { + w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v; + w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0; + + ++s; + } + + s += rsize; + } + } + } + + // Fills gate matrix entries for controlled gates with low target qubits + // and low control qubits. + template + static void FillControlledMatrixL(uint64_t cvalsl, uint64_t cmaskl, + unsigned qmaskl, const fp_type* matrix, + fp_type* w) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + constexpr unsigned rsize = 1 << R; + + unsigned s = 0; + + for (unsigned i = 0; i < hsize; ++i) { + for (unsigned j = 0; j < gsize; ++j) { + unsigned p0 = i * lsize * gsize + lsize * (j / lsize); + + for (unsigned k = 0; k < rsize; ++k) { + unsigned l = bits::CompressBits(k, R, qmaskl); + unsigned p = p0 + gsize * l + (j + l) % lsize; + + fp_type v = p / gsize == p % gsize ? 1 : 0; + + w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v; + w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0; + + ++s; + } + + s += rsize; + } + } + } + +/* + The GetMasks* functions below provide various masks and related values. + GetMasks1, GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6 are + used in simulator_avx.h (BMI2 version) and in simulator_avx512.h. GetMasks7, + GetMasks8, GetMasks9, GetMasks10 and GetMasks11 are used in simulator_avx.h + (no BMI2 version) and in simulator_sse.h. + + imaskh - inverted mask of high qubits (high control and target qubits). + qmaskh - mask of high qubits (high target qubits). 
+ cvalsh - control bit values of high control qubits placed in correct + positions. + cvalsl - control bit values of low control qubits placed in correct positions. + cmaskh - mask of high control qubits. + cmaskl - mask of low control qubits. + qmaskl - mask of low qubits (low target qubits). + cl - the number of low control qubits. + + Note that imaskh, qmaskh and cvalsh are multiplied by two in GetMasks1, + GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6. +*/ + + struct Masks1 { + uint64_t imaskh; + uint64_t qmaskh; + }; + + template + static Masks1 GetMasks1(const std::vector& qs) { + uint64_t qmaskh = 0; + + for (unsigned i = 0; i < H; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh}; + } + + struct Masks2 { + uint64_t imaskh; + uint64_t qmaskh; + unsigned qmaskl; + }; + + template + static Masks2 GetMasks2(const std::vector& qs) { + uint64_t qmaskh = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (unsigned i = L; i < H + L; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh, qmaskl}; + } + + struct Masks3 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + }; + + template + static Masks3 GetMasks3(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + + for (unsigned i = 0; i < H; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + cmaskh |= uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh}; + } + + struct Masks4 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + uint64_t cvalsl; + uint64_t cmaskl; + unsigned cl; + }; + + template + static Masks4 GetMasks4(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t 
cvals) { + unsigned cl = 0; + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl = 0; + + for (unsigned i = 0; i < H; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, cl}; + } + + struct Masks5 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + unsigned qmaskl; + }; + + template + static Masks5 GetMasks5(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (unsigned i = L; i < H + L; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + cmaskh |= uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh, qmaskl}; + } + + struct Masks6 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + uint64_t cvalsl; + uint64_t cmaskl; + unsigned qmaskl; + unsigned cl; + }; + + template + static Masks6 GetMasks6(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + unsigned cl = 0; + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (unsigned i = L; i < H + L; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, 
cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, qmaskl, cl}; + } + + struct Masks7 { + uint64_t cvalsh; + uint64_t cmaskh; + }; + + static Masks7 GetMasks7(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t cmaskh = 0; + + for (auto q : cqs) { + cmaskh |= uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + return {cvalsh, cmaskh}; + } + + struct Masks8 { + uint64_t cvalsh; + uint64_t cmaskh; + uint64_t cvalsl; + uint64_t cmaskl; + }; + + template + static Masks8 GetMasks8(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + unsigned cl = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl = 0; + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + return {cvalsh, cmaskh, cvalsl, cmaskl}; + } + + struct Masks9 { + uint64_t cvalsh; + uint64_t cmaskh; + unsigned qmaskl; + }; + + template + static Masks9 GetMasks9(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t cmaskh = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (auto q : cqs) { + cmaskh |= uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + return {cvalsh, cmaskh, qmaskl}; + } + + struct Masks10 { + uint64_t cvalsh; + uint64_t cmaskh; + uint64_t cvalsl; + uint64_t cmaskl; + unsigned qmaskl; + }; + + template + static Masks10 GetMasks10(unsigned num_qubits, + const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + unsigned cl = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl 
= 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + return {cvalsh, cmaskh, cvalsl, cmaskl, qmaskl}; + } + + struct Masks11 { + unsigned qmaskl; + }; + + template + static Masks11 GetMasks11(const std::vector& qs) { + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + return {qmaskl}; + } + + template + static unsigned MaskedAdd( + unsigned a, unsigned b, unsigned mask, unsigned lsize) { + unsigned c = bits::CompressBits(a, R, mask); + return bits::ExpandBits((c + b) % lsize, R, mask); + } +}; + +template <> +inline void SimulatorBase::FillIndices<0, 1>(unsigned num_qubits, + const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + ms[0] = -1; + xss[0] = 0; +} + +template <> +inline void SimulatorBase::FillIndices<0, 2>(unsigned num_qubits, + const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + ms[0] = -1; + xss[0] = 0; +} + +template <> +inline void SimulatorBase::FillIndices<0, 3>(unsigned num_qubits, + const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + ms[0] = -1; + xss[0] = 0; +} + +} // namespace qsim + +#endif // SIMULATOR_H_ diff --git a/qsim/simulator_avx.h b/qsim/simulator_avx.h new file mode 100644 index 0000000..9742849 --- /dev/null +++ b/qsim/simulator_avx.h @@ -0,0 +1,1363 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_AVX_H_ +#define SIMULATOR_AVX_H_ + +#include + +#include +#include +#include +#include + +#include "simulator.h" +#include "statespace_avx.h" + +namespace qsim { + +/** + * Quantum circuit simulator with AVX vectorization. + */ +template +class SimulatorAVX final : public SimulatorBase { + public: + using StateSpace = StateSpaceAVX; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + template + explicit SimulatorAVX(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using AVX instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 0: + ApplyGateH<0>(qs, matrix, state); + break; + case 1: + if (qs[0] > 2) { + ApplyGateH<1>(qs, matrix, state); + } else { + ApplyGateL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 2) { + ApplyGateH<2>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<1, 1>(qs, matrix, state); + } else { + ApplyGateL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 2) { + ApplyGateH<3>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<2, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<1, 2>(qs, matrix, state); + } else { + ApplyGateL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 2) { + ApplyGateH<4>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<3, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<2, 2>(qs, matrix, state); + } else { + ApplyGateL<1, 3>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 2) { + ApplyGateH<5>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<4, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<3, 2>(qs, matrix, state); + } else { + ApplyGateL<2, 3>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 2) { + ApplyGateH<6>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<5, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<4, 2>(qs, matrix, state); + } else { + ApplyGateL<3, 3>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using AVX instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 0: + if (cqs[0] > 2) { + ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state); + } + break; + case 1: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 2: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 3: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { 
+ if (cqs[0] > 2) { + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 4: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Computes the expectation value of an operator using AVX instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 1: + if (qs[0] > 2) { + return ExpectationValueH<1>(qs, matrix, state); + } else { + return ExpectationValueL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 2) { + return ExpectationValueH<2>(qs, matrix, state); + } else if (qs[1] > 2) { + return ExpectationValueL<1, 1>(qs, matrix, state); + } else { + return ExpectationValueL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 2) { + return ExpectationValueH<3>(qs, matrix, state); + } else if (qs[1] > 2) { + return ExpectationValueL<2, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + return ExpectationValueL<1, 2>(qs, matrix, state); + } else { + return ExpectationValueL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 2) { + return ExpectationValueH<4>(qs, matrix, state); + } else if (qs[1] > 2) { + return ExpectationValueL<3, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + return ExpectationValueL<2, 2>(qs, matrix, state); + } else { + return ExpectationValueL<1, 3>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 2) { + return ExpectationValueH<5>(qs, matrix, state); + } else if (qs[1] > 2) { + return ExpectationValueL<4, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + return ExpectationValueL<3, 2>(qs, matrix, state); + } else { + return ExpectationValueL<2, 3>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 2) { + return ExpectationValueH<6>(qs, matrix, state); + } else if (qs[1] > 2) { + return ExpectationValueL<5, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + return ExpectationValueL<4, 2>(qs, matrix, state); + } else { + return ExpectationValueL<3, 3>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + + return 0; + } + + /** + * @return The size of SIMD register if applicable. 
+ */ + static unsigned SIMDRegisterSize() { + return 8; + } + + private: +#ifdef __BMI2__ + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + auto m = GetMasks1(qs); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); + + unsigned k = 3 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 rn, in; + __m256 rs[hsize], is[hsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256 w[1 << (1 + 2 * H)]; + + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + const __m256i* idx, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H + cqs.size(); + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); + } + } + + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn)); + __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in)); + + re += detail::HorizontalSumAVX(v_re); + im += detail::HorizontalSumAVX(v_im); + } + + return std::complex{re, im}; + }; + + auto m = GetMasks1(qs); + 
+ unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get()); + } + + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, + const fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + unsigned m = lsize * k; + + __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); + __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX(v_re); + im += detail::HorizontalSumAVX(v_im); + } + + return std::complex{re, im}; + }; + + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + 
FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get()); + } + +#else // __BMI2__ + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, const __m256i* idx, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); + + FillIndices(state.num_qubits(), qs, ms, xss); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + if ((ii & cmaskh) != cvalsh) return; + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 rn, in; + __m256 rs[hsize], is[hsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + if ((ii & cmaskh) != cvalsh) return; + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256 w[1 << (1 + 2 * H)]; + + auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, const __m256i* idx, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + if ((ii & cmaskh) != cvalsh) return; + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + if (CH) { + auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get()); + } else { + auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get()); + } + } + + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn)); + __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in)); + + re += detail::HorizontalSumAVX(v_re); + im += 
detail::HorizontalSumAVX(v_im); + } + + return std::complex{re, im}; + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); + } + + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, const __m256i* idx, + const fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + unsigned m = lsize * k; + + __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); + __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); + + 
re += detail::HorizontalSumAVX(v_re); + im += detail::HorizontalSumAVX(v_im); + } + + return std::complex{re, im}; + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); + + FillIndices(state.num_qubits(), qs, ms, xss); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, state.get()); + } + +#endif // __BMI2__ + + template + static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) { + constexpr unsigned lsize = 1 << L; + + for (unsigned i = 0; i < lsize - 1; ++i) { + unsigned p[8]; + + for (unsigned j = 0; j < 8; ++j) { + p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); + } + + idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); + } + } + + For for_; +}; + +} // namespace qsim + +#endif // SIMULATOR_AVX_H_ diff --git a/qsim/simulator_avx512.h b/qsim/simulator_avx512.h new file mode 100644 index 0000000..21a2e9d --- /dev/null +++ b/qsim/simulator_avx512.h @@ -0,0 +1,846 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef SIMULATOR_AVX512_H_ +#define SIMULATOR_AVX512_H_ + +#include + +#include +#include +#include +#include + +#include "simulator.h" +#include "statespace_avx512.h" + +namespace qsim { + +/** + * Quantum circuit simulator with AVX512 vectorization. + */ +template +class SimulatorAVX512 final : public SimulatorBase { + public: + using StateSpace = StateSpaceAVX512; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + template + explicit SimulatorAVX512(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 0: + ApplyGateH<0>(qs, matrix, state); + break; + case 1: + if (qs[0] > 3) { + ApplyGateH<1>(qs, matrix, state); + } else { + ApplyGateL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 3) { + ApplyGateH<2>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<1, 1>(qs, matrix, state); + } else { + ApplyGateL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 3) { + ApplyGateH<3>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<2, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<1, 2>(qs, matrix, state); + } else { + ApplyGateL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 3) { + ApplyGateH<4>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<3, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<2, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<1, 3>(qs, matrix, state); + } else { + ApplyGateL<0, 4>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 3) { + ApplyGateH<5>(qs, matrix, state); + } else if (qs[1] > 
3) { + ApplyGateL<4, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<3, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<2, 3>(qs, matrix, state); + } else { + ApplyGateL<1, 4>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 3) { + ApplyGateH<6>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<5, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<4, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<3, 3>(qs, matrix, state); + } else { + ApplyGateL<2, 4>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . 
+ + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 0: + if (cqs[0] > 3) { + ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state); + } + break; + case 1: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 2: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 3: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 4: + if (qs[0] > 3) { + if (cqs[0] > 3) { + 
ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[3] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Computes the expectation value of an operator using AVX512 instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 1: + if (qs[0] > 3) { + return ExpectationValueH<1>(qs, matrix, state); + } else { + return ExpectationValueL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 3) { + return ExpectationValueH<2>(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValueL<1, 1>(qs, matrix, state); + } else { + return ExpectationValueL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 3) { + return ExpectationValueH<3>(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValueL<2, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValueL<1, 2>(qs, matrix, state); + } else { + return ExpectationValueL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 3) { + return ExpectationValueH<4>(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValueL<3, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValueL<2, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + return ExpectationValueL<1, 3>(qs, matrix, state); + } else { + return ExpectationValueL<0, 4>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 3) { + return ExpectationValueH<5>(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValueL<4, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValueL<3, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + return ExpectationValueL<2, 3>(qs, matrix, state); + } else { + return ExpectationValueL<1, 4>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 3) { + return ExpectationValueH<6>(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValueL<5, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValueL<4, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + return ExpectationValueL<3, 3>(qs, matrix, state); + } else { + return ExpectationValueL<2, 4>(qs, matrix, state); + } + break; + default: + // Not implemented. 
+ break; + } + + return 0; + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 16; + } + + private: + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 ru, iu, rn, in; + __m512 rs[hsize], is[hsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + auto m = GetMasks1(qs); + + unsigned k = 4 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m512 rn, in; + __m512 rs[gsize], is[gsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 ru, iu, rn, in; + __m512 rs[hsize], is[hsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); + + unsigned k = 4 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 rn, in; + __m512 rs[hsize], is[hsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512 w[1 << (1 + 2 * H)]; + + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + const __m512i* idx, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m512 rn, in; + __m512 rs[gsize], is[gsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; + + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H + cqs.size(); + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); + } + } + + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 ru, iu, rn, in; + __m512 rs[hsize], is[hsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[k], in, _mm512_mul_ps(rs[k], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[k], rn, _mm512_mul_ps(rs[k], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + auto m = 
GetMasks1(qs); + + unsigned k = 4 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get()); + } + + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, + const fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m512 rn, in; + __m512 rs[gsize], is[gsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); + } + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + unsigned m = lsize * k; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + 
FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get()); + } + + template + static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) { + constexpr unsigned lsize = 1 << L; + + for (unsigned i = 0; i < lsize; ++i) { + unsigned p[16]; + + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + } + + For for_; +}; + +} // namespace qsim + +#endif // SIMULATOR_AVX512_H_ diff --git a/qsim/simulator_basic.h b/qsim/simulator_basic.h new file mode 100644 index 0000000..752eeb5 --- /dev/null +++ b/qsim/simulator_basic.h @@ -0,0 +1,349 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_BASIC_H_ +#define SIMULATOR_BASIC_H_ + +#include +#include +#include +#include + +#include "simulator.h" +#include "statespace_basic.h" + +namespace qsim { + +/** + * Quantum circuit simulator without vectorization. 
+ */ +template +class SimulatorBasic final : public SimulatorBase { + public: + using StateSpace = StateSpaceBasic; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + template + explicit SimulatorBasic(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using non-vectorized instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 0: + ApplyGateH<0>(qs, matrix, state); + break; + case 1: + ApplyGateH<1>(qs, matrix, state); + break; + case 2: + ApplyGateH<2>(qs, matrix, state); + break; + case 3: + ApplyGateH<3>(qs, matrix, state); + break; + case 4: + ApplyGateH<4>(qs, matrix, state); + break; + case 5: + ApplyGateH<5>(qs, matrix, state); + break; + case 6: + ApplyGateH<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using non-vectorized instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 0: + ApplyControlledGateH<0>(qs, cqs, cvals, matrix, state); + break; + case 1: + ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; + } + } + + /** + * Computes the expectation value of an operator using non-vectorized + * instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 1: + return ExpectationValueH<1>(qs, matrix, state); + break; + case 2: + return ExpectationValueH<2>(qs, matrix, state); + break; + case 3: + return ExpectationValueH<3>(qs, matrix, state); + break; + case 4: + return ExpectationValueH<4>(qs, matrix, state); + break; + case 5: + return ExpectationValueH<5>(qs, matrix, state); + break; + case 6: + return ExpectationValueH<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; + } + + return 0; + } + + /** + * @return The size of SIMD register if applicable. 
+ */ + static unsigned SIMDRegisterSize() { + return 1; + } + + private: + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + fp_type rn, in; + fp_type rs[hsize], is[hsize]; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; + + j += 2; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, state.get()); + } + + template + void ApplyControlledGateH(const std::vector& qs, + const std::vector& cqs, + uint64_t cvals, const fp_type* matrix, + State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + uint64_t cvalsh, uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + fp_type rn, in; + fp_type rs[hsize], is[hsize]; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + if ((ii & cmaskh) == cvalsh) { + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; + + j += 2; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; + } + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + + unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); + } + + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + fp_type rn, in; + fp_type rs[hsize], is[hsize]; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; + + j += 2; + } + + re += rs[k] * rn + is[k] * in; + im += rs[k] * in - is[k] * rn; + } + + return std::complex{re, im}; + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); + } + + For for_; +}; + +} // namespace qsim + +#endif // SIMULATOR_BASIC_H_ diff --git a/qsim/simulator_cuda.h b/qsim/simulator_cuda.h new file mode 100644 index 0000000..5743bea --- /dev/null +++ b/qsim/simulator_cuda.h @@ -0,0 +1,923 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_CUDA_H_ +#define SIMULATOR_CUDA_H_ + +#include "simulator_cuda_kernels.h" + +#include +#include +#include +#include +#include + +#include "bits.h" +#include "statespace_cuda.h" + +namespace qsim { + +/** + * Quantum circuit simulator with GPU vectorization. + */ +template +class SimulatorCUDA final { + private: + using idx_type = uint64_t; + using Complex = qsim::Complex; + + // The maximum buffer size for indices and gate matrices. + // The maximum gate matrix size (for 6-qubit gates) is + // 2 * 2^6 * 2^6 * sizeof(FP) = 8192 * sizeof(FP). The maximum index size is + // 128 * sizeof(idx_type) + 96 * sizeof(unsigned). + static constexpr unsigned max_buf_size = 8192 * sizeof(FP) + + 128 * sizeof(idx_type) + 96 * sizeof(unsigned); + + public: + using StateSpace = StateSpaceCUDA; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + SimulatorCUDA() : scratch_(nullptr), scratch_size_(0) { + ErrorCheck(cudaMalloc(&d_ws, max_buf_size)); + } + + ~SimulatorCUDA() { + ErrorCheck(cudaFree(d_ws)); + + if (scratch_ != nullptr) { + ErrorCheck(cudaFree(scratch_)); + } + } + + /** + * Applies a gate using CUDA instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + if (qs.size() == 0) { + ApplyGateH<0>(qs, matrix, state); + } else if (qs[0] > 4) { + switch (qs.size()) { + case 1: + ApplyGateH<1>(qs, matrix, state); + break; + case 2: + ApplyGateH<2>(qs, matrix, state); + break; + case 3: + ApplyGateH<3>(qs, matrix, state); + break; + case 4: + ApplyGateH<4>(qs, matrix, state); + break; + case 5: + ApplyGateH<5>(qs, matrix, state); + break; + case 6: + ApplyGateH<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; + } + } else { + switch (qs.size()) { + case 1: + ApplyGateL<1>(qs, matrix, state); + break; + case 2: + ApplyGateL<2>(qs, matrix, state); + break; + case 3: + ApplyGateL<3>(qs, matrix, state); + break; + case 4: + ApplyGateL<4>(qs, matrix, state); + break; + case 5: + ApplyGateL<5>(qs, matrix, state); + break; + case 6: + ApplyGateL<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; + } + } + } + + /** + * Applies a controlled gate using CUDA instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + if (cqs[0] < 5) { + switch (qs.size()) { + case 0: + ApplyControlledGateL<0>(qs, cqs, cvals, matrix, state); + break; + case 1: + ApplyControlledGateL<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateL<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateL<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateL<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. 
+ break; + } + } else { + if (qs.size() == 0) { + ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); + } else if (qs[0] > 4) { + switch (qs.size()) { + case 1: + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; + } + } else { + switch (qs.size()) { + case 1: + ApplyControlledGateLH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateLH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateLH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateLH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; + } + } + } + } + + /** + * Computes the expectation value of an operator using CUDA instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + if (qs[0] > 4) { + switch (qs.size()) { + case 1: + return ExpectationValueH<1>(qs, matrix, state); + case 2: + return ExpectationValueH<2>(qs, matrix, state); + case 3: + return ExpectationValueH<3>(qs, matrix, state); + case 4: + return ExpectationValueH<4>(qs, matrix, state); + case 5: + return ExpectationValueH<5>(qs, matrix, state); + case 6: + return ExpectationValueH<6>(qs, matrix, state); + default: + // Not implemented. 
+ break; + } + } else { + switch (qs.size()) { + case 1: + return ExpectationValueL<1>(qs, matrix, state); + case 2: + return ExpectationValueL<2>(qs, matrix, state); + case 3: + return ExpectationValueL<3>(qs, matrix, state); + case 4: + return ExpectationValueL<4>(qs, matrix, state); + case 5: + return ExpectationValueL<5>(qs, matrix, state); + case 6: + return ExpectationValueL<6>(qs, matrix, state); + default: + // Not implemented. + break; + } + } + + return 0; + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 32; + } + + private: + // The following indices are used in kernels. + // xss - indices to access the state vector entries in global memory. + // ms - masks to access the state vector entries in global memory. + // tis - indices to access the state vector entries in shared memory + // in the presence of low gate qubits. + // qis - indices to access the state vector entries in shared memory + // in the presence of low gate qubits. + // cis - additional indices to access the state vector entries in global + // memory in the presence of low control qubits. 
+ + template + struct IndicesH { + static constexpr unsigned gsize = 1 << G; + static constexpr unsigned matrix_size = 2 * gsize * gsize * sizeof(fp_type); + static constexpr unsigned xss_size = 32 * sizeof(idx_type) * (1 + (G == 6)); + static constexpr unsigned ms_size = 32 * sizeof(idx_type); + static constexpr unsigned xss_offs = matrix_size; + static constexpr unsigned ms_offs = xss_offs + xss_size; + static constexpr unsigned buf_size = ms_offs + ms_size; + + IndicesH(char* p) + : xss((idx_type*) (p + xss_offs)), ms((idx_type*) (p + ms_offs)) {} + + idx_type* xss; + idx_type* ms; + }; + + template + struct IndicesL : public IndicesH { + using Base = IndicesH; + static constexpr unsigned qis_size = 32 * sizeof(unsigned) * (1 + (G == 6)); + static constexpr unsigned tis_size = 32 * sizeof(unsigned); + static constexpr unsigned qis_offs = Base::buf_size; + static constexpr unsigned tis_offs = qis_offs + qis_size; + static constexpr unsigned buf_size = tis_offs + tis_size; + + IndicesL(char* p) + : Base(p), qis((unsigned*) (p + qis_offs)), + tis((unsigned*) (p + tis_offs)) {} + + unsigned* qis; + unsigned* tis; + }; + + template + struct IndicesLC : public IndicesL { + using Base = IndicesL; + static constexpr unsigned cis_size = 32 * sizeof(idx_type); + static constexpr unsigned cis_offs = Base::buf_size; + static constexpr unsigned buf_size = cis_offs + cis_size; + + IndicesLC(char* p) : Base(p), cis((idx_type*) (p + cis_offs)) {} + + idx_type* cis; + }; + + struct DataC { + idx_type cvalsh; + unsigned num_aqs; + unsigned num_effective_qs; + unsigned remaining_low_cqs; + }; + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesH h_i(h_ws); + GetIndicesH(num_qubits, qs, qs.size(), h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + G; + unsigned n 
= num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 64U; + unsigned blocks = std::max(1U, size / 2); + + IndicesH d_i(d_ws); + + ApplyGateH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesL h_i(h_ws); + auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + num_effective_qs; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; + + IndicesL d_i(d_ws); + + ApplyGateL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + 1 << num_effective_qs, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, idx_type cvals, + const fp_type* matrix, State& state) const { + unsigned aqs[64]; + idx_type cmaskh = 0; + unsigned num_qubits = state.num_qubits(); + + IndicesH h_i(h_ws); + + unsigned num_aqs = GetHighQubits(qs, 0, cqs, 0, 0, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, h_i.ms); + GetXss(num_qubits, qs, qs.size(), h_i.xss); + + idx_type cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? 
num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 64U; + unsigned blocks = std::max(1U, size / 2); + + IndicesH d_i(d_ws); + + ApplyControlledGateH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); + } + + template + void ApplyControlledGateLH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesL h_i(h_ws); + auto d = GetIndicesLC(num_qubits, qs, cqs, cvals, h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; + + IndicesL d_i(d_ws); + + ApplyControlledGateLH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesLC h_i(h_ws); + auto d = GetIndicesLCL(num_qubits, qs, cqs, cvals, h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? 
num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; + + IndicesLC d_i(d_ws); + + ApplyControlledGateL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, + d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, + 1 << (5 - d.remaining_low_cqs), state.get()); + } + + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesH h_i(h_ws); + GetIndicesH(num_qubits, qs, qs.size(), h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + G; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + + unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); + unsigned threads = 64U; + unsigned blocks = std::max(1U, (size / 2) >> s); + unsigned num_iterations_per_block = 1 << s; + + constexpr unsigned m = 16; + + Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); + Complex* d_res2 = d_res1 + blocks; + + IndicesH d_i(d_ws); + + ExpectationValueH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, + state.get(), Plus(), d_res1); + + double mul = size == 1 ? 0.5 : 1.0; + + return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); + } + + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesL h_i(h_ws); + auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + num_effective_qs; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + + unsigned s = std::min(n >= 13 ? 
n - 13 : 0, 5U); + unsigned threads = 32; + unsigned blocks = size >> s; + unsigned num_iterations_per_block = 1 << s; + + constexpr unsigned m = 16; + + Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); + Complex* d_res2 = d_res1 + blocks; + + IndicesL d_i(d_ws); + + ExpectationValueL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + num_iterations_per_block, state.get(), Plus(), d_res1); + + double mul = double(1 << (5 + num_effective_qs - G)) / 32; + + return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); + } + + template + std::complex ExpectationValueReduceFinal( + unsigned blocks, double mul, + const Complex* d_res1, Complex* d_res2) const { + Complex res2[m]; + + if (blocks <= 16) { + ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex), + cudaMemcpyDeviceToHost)); + } else { + unsigned threads2 = std::min(1024U, blocks); + unsigned blocks2 = std::min(m, blocks / threads2); + + unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2)); + unsigned bytes = threads2 * sizeof(Complex); + + Reduce2Kernel<<>>( + dblocks, blocks, Plus(), Plus(), d_res1, d_res2); + + ErrorCheck(cudaMemcpy(res2, d_res2, blocks2 * sizeof(Complex), + cudaMemcpyDeviceToHost)); + + blocks = blocks2; + } + + double re = 0; + double im = 0; + + for (unsigned i = 0; i < blocks; ++i) { + re += res2[i].re; + im += res2[i].im; + } + + return {mul * re, mul * im}; + } + + template + unsigned GetHighQubits(const std::vector& qs, unsigned qi, + const std::vector& cqs, unsigned ci, + unsigned ai, idx_type& cmaskh, AQ& aqs) const { + while (1) { + if (qi < qs.size() && (ci == cqs.size() || qs[qi] < cqs[ci])) { + aqs[ai++] = qs[qi++]; + } else if (ci < cqs.size()) { + cmaskh |= idx_type{1} << cqs[ci]; + aqs[ai++] = cqs[ci++]; + } else { + break; + } + } + + return ai; + } + + template + void GetMs(unsigned num_qubits, const QS& qs, unsigned qs_size, + idx_type* ms) const { + if (qs_size == 0) { + ms[0] = idx_type(-1); + } else { + 
idx_type xs = idx_type{1} << (qs[0] + 1); + ms[0] = (idx_type{1} << qs[0]) - 1; + for (unsigned i = 1; i < qs_size; ++i) { + ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs - 1); + xs = idx_type{1} << (qs[i] + 1); + } + ms[qs_size] = ((idx_type{1} << num_qubits) - 1) ^ (xs - 1); + } + } + + template + void GetXss(unsigned num_qubits, const QS& qs, unsigned qs_size, + idx_type* xss) const { + if (qs_size == 0) { + xss[0] = 0; + } else { + unsigned g = qs_size; + unsigned gsize = 1 << qs_size; + + idx_type xs[64]; + + xs[0] = idx_type{1} << (qs[0] + 1); + for (unsigned i = 1; i < g; ++i) { + xs[i] = idx_type{1} << (qs[i] + 1); + } + + for (unsigned i = 0; i < gsize; ++i) { + idx_type a = 0; + for (unsigned k = 0; k < g; ++k) { + a += xs[k] * ((i >> k) & 1); + } + xss[i] = a; + } + } + } + + template + void GetIndicesH(unsigned num_qubits, const qs_type& qs, unsigned qs_size, + IndicesH& indices) const { + if (qs_size == 0) { + indices.ms[0] = idx_type(-1); + indices.xss[0] = 0; + } else { + unsigned g = qs_size; + unsigned gsize = 1 << qs_size; + + idx_type xs[64]; + + xs[0] = idx_type{1} << (qs[0] + 1); + indices.ms[0] = (idx_type{1} << qs[0]) - 1; + for (unsigned i = 1; i < g; ++i) { + xs[i] = idx_type{1} << (qs[i] + 1); + indices.ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs[i - 1] - 1); + } + indices.ms[g] = ((idx_type{1} << num_qubits) - 1) ^ (xs[g - 1] - 1); + + for (unsigned i = 0; i < gsize; ++i) { + idx_type a = 0; + for (unsigned k = 0; k < g; ++k) { + a += xs[k] * ((i >> k) & 1); + } + indices.xss[i] = a; + } + } + } + + template + void GetIndicesL(unsigned num_effective_qs, unsigned qmask, + IndicesL& indices) const { + for (unsigned i = num_effective_qs + 1; i < (G + 1); ++i) { + indices.ms[i] = 0; + } + + for (unsigned i = (1 << num_effective_qs); i < indices.gsize; ++i) { + indices.xss[i] = 0; + } + + for (unsigned i = 0; i < indices.gsize; ++i) { + indices.qis[i] = bits::ExpandBits(i, 5 + num_effective_qs, qmask); + } + + unsigned tmask = ((1 << (5 + 
num_effective_qs)) - 1) ^ qmask; + for (unsigned i = 0; i < 32; ++i) { + indices.tis[i] = bits::ExpandBits(i, 5 + num_effective_qs, tmask); + } + } + + template + unsigned GetIndicesL(unsigned num_qubits, const std::vector& qs, + IndicesL& indices) const { + unsigned eqs[32]; + + unsigned qmaskh = 0; + unsigned qmaskl = 0; + + unsigned qi = 0; + + while (qi < qs.size() && qs[qi] < 5) { + qmaskl |= 1 << qs[qi++]; + } + + unsigned nq = std::max(5U, num_qubits); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); + + unsigned l = 0; + unsigned ei = 0; + unsigned num_low_qs = qi; + + if (qs.size() == num_low_qs) { + while (ei < num_effective_qs && l++ < num_low_qs) { + eqs[ei] = ei + 5; + ++ei; + } + } else { + while (ei < num_effective_qs && l < num_low_qs) { + unsigned ei5 = ei + 5; + eqs[ei] = ei5; + if (qi < qs.size() && qs[qi] == ei5) { + ++qi; + qmaskh |= 1 << ei5; + } else { + ++l; + } + ++ei; + } + + while (ei < num_effective_qs) { + eqs[ei] = qs[qi++]; + qmaskh |= 1 << (ei + 5); + ++ei; + } + } + + GetIndicesH(num_qubits, eqs, num_effective_qs, indices); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); + + return num_effective_qs; + } + + template + DataC GetIndicesLC(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + IndicesL& indices) const { + unsigned aqs[64]; + unsigned eqs[32]; + + unsigned qmaskh = 0; + unsigned qmaskl = 0; + idx_type cmaskh = 0; + + unsigned qi = 0; + + while (qi < qs.size() && qs[qi] < 5) { + qmaskl |= 1 << qs[qi++]; + } + + unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); + + unsigned l = 0; + unsigned ai = 5; + unsigned ci = 0; + unsigned ei = 0; + unsigned num_low_qs = qi; + + while (ai < num_qubits && l < num_low_qs) { + aqs[ai - 5] = ai; + if (qi < qs.size() && qs[qi] == ai) { + ++qi; + eqs[ei++] = ai; + qmaskh |= 1 << (ai - ci); + } else if (ci < cqs.size() && cqs[ci] == ai) { + ++ci; 
+ cmaskh |= idx_type{1} << ai; + } else { + ++l; + eqs[ei++] = ai; + } + ++ai; + } + + unsigned i = ai; + unsigned j = qi; + + while (ei < num_effective_qs) { + eqs[ei++] = qs[j++]; + qmaskh |= 1 << (i++ - ci); + } + + unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, indices.ms); + GetXss(num_qubits, eqs, num_effective_qs, indices.xss); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); + + idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); + + return {cvalsh, num_aqs, num_effective_qs}; + } + + template + DataC GetIndicesLCL(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + IndicesLC& indices) const { + unsigned aqs[64]; + unsigned eqs[32]; + + unsigned qmaskh = 0; + unsigned qmaskl = 0; + idx_type cmaskh = 0; + idx_type cmaskl = 0; + idx_type cis_mask = 0; + + unsigned qi = 0; + unsigned ci = 0; + + for (unsigned k = 0; k < 5; ++k) { + if (qi < qs.size() && qs[qi] == k) { + qmaskl |= 1 << (k - ci); + ++qi; + } else if (ci < cqs.size() && cqs[ci] == k) { + cmaskl |= idx_type{1} << k; + ++ci; + } + } + + unsigned num_low_qs = qi; + unsigned num_low_cqs = ci; + + unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); + + unsigned l = 0; + unsigned ai = 5; + unsigned ei = 0; + unsigned num_low = num_low_qs + num_low_cqs; + unsigned remaining_low_cqs = num_low_cqs; + unsigned effective_low_qs = num_low_qs; + unsigned highest_cis_bit = 0; + + while (ai < num_qubits && l < num_low) { + aqs[ai - 5] = ai; + if (qi < qs.size() && qs[qi] == ai) { + ++qi; + if ((ai - ci) > 4) { + eqs[ei++] = ai; + qmaskh |= 1 << (ai - ci); + } else { + highest_cis_bit = ai; + cis_mask |= idx_type{1} << ai; + qmaskl |= 1 << (ai - ci); + --remaining_low_cqs; + ++effective_low_qs; + } + } else if (ci < cqs.size() && cqs[ci] == ai) { + ++ci; + cmaskh |= idx_type{1} << ai; + } else { + ++l; + if 
(remaining_low_cqs == 0) { + eqs[ei++] = ai; + } else { + highest_cis_bit = ai; + cis_mask |= idx_type{1} << ai; + --remaining_low_cqs; + } + } + ++ai; + } + + unsigned i = ai; + unsigned j = effective_low_qs; + + while (ei < num_effective_qs) { + eqs[ei++] = qs[j++]; + qmaskh |= 1 << (i++ - ci); + } + + unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, indices.ms); + GetXss(num_qubits, eqs, num_effective_qs, indices.xss); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); + + idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); + idx_type cvalsl = bits::ExpandBits(idx_type(cvals), 5, cmaskl); + + cis_mask |= 31 ^ cmaskl; + highest_cis_bit = highest_cis_bit < 5 ? 5 : highest_cis_bit; + for (idx_type i = 0; i < 32; ++i) { + auto c = bits::ExpandBits(i, highest_cis_bit + 1, cis_mask); + indices.cis[i] = 2 * (c & 0xffffffe0) | (c & 0x1f) | cvalsl; + } + + return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs}; + } + + + void* AllocScratch(uint64_t size) const { + if (size > scratch_size_) { + if (scratch_ != nullptr) { + ErrorCheck(cudaFree(scratch_)); + } + + ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); + + const_cast(scratch_size_) = size; + } + + return scratch_; + } + + char* d_ws; + char h_ws0[max_buf_size]; + char* h_ws = (char*) h_ws0; + + void* scratch_; + uint64_t scratch_size_; +}; + +} // namespace qsim + +#endif // SIMULATOR_CUDA_H_ diff --git a/qsim/simulator_cuda_kernels.h b/qsim/simulator_cuda_kernels.h new file mode 100644 index 0000000..e21a9d6 --- /dev/null +++ b/qsim/simulator_cuda_kernels.h @@ -0,0 +1,683 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_CUDA_KERNELS_H_ +#define SIMULATOR_CUDA_KERNELS_H_ + +#ifdef __NVCC__ + #include + #include + + #include "util_cuda.h" +#elif __HIP__ + #include + #include "cuda2hip.h" +#endif + +namespace qsim { + +template +__global__ void ApplyGateH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 64. + + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? gsize : 32) : (G < 5 ? 
8 : 16)); + + fp_type rs[gsize], is[gsize]; + + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; + + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; + } + + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + __syncthreads(); + + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + auto p0 = rstate + 2 * ii + threadIdx.x % 32; + + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + __syncthreads(); + + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + + __syncthreads(); + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 32) = in; + } + } +} + +template +__global__ void ApplyGateL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned esize, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. + + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 
8 : 4)); + + fp_type rs[gsize], is[gsize]; + + __shared__ fp_type v[2 * gsize * rows]; + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + auto p0 = rstate + 2 * ii + threadIdx.x; + + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); + } + + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs0[m][n] = rn; + is0[m][n] = in; + } + } + + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; + } +} + +template +__global__ void ApplyControlledGateH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, unsigned num_mss, idx_type cvalsh, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 64. 
+ + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16)); + + fp_type rs[gsize], is[gsize]; + + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; + + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; + } + + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + __syncthreads(); + + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + ii |= cvalsh; + + auto p0 = rstate + 2 * ii + threadIdx.x % 32; + + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + __syncthreads(); + + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + + __syncthreads(); + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 32) = in; + } + } +} + +template +__global__ void ApplyControlledGateLH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned num_mss, idx_type cvalsh, + unsigned esize, fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. 
+ + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4)); + + fp_type rs[gsize], is[gsize]; + + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; + + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + ii |= cvalsh; + + auto p0 = rstate + 2 * ii + threadIdx.x; + + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); + } + + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs0[m][n] = rn; + is0[m][n] = in; + } + } + + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; + } +} + +template +__global__ void ApplyControlledGateL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + 
const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, const idx_type* __restrict__ cis, + unsigned num_mss, idx_type cvalsh, unsigned esize, unsigned rwthreads, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. + + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4)); + + fp_type rs[gsize], is[gsize]; + + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; + + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + ii |= cvalsh; + + auto p0 = rstate + 2 * ii + cis[threadIdx.x]; + + if (threadIdx.x < rwthreads) { + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); + } + } + + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + 
unsigned n = i / 32; + + rs0[m][n] = rn; + is0[m][n] = in; + } + } + + if (threadIdx.x < rwthreads) { + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; + } + } +} + +template +__global__ void ExpectationValueH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, unsigned num_iterations_per_block, + const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { + // blockDim.x must be equal to 64. + + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 5 ? gsize : (sizeof(fp_type) == 4 ? (G < 6 ? 4 : 8) : 8); + + fp_type rs[gsize], is[gsize]; + + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; + + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; + } + + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + __syncthreads(); + + double re = 0; + double im = 0; + + for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) { + idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter; + + idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + auto p0 = rstate + 2 * ii + threadIdx.x % 32; + + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0 || iter > 0) { + __syncthreads(); + + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + + __syncthreads(); + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); 
++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + re += rs[k] * rn; + re += is[k] * in; + im += rs[k] * in; + im -= is[k] * rn; + } + } + } + + __shared__ cfp_type partial1[64]; + __shared__ cfp_type partial2[2]; + + partial1[threadIdx.x].re = re; + partial1[threadIdx.x].im = im; + + auto val = WarpReduce(partial1[threadIdx.x], op); + + if (threadIdx.x % 32 == 0) { + partial2[threadIdx.x / 32] = val; + } + + __syncthreads(); + + if (threadIdx.x == 0) { + result[blockIdx.x].re = partial2[0].re + partial2[1].re; + result[blockIdx.x].im = partial2[0].im + partial2[1].im; + } +} + +template +__global__ void ExpectationValueL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned num_iterations_per_block, + const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { + // blockDim.x must be equal to 32. + + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = G < 5 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? 4 : 2) : (G < 6 ? 
2 : 1)); + + fp_type rs[gsize], is[gsize]; + + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; + + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + double re = 0; + double im = 0; + + for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) { + idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter); + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + auto p0 = rstate + 2 * ii + threadIdx.x; + + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); + } + + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0 || iter > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + re += rs[k] * rn; + re += is[k] * in; + im += rs[k] * in; + im -= is[k] * rn; + } + } + } + + __shared__ cfp_type partial[32]; + + partial[threadIdx.x].re = re; + partial[threadIdx.x].im = im; + + auto val = WarpReduce(partial[threadIdx.x], op); + + if (threadIdx.x == 0) { + result[blockIdx.x].re = val.re; + result[blockIdx.x].im = val.im; + } +} + +} // namespace qsim + +#endif // SIMULATOR_CUDA_KERNELS_H_ diff --git a/qsim/simulator_custatevec.h b/qsim/simulator_custatevec.h new file 
mode 100644 index 0000000..40d1902 --- /dev/null +++ b/qsim/simulator_custatevec.h @@ -0,0 +1,209 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_CUSTATEVEC_H_ +#define SIMULATOR_CUSTATEVEC_H_ + +#include +#include +#include + +#include +#include +#include + +#include "io.h" +#include "statespace_custatevec.h" +#include "util_custatevec.h" + +namespace qsim { + +/** + * Quantum circuit simulator using the NVIDIA cuStateVec library. + */ +template +class SimulatorCuStateVec final { + public: + using StateSpace = StateSpaceCuStateVec; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + static constexpr auto kStateType = StateSpace::kStateType; + static constexpr auto kMatrixType = StateSpace::kMatrixType; + static constexpr auto kExpectType = StateSpace::kExpectType; + static constexpr auto kComputeType = StateSpace::kComputeType; + static constexpr auto kMatrixLayout = StateSpace::kMatrixLayout; + + explicit SimulatorCuStateVec(const cublasHandle_t& cublas_handle, + const custatevecHandle_t& custatevec_handle) + : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle), + workspace_(nullptr), workspace_size_(0) {} + + ~SimulatorCuStateVec() { + ErrorCheck(cudaFree(workspace_)); + } + + /** + * Applies a gate using the NVIDIA cuStateVec library. + * @param qs Indices of the qubits affected by this gate. 
+ * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + if (qs.size() == 0) { + uint64_t size = uint64_t{1} << state.num_qubits(); + + if (StateSpace::is_float) { + cuComplex a = {matrix[0], matrix[1]}; + auto p = (cuComplex*) state.get(); + ErrorCheck(cublasCscal(cublas_handle_, size, &a, p, 1)); + } else { + cuDoubleComplex a = {matrix[0], matrix[1]}; + auto p = (cuDoubleComplex*) state.get(); + ErrorCheck(cublasZscal(cublas_handle_, size, &a, p, 1)); + } + } else { + auto workspace_size = ApplyGateWorkSpaceSize( + state.num_qubits(), qs.size(), 0, matrix); + AllocWorkSpace(workspace_size); + + ErrorCheck(custatevecApplyMatrix( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0, + (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0, + kComputeType, workspace_, workspace_size)); + } + } + + /** + * Applies a controlled gate using the NVIDIA cuStateVec library. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cmask Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cmask, + const fp_type* matrix, State& state) const { + if (qs.size() == 0) { + IO::errorf( + "error: controlled global phase gate is not implemented %s %d\n", + __FILE__, __LINE__); + exit(1); + } else { + std::vector control_bits; + control_bits.reserve(cqs.size()); + + for (std::size_t i = 0; i < cqs.size(); ++i) { + control_bits.push_back((cmask >> i) & 1); + } + + auto workspace_size = ApplyGateWorkSpaceSize( + state.num_qubits(), qs.size(), cqs.size(), matrix); + AllocWorkSpace(workspace_size); + + ErrorCheck(custatevecApplyMatrix( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0, + (int32_t*) qs.data(), qs.size(), + (int32_t*) cqs.data(), control_bits.data(), cqs.size(), + kComputeType, workspace_, workspace_size)); + } + } + + /** + * Computes the expectation value of an operator using the NVIDIA cuStateVec + * library. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto workspace_size = ExpectationValueWorkSpaceSize( + state.num_qubits(), qs.size(), matrix); + AllocWorkSpace(workspace_size); + + cuDoubleComplex eval; + + ErrorCheck(custatevecComputeExpectation( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), &eval, kExpectType, nullptr, matrix, + kMatrixType, kMatrixLayout, (int32_t*) qs.data(), qs.size(), + kComputeType, workspace_, workspace_size)); + + return {cuCreal(eval), cuCimag(eval)}; + } + + /** + * @return The size of SIMD register if applicable. 
+ */ + static unsigned SIMDRegisterSize() { + return 32; + } + + private: + size_t ApplyGateWorkSpaceSize( + unsigned num_qubits, unsigned num_targets, unsigned num_controls, + const fp_type* matrix) const { + size_t size; + + ErrorCheck(custatevecApplyMatrixGetWorkspaceSize( + custatevec_handle_, kStateType, num_qubits, matrix, + kMatrixType, kMatrixLayout, 0, num_targets, num_controls, + kComputeType, &size)); + + return size; + } + + size_t ExpectationValueWorkSpaceSize( + unsigned num_qubits, unsigned num_targets, const fp_type* matrix) const { + size_t size; + + ErrorCheck(custatevecComputeExpectationGetWorkspaceSize( + custatevec_handle_, kStateType, num_qubits, matrix, + kMatrixType, kMatrixLayout, num_targets, kComputeType, + &size)); + + return size; + } + + void* AllocWorkSpace(size_t size) const { + if (size > workspace_size_) { + if (workspace_ != nullptr) { + ErrorCheck(cudaFree(workspace_)); + } + + ErrorCheck(cudaMalloc(const_cast(&workspace_), size)); + + const_cast(workspace_size_) = size; + } + + return workspace_; + } + + const cublasHandle_t cublas_handle_; + const custatevecHandle_t custatevec_handle_; + + void* workspace_; + size_t workspace_size_; +}; + +} // namespace qsim + +#endif // SIMULATOR_CUSTATEVEC_H_ diff --git a/qsim/simulator_sse.h b/qsim/simulator_sse.h new file mode 100644 index 0000000..5256c53 --- /dev/null +++ b/qsim/simulator_sse.h @@ -0,0 +1,864 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SIMULATOR_SSE_H_
+#define SIMULATOR_SSE_H_
+
+#include
+
+#include
+#include
+#include
+#include
+
+#include "simulator.h"
+#include "statespace_sse.h"
+
+namespace qsim {
+
+/**
+ * Quantum circuit simulator with SSE vectorization.
+ *
+ * The kernels below process four amplitudes per __m128 register: the four
+ * real parts are loaded from p0 + xss[k] and the four imaginary parts from
+ * four floats further on. Target qubits with index 0 or 1 ("low" qubits)
+ * select lanes inside a register and are handled with shuffles; qubits with
+ * a larger index ("high" qubits) are handled through the xss offsets.
+ */
+template
+class SimulatorSSE final : public SimulatorBase {
+ public:
+  using StateSpace = StateSpaceSSE;
+  using State = typename StateSpace::State;
+  using fp_type = typename StateSpace::fp_type;
+
+  // Passes the constructor arguments on to the For member (as lvalues).
+  template
+  explicit SimulatorSSE(ForArgs&&... args) : for_(args...) {}
+
+  /**
+   * Applies a gate using SSE instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyGate(const std::vector& qs,
+                 const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    // Dispatch on the number of target qubits and on how many of them are
+    // low (index 0 or 1). With qs sorted, qs[0] > 1 means no low qubit and
+    // qs[1] > 1 means exactly one. ApplyGateL's template arguments are the
+    // number of high targets followed by the number of low targets.
+    switch (qs.size()) {
+    case 0:
+      ApplyGateH<0>(qs, matrix, state);
+      break;
+    case 1:
+      if (qs[0] > 1) {
+        ApplyGateH<1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 1) {
+        ApplyGateH<2>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<1, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 1) {
+        ApplyGateH<3>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<2, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<1, 2>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 1) {
+        ApplyGateH<4>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<3, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<2, 2>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 1) {
+        ApplyGateH<5>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<4, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<3, 2>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 1) {
+        ApplyGateH<6>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        ApplyGateL<5, 1>(qs, matrix, state);
+      } else {
+        ApplyGateL<4, 2>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented.
+      // Gates on more than 6 qubits leave the state untouched, silently.
+      break;
+    }
+  }
+
+  /**
+   * Applies a controlled gate using SSE instructions.
+   * @param qs Indices of the qubits affected by this gate.
+   * @param cqs Indices of control qubits.
+   * @param cvals Bit mask of control qubit values.
+   * @param matrix Matrix representation of the gate to be applied.
+   * @param state The state of the system, to be updated by this method.
+   */
+  void ApplyControlledGate(const std::vector& qs,
+                           const std::vector& cqs, uint64_t cvals,
+                           const fp_type* matrix, State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+    // Assume cqs[0] < cqs[1] < cqs[2] < ... .
+
+    if (cqs.size() == 0) {
+      ApplyGate(qs, matrix, state);
+      return;
+    }
+
+    // Same target-qubit dispatch as ApplyGate, refined by whether all control
+    // qubits are high (cqs[0] > 1, given sorted cqs). For the all-high-target
+    // case this selects the HH or HL kernel; for ApplyControlledGateL it is
+    // passed as the third template argument (1 = all controls high).
+    switch (qs.size()) {
+    case 0:
+      if (cqs[0] > 1) {
+        ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state);
+      } else {
+        ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state);
+      }
+      break;
+    case 1:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 2:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 3:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    case 4:
+      if (qs[0] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state);
+        }
+      } else if (qs[1] > 1) {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state);
+        }
+      } else {
+        if (cqs[0] > 1) {
+          ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state);
+        } else {
+          ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state);
+        }
+      }
+      break;
+    default:
+      // Not implemented.
+      // Controlled gates with more than 4 target qubits leave the state
+      // untouched, silently.
+      break;
+    }
+  }
+
+  /**
+   * Computes the expectation value of an operator using SSE instructions.
+   * @param qs Indices of the qubits the operator acts on.
+   * @param matrix The operator matrix.
+   * @param state The state of the system.
+   * @return The computed expectation value.
+   */
+  std::complex ExpectationValue(const std::vector& qs,
+                                const fp_type* matrix,
+                                const State& state) const {
+    // Assume qs[0] < qs[1] < qs[2] < ... .
+
+    // Same dispatch as ApplyGate, except that there is no case for an empty
+    // qs: that and any size above 6 fall through to `return 0` below.
+    switch (qs.size()) {
+    case 1:
+      if (qs[0] > 1) {
+        return ExpectationValueH<1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 1>(qs, matrix, state);
+      }
+      break;
+    case 2:
+      if (qs[0] > 1) {
+        return ExpectationValueH<2>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<1, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<0, 2>(qs, matrix, state);
+      }
+      break;
+    case 3:
+      if (qs[0] > 1) {
+        return ExpectationValueH<3>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<2, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<1, 2>(qs, matrix, state);
+      }
+      break;
+    case 4:
+      if (qs[0] > 1) {
+        return ExpectationValueH<4>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<3, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<2, 2>(qs, matrix, state);
+      }
+      break;
+    case 5:
+      if (qs[0] > 1) {
+        return ExpectationValueH<5>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<4, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<3, 2>(qs, matrix, state);
+      }
+      break;
+    case 6:
+      if (qs[0] > 1) {
+        return ExpectationValueH<6>(qs, matrix, state);
+      } else if (qs[1] > 1) {
+        return ExpectationValueL<5, 1>(qs, matrix, state);
+      } else {
+        return ExpectationValueL<4, 2>(qs, matrix, state);
+      }
+      break;
+    default:
+      // Not implemented.
+      break;
+    }
+
+    return 0;
+  }
+
+  /**
+   * @return The size of SIMD register if applicable.
+   */
+  static unsigned SIMDRegisterSize() {
+    return 4;
+  }
+
+ private:
+  // Applies a gate whose H target qubits are all high. Each work item i
+  // handles one group of (1 << H) registers; the gate matrix entries are
+  // broadcast to all four lanes.
+  template
+  void ApplyGateH(const std::vector& qs,
+                  const fp_type* matrix, State& state) const {
+    // The first two lambda parameters (n, m) are not used in the body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      // Four amplitudes per register.
+      i *= 4;
+
+      // Spread the bits of i over the ranges selected by ms[0..H]; the masks
+      // come from FillIndices, which is defined outside this file.
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      uint64_t j = 0;
+
+      // Row k of the matrix times the loaded vector, complex arithmetic.
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm_set1_ps(v[j]);
+        iu = _mm_set1_ps(v[j + 1]);
+        rn = _mm_mul_ps(rs[0], ru);
+        in = _mm_mul_ps(rs[0], iu);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices(state.num_qubits(), qs, ms, xss);
+
+    // One work item per group of 4 * (1 << H) amplitudes, at least one item.
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, state.get());
+  }
+
+  // Applies a gate with H high and L low target qubits. The low qubits are
+  // handled by adding lane-permuted copies of each loaded register and by
+  // using a per-lane matrix w prepared by FillMatrix (defined elsewhere).
+  template
+  void ApplyGateL(const std::vector& qs,
+                  const fp_type* matrix, State& state) const {
+    // The first two lambda parameters (n, m) are not used in the body.
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss,
+                unsigned q0, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];
+
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
+
+        // Shuffle immediates, as source lane order: 177 -> (1, 0, 3, 2),
+        // 78 -> (2, 3, 0, 1), 57 -> (1, 2, 3, 0), 147 -> (3, 0, 1, 2).
+        if (L == 1) {
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      uint64_t j = 0;
+
+      // hsize output registers, each a sum over all gsize inputs.
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks11(qs);
+
+    FillIndices(state.num_qubits(), qs, ms, xss);
+    FillMatrix(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    // qs[0] is passed as q0; the lambda only reads it when L == 1.
+    for_.Run(size, f, w, ms, xss, qs[0], state.get());
+  }
+
+  // Controlled gate with all targets high and all controls high: identical
+  // to ApplyGateH except that groups whose index bits under cmaskh differ
+  // from cvalsh are skipped.
+  template
+  void ApplyControlledGateHH(const std::vector& qs,
+                             const std::vector& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      // Control condition not met for this group: leave it unchanged.
+      if ((ii & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + 2 * ii;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm_set1_ps(v[j]);
+        iu = _mm_set1_ps(v[j + 1]);
+        rn = _mm_mul_ps(rs[0], ru);
+        in = _mm_mul_ps(rs[0], iu);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals);
+    FillIndices(state.num_qubits(), qs, ms, xss);
+
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  // Controlled gate with all targets high and at least one low control. The
+  // high controls are checked per group as in ApplyControlledGateHH; the
+  // per-lane matrix w is built by FillControlledMatrixH from the low control
+  // values and mask (presumably leaving non-matching lanes unchanged - that
+  // helper is defined outside this file).
+  template
+  void ApplyControlledGateHL(const std::vector& qs,
+                             const std::vector& cqs, uint64_t cvals,
+                             const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+
+      __m128 rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      if ((ii & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + 2 * ii;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H)];
+
+    auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals);
+    FillIndices(state.num_qubits(), qs, ms, xss);
+    FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w);
+
+    unsigned r = 2 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get());
+  }
+
+  // Controlled gate with H high and L low target qubits. The kernel is the
+  // ApplyGateL kernel plus the high-control check; CH selects how the masks
+  // and the per-lane matrix w are prepared (see the end of the function).
+  template
+  void ApplyControlledGateL(const std::vector& qs,
+                            const std::vector& cqs, uint64_t cvals,
+                            const fp_type* matrix, State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh,
+                uint64_t cmaskh, unsigned q0, fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];
+
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      if ((ii & cmaskh) != cvalsh) return;
+
+      auto p0 = rstate + 2 * ii;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
+
+        // Same lane permutations as in ApplyGateL.
+        if (L == 1) {
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        _mm_store_ps(p0 + xss[k], rn);
+        _mm_store_ps(p0 + xss[k] + 4, in);
+      }
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H + L)];
+
+    FillIndices(state.num_qubits(), qs, ms, xss);
+
+    unsigned r = 2 + H;
+    unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    if (CH) {
+      // All controls are high: the plain matrix is expanded with FillMatrix
+      // and the controls are enforced only through cvalsh/cmaskh.
+      auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals);
+      FillMatrix(m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get());
+    } else {
+      // Some controls are low: their values and mask are passed to
+      // FillControlledMatrixL when building w.
+      auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals);
+      FillControlledMatrixL(
+          m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w);
+
+      for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get());
+    }
+  }
+
+  // Expectation value for an operator on H high qubits. Each work item
+  // returns the sum over its amplitudes a of conj(a) * (M a); the items are
+  // combined by for_.RunReduce with std::plus.
+  template
+  std::complex ExpectationValueH(const std::vector& qs,
+                                 const fp_type* matrix,
+                                 const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v,
+                const uint64_t* ms, const uint64_t* xss,
+                const fp_type* rstate) {
+      constexpr unsigned hsize = 1 << H;
+
+      __m128 ru, iu, rn, in;
+      __m128 rs[hsize], is[hsize];
+
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rs[k] = _mm_load_ps(p0 + xss[k]);
+        is[k] = _mm_load_ps(p0 + xss[k] + 4);
+      }
+
+      // Accumulate in double even though the lanes are single precision.
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        ru = _mm_set1_ps(v[j]);
+        iu = _mm_set1_ps(v[j + 1]);
+        rn = _mm_mul_ps(rs[0], ru);
+        in = _mm_mul_ps(rs[0], iu);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], ru));
+
+        j += 2;
+
+        for (unsigned l = 1; l < hsize; ++l) {
+          ru = _mm_set1_ps(v[j]);
+          iu = _mm_set1_ps(v[j + 1]);
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], ru));
+
+          j += 2;
+        }
+
+        // Per lane: conj(rs[k] + i * is[k]) * (rn + i * in).
+        __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[k], rn), _mm_mul_ps(is[k], in));
+        __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[k], in), _mm_mul_ps(is[k], rn));
+
+        re += detail::HorizontalSumSSE(v_re);
+        im += detail::HorizontalSumSSE(v_im);
+      }
+
+      return std::complex{re, im};
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+
+    FillIndices(state.num_qubits(), qs, ms, xss);
+
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    using Op = std::plus>;
+    return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get());
+  }
+
+  // Expectation value for an operator on H high and L low qubits. The
+  // matrix-vector product follows ApplyGateL; the result for output register
+  // k is paired with the unshuffled input register rs/is[lsize * k].
+  template
+  std::complex ExpectationValueL(const std::vector& qs,
+                                 const fp_type* matrix,
+                                 const State& state) const {
+    auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w,
+                const uint64_t* ms, const uint64_t* xss, unsigned q0,
+                const fp_type* rstate) {
+      constexpr unsigned gsize = 1 << (H + L);
+      constexpr unsigned hsize = 1 << H;
+      constexpr unsigned lsize = 1 << L;
+
+      __m128 rn, in;
+      __m128 rs[gsize], is[gsize];
+
+      i *= 4;
+
+      uint64_t ii = i & ms[0];
+      for (unsigned j = 1; j <= H; ++j) {
+        i *= 2;
+        ii |= i & ms[j];
+      }
+
+      auto p0 = rstate + 2 * ii;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        unsigned k2 = lsize * k;
+
+        rs[k2] = _mm_load_ps(p0 + xss[k]);
+        is[k2] = _mm_load_ps(p0 + xss[k] + 4);
+
+        // Same lane permutations as in ApplyGateL.
+        if (L == 1) {
+          rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177)
+                               : _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177)
+                               : _mm_shuffle_ps(is[k2], is[k2], 78);
+        } else if (L == 2) {
+          rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57);
+          is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57);
+          rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78);
+          is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78);
+          rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147);
+          is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147);
+        }
+      }
+
+      double re = 0;
+      double im = 0;
+      uint64_t j = 0;
+
+      for (unsigned k = 0; k < hsize; ++k) {
+        rn = _mm_mul_ps(rs[0], w[j]);
+        in = _mm_mul_ps(rs[0], w[j + 1]);
+        rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1]));
+        in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j]));
+
+        j += 2;
+
+        for (unsigned l = 1; l < gsize; ++l) {
+          rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j]));
+          in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1]));
+          rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1]));
+          in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j]));
+
+          j += 2;
+        }
+
+        // Index of the unshuffled register that was loaded for output k.
+        unsigned m = lsize * k;
+
+        __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in));
+        __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn));
+
+        re += detail::HorizontalSumSSE(v_re);
+        im += detail::HorizontalSumSSE(v_im);
+      }
+
+      return std::complex{re, im};
+    };
+
+    uint64_t ms[H + 1];
+    uint64_t xss[1 << H];
+    __m128 w[1 << (1 + 2 * H + L)];
+
+    auto m = GetMasks11(qs);
+
+    FillIndices(state.num_qubits(), qs, ms, xss);
+    FillMatrix(m.qmaskl, matrix, (fp_type*) w);
+
+    unsigned k = 2 + H;
+    unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0;
+    uint64_t size = uint64_t{1} << n;
+
+    using Op = std::plus>;
+    return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], state.get());
+  }
+
+  // Loop runner used by every kernel above (Run and RunReduce).
+  For for_;
+};
+
+}  // namespace qsim
+
+#endif  // SIMULATOR_SSE_H_
diff --git a/qsim/statespace.h b/qsim/statespace.h
new file mode 100644
index 0000000..2b0c9af
--- /dev/null
+++ b/qsim/statespace.h
@@ -0,0 +1,145 @@
+// Copyright 2019 Google LLC. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_H_ +#define STATESPACE_H_ + +#include +#include +#include +#include + +#include "util.h" + +namespace qsim { + +/** + * Abstract class containing context and routines for general state-vector + * manipulations. "AVX", "AVX512", "Basic", and "SSE" implementations are + * provided. + */ +template class VectorSpace, typename... VSTypeParams> +class StateSpace : public VectorSpace { + private: + using Base = VectorSpace; + + public: + using fp_type = typename Base::fp_type; + using State = typename Base::Vector; + + /** + * The observed state from a Measurement gate. + */ + struct MeasurementResult { + /** + * A bitmask of all qubits measured in this result. In this format, if the + * qubit at index `i` is measured, the `i`th bit of `mask` is a one. + */ + uint64_t mask; + /** + * A bitwise representation of the measured states. In this format, the + * qubit at index `i` is represented by the `i`th bit of `bits`. + * If `valid` is true, `mask` has already been applied to this field + * (i.e. `bits == bits & mask`). + */ + uint64_t bits; + /** + * Observed states of the measured qubits. This vector only includes qubits + * specified by the associated Measurement gate. + */ + std::vector bitstring; + /** + * Validation bit. If this is false, the measurement failed and all other + * fields of the result are invalid. + */ + bool valid; + }; + + template + StateSpace(Args&&... 
args) : Base(args...) {} + + double Norm(const State& state) const { + auto partial_norms = static_cast(*this).PartialNorms(state); + + double norm = partial_norms[0]; + for (std::size_t i = 1; i < partial_norms.size(); ++i) { + norm += partial_norms[i]; + } + + return norm; + } + + template + MeasurementResult Measure(const std::vector& qubits, + RGen& rgen, State& state) const { + auto result = + static_cast(*this).VirtualMeasure(qubits, rgen, state); + + if (result.valid) { + static_cast(*this).Collapse(result, state); + } + + return result; + } + + template + MeasurementResult VirtualMeasure(const std::vector& qubits, + RGen& rgen, const State& state) const { + MeasurementResult result; + + result.valid = true; + result.mask = 0; + + for (auto q : qubits) { + if (q >= state.num_qubits()) { + result.valid = false; + return result; + } + + result.mask |= uint64_t{1} << q; + } + + auto partial_norms = static_cast(*this).PartialNorms(state); + + for (std::size_t i = 1; i < partial_norms.size(); ++i) { + partial_norms[i] += partial_norms[i - 1]; + } + + auto norm = partial_norms.back(); + auto r = RandomValue(rgen, norm); + + unsigned m = 0; + while (r > partial_norms[m]) ++m; + if (m > 0) { + r -= partial_norms[m - 1]; + } + + result.bits = static_cast(*this).FindMeasuredBits( + m, r, result.mask, state); + + result.bitstring.reserve(qubits.size()); + result.bitstring.resize(0); + + for (auto q : qubits) { + result.bitstring.push_back((result.bits >> q) & 1); + } + + return result; + } +}; + +} // namespace qsim + +#endif // STATESPACE_H_ diff --git a/qsim/statespace_avx.h b/qsim/statespace_avx.h new file mode 100644 index 0000000..876058b --- /dev/null +++ b/qsim/statespace_avx.h @@ -0,0 +1,497 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_AVX_H_ +#define STATESPACE_AVX_H_ + +#include + +#include +#include +#include +#include +#include + +#include "statespace.h" +#include "util.h" +#include "vectorspace.h" + +namespace qsim { + +namespace detail { + +inline __m256i GetZeroMaskAVX(uint64_t i, uint64_t mask, uint64_t bits) { + __m256i s1 = _mm256_setr_epi64x(i + 0, i + 2, i + 4, i + 6); + __m256i s2 = _mm256_setr_epi64x(i + 1, i + 3, i + 5, i + 7); + __m256i ma = _mm256_set1_epi64x(mask); + __m256i bi = _mm256_set1_epi64x(bits); + + s1 = _mm256_and_si256(s1, ma); + s2 = _mm256_and_si256(s2, ma); + + s1 = _mm256_cmpeq_epi64(s1, bi); + s2 = _mm256_cmpeq_epi64(s2, bi); + + return _mm256_blend_epi32(s1, s2, 170); // 10101010 +} + +inline double HorizontalSumAVX(__m256 s) { + __m128 l = _mm256_castps256_ps128(s); + __m128 h = _mm256_extractf128_ps(s, 1); + __m128 s1 = _mm_add_ps(h, l); + __m128 s1s = _mm_movehdup_ps(s1); + __m128 s2 = _mm_add_ps(s1, s1s); + + return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2))); +} + +} // namespace detail + +/** + * Object containing context and routines for AVX state-vector manipulations. + * State is a vectorized sequence of eight real components followed by eight + * imaginary components. Eight single-precison floating numbers can be loaded + * into an AVX register. 
+ */ +template +class StateSpaceAVX : + public StateSpace, VectorSpace, For, float> { + private: + using Base = StateSpace, qsim::VectorSpace, For, float>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + template + explicit StateSpaceAVX(ForArgs&&... args) : Base(args...) {} + + static uint64_t MinSize(unsigned num_qubits) { + return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits)); + }; + + void InternalToNormalOrder(State& state) const { + if (state.num_qubits() == 1) { + fp_type* s = state.get(); + + s[2] = s[1]; + s[1] = s[8]; + s[3] = s[9]; + + for (uint64_t i = 4; i < 16; ++i) { + s[i] = 0; + } + } else if (state.num_qubits() == 2) { + fp_type* s = state.get(); + + s[6] = s[3]; + s[4] = s[2]; + s[2] = s[1]; + s[1] = s[8]; + s[3] = s[9]; + s[5] = s[10]; + s[7] = s[11]; + + for (uint64_t i = 8; i < 16; ++i) { + s[i] = 0; + } + } else { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + fp_type* s = p + 16 * i; + + fp_type re[7]; + fp_type im[7]; + + for (uint64_t i = 0; i < 7; ++i) { + re[i] = s[i + 1]; + im[i] = s[i + 8]; + } + + for (uint64_t i = 0; i < 7; ++i) { + s[2 * i + 1] = im[i]; + s[2 * i + 2] = re[i]; + } + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get()); + } + } + + void NormalToInternalOrder(State& state) const { + if (state.num_qubits() == 1) { + fp_type* s = state.get(); + + s[8] = s[1]; + s[1] = s[2]; + s[9] = s[3]; + + for (uint64_t i = 2; i < 8; ++i) { + s[i] = 0; + s[i + 8] = 0; + } + } else if (state.num_qubits() == 2) { + fp_type* s = state.get(); + + s[8] = s[1]; + s[9] = s[3]; + s[10] = s[5]; + s[11] = s[7]; + s[1] = s[2]; + s[2] = s[4]; + s[3] = s[6]; + + for (uint64_t i = 4; i < 8; ++i) { + s[i] = 0; + s[i + 8] = 0; + } + } else { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + fp_type* s = p + 16 * i; + + fp_type re[7]; + fp_type im[7]; + + for (uint64_t i = 0; i < 7; ++i) { + im[i] = s[2 * i + 1]; + re[i] = s[2 * i + 2]; + } 
+ + for (uint64_t i = 0; i < 7; ++i) { + s[i + 1] = re[i]; + s[i + 8] = im[i]; + } + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get()); + } + } + + void SetAllZeros(State& state) const { + __m256 val0 = _mm256_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) { + _mm256_store_ps(p + 16 * i, val); + _mm256_store_ps(p + 16 * i + 8, val); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get()); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + __m256 val0 = _mm256_setzero_ps(); + __m256 valu; + + fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); + + switch (state.num_qubits()) { + case 1: + valu = _mm256_set_ps(0, 0, 0, 0, 0, 0, v, v); + break; + case 2: + valu = _mm256_set_ps(0, 0, 0, 0, v, v, v, v); + break; + default: + valu = _mm256_set1_ps(v); + break; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + __m256& val0, __m256 valu, fp_type* p) { + _mm256_store_ps(p + 16 * i, valu); + _mm256_store_ps(p + 16 * i + 8, val0); + }; + + Base::for_.Run( + MinSize(state.num_qubits()) / 16, f, val0, valu, state.get()); + } + + // |0> state. + void SetStateZero(State& state) const { + SetAllZeros(state); + state.get()[0] = 1; + } + + static std::complex GetAmpl(const State& state, uint64_t i) { + uint64_t k = (16 * (i / 8)) + (i % 8); + return std::complex(state.get()[k], state.get()[k + 8]); + } + + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + uint64_t k = (16 * (i / 8)) + (i % 8); + state.get()[k] = std::real(ampl); + state.get()[k + 8] = std::imag(ampl); + } + + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + uint64_t k = (16 * (i / 8)) + (i % 8); + state.get()[k] = re; + state.get()[k + 8] = im; + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. 
+ void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + __m256 re_reg = _mm256_set1_ps(re); + __m256 im_reg = _mm256_set1_ps(im); + + __m256i exclude_reg = _mm256_setzero_si256(); + if (exclude) { + exclude_reg = _mm256_cmpeq_epi32(exclude_reg, exclude_reg); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, + uint64_t bitsv, __m256 re_n, __m256 im_n, __m256i exclude_n, + fp_type* p) { + __m256 ml = _mm256_castsi256_ps(_mm256_xor_si256( + detail::GetZeroMaskAVX(8 * i, maskv, bitsv), exclude_n)); + + __m256 re = _mm256_load_ps(p + 16 * i); + __m256 im = _mm256_load_ps(p + 16 * i + 8); + + re = _mm256_blendv_ps(re, re_n, ml); + im = _mm256_blendv_ps(im, im_n, ml); + + _mm256_store_ps(p + 16 * i, re); + _mm256_store_ps(p + 16 * i + 8, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, mask, bits, re_reg, + im_reg, exclude_reg, state.get()); + } + + // Does the equivalent of dest += src elementwise. 
+ bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, fp_type* p2) { + __m256 re1 = _mm256_load_ps(p1 + 16 * i); + __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); + __m256 re2 = _mm256_load_ps(p2 + 16 * i); + __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); + + _mm256_store_ps(p2 + 16 * i, _mm256_add_ps(re1, re2)); + _mm256_store_ps(p2 + 16 * i + 8, _mm256_add_ps(im1, im2)); + }; + + Base::for_.Run(MinSize(src.num_qubits()) / 16, f, src.get(), dest.get()); + + return true; + } + + // Does the equivalent of state *= a elementwise. + void Multiply(fp_type a, State& state) const { + __m256 r = _mm256_set1_ps(a); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m256 r, fp_type* p) { + __m256 re = _mm256_load_ps(p + 16 * i); + __m256 im = _mm256_load_ps(p + 16 * i + 8); + + re = _mm256_mul_ps(re, r); + im = _mm256_mul_ps(im, r); + + _mm256_store_ps(p + 16 * i, re); + _mm256_store_ps(p + 16 * i + 8, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, r, state.get()); + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> std::complex { + __m256 re1 = _mm256_load_ps(p1 + 16 * i); + __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); + __m256 re2 = _mm256_load_ps(p2 + 16 * i); + __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); + + __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2)); + __m256 ip_im = _mm256_fnmadd_ps(im1, re2, _mm256_mul_ps(re1, im2)); + + double re = detail::HorizontalSumAVX(ip_re); + double im = detail::HorizontalSumAVX(ip_im); + + return std::complex{re, im}; + }; + + using Op = std::plus>; + return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f, + Op(), state1.get(), state2.get()); + } + 
+ double RealInnerProduct(const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> double { + __m256 re1 = _mm256_load_ps(p1 + 16 * i); + __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); + __m256 re2 = _mm256_load_ps(p2 + 16 * i); + __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); + + __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2)); + + return detail::HorizontalSumAVX(ip_re); + }; + + using Op = std::plus; + return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f, + Op(), state1.get(), state2.get()); + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + double norm = 0; + uint64_t size = MinSize(state.num_qubits()) / 16; + const fp_type* p = state.get(); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 8; ++j) { + double re = p[16 * k + j]; + double im = p[16 * k + 8 + j]; + norm += re * re + im * im; + } + } + + auto rs = GenerateRandomValues(num_samples, seed, norm); + + uint64_t m = 0; + double csum = 0; + bitstrings.reserve(num_samples); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 8; ++j) { + double re = p[16 * k + j]; + double im = p[16 * k + 8 + j]; + csum += re * re + im * im; + while (rs[m] < csum && m < num_samples) { + bitstrings.emplace_back(8 * k + j); + ++m; + } + } + } + + for (; m < num_samples; ++m) { + bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + void Collapse(const MeasurementResult& mr, State& state) const { + auto f1 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, const fp_type* p) -> double { + __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits); + + __m256 re = 
_mm256_maskload_ps(p + 16 * i, ml); + __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml); + __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re)); + + return detail::HorizontalSumAVX(s1); + }; + + using Op = std::plus; + double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 16, f1, + Op(), mr.mask, mr.bits, state.get()); + + __m256 renorm = _mm256_set1_ps(1.0 / std::sqrt(norm)); + + auto f2 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, __m256 renorm, fp_type* p) { + __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits); + + __m256 re = _mm256_maskload_ps(p + 16 * i, ml); + __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml); + + re = _mm256_mul_ps(re, renorm); + im = _mm256_mul_ps(im, renorm); + + _mm256_store_ps(p + 16 * i, re); + _mm256_store_ps(p + 16 * i + 8, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f2, + mr.mask, mr.bits, renorm, state.get()); + } + + std::vector PartialNorms(const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p) -> double { + __m256 re = _mm256_load_ps(p + 16 * i); + __m256 im = _mm256_load_ps(p + 16 * i + 8); + __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re)); + + return detail::HorizontalSumAVX(s1); + }; + + using Op = std::plus; + return Base::for_.RunReduceP( + MinSize(state.num_qubits()) / 16, f, Op(), state.get()); + } + + uint64_t FindMeasuredBits( + unsigned m, double r, uint64_t mask, const State& state) const { + double csum = 0; + + uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 16, m); + uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 16, m); + + const fp_type* p = state.get(); + + for (uint64_t k = k0; k < k1; ++k) { + for (uint64_t j = 0; j < 8; ++j) { + auto re = p[16 * k + j]; + auto im = p[16 * k + j + 8]; + csum += re * re + im * im; + if (r < csum) { + return (8 * k + j) & mask; + } + } + } + + // Return the last bitstring in the unlikely case of underflow. 
+ return (8 * k1 - 1) & mask; + } +}; + +} // namespace qsim + +#endif // STATESPACE_AVX_H_ diff --git a/qsim/statespace_avx512.h b/qsim/statespace_avx512.h new file mode 100644 index 0000000..879fd89 --- /dev/null +++ b/qsim/statespace_avx512.h @@ -0,0 +1,448 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_AVX512_H_ +#define STATESPACE_AVX512_H_ + +#include + +#include +#include +#include +#include +#include + +#include "statespace.h" +#include "util.h" +#include "vectorspace.h" + +namespace qsim { + +namespace detail { + +inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) { + __m512i s1 = _mm512_setr_epi64( + i + 0, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7); + __m512i s2 = _mm512_setr_epi64( + i + 8, i + 9, i + 10, i + 11, i + 12, i + 13, i + 14, i + 15); + __m512i ma = _mm512_set1_epi64(mask); + __m512i bi = _mm512_set1_epi64(bits); + + s1 = _mm512_and_si512(s1, ma); + s2 = _mm512_and_si512(s2, ma); + + unsigned m1 = _mm512_cmpeq_epu64_mask(s1, bi); + unsigned m2 = _mm512_cmpeq_epu64_mask(s2, bi); + + return (m2 << 8) | m1; +} + +inline double HorizontalSumAVX(__m256 s) { + __m128 l = _mm256_castps256_ps128(s); + __m128 h = _mm256_extractf128_ps(s, 1); + __m128 s1 = _mm_add_ps(h, l); + __m128 s1s = _mm_movehdup_ps(s1); + __m128 s2 = _mm_add_ps(s1, s1s); + + return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2))); +} + +inline double 
HorizontalSumAVX512(__m512 s) { + __m256 l = _mm512_castps512_ps256(s); + __m512d sd = _mm512_castps_pd(s); + __m256d hd = _mm512_extractf64x4_pd(sd, 1); + __m256 h = _mm256_castpd_ps(hd); + __m256 p = _mm256_add_ps(h, l); + + return HorizontalSumAVX(p); +} + +} // namespace detail + +/** + * Object containing context and routines for AVX state-vector manipulations. + * State is a vectorized sequence of sixteen real components followed by + * sixteen imaginary components. Sixteen single-precison floating numbers can + * be loaded into an AVX512 register. + */ +template +class StateSpaceAVX512 : + public StateSpace, VectorSpace, For, float> { + private: + using Base = StateSpace, qsim::VectorSpace, For, float>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + template + explicit StateSpaceAVX512(ForArgs&&... args) : Base(args...) {} + + static uint64_t MinSize(unsigned num_qubits) { + return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits)); + }; + + void InternalToNormalOrder(State& state) const { + __m512i idx1 = _mm512_setr_epi32( + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + __m512i idx2 = _mm512_setr_epi32( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + auto f = [](unsigned n, unsigned m, uint64_t i, + __m512i idx1, __m512i idx2, fp_type* p) { + __m512 v1 = _mm512_load_ps(p + 32 * i); + __m512 v2 = _mm512_load_ps(p + 32 * i + 16); + + _mm512_store_ps(p + 32 * i, _mm512_permutex2var_ps(v1, idx1, v2)); + _mm512_store_ps(p + 32 * i + 16, _mm512_permutex2var_ps(v1, idx2, v2)); + }; + + Base::for_.Run( + MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get()); + } + + void NormalToInternalOrder(State& state) const { + __m512i idx1 = _mm512_setr_epi32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + __m512i idx2 = _mm512_setr_epi32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); + + auto f = [](unsigned n, unsigned m, uint64_t i, + __m512i 
idx1, __m512i idx2, fp_type* p) { + __m512 re = _mm512_load_ps(p + 32 * i); + __m512 im = _mm512_load_ps(p + 32 * i + 16); + + _mm512_store_ps(p + 32 * i, _mm512_permutex2var_ps(re, idx1, im)); + _mm512_store_ps(p + 32 * i + 16, _mm512_permutex2var_ps(re, idx2, im)); + }; + + Base::for_.Run( + MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get()); + } + + void SetAllZeros(State& state) const { + __m512 val0 = _mm512_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) { + _mm512_store_ps(p + 32 * i, val0); + _mm512_store_ps(p + 32 * i + 16, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get()); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + __m512 val0 = _mm512_setzero_ps(); + __m512 valu; + + fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); + + switch (state.num_qubits()) { + case 1: + valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v); + break; + case 2: + valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v); + break; + case 3: + valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v, v, v, v, v); + break; + default: + valu = _mm512_set1_ps(v); + break; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const __m512& val0, const __m512& valu, fp_type* p) { + _mm512_store_ps(p + 32 * i, valu); + _mm512_store_ps(p + 32 * i + 16, val0); + }; + + Base::for_.Run( + MinSize(state.num_qubits()) / 32, f, val0, valu, state.get()); + } + + // |0> state. 
+ void SetStateZero(State& state) const { + SetAllZeros(state); + state.get()[0] = 1; + } + + static std::complex GetAmpl(const State& state, uint64_t i) { + uint64_t p = (32 * (i / 16)) + (i % 16); + return std::complex(state.get()[p], state.get()[p + 16]); + } + + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + uint64_t p = (32 * (i / 16)) + (i % 16); + state.get()[p] = std::real(ampl); + state.get()[p + 16] = std::imag(ampl); + } + + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + uint64_t p = (32 * (i / 16)) + (i % 16); + state.get()[p] = re; + state.get()[p + 16] = im; + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + __m512 re_reg = _mm512_set1_ps(re); + __m512 im_reg = _mm512_set1_ps(im); + + __mmask16 exclude_n = exclude ? 
0xffff : 0; + + auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, + uint64_t bitsv, __m512 re_n, __m512 im_n, __mmask16 exclude_n, + fp_type* p) { + __m512 re = _mm512_load_ps(p + 32 * i); + __m512 im = _mm512_load_ps(p + 32 * i + 16); + + __mmask16 ml = + detail::GetZeroMaskAVX512(16 * i, maskv, bitsv) ^ exclude_n; + + re = _mm512_mask_blend_ps(ml, re, re_n); + im = _mm512_mask_blend_ps(ml, im, im_n); + + _mm512_store_ps(p + 32 * i, re); + _mm512_store_ps(p + 32 * i + 16, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f, mask, bits, + re_reg, im_reg, exclude_n, state.get()); + } + + // Does the equivalent of dest += src elementwise. + bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, fp_type* p2) { + __m512 re1 = _mm512_load_ps(p1 + 32 * i); + __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); + __m512 re2 = _mm512_load_ps(p2 + 32 * i); + __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); + + _mm512_store_ps(p2 + 32 * i, _mm512_add_ps(re1, re2)); + _mm512_store_ps(p2 + 32 * i + 16, _mm512_add_ps(im1, im2)); + }; + + Base::for_.Run(MinSize(src.num_qubits()) / 32, f, src.get(), dest.get()); + + return true; + } + + // Does the equivalent of state *= a elementwise. 
+ void Multiply(fp_type a, State& state) const { + __m512 r = _mm512_set1_ps(a); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m512 r, fp_type* p) { + __m512 re = _mm512_load_ps(p + 32 * i); + __m512 im = _mm512_load_ps(p + 32 * i + 16); + + _mm512_store_ps(p + 32 * i, _mm512_mul_ps(re, r)); + _mm512_store_ps(p + 32 * i + 16, _mm512_mul_ps(im, r)); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f, r, state.get()); + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> std::complex { + __m512 re1 = _mm512_load_ps(p1 + 32 * i); + __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); + __m512 re2 = _mm512_load_ps(p2 + 32 * i); + __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); + + __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2)); + __m512 ip_im = _mm512_fnmadd_ps(im1, re2, _mm512_mul_ps(re1, im2)); + + double re = detail::HorizontalSumAVX512(ip_re); + double im = detail::HorizontalSumAVX512(ip_im); + + return std::complex{re, im}; + }; + + using Op = std::plus>; + return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f, + Op(), state1.get(), state2.get()); + } + + double RealInnerProduct(const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> double { + __m512 re1 = _mm512_load_ps(p1 + 32 * i); + __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); + __m512 re2 = _mm512_load_ps(p2 + 32 * i); + __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); + + __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2)); + + return detail::HorizontalSumAVX512(ip_re); + }; + + using Op = std::plus; + return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f, + Op(), 
state1.get(), state2.get()); + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + double norm = 0; + uint64_t size = MinSize(state.num_qubits()) / 32; + const fp_type* p = state.get(); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 16; ++j) { + double re = p[32 * k + j]; + double im = p[32 * k + 16 + j]; + norm += re * re + im * im; + } + } + + auto rs = GenerateRandomValues(num_samples, seed, norm); + + uint64_t m = 0; + double csum = 0; + bitstrings.reserve(num_samples); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 16; ++j) { + double re = p[32 * k + j]; + double im = p[32 * k + 16 + j]; + csum += re * re + im * im; + while (rs[m] < csum && m < num_samples) { + bitstrings.emplace_back(16 * k + j); + ++m; + } + } + } + + for (; m < num_samples; ++m) { + bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + void Collapse(const MeasurementResult& mr, State& state) const { + auto f1 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, const fp_type* p) -> double { + __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits); + + __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i); + __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16); + __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re)); + + return detail::HorizontalSumAVX512(s1); + }; + + using Op = std::plus; + double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 32, f1, + Op(), mr.mask, mr.bits, state.get()); + + __m512 renorm = _mm512_set1_ps(1.0 / std::sqrt(norm)); + + auto f2 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, __m512 renorm, fp_type* p) { + __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits); + + __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i); + __m512 im = 
_mm512_maskz_load_ps(ml, p + 32 * i + 16); + + re = _mm512_mul_ps(re, renorm); + im = _mm512_mul_ps(im, renorm); + + _mm512_store_ps(p + 32 * i, re); + _mm512_store_ps(p + 32 * i + 16, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f2, + mr.mask, mr.bits, renorm, state.get()); + } + + std::vector PartialNorms(const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p) -> double { + __m512 re = _mm512_load_ps(p + 32 * i); + __m512 im = _mm512_load_ps(p + 32 * i + 16); + __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re)); + + return detail::HorizontalSumAVX512(s1); + }; + + using Op = std::plus; + return Base::for_.RunReduceP( + MinSize(state.num_qubits()) / 32, f, Op(), state.get()); + } + + uint64_t FindMeasuredBits( + unsigned m, double r, uint64_t mask, const State& state) const { + double csum = 0; + + uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 32, m); + uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 32, m); + + const fp_type* p = state.get(); + + for (uint64_t k = k0; k < k1; ++k) { + for (uint64_t j = 0; j < 16; ++j) { + auto re = p[32 * k + j]; + auto im = p[32 * k + j + 16]; + csum += re * re + im * im; + if (r < csum) { + return (16 * k + j) & mask; + } + } + } + + // Return the last bitstring in the unlikely case of underflow. + return (16 * k1 - 1) & mask; + } +}; + +} // namespace qsim + +#endif // STATESPACE_AVX512_H_ diff --git a/qsim/statespace_basic.h b/qsim/statespace_basic.h new file mode 100644 index 0000000..6468483 --- /dev/null +++ b/qsim/statespace_basic.h @@ -0,0 +1,300 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_BASIC_H_ +#define STATESPACE_BASIC_H_ + +#include +#include +#include +#include + +#include "statespace.h" +#include "util.h" +#include "vectorspace.h" + +namespace qsim { + +/** + * Object containing context and routines for unoptimized state-vector + * manipulations. State is a non-vectorized sequence of one real amplitude + * followed by one imaginary amplitude. + */ +template +class StateSpaceBasic : + public StateSpace, VectorSpace, For, FP> { + private: + using Base = StateSpace, qsim::VectorSpace, For, FP>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + template + explicit StateSpaceBasic(ForArgs&&... args) : Base(args...) {} + + static uint64_t MinSize(unsigned num_qubits) { + return 2 * (uint64_t{1} << num_qubits); + }; + + void InternalToNormalOrder(State& state) const {} + + void NormalToInternalOrder(State& state) const {} + + void SetAllZeros(State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + p[2 * i] = 0; + p[2 * i + 1] = 0; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get()); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + fp_type val = fp_type{1} / std::sqrt(uint64_t{1} << state.num_qubits()); + + auto f = [](unsigned n, unsigned m, uint64_t i, + fp_type val, fp_type* p) { + p[2 * i] = val; + p[2 * i + 1] = 0; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f, val, state.get()); + } + + // |0> state. 
+ void SetStateZero(State& state) const { + SetAllZeros(state); + state.get()[0] = 1; + } + + static std::complex GetAmpl(const State& state, uint64_t i) { + uint64_t p = 2 * i; + return std::complex(state.get()[p], state.get()[p + 1]); + } + + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + uint64_t p = 2 * i; + state.get()[p] = std::real(ampl); + state.get()[p + 1] = std::imag(ampl); + } + + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + uint64_t p = 2 * i; + state.get()[p] = re; + state.get()[p + 1] = im; + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, + uint64_t bitsv, fp_type re_n, fp_type im_n, bool excludev, + fp_type* p) { + auto s = p + 2 * i; + bool in_mask = (i & maskv) == bitsv; + in_mask ^= excludev; + s[0] = in_mask ? re_n : s[0]; + s[1] = in_mask ? im_n : s[1]; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f, mask, bits, re, im, + exclude, state.get()); + } + + // Does the equivalent of dest += src elementwise. 
+ bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, fp_type* p2) { + p2[2 * i] += p1[2 * i]; + p2[2 * i + 1] += p1[2 * i + 1]; + }; + + Base::for_.Run(MinSize(src.num_qubits()) / 2, f, src.get(), dest.get()); + + return true; + } + + // Does the equivalent of state *= a elementwise. + void Multiply(fp_type a, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type a, fp_type* p) { + p[2 * i] *= a; + p[2 * i + 1] *= a; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f, a, state.get()); + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> std::complex { + auto s1 = p1 + 2 * i; + auto s2 = p2 + 2 * i; + + double re = s1[0] * s2[0] + s1[1] * s2[1]; + double im = s1[0] * s2[1] - s1[1] * s2[0]; + + return std::complex{re, im}; + }; + + using Op = std::plus>; + return Base::for_.RunReduce( + MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get()); + } + + double RealInnerProduct(const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> double { + auto s1 = p1 + 2 * i; + auto s2 = p2 + 2 * i; + + return s1[0] * s2[0] + s1[1] * s2[1]; + }; + + using Op = std::plus; + return Base::for_.RunReduce( + MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get()); + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + double norm = 0; + uint64_t size = MinSize(state.num_qubits()) / 2; + + const fp_type* p = 
state.get(); + + for (uint64_t k = 0; k < size; ++k) { + double re = p[2 * k]; + double im = p[2 * k + 1]; + norm += re * re + im * im; + } + + auto rs = GenerateRandomValues(num_samples, seed, norm); + + uint64_t m = 0; + double csum = 0; + bitstrings.reserve(num_samples); + + for (uint64_t k = 0; k < size; ++k) { + double re = p[2 * k]; + double im = p[2 * k + 1]; + csum += re * re + im * im; + while (m < num_samples && rs[m] < csum) { /* test m first: rs holds only num_samples values */ + bitstrings.emplace_back(k); + ++m; + } + } + + for (; m < num_samples; ++m) { + bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + void Collapse(const MeasurementResult& mr, State& state) const { + auto f1 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, const fp_type* p) -> double { + auto s = p + 2 * i; + return (i & mask) == bits ? s[0] * s[0] + s[1] * s[1] : 0; + }; + + using Op = std::plus; + double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 2, f1, + Op(), mr.mask, mr.bits, state.get()); + + double renorm = 1.0 / std::sqrt(norm); + + auto f2 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, fp_type renorm, fp_type* p) { + auto s = p + 2 * i; + bool not_zero = (i & mask) == bits; + + s[0] = not_zero ? s[0] * renorm : 0; + s[1] = not_zero ? 
s[1] * renorm : 0; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f2, + mr.mask, mr.bits, renorm, state.get()); + } + + std::vector PartialNorms(const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p) -> double { + auto s = p + 2 * i; + return s[0] * s[0] + s[1] * s[1]; + }; + + using Op = std::plus; + return Base::for_.RunReduceP( + MinSize(state.num_qubits()) / 2, f, Op(), state.get()); + } + + uint64_t FindMeasuredBits( + unsigned m, double r, uint64_t mask, const State& state) const { + double csum = 0; + + uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 2, m); + uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 2, m); + + const fp_type* p = state.get(); + + for (uint64_t k = k0; k < k1; ++k) { + auto re = p[2 * k]; + auto im = p[2 * k + 1]; + csum += re * re + im * im; + if (r < csum) { + return k & mask; + } + } + + // Return the last bitstring in the unlikely case of underflow. + return (k1 - 1) & mask; + } +}; + +} // namespace qsim + +#endif // STATESPACE_BASIC_H_ diff --git a/qsim/statespace_cuda.h b/qsim/statespace_cuda.h new file mode 100644 index 0000000..660db07 --- /dev/null +++ b/qsim/statespace_cuda.h @@ -0,0 +1,470 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef STATESPACE_CUDA_H_ +#define STATESPACE_CUDA_H_ + +#ifdef __NVCC__ + #include +#elif __HIP__ + #include + #include "cuda2hip.h" +#endif + +#include +#include +#include + +#include "statespace.h" +#include "statespace_cuda_kernels.h" +#include "vectorspace_cuda.h" +#include "util_cuda.h" + +namespace qsim { + +/** + * Object containing context and routines for CUDA state-vector manipulations. + * State is a vectorized sequence of 32 real components followed by 32 + * imaginary components. 32 floating numbers can be proccessed in parallel by + * a single warp. It is not recommended to use `GetAmpl` and `SetAmpl`. + */ +template +class StateSpaceCUDA : + public StateSpace, VectorSpaceCUDA, FP> { + private: + using Base = StateSpace, qsim::VectorSpaceCUDA, FP>; + + protected: + struct Grid { + unsigned threads; + unsigned dblocks; + unsigned blocks; + }; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + struct Parameter { + /** + * The number of threads per block. + * Should be 2 to the power of k, where k is in the range [5,10]. + */ + unsigned num_threads = 512; + /** + * The number of data blocks. Each thread processes num_dblocks data + * blocks in reductions (norms, inner products, etc). 
+ */ + unsigned num_dblocks = 16; + }; + + explicit StateSpaceCUDA(const Parameter& param) + : param_(param), scratch_(nullptr), scratch_size_(0) {} + + virtual ~StateSpaceCUDA() { + if (scratch_ != nullptr) { + ErrorCheck(cudaFree(scratch_)); + } + } + + static uint64_t MinSize(unsigned num_qubits) { + return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits)); + }; + + void InternalToNormalOrder(State& state) const { + uint64_t size = MinSize(state.num_qubits()) / 2; + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + unsigned bytes = 2 * threads * sizeof(fp_type); + + InternalToNormalOrderKernel<<>>(state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + void NormalToInternalOrder(State& state) const { + uint64_t size = MinSize(state.num_qubits()) / 2; + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + unsigned bytes = 2 * threads * sizeof(fp_type); + + NormalToInternalOrderKernel<<>>(state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + void SetAllZeros(State& state) const { + ErrorCheck(cudaMemset(state.get(), 0, + MinSize(state.num_qubits()) * sizeof(fp_type))); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + uint64_t size = MinSize(state.num_qubits()) / 2; + uint64_t hsize = uint64_t{1} << state.num_qubits(); + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + + fp_type v = double{1} / std::sqrt(hsize); + + SetStateUniformKernel<<>>(v, hsize, state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + // |0> state. 
+ void SetStateZero(State& state) const { + SetAllZeros(state); + fp_type one[1] = {1}; + ErrorCheck( + cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // It is not recommended to use this function. + static std::complex GetAmpl(const State& state, uint64_t i) { + fp_type re, im; + auto p = state.get() + 64 * (i / 32) + i % 32; + ErrorCheck(cudaMemcpy(&re, p, sizeof(fp_type), cudaMemcpyDeviceToHost)); + ErrorCheck( + cudaMemcpy(&im, p + 32, sizeof(fp_type), cudaMemcpyDeviceToHost)); + return std::complex(re, im); + } + + // It is not recommended to use this function. + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + fp_type re = std::real(ampl); + fp_type im = std::imag(ampl); + auto p = state.get() + 64 * (i / 32) + i % 32; + ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice)); + ErrorCheck( + cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // It is not recommended to use this function. + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + auto p = state.get() + 64 * (i / 32) + i % 32; + ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice)); + ErrorCheck( + cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. 
+ void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + uint64_t size = MinSize(state.num_qubits()) / 2; + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + + BulkSetAmplKernel<<>>( + mask, bits, re, im, exclude, state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + // Does the equivalent of dest += src elementwise. + bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + uint64_t size = MinSize(src.num_qubits()); + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + + AddKernel<<>>(src.get(), dest.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + return true; + } + + // Does the equivalent of state *= a elementwise. + void Multiply(fp_type a, State& state) const { + uint64_t size = MinSize(state.num_qubits()); + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + + MultiplyKernel<<>>(a, state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + using C = Complex; + auto r = Reduce>(state1, state2); + + return {r.re, r.im}; + } + + double RealInnerProduct(const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + return Reduce>(state1, state2); + } + + double Norm(const State& state) const { + return Reduce>(state, state); + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 
2); + unsigned bytes = g1.threads * sizeof(double); + + unsigned scratch_size = (g1.blocks + 1) * sizeof(double) + + num_samples * (sizeof(uint64_t) + sizeof(DistrRealType)); + + void* scratch = AllocScratch(scratch_size); + + double* d_res2 = (double*) scratch; + double* d_res1 = d_res2 + 1; + uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks); + DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples); + + auto op1 = RealProduct(); + auto op2 = Plus(); + + Reduce1Kernel<<>>( + g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + double norm; + + if (g1.blocks == 1) { + ErrorCheck( + cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost)); + } else { + Grid g2 = GetGrid2(g1.blocks); + unsigned bytes = g2.threads * sizeof(double); + + auto op3 = Plus(); + + Reduce2Kernel<<>>( + g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + ErrorCheck( + cudaMemcpy(&norm, d_res2, sizeof(double), cudaMemcpyDeviceToHost)); + } + + // TODO: generate random values on the device. 
+ auto rs = GenerateRandomValues(num_samples, seed, norm); + + ErrorCheck(cudaMemcpy(d_rs, rs.data(), + num_samples * sizeof(DistrRealType), + cudaMemcpyHostToDevice)); + + SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples, + d_rs, d_res1, state.get(), d_bitstrings); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + bitstrings.resize(num_samples, 0); + + ErrorCheck(cudaMemcpy(bitstrings.data(), d_bitstrings, + num_samples * sizeof(uint64_t), + cudaMemcpyDeviceToHost)); + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + void Collapse(const MeasurementResult& mr, State& state) const { + using Op = RealProduct; + double r = Reduce(mr.mask, mr.bits, state, state); + fp_type renorm = 1 / std::sqrt(r); + + uint64_t size = MinSize(state.num_qubits()) / 2; + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + + CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + std::vector PartialNorms(const State& state) const { + Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); + + unsigned scratch_size = g.blocks * sizeof(double); + unsigned bytes = g.threads * sizeof(double); + + double* d_res = (double*) AllocScratch(scratch_size); + + auto op1 = RealProduct(); + auto op2 = Plus(); + + Reduce1Kernel<<>>( + g.dblocks, op1, op2, op2, state.get(), state.get(), d_res); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + std::vector norms(g.blocks); + + ErrorCheck( + cudaMemcpy(norms.data(), d_res, scratch_size, cudaMemcpyDeviceToHost)); + + return norms; + } + + uint64_t FindMeasuredBits( + unsigned m, double r, uint64_t mask, const State& state) const { + Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); + + uint64_t res; + uint64_t* d_res = (uint64_t*) AllocScratch(sizeof(uint64_t)); + + FindMeasuredBitsKernel<<<1, 
g.threads>>>( + m, g.dblocks, r, state.get(), d_res); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + ErrorCheck( + cudaMemcpy(&res, d_res, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + return res & mask; + } + + protected: + Parameter param_; + + void* AllocScratch(uint64_t size) const { + if (size > scratch_size_) { + if (scratch_ != nullptr) { + ErrorCheck(cudaFree(scratch_)); + } + + ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); + + const_cast(scratch_size_) = size; + } + + return scratch_; + } + + Grid GetGrid1(uint64_t size) const { + Grid grid; + + grid.threads = std::min(size, uint64_t{param_.num_threads}); + grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks}); + grid.blocks = size / (grid.threads * grid.dblocks); + + return grid; + } + + Grid GetGrid2(unsigned size) const { + Grid grid; + + grid.threads = std::min(param_.num_threads, std::max(32U, size)); + grid.dblocks = std::max(1U, size / grid.threads); + grid.blocks = 1; + + return grid; + } + + template + FP2 Reduce(const State& state1, const State& state2) const { + return Reduce(0, 0, state1, state2); + } + + template + FP2 Reduce(uint64_t mask, uint64_t bits, + const State& state1, const State& state2) const { + uint64_t size = MinSize(state1.num_qubits()) / 2; + + Grid g1 = GetGrid1(size); + unsigned bytes = g1.threads * sizeof(FP1); + + FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2)); + FP2* d_res1 = d_res2 + 1; + + auto op1 = Op(); + auto op2 = Plus(); + auto op3 = Plus::type>(); + + if (mask == 0) { + Reduce1Kernel<<>>( + g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1); + } else { + Reduce1MaskedKernel<<>>( + g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(), + d_res1); + } + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + FP2 result; + + if (g1.blocks == 1) { + ErrorCheck( + cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost)); + } else { + 
Grid g2 = GetGrid2(g1.blocks); + unsigned bytes = g2.threads * sizeof(FP2); + + auto op2 = Plus(); + auto op3 = Plus::type>(); + + Reduce2Kernel<<>>( + g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + ErrorCheck( + cudaMemcpy(&result, d_res2, sizeof(FP2), cudaMemcpyDeviceToHost)); + } + + return result; + } + + private: + void* scratch_; + uint64_t scratch_size_; +}; + +} // namespace qsim + +#endif // STATESPACE_CUDA_H_ diff --git a/qsim/statespace_cuda_kernels.h b/qsim/statespace_cuda_kernels.h new file mode 100644 index 0000000..b54ebca --- /dev/null +++ b/qsim/statespace_cuda_kernels.h @@ -0,0 +1,355 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef STATESPACE_CUDA_KERNELS_H_ +#define STATESPACE_CUDA_KERNELS_H_ + +#ifdef __NVCC__ + #include +#elif __HIP__ + #include + #include "cuda2hip.h" +#endif + +#include "util_cuda.h" + +namespace qsim { + +namespace detail { + +template +__device__ __forceinline__ FP1 BlockReduce1( + uint64_t n, Op1 op1, Op2 op2, Op3 op3, const FP2* s1, const FP2* s2) { + extern __shared__ float shared[]; + FP1* partial1 = (FP1*) shared; + + unsigned tid = threadIdx.x; + unsigned warp = threadIdx.x / warp_size; + unsigned lane = threadIdx.x % warp_size; + + uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; + uint64_t k1 = k0 + 2 * n * blockDim.x; + + FP1 r; + + r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]); + while ((k0 += 2 * blockDim.x) < k1) { + r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size])); + } + + partial1[tid] = r; + + __shared__ FP1 partial2[warp_size]; + + if (tid < warp_size) { + partial2[tid] = 0; + } + + __syncthreads(); + + FP1 val = WarpReduce(partial1[tid], op3); + + if (lane == 0) { + partial2[warp] = val; + } + + __syncthreads(); + + FP1 result = 0; + + if (tid < warp_size) { + result = WarpReduce(partial2[tid], op3); + } + + return result; +} + +template +__device__ __forceinline__ FP1 BlockReduce1Masked( + uint64_t n, uint64_t mask, uint64_t bits, Op1 op1, Op2 op2, Op3 op3, + const FP2* s1, const FP2* s2) { + extern __shared__ float shared[]; + FP1* partial1 = (FP1*) shared; + + unsigned tid = threadIdx.x; + unsigned warp = threadIdx.x / warp_size; + unsigned lane = threadIdx.x % warp_size; + + uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; + uint64_t k1 = k0 + 2 * n * blockDim.x; + + FP1 r = 0; + + if (((k0 + lane) / 2 & mask) == bits) { + r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]); + } + while ((k0 += 2 * blockDim.x) < k1) { + if (((k0 + lane) / 2 & mask) == bits) { + r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size])); + } + } + + 
partial1[tid] = r; + + __shared__ FP1 partial2[warp_size]; + + if (tid < warp_size) { + partial2[tid] = 0; + } + + __syncthreads(); + + FP1 val = WarpReduce(partial1[tid], op3); + + if (lane == 0) { + partial2[warp] = val; + } + + __syncthreads(); + + FP1 result = 0; + + if (tid < warp_size) { + result = WarpReduce(partial2[tid], op3); + } + + return result; +} + +template +__device__ __forceinline__ FP1 BlockReduce2( + uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s) { + extern __shared__ float shared[]; + FP1* partial1 = (FP1*) shared; + + unsigned tid = threadIdx.x; + uint64_t k0 = n * blockIdx.x * blockDim.x + tid; + uint64_t k1 = k0 + n * blockDim.x; + + FP1 r = 0; + + if (tid < size) { + r = s[k0]; + while ((k0 += blockDim.x) < k1) { + r = op2(r, s[k0]); + } + } + + partial1[tid] = r; + + __shared__ FP1 partial2[warp_size]; + + if (tid < warp_size) { + partial2[tid] = 0; + } + + __syncthreads(); + + FP1 val = WarpReduce(partial1[tid], op3); + + if (threadIdx.x % warp_size == 0) { + partial2[threadIdx.x / warp_size] = val; + } + + __syncthreads(); + + FP1 result = 0; + + if (tid < warp_size) { + result = WarpReduce(partial2[tid], op3); + } + + return result; +} + +} // namespace detail + +template +__global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3, + const FP2* s1, const FP2* s2, FP3* result) { + FP1 sum = detail::BlockReduce1(n, op1, op2, op3, s1, s2); + + if (threadIdx.x == 0) { + result[blockIdx.x] = sum; + } +} + +template +__global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits, + Op1 op1, Op2 op2, Op3 op3, + const FP2* s1, const FP2* s2, FP3* result) { + FP1 sum = + detail::BlockReduce1Masked(n, mask, bits, op1, op2, op3, s1, s2); + + if (threadIdx.x == 0) { + result[blockIdx.x] = sum; + } +} + +template +__global__ void Reduce2Kernel( + uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s, FP3* result) { + FP1 sum = detail::BlockReduce2(n, size, op2, op3, s); + + if (threadIdx.x == 0) { + 
result[blockIdx.x] = sum; + } +} + +template +__global__ void InternalToNormalOrderKernel(FP* state) { + unsigned lane = threadIdx.x % warp_size; + unsigned l = 2 * threadIdx.x - lane; + uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; + + extern __shared__ float shared[]; + FP* buf = (FP*) shared; + + buf[l] = state[k]; + buf[l + warp_size] = state[k + warp_size]; + + __syncthreads(); + + state[k + lane] = buf[l]; + state[k + lane + 1] = buf[l + warp_size]; +} + +template +__global__ void NormalToInternalOrderKernel(FP* state) { + unsigned lane = threadIdx.x % warp_size; + unsigned l = 2 * threadIdx.x - lane; + uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; + + extern __shared__ float shared[]; + FP* buf = (FP*) shared; + + buf[l] = state[k]; + buf[l + warp_size] = state[k + warp_size]; + + __syncthreads(); + + state[k] = buf[l + lane]; + state[k + warp_size] = buf[l + lane + 1]; +} + +template +__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { + unsigned lane = threadIdx.x % warp_size; + uint64_t k = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane; + + state[k] = lane < size ? 
v : 0; + state[k + warp_size] = 0; +} + +template +__global__ void AddKernel(const FP* state1, FP* state2) { + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + state2[k] += state1[k]; +} + +template +__global__ void MultiplyKernel(FP a, FP* state) { + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + state[k] *= a; +} + +template +__global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { + uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; + + if ((k1 & mask) == bits) { + state[k2] *= r; + state[k2 + warp_size] *= r; + } else { + state[k2] = 0; + state[k2 + warp_size] = 0; + } +} + +template +__global__ void BulkSetAmplKernel( + uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) { + uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; + + bool set = ((k1 & mask) == bits) ^ exclude; + + if (set) { + state[k2] = re; + state[k2 + warp_size] = im; + } +} + +template +__global__ void SampleKernel(unsigned num_blocks, + uint64_t n, uint64_t num_samples, + const FP1* rs, const FP2* ps, const FP3* state, + uint64_t *bitstrings) { + // Use just one thread. This can be somewhat slow. + if (threadIdx.x == 0) { + uint64_t m = 0; + double csum = 0; + + for (unsigned block_id = 0; block_id < num_blocks; ++block_id) { + uint64_t km = n * blockDim.x; + uint64_t k0 = block_id * km; + + for (uint64_t k = 0; k < km; ++k) { + uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32; + FP3 re = state[l]; + FP3 im = state[l + warp_size]; + csum += re * re + im * im; + while (m < num_samples && rs[m] < csum) { /* test m first: rs holds only num_samples values */ + bitstrings[m++] = k0 + k; + } + } + } + } +} + +template +__global__ void FindMeasuredBitsKernel( + uint64_t block_id, uint64_t n, double r, const FP* state, uint64_t* res) { + // Use just one thread. This can be somewhat slow, however, this is + // more or less consistent with CPU implementations. 
+ if (threadIdx.x == 0) { + double csum = 0; + uint64_t km = n * blockDim.x; + uint64_t k0 = block_id * km; + + for (uint64_t k = 0; k < km; ++k) { + uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32; + FP re = state[l]; + FP im = state[l + warp_size]; + csum += re * re + im * im; + if (r < csum) { + *res = k0 + k; + return; + } + } + + *res = k0 + n * blockDim.x - 1; + } +} + +} // namespace qsim + +#endif // STATESPACE_CUDA_KERNELS_H_ diff --git a/qsim/statespace_custatevec.h b/qsim/statespace_custatevec.h new file mode 100644 index 0000000..f2f5de1 --- /dev/null +++ b/qsim/statespace_custatevec.h @@ -0,0 +1,376 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_CUSTATEVEC_H_ +#define STATESPACE_CUSTATEVEC_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "statespace.h" +#include "util_custatevec.h" +#include "vectorspace_cuda.h" + +namespace qsim { + +namespace detail { + +template +__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + + if (k < size) { + state[2 * k] = v; + state[2 * k + 1] = 0; + } +} + +} // namespace detail + +/** + * Object containing context and routines for cuStateVec state-vector + * manipulations. It is not recommended to use `GetAmpl` and `SetAmpl`. 
+ */ +template +class StateSpaceCuStateVec : + public StateSpace, VectorSpaceCUDA, FP> { + private: + using Base = StateSpace, qsim::VectorSpaceCUDA, FP>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + static constexpr auto is_float = std::is_same::value; + + static constexpr auto kStateType = is_float ? CUDA_C_32F : CUDA_C_64F; + static constexpr auto kMatrixType = kStateType; + static constexpr auto kExpectType = CUDA_C_64F; + static constexpr auto kComputeType = + is_float ? CUSTATEVEC_COMPUTE_32F : CUSTATEVEC_COMPUTE_64F; + static constexpr auto kMatrixLayout = CUSTATEVEC_MATRIX_LAYOUT_ROW; + + explicit StateSpaceCuStateVec(const cublasHandle_t& cublas_handle, + const custatevecHandle_t& custatevec_handle) + : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle), + workspace_(nullptr), workspace_size_(0) {} + + virtual ~StateSpaceCuStateVec() { + if (workspace_ != nullptr) { + ErrorCheck(cudaFree(workspace_)); + } + } + + static uint64_t MinSize(unsigned num_qubits) { + return 2 * (uint64_t{1} << num_qubits); + }; + + void InternalToNormalOrder(State& state) const { + } + + void NormalToInternalOrder(State& state) const { + } + + void SetAllZeros(State& state) const { + ErrorCheck(cudaMemset(state.get(), 0, + MinSize(state.num_qubits()) * sizeof(fp_type))); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + uint64_t size = uint64_t{1} << state.num_qubits(); + + unsigned threads = size < 256 ? size : 256; + unsigned blocks = size / threads; + + fp_type v = double{1} / std::sqrt(size); + + detail::SetStateUniformKernel<<>>(v, size, state.get()); + ErrorCheck(cudaPeekAtLastError()); + } + + // |0> state. + void SetStateZero(State& state) const { + SetAllZeros(state); + fp_type one[1] = {1}; + ErrorCheck( + cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // It is not recommended to use this function. 
+ static std::complex GetAmpl(const State& state, uint64_t i) { + fp_type a[2]; + auto p = state.get() + 2 * i; + ErrorCheck(cudaMemcpy(a, p, 2 * sizeof(fp_type), cudaMemcpyDeviceToHost)); + return std::complex(a[0], a[1]); + } + + // It is not recommended to use this function. + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + fp_type a[2] = {std::real(ampl), std::imag(ampl)}; + auto p = state.get() + 2 * i; + ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // It is not recommended to use this function. + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + fp_type a[2] = {re, im}; + auto p = state.get() + 2 * i; + ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + // Not implemented. + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + // Not implemented. + } + + // Does the equivalent of dest += src elementwise. 
+ bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + uint64_t size = uint64_t{1} << src.num_qubits(); + + if (is_float) { + cuComplex a = {1.0, 0.0}; + auto p1 = (const cuComplex*) src.get(); + auto p2 = (cuComplex*) dest.get(); + ErrorCheck(cublasCaxpy(cublas_handle_, size, &a, p1, 1, p2, 1)); + } else { + cuDoubleComplex a = {1.0, 0.0}; + auto p1 = (const cuDoubleComplex*) src.get(); + auto p2 = (cuDoubleComplex*) dest.get(); + ErrorCheck(cublasZaxpy(cublas_handle_, size, &a, p1, 1, p2, 1)); + } + + return true; + } + + // Does the equivalent of state *= a elementwise. + void Multiply(fp_type a, State& state) const { + uint64_t size = uint64_t{1} << state.num_qubits(); + + if (is_float) { + float a1 = a; + auto p = (cuComplex*) state.get(); + ErrorCheck(cublasCsscal(cublas_handle_, size, &a1, p, 1)); + } else { + double a1 = a; + auto p = (cuDoubleComplex*) state.get(); + ErrorCheck(cublasZdscal(cublas_handle_, size, &a1, p, 1)); + } + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + uint64_t size = uint64_t{1} << state1.num_qubits(); + + if (is_float) { + cuComplex result; + auto p1 = (const cuComplex*) state1.get(); + auto p2 = (const cuComplex*) state2.get(); + ErrorCheck(cublasCdotc(cublas_handle_, size, p1, 1, p2, 1, &result)); + return {cuCrealf(result), cuCimagf(result)}; + } else { + cuDoubleComplex result; + auto p1 = (const cuDoubleComplex*) state1.get(); + auto p2 = (const cuDoubleComplex*) state2.get(); + ErrorCheck(cublasZdotc(cublas_handle_, size, p1, 1, p2, 1, &result)); + return {cuCreal(result), cuCimag(result)}; + } + } + + double RealInnerProduct(const State& state1, const State& state2) const { + return std::real(InnerProduct(state1, state2)); + } + + double Norm(const State& state) const { + uint64_t size = uint64_t{1} << state.num_qubits(); + + if (is_float) 
{ + float result; + auto p = (const cuComplex*) state.get(); + ErrorCheck(cublasScnrm2(cublas_handle_, size, p, 1, &result)); + return result * result; + } else { + double result; + auto p = (const cuDoubleComplex*) state.get(); + ErrorCheck(cublasDznrm2(cublas_handle_, size, p, 1, &result)); + return result * result; + } + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + auto rs = GenerateRandomValues(num_samples, seed, 1.0); + + size_t workspace_size; + custatevecSamplerDescriptor_t sampler; + + ErrorCheck(custatevecSamplerCreate( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), &sampler, num_samples, + &workspace_size)); + + AllocWorkSpace(workspace_size); + + ErrorCheck(custatevecSamplerPreprocess( + custatevec_handle_, sampler, workspace_, workspace_size)); + + std::vector bitstrings0(num_samples); + std::vector bitordering; + + bitordering.reserve(state.num_qubits()); + for (unsigned i = 0; i < state.num_qubits(); ++i) { + bitordering.push_back(i); + } + + ErrorCheck(custatevecSamplerSample( + custatevec_handle_, sampler, bitstrings0.data(), + bitordering.data(), state.num_qubits(), rs.data(), + num_samples, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER)); + + bitstrings.reserve(num_samples); + for (unsigned i = 0; i < num_samples; ++i) { + bitstrings.push_back(bitstrings0[i]); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + template + MeasurementResult Measure(const std::vector& qubits, + RGen& rgen, State& state, + bool no_collapse = false) const { + auto r = RandomValue(rgen, 1.0); + + MeasurementResult result; + + result.valid = true; + result.mask = 0; + result.bits = 0; + result.bitstring.resize(qubits.size(), 0); + + for (auto q : qubits) { + if (q >= state.num_qubits()) { + result.valid = false; + return result; + } + + result.mask |= uint64_t{1} << q; + } + + auto collapse = 
no_collapse ? + CUSTATEVEC_COLLAPSE_NONE : CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO; + + ErrorCheck(custatevecBatchMeasure( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), (int*) result.bitstring.data(), + (int*) qubits.data(), qubits.size(), r, collapse)); + + for (std::size_t i = 0; i < result.bitstring.size(); ++i) { + result.bits |= result.bitstring[i] << qubits[i]; + } + + return result; + } + + template + MeasurementResult VirtualMeasure(const std::vector& qubits, + RGen& rgen, const State& state) const { + return Measure(qubits, rgen, const_cast(state), true); + } + + void Collapse(const MeasurementResult& mr, State& state) const { + unsigned count = 0; + + std::vector bitstring; + std::vector bitordering; + + bitstring.reserve(state.num_qubits()); + bitordering.reserve(state.num_qubits()); + + for (unsigned i = 0; i < state.num_qubits(); ++i) { + if (((mr.mask >> i) & 1) != 0) { + bitstring.push_back((mr.bits >> i) & 1); + bitordering.push_back(i); + ++count; + } + } + + ErrorCheck(custatevecCollapseByBitString( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), bitstring.data(), bitordering.data(), + count, 1.0)); + + // TODO: do we need the following? + double norm = Norm(state); + Multiply(1.0 / std::sqrt(norm), state); + } + + private: + void* AllocWorkSpace(size_t size) const { + if (size > workspace_size_) { + if (workspace_ != nullptr) { + ErrorCheck(cudaFree(workspace_)); + } + + ErrorCheck(cudaMalloc(const_cast(&workspace_), size)); + + const_cast(workspace_size_) = size; + } + + return workspace_; + } + + const cublasHandle_t cublas_handle_; + const custatevecHandle_t custatevec_handle_; + + void* workspace_; + size_t workspace_size_; +}; + +} // namespace qsim + +#endif // STATESPACE_CUSTATEVEC_H_ diff --git a/qsim/statespace_sse.h b/qsim/statespace_sse.h new file mode 100644 index 0000000..cf41a09 --- /dev/null +++ b/qsim/statespace_sse.h @@ -0,0 +1,462 @@ +// Copyright 2019 Google LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_SSE_H_ +#define STATESPACE_SSE_H_ + +#include + +#include +#include +#include +#include +#include + +#include "statespace.h" +#include "util.h" +#include "vectorspace.h" + +namespace qsim { + +namespace detail { + +inline __m128i GetZeroMaskSSE(uint64_t i, uint64_t mask, uint64_t bits) { + __m128i s1 = _mm_set_epi64x(i + 2, i + 0); + __m128i s2 = _mm_set_epi64x(i + 3, i + 1); + __m128i ma = _mm_set1_epi64x(mask); + __m128i bi = _mm_set1_epi64x(bits); + + s1 = _mm_and_si128(s1, ma); + s2 = _mm_and_si128(s2, ma); + + s1 = _mm_cmpeq_epi64(s1, bi); + s2 = _mm_cmpeq_epi64(s2, bi); + + return _mm_blend_epi16(s1, s2, 204); // 11001100 +} + +inline double HorizontalSumSSE(__m128 s) { + __m128 ss = _mm_movehdup_ps(s); + __m128 s1 = _mm_add_ps(s, ss); + + return _mm_cvtss_f32(_mm_add_ss(s1, _mm_movehl_ps(ss, s1))); +} + +} // namespace detail + +/** + * Object containing context and routines for SSE state-vector manipulations. + * State is a vectorized sequence of four real components followed by four + * imaginary components. Four single-precison floating numbers can be loaded + * into an SSE register. + */ +template +class StateSpaceSSE : + public StateSpace, VectorSpace, For, float> { + private: + using Base = StateSpace, qsim::VectorSpace, For, float>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + template + explicit StateSpaceSSE(ForArgs&&... 
args) : Base(args...) {} + + static uint64_t MinSize(unsigned num_qubits) { + return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits)); + }; + + void InternalToNormalOrder(State& state) const { + if (state.num_qubits() == 1) { + auto s = state.get(); + + s[2] = s[1]; + s[1] = s[4]; + s[3] = s[5]; + + for (uint64_t i = 4; i < 8; ++i) { + s[i] = 0; + } + } else { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + auto s = p + 8 * i; + + fp_type re[3]; + fp_type im[3]; + + for (uint64_t i = 0; i < 3; ++i) { + re[i] = s[i + 1]; + im[i] = s[i + 4]; + } + + for (uint64_t i = 0; i < 3; ++i) { + s[2 * i + 1] = im[i]; + s[2 * i + 2] = re[i]; + } + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get()); + } + } + + void NormalToInternalOrder(State& state) const { + if (state.num_qubits() == 1) { + auto s = state.get(); + + s[4] = s[1]; + s[1] = s[2]; + s[5] = s[3]; + + s[2] = 0; + s[3] = 0; + s[6] = 0; + s[7] = 0; + } else { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + auto s = p + 8 * i; + + fp_type re[3]; + fp_type im[3]; + + for (uint64_t i = 0; i < 3; ++i) { + im[i] = s[2 * i + 1]; + re[i] = s[2 * i + 2]; + } + + for (uint64_t i = 0; i < 3; ++i) { + s[i + 1] = re[i]; + s[i + 4] = im[i]; + } + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get()); + } + } + + void SetAllZeros(State& state) const { + __m128 val0 = _mm_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) { + _mm_store_ps(p + 8 * i, val0); + _mm_store_ps(p + 8 * i + 4, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get()); + } + + // Uniform superposition. 
+ void SetStateUniform(State& state) const { + __m128 val0 = _mm_setzero_ps(); + __m128 valu; + + fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); + + if (state.num_qubits() == 1) { + valu = _mm_set_ps(0, 0, v, v); + } else { + valu = _mm_set1_ps(v); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + __m128 val0, __m128 valu, fp_type* p) { + _mm_store_ps(p + 8 * i, valu); + _mm_store_ps(p + 8 * i + 4, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, valu, state.get()); + } + + // |0> state. + void SetStateZero(State& state) const { + SetAllZeros(state); + state.get()[0] = 1; + } + + static std::complex GetAmpl(const State& state, uint64_t i) { + uint64_t p = (8 * (i / 4)) + (i % 4); + return std::complex(state.get()[p], state.get()[p + 4]); + } + + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + uint64_t p = (8 * (i / 4)) + (i % 4); + state.get()[p] = std::real(ampl); + state.get()[p + 4] = std::imag(ampl); + } + + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + uint64_t p = (8 * (i / 4)) + (i % 4); + state.get()[p] = re; + state.get()[p + 4] = im; + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val)); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. 
+ void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + __m128 re_reg = _mm_set1_ps(re); + __m128 im_reg = _mm_set1_ps(im); + __m128i exclude_reg = _mm_setzero_si128(); + if (exclude) { + exclude_reg = _mm_cmpeq_epi32(exclude_reg, exclude_reg); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, + uint64_t bitsv, __m128 re_n, __m128 im_n, __m128i exclude_n, + fp_type* p) { + __m128 ml = _mm_castsi128_ps(_mm_xor_si128( + detail::GetZeroMaskSSE(4 * i, maskv, bitsv), exclude_n)); + + __m128 re = _mm_load_ps(p + 8 * i); + __m128 im = _mm_load_ps(p + 8 * i + 4); + + re = _mm_blendv_ps(re, re_n, ml); + im = _mm_blendv_ps(im, im_n, ml); + + _mm_store_ps(p + 8 * i, re); + _mm_store_ps(p + 8 * i + 4, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, mask, bits, re_reg, + im_reg, exclude_reg, state.get()); + } + + // Does the equivalent of dest += src elementwise. + bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, fp_type* p2) { + __m128 re1 = _mm_load_ps(p1 + 8 * i); + __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); + __m128 re2 = _mm_load_ps(p2 + 8 * i); + __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); + + _mm_store_ps(p2 + 8 * i, _mm_add_ps(re1, re2)); + _mm_store_ps(p2 + 8 * i + 4, _mm_add_ps(im1, im2)); + }; + + Base::for_.Run(MinSize(src.num_qubits()) / 8, f, src.get(), dest.get()); + + return true; + } + + // Does the equivalent of state *= a elementwise. 
+ void Multiply(fp_type a, State& state) const { + __m128 r = _mm_set1_ps(a); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m128 r, fp_type* p) { + __m128 re = _mm_load_ps(p + 8 * i); + __m128 im = _mm_load_ps(p + 8 * i + 4); + + re = _mm_mul_ps(re, r); + im = _mm_mul_ps(im, r); + + _mm_store_ps(p + 8 * i, re); + _mm_store_ps(p + 8 * i + 4, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, r, state.get()); + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> std::complex { + __m128 re1 = _mm_load_ps(p1 + 8 * i); + __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); + __m128 re2 = _mm_load_ps(p2 + 8 * i); + __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); + + __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2)); + __m128 ip_im = _mm_sub_ps(_mm_mul_ps(re1, im2), _mm_mul_ps(im1, re2)); + + double re = detail::HorizontalSumSSE(ip_re); + double im = detail::HorizontalSumSSE(ip_im); + + return std::complex{re, im}; + }; + + using Op = std::plus>; + return Base::for_.RunReduce( + MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get()); + } + + double RealInnerProduct(const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> double { + __m128 re1 = _mm_load_ps(p1 + 8 * i); + __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); + __m128 re2 = _mm_load_ps(p2 + 8 * i); + __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); + + __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2)); + + return detail::HorizontalSumSSE(ip_re); + }; + + using Op = std::plus; + return Base::for_.RunReduce( + MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get()); + } + + template + 
std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + double norm = 0; + uint64_t size = MinSize(state.num_qubits()) / 8; + const fp_type* p = state.get(); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 4; ++j) { + double re = p[8 * k + j]; + double im = p[8 * k + 4 + j]; + norm += re * re + im * im; + } + } + + auto rs = GenerateRandomValues(num_samples, seed, norm); + + uint64_t m = 0; + double csum = 0; + bitstrings.reserve(num_samples); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 4; ++j) { + double re = p[8 * k + j]; + double im = p[8 * k + 4 + j]; + csum += re * re + im * im; + while (rs[m] < csum && m < num_samples) { + bitstrings.emplace_back(4 * k + j); + ++m; + } + } + } + + for (; m < num_samples; ++m) { + bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + void Collapse(const MeasurementResult& mr, State& state) const { + __m128 zero = _mm_set1_ps(0); + + auto f1 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask, + uint64_t bits, __m128 zero, const fp_type* p) -> double { + __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits)); + + __m128 re = _mm_load_ps(p + 8 * i); + __m128 im = _mm_load_ps(p + 8 * i + 4); + __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)); + + s1 = _mm_blendv_ps(zero, s1, ml); + + return detail::HorizontalSumSSE(s1); + }; + + using Op = std::plus; + double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 8, f1, + Op(), mr.mask, mr.bits, zero, + state.get()); + + __m128 renorm = _mm_set1_ps(1.0 / std::sqrt(norm)); + + auto f2 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask, + uint64_t bits, __m128 renorm, __m128 zero, fp_type* p) { + __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits)); + + __m128 re = _mm_load_ps(p + 
8 * i); + __m128 im = _mm_load_ps(p + 8 * i + 4); + + re = _mm_blendv_ps(zero, _mm_mul_ps(re, renorm), ml); + im = _mm_blendv_ps(zero, _mm_mul_ps(im, renorm), ml); + + _mm_store_ps(p + 8 * i, re); + _mm_store_ps(p + 8 * i + 4, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f2, + mr.mask, mr.bits, renorm, zero, state.get()); + } + + std::vector PartialNorms(const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p) -> double { + __m128 re = _mm_load_ps(p + 8 * i); + __m128 im = _mm_load_ps(p + 8 * i + 4); + __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)); + + return detail::HorizontalSumSSE(s1); + }; + + using Op = std::plus; + return Base::for_.RunReduceP( + MinSize(state.num_qubits()) / 8, f, Op(), state.get()); + } + + uint64_t FindMeasuredBits( + unsigned m, double r, uint64_t mask, const State& state) const { + double csum = 0; + + uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 8, m); + uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 8, m); + + const fp_type* p = state.get(); + + for (uint64_t k = k0; k < k1; ++k) { + for (uint64_t j = 0; j < 4; ++j) { + auto re = p[8 * k + j]; + auto im = p[8 * k + 4 + j]; + csum += re * re + im * im; + if (r < csum) { + return (4 * k + j) & mask; + } + } + } + + // Return the last bitstring in the unlikely case of underflow. + return (4 * k1 - 1) & mask; + } +}; + +} // namespace qsim + +#endif // STATESPACE_SSE_H_ diff --git a/qsim/umux.h b/qsim/umux.h new file mode 100644 index 0000000..83b951b --- /dev/null +++ b/qsim/umux.h @@ -0,0 +1,52 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UMUX_H_ +#define UMUX_H_ + +#ifdef __AVX512F__ +# include "unitary_calculator_avx512.h" + namespace qsim { + namespace unitary { + template + using UnitaryCalculator = UnitaryCalculatorAVX512; + } + } +#elif __AVX2__ +# include "unitary_calculator_avx.h" + namespace qsim { + namespace unitary { + template + using UnitaryCalculator = UnitaryCalculatorAVX; + } + } +#elif __SSE4_1__ +# include "unitary_calculator_sse.h" + namespace qsim { + namespace unitary { + template + using UnitaryCalculator = UnitaryCalculatorSSE; + } + } +#else +# include "unitary_calculator_basic.h" + namespace qsim { + namespace unitary { + template + using UnitaryCalculator = UnitaryCalculatorBasic; + } + } +#endif + +#endif // UMUX_H_ diff --git a/qsim/unitary_calculator_avx.h b/qsim/unitary_calculator_avx.h new file mode 100644 index 0000000..5e566ca --- /dev/null +++ b/qsim/unitary_calculator_avx.h @@ -0,0 +1,1028 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef UNITARY_CALCULATOR_AVX_H_ +#define UNITARY_CALCULATOR_AVX_H_ + +#include + +#include +#include +#include +#include + +#include "simulator.h" +#include "unitaryspace_avx.h" + +namespace qsim { +namespace unitary { + +/** + * Quantum circuit unitary calculator with AVX vectorization. + */ +template +class UnitaryCalculatorAVX final : public SimulatorBase { + public: + using UnitarySpace = UnitarySpaceAVX; + using Unitary = typename UnitarySpace::Unitary; + using fp_type = typename UnitarySpace::fp_type; + + using StateSpace = UnitarySpace; + using State = Unitary; + + template + explicit UnitaryCalculatorAVX(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using AVX instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 1: + if (qs[0] > 2) { + ApplyGateH<1>(qs, matrix, state); + } else { + ApplyGateL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 2) { + ApplyGateH<2>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<1, 1>(qs, matrix, state); + } else { + ApplyGateL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 2) { + ApplyGateH<3>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<2, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<1, 2>(qs, matrix, state); + } else { + ApplyGateL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 2) { + ApplyGateH<4>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<3, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<2, 2>(qs, matrix, state); + } else { + ApplyGateL<1, 3>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 2) { + ApplyGateH<5>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<4, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<3, 2>(qs, matrix, state); + } else { + ApplyGateL<2, 3>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 2) { + ApplyGateH<6>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<5, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<4, 2>(qs, matrix, state); + } else { + ApplyGateL<3, 3>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using AVX instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ // Assume cqs[0] < cqs[1] < cqs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 1: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 2: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 3: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 4: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); + } 
+ } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 8; + } + + private: + +#ifdef __BMI2__ + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + 
_mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + auto m = GetMasks1(qs); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, size, raw_size, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + 
L)]; + + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); + + unsigned k = 3 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256 w[1 << (1 + 2 * H)]; + + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + const __m256i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size 
= UnitarySpace::MinRowSize(state.num_qubits()); + + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); + } + } + +#else // __BMI2__ + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn 
= _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, const __m256i* idx, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = 
_mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); + + FillIndices(state.num_qubits(), qs, ms, xss); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = 
_mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], 
in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256 w[1 << (1 + 2 * H)]; + + auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, const __m256i* idx, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = 
_mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + if (CH) { + auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size * size2, f, w, ms, xss, m.cvalsh, + m.cmaskh, idx, size, raw_size, state.get()); + } else { + auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size * size2, f, w, ms, xss, m.cvalsh, + m.cmaskh, idx, size, raw_size, state.get()); + } + } + +#endif // __BMI2__ + + template + static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) { + constexpr unsigned lsize = 1 << L; + + for (unsigned i = 0; i < lsize - 1; ++i) { + unsigned p[8]; + + for (unsigned j = 0; j < 8; ++j) { + p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); + } + + idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); + } + } + + For for_; +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARY_CALCULATOR_AVX_H_ diff --git 
a/qsim/unitary_calculator_avx512.h b/qsim/unitary_calculator_avx512.h new file mode 100644 index 0000000..8105367 --- /dev/null +++ b/qsim/unitary_calculator_avx512.h @@ -0,0 +1,644 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARY_CALCULATOR_AVX512_H_ +#define UNITARY_CALCULATOR_AVX512_H_ + +#include + +#include +#include +#include +#include + +#include "simulator.h" +#include "unitaryspace_avx512.h" + +namespace qsim { +namespace unitary { + +/** + * Quantum circuit unitary calculator with AVX512 vectorization. + */ +template +class UnitaryCalculatorAVX512 final : public SimulatorBase { + public: + using UnitarySpace = UnitarySpaceAVX512; + using Unitary = typename UnitarySpace::Unitary; + using fp_type = typename UnitarySpace::fp_type; + + using StateSpace = UnitarySpace; + using State = Unitary; + + template + explicit UnitaryCalculatorAVX512(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 1: + if (qs[0] > 3) { + ApplyGateH<1>(qs, matrix, state); + } else { + ApplyGateL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 3) { + ApplyGateH<2>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<1, 1>(qs, matrix, state); + } else { + ApplyGateL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 3) { + ApplyGateH<3>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<2, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<1, 2>(qs, matrix, state); + } else { + ApplyGateL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 3) { + ApplyGateH<4>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<3, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<2, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<1, 3>(qs, matrix, state); + } else { + ApplyGateL<0, 4>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 3) { + ApplyGateH<5>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<4, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<3, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<2, 3>(qs, matrix, state); + } else { + ApplyGateL<1, 4>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 3) { + ApplyGateH<6>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<5, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<4, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<3, 3>(qs, matrix, state); + } else { + ApplyGateL<2, 4>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 1: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 2: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 3: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + 
break; + case 4: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[3] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * @return The size of SIMD register if applicable. 
+ */ + static unsigned SIMDRegisterSize() { + return 16; + } + + private: + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 ru, iu, rn, in; + __m512 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + auto m = GetMasks1(qs); + + unsigned k = 4 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, size, raw_size, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m512 rn, in; + __m512 rs[gsize], is[gsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 4 + H; + unsigned n = 
state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 ru, iu, rn, in; + __m512 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); + + unsigned k = 4 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 rn, in; + __m512 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512 w[1 << (1 + 2 * H)]; + + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned k = 4 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m512 rn, in; + __m512 rs[gsize], is[gsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; + + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = 
UnitarySpace::MinRowSize(state.num_qubits()); + + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 4 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 4 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); + } + } + + template + static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) { + constexpr unsigned lsize = 1 << L; + + for (unsigned i = 0; i < lsize; ++i) { + unsigned p[16]; + + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + } + + For for_; +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARY_CALCULATOR_AVX512_H_ diff --git a/qsim/unitary_calculator_basic.h b/qsim/unitary_calculator_basic.h new file mode 100644 index 0000000..6b1821a --- /dev/null +++ b/qsim/unitary_calculator_basic.h @@ -0,0 +1,259 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARY_CALCULATOR_BASIC_H_ +#define UNITARY_CALCULATOR_BASIC_H_ + +#include +#include +#include +#include + +#include "simulator.h" +#include "unitaryspace_basic.h" + +namespace qsim { +namespace unitary { + +/** + * Quantum circuit unitary calculator without vectorization. + */ +template +class UnitaryCalculatorBasic final : public SimulatorBase { + public: + using UnitarySpace = UnitarySpaceBasic; + using Unitary = typename UnitarySpace::Unitary; + using fp_type = typename UnitarySpace::fp_type; + + using StateSpace = UnitarySpace; + using State = Unitary; + + template + explicit UnitaryCalculatorBasic(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using non-vectorized instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 1: + ApplyGateH<1>(qs, matrix, state); + break; + case 2: + ApplyGateH<2>(qs, matrix, state); + break; + case 3: + ApplyGateH<3>(qs, matrix, state); + break; + case 4: + ApplyGateH<4>(qs, matrix, state); + break; + case 5: + ApplyGateH<5>(qs, matrix, state); + break; + case 6: + ApplyGateH<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using non-vectorized instructions. 
+ * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 1: + ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; + } + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 1; + } + + private: + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + fp_type rn, in; + fp_type rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; 
+ + j += 2; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); + } + + template + void ApplyControlledGateH(const std::vector& qs, + const std::vector& cqs, + uint64_t cvals, const fp_type* matrix, + State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + fp_type rn, in; + fp_type rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) == cvalsh) { + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; + + j += 2; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; + } + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + + unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); + } + + For for_; +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARY_CALCULATOR_BASIC_H_ diff --git a/qsim/unitary_calculator_sse.h b/qsim/unitary_calculator_sse.h new file mode 100644 index 0000000..a3c3f2e --- /dev/null +++ b/qsim/unitary_calculator_sse.h @@ -0,0 +1,639 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARY_CALCULATOR_SSE_H_ +#define UNITARY_CALCULATOR_SSE_H_ + +#include + +#include +#include +#include +#include + +#include "simulator.h" +#include "unitaryspace_sse.h" + +namespace qsim { +namespace unitary { + +/** + * Quantum circuit unitary calculator with SSE vectorization. + */ +template +class UnitaryCalculatorSSE final : public SimulatorBase { + public: + using UnitarySpace = UnitarySpaceSSE; + using Unitary = typename UnitarySpace::Unitary; + using fp_type = typename UnitarySpace::fp_type; + + using StateSpace = UnitarySpace; + using State = Unitary; + + template + explicit UnitaryCalculatorSSE(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using SSE instructions. + * @param qs Indices of the qubits affected by this gate. 
+ * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 1: + if (qs[0] > 1) { + ApplyGateH<1>(qs, matrix, state); + } else { + ApplyGateL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 1) { + ApplyGateH<2>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<1, 1>(qs, matrix, state); + } else { + ApplyGateL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 1) { + ApplyGateH<3>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<2, 1>(qs, matrix, state); + } else { + ApplyGateL<1, 2>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 1) { + ApplyGateH<4>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<3, 1>(qs, matrix, state); + } else { + ApplyGateL<2, 2>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 1) { + ApplyGateH<5>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<4, 1>(qs, matrix, state); + } else { + ApplyGateL<3, 2>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 1) { + ApplyGateH<6>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<5, 1>(qs, matrix, state); + } else { + ApplyGateL<4, 2>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using SSE instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ // Assume cqs[0] < cqs[1] < cqs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 1: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 1) { + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 2: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 1) { + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 3: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 1) { + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 4: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); + } 
+ } else { + if (cqs[0] > 1) { + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 4; + } + + private: + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m128 ru, iu, rn, in; + __m128 rs[hsize], is[hsize]; + + uint64_t r = 4 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_mul_ps(rs[0], ru); + in = _mm_mul_ps(rs[0], iu); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, unsigned q0, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m128 rn, in; + __m128 rs[gsize], is[gsize]; + + uint64_t r = 4 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); + + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm_mul_ps(rs[0], w[j]); + in = _mm_mul_ps(rs[0], w[j + 1]); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); + + FillIndices(state.num_qubits(), qs, ms, xss); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, qs[0], size, raw_size, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m128 ru, iu, rn, in; + __m128 rs[hsize], is[hsize]; + + uint64_t r = 4 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_mul_ps(rs[0], ru); + in = _mm_mul_ps(rs[0], iu); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m128 rn, in; + __m128 rs[hsize], is[hsize]; + + uint64_t r = 4 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm_mul_ps(rs[0], w[j]); + in = _mm_mul_ps(rs[0], w[j + 1]); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H)]; + + auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, unsigned q0, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m128 rn, in; + __m128 rs[gsize], is[gsize]; + + uint64_t r = 4 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); + + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm_mul_ps(rs[0], w[j]); + in = _mm_mul_ps(rs[0], w[j + 1]); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H + L)]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + if (CH) { + auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size * size2, f, w, ms, xss, + m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get()); + } else { + auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size * size2, f, w, ms, xss, + m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get()); + } + } + + For for_; +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARY_CALCULATOR_SSE_H_ diff --git a/qsim/unitaryspace.h b/qsim/unitaryspace.h new file mode 100644 index 0000000..b5e2691 --- /dev/null +++ b/qsim/unitaryspace.h @@ -0,0 +1,65 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARYSPACE_H_ +#define UNITARYSPACE_H_ + +#include + +namespace qsim { + +namespace unitary { + +/** + * Abstract class containing routines for general unitary matrix manipulations. + * "AVX", "AVX512", "Basic", and "SSE" implementations are provided. + */ +template class VectorSpace, typename... 
VSTypeParams> +class UnitarySpace : public VectorSpace { + private: + using Base = VectorSpace; + + public: + using fp_type = typename Base::fp_type; + using Unitary = typename Base::Vector; + + template + UnitarySpace(ForArgs&&... args) : Base(args...) {} + + static Unitary CreateUnitary(unsigned num_qubits) { + return Base::Create(num_qubits); + } + + static Unitary CreateUnitary(fp_type* p, unsigned num_qubits) { + return Base::Create(p, num_qubits); + } + + static Unitary NullUnitary() { + return Base::Null(); + } + + static uint64_t Size(unsigned num_qubits) { + return uint64_t{1} << num_qubits; + }; + + void CopyUnitary(const Unitary& src, Unitary& dest) const { + Base::Copy(src, dest); + } +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARYSPACE_H_ diff --git a/qsim/unitaryspace_avx.h b/qsim/unitaryspace_avx.h new file mode 100644 index 0000000..c1ec59d --- /dev/null +++ b/qsim/unitaryspace_avx.h @@ -0,0 +1,112 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARYSPACE_AVX_H_ +#define UNITARYSPACE_AVX_H_ + +#include + +#include +#include +#include +#include + +#include "unitaryspace.h" +#include "vectorspace.h" + +namespace qsim { + +namespace unitary { + +/** + * Object containing context and routines for unitary manipulations. + * Unitary is a vectorized sequence of eight real components followed by eight + * imaginary components. 
Eight single-precison floating numbers can be loaded + * into an AVX register. + */ +template +struct UnitarySpaceAVX : + public UnitarySpace, VectorSpace, For, float> { + private: + using Base = UnitarySpace, + qsim::VectorSpace, For, float>; + + public: + using Unitary = typename Base::Unitary; + using fp_type = typename Base::fp_type; + + template + explicit UnitarySpaceAVX(ForArgs&&... args) : Base(args...) {} + + static uint64_t MinRowSize(unsigned num_qubits) { + return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits)); + }; + + static uint64_t MinSize(unsigned num_qubits) { + return Base::Size(num_qubits) * MinRowSize(num_qubits); + }; + + void SetAllZeros(Unitary& state) const { + __m256 val0 = _mm256_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) { + _mm256_store_ps(p + 16 * i, val); + _mm256_store_ps(p + 16 * i + 8, val); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get()); + } + + void SetIdentity(Unitary& state) { + SetAllZeros(state); + + auto f = [](unsigned n, unsigned m, uint64_t i, + uint64_t row_size, fp_type* p) { + p[row_size * i + (16 * (i / 8)) + (i % 8)] = 1; + }; + + uint64_t size = Base::Size(state.num_qubits()); + uint64_t row_size = MinRowSize(state.num_qubits()); + Base::for_.Run(size, f, row_size, state.get()); + } + + static std::complex GetEntry(const Unitary& state, + uint64_t i, uint64_t j) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (16 * (j / 8)) + (j % 8); + return std::complex(state.get()[row_size * i + k], + state.get()[row_size * i + k + 8]); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, + const std::complex& ampl) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (16 * (j / 8)) + (j % 8); + state.get()[row_size * i + k] = std::real(ampl); + state.get()[row_size * i + k + 8] = std::imag(ampl); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, + 
fp_type im) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (16 * (j / 8)) + (j % 8); + state.get()[row_size * i + k] = re; + state.get()[row_size * i + k + 8] = im; + } +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARYSPACE_AVX_H_ diff --git a/qsim/unitaryspace_avx512.h b/qsim/unitaryspace_avx512.h new file mode 100644 index 0000000..4c23dc9 --- /dev/null +++ b/qsim/unitaryspace_avx512.h @@ -0,0 +1,112 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARYSPACE_AVX512_H_ +#define UNITARYSPACE_AVX512_H_ + +#include + +#include +#include +#include +#include + +#include "unitaryspace.h" +#include "vectorspace.h" + +namespace qsim { + +namespace unitary { + +/** + * Object containing context and routines for unitary manipulations. + * State is a vectorized sequence of sixteen real components followed by + * sixteen imaginary components. Sixteen single-precison floating numbers can + * be loaded into an AVX512 register. + */ +template +struct UnitarySpaceAVX512 : + public UnitarySpace, VectorSpace, For, float> { + private: + using Base = UnitarySpace, + qsim::VectorSpace, For, float>; + + public: + using Unitary = typename Base::Unitary; + using fp_type = typename Base::fp_type; + + template + explicit UnitarySpaceAVX512(ForArgs&&... args) : Base(args...) 
{} + + static uint64_t MinRowSize(unsigned num_qubits) { + return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits)); + }; + + static uint64_t MinSize(unsigned num_qubits) { + return Base::Size(num_qubits) * MinRowSize(num_qubits); + }; + + void SetAllZeros(Unitary& state) const { + __m512 val0 = _mm512_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) { + _mm512_store_ps(p + 32 * i, val0); + _mm512_store_ps(p + 32 * i + 16, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get()); + } + + void SetIdentity(Unitary& state) { + SetAllZeros(state); + + auto f = [](unsigned n, unsigned m, uint64_t i, + uint64_t row_size, fp_type* p) { + p[row_size * i + (32 * (i / 16)) + (i % 16)] = 1; + }; + + uint64_t size = Base::Size(state.num_qubits()); + uint64_t row_size = MinRowSize(state.num_qubits()); + Base::for_.Run(size, f, row_size, state.get()); + } + + static std::complex GetEntry(const Unitary& state, + uint64_t i, uint64_t j) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (32 * (j / 16)) + (j % 16); + return std::complex(state.get()[row_size * i + k], + state.get()[row_size * i + k + 16]); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, + const std::complex& ampl) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (32 * (j / 16)) + (j % 16); + state.get()[row_size * i + k] = std::real(ampl); + state.get()[row_size * i + k + 16] = std::imag(ampl); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, + fp_type im) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (32 * (j / 16)) + (j % 16); + state.get()[row_size * i + k] = re; + state.get()[row_size * i + k + 16] = im; + } +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARYSPACE_AVX512_H_ diff --git a/qsim/unitaryspace_basic.h b/qsim/unitaryspace_basic.h new file mode 100644 index 0000000..2db14b6 --- /dev/null 
+++ b/qsim/unitaryspace_basic.h @@ -0,0 +1,103 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARYSPACE_BASIC_H_ +#define UNITARYSPACE_BASIC_H_ + +#include +#include +#include + +#include "unitaryspace.h" +#include "vectorspace.h" + +namespace qsim { + +namespace unitary { + +/** + * Object containing context and routines for unitary manipulations. + * Unitary is a non-vectorized sequence of one real amplitude followed by + * one imaginary amplitude. + */ +template +struct UnitarySpaceBasic + : public UnitarySpace, VectorSpace, For, FP> { + private: + using Base = UnitarySpace, + qsim::VectorSpace, For, FP>; + + public: + using Unitary = typename Base::Unitary; + using fp_type = typename Base::fp_type; + + template + explicit UnitarySpaceBasic(ForArgs&&... args) : Base(args...) 
{} + + static uint64_t MinRowSize(unsigned num_qubits) { + return 2 * (uint64_t{1} << num_qubits); + }; + + static uint64_t MinSize(unsigned num_qubits) { + return Base::Size(num_qubits) * MinRowSize(num_qubits); + }; + + void SetAllZeros(Unitary& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + p[2 * i + 0] = 0; + p[2 * i + 1] = 0; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get()); + } + + void SetIdentity(Unitary& state) { + SetAllZeros(state); + + auto f = [](unsigned n, unsigned m, uint64_t i, + uint64_t row_size, fp_type* p) { + p[row_size * i + 2 * i] = 1; + }; + + uint64_t size = Base::Size(state.num_qubits()); + uint64_t row_size = MinRowSize(state.num_qubits()); + Base::for_.Run(size, f, row_size, state.get()); + } + + static std::complex GetEntry(const Unitary& state, + uint64_t i, uint64_t j) { + uint64_t row_size = MinRowSize(state.num_qubits()); + return std::complex(state.get()[row_size * i + 2 * j], + state.get()[row_size * i + 2 * j + 1]); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, + const std::complex& ampl) { + uint64_t row_size = MinRowSize(state.num_qubits()); + state.get()[row_size * i + 2 * j] = std::real(ampl); + state.get()[row_size * i + 2 * j + 1] = std::imag(ampl); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, + fp_type re, fp_type im) { + uint64_t row_size = MinRowSize(state.num_qubits()); + state.get()[row_size * i + 2 * j] = re; + state.get()[row_size * i + 2 * j + 1] = im; + } +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARYSPACE_BASIC_H_ diff --git a/qsim/unitaryspace_sse.h b/qsim/unitaryspace_sse.h new file mode 100644 index 0000000..f3762fb --- /dev/null +++ b/qsim/unitaryspace_sse.h @@ -0,0 +1,112 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARYSPACE_SSE_H_ +#define UNITARYSPACE_SSE_H_ + +#include + +#include +#include +#include +#include + +#include "unitaryspace.h" +#include "vectorspace.h" + +namespace qsim { + +namespace unitary { + +/** + * Object containing context and routines for unitary manipulations. + * Unitary is a vectorized sequence of four real components followed by four + * imaginary components. Four single-precison floating numbers can be loaded + * into an SSE register. + */ +template +struct UnitarySpaceSSE : + public UnitarySpace, VectorSpace, For, float> { + private: + using Base = UnitarySpace, + qsim::VectorSpace, For, float>; + + public: + using Unitary = typename Base::Unitary; + using fp_type = typename Base::fp_type; + + template + explicit UnitarySpaceSSE(ForArgs&&... args) : Base(args...) 
{} + + static uint64_t MinRowSize(unsigned num_qubits) { + return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits)); + }; + + static uint64_t MinSize(unsigned num_qubits) { + return Base::Size(num_qubits) * MinRowSize(num_qubits); + }; + + void SetAllZeros(Unitary& state) const { + __m128 val0 = _mm_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) { + _mm_store_ps(p + 8 * i, val0); + _mm_store_ps(p + 8 * i + 4, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get()); + } + + void SetIdentity(Unitary& state) { + SetAllZeros(state); + + auto f = [](unsigned n, unsigned m, uint64_t i, + uint64_t row_size, fp_type* p) { + p[row_size * i + (8 * (i / 4)) + (i % 4)] = 1; + }; + + uint64_t size = Base::Size(state.num_qubits()); + uint64_t row_size = MinRowSize(state.num_qubits()); + Base::for_.Run(size, f, row_size, state.get()); + } + + static std::complex GetEntry(const Unitary& state, + uint64_t i, uint64_t j) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (8 * (j / 4)) + (j % 4); + return std::complex(state.get()[row_size * i + k], + state.get()[row_size * i + k + 4]); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, + const std::complex& ampl) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (8 * (j / 4)) + (j % 4); + state.get()[row_size * i + k] = std::real(ampl); + state.get()[row_size * i + k + 4] = std::imag(ampl); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, + fp_type im) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (8 * (j / 4)) + (j % 4); + state.get()[row_size * i + k] = re; + state.get()[row_size * i + k + 4] = im; + } +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARYSPACE_SSE_H_ diff --git a/qsim/util.h b/qsim/util.h new file mode 100644 index 0000000..726a019 --- /dev/null +++ b/qsim/util.h @@ -0,0 +1,89 @@ +// Copyright 2019 Google 
LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTIL_H_ +#define UTIL_H_ + +#include +#include +#include +#include +#include +#include +#include + +namespace qsim { + +template +inline void SplitString( + const std::string& str, char delim, Container& words) { + words.resize(0); + + std::string word; + std::stringstream ss(str); + + while (std::getline(ss, word, delim)) { + words.push_back(std::move(word)); + } +} + +template +inline void SplitString( + const std::string& str, char delim, Op op, Container& words) { + words.resize(0); + + std::string word; + std::stringstream ss(str); + + while (std::getline(ss, word, delim)) { + words.push_back(op(word)); + } +} + +inline double GetTime() { + using namespace std::chrono; + steady_clock::duration since_epoch = steady_clock::now().time_since_epoch(); + return double(since_epoch.count() * steady_clock::period::num) + / steady_clock::period::den; +} + +template +inline DistrRealType RandomValue(RGen& rgen, DistrRealType max_value) { + std::uniform_real_distribution distr(0.0, max_value); + return distr(rgen); +} + +template +inline std::vector GenerateRandomValues( + uint64_t num_samples, unsigned seed, DistrRealType max_value) { + std::vector rs; + rs.reserve(num_samples + 1); + + std::mt19937 rgen(seed); + std::uniform_real_distribution distr(0.0, max_value); + + for (uint64_t i = 0; i < num_samples; ++i) { + rs.emplace_back(distr(rgen)); + } + + std::sort(rs.begin(), 
rs.end()); + // Populate the final element to prevent sanitizer errors. + rs.emplace_back(max_value); + + return rs; +} + +} // namespace qsim + +#endif // UTIL_H_ diff --git a/qsim/util_cpu.h b/qsim/util_cpu.h new file mode 100644 index 0000000..8e02425 --- /dev/null +++ b/qsim/util_cpu.h @@ -0,0 +1,43 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTIL_CPU_H_ +#define UTIL_CPU_H_ + +#ifdef __SSE2__ +# include +#endif + +namespace qsim { + +// This function sets flush-to-zero and denormals-are-zeros MXCSR control +// flags. This prevents rare cases of performance slowdown potentially at +// the cost of a tiny precision loss. +inline void SetFlushToZeroAndDenormalsAreZeros() { +#ifdef __SSE2__ + _mm_setcsr(_mm_getcsr() | 0x8040); +#endif +} + +// This function clears flush-to-zero and denormals-are-zeros MXCSR control +// flags. +inline void ClearFlushToZeroAndDenormalsAreZeros() { +#ifdef __SSE2__ + _mm_setcsr(_mm_getcsr() & ~unsigned{0x8040}); +#endif +} + +} // namespace qsim + +#endif // UTIL_CPU_H_ diff --git a/qsim/util_cuda.h b/qsim/util_cuda.h new file mode 100644 index 0000000..5d8cb5d --- /dev/null +++ b/qsim/util_cuda.h @@ -0,0 +1,128 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTIL_CUDA_H_ +#define UTIL_CUDA_H_ + +#ifdef __NVCC__ + #include +#elif __HIP__ + #include +#endif + +#include + +#include "io.h" + +namespace qsim { + +#define ErrorCheck(code) { ErrorAssert((code), __FILE__, __LINE__); } + +inline void ErrorAssert(cudaError_t code, const char* file, unsigned line) { + if (code != cudaSuccess) { + IO::errorf("CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line); + exit(code); + } +} + +template +struct Complex { + __host__ __device__ __forceinline__ Complex() {} + + __host__ __device__ __forceinline__ Complex(const T& re) : re(re), im(0) {} + + __host__ __device__ __forceinline__ Complex(const T& re, const T& im) + : re(re), im(im) {} + + template + __host__ __device__ __forceinline__ Complex& operator=( + const Complex& r) { + re = r.re; + im = r.im; + + return *this; + } + + T re; + T im; +}; + +template +__host__ __device__ __forceinline__ Complex operator+( + const Complex& l, const Complex& r) { + return Complex(l.re + r.re, l.im + r.im); +} + +template +__host__ __device__ __forceinline__ Complex operator+( + const Complex& l, const Complex& r) { + return Complex(l.re + r.re, l.im + r.im); +} + +template +struct Scalar { + using type = T; +}; + +template +struct Scalar> { + using type = T; +}; + +template +struct Plus { + template + __device__ __forceinline__ T operator()(const T& v1, const U& v2) const { + return v1 + v2; + } +}; + +template +struct Product { + __device__ __forceinline__ Complex operator()( + const T& re1, const T& im1, const T& re2, const T& im2) const { + return 
Complex(re1 * re2 + im1 * im2, re1 * im2 - im1 * re2); + } +}; + +template +struct RealProduct { + __device__ __forceinline__ T operator()( + const T& re1, const T& im1, const T& re2, const T& im2) const { + return re1 * re2 + im1 * im2; + } +}; + +template +__device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) { + for (unsigned i = warp_size / 2; i > 0; i /= 2) { + val = op(val, __shfl_down_sync(0xffffffff, val, i)); + } + + return val; +} + +template +__device__ __forceinline__ Complex WarpReduce(Complex val, Op op) { + for (unsigned i = warp_size / 2; i > 0; i /= 2) { + val.re = op(val.re, __shfl_down_sync(0xffffffff, val.re, i)); + val.im = op(val.im, __shfl_down_sync(0xffffffff, val.im, i)); + } + + return val; +} + +} // namespace qsim + +#endif // UTIL_CUDA_H_ diff --git a/qsim/util_custatevec.h b/qsim/util_custatevec.h new file mode 100644 index 0000000..36f29ef --- /dev/null +++ b/qsim/util_custatevec.h @@ -0,0 +1,44 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef UTIL_CUSTATEVEC_H_ +#define UTIL_CUSTATEVEC_H_ + +#include +#include + +#include "io.h" +#include "util_cuda.h" + +namespace qsim { + +inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) { + if (code != CUBLAS_STATUS_SUCCESS) { + IO::errorf("cuBLAS error %i: %s %d\n", code, file, line); + exit(code); + } +} + +inline void ErrorAssert( + custatevecStatus_t code, const char* file, unsigned line) { + if (code != CUSTATEVEC_STATUS_SUCCESS) { + IO::errorf("custatevec error: %s %s %d\n", + custatevecGetErrorString(code), file, line); + exit(code); + } +} + +} // namespace qsim + +#endif // UTIL_CUSTATEVEC_H_ diff --git a/qsim/vectorspace.h b/qsim/vectorspace.h new file mode 100644 index 0000000..7b33a53 --- /dev/null +++ b/qsim/vectorspace.h @@ -0,0 +1,185 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef VECTORSPACE_H_ +#define VECTORSPACE_H_ + +#ifdef _WIN32 + #include +#endif + +#include +#include +#include +#include + +namespace qsim { + +namespace detail { + +inline void do_not_free(void*) {} + +inline void free(void* ptr) { +#ifdef _WIN32 + _aligned_free(ptr); +#else + ::free(ptr); +#endif +} + +} // namespace detail + +// Routines for vector manipulations. 
+template +class VectorSpace { + public: + using fp_type = FP; + + private: + using Pointer = std::unique_ptr; + + public: + class Vector { + public: + Vector() = delete; + + Vector(Pointer&& ptr, unsigned num_qubits) + : ptr_(std::move(ptr)), num_qubits_(num_qubits) {} + + fp_type* get() { + return ptr_.get(); + } + + const fp_type* get() const { + return ptr_.get(); + } + + fp_type* release() { + num_qubits_ = 0; + return ptr_.release(); + } + + unsigned num_qubits() const { + return num_qubits_; + } + + bool requires_copy_to_host() const { + return false; + } + + private: + Pointer ptr_; + unsigned num_qubits_; + }; + + template + VectorSpace(ForArgs&&... args) : for_(args...) {} + + static Vector Create(unsigned num_qubits) { + auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); + #ifdef _WIN32 + Pointer ptr{(fp_type*) _aligned_malloc(size, 64), &detail::free}; + return Vector{std::move(ptr), ptr.get() != nullptr ? num_qubits : 0}; + #else + void* p = nullptr; + if (posix_memalign(&p, 64, size) == 0) { + return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; + } else { + return Null(); + } + #endif + } + + // It is the client's responsibility to make sure that p has at least + // Impl::MinSize(num_qubits) elements. 
+ static Vector Create(fp_type* p, unsigned num_qubits) { + return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; + } + + static Vector Null() { + return Vector{Pointer{nullptr, &detail::free}, 0}; + } + + static bool IsNull(const Vector& vec) { + return vec.get() == nullptr; + } + + static void Free(fp_type* ptr) { + detail::free(ptr); + } + + bool Copy(const Vector& src, Vector& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* src, fp_type* dest) { + dest[i] = src[i]; + }; + + for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest.get()); + + return true; + } + + // It is the client's responsibility to make sure that dest has at least + // Impl::MinSize(src.num_qubits()) elements. + bool Copy(const Vector& src, fp_type* dest) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* src, fp_type* dest) { + dest[i] = src[i]; + }; + + for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest); + + return true; + } + + // It is the client's responsibility to make sure that src has at least + // Impl::MinSize(dest.num_qubits()) elements. + bool Copy(const fp_type* src, Vector& dest) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* src, fp_type* dest) { + dest[i] = src[i]; + }; + + for_.Run(Impl::MinSize(dest.num_qubits()), f, src, dest.get()); + + return true; + } + + // It is the client's responsibility to make sure that src has at least + // min(size, Impl::MinSize(dest.num_qubits())) elements. 
+ bool Copy(const fp_type* src, uint64_t size, Vector& dest) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* src, fp_type* dest) { + dest[i] = src[i]; + }; + + size = std::min(size, Impl::MinSize(dest.num_qubits())); + for_.Run(size, f, src, dest.get()); + + return true; + } + + void DeviceSync() {} + + protected: + For for_; +}; + +} // namespace qsim + +#endif // VECTORSPACE_H_ diff --git a/qsim/vectorspace_cuda.h b/qsim/vectorspace_cuda.h new file mode 100644 index 0000000..fd91553 --- /dev/null +++ b/qsim/vectorspace_cuda.h @@ -0,0 +1,172 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef VECTORSPACE_CUDA_H_ +#define VECTORSPACE_CUDA_H_ + +#ifdef __NVCC__ + #include + #include +#elif __HIP__ + #include + #include "cuda2hip.h" +#endif + +#include +#include + +namespace qsim { + +namespace detail { + +inline void do_not_free(void*) {} + +inline void free(void* ptr) { + ErrorCheck(cudaFree(ptr)); +} + +} // namespace detail + +// Routines for vector manipulations. 
+template +class VectorSpaceCUDA { + public: + using fp_type = FP; + + private: + using Pointer = std::unique_ptr; + + public: + class Vector { + public: + Vector() = delete; + + Vector(Pointer&& ptr, unsigned num_qubits) + : ptr_(std::move(ptr)), num_qubits_(num_qubits) {} + + fp_type* get() { + return ptr_.get(); + } + + const fp_type* get() const { + return ptr_.get(); + } + + fp_type* release() { + num_qubits_ = 0; + return ptr_.release(); + } + + unsigned num_qubits() const { + return num_qubits_; + } + + bool requires_copy_to_host() const { + return true; + } + + private: + Pointer ptr_; + unsigned num_qubits_; + }; + + template + VectorSpaceCUDA(Args&&... args) {} + + static Vector Create(unsigned num_qubits) { + fp_type* p; + auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); + auto rc = cudaMalloc(&p, size); + + if (rc == cudaSuccess) { + return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; + } else { + return Null(); + } + } + + // It is the client's responsibility to make sure that p has at least + // Impl::MinSize(num_qubits) elements. + static Vector Create(fp_type* p, unsigned num_qubits) { + return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; + } + + static Vector Null() { + return Vector{Pointer{nullptr, &detail::free}, 0}; + } + + static bool IsNull(const Vector& vector) { + return vector.get() == nullptr; + } + + static void Free(fp_type* ptr) { + detail::free(ptr); + } + + bool Copy(const Vector& src, Vector& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + ErrorCheck( + cudaMemcpy(dest.get(), src.get(), + sizeof(fp_type) * Impl::MinSize(src.num_qubits()), + cudaMemcpyDeviceToDevice)); + + return true; + } + + // It is the client's responsibility to make sure that dest has at least + // Impl::MinSize(src.num_qubits()) elements. 
+ bool Copy(const Vector& src, fp_type* dest) const { + ErrorCheck( + cudaMemcpy(dest, src.get(), + sizeof(fp_type) * Impl::MinSize(src.num_qubits()), + cudaMemcpyDeviceToHost)); + + return true; + } + + // It is the client's responsibility to make sure that src has at least + // Impl::MinSize(dest.num_qubits()) elements. + bool Copy(const fp_type* src, Vector& dest) const { + ErrorCheck( + cudaMemcpy(dest.get(), src, + sizeof(fp_type) * Impl::MinSize(dest.num_qubits()), + cudaMemcpyHostToDevice)); + + return true; + } + + // It is the client's responsibility to make sure that src has at least + // min(size, Impl::MinSize(dest.num_qubits())) elements. + bool Copy(const fp_type* src, uint64_t size, Vector& dest) const { + size = std::min(size, Impl::MinSize(dest.num_qubits())); + ErrorCheck( + cudaMemcpy(dest.get(), src, + sizeof(fp_type) * size, + cudaMemcpyHostToDevice)); + return true; + } + + void DeviceSync() { + ErrorCheck(cudaDeviceSynchronize()); + } + + protected: +}; + +} // namespace qsim + +#endif // VECTORSPACE_CUDA_H_ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 845a29a..b01bf2f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -12,8 +12,13 @@ qiree_configure_file("qiree_config.h.in" "qiree_config.h" @ONLY) #----------------------------------------------------------------------------# add_subdirectory(qiree) + if(QIREE_USE_XACC) add_subdirectory(qirxacc) endif() +if(QIREE_USE_QSIM) + add_subdirectory(qirqsim) +endif() + #---------------------------------------------------------------------------## diff --git a/src/qirqsim/BufferManager.cc b/src/qirqsim/BufferManager.cc new file mode 100644 index 0000000..2e6f646 --- /dev/null +++ b/src/qirqsim/BufferManager.cc @@ -0,0 +1,33 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qirqsim/BufferManager.cc +//---------------------------------------------------------------------------// + +#include "BufferManager.hh" +#include +#include +#include + +void BufferManager::updateBuffer(const std::string& qubit, const std::string& state, const int& value) { + // Insert or update the key-value pair in the buffer + std::pair searchKey = {qubit, state}; + int current_frequency = 0; + auto it = buffer.find(searchKey); + if (it != buffer.end()){ + current_frequency = it -> second; + } + // Accumulate counts with every shot + buffer[{qubit, state}] = value + current_frequency; +} + +std::optional BufferManager::getBufferValue(const std::string& qubit, const std::string& state) const { + std::pair searchKey = {qubit, state}; + auto it = buffer.find(searchKey); + if (it != buffer.end()) { + return it->second; // Key found + } + return std::nullopt; // Key not found +} \ No newline at end of file diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh new file mode 100644 index 0000000..dc03846 --- /dev/null +++ b/src/qirqsim/BufferManager.hh @@ -0,0 +1,45 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! 
\file qirqsim/BufferManager.hh +//---------------------------------------------------------------------------// + +#ifndef BUFFER_MANAGER_H +#define BUFFER_MANAGER_H + +#include +#include +#include +#include +#include + +// Define a hash function for std::pair + +struct pair_hash { + template + std::size_t operator()(const std::pair& pair) const { + auto hash1 = std::hash{}(pair.first); + auto hash2 = std::hash{}(pair.second); + // Combine the two hash values + return hash1 ^ (hash2 << 1); // Shift and XOR + } +}; + +class BufferManager { +public: + + // Method to update the buffer with a key-value pair + void updateBuffer(const std::string& qubit, const std::string& state, const int& value); + + // Retrieve buffer value for storage or evaluation + std::optional getBufferValue(const std::string& qubit, const std::string& state) const; + +private: + + // Dictionary to store key-value pairs + std::unordered_map, int, pair_hash> buffer; +}; + +#endif // BUFFER_MANAGER_H diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt new file mode 100644 index 0000000..09a0511 --- /dev/null +++ b/src/qirqsim/CMakeLists.txt @@ -0,0 +1,29 @@ +#---------------------------------*-CMake-*----------------------------------# +# Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +# See the top-level COPYRIGHT file for details. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#----------------------------------------------------------------------------# + +# Adding qsim as a library to qiree +qiree_add_library(qirqsim + qsimQuantum.cc + qsimDefaultRuntime.cc + qsimTupleRuntime.cc + BufferManager.cc +) + +#Link the qsim library to qiree and any other relevant libraries +target_link_libraries(qirqsim + PUBLIC QIREE::qiree # Link to qiree +) + +#----------------------------------------------------------------------------# +# HEADERS +#----------------------------------------------------------------------------# + +# Install headers, matching the relevant .hh files for qsim integration +install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/qirqsim" + COMPONENT development + FILES_MATCHING REGEX ".*\\.hh?$" +) diff --git a/src/qirqsim/qsimDefaultRuntime.cc b/src/qirqsim/qsimDefaultRuntime.cc new file mode 100644 index 0000000..955959d --- /dev/null +++ b/src/qirqsim/qsimDefaultRuntime.cc @@ -0,0 +1,71 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qirqsim/qsimDefaultRuntime.cc +//---------------------------------------------------------------------------// +#include "qsimDefaultRuntime.hh" +#include +#include "qiree/Assert.hh" + +namespace qiree +{ +//---------------------------------------------------------------------------// +/*! + * Initialize the execution environment, resetting qubits. + */ + +void qsimDefaultRuntime::initialize(OptionalCString env) +{ + if (env) + { + output_ << "Argument to initialize: " << env << std::endl; + } +} + +//---------------------------------------------------------------------------// +/*! 
+ * Execute circuit and mark the following N results as being part of an array + * named tag + */ + +void qsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag) +{ + //this->execute_if_needed(); + //output_ << "array " << (tag ? tag : "") << " length " << s + // << std::endl; +} + +//---------------------------------------------------------------------------// +/*! + * Execute circuit and mark the following N results as being part of a tuple + * named tag + */ + +void qsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag) +{ + //this->execute_if_needed(); + //output_ << "tuple " << (tag ? tag : "") << " length " << s + // << std::endl; +} + +//---------------------------------------------------------------------------// +/*! + * Execute circuit and report a single measurement result + */ +void qsimDefaultRuntime::result_record_output(Result r, OptionalCString tag) +{ + // Access values through the getter + // TODO: This prints results 'every time' result_record_output is called. Maybe enough to only print the 'final time' + + if (auto value = sim_.manager.getBufferValue("q"+std::to_string(r.value), "0"); value.has_value()) { + std::cout << "q" << std::to_string(r.value) << " |0> freq: " << value.value() << "\n"; + } + + if (auto value = sim_.manager.getBufferValue("q"+std::to_string(r.value), "1"); value.has_value()) { + std::cout << "q" << std::to_string(r.value) << " |1> freq: " << value.value() << "\n"; + } +} + +} // namespace qiree diff --git a/src/qirqsim/qsimDefaultRuntime.hh b/src/qirqsim/qsimDefaultRuntime.hh new file mode 100644 index 0000000..70dfdd4 --- /dev/null +++ b/src/qirqsim/qsimDefaultRuntime.hh @@ -0,0 +1,61 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qirqsim/qsimDefaultRuntime.hh +//---------------------------------------------------------------------------// +#pragma once + +#include "qsimQuantum.hh" + +namespace qiree +{ + +/*! + * Print per-qubit measurement statistics. + * + * Example for three qubits: + * \code + * q0 |0> freq: 509 + * q0 |1> freq: 515 + * q1 |0> freq: 509 + * q1 |1> freq: 515 + * q2 |1> freq: 1024 + * \endcode + */ + +class qsimDefaultRuntime final : virtual public RuntimeInterface +{ + public: + /*! + * Construct \c qsimDefaultRuntime. + */ + qsimDefaultRuntime(std::ostream& output, + qsimQuantum& sim + ) + : output_(output), sim_(sim) + { + } + + //!@{ + //! \name Runtime interface + // Initialize the execution environment, resetting qubits + void initialize(OptionalCString env) override; + + //! Mark the following N results as being part of an array named tag + void array_record_output(size_type, OptionalCString tag) final; + + //! Mark the following N results as being part of a tuple named tag + void tuple_record_output(size_type, OptionalCString) final; + + // Save one result + void result_record_output(Result result, OptionalCString tag) final; + //!@} + + private: + std::ostream& output_; + qsimQuantum& sim_; +}; + +} // namespace qiree diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/qsimQuantum.cc new file mode 100644 index 0000000..81f40ef --- /dev/null +++ b/src/qirqsim/qsimQuantum.cc @@ -0,0 +1,218 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! 
\file qirxacc/qsimQuantum.cc +//---------------------------------------------------------------------------// + +#include "qsimQuantum.hh" + +#include +#include +#include +#include +#include +#include +#include + +#include "qiree/Assert.hh" + +// Qsim +#include "../../tpls/qsim/simulator_basic.h" +#include "../../tpls/qsim/statespace_basic.h" +#include "../../tpls/qsim/gates_qsim.h" +#include "../../tpls/qsim/circuit.h" +#include "../../tpls/qsim/run_qsim.h" +#include "../../tpls/qsim/io.h" +#include "../../tpls/qsim/fuser.h" +#include "../../tpls/qsim/circuit_qsim_parser.h" +#include "../../tpls/qsim/fuser_mqubit.h" +#include "../../tpls/qsim/io_file.h" +#include "../../tpls/qsim/simmux.h" +#include "../../tpls/qsim/util_cpu.h" +#include "../../tpls/qsim/formux.h" +#include "../../tpls/qsim/gate.h" +// + +namespace qiree{ +//---------------------------------------------------------------------------// +/* +Initialize the qsim simulator +*/ + +qsimQuantum::State qsimQuantum::init_state_space() { //check if StateSpace is the proper type for the output, problably it is just State from the Fatory struct. 
+ std::srand(static_cast(std::time(nullptr))); // Seed the random number generator + qsimParam.seed = std::rand(); // Set the seed for qsim parameters + numThreads = std::max(1, static_cast(std::thread::hardware_concurrency())); // Get the number of threads + qsimParam.max_fused_size = 2; // Set the maximum size of fused gates + qsimParam.verbosity = 0; // see verbosity in run_qsim.h + // Initialize the qsim simulator + qsimQuantum::StateSpace state_space = Factory(numThreads).CreateStateSpace(); // Create the state space + State state = state_space.Create(this->num_qubits()); // Create the state + // Check if the state is null + if (state_space.IsNull(state)) { + qsim::IO::errorf("not enough memory: is the number of qubits too large?\n"); + } + state_space.SetStateZero(state); // Set the state to zero, TODO: the initial state is not necessarily zero + return state; + } + + qsimQuantum::qsimQuantum(std::ostream& os, + size_type shots) + : output_(os) + { + } + +//---------------------------------------------------------------------------// +/* +Prepare to build a quantum circuit for an entry point +*/ +void qsimQuantum::set_up(EntryPointAttrs const& attrs) { + QIREE_VALIDATE(attrs.required_num_qubits > 0, + << "input is not a quantum program"); + // Resize the result_to_qubit_ vector, based on the required number of results... + // the idea is to have as many classical registers as qubits (probably not true in general) + result_to_qubit_.resize(attrs.required_num_results); + num_qubits_ = attrs.required_num_qubits; // Set the number of qubits + state_ = std::make_shared(init_state_space()); // Set the state space? Maybe. 
+ q_circuit.num_qubits = num_qubits_; // Allocate the number of qubits in the circuit + execution_time = 0; // Initialize execution time + +} + +//---------------------------------------------------------------------------// +/* +Complete an execution +*/ +void qsimQuantum::tear_down() { + q_circuit = {}; + q_circuit.num_qubits = num_qubits_; + state_ = std::make_shared(init_state_space()); +} + +//---------------------------------------------------------------------------// +/* +Reset the qubit +*/ +void qsimQuantum::reset(Qubit q) { + q.value=0; +} + +//----------------------------------------------------------------------------// +/* +Read the value of a result. This utilizes the new BufferManager. +*/ +QState qsimQuantum::read_result(Result r) +{ + std::string q_index_string = std::to_string(r.value); + auto meas_results = execute_if_needed(); + if (meas_results.size() == 1 && meas_results[0].bitstring.size() == 1) { + const auto bitResult = meas_results[0].bitstring[0]; + assert(bitResult == 0 || bitResult == 1); + std::string stringResult = std::to_string(bitResult); + if (stringResult == "1"){ + manager.updateBuffer("q"+q_index_string, "1", 1); + } else{ + manager.updateBuffer("q"+q_index_string, "0", 1); + } + } else { + qsim::IO::errorf("Unexpected measurement results encountered."); + } + return static_cast(meas_results[0].bitstring[0]); +} + +//---------------------------------------------------------------------------// +/* +Map a qubit to a result index +(TODO: find how to link the classical register to the quantum register in qsim) +*/ +void qsimQuantum::mz(Qubit q, Result r) { //we don't classical register yet. + QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in the set of qubits, e.g., what happens if q=5 and qubits are {2,3,4,5}, q is less than num_qubits but not it is in the set of qubits. 
+ // Add measurement instruction + this->q_circuit.gates.push_back( + qsim::gate::Measurement>::Create( + execution_time++, {this->getQubitIndex(q)})); +} + +//---------------------------------------------------------------------------// +/* +Quantum Instruction Mapping +*/ +// 1. Entangling gates +void qsimQuantum::cx(Qubit q1, Qubit q2) { + q_circuit.gates.push_back( + qsim::GateCNot::Create(execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); +} +void qsimQuantum::cnot(Qubit q1, Qubit q2) { + q_circuit.gates.push_back( + qsim::GateCNot::Create(execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); +} +void qsimQuantum::cz(Qubit q1, Qubit q2) { + q_circuit.gates.push_back( + qsim::GateCZ::Create(execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); +} +// 2. Local gates +void qsimQuantum::h(Qubit q) { + q_circuit.gates.push_back( + qsim::GateHd::Create(execution_time++, this->getQubitIndex(q))); +} +void qsimQuantum::s(Qubit q) { + q_circuit.gates.push_back( + qsim::GateS::Create(execution_time++, this->getQubitIndex(q))); +} +void qsimQuantum::t(Qubit q) { + q_circuit.gates.push_back( + qsim::GateT::Create(execution_time++, this->getQubitIndex(q))); +} +// 2.1 Pauli gates +void qsimQuantum::x(Qubit q) { + q_circuit.gates.push_back( + qsim::GateX::Create(execution_time++, this->getQubitIndex(q))); +} +void qsimQuantum::y(Qubit q) { + q_circuit.gates.push_back( + qsim::GateY::Create(execution_time++, this->getQubitIndex(q))); +} +void qsimQuantum::z(Qubit q) { + q_circuit.gates.push_back( + qsim::GateZ::Create(execution_time++, this->getQubitIndex(q))); +} +// 2.2 rotation gates +void qsimQuantum::rx(double theta, Qubit q) { + q_circuit.gates.push_back( + qsim::GateRX::Create(execution_time++, this->getQubitIndex(q), theta)); +} +void qsimQuantum::ry(double theta, Qubit q) { + q_circuit.gates.push_back( + qsim::GateRY::Create(execution_time++, this->getQubitIndex(q), theta)); +} +void qsimQuantum::rz(double theta, 
Qubit q) { + q_circuit.gates.push_back( + qsim::GateRZ::Create(execution_time++, this->getQubitIndex(q), theta)); +} + +Qubit qsimQuantum::result_to_qubit(Result r) { + // TODO: This function is not working. Giving 0 every time. Maybe not needed. + QIREE_EXPECT(r.value < this->num_results()); + return result_to_qubit_[r.value]; // just copied this from the qirxacc, I have no idea if we need to do something else here +} + +void qsimQuantum::print_accelbuf() { + // TODO: to be implemented, we can create a buffer class to store the results +} + +qsimQuantum::VecMeas qsimQuantum::execute_if_needed() { + std::vector meas_results; // Vector to hold measurement results, this must be empty before running + std::string stringResult; + static unsigned long int seed = 0; + qsimParam.seed = seed++; + const bool run_success = Runner::Run(qsimParam, Factory(numThreads), q_circuit, *state_, meas_results); // Run the simulation + assert(run_success); // Ensure the run was successful + // reset circuit here + q_circuit = {}; + q_circuit.num_qubits = num_qubits_; + return meas_results; +} + +} // namespace qiree diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/qsimQuantum.hh new file mode 100644 index 0000000..e720e8c --- /dev/null +++ b/src/qirqsim/qsimQuantum.hh @@ -0,0 +1,175 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! 
\file qirqsim/qsimQuantum.hh +//---------------------------------------------------------------------------// +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "qiree/Macros.hh" +#include "qiree/QuantumNotImpl.hh" +#include "qiree/RuntimeInterface.hh" +#include "qiree/Types.hh" +#include "BufferManager.hh" + +#include "../../tpls/qsim/simulator_basic.h" +#include "../../tpls/qsim/statespace_basic.h" +#include "../../tpls/qsim/gates_qsim.h" +#include "../../tpls/qsim/circuit.h" +#include "../../tpls/qsim/run_qsim.h" +#include "../../tpls/qsim/io.h" +#include "../../tpls/qsim/fuser.h" +#include "../../tpls/qsim/circuit_qsim_parser.h" +#include "../../tpls/qsim/fuser_mqubit.h" +#include "../../tpls/qsim/io_file.h" +#include "../../tpls/qsim/simmux.h" +#include "../../tpls/qsim/util_cpu.h" +#include "../../tpls/qsim/formux.h" +#include "../../tpls/qsim/gate.h" + +struct Factory { // Factory class for creating simulators in qsim + Factory(unsigned num_threads) : num_threads(num_threads) {} + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + StateSpace CreateStateSpace() const { return StateSpace(num_threads); } + Simulator CreateSimulator() const { return Simulator(num_threads); } + unsigned num_threads; +}; + +namespace qiree +{ + class qsimQuantum final : virtual public QuantumNotImpl + { + + public: + + // Define constructors and destructors + qsimQuantum(std::ostream& os, size_type shots); // Construct with number of shots + + // Define types + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + using State = StateSpace::State; + using Fuser = qsim::MultiQubitGateFuser>; + using Runner = qsim::QSimRunner; + using VecMeas = std::vector; + + State init_state_space(); + + QIREE_DELETE_COPY_MOVE(qsimQuantum); // Delete copy and move constructors + + //!@{ + //! 
\name Accessors + size_type num_results() const { return result_to_qubit_.size(); } + size_type num_qubits() const { return num_qubits_; } + + unsigned getQubitIndex(Qubit q) { + return static_cast(q.value); // Return the value of the qubit + } + //!@} + + //!@{ + //! \name Quantum interface + // Prepare to build a quantum circuit for an entry point + void set_up(EntryPointAttrs const&) override; + + // Complete an execution + void tear_down() override; + + // Map a qubit to a result index + void mz(Qubit, Result) final; + + // Read the value of a result. + QState read_result(Result) final; + //!@} + + //!@{ + //! \name Utilities for runtime + // Get runtime qubit corresponding to a runtime result + Qubit result_to_qubit(Result); + + // Wrapper for qsim + //std::map + //get_marginal_counts(std::vector const& qubits); + + // Run the circuit on the accelerator if we have not already. Returns true + // if the circuit was executed. + VecMeas execute_if_needed(); + + void print_accelbuf(); + //!@} + + //!@{ + //! 
\name Circuit construction + // void ccx(Qubit, Qubit) final; + void ccnot(Qubit, Qubit, Qubit); // TODO: not in examples or qir runner + void cnot(Qubit, Qubit) final; + void cx(Qubit, Qubit) final; + // void cy(Qubit, Qubit) final; + void cz(Qubit, Qubit) final; + void h(Qubit) final; + void reset(Qubit) final; + void rx(double, Qubit) final; + void ry(double, Qubit) final; + void rz(double, Qubit) final; + // void rzz(double, Qubit, Qubit) final; + void s(Qubit) final; + // void s_adj(Qubit) final; + // void swap(Qubit, Qubit) final; + void t(Qubit) final; + // void t_adj(Qubit) final; + void x(Qubit) final; + void y(Qubit) final; + void z(Qubit) final; + //!@} + + // Get the quantum circuit + qsim::Circuit> get_circuit() const { return q_circuit; } + // Get the state space + State const& get_state() const { return *state_; } + // update the buffer + BufferManager manager; + + private: + //// TYPES //// + enum class Endianness + { + little, + big + }; + unsigned numThreads; // Number of threads to use + unsigned max_fused_size; // Maximum size of fused gates + qsim::Circuit> q_circuit; // Quantum circuit object + + Runner::Parameter qsimParam; // Parameters for qsim + size_t execution_time; // when the quantum operation will be executed + + bool executed; + size_type num_qubits_{}; + std::vector result_to_qubit_; + Endianness endian_; + + std::ostream& output_; + std::shared_ptr simulator_; + std::shared_ptr statespace_; + std::shared_ptr state_; + + }; + + class buffer { + public: + buffer(size_t size) : size(size) {} + size_t size; + }; + +} // namespace qiree + + diff --git a/src/qirqsim/qsimTupleRuntime.cc b/src/qirqsim/qsimTupleRuntime.cc new file mode 100644 index 0000000..5366b79 --- /dev/null +++ b/src/qirqsim/qsimTupleRuntime.cc @@ -0,0 +1,123 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qirqsim/qsimTupleRuntime.cc +//---------------------------------------------------------------------------// +#include "qsimTupleRuntime.hh" + +#include "qiree/Assert.hh" + +namespace qiree +{ +//---------------------------------------------------------------------------// +/*! + * Initialize the execution environment, resetting qubits. + */ +void qsimTupleRuntime::initialize(OptionalCString env) +{ + if (env) + { + output_ << "Argument to initialize: " << env << std::endl; + } +} + +//---------------------------------------------------------------------------// +/*! + * Execute circuit and mark the following N results as being part of an array + * named tag + */ +void qsimTupleRuntime::array_record_output(size_type s, OptionalCString tag) +{ + execute_if_needed(); + start_tracking(GroupingType::array, tag, s); +} + +//---------------------------------------------------------------------------// +/*! + * Execute circuit and mark the following N results as being part of a tuple + * named tag + */ +void qsimTupleRuntime::tuple_record_output(size_type s, OptionalCString tag) +{ + execute_if_needed(); + start_tracking(GroupingType::tuple, tag, s); +} + +//---------------------------------------------------------------------------// +/*! 
+ * Execute circuit and report a single measurement result + */ +void qsimTupleRuntime::result_record_output(Result r, OptionalCString tag) +{ + execute_if_needed(); + Qubit q = sim_.result_to_qubit(r); + push_result(q); +} + +//---------------------------------------------------------------------------// +// PRIVATE FUNCTIONS +//---------------------------------------------------------------------------// + +void qsimTupleRuntime::execute_if_needed() +{ + /* + if (sim_.execute_if_needed() && print_accelbuf_) + { + sim_.print_accelbuf(); + } + */ +} + +void qsimTupleRuntime::start_tracking(GroupingType type, + std::string tag, + size_type num_results) +{ + QIREE_EXPECT(!valid_); + valid_ = true; + type_ = type; + tag_ = tag; + num_results_ = num_results; + qubits_.clear(); + + if (!num_results_) + { + // Edge case + print_header(0); + valid_ = false; + } +} + +void qsimTupleRuntime::push_result(Qubit q) +{ + QIREE_EXPECT(valid_); + QIREE_EXPECT(qubits_.size() < num_results_); + qubits_.push_back(q); + if (qubits_.size() == num_results_) + { + finish_tuple(); + } +} + +void qsimTupleRuntime::print_header(size_type num_distinct) +{ + auto name = get_name(); + output_ << name << " " << tag_ << " length " << qubits_.size() + << " distinct results " << num_distinct << std::endl; +} + +void qsimTupleRuntime::finish_tuple() +{ + //auto counts = sim_.get_marginal_counts(qubits_); + std::map counts = {{"0", 0}, {"1", 0}}; // Placeholder for actual counts, TODO: replace with actual counts + print_header(counts.size()); + auto name = get_name(); + for (auto& [bits, count] : counts) + { + output_ << name << " " << tag_ << " result " << bits << " count " + << count << std::endl; + } + valid_ = false; +} +} // namespace qiree diff --git a/src/qirqsim/qsimTupleRuntime.hh b/src/qirqsim/qsimTupleRuntime.hh new file mode 100644 index 0000000..fa153f4 --- /dev/null +++ b/src/qirqsim/qsimTupleRuntime.hh @@ -0,0 +1,93 @@ 
+//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qirqsim/qsimTupleRuntime.hh +//---------------------------------------------------------------------------// +#pragma once + +#include "qsimQuantum.hh" + +namespace qiree +{ + +/*! + * Print per-tuple (or per-array) measurement statistics. (Compare with \ref + * qsimDefaultRuntime.) + * + * Example: + * \code + * tuple ret length 2 distinct results 2 + * tuple ret result 00 count 512 + * tuple ret result 11 count 512 + * \endcode + */ + +class qsimTupleRuntime final : virtual public RuntimeInterface +{ + public: + /*! + * Construct an \c qsimTupleRuntime. + * The \c print_accelbuf argument determines whether the qsim \c + * AcceleratorBuffer is dumped after execution. + */ + qsimTupleRuntime(std::ostream& output, + qsimQuantum& sim, + bool print_accelbuf = true) + : output_(output) + , sim_(sim) + , print_accelbuf_(print_accelbuf) + , valid_(false) + { + } + + //!@{ + //! 
\name Runtime interface + // Initialize the execution environment, resetting qubits + void initialize(OptionalCString env) override; + + // Execute circuit and mark the following N results as being part of an + // array named tag + void array_record_output(size_type, OptionalCString tag) final; + + // Execute circuit and mark the following N results as being part of a + // tuple named tag + void tuple_record_output(size_type, OptionalCString) final; + + // Execute circuit and report a single measurement result + void result_record_output(Result result, OptionalCString tag) final; + //!@} + + private: + enum class GroupingType + { + tuple, + array, + }; + + std::ostream& output_; + qsimQuantum& sim_; + bool const print_accelbuf_; + bool valid_; + GroupingType type_; + std::string tag_; + size_type num_results_; + std::vector qubits_; + + void execute_if_needed(); + void + start_tracking(GroupingType type, std::string tag, size_type num_results); + void push_result(Qubit q); + void print_header(size_type num_distinct); + void finish_tuple(); + + inline std::string get_name() + { + return type_ == GroupingType::tuple ? "tuple" + : type_ == GroupingType::array ? "array" + : "grouping"; + } +}; + +} // namespace qiree diff --git a/tpls/qsim/bits.h b/tpls/qsim/bits.h new file mode 100644 index 0000000..080c866 --- /dev/null +++ b/tpls/qsim/bits.h @@ -0,0 +1,106 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef BITS_H_ +#define BITS_H_ + +#include + +#ifdef __BMI2__ + +#include + +#include + +namespace qsim { +namespace bits { + +inline uint32_t ExpandBits(uint32_t bits, unsigned n, uint32_t mask) { + return _pdep_u32(bits, mask); +} + +inline uint64_t ExpandBits(uint64_t bits, unsigned n, uint64_t mask) { + return _pdep_u64(bits, mask); +} + +inline uint32_t CompressBits(uint32_t bits, unsigned n, uint32_t mask) { + return _pext_u32(bits, mask); +} + +inline uint64_t CompressBits(uint64_t bits, unsigned n, uint64_t mask) { + return _pext_u64(bits, mask); +} + +} // namespace bits +} // namespace qsim + +#else // __BMI2__ + +namespace qsim { +namespace bits { + +template +inline Integer ExpandBits(Integer bits, unsigned n, Integer mask) { + Integer ebits = 0; + unsigned k = 0; + + for (unsigned i = 0; i < n; ++i) { + if ((mask >> i) & 1) { + ebits |= ((bits >> k) & 1) << i; + ++k; + } + } + + return ebits; +} + +template +inline Integer CompressBits(Integer bits, unsigned n, Integer mask) { + Integer sbits = 0; + unsigned k = 0; + + for (unsigned i = 0; i < n; ++i) { + if ((mask >> i) & 1) { + sbits |= ((bits >> i) & 1) << k; + ++k; + } + } + + return sbits; +} + +} // namespace bits +} // namespace qsim + +#endif // __BMI2__ + +namespace qsim { +namespace bits { + +template +inline Integer PermuteBits( + Integer bits, unsigned n, const std::vector& perm) { + Integer pbits = 0; + + for (unsigned i = 0; i < n; ++i) { + pbits |= ((bits >> i) & 1) << perm[i]; + } + + return pbits; +} + +} // namespace bits +} // namespace qsim + +#endif // BITS_H_ diff --git a/tpls/qsim/bitstring.h b/tpls/qsim/bitstring.h new file mode 100644 index 0000000..b95584b --- /dev/null +++ b/tpls/qsim/bitstring.h @@ -0,0 +1,97 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef BITSTRING_H_ +#define BITSTRING_H_ + +#include +#include +#include +#include + +namespace qsim { + +using Bitstring = uint64_t; + +/** + * Reads bitstrings (representing initialized or measured states of qubits) + * from a provided stream object and stores them in a vector. + * @param num_qubits Number of qubits represented in each bitstring. + * @param provider Source of bitstrings; only used for error reporting. + * @param fs The stream to read bitstrings from. + * @param bitstrings Output vector of bitstrings. On success, this will contain + * all bitstrings read in from 'fs'. + * @return True if reading succeeded; false otherwise. + */ +template +bool BitstringsFromStream(unsigned num_qubits, const std::string& provider, + Stream& fs, std::vector& bitstrings) { + bitstrings.resize(0); + bitstrings.reserve(100000); + + // Bitstrings are in text format. One bitstring per line. + + do { + char buf[128]; + fs.getline(buf, 128); + + if (fs) { + Bitstring b{0}; + + unsigned p = 0; + while (p < 128 && (buf[p] == '0' || buf[p] == '1')) { + b |= uint64_t(buf[p] - '0') << p; + ++p; + } + + if (p != num_qubits) { + IO::errorf("wrong bitstring length in %s: " + "got %u; should be %u.\n", provider.c_str(), p, num_qubits); + bitstrings.resize(0); + return false; + } + + bitstrings.push_back(b); + } + } while (fs); + + return true; +} + +/** + * Reads bitstrings (representing initialized or measured states of qubits) + * from the given file and stores them in a vector. + * @param num_qubits Number of qubits represented in each bitstring. 
+ * @param file The name of the file to read bitstrings from. + * @param bitstrings Output vector of bitstrings. On success, this will contain + * all bitstrings read in from 'file'. + * @return True if reading succeeded; false otherwise. + */ +template +inline bool BitstringsFromFile(unsigned num_qubits, const std::string& file, + std::vector& bitstrings) { + auto fs = IO::StreamFromFile(file); + + if (!fs) { + return false; + } else { + bool rc = BitstringsFromStream(num_qubits, file, fs, bitstrings); + IO::CloseStream(fs); + return rc; + } +} + +} // namespace qsim + +#endif // BITSTRING_H_ diff --git a/tpls/qsim/channel.h b/tpls/qsim/channel.h new file mode 100644 index 0000000..372a174 --- /dev/null +++ b/tpls/qsim/channel.h @@ -0,0 +1,149 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CHANNEL_H_ +#define CHANNEL_H_ + +#include +#include + +#include "gate.h" +#include "matrix.h" + +namespace qsim { + +/** + * Kraus operator. + */ +template +struct KrausOperator { + using fp_type = typename Gate::fp_type; + + enum Kind { + kNormal = 0, + kMeasurement = gate::kMeasurement, + }; + + /** + * Kraus operator type; + */ + Kind kind; + + /** + * If true, the Kraus operator is a unitary operator times a constant. + */ + bool unitary; + + /** + * Lower bound on Kraus operator probability. + */ + double prob; + + /** + * Sequence of operations that represent the Kraus operator. 
This can be just + * one operation. + */ + std::vector ops; + + /** + * Product of K^\dagger and K. This can be empty if unitary = true. + */ + Matrix kd_k; + + /** + * Qubits kd_k acts on. This can be empty if unitary = true. + */ + std::vector qubits; + + /** + * Calculates the product of "K^\dagger K". Sets qubits "K^\dagger K" acts on. + */ + void CalculateKdKMatrix() { + if (ops.size() == 1) { + kd_k = ops[0].matrix; + MatrixDaggerMultiply(ops[0].qubits.size(), ops[0].matrix, kd_k); + qubits = ops[0].qubits; + } else if (ops.size() > 1) { + std::set qubit_map; + + for (const auto& op : ops) { + for (unsigned q : op.qubits) { + qubit_map.insert(q); + } + } + + unsigned num_qubits = qubit_map.size(); + + qubits.resize(0); + qubits.reserve(num_qubits); + + for (auto it = qubit_map.begin(); it != qubit_map.end(); ++it) { + qubits.push_back(*it); + } + + MatrixIdentity(unsigned{1} << num_qubits, kd_k); + + for (const auto& op : ops) { + if (op.qubits.size() == num_qubits) { + MatrixMultiply(num_qubits, op.matrix, kd_k); + } else { + unsigned mask = 0; + + for (auto q : op.qubits) { + for (unsigned i = 0; i < num_qubits; ++i) { + if (q == qubits[i]) { + mask |= unsigned{1} << i; + break; + } + } + } + + MatrixMultiply(mask, op.qubits.size(), op.matrix, num_qubits, kd_k); + } + } + + auto m = kd_k; + MatrixDaggerMultiply(num_qubits, m, kd_k); + } + } +}; + +/** + * Quantum channel. + */ +template +using Channel = std::vector>; + +/** + * Makes a channel from the gate. + * @param time The time to place the channel at. + * @param gate The input gate. + * @return The output channel. + */ +template +Channel MakeChannelFromGate(unsigned time, const Gate& gate) { + auto normal = KrausOperator::kNormal; + auto measurement = KrausOperator::kMeasurement; + + auto kind = gate.kind == gate::kMeasurement ? 
measurement : normal; + + Channel channel = {{kind, true, 1, {gate}}}; + channel[0].ops[0].time = time; + + return channel; +} + +} // namespace qsim + +#endif // CHANNEL_H_ diff --git a/tpls/qsim/channels_cirq.h b/tpls/qsim/channels_cirq.h new file mode 100644 index 0000000..69f1df9 --- /dev/null +++ b/tpls/qsim/channels_cirq.h @@ -0,0 +1,471 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CHANNELS_CIRQ_H_ +#define CHANNELS_CIRQ_H_ + +#include +#include +#include + +#include "channel.h" +#include "gates_cirq.h" + +namespace qsim { + +namespace Cirq { + +template +using Channel = qsim::Channel>; + +/** + * Asymmetric depolarizing channel factory. 
+ */ +template +struct AsymmetricDepolarizingChannel { + static constexpr char name[] = "asymmetric_depolarize"; + + AsymmetricDepolarizingChannel(double p_x, double p_y, double p_z) + : p_x(p_x), p_y(p_y), p_z(p_z) {} + + static Channel Create(unsigned time, unsigned q, + double p_x, double p_y, double p_z) { + double p1 = 1 - p_x - p_y - p_z; + + auto normal = KrausOperator>::kNormal; + + return {{normal, 1, p1, {}}, + {normal, 1, p_x, {X::Create(time, q)}}, + {normal, 1, p_y, {Y::Create(time, q)}}, + {normal, 1, p_z, {Z::Create(time, q)}}}; + } + + static Channel Create(unsigned time, + const std::vector& qubits, + double p_x, double p_y, double p_z) { + double p1 = 1 - p_x - p_y - p_z; + + auto normal = KrausOperator>::kNormal; + + uint64_t size = uint64_t{1} << (2 * qubits.size()); + + Channel channel; + channel.reserve(size); + + for (uint64_t i = 0; i < size; ++i) { + channel.push_back({normal, 1, 0, {}}); + auto& kop = channel.back(); + + kop.ops.reserve(qubits.size()); + + double prob = 1; + + for (unsigned q = 0; q < qubits.size(); ++q) { + unsigned pauli_index = (i >> (2 * q)) & 3; + + switch (pauli_index) { + case 0: + prob *= p1; + break; + case 1: + prob *= p_x; + kop.ops.push_back(X::Create(time, q)); + break; + case 2: + prob *= p_y; + kop.ops.push_back(Y::Create(time, q)); + break; + case 3: + prob *= p_z; + kop.ops.push_back(Z::Create(time, q)); + break; + } + } + + kop.prob = prob; + } + + return channel; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, p_x, p_y, p_z); + } + + Channel Create( + unsigned time, const std::vector& qubits) const { + return Create(time, qubits, p_x, p_y, p_z); + } + + double p_x = 0; + double p_y = 0; + double p_z = 0; +}; + +/** + * Returns an asymmetric depolarizing channel factory object. 
+ */
+template <typename fp_type>
+inline AsymmetricDepolarizingChannel<fp_type> asymmetric_depolarize(
+    double p_x, double p_y, double p_z) {
+  return AsymmetricDepolarizingChannel<fp_type>(p_x, p_y, p_z);
+}
+
+/**
+ * Depolarizing channel factory.
+ */
+template <typename fp_type>
+struct DepolarizingChannel {
+  static constexpr char name[] = "depolarize";
+
+  DepolarizingChannel(double p) : p(p) {}
+  // Single-qubit channel acting on qubit q at the given time.
+  static Channel<fp_type> Create(unsigned time, unsigned q, double p) {
+    double p1 = 1 - p;  // weight of the operator with no gate
+    double p2 = p / 3;  // weight shared equally by X, Y and Z
+
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+
+    return {{normal, 1, p1, {}},
+            {normal, 1, p2, {X<fp_type>::Create(time, q)}},
+            {normal, 1, p2, {Y<fp_type>::Create(time, q)}},
+            {normal, 1, p2, {Z<fp_type>::Create(time, q)}}};
+  }
+  // Multi-qubit channel: one operator per Pauli combination over `qubits`.
+  static Channel<fp_type> Create(
+      unsigned time, const std::vector<unsigned>& qubits, double p) {
+    double p1 = 1 - p;
+    double p2 = p / 3;
+
+    auto normal = KrausOperator<GateCirq<fp_type>>::kNormal;
+
+    uint64_t size = uint64_t{1} << (2 * qubits.size());  // 4^n combinations
+
+    Channel<fp_type> channel;
+    channel.reserve(size);
+
+    for (uint64_t i = 0; i < size; ++i) {
+      channel.push_back({normal, 1, 0, {}});  // prob is filled in below
+      auto& kop = channel.back();
+
+      kop.ops.reserve(qubits.size());
+
+      double prob = 1;
+
+      for (unsigned q = 0; q < qubits.size(); ++q) {
+        unsigned pauli_index = (i >> (2 * q)) & 3;  // two bits of i per qubit
+        // q is a position in `qubits`; the gate must target qubits[q].
+        switch (pauli_index) {
+        case 0:
+          prob *= p1;
+          break;
+        case 1:
+          prob *= p2;
+          kop.ops.push_back(X<fp_type>::Create(time, qubits[q]));
+          break;
+        case 2:
+          prob *= p2;
+          kop.ops.push_back(Y<fp_type>::Create(time, qubits[q]));
+          break;
+        case 3:
+          prob *= p2;
+          kop.ops.push_back(Z<fp_type>::Create(time, qubits[q]));
+          break;
+        }
+      }
+
+      kop.prob = prob;  // product of the per-qubit weights chosen above
+    }
+
+    return channel;
+  }
+  // Convenience overloads that use the probability stored in this factory.
+  Channel<fp_type> Create(unsigned time, unsigned q) const {
+    return Create(time, q, p);
+  }
+
+  Channel<fp_type> Create(
+      unsigned time, const std::vector<unsigned>& qubits) const {
+    return Create(time, qubits, p);
+  }
+
+  double p = 0;
+};
+
+/**
+ * Returns a depolarizing channel factory object.
+ */
+template <typename fp_type>
+inline DepolarizingChannel<fp_type> depolarize(double p) {
+  return DepolarizingChannel<fp_type>(p);
+}
+
+/**
+ * Generalized amplitude damping channel factory.
+ */ +template +struct GeneralizedAmplitudeDampingChannel { + static constexpr char name[] = "generalized_amplitude_damp"; + + GeneralizedAmplitudeDampingChannel(double p, double gamma) + : p(p), gamma(gamma) {} + + static Channel Create( + unsigned time, unsigned q, double p, double gamma) { + double p1 = p * (1 - gamma); + double p2 = (1 - p) * (1 - gamma); + double p3 = 0; + + fp_type t1 = std::sqrt(p); + fp_type r1 = std::sqrt(p * (1 - gamma)); + fp_type s1 = std::sqrt(p * gamma); + fp_type t2 = std::sqrt(1 - p); + fp_type r2 = std::sqrt((1 - p) * (1 - gamma)); + fp_type s2 = std::sqrt((1 - p) * gamma); + + using M = Cirq::MatrixGate1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {t1, 0, 0, 0, 0, 0, r1, 0})}, + {t1 * t1, 0, 0, 0, 0, 0, r1 * r1, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {r2, 0, 0, 0, 0, 0, t2, 0})}, + {r2 * r2, 0, 0, 0, 0, 0, t2 * t2, 0}, {q}, + }, + {normal, 0, p3, + {M::Create(time, q, {0, 0, s1, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, s1 * s1, 0}, {q}, + }, + {normal, 0, p3, + {M::Create(time, q, {0, 0, 0, 0, s2, 0, 0, 0})}, + {s2 * s2, 0, 0, 0, 0, 0, 0, 0}, {q}, + }, + }; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, p, gamma); + } + + double p = 1; + double gamma = 0; +}; + +/** + * Returns a generalized amplitude damping channel factory object. + */ +template +inline GeneralizedAmplitudeDampingChannel generalized_amplitude_damp( + double p, double gamma) { + return GeneralizedAmplitudeDampingChannel(p, gamma); +} + +/** + * Amplitude damping channel factory. 
+ */ +template +struct AmplitudeDampingChannel { + static constexpr char name[] = "amplitude_damp"; + + AmplitudeDampingChannel(double gamma) : gamma(gamma) {} + + static Channel Create(unsigned time, unsigned q, double gamma) { + double p1 = 1 - gamma; + double p2 = 0; + + fp_type r = std::sqrt(p1); // sqrt(1 - gamma) + fp_type s = std::sqrt(gamma); + + using M = Cirq::MatrixGate1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, + }; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, gamma); + } + + double gamma = 0; +}; + +/** + * Returns an amplitude damping channel factory object. + */ +template +inline AmplitudeDampingChannel amplitude_damp(double gamma) { + return AmplitudeDampingChannel(gamma); +} + +/** + * Phase damping channel factory. + */ +template +struct PhaseDampingChannel { + static constexpr char name[] = "phase_dump"; // NOTE(review): probably meant "phase_damp" (cf. phase_damp() below); left as-is because the string is visible at runtime - confirm no caller matches on it + + PhaseDampingChannel(double gamma) : gamma(gamma) {} + + static Channel Create(unsigned time, unsigned q, double gamma) { + double p1 = 1 - gamma; + double p2 = 0; + + fp_type r = std::sqrt(p1); // sqrt(1 - gamma) + fp_type s = std::sqrt(gamma); + + using M = Cirq::MatrixGate1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, + }; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, gamma); + } + + double gamma = 0; +}; + +/** + * Returns a phase damping channel factory object. + */ +template +inline PhaseDampingChannel phase_damp(double gamma) { + return PhaseDampingChannel(gamma); +} + +/** + * Reset channel factory.
+ */ +template +struct ResetChannel { + static constexpr char name[] = "reset"; + + static Channel Create(unsigned time, unsigned q) { + using M = Cirq::MatrixGate1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, 0, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, 0, 0})}, + {1, 0, 0, 0, 0, 0, 0, 0}, {q}, + }, + {normal, 0, 0, + {M::Create(time, q, {0, 0, 1, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, 1, 0}, {q}, + }, + }; + } +}; + +/** + * Returns a reset channel factory object. + */ +template +inline ResetChannel reset() { + return ResetChannel(); +} + +/** + * Phase flip channel factory. + */ +template +struct PhaseFlipChannel { + static constexpr char name[] = "phase_flip"; + + PhaseFlipChannel(double p) : p(p) {} + + static Channel Create(unsigned time, unsigned q, double p) { + double p1 = 1 - p; + double p2 = p; + + auto normal = KrausOperator>::kNormal; + + return {{normal, 1, p1, {}}, + {normal, 1, p2, {Z::Create(time, q)}} + }; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, p); + } + + double p = 0; +}; + +/** + * Returns a phase flip channel factory object. + */ +template +inline PhaseFlipChannel phase_flip(double p) { + return PhaseFlipChannel(p); +} + +/** + * Bit flip channel factory. + */ +template +struct BitFlipChannel { + static constexpr char name[] = "bit_flip"; + + BitFlipChannel(double p) : p(p) {} + + static Channel Create(unsigned time, unsigned q, double p) { + double p1 = 1 - p; + double p2 = p; + + auto normal = KrausOperator>::kNormal; + + return {{normal, 1, p1, {}}, + {normal, 1, p2, {X::Create(time, q)}} + }; + } + + Channel Create(unsigned time, unsigned q) const { + return Create(time, q, p); + } + + double p = 0; +}; + +/** + * Returns a bit flip channel factory object. 
+ */ +template +inline BitFlipChannel bit_flip(double p) { + return BitFlipChannel(p); +} + +} // namespace Cirq + +} // namespace qsim + +#endif // CHANNELS_CIRQ_H_ diff --git a/tpls/qsim/channels_qsim.h b/tpls/qsim/channels_qsim.h new file mode 100644 index 0000000..5c07bcc --- /dev/null +++ b/tpls/qsim/channels_qsim.h @@ -0,0 +1,117 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CHANNELS_QSIM_H_ +#define CHANNELS_QSIM_H_ + +#include +#include +#include + +#include "channel.h" +#include "gates_qsim.h" + +namespace qsim { + +/** + * Amplitude damping channel factory. + */ +template +struct AmplitudeDampingChannel { + AmplitudeDampingChannel(double gamma) : gamma(gamma) {} + + static Channel> Create( + unsigned time, unsigned q, double gamma) { + double p1 = 1 - gamma; + double p2 = 0; + + fp_type r = std::sqrt(p1); + fp_type s = std::sqrt(gamma); + + using M = GateMatrix1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, + }; + } + + Channel> Create(unsigned time, unsigned q) const { + return Create(time, q, gamma); + } + + double gamma = 0; +}; + +/** + * Returns an amplitude damping channel factory object.
+ */ +template +inline AmplitudeDampingChannel amplitude_damp(double gamma) { + return AmplitudeDampingChannel(gamma); +} + +/** + * Phase damping channel factory. + */ +template +struct PhaseDampingChannel { + PhaseDampingChannel(double gamma) : gamma(gamma) {} + + static Channel> Create( + unsigned time, unsigned q, double gamma) { + double p1 = 1 - gamma; + double p2 = 0; + + fp_type r = std::sqrt(p1); + fp_type s = std::sqrt(gamma); + + using M = GateMatrix1; + auto normal = KrausOperator>::kNormal; + + return {{normal, 0, p1, + {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, + {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, + }, + {normal, 0, p2, + {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})}, + {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, + }, + }; + } + + Channel> Create(unsigned time, unsigned q) const { + return Create(time, q, gamma); + } + + double gamma = 0; +}; + +/** + * Returns a phase damping channel factory object. + */ +template +inline PhaseDampingChannel phase_damp(double gamma) { + return PhaseDampingChannel(gamma); +} + +} // namespace qsim + +#endif // CHANNELS_QSIM_H_ diff --git a/tpls/qsim/circuit.h b/tpls/qsim/circuit.h new file mode 100644 index 0000000..59018ee --- /dev/null +++ b/tpls/qsim/circuit.h @@ -0,0 +1,36 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CIRCUIT_H_ +#define CIRCUIT_H_ + +#include + +namespace qsim { + +/** + * A collection of gates. 
This object is consumed by `QSim[h]Runner.Run()`. + */ +template +struct Circuit { + unsigned num_qubits; + /** + * The set of gates to be run. Gate times should be ordered. + */ + std::vector gates; +}; + +} // namespace qsim + +#endif // CIRCUIT_H_ diff --git a/tpls/qsim/circuit_noisy.h b/tpls/qsim/circuit_noisy.h new file mode 100644 index 0000000..40a228d --- /dev/null +++ b/tpls/qsim/circuit_noisy.h @@ -0,0 +1,108 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CIRCUIT_NOISY_H_ +#define CIRCUIT_NOISY_H_ + +#include + +#include "circuit.h" +#include "channel.h" + +namespace qsim { + +/** + * Noisy circuit. + */ +template +struct NoisyCircuit { + unsigned num_qubits; + std::vector> channels; +}; + +template +using ncircuit_iterator = typename std::vector>::const_iterator; + +/** + * Makes a noisy circuit from the clean circuit. + * Channels are added after each qubit of each gate of the clean circuit. + * Roughly equivalent to cirq.Circuit.with_noise. + * @param num_qubits The number of circuit qubits. + * @param gbeg, gend The iterator range [gbeg, gend) of circuit gates. + * @param channel_factory A channel factory to construct channels. + * @return The output noisy circuit.
+ */ +template +inline NoisyCircuit MakeNoisy( + unsigned num_qubits, + typename std::vector::const_iterator gbeg, + typename std::vector::const_iterator gend, + const ChannelFactory& channel_factory) { + NoisyCircuit ncircuit; + + ncircuit.num_qubits = num_qubits; + ncircuit.channels.reserve(4 * std::size_t(gend - gbeg)); + + for (auto it = gbeg; it != gend; ++it) { + const auto& gate = *it; + + ncircuit.channels.push_back(MakeChannelFromGate(2 * gate.time, gate)); + + for (auto q : gate.qubits) { + ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q)); + } + + for (auto q : gate.controlled_by) { + ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q)); + } + } + + return ncircuit; +} + +/** + * Makes a noisy circuit from the clean circuit. + * Channels are added after each qubit of each gate of the clean circuit. + * Roughly equivalent to cirq.Circuit.with_noise. + * @param num_qubits The number of circuit qubits. + * @param gates The circuit gates. + * @param channel_factory A channel factory to construct channels. + * @return The output noisy circuit. + */ +template +inline NoisyCircuit MakeNoisy(unsigned num_qubits, + const std::vector& gates, + const ChannelFactory& channel_factory) { + return + MakeNoisy(num_qubits, gates.begin(), gates.end(), channel_factory); +} + +/** + * Makes a noisy circuit from the clean circuit. + * Channels are added after each qubit of each gate of the clean circuit. + * Roughly equivalent to cirq.Circuit.with_noise. + * @param circuit The input circuit. + * @param channel_factory A channel factory to construct channels. + * @return The output noisy circuit.
+ */ +template +inline NoisyCircuit MakeNoisy(const Circuit& circuit, + const ChannelFactory& channel_factory) { + return MakeNoisy(circuit.num_qubits, circuit.gates.begin(), + circuit.gates.end(), channel_factory); +} + +} // namespace qsim + +#endif // CIRCUIT_NOISY_H_ diff --git a/tpls/qsim/circuit_qsim_parser.h b/tpls/qsim/circuit_qsim_parser.h new file mode 100644 index 0000000..de7bd89 --- /dev/null +++ b/tpls/qsim/circuit_qsim_parser.h @@ -0,0 +1,442 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CIRCUIT_QSIM_PARSER_H_ +#define CIRCUIT_QSIM_PARSER_H_ + +#include +#include +#include +#include +#include + +#include "circuit.h" +#include "gates_qsim.h" + +namespace qsim { + +/** + * Parser for the (deprecated) qsim file input format. + * The primary supported interface for designing circuits to simulate with qsim + * is Cirq, which relies on + * the Python-based qsimcirq interface. For C++ applications, Cirq gates can be + * explicitly constructed in code. + */ +template +class CircuitQsimParser final { + public: + /** + * Parses the given input stream into a Circuit object, following the rules + * defined in "docs/input_format.md". + * @param maxtime Maximum gate "time" to read operations for (inclusive). + * @param provider Circuit source; only used for error reporting. + * @param fs The stream to read the circuit from. + * @param circuit Output circuit object. 
If parsing is successful, this will + * contain the circuit defined in 'fs'. + * @return True if parsing succeeds; false otherwise. + */ + template + static bool FromStream(unsigned maxtime, const std::string& provider, + Stream& fs, Circuit>& circuit) { + circuit.num_qubits = 0; + + circuit.gates.resize(0); + circuit.gates.reserve(1024); + + unsigned k = 0; + + std::string line; + line.reserve(128); + + unsigned time; + std::string gate_name; + gate_name.reserve(16); + + unsigned max_time = 0; + unsigned prev_mea_time = 0; + + std::vector last_times; + + while (std::getline(fs, line)) { + ++k; + + if (line.size() == 0 || line[0] == '#') continue; + + std::stringstream ss(line); + + if (circuit.num_qubits == 0) { + ss >> circuit.num_qubits; + if (circuit.num_qubits == 0) { + IO::errorf("invalid number of qubits in %s in line %u.\n", + provider.c_str(), k); + return false; + } + + last_times.resize(circuit.num_qubits, unsigned(-1)); + + continue; + } + + ss >> time >> gate_name; + + if (!ss) { + InvalidGateError(provider, k); + return false; + } + + if (time > maxtime) { + break; + } + + if (gate_name == "c") { + if (!ParseControlledGate(ss, time, + circuit.num_qubits, circuit.gates)) { + InvalidGateError(provider, k); + return false; + } + } else if (!ParseGate(ss, time, circuit.num_qubits, + gate_name, circuit.gates)) { + InvalidGateError(provider, k); + return false; + } + + const auto& gate = circuit.gates.back(); + + if (time < prev_mea_time + || (gate.kind == gate::kMeasurement && time < max_time)) { + IO::errorf("gate crosses the time boundary set by measurement " + "gates in line %u in %s.\n", k, provider.c_str()); + return false; + } + + if (gate.kind == gate::kMeasurement) { + prev_mea_time = time; + } + + if (GateIsOutOfOrder(time, gate.qubits, last_times) + || GateIsOutOfOrder(time, gate.controlled_by, last_times)) { + IO::errorf("gate is out of time order in line %u in %s.\n", + k, provider.c_str()); + return false; + } + + if (time > max_time) { + 
max_time = time; + } + } + + return true; + } + + /** + * Parses the given file into a Circuit object, following the rules defined + * in "docs/input_format.md". + * @param maxtime Maximum gate "time" to read operations for (inclusive). + * @param file The name of the file to read the circuit from. + * @param circuit Output circuit object. If parsing is successful, this will + * contain the circuit defined in 'file'. + * @return True if parsing succeeds; false otherwise. + */ + template + static bool FromFile(unsigned maxtime, const std::string& file, + Circuit>& circuit) { + auto fs = IO::StreamFromFile(file); + + if (!fs) { + return false; + } else { + bool rc = FromStream(maxtime, file, fs, circuit); + IO::CloseStream(fs); + return rc; + } + } + + private: + static void InvalidGateError(const std::string& provider, unsigned line) { + IO::errorf("invalid gate in %s in line %u.\n", provider.c_str(), line); + } + + /** + * Checks formatting for a zero-qubit gate parsed from 'ss'. + * @param ss Input stream containing the gate specification. + */ + static bool ValidateGate(std::stringstream& ss) { + return ss && ss.peek() == std::stringstream::traits_type::eof(); + } + + /** + * Checks formatting for a single-qubit gate parsed from 'ss'. + * @param ss Input stream containing the gate specification. + * @param num_qubits Number of qubits, as defined at the start of the file. + * @param q0 Index of the affected qubit. + */ + static bool ValidateGate(std::stringstream& ss, + unsigned num_qubits, unsigned q0) { + return ss && ss.peek() == std::stringstream::traits_type::eof() + && q0 < num_qubits; + } + + /** + * Checks formatting for a two-qubit gate parsed from 'ss'. + * @param ss Input stream containing the gate specification. + * @param num_qubits Number of qubits, as defined at the start of the file. + * @param q0 Index of the first affected qubit. + * @param q1 Index of the second affected qubit. 
+ */ + static bool ValidateGate(std::stringstream& ss, + unsigned num_qubits, unsigned q0, unsigned q1) { + return ss && ss.peek() == std::stringstream::traits_type::eof() + && q0 < num_qubits && q1 < num_qubits && q0 != q1; + } + + /** + * Checks formatting for a multiqubit gate parsed from 'ss'. + * @param ss Input stream containing the gate specification. + * @param num_qubits Number of qubits, as defined at the start of the file. + * @param qubits Indices of affected qubits. + */ + static bool ValidateGate(std::stringstream& ss, unsigned num_qubits, + const std::vector& qubits) { + return ss && ValidateQubits(num_qubits, qubits); + } + + static bool ValidateControlledGate( + unsigned num_qubits, const std::vector& qubits, + const std::vector& controlled_by) { + if (!ValidateQubits(num_qubits, controlled_by)) return false; + + std::size_t i = 0, j = 0; + + while (i < qubits.size() && j < controlled_by.size()) { + if (qubits[i] == controlled_by[j]) { + return false; + } else if (qubits[i] < controlled_by[j]) { + ++i; + } else { + ++j; + } + } + + return true; + } + + static bool ValidateQubits(unsigned num_qubits, + const std::vector& qubits) { + if (qubits.size() == 0 || qubits[0] >= num_qubits) return false; + + // qubits should be sorted. 
+ + for (std::size_t i = 1; i < qubits.size(); ++i) { + if (qubits[i] >= num_qubits || qubits[i] == qubits[i - 1]) { + return false; + } + } + + return true; + } + + static bool GateIsOutOfOrder(unsigned time, + const std::vector& qubits, + std::vector& last_times) { + for (auto q : qubits) { + if (last_times[q] != unsigned(-1) && time <= last_times[q]) { + return true; + } + + last_times[q] = time; + } + + return false; + } + + template + static bool ParseGate(Stream& ss, unsigned time, unsigned num_qubits, + const std::string& gate_name, + std::vector& gates) { + unsigned q0, q1; + fp_type phi, theta; + + if (gate_name == "p") { + ss >> phi; + if (!ValidateGate(ss)) return false; + gates.push_back(GateGPh::Create(time, phi)); + } else if (gate_name == "id1") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateId1::Create(time, q0)); + } else if (gate_name == "h") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateHd::Create(time, q0)); + } else if (gate_name == "t") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateT::Create(time, q0)); + } else if (gate_name == "x") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateX::Create(time, q0)); + } else if (gate_name == "y") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateY::Create(time, q0)); + } else if (gate_name == "z") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateZ::Create(time, q0)); + } else if (gate_name == "x_1_2") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateX2::Create(time, q0)); + } else if (gate_name == "y_1_2") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateY2::Create(time, q0)); + } else if (gate_name == "rx") { + ss >> q0 >> phi; + if (!ValidateGate(ss, num_qubits, q0)) 
return false; + gates.push_back(GateRX::Create(time, q0, phi)); + } else if (gate_name == "ry") { + ss >> q0 >> phi; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateRY::Create(time, q0, phi)); + } else if (gate_name == "rz") { + ss >> q0 >> phi; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateRZ::Create(time, q0, phi)); + } else if (gate_name == "rxy") { + ss >> q0 >> theta >> phi; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateRXY::Create(time, q0, theta, phi)); + } else if (gate_name == "hz_1_2") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateHZ2::Create(time, q0)); + } else if (gate_name == "s") { + ss >> q0; + if (!ValidateGate(ss, num_qubits, q0)) return false; + gates.push_back(GateS::Create(time, q0)); + } else if (gate_name == "id2") { + ss >> q0 >> q1; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateId2::Create(time, q0, q1)); + } else if (gate_name == "cz") { + ss >> q0 >> q1; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateCZ::Create(time, q0, q1)); + } else if (gate_name == "cnot" || gate_name == "cx") { + ss >> q0 >> q1; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateCNot::Create(time, q0, q1)); + } else if (gate_name == "sw") { + ss >> q0 >> q1; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateSwap::Create(time, q0, q1)); + } else if (gate_name == "is") { + ss >> q0 >> q1; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateIS::Create(time, q0, q1)); + } else if (gate_name == "fs") { + ss >> q0 >> q1 >> theta >> phi; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + gates.push_back(GateFS::Create(time, q0, q1, theta, phi)); + } else if (gate_name == "cp") { + ss >> q0 >> q1 >> phi; + if (!ValidateGate(ss, num_qubits, q0, q1)) return false; + 
gates.push_back(GateCP::Create(time, q0, q1, phi)); + } else if (gate_name == "m") { + std::vector qubits; + qubits.reserve(num_qubits); + + while (ss.good()) { + ss >> q0; + if (ss) { + qubits.push_back(q0); + } else { + return false; + } + } + + gates.push_back(gate::Measurement>::Create( + time, std::move(qubits))); + + if (!ValidateQubits(num_qubits, gates.back().qubits)) return false; + } else { + return false; + } + + return true; + } + + template + static bool ParseControlledGate(Stream& ss, unsigned time, + unsigned num_qubits, + std::vector& gates) { + std::vector controlled_by; + controlled_by.reserve(64); + + std::string gate_name; + gate_name.reserve(16); + + while (1) { + while (ss.good()) { + if (!std::isblank(ss.get())) { + ss.unget(); + break; + } + } + + if (!ss.good()) { + return false; + } + + if (!std::isdigit(ss.peek())) { + break; + } else { + unsigned q; + ss >> q; + + if (!ss.good() || !std::isblank(ss.get())) { + return false; + } + + controlled_by.push_back(q); + } + } + + if (controlled_by.size() == 0) { + return false; + } + + ss >> gate_name; + + if (!ss.good() || !ParseGate(ss, time, + num_qubits, gate_name, gates)) { + return false; + } + + gates.back().ControlledBy(std::move(controlled_by)); + + if (!ValidateControlledGate(num_qubits, gates.back().qubits, + gates.back().controlled_by)) { + return false; + } + + return true; + } +}; + +} // namespace qsim + +#endif // CIRCUIT_QSIM_PARSER_H_ diff --git a/tpls/qsim/cuda2hip.h b/tpls/qsim/cuda2hip.h new file mode 100644 index 0000000..da2d074 --- /dev/null +++ b/tpls/qsim/cuda2hip.h @@ -0,0 +1,61 @@ +// Copyright 2023 Advanced Micro Devices, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_CUDA2HIP_H_ +#define SIMULATOR_CUDA2HIP_H_ + +#define cublasCaxpy hipblasCaxpy +#define cublasCdotc hipblasCdotc +#define cublasCreate hipblasCreate +#define cublasCscal hipblasCscal +#define cublasCsscal hipblasCsscal +#define cublasDestroy hipblasDestroy +#define cublasDznrm2 hipblasDznrm2 +#define cublasHandle_t hipblasHandle_t +#define cublasScnrm2 hipblasScnrm2 +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define cublasStatus_t hipblasStatus_t +#define cublasZaxpy hipblasZaxpy +#define cublasZdotc hipblasZdotc +#define cublasZdscal hipblasZdscal +#define cublasZscal hipblasZscal +#define cuCimagf hipCimagf +#define cuCimag hipCimag +#define cuComplex hipComplex +#define cuCrealf hipCrealf +#define cuCreal hipCreal +#define CUDA_C_32F HIPBLAS_C_32F +#define CUDA_C_64F HIPBLAS_C_64F +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaFree hipFree +#define cudaGetErrorString hipGetErrorString +#define cudaMalloc hipMalloc +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpy hipMemcpy +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemset hipMemset +#define cudaPeekAtLastError hipPeekAtLastError +#define cudaSuccess hipSuccess +#define cuDoubleComplex hipDoubleComplex + +template +__device__ __forceinline__ T __shfl_down_sync( + unsigned mask, T var, unsigned int delta, int width = warpSize) { + return __shfl_down(var, delta, width); +} 
+ +#endif // SIMULATOR_CUDA2HIP_H_ diff --git a/tpls/qsim/expect.h b/tpls/qsim/expect.h new file mode 100644 index 0000000..518d516 --- /dev/null +++ b/tpls/qsim/expect.h @@ -0,0 +1,148 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EXPECT_H_ +#define EXPECT_H_ + +#include + +#include "fuser.h" +#include "gate_appl.h" + +namespace qsim { + +template +struct OpString { + std::complex weight; + std::vector ops; +}; + +/** + * Computes the expectation value of the sum of operator strings (operator + * sequences). Operators can act on any qubits and they can be any supported + * gates. This function uses a temporary state vector. + * @param param Options for gate fusion. + * @param strings Operator strings. + * @param state_space StateSpace object required to copy the state vector and + * compute inner products. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param state The state vector of the system. + * @param ket Temporary state vector. + * @return The computed expectation value.
+ */ +template +std::complex ExpectationValue( + const typename Fuser::Parameter& param, + const std::vector>& strings, + const typename Simulator::StateSpace& state_space, + const Simulator& simulator, const typename Simulator::State& state, + typename Simulator::State& ket) { + std::complex eval = 0; + + if (state_space.IsNull(ket) || ket.num_qubits() < state.num_qubits()) { + ket = state_space.Create(state.num_qubits()); + if (state_space.IsNull(ket)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return eval; + } + } + + for (const auto& str : strings) { + if (str.ops.size() == 0) { + eval += str.weight; + continue; + } + + state_space.Copy(state, ket); + + if (str.ops.size() == 1) { + const auto& op = str.ops[0]; + simulator.ApplyGate(op.qubits, op.matrix.data(), ket); + } else { + auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops); + if (fused_gates.size() == 0) { + eval = 0; + break; + } + + for (const auto& fgate : fused_gates) { + ApplyFusedGate(simulator, fgate, ket); + } + } + + eval += str.weight * state_space.InnerProduct(state, ket); + } + + return eval; +} + +/** + * Computes the expectation value of the sum of operator strings (operator + * sequences). Operators can act on any qubits and they can be any supported + * gates except for user-defined controlled gates. Computation is performed + * in place. No additional memory is allocated. The operator strings should + * act on no more than six qubits and they should be fusible into one gate. + * @param strings Operator strings. + * @param simulator Simulator object. Provides specific implementations for + * computing expectation values. + * @param state The state of the system. + * @return The computed expectation value. 
+ */ +template +std::complex ExpectationValue( + const std::vector>& strings, + const Simulator& simulator, const typename Simulator::State& state) { + std::complex eval = 0; + + typename Fuser::Parameter param; + param.max_fused_size = 6; + for (const auto& str : strings) { + if (str.ops.size() == 0) { + eval += str.weight; + } else if (str.ops.size() == 1) { + const auto& op = str.ops[0]; + auto r = simulator.ExpectationValue(op.qubits, op.matrix.data(), state); + eval += str.weight * r; + } else { + auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops); + + if (fused_gates.size() != 1) { + IO::errorf("too many fused gates; " + "cannot compute the expectation value.\n"); + eval = 0; + break; + } + + const auto& fgate = fused_gates[0]; + + if (fgate.qubits.size() > 6) { + IO::errorf("operator string acts on too many qubits; " + "cannot compute the expectation value.\n"); + eval = 0; + break; + } + + auto r = simulator.ExpectationValue( + fgate.qubits, fgate.matrix.data(), state); + eval += str.weight * r; + } + } + + return eval; +} + +} // namespace qsim + +#endif // EXPECT_H_ diff --git a/tpls/qsim/formux.h b/tpls/qsim/formux.h new file mode 100644 index 0000000..4401e9b --- /dev/null +++ b/tpls/qsim/formux.h @@ -0,0 +1,30 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef FORMUX_H_ +#define FORMUX_H_ + +#ifdef _OPENMP +# include "parfor.h" + namespace qsim { + using For = ParallelFor; + } +#else +# include "seqfor.h" + namespace qsim { + using For = SequentialFor; + } +#endif + +#endif // FORMUX_H_ diff --git a/tpls/qsim/fuser.h b/tpls/qsim/fuser.h new file mode 100644 index 0000000..e4f3c3b --- /dev/null +++ b/tpls/qsim/fuser.h @@ -0,0 +1,225 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef FUSER_H_ +#define FUSER_H_ + +#include +#include + +#include "gate.h" +#include "matrix.h" + +namespace qsim { + +/** + * A collection of "fused" gates which can be multiplied together before being + * applied to the state vector. + */ +template +struct GateFused { + /** + * Kind of the first ("parent") gate. + */ + typename Gate::GateKind kind; + /** + * The time index of the first ("parent") gate. + */ + unsigned time; + /** + * A list of qubits these gates act upon. Control qubits for + * explicitly-controlled gates are excluded from this list. + */ + std::vector qubits; + /** + * Pointer to the first ("parent") gate. + */ + const Gate* parent; + /** + * Ordered list of component gates. + */ + std::vector gates; + /** + * Fused gate matrix. + */ + Matrix matrix; +}; + +/** + * A base class for fuser classes with some common functions. 
+ */ +template +class Fuser { + protected: + using RGate = typename std::remove_pointer::type; + + static const RGate& GateToConstRef(const RGate& gate) { + return gate; + } + + static const RGate& GateToConstRef(const RGate* gate) { + return *gate; + } + + static std::vector MergeWithMeasurementTimes( + typename std::vector::const_iterator gfirst, + typename std::vector::const_iterator glast, + const std::vector& times) { + std::vector epochs; + epochs.reserve(glast - gfirst + times.size()); + + std::size_t last = 0; + unsigned max_time = 0; + + for (auto gate_it = gfirst; gate_it < glast; ++gate_it) { + const auto& gate = GateToConstRef(*gate_it); + + if (gate.time > max_time) { + max_time = gate.time; + } + + if (epochs.size() > 0 && gate.time < epochs.back()) { + IO::errorf("gate crosses the time boundary.\n"); + epochs.resize(0); + return epochs; + } + + if (gate.kind == gate::kMeasurement) { + if (epochs.size() == 0 || epochs.back() < gate.time) { + if (!AddBoundary(gate.time, max_time, epochs)) { + epochs.resize(0); + return epochs; + } + } + } + + while (last < times.size() && times[last] <= gate.time) { + unsigned prev = times[last++]; + epochs.push_back(prev); + if (!AddBoundary(prev, max_time, epochs)) { + epochs.resize(0); + return epochs; + } + while (last < times.size() && times[last] <= prev) ++last; + } + } + + if (epochs.size() == 0 || epochs.back() < max_time) { + epochs.push_back(max_time); + } + + return epochs; + } + + template + static void FuseZeroQubitGates(const GateSeq0& gate_seq0, + Parent parent, std::size_t first, + std::vector& fused_gates) { + GateFused* fuse_to = nullptr; + + for (std::size_t i = first; i < fused_gates.size(); ++i) { + auto& fgate = fused_gates[i]; + + if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp + && fgate.parent->controlled_by.size() == 0 + && !fgate.parent->unfusible) { + fuse_to = &fgate; + break; + } + } + + if (fuse_to != nullptr) { + // Fuse zero-qubit gates with the first available 
fused gate. + for (const auto& g : gate_seq0) { + fuse_to->gates.push_back(parent(g)); + } + } else { + auto g0 = parent(gate_seq0[0]); + fused_gates.push_back({g0->kind, g0->time, {}, g0, {g0}, {}}); + + for (std::size_t i = 1; i < gate_seq0.size(); ++i) { + fused_gates.back().gates.push_back(parent(gate_seq0[i])); + } + } + } + + private: + static bool AddBoundary(unsigned time, unsigned max_time, + std::vector& boundaries) { + if (max_time > time) { + IO::errorf("gate crosses the time boundary.\n"); + return false; + } + + boundaries.push_back(time); + return true; + } +}; + +/** + * Multiplies component gate matrices of a fused gate. + * @param gate Fused gate. + */ +template +inline void CalculateFusedMatrix(FusedGate& gate) { + MatrixIdentity(unsigned{1} << gate.qubits.size(), gate.matrix); + + for (auto pgate : gate.gates) { + if (pgate->qubits.size() == 0) { + MatrixScalarMultiply(pgate->matrix[0], pgate->matrix[1], gate.matrix); + } else if (gate.qubits.size() == pgate->qubits.size()) { + MatrixMultiply(gate.qubits.size(), pgate->matrix, gate.matrix); + } else { + unsigned mask = 0; + + for (auto q : pgate->qubits) { + for (std::size_t i = 0; i < gate.qubits.size(); ++i) { + if (q == gate.qubits[i]) { + mask |= unsigned{1} << i; + break; + } + } + } + + MatrixMultiply(mask, pgate->qubits.size(), pgate->matrix, + gate.qubits.size(), gate.matrix); + } + } +} + +/** + * Multiplies component gate matrices for a range of fused gates. + * @param gbeg, gend The iterator range [gbeg, gend) of fused gates. + */ +template +inline void CalculateFusedMatrices(Iterator gbeg, Iterator gend) { + for (auto g = gbeg; g != gend; ++g) { + if (g->kind != gate::kMeasurement) { + CalculateFusedMatrix(*g); + } + } +} + +/** + * Multiplies component gate matrices for a vector of fused gates. + * @param gates The vector of fused gates. 
+ */ +template +inline void CalculateFusedMatrices(std::vector& gates) { + CalculateFusedMatrices(gates.begin(), gates.end()); +} + +} // namespace qsim + +#endif // FUSER_H_ diff --git a/tpls/qsim/fuser_basic.h b/tpls/qsim/fuser_basic.h new file mode 100644 index 0000000..3191bd2 --- /dev/null +++ b/tpls/qsim/fuser_basic.h @@ -0,0 +1,411 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef FUSER_BASIC_H_ +#define FUSER_BASIC_H_ + +#include +#include +#include +#include + +#include "gate.h" +#include "fuser.h" + +namespace qsim { + +/** + * Stateless object with methods for aggregating `Gate`s into `GateFused`. + * Measurement gates with equal times are fused together. + * User-defined controlled gates (controlled_by.size() > 0) and gates acting on + * more than two qubits are not fused. + * The template parameter Gate can be Gate type or a pointer to Gate type. + * This class is deprecated. It is recommended to use MultiQubitGateFuser + * from fuser_mqubit.h. + */ +template +class BasicGateFuser final : public Fuser { + private: + using Base = Fuser; + using RGate = typename Base::RGate; + + public: + using GateFused = qsim::GateFused; + + /** + * User-specified parameters for gate fusion. + * BasicGateFuser does not use any parameters. + */ + struct Parameter { + unsigned verbosity = 0; + }; + + /** + * Stores sets of gates that can be applied together. 
Only one- and + * two-qubit gates will get fused. To respect specific time boundaries while + * fusing gates, use the other version of this method below. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gates The gates (or pointers to the gates) to be fused. + * Gate times of the gates that act on the same qubits should be ordered. + * Gates that are out of time order should not cross the time boundaries + * set by measurement gates. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates(const Parameter& param, + unsigned max_qubit1, + const std::vector& gates, + bool fuse_matrix = true) { + return FuseGates( + param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. Only one- and + * two-qubit gates will get fused. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gates The gates (or pointers to the gates) to be fused. + * Gate times of the gates that act on the same qubits should be ordered. + * Gates that are out of time order should not cross the time boundaries + * set by `times_to_split_at` or by measurement gates. + * @param times_to_split_at Ordered list of time steps (boundaries) at which + * to separate fused gates. Each element of the output will contain gates + * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. 
+ */ + static std::vector FuseGates( + const Parameter& param, + unsigned max_qubit1, const std::vector& gates, + const std::vector& times_to_split_at, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(), + times_to_split_at, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. Only one- and + * two-qubit gates will get fused. To respect specific time boundaries while + * fusing gates, use the other version of this method below. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates + * (or pointers to gates) in. Gate times of the gates that act on the same + * qubits should be ordered. Gates that are out of time order should not + * cross the time boundaries set by measurement gates. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates( + const Parameter& param, unsigned max_qubit1, + typename std::vector::const_iterator gfirst, + typename std::vector::const_iterator glast, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. Only one- and + * two-qubit gates will get fused. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates + * (or pointers to gates) in. Gate times of the gates that act on the same + * qubits should be ordered. Gates that are out of time order should not + * cross the time boundaries set by `times_to_split_at` or by measurement + * gates. 
+ * @param times_to_split_at Ordered list of time steps (boundaries) at which + * to separate fused gates. Each element of the output will contain gates + * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates( + const Parameter& param, unsigned max_qubit1, + typename std::vector::const_iterator gfirst, + typename std::vector::const_iterator glast, + const std::vector& times_to_split_at, + bool fuse_matrix = true) { + std::vector gates_fused; + + if (gfirst >= glast) return gates_fused; + + std::size_t num_gates = glast - gfirst; + + gates_fused.reserve(num_gates); + + // Merge with measurement gate times to separate fused gates at. + auto times = + Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at); + + // Map to keep track of measurement gates with equal times. + std::map> measurement_gates; + + // Sequence of top level gates the other gates get fused to. + std::vector gates_seq; + + // Sequence of zero-qubit gates. + std::vector gates_seq0; + + // Lattice of gates: qubits "hyperplane" and time direction. + std::vector> gates_lat(max_qubit1); + + // Current unfused gate. + auto gate_it = gfirst; + + std::size_t last_fused_gate_index = 0; + + for (std::size_t l = 0; l < times.size(); ++l) { + gates_seq.resize(0); + gates_seq.reserve(num_gates); + + gates_seq0.resize(0); + gates_seq0.reserve(num_gates); + + for (unsigned k = 0; k < max_qubit1; ++k) { + gates_lat[k].resize(0); + gates_lat[k].reserve(128); + } + + // Fill gates_seq and gates_lat in. 
+ for (; gate_it < glast; ++gate_it) { + const auto& gate = Base::GateToConstRef(*gate_it); + + if (gate.time > times[l]) break; + + if (!ValidateGate(gate, max_qubit1, gates_lat)) { + gates_fused.resize(0); + return gates_fused; + } + + if (gate.kind == gate::kMeasurement) { + auto& mea_gates_at_time = measurement_gates[gate.time]; + if (mea_gates_at_time.size() == 0) { + gates_seq.push_back(&gate); + mea_gates_at_time.reserve(max_qubit1); + } + + mea_gates_at_time.push_back(&gate); + } else if (gate.controlled_by.size() > 0 || gate.qubits.size() > 2) { + for (auto q : gate.qubits) { + gates_lat[q].push_back(&gate); + } + for (auto q : gate.controlled_by) { + gates_lat[q].push_back(&gate); + } + gates_seq.push_back(&gate); + } else if (gate.qubits.size() == 1) { + gates_lat[gate.qubits[0]].push_back(&gate); + if (gate.unfusible) { + gates_seq.push_back(&gate); + } + } else if (gate.qubits.size() == 2) { + gates_lat[gate.qubits[0]].push_back(&gate); + gates_lat[gate.qubits[1]].push_back(&gate); + gates_seq.push_back(&gate); + } else { + gates_seq0.push_back(&gate); + } + } + + std::vector last(max_qubit1, 0); + + const RGate* delayed_measurement_gate = nullptr; + + // Fuse gates. + for (auto pgate : gates_seq) { + if (pgate->kind == gate::kMeasurement) { + delayed_measurement_gate = pgate; + } else if (pgate->qubits.size() > 2 + || pgate->controlled_by.size() > 0) { + // Multi-qubit or controlled gate. 
+ + for (auto q : pgate->qubits) { + unsigned l = last[q]; + if (gates_lat[q][l] != pgate) { + last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused); + } + ++last[q]; + } + + for (auto q : pgate->controlled_by) { + unsigned l = last[q]; + if (gates_lat[q][l] != pgate) { + last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused); + } + ++last[q]; + } + + gates_fused.push_back({pgate->kind, pgate->time, pgate->qubits, + pgate, {pgate}, {}}); + } else if (pgate->qubits.size() == 1) { + unsigned q0 = pgate->qubits[0]; + + GateFused gate_f = {pgate->kind, pgate->time, {q0}, pgate, {}, {}}; + + last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates); + gate_f.gates.push_back(gates_lat[q0][last[q0]]); + last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates); + + gates_fused.push_back(std::move(gate_f)); + } else if (pgate->qubits.size() == 2) { + unsigned q0 = pgate->qubits[0]; + unsigned q1 = pgate->qubits[1]; + + if (Done(last[q0], pgate->time, gates_lat[q0])) continue; + + GateFused gate_f = + {pgate->kind, pgate->time, {q0, q1}, pgate, {}, {}}; + + do { + last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates); + last[q1] = Advance(last[q1], gates_lat[q1], gate_f.gates); + // Here gates_lat[q0][last[q0]] == gates_lat[q1][last[q1]]. + + gate_f.gates.push_back(gates_lat[q0][last[q0]]); + + last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates); + last[q1] = Advance(last[q1] + 1, gates_lat[q1], gate_f.gates); + } while (NextGate(last[q0], gates_lat[q0], last[q1], gates_lat[q1])); + + gates_fused.push_back(std::move(gate_f)); + } + } + + for (unsigned q = 0; q < max_qubit1; ++q) { + auto l = last[q]; + if (l == gates_lat[q].size()) continue; + + // Orphaned qubit. 
+ AddOrphanedQubit(q, l, gates_lat, gates_fused); + } + + if (delayed_measurement_gate != nullptr) { + auto pgate = delayed_measurement_gate; + + const auto& mea_gates_at_time = measurement_gates[pgate->time]; + + GateFused gate_f = {pgate->kind, pgate->time, {}, pgate, {}, {}}; + gate_f.gates.reserve(mea_gates_at_time.size()); + + // Fuse measurement gates with equal times. + + for (const auto* pgate : mea_gates_at_time) { + gate_f.qubits.insert(gate_f.qubits.end(), + pgate->qubits.begin(), pgate->qubits.end()); + gate_f.gates.push_back(pgate); + } + + gates_fused.push_back(std::move(gate_f)); + } + + if (gates_seq0.size() != 0) { + Base::FuseZeroQubitGates(gates_seq0, [](const RGate* g) { return g; }, + last_fused_gate_index, gates_fused); + } + + if (gate_it == glast) break; + + last_fused_gate_index = gates_fused.size(); + } + + if (fuse_matrix) { + for (auto& gate_f : gates_fused) { + if (gate_f.kind != gate::kMeasurement && gate_f.kind != gate::kDecomp) { + CalculateFusedMatrix(gate_f); + } + } + } + + return gates_fused; + } + + private: + static unsigned Advance(unsigned k, const std::vector& wl, + std::vector& gates) { + while (k < wl.size() && wl[k]->qubits.size() == 1 + && wl[k]->controlled_by.size() == 0 && !wl[k]->unfusible) { + gates.push_back(wl[k++]); + } + + return k; + } + + static bool Done( + unsigned k, unsigned t, const std::vector& wl) { + return k >= wl.size() || wl[k]->time > t; + } + + static bool NextGate(unsigned k1, const std::vector& wl1, + unsigned k2, const std::vector& wl2) { + return k1 < wl1.size() && k2 < wl2.size() && wl1[k1] == wl2[k2] + && wl1[k1]->qubits.size() < 3 && wl1[k1]->controlled_by.size() == 0; + } + + template + static unsigned AddOrphanedQubit(unsigned q, unsigned k, + const GatesLat& gates_lat, + std::vector& gates_fused) { + auto pgate = gates_lat[q][k]; + + GateFused gate_f = {pgate->kind, pgate->time, {q}, pgate, {}, {}}; + gate_f.gates.push_back(pgate); + + k = Advance(k + 1, gates_lat[q], gate_f.gates); + + 
gates_fused.push_back(std::move(gate_f)); + + return k; + } + + template + static bool ValidateGate(const Gate2& gate, unsigned max_qubit1, + const GatesLat& gates_lat) { + for (unsigned q : gate.qubits) { + if (q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return false; + } + if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; + } + } + + for (unsigned q : gate.controlled_by) { + if (q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return false; + } + if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; + } + } + + return true; + } +}; + +} // namespace qsim + +#endif // FUSER_BASIC_H_ diff --git a/tpls/qsim/fuser_mqubit.h b/tpls/qsim/fuser_mqubit.h new file mode 100644 index 0000000..c75b1a0 --- /dev/null +++ b/tpls/qsim/fuser_mqubit.h @@ -0,0 +1,1095 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef FUSER_MQUBIT_H_ +#define FUSER_MQUBIT_H_ + +#include +#include +#include +#include +#include +#include + +#include "gate.h" +#include "fuser.h" + +namespace qsim { + +/** + * Multi-qubit gate fuser. 
+ * Measurement gates with equal times are fused together. + * User-defined controlled gates (controlled_by.size() > 0) are not fused. + * The template parameter Gate can be Gate type or a pointer to Gate type. + */ +template +class MultiQubitGateFuser final : public Fuser { + private: + using Base = Fuser; + using RGate = typename Base::RGate; + + // Auxillary classes and structs. + + // Manages doubly-linked lists. + template + class LinkManagerT { + public: + struct Link { + T val; + Link* next; + Link* prev; + }; + + explicit LinkManagerT(uint64_t size) { + links_.reserve(size); + } + + Link* AddBack(const T& t, Link* link) { + if (link == nullptr) { + links_.push_back({t, nullptr, nullptr}); + } else { + links_.push_back({t, link->next, link}); + link->next = &links_.back(); + } + + return &links_.back(); + } + + static void Delete(const Link* link) { + if (link->prev != nullptr) { + link->prev->next = link->next; + } + if (link->next != nullptr) { + link->next->prev = link->prev; + } + } + + private: + std::vector links_; + }; + + struct GateF; + + using LinkManager = LinkManagerT; + using Link = typename LinkManager::Link; + + // Intermediate representation of a fused gate. + struct GateF { + const RGate* parent; + std::vector qubits; + std::vector gates; // Gates that get fused to this gate. + std::vector links; // Gate "lattice" links. + uint64_t mask; // Qubit mask. + unsigned visited; + }; + + // Possible values for visited in GateF. + // Note that MakeGateSequence assignes values from kSecond to the number of + // gates in the sequence plus one, see below. + enum Visited { + kZero = 0, // Start value for "normal" gates. + kFirst = 1, // Value after the first pass for partially fused + // "normal" gates. + kSecond = 2, // Start value to assign values in MakeGateSequence. + kCompress = 99999997, // Used to compress links. + kMeaCnt = 99999998, // Start value for controlled or measurement gates. 
+ kFinal = 99999999, // Value after the second pass for fused "normal" + // gates or for controlled and measurement gates. + }; + + struct Stat { + unsigned num_mea_gates = 0; + unsigned num_fused_mea_gates = 0; + unsigned num_fused_gates = 0; + unsigned num_controlled_gates = 0; + std::vector num_gates; + }; + + // Gate that is added to a sequence of gates to fuse together. + struct GateA { + GateF* gate; + std::vector qubits; // Added qubits. + std::vector links; // Added lattice links. + }; + + struct Scratch { + std::vector data; + std::vector prev1; + std::vector prev2; + std::vector next1; + std::vector next2; + std::vector longest_seq; + std::vector stack; + std::vector gates; + unsigned count = 0; + }; + + public: + using GateFused = qsim::GateFused; + + /** + * User-specified parameters for gate fusion. + */ + struct Parameter { + /** + * Maximum number of qubits in a fused gate. It can take values from 2 to + * 6 (0 and 1 are equivalent to 2). It is not recommended to use 5 or 6 as + * that might degrade performance for not very fast machines. + */ + unsigned max_fused_size = 2; + unsigned verbosity = 0; + }; + + /** + * Stores sets of gates that can be applied together. To respect specific + * time boundaries while fusing gates, use the other version of this method + * below. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gates The gates (or pointers to the gates) to be fused. + * Gate times of the gates that act on the same qubits should be ordered. + * Gates that are out of time order should not cross the time boundaries + * set by measurement gates. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. 
+ */ + static std::vector FuseGates(const Parameter& param, + unsigned max_qubit1, + const std::vector& gates, + bool fuse_matrix = true) { + return FuseGates( + param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gates The gates (or pointers to the gates) to be fused. + * Gate times of the gates that act on the same qubits should be ordered. + * Gates that are out of time order should not cross the time boundaries + * set by `times_to_split_at` or by measurement gates. + * @param times_to_split_at Ordered list of time steps (boundaries) at which + * to separate fused gates. Each element of the output will contain gates + * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates( + const Parameter& param, + unsigned max_qubit1, const std::vector& gates, + const std::vector& times_to_split_at, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(), + times_to_split_at, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. To respect specific + * time boundaries while fusing gates, use the other version of this method + * below. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates + * (or pointers to gates) in. Gate times of the gates that act on the same + * qubits should be ordered. Gates that are out of time order should not + * cross the time boundaries set by measurement gates. 
+ * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates( + const Parameter& param, unsigned max_qubit1, + typename std::vector::const_iterator gfirst, + typename std::vector::const_iterator glast, + bool fuse_matrix = true) { + return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix); + } + + /** + * Stores sets of gates that can be applied together. + * @param param Options for gate fusion. + * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. + * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates + * (or pointers to gates) in. Gate times of the gates that act on the same + * qubits should be ordered. Gates that are out of time order should not + * cross the time boundaries set by `times_to_split_at` or by measurement + * gates. + * @param times_to_split_at Ordered list of time steps (boundaries) at which + * to separate fused gates. Each element of the output will contain gates + * from a single 'window' in this list. + * @param fuse_matrix If true, multiply gate matrices together. + * @return A vector of fused gate objects. Each element is a set of gates + * acting on a specific pair of qubits which can be applied as a group. + */ + static std::vector FuseGates( + const Parameter& param, unsigned max_qubit1, + typename std::vector::const_iterator gfirst, + typename std::vector::const_iterator glast, + const std::vector& times_to_split_at, + bool fuse_matrix = true) { + std::vector fused_gates; + + if (gfirst >= glast) return fused_gates; + + std::size_t num_gates = glast - gfirst; + + fused_gates.reserve(num_gates); + + // Merge with measurement gate times to separate fused gates at. 
+ auto epochs = + Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at); + + LinkManager link_manager(max_qubit1 * num_gates); + + // Auxillary data structures. + // Sequence of intermediate fused gates. + std::vector gates_seq; + // Gate "lattice". + std::vector gates_lat; + // Sequences of intermediate fused gates ordered by gate size. + std::vector> fgates(max_qubit1 + 1); + + gates_seq.reserve(num_gates); + gates_lat.reserve(max_qubit1); + + Scratch scratch; + + scratch.data.reserve(1024); + scratch.prev1.reserve(32); + scratch.prev2.reserve(32); + scratch.next1.reserve(32); + scratch.next2.reserve(32); + scratch.longest_seq.reserve(8); + scratch.stack.reserve(8); + + Stat stat; + stat.num_gates.resize(max_qubit1 + 1, 0); + + unsigned max_fused_size = std::min(unsigned{6}, param.max_fused_size); + max_fused_size = std::min(max_fused_size, max_qubit1); + + std::size_t last_fused_gate_index = 0; + auto gate_it = gfirst; + + // Iterate over epochs. + for (std::size_t l = 0; l < epochs.size(); ++l) { + gates_seq.resize(0); + gates_lat.resize(0); + gates_lat.resize(max_qubit1, nullptr); + + for (unsigned i = 0; i <= max_qubit1; ++i) { + fgates[i].resize(0); + } + + uint64_t max_gate_size = 0; + GateF* last_mea_gate = nullptr; + + // Iterate over input gates. + for (; gate_it < glast; ++gate_it) { + const auto& gate = Base::GateToConstRef(*gate_it); + + if (gate.time > epochs[l]) break; + + if (!ValidateGate(gate, max_qubit1, gates_lat)) { + fused_gates.resize(0); + return fused_gates; + } + + // Fill in auxillary data structures. + + if (gate.kind == gate::kMeasurement) { + // Measurement gate. 
+ + if (last_mea_gate == nullptr + || last_mea_gate->parent->time != gate.time) { + gates_seq.push_back({&gate, {}, {}, {}, 0, kMeaCnt}); + last_mea_gate = &gates_seq.back(); + + last_mea_gate->qubits.reserve(max_qubit1); + last_mea_gate->links.reserve(max_qubit1); + + ++stat.num_fused_mea_gates; + } + + for (auto q : gate.qubits) { + last_mea_gate->qubits.push_back(q); + last_mea_gate->mask |= uint64_t{1} << q; + gates_lat[q] = link_manager.AddBack(last_mea_gate, gates_lat[q]); + last_mea_gate->links.push_back(gates_lat[q]); + } + + last_mea_gate->gates.push_back(&gate); + + ++stat.num_mea_gates; + } else { + gates_seq.push_back({&gate, {}, {}, {}, 0, kZero}); + auto& fgate = gates_seq.back(); + + if (gate.controlled_by.size() == 0) { + if (max_gate_size < gate.qubits.size()) { + max_gate_size = gate.qubits.size(); + } + + unsigned num_gate_qubits = gate.qubits.size(); + unsigned size = std::max(max_fused_size, num_gate_qubits); + + fgate.qubits.reserve(size); + fgate.links.reserve(size); + fgate.gates.reserve(4 * size); + fgate.links.reserve(size); + + if (fgates[num_gate_qubits].empty()) { + fgates[num_gate_qubits].reserve(num_gates); + } + fgates[num_gate_qubits].push_back(&fgate); + + ++stat.num_gates[num_gate_qubits]; + } else { + // Controlled gate. + // Controlled gates are not fused with other gates. 
+ + uint64_t size = gate.qubits.size() + gate.controlled_by.size(); + + fgate.qubits.reserve(gate.qubits.size()); + fgate.links.reserve(size); + + fgate.visited = kMeaCnt; + fgate.gates.push_back(&gate); + + ++stat.num_controlled_gates; + } + + for (auto q : gate.qubits) { + fgate.qubits.push_back(q); + fgate.mask |= uint64_t{1} << q; + gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]); + fgate.links.push_back(gates_lat[q]); + } + + for (auto q : gate.controlled_by) { + fgate.mask |= uint64_t{1} << q; + gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]); + fgate.links.push_back(gates_lat[q]); + } + } + } + + // Fuse large gates with smaller gates. + FuseGates(max_gate_size, fgates); + + if (max_fused_size > 2) { + FuseGateSequences( + max_fused_size, max_qubit1, scratch, gates_seq, stat, fused_gates); + } else { + unsigned prev_time = 0; + + std::vector orphaned_gates; + orphaned_gates.reserve(max_qubit1); + + for (auto& fgate : gates_seq) { + if (fgate.gates.size() == 0) continue; + + if (prev_time != fgate.parent->time) { + if (orphaned_gates.size() > 0) { + FuseOrphanedGates( + max_fused_size, stat, orphaned_gates, fused_gates); + orphaned_gates.resize(0); + } + + prev_time = fgate.parent->time; + } + + if (fgate.qubits.size() == 1 && max_fused_size > 1 + && fgate.visited != kMeaCnt && !fgate.parent->unfusible) { + orphaned_gates.push_back(&fgate); + continue; + } + + // Assume fgate.qubits (gate.qubits) are sorted. 
+ fused_gates.push_back({fgate.parent->kind, fgate.parent->time, + std::move(fgate.qubits), fgate.parent, + std::move(fgate.gates), {}}); + + if (fgate.visited != kMeaCnt) { + ++stat.num_fused_gates; + } + } + + if (orphaned_gates.size() > 0) { + FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); + } + } + + if (fgates[0].size() != 0) { + Base::FuseZeroQubitGates(fgates[0], + [](const GateF* g) { return g->parent; }, + last_fused_gate_index, fused_gates); + } + + last_fused_gate_index = fused_gates.size(); + } + + if (fuse_matrix) { + for (auto& fgate : fused_gates) { + if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp) { + CalculateFusedMatrix(fgate); + } + } + } + + PrintStat(param.verbosity, stat, fused_gates); + + return fused_gates; + } + + private: + // Fuse large gates with smaller gates. + static void FuseGates(uint64_t max_gate_size, + std::vector>& fgates) { + // Traverse gates in order of decreasing size. + for (uint64_t i = 0; i < max_gate_size; ++i) { + std::size_t pos = 0; + + for (auto fgate : fgates[max_gate_size - i]) { + if (fgate->visited > kZero) continue; + + fgates[max_gate_size - i][pos++] = fgate; + + fgate->visited = kFirst; + + FusePrev(0, *fgate); + fgate->gates.push_back(fgate->parent); + FuseNext(0, *fgate); + } + + fgates[max_gate_size - i].resize(pos); + } + } + + // Try to fuse gate sequences as follows. Gate time goes from bottom to top. + // Gates are fused either from left to right or from right to left. 
+ // + // max_fused_size = 3: _- or -_ + // + // max_fused_size = 4: _-_ + // + // max_fused_size = 5: _-_- or -_-_ + // + // max_fused_size = 6: _-_-_ + static void FuseGateSequences(unsigned max_fused_size, + unsigned max_qubit1, Scratch& scratch, + std::vector& gates_seq, Stat& stat, + std::vector& fused_gates) { + unsigned prev_time = 0; + + std::vector orphaned_gates; + orphaned_gates.reserve(max_qubit1); + + for (auto& fgate : gates_seq) { + if (prev_time != fgate.parent->time) { + if (orphaned_gates.size() > 0) { + FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); + orphaned_gates.resize(0); + } + + prev_time = fgate.parent->time; + } + + if (fgate.visited == kFinal || fgate.gates.size() == 0) continue; + + if (fgate.visited == kMeaCnt || fgate.qubits.size() >= max_fused_size + || fgate.parent->unfusible) { + if (fgate.visited != kMeaCnt) { + ++stat.num_fused_gates; + } + + fgate.visited = kFinal; + + fused_gates.push_back({fgate.parent->kind, fgate.parent->time, + std::move(fgate.qubits), fgate.parent, + std::move(fgate.gates), {}}); + + continue; + } + + + if (fgate.qubits.size() == 1 && max_fused_size > 1) { + orphaned_gates.push_back(&fgate); + continue; + } + + scratch.data.resize(0); + scratch.gates.resize(0); + scratch.count = 0; + + MakeGateSequence(max_fused_size, scratch, fgate); + + if (scratch.gates.size() == 0) { + orphaned_gates.push_back(&fgate); + } else { + for (auto fgate : scratch.gates) { + std::sort(fgate->qubits.begin(), fgate->qubits.end()); + + fused_gates.push_back({fgate->parent->kind, fgate->parent->time, + std::move(fgate->qubits), fgate->parent, + std::move(fgate->gates), {}}); + + ++stat.num_fused_gates; + } + } + } + + if (orphaned_gates.size() > 0) { + FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); + } + } + + static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat, + std::vector& orphaned_gates, + std::vector& fused_gates) { + for (std::size_t i = 0; i < 
orphaned_gates.size(); ++i) { + auto ogate1 = orphaned_gates[i]; + + if (ogate1->visited == kFinal) continue; + + ogate1->visited = kFinal; + + for (std::size_t j = i + 1; j < orphaned_gates.size(); ++j) { + auto ogate2 = orphaned_gates[j]; + + if (ogate2->visited == kFinal) continue; + + unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size(); + + if (cur_size <= max_fused_size) { + ogate2->visited = kFinal; + + for (auto q : ogate2->qubits) { + ogate1->qubits.push_back(q); + ogate1->mask |= uint64_t{1} << q; + } + + for (auto l : ogate2->links) { + ogate1->links.push_back(l); + } + + for (auto gate : ogate2->gates) { + ogate1->gates.push_back(gate); + } + } + + if (cur_size == max_fused_size) { + break; + } + } + + FuseNext(1, *ogate1); + + std::sort(ogate1->qubits.begin(), ogate1->qubits.end()); + + fused_gates.push_back({ogate1->parent->kind, ogate1->parent->time, + std::move(ogate1->qubits), ogate1->parent, + std::move(ogate1->gates), {}}); + + ++stat.num_fused_gates; + } + } + + static void MakeGateSequence( + unsigned max_fused_size, Scratch& scratch, GateF& fgate) { + unsigned level = kSecond + scratch.count; + + FindLongestGateSequence(max_fused_size, level, scratch, fgate); + + auto longest_seq = scratch.longest_seq; + + if (longest_seq.size() == 1 && scratch.count == 0) { + fgate.visited = kFirst; + return; + } + + ++scratch.count; + + for (auto p : longest_seq) { + p->gate->visited = kCompress; + + for (auto q : p->qubits) { + fgate.qubits.push_back(q); + fgate.mask |= uint64_t{1} << q; + } + + for (auto l : p->links) { + fgate.links.push_back(l); + } + } + + // Compress links. 
+ for (auto& link : fgate.links) { + while (link->prev != nullptr && link->prev->val->visited == kCompress) { + link = link->prev; + } + + while (link->next != nullptr && link->next->val->visited == kCompress) { + LinkManager::Delete(link->next); + } + } + + for (auto p : longest_seq) { + p->gate->visited = level; + } + + if (longest_seq.size() >= 3) { + AddGatesFromNext(longest_seq[2]->gate->gates, fgate); + } + + if (longest_seq.size() >= 5) { + AddGatesFromNext(longest_seq[4]->gate->gates, fgate); + } + + if (longest_seq.size() >= 2) { + // May call MakeGateSequence recursively. + AddGatesFromPrev(max_fused_size, *longest_seq[1]->gate, scratch, fgate); + } + + if (longest_seq.size() >= 4) { + // May call MakeGateSequence recursively. + AddGatesFromPrev(max_fused_size, *longest_seq[3]->gate, scratch, fgate); + } + + for (auto p : longest_seq) { + p->gate->visited = kFinal; + } + + FuseNext(1, fgate); + + scratch.gates.push_back(&fgate); + } + + static void AddGatesFromNext(std::vector& gates, GateF& fgate) { + for (auto gate : gates) { + fgate.gates.push_back(gate); + } + } + + static void AddGatesFromPrev(unsigned max_fused_size, const GateF& pfgate, + Scratch& scratch, GateF& fgate) { + for (auto gate : pfgate.gates) { + fgate.gates.push_back(gate); + } + + for (auto link : pfgate.links) { + if (link->prev == nullptr) continue; + + auto pgate = link->prev->val; + + if (pgate->visited == kFirst) { + MakeGateSequence(max_fused_size, scratch, *pgate); + } + } + } + + static void FindLongestGateSequence( + unsigned max_fused_size, unsigned level, Scratch& scratch, GateF& fgate) { + scratch.data.push_back({&fgate, {}, {}}); + + scratch.longest_seq.resize(0); + scratch.longest_seq.push_back(&scratch.data.back()); + + scratch.stack.resize(0); + scratch.stack.push_back(&scratch.data.back()); + + unsigned cur_size = fgate.qubits.size(); + fgate.visited = level; + + unsigned max_size = cur_size; + + GetNextAvailableGates(max_fused_size, cur_size, fgate, nullptr, + 
scratch.data, scratch.next1); + + for (auto n1 : scratch.next1) { + unsigned cur_size2 = cur_size + n1->qubits.size(); + if (cur_size2 > max_fused_size) continue; + + bool feasible = GetPrevAvailableGates(max_fused_size, cur_size, + level, *n1->gate, nullptr, + scratch.data, scratch.prev1); + + if (!feasible) continue; + + if (scratch.prev1.size() == 0 && max_fused_size > 3) continue; + + if (cur_size2 == max_fused_size) { + std::swap(scratch.longest_seq, scratch.stack); + scratch.longest_seq.push_back(n1); + return; + } + + Push(level, cur_size2, cur_size, max_size, scratch, n1); + + for (auto p1 : scratch.prev1) { + unsigned cur_size2 = cur_size + p1->qubits.size(); + + if (cur_size2 > max_fused_size) { + continue; + } else if (cur_size2 == max_fused_size) { + std::swap(scratch.longest_seq, scratch.stack); + scratch.longest_seq.push_back(p1); + return; + } + + Push(level, cur_size2, cur_size, max_size, scratch, p1); + + GetNextAvailableGates(max_fused_size, cur_size, *p1->gate, &fgate, + scratch.data, scratch.next2); + + for (auto n2 : scratch.next2) { + unsigned cur_size2 = cur_size + n2->qubits.size(); + if (cur_size2 > max_fused_size) continue; + + bool feasible = GetPrevAvailableGates(max_fused_size, cur_size, + level, *n2->gate, n1->gate, + scratch.data, scratch.prev2); + + if (!feasible) continue; + + if (cur_size2 == max_fused_size) { + std::swap(scratch.longest_seq, scratch.stack); + scratch.longest_seq.push_back(n2); + return; + } + + Push(level, cur_size2, cur_size, max_size, scratch, n2); + + for (auto p2 : scratch.prev2) { + unsigned cur_size2 = cur_size + p2->qubits.size(); + + if (cur_size2 > max_fused_size) { + continue; + } else if (cur_size2 == max_fused_size) { + std::swap(scratch.longest_seq, scratch.stack); + scratch.longest_seq.push_back(p2); + return; + } + + if (cur_size2 > max_size) { + scratch.stack.push_back(p2); + scratch.longest_seq = scratch.stack; + scratch.stack.pop_back(); + max_size = cur_size2; + } + } + + Pop(cur_size, scratch, 
n2); + } + + Pop(cur_size, scratch, p1); + } + + Pop(cur_size, scratch, n1); + } + } + + static void Push(unsigned level, unsigned cur_size2, unsigned& cur_size, + unsigned& max_size, Scratch& scratch, GateA* agate) { + agate->gate->visited = level; + cur_size = cur_size2; + scratch.stack.push_back(agate); + + if (cur_size > max_size) { + scratch.longest_seq = scratch.stack; + max_size = cur_size; + } + } + + static void Pop(unsigned& cur_size, Scratch& scratch, GateA* agate) { + agate->gate->visited = kFirst; + cur_size -= agate->qubits.size(); + scratch.stack.pop_back(); + } + + static void GetNextAvailableGates(unsigned max_fused_size, unsigned cur_size, + const GateF& pgate1, const GateF* pgate2, + std::vector& scratch, + std::vector& next_gates) { + next_gates.resize(0); + + for (auto link : pgate1.links) { + if (link->next == nullptr) continue; + + auto ngate = link->next->val; + + if (ngate->visited > kFirst || ngate->parent->unfusible) continue; + + GateA next = {ngate, {}, {}}; + next.qubits.reserve(8); + next.links.reserve(8); + + GetAddedQubits(pgate1, pgate2, *ngate, next); + + if (cur_size + next.qubits.size() > max_fused_size) continue; + + scratch.push_back(std::move(next)); + next_gates.push_back(&scratch.back()); + } + } + + static bool GetPrevAvailableGates(unsigned max_fused_size, + unsigned cur_size, unsigned level, + const GateF& ngate1, const GateF* ngate2, + std::vector& scratch, + std::vector& prev_gates) { + prev_gates.resize(0); + + for (auto link : ngate1.links) { + if (link->prev == nullptr) continue; + + auto pgate = link->prev->val; + + if (pgate->visited == kFinal || pgate->visited == level) continue; + + if (pgate->visited > kFirst || pgate->parent->unfusible) { + prev_gates.resize(0); + return false; + } + + GateA prev = {pgate, {}, {}}; + prev.qubits.reserve(8); + prev.links.reserve(8); + + GetAddedQubits(ngate1, ngate2, *pgate, prev); + + bool all_prev_visited = true; + + for (auto link : pgate->links) { + if (link->prev == 
nullptr) continue; + + if (link->prev->val->visited <= kMeaCnt) { + all_prev_visited = false; + break; + } + } + + if (!all_prev_visited) { + prev_gates.resize(0); + return false; + } + + if (cur_size + prev.qubits.size() > max_fused_size) continue; + + if (all_prev_visited) { + scratch.push_back(std::move(prev)); + prev_gates.push_back(&scratch.back()); + } + } + + return true; + } + + static void GetAddedQubits(const GateF& fgate0, const GateF* fgate1, + const GateF& fgate2, GateA& added) { + for (std::size_t i = 0; i < fgate2.qubits.size(); ++i) { + unsigned q2 = fgate2.qubits[i]; + + if (std::find(fgate0.qubits.begin(), fgate0.qubits.end(), q2) + != fgate0.qubits.end()) continue; + + if (fgate1 != nullptr + && std::find(fgate1->qubits.begin(), fgate1->qubits.end(), q2) + != fgate1->qubits.end()) continue; + + added.qubits.push_back(q2); + added.links.push_back(fgate2.links[i]); + } + } + + // Fuse smaller gates with fgate back in gate time. + static void FusePrev(unsigned pass, GateF& fgate) { + std::vector gates; + gates.reserve(fgate.gates.capacity()); + + auto neighbor = [](const Link* link) -> const Link* { + return link->prev; + }; + + FusePrevOrNext>(pass, neighbor, fgate, gates); + + for (auto it = gates.rbegin(); it != gates.rend(); ++it) { + fgate.gates.push_back(*it); + } + } + + // Fuse smaller gates with fgate forward in gate time. 
+ static void FuseNext(unsigned pass, GateF& fgate) { + auto neighbor = [](const Link* link) -> const Link* { + return link->next; + }; + + FusePrevOrNext>(pass, neighbor, fgate, fgate.gates); + } + + template + static void FusePrevOrNext(unsigned pass, Neighbor neighb, GateF& fgate, + std::vector& gates) { + uint64_t bad_mask = 0; + auto links = fgate.links; + + bool may_have_gates_to_fuse = true; + + while (may_have_gates_to_fuse) { + may_have_gates_to_fuse = false; + + std::sort(links.begin(), links.end(), + [&neighb](const Link* l, const Link* r) -> bool { + auto ln = neighb(l); + auto rn = neighb(r); + + if (ln != nullptr && rn != nullptr) { + return R()(ln->val->parent->time, rn->val->parent->time); + } else { + // nullptrs are larger than everything else and + // equivalent among each other. + return ln != nullptr; + } + }); + + for (auto link : links) { + auto n = neighb(link); + + if (n == nullptr) continue; + + auto g = n->val; + + if (!QubitsAreIn(fgate.mask, g->mask) || (g->mask & bad_mask) != 0 + || g->visited > pass || g->parent->unfusible) { + bad_mask |= g->mask; + } else { + g->visited = pass == 0 ? 
kFirst : kFinal; + + if (pass == 0) { + gates.push_back(g->parent); + } else { + for (auto gate : g->gates) { + gates.push_back(gate); + } + } + + for (auto link : g->links) { + LinkManager::Delete(link); + } + + may_have_gates_to_fuse = true; + break; + } + } + } + } + + static bool QubitsAreIn(uint64_t mask0, uint64_t mask) { + return ((mask0 | mask) ^ mask0) == 0; + } + + static void PrintStat(unsigned verbosity, const Stat& stat, + const std::vector& fused_gates) { + if (verbosity < 3) return; + + if (stat.num_controlled_gates > 0) { + IO::messagef("%lu controlled gates\n", stat.num_controlled_gates); + } + + if (stat.num_mea_gates > 0) { + IO::messagef("%lu measurement gates", stat.num_mea_gates); + if (stat.num_fused_mea_gates == stat.num_mea_gates) { + IO::messagef("\n"); + } else { + IO::messagef(" are fused into %lu gates\n", stat.num_fused_mea_gates); + } + } + + bool first = true; + for (unsigned i = 1; i < stat.num_gates.size(); ++i) { + if (stat.num_gates[i] > 0) { + if (first) { + first = false; + } else { + IO::messagef(", "); + } + IO::messagef("%u %u-qubit", stat.num_gates[i], i); + } + } + + IO::messagef(" gates are fused into %lu gates\n", stat.num_fused_gates); + + if (verbosity < 5) return; + + IO::messagef("fused gate qubits:\n"); + for (const auto& g : fused_gates) { + IO::messagef("%6u ", g.parent->time); + if (g.parent->kind == gate::kMeasurement) { + IO::messagef("m"); + } else if (g.parent->controlled_by.size() > 0) { + IO::messagef("c"); + for (auto q : g.parent->controlled_by) { + IO::messagef("%3u", q); + } + IO::messagef(" t"); + } else { + IO::messagef(" "); + } + + for (auto q : g.qubits) { + IO::messagef("%3u", q); + } + IO::messagef("\n"); + } + } + + template + static bool ValidateGate(const Gate2& gate, unsigned max_qubit1, + const GatesLat& gates_lat) { + for (unsigned q : gate.qubits) { + if (q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return 
false; + } + if (gates_lat[q] != nullptr + && gate.time <= gates_lat[q]->val->parent->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; + } + } + + for (unsigned q : gate.controlled_by) { + if (q >= max_qubit1) { + IO::errorf("fuser: gate qubit %u is out of range " + "(should be smaller than %u).\n", q, max_qubit1); + return false; + } + if (gates_lat[q] != nullptr + && gate.time <= gates_lat[q]->val->parent->time) { + IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); + return false; + } + } + + return true; + } +}; + +} // namespace qsim + +#endif // FUSER_MQUBIT_H_ diff --git a/tpls/qsim/gate.h b/tpls/qsim/gate.h new file mode 100644 index 0000000..a457acb --- /dev/null +++ b/tpls/qsim/gate.h @@ -0,0 +1,216 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef GATE_H_ +#define GATE_H_ + +#include +#include +#include +#include + +#include "matrix.h" + +namespace qsim { + +namespace detail { + +template +inline void SortQubits(Gate& gate) { + for (std::size_t i = 1; i < gate.qubits.size(); ++i) { + if (gate.qubits[i - 1] > gate.qubits[i]) { + if (!GateDef::symmetric) { + auto perm = NormalToGateOrderPermutation(gate.qubits); + MatrixShuffle(perm, gate.qubits.size(), gate.matrix); + } + + gate.swapped = true; + std::sort(gate.qubits.begin(), gate.qubits.end()); + break; + } + } +} + +} // namespace detail + +template , typename Gate> +inline Gate& MakeControlledGate(Qubits&& controlled_by, Gate& gate) { + gate.controlled_by = std::forward(controlled_by); + gate.cmask = (uint64_t{1} << gate.controlled_by.size()) - 1; + + std::sort(gate.controlled_by.begin(), gate.controlled_by.end()); + + return gate; +} + +template , typename Gate> +inline Gate& MakeControlledGate(Qubits&& controlled_by, + const std::vector& control_values, + Gate& gate) { + // Assume controlled_by.size() == control_values.size(). + + bool sorted = true; + + for (std::size_t i = 1; i < controlled_by.size(); ++i) { + if (controlled_by[i - 1] > controlled_by[i]) { + sorted = false; + break; + } + } + + if (sorted) { + gate.controlled_by = std::forward(controlled_by); + gate.cmask = 0; + + for (std::size_t i = 0; i < control_values.size(); ++i) { + gate.cmask |= (control_values[i] & 1) << i; + } + } else { + struct ControlPair { + unsigned q; + unsigned v; + }; + + std::vector cpairs; + cpairs.reserve(controlled_by.size()); + + for (std::size_t i = 0; i < controlled_by.size(); ++i) { + cpairs.push_back({controlled_by[i], control_values[i]}); + } + + // Sort control qubits and control values. 
+ std::sort(cpairs.begin(), cpairs.end(), + [](const ControlPair& l, const ControlPair& r) -> bool { + return l.q < r.q; + }); + + gate.cmask = 0; + gate.controlled_by.reserve(controlled_by.size()); + + for (std::size_t i = 0; i < cpairs.size(); ++i) { + gate.cmask |= (cpairs[i].v & 1) << i; + gate.controlled_by.push_back(cpairs[i].q); + } + } + + return gate; +} + +namespace gate { + +constexpr int kDecomp = 100001; // gate from Schmidt decomposition +constexpr int kMeasurement = 100002; // measurement gate + +} // namespace gate + +enum GateAnyKind { + kGateAny = -1, +}; + +/** + * A generic gate to make it easier to use qsim with external gate sets. + */ +template +struct Gate { + using fp_type = FP; + using GateKind = GK; + + GateKind kind; + unsigned time; + std::vector qubits; + std::vector controlled_by; + uint64_t cmask; + std::vector params; + Matrix matrix; + bool unfusible; // If true, the gate is fused as a parent. + bool swapped; // If true, the gate qubits are swapped to make qubits + // ordered in ascending order. This does not apply to + // control qubits of explicitly-controlled gates. 
+ + template > + Gate&& ControlledBy(Qubits&& controlled_by) { + MakeControlledGate(std::forward(controlled_by), *this); + return std::move(*this); + } + + template > + Gate&& ControlledBy(Qubits&& controlled_by, + const std::vector& control_values) { + MakeControlledGate( + std::forward(controlled_by), control_values, *this); + return std::move(*this); + } +}; + +template , + typename M = Matrix> +inline Gate CreateGate(unsigned time, Qubits&& qubits, M&& matrix = {}, + std::vector&& params = {}) { + Gate gate = {GateDef::kind, time, std::forward(qubits), {}, 0, + std::move(params), std::forward(matrix), false, false}; + + if (GateDef::kind != gate::kMeasurement) { + switch (gate.qubits.size()) { + case 1: + break; + case 2: + if (gate.qubits[0] > gate.qubits[1]) { + gate.swapped = true; + std::swap(gate.qubits[0], gate.qubits[1]); + if (!GateDef::symmetric) { + MatrixShuffle({1, 0}, 2, gate.matrix); + } + } + break; + default: + detail::SortQubits(gate); + } + } + + return gate; +} + +namespace gate { + +/** + * A gate that simulates measurement of one or more qubits, collapsing the + * state vector and storing the measured results. + */ +template +struct Measurement { + using GateKind = typename Gate::GateKind; + + static constexpr GateKind kind = GateKind::kMeasurement; + static constexpr char name[] = "m"; + static constexpr bool symmetric = false; + + template > + static Gate Create(unsigned time, Qubits&& qubits) { + return CreateGate(time, std::forward(qubits)); + } +}; + +} // namespace gate + +template +using schmidt_decomp_type = std::vector>>; + +template +schmidt_decomp_type GetSchmidtDecomp( + GateKind kind, const std::vector& params); + +} // namespace qsim + +#endif // GATE_H_ diff --git a/tpls/qsim/gate_appl.h b/tpls/qsim/gate_appl.h new file mode 100644 index 0000000..8601e6f --- /dev/null +++ b/tpls/qsim/gate_appl.h @@ -0,0 +1,231 @@ +// Copyright 2019 Google LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GATE_APPL_H_ +#define GATE_APPL_H_ + +#include +#include + +#include "fuser.h" +#include "gate.h" +#include "matrix.h" + +namespace qsim { + +/** + * Applies the given gate to the simulator state. Ignores measurement gates. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param state The state of the system, to be updated by this method. + */ +template +inline void ApplyGate(const Simulator& simulator, const Gate& gate, + typename Simulator::State& state) { + if (gate.kind != gate::kMeasurement) { + if (gate.controlled_by.size() == 0) { + simulator.ApplyGate(gate.qubits, gate.matrix.data(), state); + } else { + simulator.ApplyControlledGate(gate.qubits, gate.controlled_by, + gate.cmask, gate.matrix.data(), state); + } + } +} + +/** + * Applies the given gate dagger to the simulator state. If the gate matrix is + * unitary then this is equivalent to applying the inverse gate. Ignores + * measurement gates. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ +template +inline void ApplyGateDagger(const Simulator& simulator, const Gate& gate, + typename Simulator::State& state) { + if (gate.kind != gate::kMeasurement) { + auto matrix = gate.matrix; + MatrixDagger(unsigned{1} << gate.qubits.size(), matrix); + + if (gate.controlled_by.size() == 0) { + simulator.ApplyGate(gate.qubits, matrix.data(), state); + } else { + simulator.ApplyControlledGate(gate.qubits, gate.controlled_by, + gate.cmask, matrix.data(), state); + } + } +} + +/** + * Applies the given gate to the simulator state. + * @param state_space StateSpace object required to perform measurements. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param rgen Random number generator to perform measurements. + * @param state The state of the system, to be updated by this method. + * @param mresults As an input parameter, this can be empty or this can + * contain the results of the previous measurements. If gate is a measurement + * gate then after a successful run, the measurement result will be added to + * this. + * @return True if the measurement performed successfully; false otherwise. + */ +template +inline bool ApplyGate( + const typename Simulator::StateSpace& state_space, + const Simulator& simulator, const Gate& gate, Rgen& rgen, + typename Simulator::State& state, + std::vector& mresults) { + if (gate.kind == gate::kMeasurement) { + auto measure_result = state_space.Measure(gate.qubits, rgen, state); + if (measure_result.valid) { + mresults.push_back(std::move(measure_result)); + } else { + return false; + } + } else { + ApplyGate(simulator, gate, state); + } + + return true; +} + +/** + * Applies the given gate to the simulator state, discarding measurement + * results. + * @param state_space StateSpace object required to perform measurements. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. 
+ * @param gate The gate to be applied. + * @param rgen Random number generator to perform measurements. + * @param state The state of the system, to be updated by this method. + * @return True if the measurement performed successfully; false otherwise. + */ +template +inline bool ApplyGate(const typename Simulator::StateSpace& state_space, + const Simulator& simulator, const Gate& gate, Rgen& rgen, + typename Simulator::State& state) { + using MeasurementResult = typename Simulator::StateSpace::MeasurementResult; + std::vector discarded_results; + return + ApplyGate(state_space, simulator, gate, rgen, state, discarded_results); +} + +/** + * Applies the given fused gate to the simulator state. Ignores measurement + * gates. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param state The state of the system, to be updated by this method. + */ +template +inline void ApplyFusedGate(const Simulator& simulator, const Gate& gate, + typename Simulator::State& state) { + if (gate.kind != gate::kMeasurement) { + if (gate.parent->controlled_by.size() == 0) { + simulator.ApplyGate(gate.qubits, gate.matrix.data(), state); + } else { + simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by, + gate.parent->cmask, gate.matrix.data(), + state); + } + } +} + +/** + * Applies the given fused gate dagger to the simulator state. If the gate + * matrix is unitary then this is equivalent to applying the inverse gate. + * Ignores measurement gates. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ +template +inline void ApplyFusedGateDagger(const Simulator& simulator, const Gate& gate, + typename Simulator::State& state) { + if (gate.kind != gate::kMeasurement) { + auto matrix = gate.matrix; + MatrixDagger(unsigned{1} << gate.qubits.size(), matrix); + + if (gate.parent->controlled_by.size() == 0) { + simulator.ApplyGate(gate.qubits, matrix.data(), state); + } else { + simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by, + gate.parent->cmask, matrix.data(), state); + } + } +} + +/** + * Applies the given fused gate to the simulator state. + * @param state_space StateSpace object required to perform measurements. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param rgen Random number generator to perform measurements. + * @param state The state of the system, to be updated by this method. + * @param mresults As an input parameter, this can be empty or this can + * contain the results of the previous measurements. If gate is a measurement + * gate then after a successful run, the measurement result will be added to + * this. + * @return True if the measurement performed successfully; false otherwise. + */ +template +inline bool ApplyFusedGate( + const typename Simulator::StateSpace& state_space, + const Simulator& simulator, const Gate& gate, Rgen& rgen, + typename Simulator::State& state, + std::vector& mresults) { + if (gate.kind == gate::kMeasurement) { + auto measure_result = state_space.Measure(gate.qubits, rgen, state); + if (measure_result.valid) { + mresults.push_back(std::move(measure_result)); + } else { + return false; + } + } else { + ApplyFusedGate(simulator, gate, state); + } + + return true; +} + +/** + * Applies the given fused gate to the simulator state, discarding measurement + * results. + * @param state_space StateSpace object required to perform measurements. + * @param simulator Simulator object. 
Provides specific implementations for + * applying gates. + * @param gate The gate to be applied. + * @param rgen Random number generator to perform measurements. + * @param state The state of the system, to be updated by this method. + * @return True if the measurement performed successfully; false otherwise. + */ +template +inline bool ApplyFusedGate(const typename Simulator::StateSpace& state_space, + const Simulator& simulator, const Gate& gate, + Rgen& rgen, typename Simulator::State& state) { + using MeasurementResult = typename Simulator::StateSpace::MeasurementResult; + std::vector discarded_results; + return ApplyFusedGate( + state_space, simulator, gate, rgen, state, discarded_results); +} + +} // namespace qsim + +#endif // GATE_APPL_H_ diff --git a/tpls/qsim/gates_cirq.h b/tpls/qsim/gates_cirq.h new file mode 100644 index 0000000..d767959 --- /dev/null +++ b/tpls/qsim/gates_cirq.h @@ -0,0 +1,1640 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GATES_CIRQ_H_ +#define GATES_CIRQ_H_ + +#include +#include +#include +#include + +#include "gate.h" +#include "matrix.h" + +namespace qsim { + +namespace Cirq { + +enum GateKind { + kI1 = 0, // One-qubit identity gate. + kI2, // Two-qubit identity gate. + kI, // Multi-qubit identity gate. 
+ kXPowGate, + kYPowGate, + kZPowGate, + kHPowGate, + kCZPowGate, + kCXPowGate, + krx, + kry, + krz, + kH, + kS, + kCZ, + kCX, + kT, + kX, + kY, + kZ, + kPhasedXPowGate, + kPhasedXZGate, + kXXPowGate, + kYYPowGate, + kZZPowGate, + kXX, + kYY, + kZZ, + kSwapPowGate, + kISwapPowGate, + kriswap, + kSWAP, + kISWAP, + kPhasedISwapPowGate, + kgivens, + kFSimGate, + kTwoQubitDiagonalGate, + kThreeQubitDiagonalGate, + kCCZPowGate, + kCCXPowGate, + kCSwapGate, + kCCZ, + kCCX, + kMatrixGate1, // One-qubit matrix gate. + kMatrixGate2, // Two-qubit matrix gate. + kMatrixGate, // Multi-qubit matrix gate. + kGlobalPhaseGate, + kDecomp = gate::kDecomp, + kMeasurement = gate::kMeasurement, +}; + +template +using GateCirq = Gate; + +constexpr double h_double = 0.5; +constexpr double pi_double = 3.14159265358979323846264338327950288; +constexpr double is2_double = 0.7071067811865475; + +// Gates from cirq/ops/global_phase_op.py: + +/** + * The global phase gate. + */ +template +struct GlobalPhaseGate { + static constexpr GateKind kind = kGlobalPhaseGate; + static constexpr char name[] = "GlobalPhaseGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, fp_type phi) { + return Create(time, std::cos(phi), std::sin(phi)); + } + + static GateCirq Create(unsigned time, fp_type cp, fp_type sp) { + return CreateGate, GlobalPhaseGate>( + time, {}, {cp, sp}, {cp, sp}); + } +}; + +template +using global_phase_operation = GlobalPhaseGate; + +// Gates from cirq/ops/identity.py: + +/** + * A one-qubit identity gate. + */ +template +struct I1 { + static constexpr GateKind kind = kI1; + static constexpr char name[] = "I1"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, I1>( + time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0}); + } +}; + +/** + * A two-qubit identity gate. 
+ */ +template +struct I2 { + static constexpr GateKind kind = kI2; + static constexpr char name[] = "I2"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, I2>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, + }; + } +}; + +/** + * A multi-qubit identity gate. + */ +template +struct I { + static constexpr GateKind kind = kI; + static constexpr char name[] = "I"; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, + const std::vector& qubits) { + Matrix matrix; + MatrixIdentity(1 << qubits.size(), matrix); + return CreateGate, I>(time, qubits, std::move(matrix)); + } +}; + +// Gates from cirq/ops/common_gates.py: + +/** + * A gate that rotates around the X axis of the Bloch sphere. + * This is a generalization of the X gate. + */ +template +struct XPowGate { + static constexpr GateKind kind = kXPowGate; + static constexpr char name[] = "XPowGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); + + return CreateGate, XPowGate>( + time, {q0}, {c * gc, c * gs, s * gs, -s * gc, + s * gs, -s * gc, c * gc, c * gs}, + {exponent, global_shift}); + } +}; + +/** + * A gate that rotates around the Y axis of the Bloch sphere. + * This is a generalization of the Y gate.
+ */ +template +struct YPowGate { + static constexpr GateKind kind = kYPowGate; + static constexpr char name[] = "YPowGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); + + return CreateGate, YPowGate>( + time, {q0}, {c * gc, c * gs, -s * gc, -s * gs, + s * gc, s * gs, c * gc, c * gs}, {exponent, global_shift}); + } +}; + +/** + * A gate that rotates around the Z axis of the Bloch sphere. + * This is a generalization of the Z gate. + */ +template +struct ZPowGate { + static constexpr GateKind kind = kZPowGate; + static constexpr char name[] = "ZPowGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + + return CreateGate, ZPowGate>( + time, {q0}, {gc, gs, 0, 0, 0, 0, c * gc - s * gs, c * gs + s * gc}, + {exponent, global_shift}); + } +}; + +/** + * A gate that rotates around the X+Z axis of the Bloch sphere. + * This is a generalization of the Hadamard gate. 
+ */ +template +struct HPowGate { + static constexpr GateKind kind = kHPowGate; + static constexpr char name[] = "HPowGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); + + fp_type a = s * gs * is2; + fp_type b = s * gc * is2; + + return CreateGate, HPowGate>( + time, {q0}, {c * gc + a, c * gs - b, a, -b, + a, -b, c * gc - a, c * gs + b}, {exponent, global_shift}); + } +}; + +/** + * A gate that applies a phase to the |11⟩ state of two qubits. + * This is a generalization of the CZ gate. + */ +template +struct CZPowGate { + static constexpr GateKind kind = kCZPowGate; + static constexpr char name[] = "CZPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type ec = std::cos(pi * exponent * (1 + global_shift)); + fp_type es = std::sin(pi * exponent * (1 + global_shift)); + + return CreateGate, CZPowGate>( + time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, gc, gs, 0, 0, 0, 0, + 0, 0, 0, 0, gc, gs, 0, 0, + 0, 0, 0, 0, 0, 0, ec, es}, {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type ec 
= std::cos(pi * exponent * (1 + global_shift)); + fp_type es = std::sin(pi * exponent * (1 + global_shift)); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {gc, gs, 0, 0, 0, 0, ec, es}}, + }; + } +}; + +/** + * A gate that applies a controlled power of an X gate. + * This is a generalization of the CX (or CNOT) gate. + */ +template +struct CXPowGate { + static constexpr GateKind kind = kCXPowGate; + static constexpr char name[] = "CXPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); + + // Matrix is in this form because the simulator uses inverse qubit order. 
+ return CreateGate, CXPowGate>( + time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, c * ec, c * es, 0, 0, s * es, -s * ec, + 0, 0, 0, 0, gc, gs, 0, 0, + 0, 0, s * es, -s * ec, 0, 0, c * ec, c * es}, + {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {c * ec, c * es, s * es, -s * ec, + s * es, -s * ec, c * ec, c * es}}, + }; + } +}; + +/** + * The `(exponent = phi/pi, global_shift = -0.5)` instance of XPowGate. + * This is a generalization of the X gate with a fixed global phase. + * This is a function in Cirq. + */ +template +struct rx { + static constexpr GateKind kind = krx; + static constexpr char name[] = "rx"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { + fp_type c = std::cos(-0.5 * phi); + fp_type s = std::sin(-0.5 * phi); + + return CreateGate, rx>( + time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi}); + } +}; + +/** + * The `(exponent = phi/pi, global_shift = -0.5)` instance of YPowGate. + * This is a generalization of the Y gate with a fixed global phase. + * This is a function in Cirq. 
+ */ +template +struct ry { + static constexpr GateKind kind = kry; + static constexpr char name[] = "ry"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { + fp_type c = std::cos(-0.5 * phi); + fp_type s = std::sin(-0.5 * phi); + + return CreateGate, ry>( + time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi}); + } +}; + +/** + * The `(exponent = phi/pi, global_shift = -0.5)` instance of ZPowGate. + * This is a generalization of the Z gate with a fixed global phase. + * This is a function in Cirq. + */ +template +struct rz { + static constexpr GateKind kind = krz; + static constexpr char name[] = "rz"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { + fp_type c = std::cos(-0.5 * phi); + fp_type s = std::sin(-0.5 * phi); + + return CreateGate, rz>( + time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of HPowGate. + * This is the canonical Hadamard (or H) gate. + */ +template +struct H { + static constexpr GateKind kind = kH; + static constexpr char name[] = "H"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, H>( + time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0}); + } +}; + +/** + * The `(exponent = 0.5, global_shift = 0)` instance of ZPowGate. + * This is the canonical S gate. 
+ */ +template +struct S { + static constexpr GateKind kind = kS; + static constexpr char name[] = "S"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, S>( + time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1}); + } +}; + +/** + * The `(exponent = 0.25, global_shift = 0)` instance of ZPowGate. + * This is the canonical T gate. + */ +template +struct T { + static constexpr GateKind kind = kT; + static constexpr char name[] = "T"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, T>( + time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of CZPowGate. + * This is the canonical CZ gate. + */ +template +struct CZ { + static constexpr GateKind kind = kCZ; + static constexpr char name[] = "CZ"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, CZ>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, -1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}}, + }; + } +}; + +template +using CNotPowGate = CXPowGate; + +/** + * The `(exponent = 1, global_shift = 0)` instance of CXPowGate. + * This is the canonical CX (or CNOT) gate. 
+ */ +template +struct CX { + static constexpr GateKind kind = kCX; + static constexpr char name[] = "CX"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + // Matrix is in this form because the simulator uses inverse qubit order. + return CreateGate, CX>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}}, + }; + } +}; + +template +using CNOT = CX; + +// Gates from cirq/ops/pauli_gates.py: + +/** + * The `(exponent = 1, global_shift = 0)` instance of XPowGate. + * This is the canonical Pauli X gate. + */ +template +struct X : public XPowGate { + static constexpr GateKind kind = kX; + static constexpr char name[] = "X"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, X>( + time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of YPowGate. + * This is the canonical Pauli Y gate. + */ +template +struct Y : public YPowGate { + static constexpr GateKind kind = kY; + static constexpr char name[] = "Y"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, Y>( + time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of ZPowGate. + * This is the canonical Pauli Z gate.
+ */ +template +struct Z : public ZPowGate { + static constexpr GateKind kind = kZ; + static constexpr char name[] = "Z"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0) { + return CreateGate, Z>( + time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0}); + } +}; + +// Gates from cirq/ops/phased_x_gate.py: + +/** + * An XPowGate conjugated by ZPowGate%s. + * Equivalent to the circuit `───Z^-p───X^t───Z^p───`. + */ +template +struct PhasedXPowGate { + static constexpr GateKind kind = kPhasedXPowGate; + static constexpr char name[] = "PhasedXPowGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type phase_exponent, fp_type exponent = 1, + fp_type global_shift = 0) { + fp_type pc = std::cos(pi * phase_exponent); + fp_type ps = std::sin(pi * phase_exponent); + fp_type ec = std::cos(pi * exponent); + fp_type es = std::sin(pi * exponent); + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + + fp_type ar = 0.5 * ((1 + ec) * gc - es * gs); + fp_type ai = 0.5 * ((1 + ec) * gs + es * gc); + fp_type br = -0.5 * ((-1 + ec) * gc - es * gs); + fp_type bi = -0.5 * ((-1 + ec) * gs + es * gc); + + return CreateGate, PhasedXPowGate>( + time, {q0}, {ar, ai, pc * br + ps * bi, pc * bi - ps * br, + pc * br - ps * bi, pc * bi + ps * br, ar, ai}, + {phase_exponent, exponent, global_shift}); + } +}; + +// Gates from cirq/ops/phased_x_z_gate.py: + +/** + * A PhasedXPowGate followed by a ZPowGate. + * Equivalent to the circuit `───Z^(-a)──X^x──Z^a───Z^z───`. 
+ */ +template +struct PhasedXZGate { + static constexpr GateKind kind = kPhasedXZGate; + static constexpr char name[] = "PhasedXZGate"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, + fp_type x_exponent, fp_type z_exponent, + fp_type axis_phase_exponent) { + fp_type xc = std::cos(pi * x_exponent); + fp_type xs = std::sin(pi * x_exponent); + fp_type zc = std::cos(pi * z_exponent); + fp_type zs = std::sin(pi * z_exponent); + fp_type ac = std::cos(pi * axis_phase_exponent); + fp_type as = std::sin(pi * axis_phase_exponent); + + fp_type br = 0.5 * (1 + xc); + fp_type bi = 0.5 * xs; + fp_type cr = -0.5 * (-1 + xc); + fp_type ci = -0.5 * xs; + fp_type dr = ac * zc - as * zs; + fp_type di = ac * zs + as * zc; + + return CreateGate, PhasedXZGate>( + time, {q0}, {br, bi, ac * cr + as * ci, ac * ci - as * cr, + dr * cr - di * ci, dr * ci + di * cr, + zc * br - zs * bi, zc * bi + zs * br}, + {x_exponent, z_exponent, axis_phase_exponent}); + } +}; + +// Gates from cirq/ops/parity_gates.py: + +/** + * The tensor product of two X gates, possibly raised to an exponent. 
+ */ +template +struct XXPowGate { + static constexpr GateKind kind = kXXPowGate; + static constexpr char name[] = "XXPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type ic = 0.5 * ((1 + c) * gc - s * gs); + fp_type is = 0.5 * ((1 + c) * gs + s * gc); + fp_type xc = 0.5 * ((1 - c) * gc + s * gs); + fp_type xs = 0.5 * ((1 - c) * gs - s * gc); + + return CreateGate, XXPowGate>( + time, {q0, q1}, {ic, is, 0, 0, 0, 0, xc, xs, + 0, 0, ic, is, xc, xs, 0, 0, + 0, 0, xc, xs, ic, is, 0, 0, + xc, xs, 0, 0, 0, 0, ic, is}, {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type ic = 0.5 * ((1 + c) * gc - s * gs); + fp_type is = 0.5 * ((1 + c) * gs + s * gc); + fp_type xc = 0.5 * ((1 - c) * gc + s * gs); + fp_type xs = 0.5 * ((1 - c) * gs - s * gc); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, + {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, xc, xs, xc, xs, 0, 0}}, + }; + } +}; + +/** + * The tensor product of two Y gates, possibly raised to an exponent. 
+ */ +template +struct YYPowGate { + static constexpr GateKind kind = kYYPowGate; + static constexpr char name[] = "YYPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type ic = 0.5 * ((1 + c) * gc - s * gs); + fp_type is = 0.5 * ((1 + c) * gs + s * gc); + fp_type yc = 0.5 * ((1 - c) * gc + s * gs); + fp_type ys = 0.5 * ((1 - c) * gs - s * gc); + + return CreateGate, YYPowGate>( + time, {q0, q1}, {ic, is, 0, 0, 0, 0, -yc, -ys, + 0, 0, ic, is, yc, ys, 0, 0, + 0, 0, yc, ys, ic, is, 0, 0, + -yc, -ys, 0, 0, 0, 0, ic, is}, + {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type ic = 0.5 * ((1 + c) * gc - s * gs); + fp_type is = 0.5 * ((1 + c) * gs + s * gc); + fp_type yc = 0.5 * ((1 - c) * gc + s * gs); + fp_type ys = 0.5 * ((1 - c) * gs - s * gc); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, + {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, ys, -yc, -ys, yc, 0, 0}}, + }; + } +}; + +/** + * The tensor product of two Z gates, possibly raised to an exponent. 
+ */ +template +struct ZZPowGate { + static constexpr GateKind kind = kZZPowGate; + static constexpr char name[] = "ZZPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type zc = std::cos(pi * exponent * (1 + global_shift)); + fp_type zs = std::sin(pi * exponent * (1 + global_shift)); + + return CreateGate, ZZPowGate>( + time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, zc, zs, 0, 0, 0, 0, + 0, 0, 0, 0, zc, zs, 0, 0, + 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent); + fp_type s = std::sin(pi * exponent); + fp_type ic = 0.5 * ((1 + c) * gc - s * gs); + fp_type is = 0.5 * ((1 + c) * gs + s * gc); + fp_type zc = 0.5 * ((1 - c) * gc + s * gs); + fp_type zs = 0.5 * ((1 - c) * gs - s * gc); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, + {{1, 0, 0, 0, 0, 0, -1, 0}, {zc, zs, 0, 0, 0, 0, -zc, -zs}}, + }; + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of XXPowGate. + * This is the tensor product of two X gates. 
+ */ +template +struct XX { + static constexpr GateKind kind = kXX; + static constexpr char name[] = "XX"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, XX>( + time, {q0, q1}, {0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 1, 0, 0, 0, 0, 0, 0, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, 1, 0, 1, 0, 0, 0}}, + }; + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of YYPowGate. + * This is the tensor product of two Y gates. + */ +template +struct YY { + static constexpr GateKind kind = kYY; + static constexpr char name[] = "YY"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, YY>( + time, {q0, q1}, {0, 0, 0, 0, 0, 0, -1, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + -1, 0, 0, 0, 0, 0, 0, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, 0, -1, 0, 1, 0, 0}}, + }; + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of ZZPowGate. + * This is the tensor product of two Z gates. 
+ */ +template +struct ZZ { + static constexpr GateKind kind = kZZ; + static constexpr char name[] = "ZZ"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, ZZ>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, -1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, -1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, -1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}}, + }; + } +}; + +// Gates from cirq/ops/swap_gates.py: + +/** + * The SWAP gate, possibly raised to a power. Exchanges qubits. + */ +template +struct SwapPowGate { + static constexpr GateKind kind = kSwapPowGate; + static constexpr char name[] = "SwapPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + static constexpr fp_type h = static_cast(h_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); + + return CreateGate, SwapPowGate>( + time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, c * ec, c * es, s * es, -s * ec, 0, 0, + 0, 0, s * es, -s * ec, c * ec, c * es, 0, 0, + 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type ec = 
std::cos(pi * exponent * (0.5 + global_shift)); + fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); + + return schmidt_decomp_type{ + {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * ec, gs + c * es, 0, 0, + 0, 0, gc + c * ec, gs + c * es}}, + {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * es, -s * ec, + s * es, -s * ec, 0, 0}}, + {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, -s * ec, -s * es, + s * ec, s * es, 0, 0}}, + {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * ec, gs - c * es, 0, 0, + 0, 0, -gc + c * ec, -gs + c * es}}, + }; + } +}; + +/** + * Rotates the |01⟩ vs |10⟩ subspace of two qubits around its Bloch X-axis. + * This is a generalization of the ISWAP gate. + */ +template +struct ISwapPowGate { + static constexpr GateKind kind = kISwapPowGate; + static constexpr char name[] = "ISwapPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + static constexpr fp_type h = static_cast(h_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + + return CreateGate, ISwapPowGate>( + time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, c * gc, c * gs, -s * gs, s * gc, 0, 0, + 0, 0, -s * gs, s * gc, c * gc, c * gs, 0, 0, + 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type exponent, fp_type global_shift) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + + return schmidt_decomp_type{ + {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * gc, gs + c * gs, 0, 0, + 0, 0, gc + c * gc, gs + c * gs}}, + {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, -s * 
gs, s * gc, + -s * gs, s * gc, 0, 0}}, + {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * gc, s * gs, + -s * gc, -s * gs, 0, 0}}, + {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * gc, gs - c * gs, 0, 0, + 0, 0, -gc + c * gc, -gs + c * gs}}, + }; + } +}; + +/** + * The `(exponent = 2*phi/pi, global_shift = 0)` instance of ISwapPowGate. + * This is a generalization of the ISWAP gate with a fixed global phase of zero. + * This is a function in Cirq. + */ +template +struct riswap { + static constexpr GateKind kind = kriswap; + static constexpr char name[] = "riswap"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + static constexpr fp_type h = static_cast(h_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type phi) { + fp_type c = std::cos(phi); + fp_type s = std::sin(phi); + + return CreateGate, riswap>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, c, 0, 0, s, 0, 0, + 0, 0, 0, s, c, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}, {phi}); + } + + static schmidt_decomp_type SchmidtDecomp(fp_type phi) { + fp_type c = std::cos(phi); + fp_type s = std::sin(phi); + + return schmidt_decomp_type{ + {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}}, + {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, 0, s, 0, s, 0, 0}}, + {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}}, + {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}}, + }; + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of SwapPowGate. + * This is the canonical SWAP gate. 
+ */ +template +struct SWAP { + static constexpr GateKind kind = kSWAP; + static constexpr char name[] = "SWAP"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, SWAP>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, + {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}}, + {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}}, + {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, + }; + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of ISwapPowGate. + * This is the canonical ISWAP gate. + */ +template +struct ISWAP { + static constexpr GateKind kind = kISWAP; + static constexpr char name[] = "ISWAP"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type h = static_cast(h_double); + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, ISWAP>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, + {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}}, + {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}}, + {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, + }; + } +}; + +// Gates from cirq/ops/phased_iswap_gate.py: + +/** + * An ISwapPowGate conjugated by ZPowGate%s. 
+ * Equivalent to the composition `(Z^-p ⊗ Z^p) ISWAP^t (Z^p ⊗ Z^-p)`. + */ +template +struct PhasedISwapPowGate { + static constexpr GateKind kind = kPhasedISwapPowGate; + static constexpr char name[] = "PhasedISwapPowGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + static constexpr fp_type h = static_cast(h_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type phase_exponent = 0.25, + fp_type exponent = 1.0) { + fp_type fc = std::cos(2 * pi * phase_exponent); + fp_type fs = std::sin(2 * pi * phase_exponent); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + + // Matrix is in this form because the simulator uses inverse qubit order. + return CreateGate, PhasedISwapPowGate>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, c, 0, s * fs, s * fc, 0, 0, + 0, 0, -s * fs, s * fc, c, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}, {phase_exponent, exponent}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type phase_exponent, fp_type exponent) { + fp_type fc = std::cos(2 * pi * phase_exponent); + fp_type fs = std::sin(2 * pi * phase_exponent); + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + + return schmidt_decomp_type{ + {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}}, + {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * fs, s * fc, -s * fs, s * fc, 0, 0}}, + {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * fc, -s * fs, + -s * fc, -s * fs, 0, 0}}, + {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}}, + }; + } +}; + +/** + * The `(phase_exponent = 0.25, exponent = 2*phi/pi)` instance of + * PhasedISwapPowGate. + * This is the "Givens rotation" from numerical linear algebra. + * This is a function in Cirq. 
+ */ +template +struct givens { + static constexpr GateKind kind = kgivens; + static constexpr char name[] = "givens"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + static constexpr fp_type h = static_cast(h_double); + + static GateCirq Create(unsigned time, unsigned q0, unsigned q1, + fp_type phi) { + fp_type c = std::cos(phi); + fp_type s = std::sin(phi); + + // Matrix is in this form because the simulator uses inverse qubit order. + return CreateGate, givens>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, c, 0, s, 0, 0, 0, + 0, 0, -s, 0, c, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}, {phi}); + } + + static schmidt_decomp_type SchmidtDecomp(fp_type phi) { + fp_type c = std::cos(phi); + fp_type s = std::sin(phi); + + return schmidt_decomp_type{ + {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}}, + {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}}, + {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, 0, -s, 0, -s, 0, 0}}, + {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}}, + }; + } +}; + +// Gates from cirq/ops/fsim_gate.py: + +/** + * The fermionic simulation gate family. Contains all two-qubit interactions + * that preserve excitations, up to single-qubit rotations and global phase. 
+ */ +template +struct FSimGate { + static constexpr GateKind kind = kFSimGate; + static constexpr char name[] = "FSimGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateCirq Create( + unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) { + if (phi < 0) { + phi += 2 * 3.141592653589793; + } + + fp_type ct = std::cos(theta); + fp_type st = std::sin(theta); + fp_type cp = std::cos(phi); + fp_type sp = std::sin(phi); + + return CreateGate, FSimGate>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ct, 0, 0, -st, 0, 0, + 0, 0, 0, -st, ct, 0, 0, 0, + 0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type theta, fp_type phi) { + fp_type ct = std::cos(theta); + fp_type st = std::sin(theta); + + fp_type cp2 = std::cos(0.5 * phi); + fp_type sp2 = std::sin(0.5 * phi); + fp_type cp4 = std::cos(0.25 * phi); + fp_type sp4 = std::sin(0.25 * phi); + + fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct)); + fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct)); + + fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct); + fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct); + + fp_type c0 = is2 * a0 * std::cos(p0); + fp_type s0 = is2 * a0 * std::sin(p0); + + fp_type c1 = is2 * a1 * std::cos(p1); + fp_type s1 = is2 * a1 * std::sin(p1); + + fp_type st2 = 0.5 * std::sqrt(st); + + fp_type a = cp4 * c0 - sp4 * s0; + fp_type b = cp4 * s0 + sp4 * c0; + fp_type c = cp4 * c0 + sp4 * s0; + fp_type d = cp4 * s0 - sp4 * c0; + + fp_type e = cp4 * c1 - sp4 * s1; + fp_type f = cp4 * s1 + sp4 * c1; + fp_type g = -(cp4 * c1 + sp4 * s1); + fp_type h = -(cp4 * s1 - sp4 * c1); + + return schmidt_decomp_type{ + {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}}, + {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}}, + {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}}, + 
{{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}}, + }; + } +}; + +// Gates from cirq/ops/two_qubit_diagonal_gate.py: + +/** + * A two-qubit diagonal gate. + */ +template +struct TwoQubitDiagonalGate { + static constexpr GateKind kind = kTwoQubitDiagonalGate; + static constexpr char name[] = "TwoQubitDiagonalGate"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, + const std::vector& angles) { + std::vector cs; + std::vector ss; + cs.reserve(4); + ss.reserve(4); + + for (std::size_t i = 0; i < angles.size(); ++i) { + cs.push_back(std::cos(angles[i])); + ss.push_back(std::sin(angles[i])); + } + + for (std::size_t i = angles.size(); i < 4; ++i) { + cs.push_back(1); + ss.push_back(0); + } + + // Matrix is in this form because the simulator uses inverse qubit order. + return CreateGate, TwoQubitDiagonalGate>( + time, {q0, q1}, {cs[0], ss[0], 0, 0, 0, 0, 0, 0, + 0, 0, cs[2], ss[2], 0, 0, 0, 0, + 0, 0, 0, 0, cs[1], ss[1], 0, 0, + 0, 0, 0, 0, 0, 0, cs[3], ss[3]}); + } +}; + +// Gates from cirq/ops/three_qubit_gates.py: + +/** + * A three-qubit diagonal gate. 
+ */ +template +struct ThreeQubitDiagonalGate { + static constexpr GateKind kind = kThreeQubitDiagonalGate; + static constexpr char name[] = "ThreeQubitDiagonalGate"; + static constexpr unsigned num_qubits = 3; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, unsigned q2, + const std::vector& angles) { + std::vector cs; + std::vector ss; + cs.reserve(8); + ss.reserve(8); + + for (std::size_t i = 0; i < angles.size(); ++i) { + cs.push_back(std::cos(angles[i])); + ss.push_back(std::sin(angles[i])); + } + + for (std::size_t i = angles.size(); i < 8; ++i) { + cs.push_back(1); + ss.push_back(0); + } + + // Matrix is in this form because the simulator uses inverse qubit order. + return CreateGate, ThreeQubitDiagonalGate>( + time, {q0, q1, q2}, + {cs[0], ss[0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, cs[4], ss[4], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, cs[2], ss[2], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, cs[6], ss[6], 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, cs[1], ss[1], 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[5], ss[5], 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[3], ss[3], 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[7], ss[7]}); + } +}; + +/** + * A gate that applies a phase to the |111⟩ state of three qubits. + * This is a generalization of the CCZ gate. 
+ */ +template +struct CCZPowGate { + static constexpr GateKind kind = kCCZPowGate; + static constexpr char name[] = "CCZPowGate"; + static constexpr unsigned num_qubits = 3; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, unsigned q2, + fp_type exponent, fp_type global_shift = 0) { + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type ec = std::cos(pi * exponent * (1 + global_shift)); + fp_type es = std::sin(pi * exponent * (1 + global_shift)); + + return CreateGate, CCZPowGate>( + time, {q0, q1, q2}, {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ec, es}, + {exponent, global_shift}); + } +}; + +/** + * A gate that applies a doubly-controlled power of an X gate. + * This is a generalization of the CCX (or CCNOT) gate. 
+ */ +template +struct CCXPowGate { + static constexpr GateKind kind = kCCXPowGate; + static constexpr char name[] = "CCXPowGate"; + static constexpr unsigned num_qubits = 3; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, unsigned q2, + fp_type exponent, fp_type global_shift = 0) { + fp_type c = std::cos(pi * exponent * 0.5); + fp_type s = std::sin(pi * exponent * 0.5); + fp_type gc = std::cos(pi * exponent * global_shift); + fp_type gs = std::sin(pi * exponent * global_shift); + fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); + fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); + + // Matrix is in this form because the simulator uses inverse qubit order. + return CreateGate, CCXPowGate>( + time, {q0, q1, q2}, + {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, c * ec, c * es, 0, 0, 0, 0, 0, 0, s * es, -s * ec, + 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, + 0, 0, 0, 0, 0, 0, s * es, -s * ec, 0, 0, 0, 0, 0, 0, c * ec, c * es}, + {exponent, global_shift}); + } +}; + +/** + * A controlled swap gate (the Fredkin gate). + */ +template +struct CSwapGate { + static constexpr GateKind kind = kCSwapGate; + static constexpr char name[] = "CSwapGate"; + static constexpr unsigned num_qubits = 3; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, unsigned q2) { + // Matrix is in this form because the simulator uses inverse qubit order. 
+ return CreateGate, CSwapGate>( + time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of CCZPowGate. + * This is the canonical doubly-controlled Z gate. + */ +template +struct CCZ { + static constexpr GateKind kind = kCCZ; + static constexpr char name[] = "CCZ"; + static constexpr unsigned num_qubits = 3; + static constexpr bool symmetric = true; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, unsigned q2) { + return CreateGate, CCZ>( + time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0}); + } +}; + +/** + * The `(exponent = 1, global_shift = 0)` instance of CCXPowGate. + * This is the canonical doubly-controlled X gate (the TOFFOLI gate). + */ +template +struct CCX { + static constexpr GateKind kind = kCCX; + static constexpr char name[] = "CCX"; + static constexpr unsigned num_qubits = 3; + static constexpr bool symmetric = false; + + static constexpr fp_type pi = static_cast(pi_double); + + static GateCirq Create(unsigned time, + unsigned q0, unsigned q1, unsigned q2) { + // Matrix is in this form because the simulator uses inverse qubit order. 
+ return CreateGate, CCX>( + time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + } +}; + +template +using CCNotPowGate = CCXPowGate; + +template +using TOFFOLI = CCX; + +template +using CCNOT = CCX; + +template +using CSWAP = CSwapGate; + +template +using FREDKIN = CSwapGate; + +// Gates from cirq/ops/matrix_gates.py: + +/** + * A one-qubit gate defined entirely by its matrix. + */ +template +struct MatrixGate1 { + static constexpr GateKind kind = kMatrixGate1; + static constexpr char name[] = "MatrixGate1"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateCirq Create(unsigned time, unsigned q0, + const Matrix& m) { + auto m2 = m; + return + CreateGate, MatrixGate1>(time, {q0}, std::move(m2)); + } +}; + +/** + * A two-qubit gate defined entirely by its matrix. + */ +template +struct MatrixGate2 { + static constexpr GateKind kind = kMatrixGate2; + static constexpr char name[] = "MatrixGate2"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + template > + static GateCirq Create( + unsigned time, unsigned q0, unsigned q1, M&& m) { + return CreateGate, MatrixGate2>(time, {q1, q0}, + std::forward(m)); + } +}; + +/** + * A multi-qubit gate defined entirely by its matrix. 
+ */ +template +struct MatrixGate { + static constexpr GateKind kind = kMatrixGate; + static constexpr char name[] = "MatrixGate"; + static constexpr bool symmetric = false; + + template > + static GateCirq Create(unsigned time, + std::vector qubits, M&& m) { + std::reverse(qubits.begin(), qubits.end()); + return CreateGate, MatrixGate>(time, std::move(qubits), + std::forward(m)); + } +}; + +} // namespace Cirq + +template +inline schmidt_decomp_type GetSchmidtDecomp( + Cirq::GateKind kind, const std::vector& params) { + switch (kind) { + case Cirq::kI2: + return Cirq::I2::SchmidtDecomp(); + case Cirq::kCZPowGate: + return Cirq::CZPowGate::SchmidtDecomp(params[0], params[1]); + case Cirq::kCXPowGate: + return Cirq::CXPowGate::SchmidtDecomp(params[0], params[1]); + case Cirq::kCZ: + return Cirq::CZ::SchmidtDecomp(); + case Cirq::kCX: + return Cirq::CX::SchmidtDecomp(); + case Cirq::kXXPowGate: + return Cirq::XXPowGate::SchmidtDecomp(params[0], params[1]); + case Cirq::kYYPowGate: + return Cirq::YYPowGate::SchmidtDecomp(params[0], params[1]); + case Cirq::kZZPowGate: + return Cirq::ZZPowGate::SchmidtDecomp(params[0], params[1]); + case Cirq::kXX: + return Cirq::XX::SchmidtDecomp(); + case Cirq::kYY: + return Cirq::YY::SchmidtDecomp(); + case Cirq::kZZ: + return Cirq::ZZ::SchmidtDecomp(); + case Cirq::kSwapPowGate: + return Cirq::SwapPowGate::SchmidtDecomp(params[0], params[1]); + case Cirq::kISwapPowGate: + return Cirq::ISwapPowGate::SchmidtDecomp(params[0], params[1]); + case Cirq::kriswap: + return Cirq::riswap::SchmidtDecomp(params[0]); + case Cirq::kSWAP: + return Cirq::SWAP::SchmidtDecomp(); + case Cirq::kISWAP: + return Cirq::ISWAP::SchmidtDecomp(); + case Cirq::kPhasedISwapPowGate: + return Cirq::PhasedISwapPowGate::SchmidtDecomp( + params[0], params[1]); + case Cirq::kgivens: + return Cirq::givens::SchmidtDecomp(params[0]); + case Cirq::kFSimGate: + return Cirq::FSimGate::SchmidtDecomp(params[0], params[1]); + default: + // Single qubit gates or gates with 
unimplemented Schmidt decomposition. + return schmidt_decomp_type{}; + } +} + +} // namespace qsim + +#endif // GATES_CIRQ_H_ diff --git a/tpls/qsim/gates_qsim.h b/tpls/qsim/gates_qsim.h new file mode 100644 index 0000000..366c4f1 --- /dev/null +++ b/tpls/qsim/gates_qsim.h @@ -0,0 +1,661 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GATES_QSIM_H_ +#define GATES_QSIM_H_ + +#include +#include +#include + +#include "gate.h" + +namespace qsim { + +// Gate set implemented in qsim contains the following gates. +enum GateKind { + kGateId1 = 0, // one-qubit Id + kGateHd, // Hadamard + kGateT, // T + kGateX, // X + kGateY, // Y + kGateZ, // Z + kGateX2, // sqrt(X) + kGateY2, // sqrt(Y) + kGateRX, // X-rotation + kGateRY, // Y-rotation + kGateRZ, // Z-rotation + kGateRXY, // XY-rotation (rotation around arbitrary axis in the XY plane) + kGateHZ2, // pi / 2 rotation around the X + Y axis + kGateS, // S + kGateId2, // two-qubit Id + kGateCZ, // CZ + kGateCNot, // CNOT (CX) + kGateSwap, // swap + kGateIS, // iSwap + kGateFS, // fSim + kGateCP, // control phase + kGateMatrix1, // one-qubit matrix gate + kGateMatrix2, // two-qubit matrix gate + kGateGPh, // global phase gate + kDecomp = gate::kDecomp, + kMeasurement = gate::kMeasurement, +}; + +// Specialization of Gate (defined in gate.h) for the qsim gate set. 
+template +using GateQSim = Gate; + +constexpr double h_double = 0.5; +constexpr double is2_double = 0.7071067811865475; + +// Zero-qubit gates: + +/** + * The global phase gate. + */ +template +struct GateGPh { + static constexpr GateKind kind = kGateGPh; + static constexpr char name[] = "p"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, fp_type phi) { + return Create(time, std::cos(phi), std::sin(phi)); + } + + static GateQSim Create(unsigned time, fp_type cp, fp_type sp) { + return CreateGate, GateGPh>( + time, {}, {cp, sp}, {cp, sp}); + } +}; + +// One-qubit gates: + +/** + * The one-qubit identity gate. + */ +template +struct GateId1 { + static constexpr GateKind kind = kGateId1; + static constexpr char name[] = "id1"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0) { + return CreateGate, GateId1>( + time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0}); + } +}; + +/** + * The Hadamard gate. + */ +template +struct GateHd { + static constexpr GateKind kind = kGateHd; + static constexpr char name[] = "h"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateQSim Create(unsigned time, unsigned q0) { + return CreateGate, GateHd>( + time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0}); + } +}; + +/** + * The T gate, equivalent to `Z ^ 0.25`. + */ +template +struct GateT { + static constexpr GateKind kind = kGateT; + static constexpr char name[] = "t"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateQSim Create(unsigned time, unsigned q0) { + return CreateGate, GateT>( + time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2}); + } +}; + +/** + * The Pauli X (or "NOT") gate. 
+ */ +template +struct GateX { + static constexpr GateKind kind = kGateX; + static constexpr char name[] = "x"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0) { + return CreateGate, GateX>( + time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0}); + } +}; + +/** + * The Pauli Y gate. + */ +template +struct GateY { + static constexpr GateKind kind = kGateY; + static constexpr char name[] = "y"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0) { + return CreateGate, GateY>( + time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0}); + } +}; + +/** + * The Pauli Z gate. + */ +template +struct GateZ { + static constexpr GateKind kind = kGateZ; + static constexpr char name[] = "z"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0) { + return CreateGate, GateZ>( + time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0}); + } +}; + +/** + * The "square root of X" gate. + */ +template +struct GateX2 { + static constexpr GateKind kind = kGateX2; + static constexpr char name[] = "x_1_2"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type h = static_cast(h_double); + + static GateQSim Create(unsigned time, unsigned q0) { + return CreateGate, GateX2>( + time, {q0}, {h, h, h, -h, h, -h, h, h}); + } +}; + +/** + * The "square root of Y" gate. 
+ */ +template +struct GateY2 { + static constexpr GateKind kind = kGateY2; + static constexpr char name[] = "y_1_2"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type h = static_cast(h_double); + + static GateQSim Create(unsigned time, unsigned q0) { + return CreateGate, GateY2>( + time, {q0}, {h, h, -h, -h, h, h, h, h}); + } +}; + +/** + * A gate that rotates around the X axis of the Bloch sphere. + * This is a generalization of the X gate. + */ +template +struct GateRX { + static constexpr GateKind kind = kGateRX; + static constexpr char name[] = "rx"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0, fp_type phi) { + fp_type phi2 = -0.5 * phi; + fp_type c = std::cos(phi2); + fp_type s = std::sin(phi2); + + return CreateGate, GateRX>( + time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi}); + } +}; + +/** + * A gate that rotates around the Y axis of the Bloch sphere. + * This is a generalization of the Y gate. + */ +template +struct GateRY { + static constexpr GateKind kind = kGateRY; + static constexpr char name[] = "ry"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0, fp_type phi) { + fp_type phi2 = -0.5 * phi; + fp_type c = std::cos(phi2); + fp_type s = std::sin(phi2); + + return CreateGate, GateRY>( + time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi}); + } +}; + +/** + * A gate that rotates around the Z axis of the Bloch sphere. + * This is a generalization of the Z gate. 
+ */ +template +struct GateRZ { + static constexpr GateKind kind = kGateRZ; + static constexpr char name[] = "rz"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0, fp_type phi) { + fp_type phi2 = -0.5 * phi; + fp_type c = std::cos(phi2); + fp_type s = std::sin(phi2); + + return CreateGate, GateRZ>( + time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi}); + } +}; + +/** + * A gate that rotates around an arbitrary axis in the XY-plane. + */ +template +struct GateRXY { + static constexpr GateKind kind = kGateRXY; + static constexpr char name[] = "rxy"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create( + unsigned time, unsigned q0, fp_type theta, fp_type phi) { + fp_type phi2 = -0.5 * phi; + fp_type cp = std::cos(phi2); + fp_type sp = std::sin(phi2); + fp_type ct = std::cos(theta) * sp; + fp_type st = std::sin(theta) * sp; + + return CreateGate, GateRXY>( + time, {q0}, {cp, 0, st, ct, -st, ct, cp, 0}, {theta, phi}); + } +}; + +/** + * A pi / 2 rotation around the X + Y axis. + */ +template +struct GateHZ2 { + static constexpr GateKind kind = kGateHZ2; + static constexpr char name[] = "hz_1_2"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static constexpr fp_type h = static_cast(h_double); + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateQSim Create(unsigned time, unsigned q0) { + return CreateGate, GateHZ2>( + time, {q0}, {h, h, 0, -is2, is2, 0, h, h}); + } +}; + +/** + * The S gate, equivalent to "square root of Z". 
+ */ +template +struct GateS { + static constexpr GateKind kind = kGateS; + static constexpr char name[] = "s"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0) { + return CreateGate, GateS>( + time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1}); + } +}; + +/** + * A one-qubit gate defined entirely by its matrix. + */ +template +struct GateMatrix1 { + static constexpr GateKind kind = kGateMatrix1; + static constexpr char name[] = "mat1"; + static constexpr unsigned num_qubits = 1; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0, + const Matrix& m) { + auto m2 = m; + return + CreateGate, GateMatrix1>(time, {q0}, std::move(m2)); + } +}; + +// Two-qubit gates: + +/** + * The two-qubit identity gate. + */ +template +struct GateId2 { + static constexpr GateKind kind = kGateId2; + static constexpr char name[] = "id2"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, GateId2>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, + }; + } +}; + +/** + * The controlled-Z (CZ) gate. 
+ */ +template +struct GateCZ { + static constexpr GateKind kind = kGateCZ; + static constexpr char name[] = "cz"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, GateCZ>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, -1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}}, + }; + } +}; + +/** + * The controlled-X (CX or CNOT) gate. + */ +template +struct GateCNot { + static constexpr GateKind kind = kGateCNot; + static constexpr char name[] = "cnot"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { + // Matrix is in this form because the simulator uses inverse qubit order. + return CreateGate, GateCNot>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}}, + }; + } +}; + +/** + * The SWAP gate. Exchanges two qubits. 
+ */ +template +struct GateSwap { + static constexpr GateKind kind = kGateSwap; + static constexpr char name[] = "sw"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, GateSwap>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, + {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}}, + {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}}, + {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, + }; + } +}; + +/** + * The ISWAP gate. + */ +template +struct GateIS { + static constexpr GateKind kind = kGateIS; + static constexpr char name[] = "is"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type h = static_cast(h_double); + static constexpr fp_type is2 = static_cast(is2_double); + + static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { + return CreateGate, GateIS>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0}); + } + + static schmidt_decomp_type SchmidtDecomp() { + return schmidt_decomp_type{ + {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, + {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}}, + {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}}, + {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, + }; + } +}; + +/** + * The fermionic simulation (FSim) gate family. Contains all two-qubit + * interactions that preserve excitations, up to single-qubit rotations and + * global phase. 
+ */ +template +struct GateFS { + static constexpr GateKind kind = kGateFS; + static constexpr char name[] = "fs"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static constexpr fp_type is2 = static_cast(is2_double); + + static GateQSim Create( + unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) { + if (phi < 0) { + phi += 2 * 3.141592653589793; + } + + fp_type ct = std::cos(theta); + fp_type st = std::sin(theta); + fp_type cp = std::cos(phi); + fp_type sp = std::sin(phi); + + return CreateGate, GateFS>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, ct, 0, 0, -st, 0, 0, + 0, 0, 0, -st, ct, 0, 0, 0, + 0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi}); + } + + static schmidt_decomp_type SchmidtDecomp( + fp_type theta, fp_type phi) { + fp_type ct = std::cos(theta); + fp_type st = std::sin(theta); + + fp_type cp2 = std::cos(0.5 * phi); + fp_type sp2 = std::sin(0.5 * phi); + fp_type cp4 = std::cos(0.25 * phi); + fp_type sp4 = std::sin(0.25 * phi); + + fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct)); + fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct)); + + fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct); + fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct); + + fp_type c0 = is2 * a0 * std::cos(p0); + fp_type s0 = is2 * a0 * std::sin(p0); + + fp_type c1 = is2 * a1 * std::cos(p1); + fp_type s1 = is2 * a1 * std::sin(p1); + + fp_type st2 = 0.5 * std::sqrt(st); + + fp_type a = cp4 * c0 - sp4 * s0; + fp_type b = cp4 * s0 + sp4 * c0; + fp_type c = cp4 * c0 + sp4 * s0; + fp_type d = cp4 * s0 - sp4 * c0; + + fp_type e = cp4 * c1 - sp4 * s1; + fp_type f = cp4 * s1 + sp4 * c1; + fp_type g = -(cp4 * c1 + sp4 * s1); + fp_type h = -(cp4 * s1 - sp4 * c1); + + return schmidt_decomp_type{ + {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}}, + {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}}, + {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}}, + {{e, f, 0, 0, 
0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}}, + }; + } +}; + +/** + * The controlled phase gate. A generalized version of GateCZ. + */ +template +struct GateCP { + static constexpr GateKind kind = kGateCP; + static constexpr char name[] = "cp"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = true; + + static GateQSim Create( + unsigned time, unsigned q0, unsigned q1, fp_type phi) { + fp_type cp = std::cos(phi); + fp_type sp = std::sin(phi); + + return CreateGate, GateCP>( + time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, cp, -sp}, {phi}); + } + + static schmidt_decomp_type SchmidtDecomp(fp_type phi) { + fp_type cp = std::cos(phi); + fp_type sp = std::sin(phi); + + return schmidt_decomp_type{ + {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, + {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, cp, -sp}}, + }; + } +}; + +/** + * A two-qubit gate defined entirely by its matrix. + */ +template +struct GateMatrix2 { + static constexpr GateKind kind = kGateMatrix2; + static constexpr char name[] = "mat2"; + static constexpr unsigned num_qubits = 2; + static constexpr bool symmetric = false; + + template > + static GateQSim Create( + unsigned time, unsigned q0, unsigned q1, M&& m) { + return CreateGate, GateMatrix2>(time, {q1, q0}, + std::forward(m)); + } + + static schmidt_decomp_type SchmidtDecomp(fp_type phi) { + // Not implemented. 
+ return schmidt_decomp_type{}; + } +}; + +template +inline schmidt_decomp_type GetSchmidtDecomp( + GateKind kind, const std::vector& params) { + switch (kind) { + case kGateId2: + return GateId2::SchmidtDecomp(); + case kGateCZ: + return GateCZ::SchmidtDecomp(); + case kGateCNot: + return GateCNot::SchmidtDecomp(); + case kGateSwap: + return GateSwap::SchmidtDecomp(); + case kGateIS: + return GateIS::SchmidtDecomp(); + case kGateFS: + return GateFS::SchmidtDecomp(params[0], params[1]); + case kGateCP: + return GateCP::SchmidtDecomp(params[0]); + default: + // Single qubit gates: empty Schmidt decomposition. + return schmidt_decomp_type{}; + } +} + +} // namespace qsim + +#endif // GATES_QSIM_H_ diff --git a/tpls/qsim/hybrid.h b/tpls/qsim/hybrid.h new file mode 100644 index 0000000..44fad5b --- /dev/null +++ b/tpls/qsim/hybrid.h @@ -0,0 +1,612 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef HYBRID_H_ +#define HYBRID_H_ + +#include +#include +#include +#include + +#include "gate.h" +#include "gate_appl.h" + +namespace qsim { + +/** + * Hybrid Feynman-Schrodinger simulator. + */ +template class FuserT, typename For> +struct HybridSimulator final { + public: + using Gate = GateT; + using GateKind = typename Gate::GateKind; + using fp_type = typename Gate::fp_type; + + private: + // Note that one can use "struct GateHybrid : public Gate {" in C++17. 
+ struct GateHybrid { + using GateKind = HybridSimulator::GateKind; + using fp_type = HybridSimulator::fp_type; + + GateKind kind; + unsigned time; + std::vector qubits; + std::vector controlled_by; + uint64_t cmask; + std::vector params; + Matrix matrix; + bool unfusible; + bool swapped; + + const Gate* parent; + unsigned id; + }; + + struct GateX { + GateHybrid* decomposed0; + GateHybrid* decomposed1; + schmidt_decomp_type schmidt_decomp; + unsigned schmidt_bits; + unsigned swapped; + }; + + public: + using Fuser = FuserT; + using GateFused = typename Fuser::GateFused; + + /** + * Contextual data for hybrid simulation. + */ + struct HybridData { + /** + * List of gates on the "0" side of the cut. + */ + std::vector gates0; + /** + * List of gates on the "1" side of the cut. + */ + std::vector gates1; + /** + * List of gates on the cut. + */ + std::vector gatexs; + /** + * Global qubit index to local qubit index map. + */ + std::vector qubit_map; + /** + * Number of qubits on the "0" side of the cut. + */ + unsigned num_qubits0; + /** + * Number of qubits on the "1" side of the cut. + */ + unsigned num_qubits1; + /** + * Number of gates on the cut. + */ + unsigned num_gatexs; + }; + + /** + * User-specified parameters for gate fusion and hybrid simulation. + */ + struct Parameter : public Fuser::Parameter { + /** + * Fixed bitstring indicating values to assign to Schmidt decomposition + * indices of prefix gates. + */ + uint64_t prefix; + /** + * Number of gates on the cut that are part of the prefix. Indices of these + * gates are assigned the value indicated by `prefix`. + */ + unsigned num_prefix_gatexs; + /** + * Number of gates on the cut that are part of the root. All gates that are + * not part of the prefix or root are part of the suffix. + */ + unsigned num_root_gatexs; + unsigned num_threads; + }; + + template + explicit HybridSimulator(Args&&... args) : for_(args...) 
{} + + /** + * Splits the lattice into two parts, using Schmidt decomposition for gates + * on the cut. + * @param parts Lattice sections to be simulated. + * @param gates List of all gates in the circuit. + * @param hd Output data with split parts. + * @return True if the splitting done successfully; false otherwise. + */ + static bool SplitLattice(const std::vector& parts, + const std::vector& gates, HybridData& hd) { + hd.num_gatexs = 0; + hd.num_qubits0 = 0; + hd.num_qubits1 = 0; + + hd.gates0.reserve(gates.size()); + hd.gates1.reserve(gates.size()); + hd.qubit_map.reserve(parts.size()); + + unsigned count0 = 0; + unsigned count1 = 0; + + // Global qubit index to local qubit index map. + for (std::size_t i = 0; i < parts.size(); ++i) { + parts[i] == 0 ? ++hd.num_qubits0 : ++hd.num_qubits1; + hd.qubit_map.push_back(parts[i] == 0 ? count0++ : count1++); + } + + // Split the lattice. + for (const auto& gate : gates) { + if (gate.kind == gate::kMeasurement) { + IO::errorf("measurement gates are not suported by qsimh.\n"); + return false; + } + + if (gate.controlled_by.size() > 0) { + IO::errorf("controlled gates are not suported by qsimh.\n"); + return false; + } + + switch (gate.qubits.size()) { + case 1: // Single qubit gates. + switch (parts[gate.qubits[0]]) { + case 0: + hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time, + {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix, + false, false, nullptr, 0}); + break; + case 1: + hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time, + {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix, + false, false, nullptr, 0}); + break; + } + break; + case 2: // Two qubit gates. + { + switch ((parts[gate.qubits[1]] << 1) | parts[gate.qubits[0]]) { + case 0: // Both qubits in part 0. 
+ hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time, + {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]}, + {}, 0, gate.params, gate.matrix, false, gate.swapped, + nullptr, 0}); + break; + case 1: // Gate on the cut, qubit 0 in part 1, qubit 1 in part 0. + hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, + {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {}, + true, gate.swapped, &gate, hd.num_gatexs}); + hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, + {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {}, + true, gate.swapped, &gate, hd.num_gatexs}); + + ++hd.num_gatexs; + break; + case 2: // Gate on the cut, qubit 0 in part 0, qubit 1 in part 1. + hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, + {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {}, + true, gate.swapped, &gate, hd.num_gatexs}); + hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, + {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {}, + true, gate.swapped, &gate, hd.num_gatexs}); + + ++hd.num_gatexs; + break; + case 3: // Both qubits in part 1. + hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time, + {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]}, + {}, 0, gate.params, gate.matrix, false, gate.swapped, + nullptr, 0}); + break; + } + } + break; + default: + IO::errorf("multi-qubit gates are not suported by qsimh.\n"); + return false; + } + } + + auto compare = [](const GateHybrid& l, const GateHybrid& r) -> bool { + return l.time < r.time || (l.time == r.time && + (l.parent < r.parent || (l.parent == r.parent && l.id < r.id))); + }; + + // Sort gates. + std::sort(hd.gates0.begin(), hd.gates0.end(), compare); + std::sort(hd.gates1.begin(), hd.gates1.end(), compare); + + hd.gatexs.reserve(hd.num_gatexs); + + // Get Schmidt matrices. 
+ for (auto& gate0 : hd.gates0) { + if (gate0.parent != nullptr) { + auto d = GetSchmidtDecomp(gate0.parent->kind, gate0.parent->params); + if (d.size() == 0) { + IO::errorf("no Schmidt decomposition for gate kind %u.\n", + gate0.parent->kind); + return false; + } + + unsigned schmidt_bits = SchmidtBits(d.size()); + if (schmidt_bits > 2) { + IO::errorf("Schmidt rank is too large for gate kind %u.\n", + gate0.parent->kind); + return false; + } + + unsigned swapped = parts[gate0.parent->qubits[0]]; + if (gate0.parent->swapped) swapped = 1 - swapped; + hd.gatexs.emplace_back(GateX{&gate0, nullptr, std::move(d), + schmidt_bits, swapped}); + } + } + + unsigned count = 0; + for (auto& gate1 : hd.gates1) { + if (gate1.parent != nullptr) { + hd.gatexs[count++].decomposed1 = &gate1; + } + } + + for (auto& gatex : hd.gatexs) { + if (gatex.schmidt_decomp.size() == 1) { + FillSchmidtMatrices(0, gatex); + } + } + + return true; + } + + /** + * Runs the hybrid simulator on a sectioned lattice. + * @param param Options for parallelism and logging. Also specifies the size + * of the 'prefix' and 'root' sections of the lattice. + * @param factory Object to create simulators and state spaces. + * @param hd Container object for gates on the boundary between lattice + * sections. + * @param parts Lattice sections to be simulated. + * @param fgates0 List of gates from one section of the lattice. + * @param fgates1 List of gates from the other section of the lattice. + * @param bitstrings List of output states to simulate, as bitstrings. + * @param results Output vector of amplitudes. After a successful run, this + * will be populated with amplitudes for each state in 'bitstrings'. + * @return True if the simulation completed successfully; false otherwise. 
+ */ + template + bool Run(const Parameter& param, const Factory& factory, + HybridData& hd, const std::vector& parts, + const std::vector& fgates0, + const std::vector& fgates1, + const std::vector& bitstrings, Results& results) const { + using Simulator = typename Factory::Simulator; + using StateSpace = typename Simulator::StateSpace; + using State = typename StateSpace::State; + + unsigned num_p_gates = param.num_prefix_gatexs; + unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; + + auto bits = CountSchmidtBits(param, hd.gatexs); + + uint64_t rmax = uint64_t{1} << bits.num_r_bits; + uint64_t smax = uint64_t{1} << bits.num_s_bits; + + auto loc0 = CheckpointLocations(param, fgates0); + auto loc1 = CheckpointLocations(param, fgates1); + + struct Index { + unsigned i0; + unsigned i1; + }; + + std::vector indices; + indices.reserve(bitstrings.size()); + + // Bitstring indices for part 0 and part 1. TODO: optimize. + for (const auto& bitstring : bitstrings) { + Index index{0, 0}; + + for (uint64_t i = 0; i < hd.qubit_map.size(); ++i) { + unsigned m = ((bitstring >> i) & 1) << hd.qubit_map[i]; + parts[i] ? index.i1 |= m : index.i0 |= m; + } + + indices.push_back(index); + } + + StateSpace state_space = factory.CreateStateSpace(); + + State* rstate0; + State* rstate1; + + State state0p = state_space.Null(); + State state1p = state_space.Null(); + State state0r = state_space.Null(); + State state1r = state_space.Null(); + State state0s = state_space.Null(); + State state1s = state_space.Null(); + + // Create states. 
+ + if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, true, + state0p, state1p, rstate0, rstate1)) { + return false; + } + + if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, rmax > 1, + state0r, state1r, rstate0, rstate1)) { + return false; + } + + if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, smax > 1, + state0s, state1s, rstate0, rstate1)) { + return false; + } + + state_space.SetStateZero(state0p); + state_space.SetStateZero(state1p); + + Simulator simulator = factory.CreateSimulator(); + + std::vector prev(hd.num_gatexs, unsigned(-1)); + + // param.prefix encodes the prefix path. + unsigned gatex_index = SetSchmidtMatrices( + 0, num_p_gates, param.prefix, prev, hd.gatexs); + + if (gatex_index == 0) { + // Apply gates before the first checkpoint. + ApplyGates(fgates0, 0, loc0[0], simulator, state0p); + ApplyGates(fgates1, 0, loc1[0], simulator, state1p); + } else { + IO::errorf("invalid prefix %lu for prefix gate index %u.\n", + param.prefix, gatex_index - 1); + return false; + } + + // Branch over root gates on the cut. r encodes the root path. + for (uint64_t r = 0; r < rmax; ++r) { + if (rmax > 1) { + state_space.Copy(state0p, state0r); + state_space.Copy(state1p, state1r); + } + + if (SetSchmidtMatrices(num_p_gates, num_pr_gates, + r, prev, hd.gatexs) == 0) { + // Apply gates before the second checkpoint. + ApplyGates(fgates0, loc0[0], loc0[1], simulator, state0r); + ApplyGates(fgates1, loc1[0], loc1[1], simulator, state1r); + } else { + continue; + } + + // Branch over suffix gates on the cut. s encodes the suffix path. + for (uint64_t s = 0; s < smax; ++s) { + if (smax > 1) { + state_space.Copy(rmax > 1 ? state0r : state0p, state0s); + state_space.Copy(rmax > 1 ? state1r : state1p, state1s); + } + + if (SetSchmidtMatrices(num_pr_gates, hd.num_gatexs, + s, prev, hd.gatexs) == 0) { + // Apply the rest of the gates. 
+ ApplyGates(fgates0, loc0[1], fgates0.size(), simulator, state0s); + ApplyGates(fgates1, loc1[1], fgates1.size(), simulator, state1s); + } else { + continue; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const StateSpace& state_space, + const State& state0, const State& state1, + const std::vector& indices, Results& results) { + // TODO: make it faster for the CUDA state space. + auto a0 = state_space.GetAmpl(state0, indices[i].i0); + auto a1 = state_space.GetAmpl(state1, indices[i].i1); + results[i] += a0 * a1; + }; + + // Collect results. + for_.Run(results.size(), f, + state_space, *rstate0, *rstate1, indices, results); + } + } + + return true; + } + + private: + /** + * Identifies when to save "checkpoints" of the simulation state. These allow + * runs with different cut-index values to reuse parts of the simulation. + * @param param Options for parallelism and logging. Also specifies the size + * of the 'prefix' and 'root' sections of the lattice. + * @param fgates Set of gates for which to find checkpoint locations. + * @return A pair of numbers specifying how many gates to apply before the + * first and second checkpoints, respectively. + */ + static std::array CheckpointLocations( + const Parameter& param, const std::vector& fgates) { + std::array loc{0, 0}; + + unsigned num_decomposed = 0; + unsigned num_p_gates = param.num_prefix_gatexs; + unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; + + for (std::size_t i = 0; i < fgates.size(); ++i) { + for (auto gate: fgates[i].gates) { + if (gate->parent != nullptr) { + ++num_decomposed; + // There should be only one decomposed gate in fused gate. 
+ break; + } + } + + if (num_decomposed <= num_p_gates) { + loc[0] = i + 1; + } + + if (num_decomposed <= num_pr_gates) { + loc[1] = i + 1; + } + } + + return loc; + } + + struct Bits { + unsigned num_p_bits; + unsigned num_r_bits; + unsigned num_s_bits; + }; + + static Bits CountSchmidtBits( + const Parameter& param, const std::vector& gatexs) { + Bits bits{0, 0, 0}; + + unsigned num_p_gates = param.num_prefix_gatexs; + unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; + + for (std::size_t i = 0; i < gatexs.size(); ++i) { + const auto& gatex = gatexs[i]; + if (i < num_p_gates) { + bits.num_p_bits += gatex.schmidt_bits; + } else if (i < num_pr_gates) { + bits.num_r_bits += gatex.schmidt_bits; + } else { + bits.num_s_bits += gatex.schmidt_bits; + } + } + + return bits; + } + + static unsigned SetSchmidtMatrices(std::size_t i0, std::size_t i1, + uint64_t path, + std::vector& prev_k, + std::vector& gatexs) { + unsigned shift_length = 0; + + for (std::size_t i = i0; i < i1; ++i) { + const auto& gatex = gatexs[i]; + + if (gatex.schmidt_bits == 0) { + // Continue if gatex has Schmidt rank 1. + continue; + } + + unsigned k = (path >> shift_length) & ((1 << gatex.schmidt_bits) - 1); + shift_length += gatex.schmidt_bits; + + if (k != prev_k[i]) { + if (k >= gatex.schmidt_decomp.size()) { + // Invalid path. Returns gatex index plus one to report error in case + // of invalid prefix. 
+ return i + 1; + } + + FillSchmidtMatrices(k, gatex); + + prev_k[i] = k; + } + } + + return 0; + } + + static void FillSchmidtMatrices(unsigned k, const GateX& gatex) { + unsigned part0 = gatex.swapped; + unsigned part1 = 1 - part0; + { + gatex.decomposed0->matrix.resize(gatex.schmidt_decomp[k][part0].size()); + auto begin = gatex.schmidt_decomp[k][part0].begin(); + auto end = gatex.schmidt_decomp[k][part0].end(); + std::copy(begin, end, gatex.decomposed0->matrix.begin()); + } + { + gatex.decomposed1->matrix.resize(gatex.schmidt_decomp[k][part1].size()); + auto begin = gatex.schmidt_decomp[k][part1].begin(); + auto end = gatex.schmidt_decomp[k][part1].end(); + std::copy(begin, end, gatex.decomposed1->matrix.begin()); + } + } + + template + static void ApplyGates(const std::vector& gates, + std::size_t i0, std::size_t i1, + const Simulator& simulator, + typename Simulator::State& state) { + for (std::size_t i = i0; i < i1; ++i) { + if (gates[i].matrix.size() > 0) { + ApplyFusedGate(simulator, gates[i], state); + } else { + auto gate = gates[i]; + CalculateFusedMatrix(gate); + ApplyFusedGate(simulator, gate, state); + } + } + } + + static unsigned SchmidtBits(unsigned size) { + switch (size) { + case 1: + return 0; + case 2: + return 1; + case 3: + return 2; + case 4: + return 2; + default: + // Not supported. 
+ return 42; + } + } + + template + static bool CreateStates(unsigned num_qubits0,unsigned num_qubits1, + const StateSpace& state_space, bool create, + typename StateSpace::State& state0, + typename StateSpace::State& state1, + typename StateSpace::State* (&rstate0), + typename StateSpace::State* (&rstate1)) { + if (create) { + state0 = state_space.Create(num_qubits0); + state1 = state_space.Create(num_qubits1); + + if (state_space.IsNull(state0) || state_space.IsNull(state1)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return false; + } + + rstate0 = &state0; + rstate1 = &state1; + } + + return true; + } + + For for_; +}; + +} // namespace qsim + +#endif // HYBRID_H_ diff --git a/tpls/qsim/io.h b/tpls/qsim/io.h new file mode 100644 index 0000000..3b26c7c --- /dev/null +++ b/tpls/qsim/io.h @@ -0,0 +1,44 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef IO_H_ +#define IO_H_ + +#include +#include + +namespace qsim { + +/** + * Controller for output logs. + */ +struct IO { + static void errorf(const char* format, ...) { + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + } + + static void messagef(const char* format, ...) 
{ + va_list args; + va_start(args, format); + vprintf(format, args); + va_end(args); + } +}; + +} // namespace qsim + +#endif // IO_H_ diff --git a/tpls/qsim/io_file.h b/tpls/qsim/io_file.h new file mode 100644 index 0000000..3cfac12 --- /dev/null +++ b/tpls/qsim/io_file.h @@ -0,0 +1,71 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef IO_FILE_H_ +#define IO_FILE_H_ + +#include +#include +#include + +#include "io.h" + +namespace qsim { + +/** + * Controller for output logs with methods for writing to file. 
+ */ +struct IOFile : public IO { + static std::ifstream StreamFromFile(const std::string& file) { + std::ifstream fs; + fs.open(file); + if (!fs) { + errorf("cannot open %s for reading.\n", file.c_str()); + } + return fs; + } + + static void CloseStream(std::ifstream& fs) { + fs.close(); + } + + static bool WriteToFile( + const std::string& file, const std::string& content) { + return WriteToFile(file, content.data(), content.size()); + } + + static bool WriteToFile( + const std::string& file, const void* data, uint64_t size) { + auto fs = std::fstream(file, std::ios::out | std::ios::binary); + + if (!fs) { + errorf("cannot open %s for writing.\n", file.c_str()); + return false; + } else { + fs.write((const char*) data, size); + if (!fs) { + errorf("cannot write to %s.\n", file.c_str()); + return false; + } + + fs.close(); + } + + return true; + } +}; + +} // namespace qsim + +#endif // IO_FILE_H_ diff --git a/tpls/qsim/matrix.h b/tpls/qsim/matrix.h new file mode 100644 index 0000000..a3c2640 --- /dev/null +++ b/tpls/qsim/matrix.h @@ -0,0 +1,296 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MATRIX_H_ +#define MATRIX_H_ + +#include +#include +#include + +#include "bits.h" + +namespace qsim { + +/** + * Gate matrix type. Matrices are stored as vectors. 
The matrix elements are + * accessed as real(m[i][j]) <- vector[2 * (n * i + j)] and + * imag(m[i][j]) <- vector[2 * (n * i + j) + 1], where n is the number of rows + * or columns (n = 2^q, where q is the number of gate qubits). + */ +template +using Matrix = std::vector; + +/** + * Sets all matrix elements to zero. + * @m Matrix to be cleared. + */ +template +inline void MatrixClear(Matrix& m) { + for (unsigned i = 0; i < m.size(); ++i) { + m[i] = 0; + } +} + +/** + * Sets an identity matrix. + * @n Number of matrix rows (columns). + * @m Output identity matrix. + */ +template +inline void MatrixIdentity(unsigned n, Matrix& m) { + m.resize(2 * n * n); + + MatrixClear(m); + + for (unsigned i = 0; i < n; ++i) { + m[2 * (n * i + i)] = 1; + } +} + +/** + * Multiplies two gate matrices of equal size: m2 = m1 m2. + * @q Number of gate qubits. The number of matrix rows (columns) is 2^q. + * @m1 Matrix m1. + * @m2 Input matrix m2. Output product of matrices m2 = m1 m2. + */ +template +inline void MatrixMultiply( + unsigned q, const Matrix& m1, Matrix& m2) { + Matrix mt = m2; + unsigned n = unsigned{1} << q; + + for (unsigned i = 0; i < n; ++i) { + for (unsigned j = 0; j < n; ++j) { + fp_type2 re = 0; + fp_type2 im = 0; + + for (unsigned k = 0; k < n; ++k) { + fp_type2 r1 = m1[2 * (n * i + k)]; + fp_type2 i1 = m1[2 * (n * i + k) + 1]; + fp_type2 r2 = mt[2 * (n * k + j)]; + fp_type2 i2 = mt[2 * (n * k + j) + 1]; + + re += r1 * r2 - i1 * i2; + im += r1 * i2 + i1 * r2; + } + + m2[2 * (n * i + j)] = re; + m2[2 * (n * i + j) + 1] = im; + } + } +} + +/** + * Multiplies two gate matrices of equal size: m2 = m1^\dagger m2. + * @q Number of gate qubits. The number of matrix rows (columns) is 2^q. + * @m1 Matrix m1. + * @m2 Input matrix m2. Output product of matrices m2 = m1^\dagger m2.
+ */ +template +inline void MatrixDaggerMultiply( + unsigned q, const Matrix& m1, Matrix& m2) { + Matrix mt = m2; + unsigned n = unsigned{1} << q; + + for (unsigned i = 0; i < n; ++i) { + for (unsigned j = 0; j < n; ++j) { + fp_type2 re = 0; + fp_type2 im = 0; + + for (unsigned k = 0; k < n; ++k) { + fp_type2 r1 = m1[2 * (n * k + i)]; + fp_type2 i1 = m1[2 * (n * k + i) + 1]; + fp_type2 r2 = mt[2 * (n * k + j)]; + fp_type2 i2 = mt[2 * (n * k + j) + 1]; + + re += r1 * r2 + i1 * i2; + im += r1 * i2 - i1 * r2; + } + + m2[2 * (n * i + j)] = re; + m2[2 * (n * i + j) + 1] = im; + } + } +} + +/** + * Multiplies two gate matrices: m2 = m1 m2. The size of m1 should not exceed + * the size of m2. + * @mask1 Qubit mask that specifies the subset of qubits m1 acts on. + * @q1 Number of gate qubits. The number of matrix rows (columns) is 2^q1. + * @m1 Matrix m1. + * @q2 Number of gate qubits. The number of matrix rows (columns) is 2^q2. + * @m2 Input matrix m2. Output product of matrices m2 = m1 m2. + */ +template +inline void MatrixMultiply(unsigned mask1, + unsigned q1, const Matrix& m1, + unsigned q2, Matrix& m2) { + if (q1 == q2) { + MatrixMultiply(q1, m1, m2); + } else { + Matrix mt = m2; + unsigned n1 = unsigned{1} << q1; + unsigned n2 = unsigned{1} << q2; + + for (unsigned i = 0; i < n2; ++i) { + unsigned si = bits::CompressBits(i, q2, mask1); + + for (unsigned j = 0; j < n2; ++j) { + fp_type2 re = 0; + fp_type2 im = 0; + + for (unsigned k = 0; k < n1; ++k) { + unsigned ek = bits::ExpandBits(k, q2, mask1) + (i & ~mask1); + + fp_type2 r1 = m1[2 * (n1 * si + k)]; + fp_type2 i1 = m1[2 * (n1 * si + k) + 1]; + fp_type2 r2 = mt[2 * (n2 * ek + j)]; + fp_type2 i2 = mt[2 * (n2 * ek + j) + 1]; + + re += r1 * r2 - i1 * i2; + im += r1 * i2 + i1 * r2; + } + + m2[2 * (n2 * i + j)] = re; + m2[2 * (n2 * i + j) + 1] = im; + } + } + } +} + +/** + * Multiply a matrix by a real scalar value. + * @c Scalar value. + * @m Input matrix to be multiplied. Output matrix. 
+ */ +template +inline void MatrixScalarMultiply(fp_type1 c, Matrix& m) { + for (unsigned i = 0; i < m.size(); ++i) { + m[i] *= c; + } +} + +/** + * Multiply a matrix by a complex scalar value. + * @re Real part of scalar value. + * @im Imaginary part of scalar value. + * @m Input matrix to be multiplied. Output matrix. + */ +template +inline void MatrixScalarMultiply( + fp_type1 re, fp_type1 im, Matrix& m) { + for (unsigned i = 0; i < m.size() / 2; ++i) { + fp_type2 re0 = m[2 * i + 0]; + fp_type2 im0 = m[2 * i + 1]; + m[2 * i + 0] = re * re0 - im * im0; + m[2 * i + 1] = re * im0 + im * re0; + } +} + +/** + * Daggers a matrix. + * @n Number of matrix rows (columns). + * @m Input matrix. Output matrix. + */ +template +inline void MatrixDagger(unsigned n, Matrix& m) { + for (unsigned i = 0; i < n; ++i) { + m[2 * (n * i + i) + 1] = -m[2 * (n * i + i) + 1]; + + for (unsigned j = i + 1; j < n; ++j) { + std::swap(m[2 * (n * i + j)], m[2 * (n * j + i)]); + fp_type t = m[2 * (n * i + j) + 1]; + m[2 * (n * i + j) + 1] = -m[2 * (n * j + i) + 1]; + m[2 * (n * j + i) + 1] = -t; + } + } +} + +/** + * Gets a permutation to rearrange qubits from "normal" order to "gate" + * order. Qubits are ordered in increasing order for "normal" order. + * Qubits are ordered arbitrarily for "gate" order. Returns an empty vector + * if the qubits are in "normal" order. + * @qubits Qubit indices in "gate" order. + * @return Permutation as a vector. 
+ */ +inline std::vector NormalToGateOrderPermutation( + const std::vector& qubits) { + std::vector perm; + + bool normal_order = true; + + for (std::size_t i = 1; i < qubits.size(); ++i) { + if (qubits[i] < qubits[i - 1]) { + normal_order = false; + break; + } + } + + if (!normal_order) { + struct QI { + unsigned q; + unsigned index; + }; + + std::vector qis; + qis.reserve(qubits.size()); + + for (std::size_t i = 0; i < qubits.size(); ++i) { + qis.push_back({qubits[i], unsigned(i)}); + } + + std::sort(qis.begin(), qis.end(), [](const QI& l, const QI& r) { + return l.q < r.q; + }); + + perm.reserve(qubits.size()); + + for (std::size_t i = 0; i < qubits.size(); ++i) { + perm.push_back(qis[i].index); + } + } + + return perm; +} + +/** + * Shuffles the gate matrix elements to get the matrix that acts on qubits + * that are in "normal" order (in increasing order). + * @perm Permutation to rearrange qubits from "normal" order to "gate" order. + * @q Number of gate qubits. The number of matrix rows (columns) is 2^q. + * @m Input matrix. Output shuffled matrix. + */ +template +inline void MatrixShuffle(const std::vector& perm, + unsigned q, Matrix& m) { + Matrix mt = m; + unsigned n = unsigned{1} << q; + + for (unsigned i = 0; i < n; ++i) { + unsigned pi = bits::PermuteBits(i, q, perm); + for (unsigned j = 0; j < n; ++j) { + unsigned pj = bits::PermuteBits(j, q, perm); + + m[2 * (n * i + j)] = mt[2 * (n * pi + pj)]; + m[2 * (n * i + j) + 1] = mt[2 * (n * pi + pj) + 1]; + } + } +} + +} // namespace qsim + +#endif // MATRIX_H_ diff --git a/tpls/qsim/mps_simulator.h b/tpls/qsim/mps_simulator.h new file mode 100644 index 0000000..8fbcbae --- /dev/null +++ b/tpls/qsim/mps_simulator.h @@ -0,0 +1,246 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MPS_SIMULATOR_H_ +#define MPS_SIMULATOR_H_ + +// For templates will take care of parallelization. +#define EIGEN_DONT_PARALLELIZE 1 + +#include +#include +#include +#include +#include + +#include "../eigen/Eigen/Dense" +#include "../eigen/Eigen/SVD" +#include "mps_statespace.h" + +namespace qsim { + +namespace mps { + +/** + * Truncated Matrix Product State (MPS) circuit simulator w/ vectorization. + */ +template +class MPSSimulator final { + public: + using MPSStateSpace_ = MPSStateSpace; + using State = typename MPSStateSpace_::MPS; + using fp_type = typename MPSStateSpace_::fp_type; + + using Complex = std::complex; + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using MatrixMap = Eigen::Map; + + using OneQubitMatrix = Eigen::Matrix; + using ConstOneQubitMap = Eigen::Map; + + // Note: ForArgs are currently unused. + template + explicit MPSSimulator(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using non-vectorized instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, const fp_type* matrix, + State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 1: + ApplyGate1(qs, matrix, state); + break; + case 2: + ApplyGate2(qs, matrix, state); + break; + // case 3: + // ApplyGate3(qs, matrix, state); + // break; + // case 4: + // ApplyGate4(qs, matrix, state); + // break; + // case 5: + // ApplyGate5(qs, matrix, state); + // break; + // case 6: + // ApplyGate6(qs, matrix, state); + // break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using eigen3 operations w/ instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cmask Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cmask, + const fp_type* matrix, State& state) const { + // TODO. + } + + /** + * Computes the expectation value of an operator using eigen3 operations + * w/ vectorized instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // TODO. 
+ return std::complex(-10., -10.); + } + + private: + void ApplyGate1(const std::vector& qs, const fp_type* matrix, + State& state) const { + if (qs[0] == state.num_qubits() - 1) { + Apply1Right(qs, matrix, state); + } else { + Apply1LeftOrInterior(qs, matrix, state); + } + } + + void Apply1LeftOrInterior(const std::vector& qs, + const fp_type* matrix, State& state) const { + fp_type* raw_state = state.get(); + const auto bond_dim = state.bond_dim(); + const auto l_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); + const auto r_offset = MPSStateSpace_::GetBlockOffset(state, qs[0] + 1); + const auto end = MPSStateSpace_::Size(state); + ConstOneQubitMap gate_matrix((Complex*) matrix); + MatrixMap scratch_block((Complex*)(raw_state + end), 2, bond_dim); + + for (unsigned block_sep = l_offset; block_sep < r_offset; + block_sep += 4 * bond_dim) { + fp_type* cur_block = raw_state + block_sep; + ConstMatrixMap mps_block((Complex*) cur_block, 2, bond_dim); + scratch_block.noalias() = gate_matrix * mps_block; + memcpy(cur_block, raw_state + end, sizeof(fp_type) * bond_dim * 4); + } + } + + void Apply1Right(const std::vector& qs, const fp_type* matrix, + State& state) const { + fp_type* raw_state = state.get(); + const auto bond_dim = state.bond_dim(); + const auto offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); + const auto end = MPSStateSpace_::Size(state); + ConstOneQubitMap gate_matrix((Complex*) matrix); + ConstMatrixMap mps_block((Complex*)(raw_state + offset), bond_dim, 2); + MatrixMap scratch_block((Complex*)(raw_state + end), bond_dim, 2); + scratch_block.noalias() = mps_block * gate_matrix.transpose(); + memcpy(raw_state + offset, raw_state + end, sizeof(fp_type) * bond_dim * 4); + } + + void ApplyGate2(const std::vector& qs, const fp_type* matrix, + State& state) const { + // TODO: micro-benchmark this function and improve performance. 
+ const auto bond_dim = state.bond_dim(); + const auto num_qubits = state.num_qubits(); + fp_type* raw_state = state.get(); + + const auto i_dim = (qs[0] == 0) ? 1 : bond_dim; + const auto j_dim = 2; + const auto k_dim = bond_dim; + const auto l_dim = 2; + const auto m_dim = (qs[1] == num_qubits - 1) ? 1 : bond_dim; + + const auto b_0_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); + const auto b_1_offset = MPSStateSpace_::GetBlockOffset(state, qs[1]); + const auto end = MPSStateSpace_::Size(state); + + MatrixMap block_0((Complex*)(raw_state + b_0_offset), i_dim * j_dim, k_dim); + MatrixMap block_1((Complex*)(raw_state + b_1_offset), k_dim, l_dim * m_dim); + + // Merge both blocks into scratch space. + MatrixMap scratch_c((Complex*)(raw_state + end), i_dim * j_dim, l_dim * m_dim); + scratch_c.noalias() = block_0 * block_1; + + // Transpose inner dims in-place. + MatrixMap scratch_c_t((Complex*)(raw_state + end), i_dim * j_dim * l_dim, m_dim); + for (unsigned i = 0; i < i_dim * j_dim * l_dim; i += 4) { + scratch_c_t.row(i + 1).swap(scratch_c_t.row(i + 2)); + } + + // Transpose gate matrix and place in 3rd (last) scratch block. + const auto scratch3_offset = end + 8 * bond_dim * bond_dim; + ConstMatrixMap gate_matrix((Complex*) matrix, 4, 4); + MatrixMap gate_matrix_transpose((Complex*)(raw_state + scratch3_offset), 4, 4); + gate_matrix_transpose = gate_matrix.transpose(); + gate_matrix_transpose.col(1).swap(gate_matrix_transpose.col(2)); + + // Contract gate and merged block tensors, placing result in B0B1. + for (unsigned i = 0; i < i_dim; ++i) { + fp_type* src_block = raw_state + end + i * 8 * m_dim; + fp_type* dest_block = raw_state + b_0_offset + i * 8 * m_dim; + MatrixMap block_b0b1((Complex*) dest_block, 4, m_dim); + ConstMatrixMap scratch_c_i((Complex*) src_block, 4, m_dim); + // [i, np, m] = [np, lj] * [i, lj, m] + block_b0b1.noalias() = gate_matrix_transpose * scratch_c_i; + } + + // SVD B0B1. 
+ MatrixMap full_b0b1((Complex*)(raw_state + b_0_offset), 2 * i_dim, 2 * m_dim); + Eigen::BDCSVD svd(full_b0b1, Eigen::ComputeThinU | Eigen::ComputeThinV); + const auto p = std::min(2 * i_dim, 2 * m_dim); + + // Place U in scratch to truncate and then B0. + MatrixMap svd_u((Complex*)(raw_state + end), 2 * i_dim, p); + svd_u.noalias() = svd.matrixU(); + block_0.fill(Complex(0, 0)); + const auto keep_cols = (svd_u.cols() > bond_dim) ? bond_dim : svd_u.cols(); + block_0.block(0, 0, svd_u.rows(), keep_cols).noalias() = + svd_u(Eigen::indexing::all, Eigen::seq(0, keep_cols - 1)); + + // Place row product of S V into scratch to truncate and then B1. + MatrixMap svd_v((Complex*)(raw_state + end), p, 2 * m_dim); + MatrixMap s_vector((Complex*)(raw_state + end + 8 * bond_dim * bond_dim), p, 1); + svd_v.noalias() = svd.matrixV().adjoint(); + s_vector.noalias() = svd.singularValues(); + block_1.fill(Complex(0, 0)); + const auto keep_rows = (svd_v.rows() > bond_dim) ? bond_dim : svd_v.rows(); + const auto row_seq = Eigen::seq(0, keep_rows - 1); + for (unsigned i = 0; i < keep_rows; ++i) { + svd_v.row(i) *= s_vector(i); + } + block_1.block(0, 0, keep_rows, svd_v.cols()).noalias() = + svd_v(row_seq, Eigen::indexing::all); + } + + For for_; +}; + +} // namespace mps +} // namespace qsim + +#endif // MPS_SIMULATOR_H_ diff --git a/tpls/qsim/mps_statespace.h b/tpls/qsim/mps_statespace.h new file mode 100644 index 0000000..9b3acf3 --- /dev/null +++ b/tpls/qsim/mps_statespace.h @@ -0,0 +1,597 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef MPS_STATESPACE_H_ +#define MPS_STATESPACE_H_ + +// For templates will take care of parallelization. +#define EIGEN_DONT_PARALLELIZE 1 + +#ifdef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include + +#include "../eigen/Eigen/Dense" +#include "../eigen/unsupported/Eigen/CXX11/Tensor" + +namespace qsim { + +namespace mps { + +namespace detail { + +inline void do_not_free(void*) {} + +inline void free(void* ptr) { +#ifdef _WIN32 + _aligned_free(ptr); +#else + ::free(ptr); +#endif +} + +} // namespace detail + +/** + * Class containing context and routines for fixed bond dimension + * truncated Matrix Product State (MPS) simulation. + */ +template +class MPSStateSpace { + private: + public: + using fp_type = FP; + using Pointer = std::unique_ptr; + + using Complex = std::complex; + using Matrix = + Eigen::Matrix; + using ConstMatrixMap = Eigen::Map; + using MatrixMap = Eigen::Map; + + // Store MPS tensors with the following shape: + // [2, bond_dim], [bond_dim, 2, bond_dim], ... , [bond_dim, 2]. 
+ class MPS { + public: + MPS() = delete; + + MPS(Pointer&& ptr, unsigned num_qubits, unsigned bond_dim) + : ptr_(std::move(ptr)), num_qubits_(num_qubits), bond_dim_(bond_dim) {} + + fp_type* get() { return ptr_.get(); } + + const fp_type* get() const { return ptr_.get(); } + + fp_type* release() { + num_qubits_ = 0; + return ptr_.release(); + } + + unsigned num_qubits() const { return num_qubits_; } + + unsigned bond_dim() const { return bond_dim_; } + + private: + Pointer ptr_; + unsigned num_qubits_; + unsigned bond_dim_; + }; + + // Note: ForArgs are currently unused. + template + MPSStateSpace(ForArgs&&... args) : for_(args...) {} + + // Requires num_qubits >= 2 and bond_dim >= 2. + static MPS Create(unsigned num_qubits, unsigned bond_dim) { + auto end_sizes = 2 * 4 * bond_dim; + auto internal_sizes = 4 * bond_dim * bond_dim * (num_qubits + 1); + // Use three extra "internal style" blocks past the end of the + // working allocation for scratch space. Needed for gate + // application. + auto size = sizeof(fp_type) * (end_sizes + internal_sizes); + +#ifdef _WIN32 + Pointer ptr{(fp_type*)_aligned_malloc(size, 64), &detail::free}; + bool is_null = ptr.get() != nullptr; + return MPS{std::move(ptr), is_null ? num_qubits : 0, + is_null ? bond_dim : 0}; +#else + void* p = nullptr; + if (posix_memalign(&p, 64, size) == 0) { + return MPS{Pointer{(fp_type*)p, &detail::free}, num_qubits, bond_dim}; + } else { + return MPS{Pointer{nullptr, &detail::free}, 0, 0}; + } +#endif + } + + static unsigned Size(const MPS& state) { + auto end_sizes = 2 * 4 * state.bond_dim(); + auto internal_sizes = 4 * state.bond_dim() * state.bond_dim(); + return end_sizes + internal_sizes * (state.num_qubits() - 2); + } + + static unsigned RawSize(const MPS& state) { + return sizeof(fp_type) * Size(state); + } + + // Get the pointer offset to the beginning of an MPS block. 
+ static unsigned GetBlockOffset(const MPS& state, unsigned i) { + if (i == 0) { + return 0; + } + return 4 * state.bond_dim() * (1 + state.bond_dim() * (i - 1)); + } + + // Copies the state contents of one MPS to another. + // Ignores scratch data. + static bool Copy(const MPS& src, MPS& dest) { + if ((src.num_qubits() != dest.num_qubits()) || + src.bond_dim() != dest.bond_dim()) { + return false; + } + auto size = RawSize(src); + memcpy(dest.get(), src.get(), size); + return true; + } + + // Set the MPS to the |0> state. + static void SetStateZero(MPS& state) { + auto size = Size(state); + memset(state.get(), 0, sizeof(fp_type) * size); + auto block_size = 4 * state.bond_dim() * state.bond_dim(); + state.get()[0] = 1.0; + for (unsigned i = 4 * state.bond_dim(); i < size; i += block_size) { + state.get()[i] = 1.0; + } + } + + // Computes Re{} for two equal sized MPS. + // Requires: state1.bond_dim() == state2.bond_dim() && + // state1.num_qubits() == state2.num_qubits() + static fp_type RealInnerProduct(MPS& state1, MPS& state2) { + return InnerProduct(state1, state2).real(); + } + + // Computes for two equal sized MPS. + // Requires: state1.bond_dim() == state2.bond_dim() && + // state1.num_qubits() == state2.num_qubits() + static std::complex InnerProduct(MPS& state1, MPS& state2) { + const auto num_qubits = state1.num_qubits(); + const auto bond_dim = state1.bond_dim(); + const auto end = Size(state1); + auto offset = 0; + fp_type* state1_raw = state1.get(); + fp_type* state2_raw = state2.get(); + + // Contract leftmost blocks together, store result in state1 scratch. 
+ ConstMatrixMap top((Complex*)state2_raw, 2, bond_dim); + ConstMatrixMap bot((Complex*)state1_raw, 2, bond_dim); + MatrixMap partial_contract((Complex*)(state1_raw + end), bond_dim, + bond_dim); + MatrixMap partial_contract2( + (Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), bond_dim, + 2 * bond_dim); + partial_contract.noalias() = top.adjoint() * bot; + + // Contract all internal blocks together. + for (unsigned i = 1; i < num_qubits - 1; ++i) { + offset = GetBlockOffset(state1, i); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), + bond_dim, 2 * bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, + 2 * bond_dim); + partial_contract2.noalias() = partial_contract * bot; + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), + 2 * bond_dim, bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract.noalias() = top.adjoint() * partial_contract2; + } + + // Contract rightmost bottom block. + offset = GetBlockOffset(state1, num_qubits - 1); + new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, 2); + new (&partial_contract2) MatrixMap( + (Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), bond_dim, 2); + partial_contract2.noalias() = partial_contract * bot; + + // Contract rightmost top block. + new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, 1); + new (&partial_contract) MatrixMap((Complex*)(state1_raw + end), 1, 1); + new (&partial_contract2) + MatrixMap((Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), + 2 * bond_dim, 1); + partial_contract.noalias() = top.adjoint() * partial_contract2; + + return partial_contract(0, 0); + } + + // Compute the 2x2 1-RDM of state on index. Result written to rdm. 
+ // Requires: scratch and rdm to be allocated. + static void ReduceDensityMatrix(MPS& state, MPS& scratch, int index, + fp_type* rdm) { + const auto num_qubits = state.num_qubits(); + const auto bond_dim = state.bond_dim(); + const auto end = Size(state); + const bool last_index = (index == num_qubits - 1); + const auto right_dim = (last_index ? 1 : bond_dim); + auto offset = 0; + fp_type* state_raw = state.get(); + fp_type* scratch_raw = scratch.get(); + fp_type* state_raw_workspace = state_raw + end + 2 * bond_dim * bond_dim; + fp_type* scratch_raw_workspace = + scratch_raw + end + 2 * bond_dim * bond_dim; + + Copy(state, scratch); + + // Contract leftmost blocks together, store result in state scratch. + ConstMatrixMap top((Complex*)scratch_raw, 2, bond_dim); + ConstMatrixMap bot((Complex*)state_raw, 2, bond_dim); + MatrixMap partial_contract((Complex*)(state_raw + end), bond_dim, bond_dim); + MatrixMap partial_contract2((Complex*)(state_raw_workspace), bond_dim, + 2 * bond_dim); + + partial_contract.setZero(); + partial_contract(0, 0) = 1; + if (index > 0) { + partial_contract.noalias() = top.adjoint() * bot; + } + + // Contract all internal blocks together. + for (unsigned i = 1; i < index; ++i) { + offset = GetBlockOffset(state, i); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw_workspace), bond_dim, 2 * bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, + 2 * bond_dim); + partial_contract2.noalias() = partial_contract * bot; + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw_workspace), 2 * bond_dim, bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract.noalias() = top.adjoint() * partial_contract2; + } + + // The [bond_dim, bond_dim] block in state_raw now contains the contraction + // up to, but not including index. 
+ // Contract rightmost blocks. + offset = GetBlockOffset(state, num_qubits - 1); + new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, 2); + new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2); + new (&partial_contract) + MatrixMap((Complex*)(scratch_raw + end), bond_dim, bond_dim); + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim); + + partial_contract.setZero(); + partial_contract(0, 0) = 1; + if (index < num_qubits - 1) { + partial_contract.noalias() = top * bot.adjoint(); + } + + for (unsigned i = num_qubits - 2; i > index; --i) { + offset = GetBlockOffset(state, i); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), 2 * bond_dim, bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract2.noalias() = bot * partial_contract.adjoint(); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, + 2 * bond_dim); + // [bd, bd] = [bd, 2bd] @ [bd, 2bd] + partial_contract.noalias() = top * partial_contract2.adjoint(); + } + + // The [bond_dim, bond_dim] block in scratch_raw now contains the + // contraction down from the end, but not including the index. Begin final + // contraction steps. + + // Get leftmost [bd, bd] contraction and contract with top. 
+ + offset = GetBlockOffset(state, index); + new (&partial_contract) + MatrixMap((Complex*)(state_raw + end), bond_dim, bond_dim); + new (&top) + ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2 * right_dim); + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * right_dim); + partial_contract2.noalias() = partial_contract * top.conjugate(); + // copy the bottom contraction scratch_raw to state_raw to save space. + memcpy(state_raw + end, scratch_raw + end, + bond_dim * bond_dim * 2 * sizeof(fp_type)); + + // Contract top again for correct shape. + fp_type* contract3_target = (last_index ? rdm : scratch_raw); + MatrixMap partial_contract3((Complex*)contract3_target, 2 * right_dim, + 2 * right_dim); + partial_contract3.noalias() = top.transpose() * partial_contract2; + + // If we are contracting the last index, all the needed transforms are done. + if (last_index) { + return; + } + + // Conduct final tensor contraction operations. Cannot be easily compiled to + // matmul. + const Eigen::TensorMap> + t_4d((Complex*)scratch_raw, 2, bond_dim, 2, bond_dim); + const Eigen::TensorMap> + t_2d((Complex*)(state_raw + end), bond_dim, bond_dim); + + const Eigen::array, 2> product_dims = { + Eigen::IndexPair(1, 0), + Eigen::IndexPair(3, 1), + }; + Eigen::TensorMap> out( + (Complex*)rdm, 2, 2); + out = t_4d.contract(t_2d, product_dims); + } + + // Draw a single bitstring sample from state using scratch and scratch2 + // as working space. + static void SampleOnce(MPS& state, MPS& scratch, MPS& scratch2, + std::mt19937* random_gen, std::vector* sample) { + // TODO: carefully profile with perf and optimize temp storage + // locations for cache friendliness. 
+ const auto bond_dim = state.bond_dim(); + const auto num_qubits = state.num_qubits(); + const auto end = Size(state); + const auto left_frontier_offset = GetBlockOffset(state, num_qubits + 1); + std::default_random_engine generator; + fp_type* state_raw = state.get(); + fp_type* scratch_raw = scratch.get(); + fp_type* scratch2_raw = scratch2.get(); + fp_type rdm[8]; + + sample->reserve(num_qubits); + Copy(state, scratch); + Copy(state, scratch2); + + // Store prefix contractions in scratch2. + auto offset = GetBlockOffset(state, num_qubits - 1); + ConstMatrixMap top((Complex*)(state_raw + offset), bond_dim, 2); + ConstMatrixMap bot((Complex*)(scratch_raw + offset), bond_dim, 2); + MatrixMap partial_contract((Complex*)(scratch2_raw + offset), bond_dim, + bond_dim); + MatrixMap partial_contract2((Complex*)(scratch_raw + end), bond_dim, + 2 * bond_dim); + partial_contract.noalias() = top * bot.adjoint(); + + for (unsigned i = num_qubits - 2; i > 0; --i) { + offset = GetBlockOffset(state, i); + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw + end), 2 * bond_dim, bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract2.noalias() = bot * partial_contract.adjoint(); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw + end), bond_dim, 2 * bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, + 2 * bond_dim); + + // merge into partial_contract -> scracth2_raw. + new (&partial_contract) + MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); + partial_contract.noalias() = top * partial_contract2.adjoint(); + } + + // Compute RDM-0 and draw first sample. 
+ offset = GetBlockOffset(state, 1); + new (&top) ConstMatrixMap((Complex*)state_raw, 2, bond_dim); + new (&bot) ConstMatrixMap((Complex*)scratch_raw, 2, bond_dim); + new (&partial_contract) + MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); + new (&partial_contract2) + MatrixMap((Complex*)(scratch_raw + end), 2, bond_dim); + + partial_contract2.noalias() = bot * partial_contract.adjoint(); + + new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2); + partial_contract.noalias() = top * partial_contract2.adjoint(); + auto p0 = rdm[0] / (rdm[0] + rdm[6]); + std::bernoulli_distribution distribution(1 - p0); + auto bit_val = distribution(*random_gen); + sample->push_back(bit_val); + + // collapse state. + new (&partial_contract) MatrixMap((Complex*)scratch_raw, 2, bond_dim); + partial_contract.row(!bit_val).setZero(); + + // Prepare left contraction frontier. + new (&partial_contract2) MatrixMap( + (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); + partial_contract2.noalias() = + partial_contract.transpose() * partial_contract.conjugate(); + + // Compute RDM-i and draw internal tensor samples. + for (unsigned i = 1; i < num_qubits - 1; i++) { + // Get leftmost [bd, bd] contraction and contract with top. + offset = GetBlockOffset(state, i); + new (&partial_contract) MatrixMap( + (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); + new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, + 2 * bond_dim); + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim); + partial_contract2.noalias() = partial_contract * top.conjugate(); + + // Contract top again for correct shape. + MatrixMap partial_contract3((Complex*)(scratch_raw + end), 2 * bond_dim, + 2 * bond_dim); + partial_contract3.noalias() = top.transpose() * partial_contract2; + + // Conduct final tensor contraction operations. Cannot be easily compiled + // to matmul. 
Perf reports shows only ~6% of runtime spent here on large + // systems. + offset = GetBlockOffset(state, i + 1); + const Eigen::TensorMap> + t_4d((Complex*)(scratch_raw + end), 2, bond_dim, 2, bond_dim); + const Eigen::TensorMap> + t_2d((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); + + const Eigen::array, 2> product_dims = { + Eigen::IndexPair(1, 0), + Eigen::IndexPair(3, 1), + }; + Eigen::TensorMap> out( + (Complex*)rdm, 2, 2); + out = t_4d.contract(t_2d, product_dims); + + // Sample bit and collapse state. + p0 = rdm[0] / (rdm[0] + rdm[6]); + distribution = std::bernoulli_distribution(1 - p0); + bit_val = distribution(*random_gen); + + sample->push_back(bit_val); + offset = GetBlockOffset(state, i); + new (&partial_contract) + MatrixMap((Complex*)(scratch_raw + offset), bond_dim * 2, bond_dim); + for (unsigned j = !bit_val; j < 2 * bond_dim; j += 2) { + partial_contract.row(j).setZero(); + } + + // Update left frontier. + new (&partial_contract) MatrixMap( + (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim); + + // Merge bot into left boundary merged tensor. + new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, + 2 * bond_dim); + partial_contract2.noalias() = partial_contract * bot.conjugate(); + + // reshape: + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), 2 * bond_dim, bond_dim); + + // Merge top into partial_contract2. + new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, + bond_dim); + partial_contract.noalias() = top.transpose() * partial_contract2; + } + + // Compute RDM-(n-1) and sample. 
+ offset = GetBlockOffset(state, num_qubits - 1); + new (&partial_contract2) + MatrixMap((Complex*)(state_raw + end), bond_dim, 2); + + new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2); + partial_contract2.noalias() = partial_contract * top.conjugate(); + new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2); + partial_contract.noalias() = top.transpose() * partial_contract2; + + p0 = rdm[0] / (rdm[0] + rdm[6]); + distribution = std::bernoulli_distribution(1 - p0); + bit_val = distribution(*random_gen); + sample->push_back(bit_val); + } + + // Draw num_samples bitstring samples from state and store the result + // bit vectors in results. Uses scratch and scratch2 as workspace. + static void Sample(MPS& state, MPS& scratch, MPS& scratch2, + unsigned num_samples, unsigned seed, + std::vector>* results) { + std::mt19937 rand_source(seed); + results->reserve(num_samples); + for (unsigned i = 0; i < num_samples; i++) { + SampleOnce(state, scratch, scratch2, &rand_source, &(*results)[i]); + } + } + + // Testing only. Convert the MPS to a wavefunction under "normal" ordering. + // Requires: wf be allocated beforehand with bond_dim * 2 ^ num_qubits -1 + // memory. + static void ToWaveFunction(MPS& state, fp_type* wf) { + const auto bond_dim = state.bond_dim(); + const auto num_qubits = state.num_qubits(); + fp_type* raw_state = state.get(); + + ConstMatrixMap accum = ConstMatrixMap((Complex*)(raw_state), 2, bond_dim); + ConstMatrixMap next_block = ConstMatrixMap(nullptr, 0, 0); + MatrixMap result2 = MatrixMap(nullptr, 0, 0); + auto offset = 0; + auto result2_size = 2; + + for (unsigned i = 1; i < num_qubits - 1; i++) { + offset = GetBlockOffset(state, i); + // use of new does not trigger any expensive operations. + new (&next_block) ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, + 2 * bond_dim); + new (&result2) MatrixMap((Complex*)(wf), result2_size, 2 * bond_dim); + + // temp variable used since result2 and accum point to same memory. 
+ result2 = accum * next_block; + result2_size *= 2; + new (&accum) ConstMatrixMap((Complex*)(wf), result2_size, bond_dim); + } + offset = GetBlockOffset(state, num_qubits - 1); + new (&next_block) + ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, 2); + new (&result2) MatrixMap((Complex*)(wf), result2_size, 2); + result2 = accum * next_block; + } + + protected: + For for_; +}; + +} // namespace mps +} // namespace qsim + +#endif // MPS_STATESPACE_H_ diff --git a/tpls/qsim/parfor.h b/tpls/qsim/parfor.h new file mode 100644 index 0000000..8a3a4d6 --- /dev/null +++ b/tpls/qsim/parfor.h @@ -0,0 +1,123 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PARFOR_H_ +#define PARFOR_H_ + +#include + +#include +#include +#include + +namespace qsim { + +/** + * Helper struct for executing for-loops in parallel across multiple threads. + */ +template +struct ParallelForT { + explicit ParallelForT(unsigned num_threads) : num_threads(num_threads) {} + + // GetIndex0 and GetIndex1 are useful when we need to know how work was + // divided between threads, for instance, for reusing partial sums obtained + // by RunReduceP. + uint64_t GetIndex0(uint64_t size, unsigned thread_id) const { + return size >= MIN_SIZE ? size * thread_id / num_threads : 0; + } + + uint64_t GetIndex1(uint64_t size, unsigned thread_id) const { + return size >= MIN_SIZE ? 
size * (thread_id + 1) / num_threads : size; + } + + template + void Run(uint64_t size, Function&& func, Args&&... args) const { + if (num_threads > 1 && size >= MIN_SIZE) { + #pragma omp parallel num_threads(num_threads) + { + unsigned n = omp_get_num_threads(); + unsigned m = omp_get_thread_num(); + + uint64_t i0 = GetIndex0(size, m); + uint64_t i1 = GetIndex1(size, m); + + for (uint64_t i = i0; i < i1; ++i) { + func(n, m, i, args...); + } + } + } else { + for (uint64_t i = 0; i < size; ++i) { + func(1, 0, i, args...); + } + } + } + + template + std::vector RunReduceP( + uint64_t size, Function&& func, Op&& op, Args&&... args) const { + std::vector partial_results; + + if (num_threads > 1 && size >= MIN_SIZE) { + partial_results.resize(num_threads, 0); + + #pragma omp parallel num_threads(num_threads) + { + unsigned n = omp_get_num_threads(); + unsigned m = omp_get_thread_num(); + + uint64_t i0 = GetIndex0(size, m); + uint64_t i1 = GetIndex1(size, m); + + typename Op::result_type partial_result = 0; + + for (uint64_t i = i0; i < i1; ++i) { + partial_result = op(partial_result, func(n, m, i, args...)); + } + + partial_results[m] = partial_result; + } + } else if (num_threads > 0) { + typename Op::result_type result = 0; + for (uint64_t i = 0; i < size; ++i) { + result = op(result, func(1, 0, i, args...)); + } + + partial_results.resize(1, result); + } + + return partial_results; + } + + template + typename Op::result_type RunReduce(uint64_t size, Function&& func, + Op&& op, Args&&... 
args) const { + auto partial_results = RunReduceP(size, func, std::move(op), args...); + + typename Op::result_type result = 0; + + for (auto partial_result : partial_results) { + result = op(result, partial_result); + } + + return result; + } + + unsigned num_threads; +}; + +using ParallelFor = ParallelForT<1024>; + +} // namespace qsim + +#endif // PARFOR_H_ diff --git a/tpls/qsim/qtrajectory.h b/tpls/qsim/qtrajectory.h new file mode 100644 index 0000000..1da6692 --- /dev/null +++ b/tpls/qsim/qtrajectory.h @@ -0,0 +1,435 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef QTRAJECTORY_H_ +#define QTRAJECTORY_H_ + +#include +#include +#include +#include +#include + +#include "circuit_noisy.h" +#include "gate.h" +#include "gate_appl.h" + +namespace qsim { + +/** + * Quantum trajectory simulator. + */ +template class FuserT, typename Simulator, + typename RGen = std::mt19937> +class QuantumTrajectorySimulator { + public: + using Fuser = FuserT; + using StateSpace = typename Simulator::StateSpace; + using State = typename Simulator::State; + using MeasurementResult = typename StateSpace::MeasurementResult; + + /** + * User-specified parameters for the simulator. + */ + struct Parameter : public Fuser::Parameter { + /** + * If true, collect statistics of sampled Kraus operator indices. + */ + bool collect_kop_stat = false; + /** + * If true, collect statistics of measured bitstrings. 
+ */ + bool collect_mea_stat = false; + /** + * If true, normalize the state vector before performing measurements. + */ + bool normalize_before_mea_gates = true; + /** + * If false, do not apply deferred operators after the main loop for + * the "primary" noise trajectory, that is the trajectory in which + * the primary (the first operators in their respective channels) Kraus + * operators are sampled for each channel and there are no measurements + * in the computational basis. This can be used to speed up simulations + * of circuits with weak noise and without measurements by reusing + * the primary trajectory results. There is an additional condition for + * RunBatch. In this case, the deferred operators after the main loop are + * still applied for the first occurrence of the primary trajectory. + * The primary Kraus operators should have the highest sampling + * probabilities to achieve the highest speedup. + * + * It is the client's responsibility to collect the primary trajectory + * results and to reuse them. + */ + bool apply_last_deferred_ops = true; + }; + + /** + * Struct with statistics to populate by RunBatch and RunOnce methods. + */ + struct Stat { + /** + * Sampled Kraus operator indices and/or measured bitstrings. + */ + std::vector samples; + /** + * True if the "primary" noise trajectory is sampled, false otherwise. + */ + bool primary; + }; + + /** + * Runs the given noisy circuit performing repetitions. Each repetition is + * seeded by repetition ID. + * @param param Options for the quantum trajectory simulator. + * @param circuit The noisy circuit to be simulated. + * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions. + * @param state_space StateSpace object required to manipulate state vector. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param measure Function that performs measurements (in the sense of + * computing expectation values, etc).
This function should have three + * required parameters [repetition ID (uint64_t), final state vector + * (const State&), statistics of sampled Kraus operator indices and/or + * measured bitstrings (const Stat&)] and any number of optional parameters. + * @param args Optional arguments for the 'measure' function. + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool RunBatch(const Parameter& param, + const NoisyCircuit& circuit, + uint64_t r0, uint64_t r1, const StateSpace& state_space, + const Simulator& simulator, MeasurementFunc&& measure, + Args&&... args) { + return RunBatch(param, circuit.num_qubits, circuit.channels.begin(), + circuit.channels.end(), r0, r1, state_space, simulator, + measure, args...); + } + + /** + * Runs the given noisy circuit performing repetitions. Each repetition is + * seeded by repetition ID. + * @param param Options for the quantum trajectory simulator. + * @param num_qubits The number of qubits acted on by the circuit. + * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit. + * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions. + * @param state_space StateSpace object required to manipulate state vector. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param measure Function that performs measurements (in the sense of + * computing expectation values, etc). This function should have three + * required parameters [repetition ID (uint64_t), final state vector + * (const State&), statistics of sampled Kraus operator indices and/or + * measured bitstrings (const Stat&)] and any number of optional parameters. + * @param args Optional arguments for the 'measure' function. + * @return True if the simulation completed successfully; false otherwise. 
+ */ + template + static bool RunBatch(const Parameter& param, unsigned num_qubits, + ncircuit_iterator cbeg, + ncircuit_iterator cend, + uint64_t r0, uint64_t r1, const StateSpace& state_space, + const Simulator& simulator, MeasurementFunc&& measure, + Args&&... args) { + std::vector gates; + gates.reserve(4 * std::size_t(cend - cbeg)); + + State state = state_space.Null(); + + Stat stat; + bool had_primary_realization = false; + + for (uint64_t r = r0; r < r1; ++r) { + if (!state_space.IsNull(state)) { + state_space.SetStateZero(state); + } + + bool apply_last_deferred_ops = + param.apply_last_deferred_ops || !had_primary_realization; + + if (!RunIteration(param, apply_last_deferred_ops, num_qubits, cbeg, cend, + r, state_space, simulator, gates, state, stat)) { + return false; + } + + if (stat.primary && !had_primary_realization) { + had_primary_realization = true; + } + + measure(r, state, stat, args...); + } + + return true; + } + + /** + * Runs the given noisy circuit one time. + * @param param Options for the quantum trajectory simulator. + * @param circuit The noisy circuit to be simulated. + * @param r The repetition ID. The random number generator is seeded by 'r'. + * @param state_space StateSpace object required to manipulate state vector. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param state The state of the system, to be updated by this method. + * @param stat Statistics of sampled Kraus operator indices and/or measured + * bitstrings, to be populated by this method. + * @return True if the simulation completed successfully; false otherwise. 
+ */ + static bool RunOnce(const Parameter& param, + const NoisyCircuit& circuit, uint64_t r, + const StateSpace& state_space, const Simulator& simulator, + State& state, Stat& stat) { + return RunOnce(param, circuit.num_qubits, circuit.channels.begin(), + circuit.channels.end(), r, state_space, simulator, + state, stat); + } + + /** + * Runs the given noisy circuit one time. + * @param param Options for the quantum trajectory simulator. + * @param num_qubits The number of qubits acted on by the circuit. + * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit. + * @param circuit The noisy circuit to be simulated. + * @param r The repetition ID. The random number generator is seeded by 'r'. + * @param state_space StateSpace object required to manipulate state vector. + * @param simulator Simulator object. Provides specific implementations for + * applying gates. + * @param state The state of the system, to be updated by this method. + * @param stat Statistics of sampled Kraus operator indices and/or measured + * bitstrings, to be populated by this method. + * @return True if the simulation completed successfully; false otherwise. 
+ */ + static bool RunOnce(const Parameter& param, unsigned num_qubits, + ncircuit_iterator cbeg, + ncircuit_iterator cend, + uint64_t r, const StateSpace& state_space, + const Simulator& simulator, State& state, Stat& stat) { + std::vector gates; + gates.reserve(4 * std::size_t(cend - cbeg)); + + if (!RunIteration(param, param.apply_last_deferred_ops, num_qubits, cbeg, + cend, r, state_space, simulator, gates, state, stat)) { + return false; + } + + return true; + } + + private: + static bool RunIteration(const Parameter& param, + bool apply_last_deferred_ops, unsigned num_qubits, + ncircuit_iterator cbeg, + ncircuit_iterator cend, + uint64_t rep, const StateSpace& state_space, + const Simulator& simulator, + std::vector& gates, + State& state, Stat& stat) { + if (param.collect_kop_stat || param.collect_mea_stat) { + stat.samples.reserve(std::size_t(cend - cbeg)); + stat.samples.resize(0); + } + + if (state_space.IsNull(state)) { + state = CreateState(num_qubits, state_space); + if (state_space.IsNull(state)) { + return false; + } + + state_space.SetStateZero(state); + } + + gates.resize(0); + + RGen rgen(rep); + std::uniform_real_distribution distr(0.0, 1.0); + + bool unitary = true; + stat.primary = true; + + for (auto it = cbeg; it != cend; ++it) { + const auto& channel = *it; + + if (channel.size() == 0) continue; + + if (channel[0].kind == gate::kMeasurement) { + // Measurement channel. + + if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { + return false; + } + + bool normalize = !unitary && param.normalize_before_mea_gates; + NormalizeState(normalize, state_space, unitary, state); + + auto mresult = ApplyMeasurementGate(state_space, channel[0].ops[0], + rgen, state); + + if (!mresult.valid) { + return false; + } + + CollectStat(param.collect_mea_stat, mresult.bits, stat); + + stat.primary = false; + + continue; + } + + // "Normal" channel. 
+ + double r = distr(rgen); + double cp = 0; + + // Perform sampling of Kraus operators using probability bounds. + for (std::size_t i = 0; i < channel.size(); ++i) { + const auto& kop = channel[i]; + + cp += kop.prob; + + if (r < cp) { + DeferOps(kop.ops, gates); + CollectStat(param.collect_kop_stat, i, stat); + + unitary = unitary && kop.unitary; + + break; + } + } + + if (r < cp) continue; + + if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { + return false; + } + + NormalizeState(!unitary, state_space, unitary, state); + + double max_prob = 0; + std::size_t max_prob_index = 0; + + // Perform sampling of Kraus operators using norms of updated states. + for (std::size_t i = 0; i < channel.size(); ++i) { + const auto& kop = channel[i]; + + if (kop.unitary) continue; + + double prob = std::real( + simulator.ExpectationValue(kop.qubits, kop.kd_k.data(), state)); + + if (prob > max_prob) { + max_prob = prob; + max_prob_index = i; + } + + cp += prob - kop.prob; + + if (r < cp || i == channel.size() - 1) { + // Sample ith Kraus operator if r < cp + // Sample the highest probability Kraus operator if r is greater + // than the sum of all probabilities due to round-off errors. + uint64_t k = r < cp ?
i : max_prob_index; + + DeferOps(channel[k].ops, gates); + CollectStat(param.collect_kop_stat, k, stat); + + unitary = false; + + break; + } + } + } + + if (apply_last_deferred_ops || !stat.primary) { + if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { + return false; + } + + NormalizeState(!unitary, state_space, unitary, state); + } + + return true; + } + + static State CreateState(unsigned num_qubits, const StateSpace& state_space) { + auto state = state_space.Create(num_qubits); + if (state_space.IsNull(state)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return state_space.Null(); + } + + return state; + } + + static bool ApplyDeferredOps( + const Parameter& param, unsigned num_qubits, const Simulator& simulator, + std::vector& gates, State& state) { + if (gates.size() > 0) { + auto fgates = Fuser::FuseGates(param, num_qubits, gates); + + gates.resize(0); + + if (fgates.size() == 0) { + return false; + } + + for (const auto& fgate : fgates) { + ApplyFusedGate(simulator, fgate, state); + } + } + + return true; + } + + static MeasurementResult ApplyMeasurementGate( + const StateSpace& state_space, const Gate& gate, + RGen& rgen, State& state) { + auto result = state_space.Measure(gate.qubits, rgen, state); + + if (!result.valid) { + IO::errorf("measurement failed.\n"); + } + + return result; + } + + static void DeferOps( + const std::vector& ops, std::vector& gates) { + for (const auto& op : ops) { + gates.push_back(&op); + } + } + + static void CollectStat(bool collect_stat, uint64_t i, Stat& stat) { + if (collect_stat) { + stat.samples.push_back(i); + } + + if (i != 0) { + stat.primary = false; + } + } + + static void NormalizeState(bool normalize, const StateSpace& state_space, + bool& flag, State& state) { + if (normalize) { + double a = 1.0 / std::sqrt(state_space.Norm(state)); + state_space.Multiply(a, state); + flag = true; + } + } +}; + +} // namespace qsim + +#endif // QTRAJECTORY_H_ diff --git 
a/tpls/qsim/run_qsim.h b/tpls/qsim/run_qsim.h new file mode 100644 index 0000000..3752915 --- /dev/null +++ b/tpls/qsim/run_qsim.h @@ -0,0 +1,262 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef RUN_QSIM_H_ +#define RUN_QSIM_H_ + +#include +#include +#include + +#include "gate.h" +#include "gate_appl.h" +#include "util.h" + +namespace qsim { + +/** + * Helper struct for running qsim. + */ +template +struct QSimRunner final { + public: + using Simulator = typename Factory::Simulator; + using StateSpace = typename Simulator::StateSpace; + using State = typename StateSpace::State; + using MeasurementResult = typename StateSpace::MeasurementResult; + + /** + * User-specified parameters for gate fusion and simulation. + */ + struct Parameter : public Fuser::Parameter { + /** + * Random number generator seed to apply measurement gates. + */ + uint64_t seed; + }; + + /** + * Runs the given circuit, only measuring at the end. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param circuit The circuit to be simulated. + * @param measure Function that performs measurements (in the sense of + * computing expectation values, etc). + * @return True if the simulation completed successfully; false otherwise. 
+ */ + template + static bool Run(const Parameter& param, const Factory& factory, + const Circuit& circuit, MeasurementFunc measure) { + return Run(param, factory, {circuit.gates.back().time}, circuit, measure); + } + + /** + * Runs the given circuit, measuring at user-specified times. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param times_to_measure_at Time steps at which to perform measurements. + * @param circuit The circuit to be simulated. + * @param measure Function that performs measurements (in the sense of + * computing expectation values, etc). + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Factory& factory, + const std::vector& times_to_measure_at, + const Circuit& circuit, MeasurementFunc measure) { + double t0 = 0.0; + double t1 = 0.0; + + if (param.verbosity > 1) { + t0 = GetTime(); + } + + RGen rgen(param.seed); + + StateSpace state_space = factory.CreateStateSpace(); + + auto state = state_space.Create(circuit.num_qubits); + if (state_space.IsNull(state)) { + IO::errorf("not enough memory: is the number of qubits too large?\n"); + return false; + } + + state_space.SetStateZero(state); + Simulator simulator = factory.CreateSimulator(); + + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("init time is %g seconds.\n", t1 - t0); + t0 = GetTime(); + } + + auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits, + circuit.gates, times_to_measure_at); + + if (fused_gates.size() == 0 && circuit.gates.size() > 0) { + return false; + } + + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("fuse time is %g seconds.\n", t1 - t0); + } + + if (param.verbosity > 0) { + t0 = GetTime(); + } + + unsigned cur_time_index = 0; + + // Apply fused gates. 
+ for (std::size_t i = 0; i < fused_gates.size(); ++i) { + if (param.verbosity > 3) { + t1 = GetTime(); + } + + if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, + state)) { + IO::errorf("measurement failed.\n"); + return false; + } + + if (param.verbosity > 3) { + state_space.DeviceSync(); + double t2 = GetTime(); + IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); + } + + unsigned t = times_to_measure_at[cur_time_index]; + + if (i == fused_gates.size() - 1 || t < fused_gates[i + 1].time) { + // Call back to perform measurements. + measure(cur_time_index, state_space, state); + ++cur_time_index; + } + } + + if (param.verbosity > 0) { + state_space.DeviceSync(); + double t2 = GetTime(); + IO::messagef("time is %g seconds.\n", t2 - t0); + } + + return true; + } + + /** + * Runs the given circuit and make the final state available to the caller, + * recording the result of any intermediate measurements in the circuit. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param circuit The circuit to be simulated. + * @param state As an input parameter, this should contain the initial state + * of the system. After a successful run, it will be populated with the + * final state of the system. + * @param measure_results As an input parameter, this should be empty. + * After a successful run, this will contain all measurements results from + * the run, ordered by time and qubit index. + * @return True if the simulation completed successfully; false otherwise. 
+ */ + template + static bool Run(const Parameter& param, const Factory& factory, + const Circuit& circuit, State& state, + std::vector& measure_results) { + double t0 = 0.0; + double t1 = 0.0; + + if (param.verbosity > 1) { + t0 = GetTime(); + } + + RGen rgen(param.seed); + + StateSpace state_space = factory.CreateStateSpace(); + Simulator simulator = factory.CreateSimulator(); + + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("init time is %g seconds.\n", t1 - t0); + t0 = GetTime(); + } + + auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits, + circuit.gates); + + if (fused_gates.size() == 0 && circuit.gates.size() > 0) { + return false; + } + + measure_results.reserve(fused_gates.size()); + + if (param.verbosity > 1) { + t1 = GetTime(); + IO::messagef("fuse time is %g seconds.\n", t1 - t0); + } + + if (param.verbosity > 0) { + t0 = GetTime(); + } + + // Apply fused gates. + for (std::size_t i = 0; i < fused_gates.size(); ++i) { + if (param.verbosity > 3) { + t1 = GetTime(); + } + + if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, state, + measure_results)) { + IO::errorf("measurement failed.\n"); + return false; + } + + if (param.verbosity > 3) { + state_space.DeviceSync(); + double t2 = GetTime(); + IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); + } + } + + if (param.verbosity > 0) { + state_space.DeviceSync(); + double t2 = GetTime(); + IO::messagef("simu time is %g seconds.\n", t2 - t0); + } + + return true; + } + + /** + * Runs the given circuit and make the final state available to the caller, + * discarding the result of any intermediate measurements in the circuit. + * @param param Options for gate fusion, parallelism and logging. + * @param factory Object to create simulators and state spaces. + * @param circuit The circuit to be simulated. + * @param state As an input parameter, this should contain the initial state + * of the system. 
After a successful run, it will be populated with the + * final state of the system. + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Factory& factory, + const Circuit& circuit, State& state) { + std::vector discarded_results; + return Run(param, factory, circuit, state, discarded_results); + } +}; + +} // namespace qsim + +#endif // RUN_QSIM_H_ diff --git a/tpls/qsim/run_qsimh.h b/tpls/qsim/run_qsimh.h new file mode 100644 index 0000000..c1534d3 --- /dev/null +++ b/tpls/qsim/run_qsimh.h @@ -0,0 +1,120 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef RUN_QSIMH_H_ +#define RUN_QSIMH_H_ + +#include +#include + +#include "hybrid.h" +#include "util.h" + +namespace qsim { + +/** + * Helper struct for running qsimh. + */ +template +struct QSimHRunner final { + using Gate = typename HybridSimulator::Gate; + using fp_type = typename HybridSimulator::fp_type; + + using Parameter = typename HybridSimulator::Parameter; + using HybridData = typename HybridSimulator::HybridData; + using Fuser = typename HybridSimulator::Fuser; + + /** + * Evaluates the amplitudes for a given circuit and set of output states. + * @param param Options for gate fusion, parallelism and logging. Also + * specifies the size of the 'prefix' and 'root' sections of the lattice. + * @param factory Object to create simulators and state spaces. 
+ * @param circuit The circuit to be simulated. + * @param parts Lattice sections to be simulated. + * @param bitstrings List of output states to simulate, as bitstrings. + * @param results Output vector of amplitudes. After a successful run, this + * will be populated with amplitudes for each state in 'bitstrings'. + * @return True if the simulation completed successfully; false otherwise. + */ + template + static bool Run(const Parameter& param, const Factory& factory, + const Circuit& circuit, const std::vector& parts, + const std::vector& bitstrings, + std::vector>& results) { + if (circuit.num_qubits != parts.size()) { + IO::errorf("parts size is not equal to the number of qubits."); + return false; + } + + double t0 = 0.0; + + if (param.verbosity > 0) { + t0 = GetTime(); + } + + HybridData hd; + bool rc = HybridSimulator::SplitLattice(parts, circuit.gates, hd); + + if (!rc) { + return false; + } + + if (hd.num_gatexs < param.num_prefix_gatexs + param.num_root_gatexs) { + IO::errorf("error: num_prefix_gates (%u) plus num_root gates (%u) is " + "greater than num_gates_on_the_cut (%u).\n", + param.num_prefix_gatexs, param.num_root_gatexs, + hd.num_gatexs); + return false; + } + + if (param.verbosity > 0) { + PrintInfo(param, hd); + } + + auto fgates0 = Fuser::FuseGates(param, hd.num_qubits0, hd.gates0); + if (fgates0.size() == 0 && hd.gates0.size() > 0) { + return false; + } + + auto fgates1 = Fuser::FuseGates(param, hd.num_qubits1, hd.gates1); + if (fgates1.size() == 0 && hd.gates1.size() > 0) { + return false; + } + + rc = HybridSimulator(param.num_threads).Run( + param, factory, hd, parts, fgates0, fgates1, bitstrings, results); + + if (rc && param.verbosity > 0) { + double t1 = GetTime(); + IO::messagef("time elapsed %g seconds.\n", t1 - t0); + } + + return rc; + } + + private: + static void PrintInfo(const Parameter& param, const HybridData& hd) { + unsigned num_suffix_gates = + hd.num_gatexs - param.num_prefix_gatexs - param.num_root_gatexs; + + 
IO::messagef("part 0: %u, part 1: %u\n", hd.num_qubits0, hd.num_qubits1); + IO::messagef("%u gates on the cut\n", hd.num_gatexs); + IO::messagef("breakup: %up+%ur+%us\n", param.num_prefix_gatexs, + param.num_root_gatexs, num_suffix_gates); + } +}; + +} // namespace qsim + +#endif // RUN_QSIMH_H_ diff --git a/tpls/qsim/seqfor.h b/tpls/qsim/seqfor.h new file mode 100644 index 0000000..3ebf07c --- /dev/null +++ b/tpls/qsim/seqfor.h @@ -0,0 +1,68 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SEQFOR_H_ +#define SEQFOR_H_ + +#include +#include +#include + +namespace qsim { + +/** + * Helper struct for executing for loops in series. + */ +struct SequentialFor { + explicit SequentialFor(unsigned num_threads) {} + + // SequentialFor does not have any state. So all its methods can be static. + + static uint64_t GetIndex0(uint64_t size, unsigned thread_id) { + return 0; + } + + static uint64_t GetIndex1(uint64_t size, unsigned thread_id) { + return size; + } + + template + static void Run(uint64_t size, Function&& func, Args&&... args) { + for (uint64_t i = 0; i < size; ++i) { + func(1, 0, i, args...); + } + } + + template + static std::vector RunReduceP( + uint64_t size, Function&& func, Op&& op, Args&&...
args) { + typename Op::result_type result = 0; + + for (uint64_t i = 0; i < size; ++i) { + result = op(result, func(1, 0, i, args...)); + } + + return std::vector(1, result); + } + + template + static typename Op::result_type RunReduce(uint64_t size, Function&& func, + Op&& op, Args&&... args) { + return RunReduceP(size, func, std::move(op), args...)[0]; + } +}; + +} // namespace qsim + +#endif // SEQFOR_H_ diff --git a/tpls/qsim/simmux.h b/tpls/qsim/simmux.h new file mode 100644 index 0000000..d3c4074 --- /dev/null +++ b/tpls/qsim/simmux.h @@ -0,0 +1,44 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMMUX_H_ +#define SIMMUX_H_ + +#ifdef __AVX512F__ +# include "simulator_avx512.h" + namespace qsim { + template + using Simulator = SimulatorAVX512; + } +#elif __AVX2__ +# include "simulator_avx.h" + namespace qsim { + template + using Simulator = SimulatorAVX; + } +#elif __SSE4_1__ +# include "simulator_sse.h" + namespace qsim { + template + using Simulator = SimulatorSSE; + } +#else +# include "simulator_basic.h" + namespace qsim { + template + using Simulator = SimulatorBasic; + } +#endif + +#endif // SIMMUX_H_ diff --git a/tpls/qsim/simmux_gpu.h b/tpls/qsim/simmux_gpu.h new file mode 100644 index 0000000..1f0bb59 --- /dev/null +++ b/tpls/qsim/simmux_gpu.h @@ -0,0 +1,30 @@ +// Copyright 2023 Google LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMMUX_GPU_H_ +#define SIMMUX_GPU_H_ + +#ifdef __CUSTATEVEC__ +# include "simulator_custatevec.h" + namespace qsim { + using SimulatorGpu = SimulatorCuStateVec<>; + } +#else +# include "simulator_cuda.h" + namespace qsim { + using SimulatorGpu = SimulatorCUDA<>; + } +#endif + +#endif // SIMMUX_GPU_H_ diff --git a/tpls/qsim/simulator.h b/tpls/qsim/simulator.h new file mode 100644 index 0000000..eff5441 --- /dev/null +++ b/tpls/qsim/simulator.h @@ -0,0 +1,516 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_H_ +#define SIMULATOR_H_ + +#include + +#include "bits.h" + +namespace qsim { + +/** + * Base class for simulator classes. + */ +class SimulatorBase { + protected: + // The following template parameters are used for functions below. + // H - the number of high (target) qubits.
+ // L - the number of low (target) qubits. + // R - SIMD register width in floats. + + // Fills the table of masks (ms) that is used to calculate base state indices + // and the table of offset indices (xss) that is used to access the state + // vector entries in matrix-vector multiplication functions. This function is + // used in simulator_basic.h, simulator_sse.h and simulator_avx.h (no bmi2 + // version). + template + static void FillIndices(unsigned num_qubits, const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + constexpr unsigned hsize = 1 << H; + + if (H == 0) { + ms[0] = uint64_t(-1); + xss[0] = 0; + } else { + uint64_t xs[H + 1]; + + xs[0] = uint64_t{1} << (qs[L] + 1); + ms[0] = (uint64_t{1} << qs[L]) - 1; + for (unsigned i = 1; i < H; ++i) { + xs[i] = uint64_t{1} << (qs[L + i] + 1); + ms[i] = ((uint64_t{1} << qs[L + i]) - 1) ^ (xs[i - 1] - 1); + } + ms[H] = ((uint64_t{1} << num_qubits) - 1) ^ (xs[H - 1] - 1); + + for (unsigned i = 0; i < hsize; ++i) { + uint64_t a = 0; + for (uint64_t k = 0; k < H; ++k) { + a += xs[k] * ((i >> k) & 1); + } + xss[i] = a; + } + } + } + + // Fills gate matrix entries for gates with low qubits. + template + static void FillMatrix(unsigned qmaskl, const fp_type* matrix, fp_type* w) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + constexpr unsigned rsize = 1 << R; + + unsigned s = 0; + + for (unsigned i = 0; i < hsize; ++i) { + for (unsigned j = 0; j < gsize; ++j) { + unsigned p0 = 2 * i * lsize * gsize + 2 * lsize * (j / lsize); + + for (unsigned k = 0; k < rsize; ++k) { + unsigned l = bits::CompressBits(k, R, qmaskl); + unsigned p = p0 + 2 * (gsize * l + (j + l) % lsize); + + w[s + 0] = matrix[p]; + w[s + rsize] = matrix[p + 1]; + + ++s; + } + + s += rsize; + } + } + } + + // Fills gate matrix entries for controlled gates with high target qubits + // and low control qubits. 
+ template + static void FillControlledMatrixH(uint64_t cvalsl, uint64_t cmaskl, + const fp_type* matrix, fp_type* w) { + constexpr unsigned hsize = 1 << H; + constexpr unsigned rsize = 1 << R; + + unsigned s = 0; + + for (unsigned i = 0; i < hsize; ++i) { + for (unsigned j = 0; j < hsize; ++j) { + unsigned p = hsize * i + j; + fp_type v = i == j ? 1 : 0; + + for (unsigned k = 0; k < rsize; ++k) { + w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v; + w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0; + + ++s; + } + + s += rsize; + } + } + } + + // Fills gate matrix entries for controlled gates with low target qubits + // and low control qubits. + template + static void FillControlledMatrixL(uint64_t cvalsl, uint64_t cmaskl, + unsigned qmaskl, const fp_type* matrix, + fp_type* w) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + constexpr unsigned rsize = 1 << R; + + unsigned s = 0; + + for (unsigned i = 0; i < hsize; ++i) { + for (unsigned j = 0; j < gsize; ++j) { + unsigned p0 = i * lsize * gsize + lsize * (j / lsize); + + for (unsigned k = 0; k < rsize; ++k) { + unsigned l = bits::CompressBits(k, R, qmaskl); + unsigned p = p0 + gsize * l + (j + l) % lsize; + + fp_type v = p / gsize == p % gsize ? 1 : 0; + + w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v; + w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0; + + ++s; + } + + s += rsize; + } + } + } + +/* + The GetMasks* functions below provide various masks and related values. + GetMasks1, GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6 are + used in simulator_avx.h (BMI2 version) and in simulator_avx512.h. GetMasks7, + GetMasks8, GetMasks9, GetMasks10 and GetMasks11 are used in simulator_avx.h + (no BMI2 version) and in simulator_sse.h. + + imaskh - inverted mask of high qubits (high control and target qubits). + qmaskh - mask of high qubits (high target qubits). 
+ cvalsh - control bit values of high control qubits placed in correct + positions. + cvalsl - control bit values of low control qubits placed in correct positions. + cmaskh - mask of high control qubits. + cmaskl - mask of low control qubits. + qmaskl - mask of low qubits (low target qubits). + cl - the number of low control qubits. + + Note that imaskh, qmaskh and cvalsh are multiplied by two in GetMasks1, + GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6. +*/ + + struct Masks1 { + uint64_t imaskh; + uint64_t qmaskh; + }; + + template + static Masks1 GetMasks1(const std::vector& qs) { + uint64_t qmaskh = 0; + + for (unsigned i = 0; i < H; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh}; + } + + struct Masks2 { + uint64_t imaskh; + uint64_t qmaskh; + unsigned qmaskl; + }; + + template + static Masks2 GetMasks2(const std::vector& qs) { + uint64_t qmaskh = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (unsigned i = L; i < H + L; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh, qmaskl}; + } + + struct Masks3 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + }; + + template + static Masks3 GetMasks3(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + + for (unsigned i = 0; i < H; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + cmaskh |= uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh}; + } + + struct Masks4 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + uint64_t cvalsl; + uint64_t cmaskl; + unsigned cl; + }; + + template + static Masks4 GetMasks4(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t 
cvals) { + unsigned cl = 0; + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl = 0; + + for (unsigned i = 0; i < H; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, cl}; + } + + struct Masks5 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + unsigned qmaskl; + }; + + template + static Masks5 GetMasks5(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (unsigned i = L; i < H + L; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + cmaskh |= uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh, qmaskl}; + } + + struct Masks6 { + uint64_t imaskh; + uint64_t qmaskh; + uint64_t cvalsh; + uint64_t cvalsl; + uint64_t cmaskl; + unsigned qmaskl; + unsigned cl; + }; + + template + static Masks6 GetMasks6(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + unsigned cl = 0; + uint64_t qmaskh = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (unsigned i = L; i < H + L; ++i) { + qmaskh |= uint64_t{1} << qs[i]; + } + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, 
cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); + + return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, qmaskl, cl}; + } + + struct Masks7 { + uint64_t cvalsh; + uint64_t cmaskh; + }; + + static Masks7 GetMasks7(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t cmaskh = 0; + + for (auto q : cqs) { + cmaskh |= uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + return {cvalsh, cmaskh}; + } + + struct Masks8 { + uint64_t cvalsh; + uint64_t cmaskh; + uint64_t cvalsl; + uint64_t cmaskl; + }; + + template + static Masks8 GetMasks8(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + unsigned cl = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl = 0; + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + return {cvalsh, cmaskh, cvalsl, cmaskl}; + } + + struct Masks9 { + uint64_t cvalsh; + uint64_t cmaskh; + unsigned qmaskl; + }; + + template + static Masks9 GetMasks9(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + uint64_t cmaskh = 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (auto q : cqs) { + cmaskh |= uint64_t{1} << q; + } + + uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + return {cvalsh, cmaskh, qmaskl}; + } + + struct Masks10 { + uint64_t cvalsh; + uint64_t cmaskh; + uint64_t cvalsl; + uint64_t cmaskl; + unsigned qmaskl; + }; + + template + static Masks10 GetMasks10(unsigned num_qubits, + const std::vector& qs, + const std::vector& cqs, uint64_t cvals) { + unsigned cl = 0; + uint64_t cmaskh = 0; + uint64_t cmaskl 
= 0; + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + for (auto q : cqs) { + if (q >= R) { + cmaskh |= uint64_t{1} << q; + } else { + ++cl; + cmaskl |= uint64_t{1} << q; + } + } + + uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); + uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); + + return {cvalsh, cmaskh, cvalsl, cmaskl, qmaskl}; + } + + struct Masks11 { + unsigned qmaskl; + }; + + template + static Masks11 GetMasks11(const std::vector& qs) { + unsigned qmaskl = 0; + + for (unsigned i = 0; i < L; ++i) { + qmaskl |= 1 << qs[i]; + } + + return {qmaskl}; + } + + template + static unsigned MaskedAdd( + unsigned a, unsigned b, unsigned mask, unsigned lsize) { + unsigned c = bits::CompressBits(a, R, mask); + return bits::ExpandBits((c + b) % lsize, R, mask); + } +}; + +template <> +inline void SimulatorBase::FillIndices<0, 1>(unsigned num_qubits, + const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + ms[0] = -1; + xss[0] = 0; +} + +template <> +inline void SimulatorBase::FillIndices<0, 2>(unsigned num_qubits, + const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + ms[0] = -1; + xss[0] = 0; +} + +template <> +inline void SimulatorBase::FillIndices<0, 3>(unsigned num_qubits, + const std::vector& qs, + uint64_t* ms, uint64_t* xss) { + ms[0] = -1; + xss[0] = 0; +} + +} // namespace qsim + +#endif // SIMULATOR_H_ diff --git a/tpls/qsim/simulator_avx.h b/tpls/qsim/simulator_avx.h new file mode 100644 index 0000000..9742849 --- /dev/null +++ b/tpls/qsim/simulator_avx.h @@ -0,0 +1,1363 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_AVX_H_ +#define SIMULATOR_AVX_H_ + +#include + +#include +#include +#include +#include + +#include "simulator.h" +#include "statespace_avx.h" + +namespace qsim { + +/** + * Quantum circuit simulator with AVX vectorization. + */ +template +class SimulatorAVX final : public SimulatorBase { + public: + using StateSpace = StateSpaceAVX; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + template + explicit SimulatorAVX(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using AVX instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 0: + ApplyGateH<0>(qs, matrix, state); + break; + case 1: + if (qs[0] > 2) { + ApplyGateH<1>(qs, matrix, state); + } else { + ApplyGateL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 2) { + ApplyGateH<2>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<1, 1>(qs, matrix, state); + } else { + ApplyGateL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 2) { + ApplyGateH<3>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<2, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<1, 2>(qs, matrix, state); + } else { + ApplyGateL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 2) { + ApplyGateH<4>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<3, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<2, 2>(qs, matrix, state); + } else { + ApplyGateL<1, 3>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 2) { + ApplyGateH<5>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<4, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<3, 2>(qs, matrix, state); + } else { + ApplyGateL<2, 3>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 2) { + ApplyGateH<6>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<5, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<4, 2>(qs, matrix, state); + } else { + ApplyGateL<3, 3>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using AVX instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 0: + if (cqs[0] > 2) { + ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state); + } + break; + case 1: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 2: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 3: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { 
+ if (cqs[0] > 2) { + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 4: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Computes the expectation value of an operator using AVX instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 1: + if (qs[0] > 2) { + return ExpectationValueH<1>(qs, matrix, state); + } else { + return ExpectationValueL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 2) { + return ExpectationValueH<2>(qs, matrix, state); + } else if (qs[1] > 2) { + return ExpectationValueL<1, 1>(qs, matrix, state); + } else { + return ExpectationValueL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 2) { + return ExpectationValueH<3>(qs, matrix, state); + } else if (qs[1] > 2) { + return ExpectationValueL<2, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + return ExpectationValueL<1, 2>(qs, matrix, state); + } else { + return ExpectationValueL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 2) { + return ExpectationValueH<4>(qs, matrix, state); + } else if (qs[1] > 2) { + return ExpectationValueL<3, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + return ExpectationValueL<2, 2>(qs, matrix, state); + } else { + return ExpectationValueL<1, 3>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 2) { + return ExpectationValueH<5>(qs, matrix, state); + } else if (qs[1] > 2) { + return ExpectationValueL<4, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + return ExpectationValueL<3, 2>(qs, matrix, state); + } else { + return ExpectationValueL<2, 3>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 2) { + return ExpectationValueH<6>(qs, matrix, state); + } else if (qs[1] > 2) { + return ExpectationValueL<5, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + return ExpectationValueL<4, 2>(qs, matrix, state); + } else { + return ExpectationValueL<3, 3>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + + return 0; + } + + /** + * @return The size of SIMD register if applicable. 
+ */ + static unsigned SIMDRegisterSize() { + return 8; + } + + private: +#ifdef __BMI2__ + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + auto m = GetMasks1(qs); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); + + unsigned k = 3 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 rn, in; + __m256 rs[hsize], is[hsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256 w[1 << (1 + 2 * H)]; + + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + const __m256i* idx, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H + cqs.size(); + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); + } + } + + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn)); + __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in)); + + re += detail::HorizontalSumAVX(v_re); + im += detail::HorizontalSumAVX(v_im); + } + + return std::complex{re, im}; + }; + + auto m = GetMasks1(qs); + 
+ unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get()); + } + + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, + const fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + unsigned m = lsize * k; + + __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); + __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX(v_re); + im += detail::HorizontalSumAVX(v_im); + } + + return std::complex{re, im}; + }; + + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + 
FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get()); + } + +#else // __BMI2__ + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, const __m256i* idx, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); + + FillIndices(state.num_qubits(), qs, ms, xss); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, idx, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + if ((ii & cmaskh) != cvalsh) return; + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 rn, in; + __m256 rs[hsize], is[hsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + if ((ii & cmaskh) != cvalsh) return; + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256 w[1 << (1 + 2 * H)]; + + auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, const __m256i* idx, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + if ((ii & cmaskh) != cvalsh) return; + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + if (CH) { + auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get()); + } else { + auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get()); + } + } + + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn)); + __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in)); + + re += detail::HorizontalSumAVX(v_re); + im += 
detail::HorizontalSumAVX(v_im); + } + + return std::complex{re, im}; + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); + } + + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, const __m256i* idx, + const fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + i *= 8; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + unsigned m = lsize * k; + + __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); + __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); + + 
re += detail::HorizontalSumAVX(v_re); + im += detail::HorizontalSumAVX(v_im); + } + + return std::complex{re, im}; + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); + + FillIndices(state.num_qubits(), qs, ms, xss); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 3 + H; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, idx, state.get()); + } + +#endif // __BMI2__ + + template + static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) { + constexpr unsigned lsize = 1 << L; + + for (unsigned i = 0; i < lsize - 1; ++i) { + unsigned p[8]; + + for (unsigned j = 0; j < 8; ++j) { + p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); + } + + idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); + } + } + + For for_; +}; + +} // namespace qsim + +#endif // SIMULATOR_AVX_H_ diff --git a/tpls/qsim/simulator_avx512.h b/tpls/qsim/simulator_avx512.h new file mode 100644 index 0000000..21a2e9d --- /dev/null +++ b/tpls/qsim/simulator_avx512.h @@ -0,0 +1,846 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef SIMULATOR_AVX512_H_ +#define SIMULATOR_AVX512_H_ + +#include + +#include +#include +#include +#include + +#include "simulator.h" +#include "statespace_avx512.h" + +namespace qsim { + +/** + * Quantum circuit simulator with AVX512 vectorization. + */ +template +class SimulatorAVX512 final : public SimulatorBase { + public: + using StateSpace = StateSpaceAVX512; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + template + explicit SimulatorAVX512(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 0: + ApplyGateH<0>(qs, matrix, state); + break; + case 1: + if (qs[0] > 3) { + ApplyGateH<1>(qs, matrix, state); + } else { + ApplyGateL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 3) { + ApplyGateH<2>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<1, 1>(qs, matrix, state); + } else { + ApplyGateL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 3) { + ApplyGateH<3>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<2, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<1, 2>(qs, matrix, state); + } else { + ApplyGateL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 3) { + ApplyGateH<4>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<3, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<2, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<1, 3>(qs, matrix, state); + } else { + ApplyGateL<0, 4>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 3) { + ApplyGateH<5>(qs, matrix, state); + } else if (qs[1] > 
3) { + ApplyGateL<4, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<3, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<2, 3>(qs, matrix, state); + } else { + ApplyGateL<1, 4>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 3) { + ApplyGateH<6>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<5, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<4, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<3, 3>(qs, matrix, state); + } else { + ApplyGateL<2, 4>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . 
+ + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 0: + if (cqs[0] > 3) { + ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state); + } + break; + case 1: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 2: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 3: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 4: + if (qs[0] > 3) { + if (cqs[0] > 3) { + 
ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[3] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Computes the expectation value of an operator using AVX512 instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 1: + if (qs[0] > 3) { + return ExpectationValueH<1>(qs, matrix, state); + } else { + return ExpectationValueL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 3) { + return ExpectationValueH<2>(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValueL<1, 1>(qs, matrix, state); + } else { + return ExpectationValueL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 3) { + return ExpectationValueH<3>(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValueL<2, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValueL<1, 2>(qs, matrix, state); + } else { + return ExpectationValueL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 3) { + return ExpectationValueH<4>(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValueL<3, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValueL<2, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + return ExpectationValueL<1, 3>(qs, matrix, state); + } else { + return ExpectationValueL<0, 4>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 3) { + return ExpectationValueH<5>(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValueL<4, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValueL<3, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + return ExpectationValueL<2, 3>(qs, matrix, state); + } else { + return ExpectationValueL<1, 4>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 3) { + return ExpectationValueH<6>(qs, matrix, state); + } else if (qs[1] > 3) { + return ExpectationValueL<5, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + return ExpectationValueL<4, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + return ExpectationValueL<3, 3>(qs, matrix, state); + } else { + return ExpectationValueL<2, 4>(qs, matrix, state); + } + break; + default: + // Not implemented. 
+ break; + } + + return 0; + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 16; + } + + private: + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 ru, iu, rn, in; + __m512 rs[hsize], is[hsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + auto m = GetMasks1(qs); + + unsigned k = 4 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m512 rn, in; + __m512 rs[gsize], is[gsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 ru, iu, rn, in; + __m512 rs[hsize], is[hsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); + + unsigned k = 4 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 rn, in; + __m512 rs[hsize], is[hsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512 w[1 << (1 + 2 * H)]; + + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + const __m512i* idx, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m512 rn, in; + __m512 rs[gsize], is[gsize]; + + auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; + + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H + cqs.size(); + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); + } + } + + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 ru, iu, rn, in; + __m512 rs[hsize], is[hsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); + + j += 2; + } + + __m512 v_re = _mm512_fmadd_ps(is[k], in, _mm512_mul_ps(rs[k], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[k], rn, _mm512_mul_ps(rs[k], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + auto m = 
GetMasks1(qs); + + unsigned k = 4 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get()); + } + + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, + const fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m512 rn, in; + __m512 rs[gsize], is[gsize]; + + auto p0 = rstate + _pdep_u64(i, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); + } + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + unsigned m = lsize * k; + + __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); + __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); + + re += detail::HorizontalSumAVX512(v_re); + im += detail::HorizontalSumAVX512(v_im); + } + + return std::complex{re, im}; + }; + + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + 
FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned r = 4 + H; + unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return + for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get()); + } + + template + static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) { + constexpr unsigned lsize = 1 << L; + + for (unsigned i = 0; i < lsize; ++i) { + unsigned p[16]; + + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + } + + For for_; +}; + +} // namespace qsim + +#endif // SIMULATOR_AVX512_H_ diff --git a/tpls/qsim/simulator_basic.h b/tpls/qsim/simulator_basic.h new file mode 100644 index 0000000..752eeb5 --- /dev/null +++ b/tpls/qsim/simulator_basic.h @@ -0,0 +1,349 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_BASIC_H_ +#define SIMULATOR_BASIC_H_ + +#include +#include +#include +#include + +#include "simulator.h" +#include "statespace_basic.h" + +namespace qsim { + +/** + * Quantum circuit simulator without vectorization. 
+ */ +template +class SimulatorBasic final : public SimulatorBase { + public: + using StateSpace = StateSpaceBasic; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + template + explicit SimulatorBasic(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using non-vectorized instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 0: + ApplyGateH<0>(qs, matrix, state); + break; + case 1: + ApplyGateH<1>(qs, matrix, state); + break; + case 2: + ApplyGateH<2>(qs, matrix, state); + break; + case 3: + ApplyGateH<3>(qs, matrix, state); + break; + case 4: + ApplyGateH<4>(qs, matrix, state); + break; + case 5: + ApplyGateH<5>(qs, matrix, state); + break; + case 6: + ApplyGateH<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using non-vectorized instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 0: + ApplyControlledGateH<0>(qs, cqs, cvals, matrix, state); + break; + case 1: + ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; + } + } + + /** + * Computes the expectation value of an operator using non-vectorized + * instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 1: + return ExpectationValueH<1>(qs, matrix, state); + break; + case 2: + return ExpectationValueH<2>(qs, matrix, state); + break; + case 3: + return ExpectationValueH<3>(qs, matrix, state); + break; + case 4: + return ExpectationValueH<4>(qs, matrix, state); + break; + case 5: + return ExpectationValueH<5>(qs, matrix, state); + break; + case 6: + return ExpectationValueH<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; + } + + return 0; + } + + /** + * @return The size of SIMD register if applicable. 
+ */ + static unsigned SIMDRegisterSize() { + return 1; + } + + private: + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + fp_type rn, in; + fp_type rs[hsize], is[hsize]; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; + + j += 2; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, state.get()); + } + + template + void ApplyControlledGateH(const std::vector& qs, + const std::vector& cqs, + uint64_t cvals, const fp_type* matrix, + State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + uint64_t cvalsh, uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + fp_type rn, in; + fp_type rs[hsize], is[hsize]; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + if ((ii & cmaskh) == cvalsh) { + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; + + j += 2; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; + } + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + + unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); + } + + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + fp_type rn, in; + fp_type rs[hsize], is[hsize]; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); + } + + double re = 0; + double im = 0; + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; + + j += 2; + } + + re += rs[k] * rn + is[k] * in; + im += rs[k] * in - is[k] * rn; + } + + return std::complex{re, im}; + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); + } + + For for_; +}; + +} // namespace qsim + +#endif // SIMULATOR_BASIC_H_ diff --git a/tpls/qsim/simulator_cuda.h b/tpls/qsim/simulator_cuda.h new file mode 100644 index 0000000..5743bea --- /dev/null +++ b/tpls/qsim/simulator_cuda.h @@ -0,0 +1,923 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_CUDA_H_ +#define SIMULATOR_CUDA_H_ + +#include "simulator_cuda_kernels.h" + +#include +#include +#include +#include +#include + +#include "bits.h" +#include "statespace_cuda.h" + +namespace qsim { + +/** + * Quantum circuit simulator with GPU vectorization. + */ +template +class SimulatorCUDA final { + private: + using idx_type = uint64_t; + using Complex = qsim::Complex; + + // The maximum buffer size for indices and gate matrices. + // The maximum gate matrix size (for 6-qubit gates) is + // 2 * 2^6 * 2^6 * sizeof(FP) = 8192 * sizeof(FP). The maximum index size is + // 128 * sizeof(idx_type) + 96 * sizeof(unsigned). + static constexpr unsigned max_buf_size = 8192 * sizeof(FP) + + 128 * sizeof(idx_type) + 96 * sizeof(unsigned); + + public: + using StateSpace = StateSpaceCUDA; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + SimulatorCUDA() : scratch_(nullptr), scratch_size_(0) { + ErrorCheck(cudaMalloc(&d_ws, max_buf_size)); + } + + ~SimulatorCUDA() { + ErrorCheck(cudaFree(d_ws)); + + if (scratch_ != nullptr) { + ErrorCheck(cudaFree(scratch_)); + } + } + + /** + * Applies a gate using CUDA instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + if (qs.size() == 0) { + ApplyGateH<0>(qs, matrix, state); + } else if (qs[0] > 4) { + switch (qs.size()) { + case 1: + ApplyGateH<1>(qs, matrix, state); + break; + case 2: + ApplyGateH<2>(qs, matrix, state); + break; + case 3: + ApplyGateH<3>(qs, matrix, state); + break; + case 4: + ApplyGateH<4>(qs, matrix, state); + break; + case 5: + ApplyGateH<5>(qs, matrix, state); + break; + case 6: + ApplyGateH<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; + } + } else { + switch (qs.size()) { + case 1: + ApplyGateL<1>(qs, matrix, state); + break; + case 2: + ApplyGateL<2>(qs, matrix, state); + break; + case 3: + ApplyGateL<3>(qs, matrix, state); + break; + case 4: + ApplyGateL<4>(qs, matrix, state); + break; + case 5: + ApplyGateL<5>(qs, matrix, state); + break; + case 6: + ApplyGateL<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; + } + } + } + + /** + * Applies a controlled gate using CUDA instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + if (cqs[0] < 5) { + switch (qs.size()) { + case 0: + ApplyControlledGateL<0>(qs, cqs, cvals, matrix, state); + break; + case 1: + ApplyControlledGateL<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateL<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateL<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateL<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. 
+ break; + } + } else { + if (qs.size() == 0) { + ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); + } else if (qs[0] > 4) { + switch (qs.size()) { + case 1: + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; + } + } else { + switch (qs.size()) { + case 1: + ApplyControlledGateLH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateLH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateLH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateLH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; + } + } + } + } + + /** + * Computes the expectation value of an operator using CUDA instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + if (qs[0] > 4) { + switch (qs.size()) { + case 1: + return ExpectationValueH<1>(qs, matrix, state); + case 2: + return ExpectationValueH<2>(qs, matrix, state); + case 3: + return ExpectationValueH<3>(qs, matrix, state); + case 4: + return ExpectationValueH<4>(qs, matrix, state); + case 5: + return ExpectationValueH<5>(qs, matrix, state); + case 6: + return ExpectationValueH<6>(qs, matrix, state); + default: + // Not implemented. 
+ break; + } + } else { + switch (qs.size()) { + case 1: + return ExpectationValueL<1>(qs, matrix, state); + case 2: + return ExpectationValueL<2>(qs, matrix, state); + case 3: + return ExpectationValueL<3>(qs, matrix, state); + case 4: + return ExpectationValueL<4>(qs, matrix, state); + case 5: + return ExpectationValueL<5>(qs, matrix, state); + case 6: + return ExpectationValueL<6>(qs, matrix, state); + default: + // Not implemented. + break; + } + } + + return 0; + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 32; + } + + private: + // The following indices are used in kernels. + // xss - indices to access the state vector entries in global memory. + // ms - masks to access the state vector entries in global memory. + // tis - indices to access the state vector entries in shared memory + // in the presence of low gate qubits. + // qis - indices to access the state vector entries in shared memory + // in the presence of low gate qubits. + // cis - additional indices to access the state vector entries in global + // memory in the presence of low control qubits. 
+ + template + struct IndicesH { + static constexpr unsigned gsize = 1 << G; + static constexpr unsigned matrix_size = 2 * gsize * gsize * sizeof(fp_type); + static constexpr unsigned xss_size = 32 * sizeof(idx_type) * (1 + (G == 6)); + static constexpr unsigned ms_size = 32 * sizeof(idx_type); + static constexpr unsigned xss_offs = matrix_size; + static constexpr unsigned ms_offs = xss_offs + xss_size; + static constexpr unsigned buf_size = ms_offs + ms_size; + + IndicesH(char* p) + : xss((idx_type*) (p + xss_offs)), ms((idx_type*) (p + ms_offs)) {} + + idx_type* xss; + idx_type* ms; + }; + + template + struct IndicesL : public IndicesH { + using Base = IndicesH; + static constexpr unsigned qis_size = 32 * sizeof(unsigned) * (1 + (G == 6)); + static constexpr unsigned tis_size = 32 * sizeof(unsigned); + static constexpr unsigned qis_offs = Base::buf_size; + static constexpr unsigned tis_offs = qis_offs + qis_size; + static constexpr unsigned buf_size = tis_offs + tis_size; + + IndicesL(char* p) + : Base(p), qis((unsigned*) (p + qis_offs)), + tis((unsigned*) (p + tis_offs)) {} + + unsigned* qis; + unsigned* tis; + }; + + template + struct IndicesLC : public IndicesL { + using Base = IndicesL; + static constexpr unsigned cis_size = 32 * sizeof(idx_type); + static constexpr unsigned cis_offs = Base::buf_size; + static constexpr unsigned buf_size = cis_offs + cis_size; + + IndicesLC(char* p) : Base(p), cis((idx_type*) (p + cis_offs)) {} + + idx_type* cis; + }; + + struct DataC { + idx_type cvalsh; + unsigned num_aqs; + unsigned num_effective_qs; + unsigned remaining_low_cqs; + }; + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesH h_i(h_ws); + GetIndicesH(num_qubits, qs, qs.size(), h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + G; + unsigned n 
= num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 64U; + unsigned blocks = std::max(1U, size / 2); + + IndicesH d_i(d_ws); + + ApplyGateH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesL h_i(h_ws); + auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + num_effective_qs; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; + + IndicesL d_i(d_ws); + + ApplyGateL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + 1 << num_effective_qs, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, idx_type cvals, + const fp_type* matrix, State& state) const { + unsigned aqs[64]; + idx_type cmaskh = 0; + unsigned num_qubits = state.num_qubits(); + + IndicesH h_i(h_ws); + + unsigned num_aqs = GetHighQubits(qs, 0, cqs, 0, 0, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, h_i.ms); + GetXss(num_qubits, qs, qs.size(), h_i.xss); + + idx_type cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? 
num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 64U; + unsigned blocks = std::max(1U, size / 2); + + IndicesH d_i(d_ws); + + ApplyControlledGateH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); + } + + template + void ApplyControlledGateLH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesL h_i(h_ws); + auto d = GetIndicesLC(num_qubits, qs, cqs, cvals, h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; + + IndicesL d_i(d_ws); + + ApplyControlledGateLH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesLC h_i(h_ws); + auto d = GetIndicesLCL(num_qubits, qs, cqs, cvals, h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + G + cqs.size(); + unsigned n = num_qubits > k ? 
num_qubits - k : 0; + unsigned size = unsigned{1} << n; + unsigned threads = 32; + unsigned blocks = size; + + IndicesLC d_i(d_ws); + + ApplyControlledGateL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, + d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, + 1 << (5 - d.remaining_low_cqs), state.get()); + } + + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesH h_i(h_ws); + GetIndicesH(num_qubits, qs, qs.size(), h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + G; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + + unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); + unsigned threads = 64U; + unsigned blocks = std::max(1U, (size / 2) >> s); + unsigned num_iterations_per_block = 1 << s; + + constexpr unsigned m = 16; + + Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); + Complex* d_res2 = d_res1 + blocks; + + IndicesH d_i(d_ws); + + ExpectationValueH_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, + state.get(), Plus(), d_res1); + + double mul = size == 1 ? 0.5 : 1.0; + + return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); + } + + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + unsigned num_qubits = state.num_qubits(); + + IndicesL h_i(h_ws); + auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); + + std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); + ErrorCheck( + cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); + + unsigned k = 5 + num_effective_qs; + unsigned n = num_qubits > k ? num_qubits - k : 0; + unsigned size = unsigned{1} << n; + + unsigned s = std::min(n >= 13 ? 
n - 13 : 0, 5U); + unsigned threads = 32; + unsigned blocks = size >> s; + unsigned num_iterations_per_block = 1 << s; + + constexpr unsigned m = 16; + + Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); + Complex* d_res2 = d_res1 + blocks; + + IndicesL d_i(d_ws); + + ExpectationValueL_Kernel<<>>( + (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, + num_iterations_per_block, state.get(), Plus(), d_res1); + + double mul = double(1 << (5 + num_effective_qs - G)) / 32; + + return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); + } + + template + std::complex ExpectationValueReduceFinal( + unsigned blocks, double mul, + const Complex* d_res1, Complex* d_res2) const { + Complex res2[m]; + + if (blocks <= 16) { + ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex), + cudaMemcpyDeviceToHost)); + } else { + unsigned threads2 = std::min(1024U, blocks); + unsigned blocks2 = std::min(m, blocks / threads2); + + unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2)); + unsigned bytes = threads2 * sizeof(Complex); + + Reduce2Kernel<<>>( + dblocks, blocks, Plus(), Plus(), d_res1, d_res2); + + ErrorCheck(cudaMemcpy(res2, d_res2, blocks2 * sizeof(Complex), + cudaMemcpyDeviceToHost)); + + blocks = blocks2; + } + + double re = 0; + double im = 0; + + for (unsigned i = 0; i < blocks; ++i) { + re += res2[i].re; + im += res2[i].im; + } + + return {mul * re, mul * im}; + } + + template + unsigned GetHighQubits(const std::vector& qs, unsigned qi, + const std::vector& cqs, unsigned ci, + unsigned ai, idx_type& cmaskh, AQ& aqs) const { + while (1) { + if (qi < qs.size() && (ci == cqs.size() || qs[qi] < cqs[ci])) { + aqs[ai++] = qs[qi++]; + } else if (ci < cqs.size()) { + cmaskh |= idx_type{1} << cqs[ci]; + aqs[ai++] = cqs[ci++]; + } else { + break; + } + } + + return ai; + } + + template + void GetMs(unsigned num_qubits, const QS& qs, unsigned qs_size, + idx_type* ms) const { + if (qs_size == 0) { + ms[0] = idx_type(-1); + } else { + 
idx_type xs = idx_type{1} << (qs[0] + 1); + ms[0] = (idx_type{1} << qs[0]) - 1; + for (unsigned i = 1; i < qs_size; ++i) { + ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs - 1); + xs = idx_type{1} << (qs[i] + 1); + } + ms[qs_size] = ((idx_type{1} << num_qubits) - 1) ^ (xs - 1); + } + } + + template + void GetXss(unsigned num_qubits, const QS& qs, unsigned qs_size, + idx_type* xss) const { + if (qs_size == 0) { + xss[0] = 0; + } else { + unsigned g = qs_size; + unsigned gsize = 1 << qs_size; + + idx_type xs[64]; + + xs[0] = idx_type{1} << (qs[0] + 1); + for (unsigned i = 1; i < g; ++i) { + xs[i] = idx_type{1} << (qs[i] + 1); + } + + for (unsigned i = 0; i < gsize; ++i) { + idx_type a = 0; + for (unsigned k = 0; k < g; ++k) { + a += xs[k] * ((i >> k) & 1); + } + xss[i] = a; + } + } + } + + template + void GetIndicesH(unsigned num_qubits, const qs_type& qs, unsigned qs_size, + IndicesH& indices) const { + if (qs_size == 0) { + indices.ms[0] = idx_type(-1); + indices.xss[0] = 0; + } else { + unsigned g = qs_size; + unsigned gsize = 1 << qs_size; + + idx_type xs[64]; + + xs[0] = idx_type{1} << (qs[0] + 1); + indices.ms[0] = (idx_type{1} << qs[0]) - 1; + for (unsigned i = 1; i < g; ++i) { + xs[i] = idx_type{1} << (qs[i] + 1); + indices.ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs[i - 1] - 1); + } + indices.ms[g] = ((idx_type{1} << num_qubits) - 1) ^ (xs[g - 1] - 1); + + for (unsigned i = 0; i < gsize; ++i) { + idx_type a = 0; + for (unsigned k = 0; k < g; ++k) { + a += xs[k] * ((i >> k) & 1); + } + indices.xss[i] = a; + } + } + } + + template + void GetIndicesL(unsigned num_effective_qs, unsigned qmask, + IndicesL& indices) const { + for (unsigned i = num_effective_qs + 1; i < (G + 1); ++i) { + indices.ms[i] = 0; + } + + for (unsigned i = (1 << num_effective_qs); i < indices.gsize; ++i) { + indices.xss[i] = 0; + } + + for (unsigned i = 0; i < indices.gsize; ++i) { + indices.qis[i] = bits::ExpandBits(i, 5 + num_effective_qs, qmask); + } + + unsigned tmask = ((1 << (5 + 
num_effective_qs)) - 1) ^ qmask; + for (unsigned i = 0; i < 32; ++i) { + indices.tis[i] = bits::ExpandBits(i, 5 + num_effective_qs, tmask); + } + } + + template + unsigned GetIndicesL(unsigned num_qubits, const std::vector& qs, + IndicesL& indices) const { + unsigned eqs[32]; + + unsigned qmaskh = 0; + unsigned qmaskl = 0; + + unsigned qi = 0; + + while (qi < qs.size() && qs[qi] < 5) { + qmaskl |= 1 << qs[qi++]; + } + + unsigned nq = std::max(5U, num_qubits); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); + + unsigned l = 0; + unsigned ei = 0; + unsigned num_low_qs = qi; + + if (qs.size() == num_low_qs) { + while (ei < num_effective_qs && l++ < num_low_qs) { + eqs[ei] = ei + 5; + ++ei; + } + } else { + while (ei < num_effective_qs && l < num_low_qs) { + unsigned ei5 = ei + 5; + eqs[ei] = ei5; + if (qi < qs.size() && qs[qi] == ei5) { + ++qi; + qmaskh |= 1 << ei5; + } else { + ++l; + } + ++ei; + } + + while (ei < num_effective_qs) { + eqs[ei] = qs[qi++]; + qmaskh |= 1 << (ei + 5); + ++ei; + } + } + + GetIndicesH(num_qubits, eqs, num_effective_qs, indices); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); + + return num_effective_qs; + } + + template + DataC GetIndicesLC(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + IndicesL& indices) const { + unsigned aqs[64]; + unsigned eqs[32]; + + unsigned qmaskh = 0; + unsigned qmaskl = 0; + idx_type cmaskh = 0; + + unsigned qi = 0; + + while (qi < qs.size() && qs[qi] < 5) { + qmaskl |= 1 << qs[qi++]; + } + + unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); + + unsigned l = 0; + unsigned ai = 5; + unsigned ci = 0; + unsigned ei = 0; + unsigned num_low_qs = qi; + + while (ai < num_qubits && l < num_low_qs) { + aqs[ai - 5] = ai; + if (qi < qs.size() && qs[qi] == ai) { + ++qi; + eqs[ei++] = ai; + qmaskh |= 1 << (ai - ci); + } else if (ci < cqs.size() && cqs[ci] == ai) { + ++ci; 
+ cmaskh |= idx_type{1} << ai; + } else { + ++l; + eqs[ei++] = ai; + } + ++ai; + } + + unsigned i = ai; + unsigned j = qi; + + while (ei < num_effective_qs) { + eqs[ei++] = qs[j++]; + qmaskh |= 1 << (i++ - ci); + } + + unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, indices.ms); + GetXss(num_qubits, eqs, num_effective_qs, indices.xss); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); + + idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); + + return {cvalsh, num_aqs, num_effective_qs}; + } + + template + DataC GetIndicesLCL(unsigned num_qubits, const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + IndicesLC& indices) const { + unsigned aqs[64]; + unsigned eqs[32]; + + unsigned qmaskh = 0; + unsigned qmaskl = 0; + idx_type cmaskh = 0; + idx_type cmaskl = 0; + idx_type cis_mask = 0; + + unsigned qi = 0; + unsigned ci = 0; + + for (unsigned k = 0; k < 5; ++k) { + if (qi < qs.size() && qs[qi] == k) { + qmaskl |= 1 << (k - ci); + ++qi; + } else if (ci < cqs.size() && cqs[ci] == k) { + cmaskl |= idx_type{1} << k; + ++ci; + } + } + + unsigned num_low_qs = qi; + unsigned num_low_cqs = ci; + + unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); + unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); + + unsigned l = 0; + unsigned ai = 5; + unsigned ei = 0; + unsigned num_low = num_low_qs + num_low_cqs; + unsigned remaining_low_cqs = num_low_cqs; + unsigned effective_low_qs = num_low_qs; + unsigned highest_cis_bit = 0; + + while (ai < num_qubits && l < num_low) { + aqs[ai - 5] = ai; + if (qi < qs.size() && qs[qi] == ai) { + ++qi; + if ((ai - ci) > 4) { + eqs[ei++] = ai; + qmaskh |= 1 << (ai - ci); + } else { + highest_cis_bit = ai; + cis_mask |= idx_type{1} << ai; + qmaskl |= 1 << (ai - ci); + --remaining_low_cqs; + ++effective_low_qs; + } + } else if (ci < cqs.size() && cqs[ci] == ai) { + ++ci; + cmaskh |= idx_type{1} << ai; + } else { + ++l; + if 
(remaining_low_cqs == 0) { + eqs[ei++] = ai; + } else { + highest_cis_bit = ai; + cis_mask |= idx_type{1} << ai; + --remaining_low_cqs; + } + } + ++ai; + } + + unsigned i = ai; + unsigned j = effective_low_qs; + + while (ei < num_effective_qs) { + eqs[ei++] = qs[j++]; + qmaskh |= 1 << (i++ - ci); + } + + unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); + GetMs(num_qubits, aqs, num_aqs, indices.ms); + GetXss(num_qubits, eqs, num_effective_qs, indices.xss); + GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); + + idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); + idx_type cvalsl = bits::ExpandBits(idx_type(cvals), 5, cmaskl); + + cis_mask |= 31 ^ cmaskl; + highest_cis_bit = highest_cis_bit < 5 ? 5 : highest_cis_bit; + for (idx_type i = 0; i < 32; ++i) { + auto c = bits::ExpandBits(i, highest_cis_bit + 1, cis_mask); + indices.cis[i] = 2 * (c & 0xffffffe0) | (c & 0x1f) | cvalsl; + } + + return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs}; + } + + + void* AllocScratch(uint64_t size) const { + if (size > scratch_size_) { + if (scratch_ != nullptr) { + ErrorCheck(cudaFree(scratch_)); + } + + ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); + + const_cast(scratch_size_) = size; + } + + return scratch_; + } + + char* d_ws; + char h_ws0[max_buf_size]; + char* h_ws = (char*) h_ws0; + + void* scratch_; + uint64_t scratch_size_; +}; + +} // namespace qsim + +#endif // SIMULATOR_CUDA_H_ diff --git a/tpls/qsim/simulator_cuda_kernels.h b/tpls/qsim/simulator_cuda_kernels.h new file mode 100644 index 0000000..e21a9d6 --- /dev/null +++ b/tpls/qsim/simulator_cuda_kernels.h @@ -0,0 +1,683 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_CUDA_KERNELS_H_ +#define SIMULATOR_CUDA_KERNELS_H_ + +#ifdef __NVCC__ + #include + #include + + #include "util_cuda.h" +#elif __HIP__ + #include + #include "cuda2hip.h" +#endif + +namespace qsim { + +template +__global__ void ApplyGateH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 64. + + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? gsize : 32) : (G < 5 ? 
8 : 16)); + + fp_type rs[gsize], is[gsize]; + + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; + + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; + } + + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + __syncthreads(); + + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + auto p0 = rstate + 2 * ii + threadIdx.x % 32; + + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + __syncthreads(); + + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + + __syncthreads(); + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 32) = in; + } + } +} + +template +__global__ void ApplyGateL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned esize, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. + + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 
8 : 4)); + + fp_type rs[gsize], is[gsize]; + + __shared__ fp_type v[2 * gsize * rows]; + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + auto p0 = rstate + 2 * ii + threadIdx.x; + + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); + } + + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs0[m][n] = rn; + is0[m][n] = in; + } + } + + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; + } +} + +template +__global__ void ApplyControlledGateH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, unsigned num_mss, idx_type cvalsh, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 64. 
+ + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16)); + + fp_type rs[gsize], is[gsize]; + + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; + + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; + } + + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + __syncthreads(); + + idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + ii |= cvalsh; + + auto p0 = rstate + 2 * ii + threadIdx.x % 32; + + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + __syncthreads(); + + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + + __syncthreads(); + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 32) = in; + } + } +} + +template +__global__ void ApplyControlledGateLH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned num_mss, idx_type cvalsh, + unsigned esize, fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. 
+ + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4)); + + fp_type rs[gsize], is[gsize]; + + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; + + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + ii |= cvalsh; + + auto p0 = rstate + 2 * ii + threadIdx.x; + + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); + } + + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs0[m][n] = rn; + is0[m][n] = in; + } + } + + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; + } +} + +template +__global__ void ApplyControlledGateL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + 
const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, const idx_type* __restrict__ cis, + unsigned num_mss, idx_type cvalsh, unsigned esize, unsigned rwthreads, + fp_type* __restrict__ rstate) { + // blockDim.x must be equal to 32. + + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned + rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? + (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4)); + + fp_type rs[gsize], is[gsize]; + + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; + + idx_type i = 32 * idx_type{blockIdx.x}; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j < num_mss; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + ii |= cvalsh; + + auto p0 = rstate + 2 * ii + cis[threadIdx.x]; + + if (threadIdx.x < rwthreads) { + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); + } + } + + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + 
unsigned n = i / 32; + + rs0[m][n] = rn; + is0[m][n] = in; + } + } + + if (threadIdx.x < rwthreads) { + for (unsigned k = 0; k < esize; ++k) { + *(p0 + xss[k]) = rs0[threadIdx.x][k]; + *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; + } + } +} + +template +__global__ void ExpectationValueH_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, + const idx_type* __restrict__ mss, unsigned num_iterations_per_block, + const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { + // blockDim.x must be equal to 64. + + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = + G < 5 ? gsize : (sizeof(fp_type) == 4 ? (G < 6 ? 4 : 8) : 8); + + fp_type rs[gsize], is[gsize]; + + __shared__ idx_type xss[64]; + __shared__ fp_type v[2 * gsize * rows]; + + if (threadIdx.x < gsize) { + xss[threadIdx.x] = xss0[threadIdx.x]; + } + + if (G <= 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + __syncthreads(); + + double re = 0; + double im = 0; + + for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) { + idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter; + + idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0; + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + auto p0 = rstate + 2 * ii + threadIdx.x % 32; + + for (unsigned k = 0; k < gsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 32); + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0 || iter > 0) { + __syncthreads(); + + for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + + __syncthreads(); + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); 
++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + re += rs[k] * rn; + re += is[k] * in; + im += rs[k] * in; + im -= is[k] * rn; + } + } + } + + __shared__ cfp_type partial1[64]; + __shared__ cfp_type partial2[2]; + + partial1[threadIdx.x].re = re; + partial1[threadIdx.x].im = im; + + auto val = WarpReduce(partial1[threadIdx.x], op); + + if (threadIdx.x % 32 == 0) { + partial2[threadIdx.x / 32] = val; + } + + __syncthreads(); + + if (threadIdx.x == 0) { + result[blockIdx.x].re = partial2[0].re + partial2[1].re; + result[blockIdx.x].im = partial2[0].im + partial2[1].im; + } +} + +template +__global__ void ExpectationValueL_Kernel( + const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, + const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, + const unsigned* __restrict__ tis, unsigned num_iterations_per_block, + const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { + // blockDim.x must be equal to 32. + + static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); + + constexpr unsigned gsize = 1 << G; + constexpr unsigned rows = G < 5 ? gsize : (sizeof(fp_type) == 4 ? + (G < 6 ? 4 : 2) : (G < 6 ? 
2 : 1)); + + fp_type rs[gsize], is[gsize]; + + __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; + __shared__ fp_type v[2 * gsize * rows]; + + if (G < 2) { + if (threadIdx.x < 2 * gsize * gsize) { + v[threadIdx.x] = v0[threadIdx.x]; + } + } else { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + threadIdx.x]; + } + } + + double re = 0; + double im = 0; + + for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) { + idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter); + idx_type ii = i & mss[0]; + for (unsigned j = 1; j <= G; ++j) { + i *= 2; + ii |= i & mss[j]; + } + + auto p0 = rstate + 2 * ii + threadIdx.x; + + for (unsigned k = 0; k < gsize; ++k) { + rs0[threadIdx.x][k] = *(p0 + xss[k]); + is0[threadIdx.x][k] = *(p0 + xss[k] + 32); + } + + for (unsigned k = 0; k < gsize; ++k) { + unsigned i = tis[threadIdx.x] | qis[k]; + unsigned m = i & 0x1f; + unsigned n = i / 32; + + rs[k] = rs0[m][n]; + is[k] = is0[m][n]; + } + + for (unsigned s = 0; s < gsize / rows; ++s) { + if (s > 0 || iter > 0) { + for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { + v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; + } + } + + unsigned j = 0; + + for (unsigned k = rows * s; k < rows * (s + 1); ++k) { + fp_type rn = 0; + fp_type in = 0; + + for (unsigned l = 0; l < gsize; ++l) { + fp_type rm = v[j++]; + fp_type im = v[j++]; + rn += rs[l] * rm; + rn -= is[l] * im; + in += rs[l] * im; + in += is[l] * rm; + } + + re += rs[k] * rn; + re += is[k] * in; + im += rs[k] * in; + im -= is[k] * rn; + } + } + } + + __shared__ cfp_type partial[32]; + + partial[threadIdx.x].re = re; + partial[threadIdx.x].im = im; + + auto val = WarpReduce(partial[threadIdx.x], op); + + if (threadIdx.x == 0) { + result[blockIdx.x].re = val.re; + result[blockIdx.x].im = val.im; + } +} + +} // namespace qsim + +#endif // SIMULATOR_CUDA_KERNELS_H_ diff --git a/tpls/qsim/simulator_custatevec.h b/tpls/qsim/simulator_custatevec.h 
new file mode 100644 index 0000000..40d1902 --- /dev/null +++ b/tpls/qsim/simulator_custatevec.h @@ -0,0 +1,209 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_CUSTATEVEC_H_ +#define SIMULATOR_CUSTATEVEC_H_ + +#include +#include +#include + +#include +#include +#include + +#include "io.h" +#include "statespace_custatevec.h" +#include "util_custatevec.h" + +namespace qsim { + +/** + * Quantum circuit simulator using the NVIDIA cuStateVec library. + */ +template +class SimulatorCuStateVec final { + public: + using StateSpace = StateSpaceCuStateVec; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + static constexpr auto kStateType = StateSpace::kStateType; + static constexpr auto kMatrixType = StateSpace::kMatrixType; + static constexpr auto kExpectType = StateSpace::kExpectType; + static constexpr auto kComputeType = StateSpace::kComputeType; + static constexpr auto kMatrixLayout = StateSpace::kMatrixLayout; + + explicit SimulatorCuStateVec(const cublasHandle_t& cublas_handle, + const custatevecHandle_t& custatevec_handle) + : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle), + workspace_(nullptr), workspace_size_(0) {} + + ~SimulatorCuStateVec() { + ErrorCheck(cudaFree(workspace_)); + } + + /** + * Applies a gate using the NVIDIA cuStateVec library. + * @param qs Indices of the qubits affected by this gate. 
+ * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + if (qs.size() == 0) { + uint64_t size = uint64_t{1} << state.num_qubits(); + + if (StateSpace::is_float) { + cuComplex a = {matrix[0], matrix[1]}; + auto p = (cuComplex*) state.get(); + ErrorCheck(cublasCscal(cublas_handle_, size, &a, p, 1)); + } else { + cuDoubleComplex a = {matrix[0], matrix[1]}; + auto p = (cuDoubleComplex*) state.get(); + ErrorCheck(cublasZscal(cublas_handle_, size, &a, p, 1)); + } + } else { + auto workspace_size = ApplyGateWorkSpaceSize( + state.num_qubits(), qs.size(), 0, matrix); + AllocWorkSpace(workspace_size); + + ErrorCheck(custatevecApplyMatrix( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0, + (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0, + kComputeType, workspace_, workspace_size)); + } + } + + /** + * Applies a controlled gate using the NVIDIA cuStateVec library. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cmask Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cmask, + const fp_type* matrix, State& state) const { + if (qs.size() == 0) { + IO::errorf( + "error: controlled global phase gate is not implemented %s %d\n", + __FILE__, __LINE__); + exit(1); + } else { + std::vector control_bits; + control_bits.reserve(cqs.size()); + + for (std::size_t i = 0; i < cqs.size(); ++i) { + control_bits.push_back((cmask >> i) & 1); + } + + auto workspace_size = ApplyGateWorkSpaceSize( + state.num_qubits(), qs.size(), cqs.size(), matrix); + AllocWorkSpace(workspace_size); + + ErrorCheck(custatevecApplyMatrix( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0, + (int32_t*) qs.data(), qs.size(), + (int32_t*) cqs.data(), control_bits.data(), cqs.size(), + kComputeType, workspace_, workspace_size)); + } + } + + /** + * Computes the expectation value of an operator using the NVIDIA cuStateVec + * library. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto workspace_size = ExpectationValueWorkSpaceSize( + state.num_qubits(), qs.size(), matrix); + AllocWorkSpace(workspace_size); + + cuDoubleComplex eval; + + ErrorCheck(custatevecComputeExpectation( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), &eval, kExpectType, nullptr, matrix, + kMatrixType, kMatrixLayout, (int32_t*) qs.data(), qs.size(), + kComputeType, workspace_, workspace_size)); + + return {cuCreal(eval), cuCimag(eval)}; + } + + /** + * @return The size of SIMD register if applicable. 
+ */ + static unsigned SIMDRegisterSize() { + return 32; + } + + private: + size_t ApplyGateWorkSpaceSize( + unsigned num_qubits, unsigned num_targets, unsigned num_controls, + const fp_type* matrix) const { + size_t size; + + ErrorCheck(custatevecApplyMatrixGetWorkspaceSize( + custatevec_handle_, kStateType, num_qubits, matrix, + kMatrixType, kMatrixLayout, 0, num_targets, num_controls, + kComputeType, &size)); + + return size; + } + + size_t ExpectationValueWorkSpaceSize( + unsigned num_qubits, unsigned num_targets, const fp_type* matrix) const { + size_t size; + + ErrorCheck(custatevecComputeExpectationGetWorkspaceSize( + custatevec_handle_, kStateType, num_qubits, matrix, + kMatrixType, kMatrixLayout, num_targets, kComputeType, + &size)); + + return size; + } + + void* AllocWorkSpace(size_t size) const { + if (size > workspace_size_) { + if (workspace_ != nullptr) { + ErrorCheck(cudaFree(workspace_)); + } + + ErrorCheck(cudaMalloc(const_cast(&workspace_), size)); + + const_cast(workspace_size_) = size; + } + + return workspace_; + } + + const cublasHandle_t cublas_handle_; + const custatevecHandle_t custatevec_handle_; + + void* workspace_; + size_t workspace_size_; +}; + +} // namespace qsim + +#endif // SIMULATOR_CUSTATEVEC_H_ diff --git a/tpls/qsim/simulator_sse.h b/tpls/qsim/simulator_sse.h new file mode 100644 index 0000000..5256c53 --- /dev/null +++ b/tpls/qsim/simulator_sse.h @@ -0,0 +1,864 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SIMULATOR_SSE_H_ +#define SIMULATOR_SSE_H_ + +#include + +#include +#include +#include +#include + +#include "simulator.h" +#include "statespace_sse.h" + +namespace qsim { + +/** + * Quantum circuit simulator with SSE vectorization. + */ +template +class SimulatorSSE final : public SimulatorBase { + public: + using StateSpace = StateSpaceSSE; + using State = typename StateSpace::State; + using fp_type = typename StateSpace::fp_type; + + template + explicit SimulatorSSE(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using SSE instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 0: + ApplyGateH<0>(qs, matrix, state); + break; + case 1: + if (qs[0] > 1) { + ApplyGateH<1>(qs, matrix, state); + } else { + ApplyGateL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 1) { + ApplyGateH<2>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<1, 1>(qs, matrix, state); + } else { + ApplyGateL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 1) { + ApplyGateH<3>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<2, 1>(qs, matrix, state); + } else { + ApplyGateL<1, 2>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 1) { + ApplyGateH<4>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<3, 1>(qs, matrix, state); + } else { + ApplyGateL<2, 2>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 1) { + ApplyGateH<5>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<4, 1>(qs, matrix, state); + } else { + ApplyGateL<3, 2>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 1) { + ApplyGateH<6>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<5, 1>(qs, matrix, state); + } else { + ApplyGateL<4, 2>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using SSE instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . 
+ + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 0: + if (cqs[0] > 1) { + ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state); + } + break; + case 1: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 1) { + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 2: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 1) { + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 3: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 1) { + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 4: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 1) { + if (cqs[0] > 1) { + 
ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 1) { + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Computes the expectation value of an operator using SSE instructions. + * @param qs Indices of the qubits the operator acts on. + * @param matrix The operator matrix. + * @param state The state of the system. + * @return The computed expectation value. + */ + std::complex ExpectationValue(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 1: + if (qs[0] > 1) { + return ExpectationValueH<1>(qs, matrix, state); + } else { + return ExpectationValueL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 1) { + return ExpectationValueH<2>(qs, matrix, state); + } else if (qs[1] > 1) { + return ExpectationValueL<1, 1>(qs, matrix, state); + } else { + return ExpectationValueL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 1) { + return ExpectationValueH<3>(qs, matrix, state); + } else if (qs[1] > 1) { + return ExpectationValueL<2, 1>(qs, matrix, state); + } else { + return ExpectationValueL<1, 2>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 1) { + return ExpectationValueH<4>(qs, matrix, state); + } else if (qs[1] > 1) { + return ExpectationValueL<3, 1>(qs, matrix, state); + } else { + return ExpectationValueL<2, 2>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 1) { + return ExpectationValueH<5>(qs, matrix, state); + } else if (qs[1] > 1) { + return ExpectationValueL<4, 1>(qs, matrix, state); + } else { + return ExpectationValueL<3, 2>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 1) { + return ExpectationValueH<6>(qs, matrix, state); 
+ } else if (qs[1] > 1) { + return ExpectationValueL<5, 1>(qs, matrix, state); + } else { + return ExpectationValueL<4, 2>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + + return 0; + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 4; + } + + private: + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m128 ru, iu, rn, in; + __m128 rs[hsize], is[hsize]; + + i *= 4; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_mul_ps(rs[0], ru); + in = _mm_mul_ps(rs[0], iu); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, + unsigned q0, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m128 rn, in; + __m128 rs[gsize], is[gsize]; + + i *= 4; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); + + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm_mul_ps(rs[0], w[j]); + in = _mm_mul_ps(rs[0], w[j + 1]); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + 
uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); + + FillIndices(state.num_qubits(), qs, ms, xss); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, qs[0], state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m128 ru, iu, rn, in; + __m128 rs[hsize], is[hsize]; + + i *= 4; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + if ((ii & cmaskh) != cvalsh) return; + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_mul_ps(rs[0], ru); + in = _mm_mul_ps(rs[0], iu); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m128 rn, in; + __m128 rs[hsize], is[hsize]; + + i *= 4; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + if ((ii & cmaskh) != cvalsh) return; + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm_mul_ps(rs[0], w[j]); + in = _mm_mul_ps(rs[0], w[j + 1]); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H)]; + + auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned r = 2 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, unsigned q0, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m128 rn, in; + __m128 rs[gsize], is[gsize]; + + i *= 4; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + if ((ii & cmaskh) != cvalsh) return; + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); + + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm_mul_ps(rs[0], w[j]); + in = _mm_mul_ps(rs[0], w[j + 1]); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H + L)]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned r = 2 + H; + unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; + uint64_t size = uint64_t{1} << n; + + if (CH) { + auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get()); + } else { + auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get()); + } + } + + template + std::complex ExpectationValueH(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, + const fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m128 ru, iu, rn, in; + __m128 rs[hsize], is[hsize]; + + i *= 4; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_mul_ps(rs[0], ru); + in = _mm_mul_ps(rs[0], iu); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); + + j += 2; + } + + __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[k], rn), _mm_mul_ps(is[k], in)); + __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[k], in), _mm_mul_ps(is[k], rn)); + + re += detail::HorizontalSumSSE(v_re); + im += detail::HorizontalSumSSE(v_im); + } + + return std::complex{re, im}; + 
}; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); + } + + template + std::complex ExpectationValueL(const std::vector& qs, + const fp_type* matrix, + const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, unsigned q0, + const fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m128 rn, in; + __m128 rs[gsize], is[gsize]; + + i *= 4; + + uint64_t ii = i & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + i *= 2; + ii |= i & ms[j]; + } + + auto p0 = rstate + 2 * ii; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); + + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); + } + } + + double re = 0; + double im = 0; + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm_mul_ps(rs[0], w[j]); + in = _mm_mul_ps(rs[0], w[j + 1]); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); + + j += 2; + } + + unsigned m = lsize * k; + + __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); + __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); + + re += detail::HorizontalSumSSE(v_re); + im += detail::HorizontalSumSSE(v_im); + } + + return std::complex{re, im}; + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); + + FillIndices(state.num_qubits(), qs, ms, xss); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + using Op = std::plus>; + return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], state.get()); + } + + For for_; +}; + +} // namespace qsim + +#endif // SIMULATOR_SSE_H_ diff --git a/tpls/qsim/statespace.h b/tpls/qsim/statespace.h new file mode 100644 index 0000000..2b0c9af --- /dev/null +++ b/tpls/qsim/statespace.h @@ -0,0 +1,145 @@ +// Copyright 2019 Google LLC. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_H_ +#define STATESPACE_H_ + +#include +#include +#include +#include + +#include "util.h" + +namespace qsim { + +/** + * Abstract class containing context and routines for general state-vector + * manipulations. "AVX", "AVX512", "Basic", and "SSE" implementations are + * provided. + */ +template class VectorSpace, typename... VSTypeParams> +class StateSpace : public VectorSpace { + private: + using Base = VectorSpace; + + public: + using fp_type = typename Base::fp_type; + using State = typename Base::Vector; + + /** + * The observed state from a Measurement gate. + */ + struct MeasurementResult { + /** + * A bitmask of all qubits measured in this result. In this format, if the + * qubit at index `i` is measured, the `i`th bit of `mask` is a one. + */ + uint64_t mask; + /** + * A bitwise representation of the measured states. In this format, the + * qubit at index `i` is represented by the `i`th bit of `bits`. + * If `valid` is true, `mask` has already been applied to this field + * (i.e. `bits == bits & mask`). + */ + uint64_t bits; + /** + * Observed states of the measured qubits. This vector only includes qubits + * specified by the associated Measurement gate. + */ + std::vector bitstring; + /** + * Validation bit. If this is false, the measurement failed and all other + * fields of the result are invalid. + */ + bool valid; + }; + + template + StateSpace(Args&&... 
args) : Base(args...) {} + + double Norm(const State& state) const { + auto partial_norms = static_cast(*this).PartialNorms(state); + + double norm = partial_norms[0]; + for (std::size_t i = 1; i < partial_norms.size(); ++i) { + norm += partial_norms[i]; + } + + return norm; + } + + template + MeasurementResult Measure(const std::vector& qubits, + RGen& rgen, State& state) const { + auto result = + static_cast(*this).VirtualMeasure(qubits, rgen, state); + + if (result.valid) { + static_cast(*this).Collapse(result, state); + } + + return result; + } + + template + MeasurementResult VirtualMeasure(const std::vector& qubits, + RGen& rgen, const State& state) const { + MeasurementResult result; + + result.valid = true; + result.mask = 0; + + for (auto q : qubits) { + if (q >= state.num_qubits()) { + result.valid = false; + return result; + } + + result.mask |= uint64_t{1} << q; + } + + auto partial_norms = static_cast(*this).PartialNorms(state); + + for (std::size_t i = 1; i < partial_norms.size(); ++i) { + partial_norms[i] += partial_norms[i - 1]; + } + + auto norm = partial_norms.back(); + auto r = RandomValue(rgen, norm); + + unsigned m = 0; + while (r > partial_norms[m]) ++m; + if (m > 0) { + r -= partial_norms[m - 1]; + } + + result.bits = static_cast(*this).FindMeasuredBits( + m, r, result.mask, state); + + result.bitstring.reserve(qubits.size()); + result.bitstring.resize(0); + + for (auto q : qubits) { + result.bitstring.push_back((result.bits >> q) & 1); + } + + return result; + } +}; + +} // namespace qsim + +#endif // STATESPACE_H_ diff --git a/tpls/qsim/statespace_avx.h b/tpls/qsim/statespace_avx.h new file mode 100644 index 0000000..876058b --- /dev/null +++ b/tpls/qsim/statespace_avx.h @@ -0,0 +1,497 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_AVX_H_ +#define STATESPACE_AVX_H_ + +#include + +#include +#include +#include +#include +#include + +#include "statespace.h" +#include "util.h" +#include "vectorspace.h" + +namespace qsim { + +namespace detail { + +inline __m256i GetZeroMaskAVX(uint64_t i, uint64_t mask, uint64_t bits) { + __m256i s1 = _mm256_setr_epi64x(i + 0, i + 2, i + 4, i + 6); + __m256i s2 = _mm256_setr_epi64x(i + 1, i + 3, i + 5, i + 7); + __m256i ma = _mm256_set1_epi64x(mask); + __m256i bi = _mm256_set1_epi64x(bits); + + s1 = _mm256_and_si256(s1, ma); + s2 = _mm256_and_si256(s2, ma); + + s1 = _mm256_cmpeq_epi64(s1, bi); + s2 = _mm256_cmpeq_epi64(s2, bi); + + return _mm256_blend_epi32(s1, s2, 170); // 10101010 +} + +inline double HorizontalSumAVX(__m256 s) { + __m128 l = _mm256_castps256_ps128(s); + __m128 h = _mm256_extractf128_ps(s, 1); + __m128 s1 = _mm_add_ps(h, l); + __m128 s1s = _mm_movehdup_ps(s1); + __m128 s2 = _mm_add_ps(s1, s1s); + + return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2))); +} + +} // namespace detail + +/** + * Object containing context and routines for AVX state-vector manipulations. + * State is a vectorized sequence of eight real components followed by eight + * imaginary components. Eight single-precison floating numbers can be loaded + * into an AVX register. 
+ */ +template +class StateSpaceAVX : + public StateSpace, VectorSpace, For, float> { + private: + using Base = StateSpace, qsim::VectorSpace, For, float>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + template + explicit StateSpaceAVX(ForArgs&&... args) : Base(args...) {} + + static uint64_t MinSize(unsigned num_qubits) { + return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits)); + }; + + void InternalToNormalOrder(State& state) const { + if (state.num_qubits() == 1) { + fp_type* s = state.get(); + + s[2] = s[1]; + s[1] = s[8]; + s[3] = s[9]; + + for (uint64_t i = 4; i < 16; ++i) { + s[i] = 0; + } + } else if (state.num_qubits() == 2) { + fp_type* s = state.get(); + + s[6] = s[3]; + s[4] = s[2]; + s[2] = s[1]; + s[1] = s[8]; + s[3] = s[9]; + s[5] = s[10]; + s[7] = s[11]; + + for (uint64_t i = 8; i < 16; ++i) { + s[i] = 0; + } + } else { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + fp_type* s = p + 16 * i; + + fp_type re[7]; + fp_type im[7]; + + for (uint64_t i = 0; i < 7; ++i) { + re[i] = s[i + 1]; + im[i] = s[i + 8]; + } + + for (uint64_t i = 0; i < 7; ++i) { + s[2 * i + 1] = im[i]; + s[2 * i + 2] = re[i]; + } + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get()); + } + } + + void NormalToInternalOrder(State& state) const { + if (state.num_qubits() == 1) { + fp_type* s = state.get(); + + s[8] = s[1]; + s[1] = s[2]; + s[9] = s[3]; + + for (uint64_t i = 2; i < 8; ++i) { + s[i] = 0; + s[i + 8] = 0; + } + } else if (state.num_qubits() == 2) { + fp_type* s = state.get(); + + s[8] = s[1]; + s[9] = s[3]; + s[10] = s[5]; + s[11] = s[7]; + s[1] = s[2]; + s[2] = s[4]; + s[3] = s[6]; + + for (uint64_t i = 4; i < 8; ++i) { + s[i] = 0; + s[i + 8] = 0; + } + } else { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + fp_type* s = p + 16 * i; + + fp_type re[7]; + fp_type im[7]; + + for (uint64_t i = 0; i < 7; ++i) { + im[i] = s[2 * i + 1]; + re[i] = s[2 * i + 2]; + } 
+ + for (uint64_t i = 0; i < 7; ++i) { + s[i + 1] = re[i]; + s[i + 8] = im[i]; + } + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get()); + } + } + + void SetAllZeros(State& state) const { + __m256 val0 = _mm256_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) { + _mm256_store_ps(p + 16 * i, val); + _mm256_store_ps(p + 16 * i + 8, val); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get()); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + __m256 val0 = _mm256_setzero_ps(); + __m256 valu; + + fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); + + switch (state.num_qubits()) { + case 1: + valu = _mm256_set_ps(0, 0, 0, 0, 0, 0, v, v); + break; + case 2: + valu = _mm256_set_ps(0, 0, 0, 0, v, v, v, v); + break; + default: + valu = _mm256_set1_ps(v); + break; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + __m256& val0, __m256 valu, fp_type* p) { + _mm256_store_ps(p + 16 * i, valu); + _mm256_store_ps(p + 16 * i + 8, val0); + }; + + Base::for_.Run( + MinSize(state.num_qubits()) / 16, f, val0, valu, state.get()); + } + + // |0> state. + void SetStateZero(State& state) const { + SetAllZeros(state); + state.get()[0] = 1; + } + + static std::complex GetAmpl(const State& state, uint64_t i) { + uint64_t k = (16 * (i / 8)) + (i % 8); + return std::complex(state.get()[k], state.get()[k + 8]); + } + + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + uint64_t k = (16 * (i / 8)) + (i % 8); + state.get()[k] = std::real(ampl); + state.get()[k + 8] = std::imag(ampl); + } + + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + uint64_t k = (16 * (i / 8)) + (i % 8); + state.get()[k] = re; + state.get()[k + 8] = im; + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. 
+ void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + __m256 re_reg = _mm256_set1_ps(re); + __m256 im_reg = _mm256_set1_ps(im); + + __m256i exclude_reg = _mm256_setzero_si256(); + if (exclude) { + exclude_reg = _mm256_cmpeq_epi32(exclude_reg, exclude_reg); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, + uint64_t bitsv, __m256 re_n, __m256 im_n, __m256i exclude_n, + fp_type* p) { + __m256 ml = _mm256_castsi256_ps(_mm256_xor_si256( + detail::GetZeroMaskAVX(8 * i, maskv, bitsv), exclude_n)); + + __m256 re = _mm256_load_ps(p + 16 * i); + __m256 im = _mm256_load_ps(p + 16 * i + 8); + + re = _mm256_blendv_ps(re, re_n, ml); + im = _mm256_blendv_ps(im, im_n, ml); + + _mm256_store_ps(p + 16 * i, re); + _mm256_store_ps(p + 16 * i + 8, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, mask, bits, re_reg, + im_reg, exclude_reg, state.get()); + } + + // Does the equivalent of dest += src elementwise. 
+ bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, fp_type* p2) { + __m256 re1 = _mm256_load_ps(p1 + 16 * i); + __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); + __m256 re2 = _mm256_load_ps(p2 + 16 * i); + __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); + + _mm256_store_ps(p2 + 16 * i, _mm256_add_ps(re1, re2)); + _mm256_store_ps(p2 + 16 * i + 8, _mm256_add_ps(im1, im2)); + }; + + Base::for_.Run(MinSize(src.num_qubits()) / 16, f, src.get(), dest.get()); + + return true; + } + + // Does the equivalent of state *= a elementwise. + void Multiply(fp_type a, State& state) const { + __m256 r = _mm256_set1_ps(a); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m256 r, fp_type* p) { + __m256 re = _mm256_load_ps(p + 16 * i); + __m256 im = _mm256_load_ps(p + 16 * i + 8); + + re = _mm256_mul_ps(re, r); + im = _mm256_mul_ps(im, r); + + _mm256_store_ps(p + 16 * i, re); + _mm256_store_ps(p + 16 * i + 8, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, r, state.get()); + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> std::complex { + __m256 re1 = _mm256_load_ps(p1 + 16 * i); + __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); + __m256 re2 = _mm256_load_ps(p2 + 16 * i); + __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); + + __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2)); + __m256 ip_im = _mm256_fnmadd_ps(im1, re2, _mm256_mul_ps(re1, im2)); + + double re = detail::HorizontalSumAVX(ip_re); + double im = detail::HorizontalSumAVX(ip_im); + + return std::complex{re, im}; + }; + + using Op = std::plus>; + return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f, + Op(), state1.get(), state2.get()); + } + 
+ double RealInnerProduct(const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> double { + __m256 re1 = _mm256_load_ps(p1 + 16 * i); + __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); + __m256 re2 = _mm256_load_ps(p2 + 16 * i); + __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); + + __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2)); + + return detail::HorizontalSumAVX(ip_re); + }; + + using Op = std::plus; + return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f, + Op(), state1.get(), state2.get()); + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + double norm = 0; + uint64_t size = MinSize(state.num_qubits()) / 16; + const fp_type* p = state.get(); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 8; ++j) { + double re = p[16 * k + j]; + double im = p[16 * k + 8 + j]; + norm += re * re + im * im; + } + } + + auto rs = GenerateRandomValues(num_samples, seed, norm); + + uint64_t m = 0; + double csum = 0; + bitstrings.reserve(num_samples); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 8; ++j) { + double re = p[16 * k + j]; + double im = p[16 * k + 8 + j]; + csum += re * re + im * im; + while (rs[m] < csum && m < num_samples) { + bitstrings.emplace_back(8 * k + j); + ++m; + } + } + } + + for (; m < num_samples; ++m) { + bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + void Collapse(const MeasurementResult& mr, State& state) const { + auto f1 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, const fp_type* p) -> double { + __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits); + + __m256 re = 
_mm256_maskload_ps(p + 16 * i, ml); + __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml); + __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re)); + + return detail::HorizontalSumAVX(s1); + }; + + using Op = std::plus; + double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 16, f1, + Op(), mr.mask, mr.bits, state.get()); + + __m256 renorm = _mm256_set1_ps(1.0 / std::sqrt(norm)); + + auto f2 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, __m256 renorm, fp_type* p) { + __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits); + + __m256 re = _mm256_maskload_ps(p + 16 * i, ml); + __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml); + + re = _mm256_mul_ps(re, renorm); + im = _mm256_mul_ps(im, renorm); + + _mm256_store_ps(p + 16 * i, re); + _mm256_store_ps(p + 16 * i + 8, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f2, + mr.mask, mr.bits, renorm, state.get()); + } + + std::vector PartialNorms(const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p) -> double { + __m256 re = _mm256_load_ps(p + 16 * i); + __m256 im = _mm256_load_ps(p + 16 * i + 8); + __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re)); + + return detail::HorizontalSumAVX(s1); + }; + + using Op = std::plus; + return Base::for_.RunReduceP( + MinSize(state.num_qubits()) / 16, f, Op(), state.get()); + } + + uint64_t FindMeasuredBits( + unsigned m, double r, uint64_t mask, const State& state) const { + double csum = 0; + + uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 16, m); + uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 16, m); + + const fp_type* p = state.get(); + + for (uint64_t k = k0; k < k1; ++k) { + for (uint64_t j = 0; j < 8; ++j) { + auto re = p[16 * k + j]; + auto im = p[16 * k + j + 8]; + csum += re * re + im * im; + if (r < csum) { + return (8 * k + j) & mask; + } + } + } + + // Return the last bitstring in the unlikely case of underflow. 
+ return (8 * k1 - 1) & mask; + } +}; + +} // namespace qsim + +#endif // STATESPACE_AVX_H_ diff --git a/tpls/qsim/statespace_avx512.h b/tpls/qsim/statespace_avx512.h new file mode 100644 index 0000000..879fd89 --- /dev/null +++ b/tpls/qsim/statespace_avx512.h @@ -0,0 +1,448 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_AVX512_H_ +#define STATESPACE_AVX512_H_ + +#include + +#include +#include +#include +#include +#include + +#include "statespace.h" +#include "util.h" +#include "vectorspace.h" + +namespace qsim { + +namespace detail { + +inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) { + __m512i s1 = _mm512_setr_epi64( + i + 0, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7); + __m512i s2 = _mm512_setr_epi64( + i + 8, i + 9, i + 10, i + 11, i + 12, i + 13, i + 14, i + 15); + __m512i ma = _mm512_set1_epi64(mask); + __m512i bi = _mm512_set1_epi64(bits); + + s1 = _mm512_and_si512(s1, ma); + s2 = _mm512_and_si512(s2, ma); + + unsigned m1 = _mm512_cmpeq_epu64_mask(s1, bi); + unsigned m2 = _mm512_cmpeq_epu64_mask(s2, bi); + + return (m2 << 8) | m1; +} + +inline double HorizontalSumAVX(__m256 s) { + __m128 l = _mm256_castps256_ps128(s); + __m128 h = _mm256_extractf128_ps(s, 1); + __m128 s1 = _mm_add_ps(h, l); + __m128 s1s = _mm_movehdup_ps(s1); + __m128 s2 = _mm_add_ps(s1, s1s); + + return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2))); +} + +inline 
double HorizontalSumAVX512(__m512 s) { + __m256 l = _mm512_castps512_ps256(s); + __m512d sd = _mm512_castps_pd(s); + __m256d hd = _mm512_extractf64x4_pd(sd, 1); + __m256 h = _mm256_castpd_ps(hd); + __m256 p = _mm256_add_ps(h, l); + + return HorizontalSumAVX(p); +} + +} // namespace detail + +/** + * Object containing context and routines for AVX512 state-vector manipulations. + * State is a vectorized sequence of sixteen real components followed by + * sixteen imaginary components. Sixteen single-precision floating numbers can + * be loaded into an AVX512 register. + */ +template +class StateSpaceAVX512 : + public StateSpace, VectorSpace, For, float> { + private: + using Base = StateSpace, qsim::VectorSpace, For, float>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + template + explicit StateSpaceAVX512(ForArgs&&... args) : Base(args...) {} + + static uint64_t MinSize(unsigned num_qubits) { + return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits)); + }; + + void InternalToNormalOrder(State& state) const { + __m512i idx1 = _mm512_setr_epi32( + 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + __m512i idx2 = _mm512_setr_epi32( + 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + auto f = [](unsigned n, unsigned m, uint64_t i, + __m512i idx1, __m512i idx2, fp_type* p) { + __m512 v1 = _mm512_load_ps(p + 32 * i); + __m512 v2 = _mm512_load_ps(p + 32 * i + 16); + + _mm512_store_ps(p + 32 * i, _mm512_permutex2var_ps(v1, idx1, v2)); + _mm512_store_ps(p + 32 * i + 16, _mm512_permutex2var_ps(v1, idx2, v2)); + }; + + Base::for_.Run( + MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get()); + } + + void NormalToInternalOrder(State& state) const { + __m512i idx1 = _mm512_setr_epi32( + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + __m512i idx2 = _mm512_setr_epi32( + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); + + auto f = [](unsigned n, unsigned m, uint64_t i, +
__m512i idx1, __m512i idx2, fp_type* p) { + __m512 re = _mm512_load_ps(p + 32 * i); + __m512 im = _mm512_load_ps(p + 32 * i + 16); + + _mm512_store_ps(p + 32 * i, _mm512_permutex2var_ps(re, idx1, im)); + _mm512_store_ps(p + 32 * i + 16, _mm512_permutex2var_ps(re, idx2, im)); + }; + + Base::for_.Run( + MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get()); + } + + void SetAllZeros(State& state) const { + __m512 val0 = _mm512_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) { + _mm512_store_ps(p + 32 * i, val0); + _mm512_store_ps(p + 32 * i + 16, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get()); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + __m512 val0 = _mm512_setzero_ps(); + __m512 valu; + + fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); + + switch (state.num_qubits()) { + case 1: + valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v); + break; + case 2: + valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v); + break; + case 3: + valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v, v, v, v, v); + break; + default: + valu = _mm512_set1_ps(v); + break; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const __m512& val0, const __m512& valu, fp_type* p) { + _mm512_store_ps(p + 32 * i, valu); + _mm512_store_ps(p + 32 * i + 16, val0); + }; + + Base::for_.Run( + MinSize(state.num_qubits()) / 32, f, val0, valu, state.get()); + } + + // |0> state. 
+ void SetStateZero(State& state) const { + SetAllZeros(state); + state.get()[0] = 1; + } + + static std::complex GetAmpl(const State& state, uint64_t i) { + uint64_t p = (32 * (i / 16)) + (i % 16); + return std::complex(state.get()[p], state.get()[p + 16]); + } + + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + uint64_t p = (32 * (i / 16)) + (i % 16); + state.get()[p] = std::real(ampl); + state.get()[p + 16] = std::imag(ampl); + } + + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + uint64_t p = (32 * (i / 16)) + (i % 16); + state.get()[p] = re; + state.get()[p + 16] = im; + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + __m512 re_reg = _mm512_set1_ps(re); + __m512 im_reg = _mm512_set1_ps(im); + + __mmask16 exclude_n = exclude ? 
0xffff : 0; + + auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, + uint64_t bitsv, __m512 re_n, __m512 im_n, __mmask16 exclude_n, + fp_type* p) { + __m512 re = _mm512_load_ps(p + 32 * i); + __m512 im = _mm512_load_ps(p + 32 * i + 16); + + __mmask16 ml = + detail::GetZeroMaskAVX512(16 * i, maskv, bitsv) ^ exclude_n; + + re = _mm512_mask_blend_ps(ml, re, re_n); + im = _mm512_mask_blend_ps(ml, im, im_n); + + _mm512_store_ps(p + 32 * i, re); + _mm512_store_ps(p + 32 * i + 16, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f, mask, bits, + re_reg, im_reg, exclude_n, state.get()); + } + + // Does the equivalent of dest += src elementwise. + bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, fp_type* p2) { + __m512 re1 = _mm512_load_ps(p1 + 32 * i); + __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); + __m512 re2 = _mm512_load_ps(p2 + 32 * i); + __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); + + _mm512_store_ps(p2 + 32 * i, _mm512_add_ps(re1, re2)); + _mm512_store_ps(p2 + 32 * i + 16, _mm512_add_ps(im1, im2)); + }; + + Base::for_.Run(MinSize(src.num_qubits()) / 32, f, src.get(), dest.get()); + + return true; + } + + // Does the equivalent of state *= a elementwise. 
+ void Multiply(fp_type a, State& state) const { + __m512 r = _mm512_set1_ps(a); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m512 r, fp_type* p) { + __m512 re = _mm512_load_ps(p + 32 * i); + __m512 im = _mm512_load_ps(p + 32 * i + 16); + + _mm512_store_ps(p + 32 * i, _mm512_mul_ps(re, r)); + _mm512_store_ps(p + 32 * i + 16, _mm512_mul_ps(im, r)); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f, r, state.get()); + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> std::complex { + __m512 re1 = _mm512_load_ps(p1 + 32 * i); + __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); + __m512 re2 = _mm512_load_ps(p2 + 32 * i); + __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); + + __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2)); + __m512 ip_im = _mm512_fnmadd_ps(im1, re2, _mm512_mul_ps(re1, im2)); + + double re = detail::HorizontalSumAVX512(ip_re); + double im = detail::HorizontalSumAVX512(ip_im); + + return std::complex{re, im}; + }; + + using Op = std::plus>; + return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f, + Op(), state1.get(), state2.get()); + } + + double RealInnerProduct(const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> double { + __m512 re1 = _mm512_load_ps(p1 + 32 * i); + __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); + __m512 re2 = _mm512_load_ps(p2 + 32 * i); + __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); + + __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2)); + + return detail::HorizontalSumAVX512(ip_re); + }; + + using Op = std::plus; + return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f, + Op(), 
state1.get(), state2.get()); + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + double norm = 0; + uint64_t size = MinSize(state.num_qubits()) / 32; + const fp_type* p = state.get(); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 16; ++j) { + double re = p[32 * k + j]; + double im = p[32 * k + 16 + j]; + norm += re * re + im * im; + } + } + + auto rs = GenerateRandomValues(num_samples, seed, norm); + + uint64_t m = 0; + double csum = 0; + bitstrings.reserve(num_samples); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 16; ++j) { + double re = p[32 * k + j]; + double im = p[32 * k + 16 + j]; + csum += re * re + im * im; + while (m < num_samples && rs[m] < csum) { /* bounds check first */ + bitstrings.emplace_back(16 * k + j); + ++m; + } + } + } + + for (; m < num_samples; ++m) { + bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + void Collapse(const MeasurementResult& mr, State& state) const { + auto f1 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, const fp_type* p) -> double { + __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits); + + __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i); + __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16); + __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re)); + + return detail::HorizontalSumAVX512(s1); + }; + + using Op = std::plus; + double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 32, f1, + Op(), mr.mask, mr.bits, state.get()); + + __m512 renorm = _mm512_set1_ps(1.0 / std::sqrt(norm)); + + auto f2 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, __m512 renorm, fp_type* p) { + __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits); + + __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i); + __m512 im =
_mm512_maskz_load_ps(ml, p + 32 * i + 16); + + re = _mm512_mul_ps(re, renorm); + im = _mm512_mul_ps(im, renorm); + + _mm512_store_ps(p + 32 * i, re); + _mm512_store_ps(p + 32 * i + 16, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f2, + mr.mask, mr.bits, renorm, state.get()); + } + + std::vector PartialNorms(const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p) -> double { + __m512 re = _mm512_load_ps(p + 32 * i); + __m512 im = _mm512_load_ps(p + 32 * i + 16); + __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re)); + + return detail::HorizontalSumAVX512(s1); + }; + + using Op = std::plus; + return Base::for_.RunReduceP( + MinSize(state.num_qubits()) / 32, f, Op(), state.get()); + } + + uint64_t FindMeasuredBits( + unsigned m, double r, uint64_t mask, const State& state) const { + double csum = 0; + + uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 32, m); + uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 32, m); + + const fp_type* p = state.get(); + + for (uint64_t k = k0; k < k1; ++k) { + for (uint64_t j = 0; j < 16; ++j) { + auto re = p[32 * k + j]; + auto im = p[32 * k + j + 16]; + csum += re * re + im * im; + if (r < csum) { + return (16 * k + j) & mask; + } + } + } + + // Return the last bitstring in the unlikely case of underflow. + return (16 * k1 - 1) & mask; + } +}; + +} // namespace qsim + +#endif // STATESPACE_AVX512_H_ diff --git a/tpls/qsim/statespace_basic.h b/tpls/qsim/statespace_basic.h new file mode 100644 index 0000000..6468483 --- /dev/null +++ b/tpls/qsim/statespace_basic.h @@ -0,0 +1,300 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_BASIC_H_ +#define STATESPACE_BASIC_H_ + +#include +#include +#include +#include + +#include "statespace.h" +#include "util.h" +#include "vectorspace.h" + +namespace qsim { + +/** + * Object containing context and routines for unoptimized state-vector + * manipulations. State is a non-vectorized sequence of one real amplitude + * followed by one imaginary amplitude. + */ +template +class StateSpaceBasic : + public StateSpace, VectorSpace, For, FP> { + private: + using Base = StateSpace, qsim::VectorSpace, For, FP>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + template + explicit StateSpaceBasic(ForArgs&&... args) : Base(args...) {} + + static uint64_t MinSize(unsigned num_qubits) { + return 2 * (uint64_t{1} << num_qubits); + }; + + void InternalToNormalOrder(State& state) const {} + + void NormalToInternalOrder(State& state) const {} + + void SetAllZeros(State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + p[2 * i] = 0; + p[2 * i + 1] = 0; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get()); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + fp_type val = fp_type{1} / std::sqrt(uint64_t{1} << state.num_qubits()); + + auto f = [](unsigned n, unsigned m, uint64_t i, + fp_type val, fp_type* p) { + p[2 * i] = val; + p[2 * i + 1] = 0; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f, val, state.get()); + } + + // |0> state. 
+ void SetStateZero(State& state) const { + SetAllZeros(state); + state.get()[0] = 1; + } + + static std::complex GetAmpl(const State& state, uint64_t i) { + uint64_t p = 2 * i; + return std::complex(state.get()[p], state.get()[p + 1]); + } + + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + uint64_t p = 2 * i; + state.get()[p] = std::real(ampl); + state.get()[p + 1] = std::imag(ampl); + } + + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + uint64_t p = 2 * i; + state.get()[p] = re; + state.get()[p + 1] = im; + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, + uint64_t bitsv, fp_type re_n, fp_type im_n, bool excludev, + fp_type* p) { + auto s = p + 2 * i; + bool in_mask = (i & maskv) == bitsv; + in_mask ^= excludev; + s[0] = in_mask ? re_n : s[0]; + s[1] = in_mask ? im_n : s[1]; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f, mask, bits, re, im, + exclude, state.get()); + } + + // Does the equivalent of dest += src elementwise. 
+ bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, fp_type* p2) { + p2[2 * i] += p1[2 * i]; + p2[2 * i + 1] += p1[2 * i + 1]; + }; + + Base::for_.Run(MinSize(src.num_qubits()) / 2, f, src.get(), dest.get()); + + return true; + } + + // Does the equivalent of state *= a elementwise. + void Multiply(fp_type a, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type a, fp_type* p) { + p[2 * i] *= a; + p[2 * i + 1] *= a; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f, a, state.get()); + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> std::complex { + auto s1 = p1 + 2 * i; + auto s2 = p2 + 2 * i; + + double re = s1[0] * s2[0] + s1[1] * s2[1]; + double im = s1[0] * s2[1] - s1[1] * s2[0]; + + return std::complex{re, im}; + }; + + using Op = std::plus>; + return Base::for_.RunReduce( + MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get()); + } + + double RealInnerProduct(const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> double { + auto s1 = p1 + 2 * i; + auto s2 = p2 + 2 * i; + + return s1[0] * s2[0] + s1[1] * s2[1]; + }; + + using Op = std::plus; + return Base::for_.RunReduce( + MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get()); + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + double norm = 0; + uint64_t size = MinSize(state.num_qubits()) / 2; + + const fp_type* p = 
state.get(); + + for (uint64_t k = 0; k < size; ++k) { + double re = p[2 * k]; + double im = p[2 * k + 1]; + norm += re * re + im * im; + } + + auto rs = GenerateRandomValues(num_samples, seed, norm); + + uint64_t m = 0; + double csum = 0; + bitstrings.reserve(num_samples); + + for (uint64_t k = 0; k < size; ++k) { + double re = p[2 * k]; + double im = p[2 * k + 1]; + csum += re * re + im * im; + while (m < num_samples && rs[m] < csum) { /* bounds check first */ + bitstrings.emplace_back(k); + ++m; + } + } + + for (; m < num_samples; ++m) { + bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + void Collapse(const MeasurementResult& mr, State& state) const { + auto f1 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, const fp_type* p) -> double { + auto s = p + 2 * i; + return (i & mask) == bits ? s[0] * s[0] + s[1] * s[1] : 0; + }; + + using Op = std::plus; + double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 2, f1, + Op(), mr.mask, mr.bits, state.get()); + + double renorm = 1.0 / std::sqrt(norm); + + auto f2 = [](unsigned n, unsigned m, uint64_t i, + uint64_t mask, uint64_t bits, fp_type renorm, fp_type* p) { + auto s = p + 2 * i; + bool not_zero = (i & mask) == bits; + + s[0] = not_zero ? s[0] * renorm : 0; + s[1] = not_zero ?
s[1] * renorm : 0; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f2, + mr.mask, mr.bits, renorm, state.get()); + } + + std::vector PartialNorms(const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p) -> double { + auto s = p + 2 * i; + return s[0] * s[0] + s[1] * s[1]; + }; + + using Op = std::plus; + return Base::for_.RunReduceP( + MinSize(state.num_qubits()) / 2, f, Op(), state.get()); + } + + uint64_t FindMeasuredBits( + unsigned m, double r, uint64_t mask, const State& state) const { + double csum = 0; + + uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 2, m); + uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 2, m); + + const fp_type* p = state.get(); + + for (uint64_t k = k0; k < k1; ++k) { + auto re = p[2 * k]; + auto im = p[2 * k + 1]; + csum += re * re + im * im; + if (r < csum) { + return k & mask; + } + } + + // Return the last bitstring in the unlikely case of underflow. + return (k1 - 1) & mask; + } +}; + +} // namespace qsim + +#endif // STATESPACE_BASIC_H_ diff --git a/tpls/qsim/statespace_cuda.h b/tpls/qsim/statespace_cuda.h new file mode 100644 index 0000000..660db07 --- /dev/null +++ b/tpls/qsim/statespace_cuda.h @@ -0,0 +1,470 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef STATESPACE_CUDA_H_ +#define STATESPACE_CUDA_H_ + +#ifdef __NVCC__ + #include +#elif __HIP__ + #include + #include "cuda2hip.h" +#endif + +#include +#include +#include + +#include "statespace.h" +#include "statespace_cuda_kernels.h" +#include "vectorspace_cuda.h" +#include "util_cuda.h" + +namespace qsim { + +/** + * Object containing context and routines for CUDA state-vector manipulations. + * State is a vectorized sequence of 32 real components followed by 32 + * imaginary components. 32 floating numbers can be processed in parallel by + * a single warp. It is not recommended to use `GetAmpl` and `SetAmpl`. + */ +template +class StateSpaceCUDA : + public StateSpace, VectorSpaceCUDA, FP> { + private: + using Base = StateSpace, qsim::VectorSpaceCUDA, FP>; + + protected: + struct Grid { + unsigned threads; + unsigned dblocks; + unsigned blocks; + }; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + struct Parameter { + /** + * The number of threads per block. + * Should be 2 to the power of k, where k is in the range [5,10]. + */ + unsigned num_threads = 512; + /** + * The number of data blocks. Each thread processes num_dblocks data + * blocks in reductions (norms, inner products, etc).
+ */ + unsigned num_dblocks = 16; + }; + + explicit StateSpaceCUDA(const Parameter& param) + : param_(param), scratch_(nullptr), scratch_size_(0) {} + + virtual ~StateSpaceCUDA() { + if (scratch_ != nullptr) { + ErrorCheck(cudaFree(scratch_)); + } + } + + static uint64_t MinSize(unsigned num_qubits) { + return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits)); + }; + + void InternalToNormalOrder(State& state) const { + uint64_t size = MinSize(state.num_qubits()) / 2; + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + unsigned bytes = 2 * threads * sizeof(fp_type); + + InternalToNormalOrderKernel<<>>(state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + void NormalToInternalOrder(State& state) const { + uint64_t size = MinSize(state.num_qubits()) / 2; + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + unsigned bytes = 2 * threads * sizeof(fp_type); + + NormalToInternalOrderKernel<<>>(state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + void SetAllZeros(State& state) const { + ErrorCheck(cudaMemset(state.get(), 0, + MinSize(state.num_qubits()) * sizeof(fp_type))); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + uint64_t size = MinSize(state.num_qubits()) / 2; + uint64_t hsize = uint64_t{1} << state.num_qubits(); + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + + fp_type v = double{1} / std::sqrt(hsize); + + SetStateUniformKernel<<>>(v, hsize, state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + // |0> state. 
+ void SetStateZero(State& state) const { + SetAllZeros(state); + fp_type one[1] = {1}; + ErrorCheck( + cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // It is not recommended to use this function. + static std::complex GetAmpl(const State& state, uint64_t i) { + fp_type re, im; + auto p = state.get() + 64 * (i / 32) + i % 32; + ErrorCheck(cudaMemcpy(&re, p, sizeof(fp_type), cudaMemcpyDeviceToHost)); + ErrorCheck( + cudaMemcpy(&im, p + 32, sizeof(fp_type), cudaMemcpyDeviceToHost)); + return std::complex(re, im); + } + + // It is not recommended to use this function. + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + fp_type re = std::real(ampl); + fp_type im = std::imag(ampl); + auto p = state.get() + 64 * (i / 32) + i % 32; + ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice)); + ErrorCheck( + cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // It is not recommended to use this function. + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + auto p = state.get() + 64 * (i / 32) + i % 32; + ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice)); + ErrorCheck( + cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. 
+ void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + uint64_t size = MinSize(state.num_qubits()) / 2; + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + + BulkSetAmplKernel<<>>( + mask, bits, re, im, exclude, state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + // Does the equivalent of dest += src elementwise. + bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + uint64_t size = MinSize(src.num_qubits()); + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + + AddKernel<<>>(src.get(), dest.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + return true; + } + + // Does the equivalent of state *= a elementwise. + void Multiply(fp_type a, State& state) const { + uint64_t size = MinSize(state.num_qubits()); + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + + MultiplyKernel<<>>(a, state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + using C = Complex; + auto r = Reduce>(state1, state2); + + return {r.re, r.im}; + } + + double RealInnerProduct(const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + return Reduce>(state1, state2); + } + + double Norm(const State& state) const { + return Reduce>(state, state); + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 
2); + unsigned bytes = g1.threads * sizeof(double); + + unsigned scratch_size = (g1.blocks + 1) * sizeof(double) + + num_samples * (sizeof(uint64_t) + sizeof(DistrRealType)); + + void* scratch = AllocScratch(scratch_size); + + double* d_res2 = (double*) scratch; + double* d_res1 = d_res2 + 1; + uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks); + DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples); + + auto op1 = RealProduct(); + auto op2 = Plus(); + + Reduce1Kernel<<>>( + g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + double norm; + + if (g1.blocks == 1) { + ErrorCheck( + cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost)); + } else { + Grid g2 = GetGrid2(g1.blocks); + unsigned bytes = g2.threads * sizeof(double); + + auto op3 = Plus(); + + Reduce2Kernel<<>>( + g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + ErrorCheck( + cudaMemcpy(&norm, d_res2, sizeof(double), cudaMemcpyDeviceToHost)); + } + + // TODO: generate random values on the device. 
+ auto rs = GenerateRandomValues(num_samples, seed, norm); + + ErrorCheck(cudaMemcpy(d_rs, rs.data(), + num_samples * sizeof(DistrRealType), + cudaMemcpyHostToDevice)); + + SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples, + d_rs, d_res1, state.get(), d_bitstrings); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + bitstrings.resize(num_samples, 0); + + ErrorCheck(cudaMemcpy(bitstrings.data(), d_bitstrings, + num_samples * sizeof(uint64_t), + cudaMemcpyDeviceToHost)); + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + void Collapse(const MeasurementResult& mr, State& state) const { + using Op = RealProduct; + double r = Reduce(mr.mask, mr.bits, state, state); + fp_type renorm = 1 / std::sqrt(r); + + uint64_t size = MinSize(state.num_qubits()) / 2; + + unsigned threads = std::min(size, uint64_t{param_.num_threads}); + unsigned blocks = size / threads; + + CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + } + + std::vector PartialNorms(const State& state) const { + Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); + + unsigned scratch_size = g.blocks * sizeof(double); + unsigned bytes = g.threads * sizeof(double); + + double* d_res = (double*) AllocScratch(scratch_size); + + auto op1 = RealProduct(); + auto op2 = Plus(); + + Reduce1Kernel<<>>( + g.dblocks, op1, op2, op2, state.get(), state.get(), d_res); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + std::vector norms(g.blocks); + + ErrorCheck( + cudaMemcpy(norms.data(), d_res, scratch_size, cudaMemcpyDeviceToHost)); + + return norms; + } + + uint64_t FindMeasuredBits( + unsigned m, double r, uint64_t mask, const State& state) const { + Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); + + uint64_t res; + uint64_t* d_res = (uint64_t*) AllocScratch(sizeof(uint64_t)); + + FindMeasuredBitsKernel<<<1, 
g.threads>>>( + m, g.dblocks, r, state.get(), d_res); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + ErrorCheck( + cudaMemcpy(&res, d_res, sizeof(uint64_t), cudaMemcpyDeviceToHost)); + + return res & mask; + } + + protected: + Parameter param_; + + void* AllocScratch(uint64_t size) const { + if (size > scratch_size_) { + if (scratch_ != nullptr) { + ErrorCheck(cudaFree(scratch_)); + } + + ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); + + const_cast(scratch_size_) = size; + } + + return scratch_; + } + + Grid GetGrid1(uint64_t size) const { + Grid grid; + + grid.threads = std::min(size, uint64_t{param_.num_threads}); + grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks}); + grid.blocks = size / (grid.threads * grid.dblocks); + + return grid; + } + + Grid GetGrid2(unsigned size) const { + Grid grid; + + grid.threads = std::min(param_.num_threads, std::max(32U, size)); + grid.dblocks = std::max(1U, size / grid.threads); + grid.blocks = 1; + + return grid; + } + + template + FP2 Reduce(const State& state1, const State& state2) const { + return Reduce(0, 0, state1, state2); + } + + template + FP2 Reduce(uint64_t mask, uint64_t bits, + const State& state1, const State& state2) const { + uint64_t size = MinSize(state1.num_qubits()) / 2; + + Grid g1 = GetGrid1(size); + unsigned bytes = g1.threads * sizeof(FP1); + + FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2)); + FP2* d_res1 = d_res2 + 1; + + auto op1 = Op(); + auto op2 = Plus(); + auto op3 = Plus::type>(); + + if (mask == 0) { + Reduce1Kernel<<>>( + g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1); + } else { + Reduce1MaskedKernel<<>>( + g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(), + d_res1); + } + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + FP2 result; + + if (g1.blocks == 1) { + ErrorCheck( + cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost)); + } else { + 
Grid g2 = GetGrid2(g1.blocks); + unsigned bytes = g2.threads * sizeof(FP2); + + auto op2 = Plus(); + auto op3 = Plus::type>(); + + Reduce2Kernel<<>>( + g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2); + ErrorCheck(cudaPeekAtLastError()); + ErrorCheck(cudaDeviceSynchronize()); + + ErrorCheck( + cudaMemcpy(&result, d_res2, sizeof(FP2), cudaMemcpyDeviceToHost)); + } + + return result; + } + + private: + void* scratch_; + uint64_t scratch_size_; +}; + +} // namespace qsim + +#endif // STATESPACE_CUDA_H_ diff --git a/tpls/qsim/statespace_cuda_kernels.h b/tpls/qsim/statespace_cuda_kernels.h new file mode 100644 index 0000000..b54ebca --- /dev/null +++ b/tpls/qsim/statespace_cuda_kernels.h @@ -0,0 +1,355 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef STATESPACE_CUDA_KERNELS_H_ +#define STATESPACE_CUDA_KERNELS_H_ + +#ifdef __NVCC__ + #include +#elif __HIP__ + #include + #include "cuda2hip.h" +#endif + +#include "util_cuda.h" + +namespace qsim { + +namespace detail { + +template +__device__ __forceinline__ FP1 BlockReduce1( + uint64_t n, Op1 op1, Op2 op2, Op3 op3, const FP2* s1, const FP2* s2) { + extern __shared__ float shared[]; + FP1* partial1 = (FP1*) shared; + + unsigned tid = threadIdx.x; + unsigned warp = threadIdx.x / warp_size; + unsigned lane = threadIdx.x % warp_size; + + uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; + uint64_t k1 = k0 + 2 * n * blockDim.x; + + FP1 r; + + r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]); + while ((k0 += 2 * blockDim.x) < k1) { + r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size])); + } + + partial1[tid] = r; + + __shared__ FP1 partial2[warp_size]; + + if (tid < warp_size) { + partial2[tid] = 0; + } + + __syncthreads(); + + FP1 val = WarpReduce(partial1[tid], op3); + + if (lane == 0) { + partial2[warp] = val; + } + + __syncthreads(); + + FP1 result = 0; + + if (tid < warp_size) { + result = WarpReduce(partial2[tid], op3); + } + + return result; +} + +template +__device__ __forceinline__ FP1 BlockReduce1Masked( + uint64_t n, uint64_t mask, uint64_t bits, Op1 op1, Op2 op2, Op3 op3, + const FP2* s1, const FP2* s2) { + extern __shared__ float shared[]; + FP1* partial1 = (FP1*) shared; + + unsigned tid = threadIdx.x; + unsigned warp = threadIdx.x / warp_size; + unsigned lane = threadIdx.x % warp_size; + + uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; + uint64_t k1 = k0 + 2 * n * blockDim.x; + + FP1 r = 0; + + if (((k0 + lane) / 2 & mask) == bits) { + r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]); + } + while ((k0 += 2 * blockDim.x) < k1) { + if (((k0 + lane) / 2 & mask) == bits) { + r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size])); + } + } + + 
partial1[tid] = r; + + __shared__ FP1 partial2[warp_size]; + + if (tid < warp_size) { + partial2[tid] = 0; + } + + __syncthreads(); + + FP1 val = WarpReduce(partial1[tid], op3); + + if (lane == 0) { + partial2[warp] = val; + } + + __syncthreads(); + + FP1 result = 0; + + if (tid < warp_size) { + result = WarpReduce(partial2[tid], op3); + } + + return result; +} + +template +__device__ __forceinline__ FP1 BlockReduce2( + uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s) { + extern __shared__ float shared[]; + FP1* partial1 = (FP1*) shared; + + unsigned tid = threadIdx.x; + uint64_t k0 = n * blockIdx.x * blockDim.x + tid; + uint64_t k1 = k0 + n * blockDim.x; + + FP1 r = 0; + + if (tid < size) { + r = s[k0]; + while ((k0 += blockDim.x) < k1) { + r = op2(r, s[k0]); + } + } + + partial1[tid] = r; + + __shared__ FP1 partial2[warp_size]; + + if (tid < warp_size) { + partial2[tid] = 0; + } + + __syncthreads(); + + FP1 val = WarpReduce(partial1[tid], op3); + + if (threadIdx.x % warp_size == 0) { + partial2[threadIdx.x / warp_size] = val; + } + + __syncthreads(); + + FP1 result = 0; + + if (tid < warp_size) { + result = WarpReduce(partial2[tid], op3); + } + + return result; +} + +} // namespace detail + +template +__global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3, + const FP2* s1, const FP2* s2, FP3* result) { + FP1 sum = detail::BlockReduce1(n, op1, op2, op3, s1, s2); + + if (threadIdx.x == 0) { + result[blockIdx.x] = sum; + } +} + +template +__global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits, + Op1 op1, Op2 op2, Op3 op3, + const FP2* s1, const FP2* s2, FP3* result) { + FP1 sum = + detail::BlockReduce1Masked(n, mask, bits, op1, op2, op3, s1, s2); + + if (threadIdx.x == 0) { + result[blockIdx.x] = sum; + } +} + +template +__global__ void Reduce2Kernel( + uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s, FP3* result) { + FP1 sum = detail::BlockReduce2(n, size, op2, op3, s); + + if (threadIdx.x == 0) { + 
result[blockIdx.x] = sum; + } +} + +template +__global__ void InternalToNormalOrderKernel(FP* state) { + unsigned lane = threadIdx.x % warp_size; + unsigned l = 2 * threadIdx.x - lane; + uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; + + extern __shared__ float shared[]; + FP* buf = (FP*) shared; + + buf[l] = state[k]; + buf[l + warp_size] = state[k + warp_size]; + + __syncthreads(); + + state[k + lane] = buf[l]; + state[k + lane + 1] = buf[l + warp_size]; +} + +template +__global__ void NormalToInternalOrderKernel(FP* state) { + unsigned lane = threadIdx.x % warp_size; + unsigned l = 2 * threadIdx.x - lane; + uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; + + extern __shared__ float shared[]; + FP* buf = (FP*) shared; + + buf[l] = state[k]; + buf[l + warp_size] = state[k + warp_size]; + + __syncthreads(); + + state[k] = buf[l + lane]; + state[k + warp_size] = buf[l + lane + 1]; +} + +template +__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { + unsigned lane = threadIdx.x % warp_size; + uint64_t k = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane; + + state[k] = lane < size ? 
v : 0; + state[k + warp_size] = 0; +} + +template +__global__ void AddKernel(const FP* state1, FP* state2) { + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + state2[k] += state1[k]; +} + +template +__global__ void MultiplyKernel(FP a, FP* state) { + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + state[k] *= a; +} + +template +__global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { + uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; + + if ((k1 & mask) == bits) { + state[k2] *= r; + state[k2 + warp_size] *= r; + } else { + state[k2] = 0; + state[k2 + warp_size] = 0; + } +} + +template +__global__ void BulkSetAmplKernel( + uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) { + uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; + + bool set = ((k1 & mask) == bits) ^ exclude; + + if (set) { + state[k2] = re; + state[k2 + warp_size] = im; + } +} + +template +__global__ void SampleKernel(unsigned num_blocks, + uint64_t n, uint64_t num_samples, + const FP1* rs, const FP2* ps, const FP3* state, + uint64_t *bitstrings) { + // Use just one thread. This can be somewhat slow. + if (threadIdx.x == 0) { + uint64_t m = 0; + double csum = 0; + + for (unsigned block_id = 0; block_id < num_blocks; ++block_id) { + uint64_t km = n * blockDim.x; + uint64_t k0 = block_id * km; + + for (uint64_t k = 0; k < km; ++k) { + uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32; + FP3 re = state[l]; + FP3 im = state[l + warp_size]; + csum += re * re + im * im; + while (rs[m] < csum && m < num_samples) { + bitstrings[m++] = k0 + k; + } + } + } + } +} + +template +__global__ void FindMeasuredBitsKernel( + uint64_t block_id, uint64_t n, double r, const FP* state, uint64_t* res) { + // Use just one thread. This can be somewhat slow, however, this is + // more or less consistent with CPU implementations. 
+ if (threadIdx.x == 0) { + double csum = 0; + uint64_t km = n * blockDim.x; + uint64_t k0 = block_id * km; + + for (uint64_t k = 0; k < km; ++k) { + uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32; + FP re = state[l]; + FP im = state[l + warp_size]; + csum += re * re + im * im; + if (r < csum) { + *res = k0 + k; + return; + } + } + + *res = k0 + n * blockDim.x - 1; + } +} + +} // namespace qsim + +#endif // STATESPACE_CUDA_KERNELS_H_ diff --git a/tpls/qsim/statespace_custatevec.h b/tpls/qsim/statespace_custatevec.h new file mode 100644 index 0000000..f2f5de1 --- /dev/null +++ b/tpls/qsim/statespace_custatevec.h @@ -0,0 +1,376 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_CUSTATEVEC_H_ +#define STATESPACE_CUSTATEVEC_H_ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "statespace.h" +#include "util_custatevec.h" +#include "vectorspace_cuda.h" + +namespace qsim { + +namespace detail { + +template +__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { + uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; + + if (k < size) { + state[2 * k] = v; + state[2 * k + 1] = 0; + } +} + +} // namespace detail + +/** + * Object containing context and routines for cuStateVec state-vector + * manipulations. It is not recommended to use `GetAmpl` and `SetAmpl`. 
+ */ +template +class StateSpaceCuStateVec : + public StateSpace, VectorSpaceCUDA, FP> { + private: + using Base = StateSpace, qsim::VectorSpaceCUDA, FP>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + static constexpr auto is_float = std::is_same::value; + + static constexpr auto kStateType = is_float ? CUDA_C_32F : CUDA_C_64F; + static constexpr auto kMatrixType = kStateType; + static constexpr auto kExpectType = CUDA_C_64F; + static constexpr auto kComputeType = + is_float ? CUSTATEVEC_COMPUTE_32F : CUSTATEVEC_COMPUTE_64F; + static constexpr auto kMatrixLayout = CUSTATEVEC_MATRIX_LAYOUT_ROW; + + explicit StateSpaceCuStateVec(const cublasHandle_t& cublas_handle, + const custatevecHandle_t& custatevec_handle) + : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle), + workspace_(nullptr), workspace_size_(0) {} + + virtual ~StateSpaceCuStateVec() { + if (workspace_ != nullptr) { + ErrorCheck(cudaFree(workspace_)); + } + } + + static uint64_t MinSize(unsigned num_qubits) { + return 2 * (uint64_t{1} << num_qubits); + }; + + void InternalToNormalOrder(State& state) const { + } + + void NormalToInternalOrder(State& state) const { + } + + void SetAllZeros(State& state) const { + ErrorCheck(cudaMemset(state.get(), 0, + MinSize(state.num_qubits()) * sizeof(fp_type))); + } + + // Uniform superposition. + void SetStateUniform(State& state) const { + uint64_t size = uint64_t{1} << state.num_qubits(); + + unsigned threads = size < 256 ? size : 256; + unsigned blocks = size / threads; + + fp_type v = double{1} / std::sqrt(size); + + detail::SetStateUniformKernel<<>>(v, size, state.get()); + ErrorCheck(cudaPeekAtLastError()); + } + + // |0> state. + void SetStateZero(State& state) const { + SetAllZeros(state); + fp_type one[1] = {1}; + ErrorCheck( + cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // It is not recommended to use this function. 
+ static std::complex GetAmpl(const State& state, uint64_t i) { + fp_type a[2]; + auto p = state.get() + 2 * i; + ErrorCheck(cudaMemcpy(a, p, 2 * sizeof(fp_type), cudaMemcpyDeviceToHost)); + return std::complex(a[0], a[1]); + } + + // It is not recommended to use this function. + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + fp_type a[2] = {std::real(ampl), std::imag(ampl)}; + auto p = state.get() + 2 * i; + ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // It is not recommended to use this function. + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + fp_type a[2] = {re, im}; + auto p = state.get() + 2 * i; + ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice)); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + // Not implemented. + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + // Not implemented. + } + + // Does the equivalent of dest += src elementwise. 
+ bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + uint64_t size = uint64_t{1} << src.num_qubits(); + + if (is_float) { + cuComplex a = {1.0, 0.0}; + auto p1 = (const cuComplex*) src.get(); + auto p2 = (cuComplex*) dest.get(); + ErrorCheck(cublasCaxpy(cublas_handle_, size, &a, p1, 1, p2, 1)); + } else { + cuDoubleComplex a = {1.0, 0.0}; + auto p1 = (const cuDoubleComplex*) src.get(); + auto p2 = (cuDoubleComplex*) dest.get(); + ErrorCheck(cublasZaxpy(cublas_handle_, size, &a, p1, 1, p2, 1)); + } + + return true; + } + + // Does the equivalent of state *= a elementwise. + void Multiply(fp_type a, State& state) const { + uint64_t size = uint64_t{1} << state.num_qubits(); + + if (is_float) { + float a1 = a; + auto p = (cuComplex*) state.get(); + ErrorCheck(cublasCsscal(cublas_handle_, size, &a1, p, 1)); + } else { + double a1 = a; + auto p = (cuDoubleComplex*) state.get(); + ErrorCheck(cublasZdscal(cublas_handle_, size, &a1, p, 1)); + } + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + uint64_t size = uint64_t{1} << state1.num_qubits(); + + if (is_float) { + cuComplex result; + auto p1 = (const cuComplex*) state1.get(); + auto p2 = (const cuComplex*) state2.get(); + ErrorCheck(cublasCdotc(cublas_handle_, size, p1, 1, p2, 1, &result)); + return {cuCrealf(result), cuCimagf(result)}; + } else { + cuDoubleComplex result; + auto p1 = (const cuDoubleComplex*) state1.get(); + auto p2 = (const cuDoubleComplex*) state2.get(); + ErrorCheck(cublasZdotc(cublas_handle_, size, p1, 1, p2, 1, &result)); + return {cuCreal(result), cuCimag(result)}; + } + } + + double RealInnerProduct(const State& state1, const State& state2) const { + return std::real(InnerProduct(state1, state2)); + } + + double Norm(const State& state) const { + uint64_t size = uint64_t{1} << state.num_qubits(); + + if (is_float) 
{ + float result; + auto p = (const cuComplex*) state.get(); + ErrorCheck(cublasScnrm2(cublas_handle_, size, p, 1, &result)); + return result * result; + } else { + double result; + auto p = (const cuDoubleComplex*) state.get(); + ErrorCheck(cublasDznrm2(cublas_handle_, size, p, 1, &result)); + return result * result; + } + } + + template + std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + auto rs = GenerateRandomValues(num_samples, seed, 1.0); + + size_t workspace_size; + custatevecSamplerDescriptor_t sampler; + + ErrorCheck(custatevecSamplerCreate( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), &sampler, num_samples, + &workspace_size)); + + AllocWorkSpace(workspace_size); + + ErrorCheck(custatevecSamplerPreprocess( + custatevec_handle_, sampler, workspace_, workspace_size)); + + std::vector bitstrings0(num_samples); + std::vector bitordering; + + bitordering.reserve(state.num_qubits()); + for (unsigned i = 0; i < state.num_qubits(); ++i) { + bitordering.push_back(i); + } + + ErrorCheck(custatevecSamplerSample( + custatevec_handle_, sampler, bitstrings0.data(), + bitordering.data(), state.num_qubits(), rs.data(), + num_samples, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER)); + + bitstrings.reserve(num_samples); + for (unsigned i = 0; i < num_samples; ++i) { + bitstrings.push_back(bitstrings0[i]); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + template + MeasurementResult Measure(const std::vector& qubits, + RGen& rgen, State& state, + bool no_collapse = false) const { + auto r = RandomValue(rgen, 1.0); + + MeasurementResult result; + + result.valid = true; + result.mask = 0; + result.bits = 0; + result.bitstring.resize(qubits.size(), 0); + + for (auto q : qubits) { + if (q >= state.num_qubits()) { + result.valid = false; + return result; + } + + result.mask |= uint64_t{1} << q; + } + + auto collapse = 
no_collapse ? + CUSTATEVEC_COLLAPSE_NONE : CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO; + + ErrorCheck(custatevecBatchMeasure( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), (int*) result.bitstring.data(), + (int*) qubits.data(), qubits.size(), r, collapse)); + + for (std::size_t i = 0; i < result.bitstring.size(); ++i) { + result.bits |= result.bitstring[i] << qubits[i]; + } + + return result; + } + + template + MeasurementResult VirtualMeasure(const std::vector& qubits, + RGen& rgen, const State& state) const { + return Measure(qubits, rgen, const_cast(state), true); + } + + void Collapse(const MeasurementResult& mr, State& state) const { + unsigned count = 0; + + std::vector bitstring; + std::vector bitordering; + + bitstring.reserve(state.num_qubits()); + bitordering.reserve(state.num_qubits()); + + for (unsigned i = 0; i < state.num_qubits(); ++i) { + if (((mr.mask >> i) & 1) != 0) { + bitstring.push_back((mr.bits >> i) & 1); + bitordering.push_back(i); + ++count; + } + } + + ErrorCheck(custatevecCollapseByBitString( + custatevec_handle_, state.get(), kStateType, + state.num_qubits(), bitstring.data(), bitordering.data(), + count, 1.0)); + + // TODO: do we need the following? + double norm = Norm(state); + Multiply(1.0 / std::sqrt(norm), state); + } + + private: + void* AllocWorkSpace(size_t size) const { + if (size > workspace_size_) { + if (workspace_ != nullptr) { + ErrorCheck(cudaFree(workspace_)); + } + + ErrorCheck(cudaMalloc(const_cast(&workspace_), size)); + + const_cast(workspace_size_) = size; + } + + return workspace_; + } + + const cublasHandle_t cublas_handle_; + const custatevecHandle_t custatevec_handle_; + + void* workspace_; + size_t workspace_size_; +}; + +} // namespace qsim + +#endif // STATESPACE_CUSTATEVEC_H_ diff --git a/tpls/qsim/statespace_sse.h b/tpls/qsim/statespace_sse.h new file mode 100644 index 0000000..cf41a09 --- /dev/null +++ b/tpls/qsim/statespace_sse.h @@ -0,0 +1,462 @@ +// Copyright 2019 Google LLC. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef STATESPACE_SSE_H_ +#define STATESPACE_SSE_H_ + +#include + +#include +#include +#include +#include +#include + +#include "statespace.h" +#include "util.h" +#include "vectorspace.h" + +namespace qsim { + +namespace detail { + +inline __m128i GetZeroMaskSSE(uint64_t i, uint64_t mask, uint64_t bits) { + __m128i s1 = _mm_set_epi64x(i + 2, i + 0); + __m128i s2 = _mm_set_epi64x(i + 3, i + 1); + __m128i ma = _mm_set1_epi64x(mask); + __m128i bi = _mm_set1_epi64x(bits); + + s1 = _mm_and_si128(s1, ma); + s2 = _mm_and_si128(s2, ma); + + s1 = _mm_cmpeq_epi64(s1, bi); + s2 = _mm_cmpeq_epi64(s2, bi); + + return _mm_blend_epi16(s1, s2, 204); // 11001100 +} + +inline double HorizontalSumSSE(__m128 s) { + __m128 ss = _mm_movehdup_ps(s); + __m128 s1 = _mm_add_ps(s, ss); + + return _mm_cvtss_f32(_mm_add_ss(s1, _mm_movehl_ps(ss, s1))); +} + +} // namespace detail + +/** + * Object containing context and routines for SSE state-vector manipulations. + * State is a vectorized sequence of four real components followed by four + * imaginary components. Four single-precison floating numbers can be loaded + * into an SSE register. 
+ */ +template +class StateSpaceSSE : + public StateSpace, VectorSpace, For, float> { + private: + using Base = StateSpace, qsim::VectorSpace, For, float>; + + public: + using State = typename Base::State; + using fp_type = typename Base::fp_type; + + template + explicit StateSpaceSSE(ForArgs&&... args) : Base(args...) {} + + static uint64_t MinSize(unsigned num_qubits) { + return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits)); + }; + + void InternalToNormalOrder(State& state) const { + if (state.num_qubits() == 1) { + auto s = state.get(); + + s[2] = s[1]; + s[1] = s[4]; + s[3] = s[5]; + + for (uint64_t i = 4; i < 8; ++i) { + s[i] = 0; + } + } else { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + auto s = p + 8 * i; + + fp_type re[3]; + fp_type im[3]; + + for (uint64_t i = 0; i < 3; ++i) { + re[i] = s[i + 1]; + im[i] = s[i + 4]; + } + + for (uint64_t i = 0; i < 3; ++i) { + s[2 * i + 1] = im[i]; + s[2 * i + 2] = re[i]; + } + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get()); + } + } + + void NormalToInternalOrder(State& state) const { + if (state.num_qubits() == 1) { + auto s = state.get(); + + s[4] = s[1]; + s[1] = s[2]; + s[5] = s[3]; + + s[2] = 0; + s[3] = 0; + s[6] = 0; + s[7] = 0; + } else { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + auto s = p + 8 * i; + + fp_type re[3]; + fp_type im[3]; + + for (uint64_t i = 0; i < 3; ++i) { + im[i] = s[2 * i + 1]; + re[i] = s[2 * i + 2]; + } + + for (uint64_t i = 0; i < 3; ++i) { + s[i + 1] = re[i]; + s[i + 4] = im[i]; + } + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get()); + } + } + + void SetAllZeros(State& state) const { + __m128 val0 = _mm_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) { + _mm_store_ps(p + 8 * i, val0); + _mm_store_ps(p + 8 * i + 4, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get()); + } + + // Uniform superposition. 
+ void SetStateUniform(State& state) const { + __m128 val0 = _mm_setzero_ps(); + __m128 valu; + + fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); + + if (state.num_qubits() == 1) { + valu = _mm_set_ps(0, 0, v, v); + } else { + valu = _mm_set1_ps(v); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + __m128 val0, __m128 valu, fp_type* p) { + _mm_store_ps(p + 8 * i, valu); + _mm_store_ps(p + 8 * i + 4, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, valu, state.get()); + } + + // |0> state. + void SetStateZero(State& state) const { + SetAllZeros(state); + state.get()[0] = 1; + } + + static std::complex GetAmpl(const State& state, uint64_t i) { + uint64_t p = (8 * (i / 4)) + (i % 4); + return std::complex(state.get()[p], state.get()[p + 4]); + } + + static void SetAmpl( + State& state, uint64_t i, const std::complex& ampl) { + uint64_t p = (8 * (i / 4)) + (i % 4); + state.get()[p] = std::real(ampl); + state.get()[p + 4] = std::imag(ampl); + } + + static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { + uint64_t p = (8 * (i / 4)) + (i % 4); + state.get()[p] = re; + state.get()[p + 4] = im; + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. + void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, + const std::complex& val, + bool exclude = false) const { + BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val)); + } + + // Sets state[i] = complex(re, im) where (i & mask) == bits. + // if `exclude` is true then the criteria becomes (i & mask) != bits. 
+ void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, + fp_type im, bool exclude = false) const { + __m128 re_reg = _mm_set1_ps(re); + __m128 im_reg = _mm_set1_ps(im); + __m128i exclude_reg = _mm_setzero_si128(); + if (exclude) { + exclude_reg = _mm_cmpeq_epi32(exclude_reg, exclude_reg); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, + uint64_t bitsv, __m128 re_n, __m128 im_n, __m128i exclude_n, + fp_type* p) { + __m128 ml = _mm_castsi128_ps(_mm_xor_si128( + detail::GetZeroMaskSSE(4 * i, maskv, bitsv), exclude_n)); + + __m128 re = _mm_load_ps(p + 8 * i); + __m128 im = _mm_load_ps(p + 8 * i + 4); + + re = _mm_blendv_ps(re, re_n, ml); + im = _mm_blendv_ps(im, im_n, ml); + + _mm_store_ps(p + 8 * i, re); + _mm_store_ps(p + 8 * i + 4, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, mask, bits, re_reg, + im_reg, exclude_reg, state.get()); + } + + // Does the equivalent of dest += src elementwise. + bool Add(const State& src, State& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, fp_type* p2) { + __m128 re1 = _mm_load_ps(p1 + 8 * i); + __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); + __m128 re2 = _mm_load_ps(p2 + 8 * i); + __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); + + _mm_store_ps(p2 + 8 * i, _mm_add_ps(re1, re2)); + _mm_store_ps(p2 + 8 * i + 4, _mm_add_ps(im1, im2)); + }; + + Base::for_.Run(MinSize(src.num_qubits()) / 8, f, src.get(), dest.get()); + + return true; + } + + // Does the equivalent of state *= a elementwise. 
+ void Multiply(fp_type a, State& state) const { + __m128 r = _mm_set1_ps(a); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m128 r, fp_type* p) { + __m128 re = _mm_load_ps(p + 8 * i); + __m128 im = _mm_load_ps(p + 8 * i + 4); + + re = _mm_mul_ps(re, r); + im = _mm_mul_ps(im, r); + + _mm_store_ps(p + 8 * i, re); + _mm_store_ps(p + 8 * i + 4, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, r, state.get()); + } + + std::complex InnerProduct( + const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> std::complex { + __m128 re1 = _mm_load_ps(p1 + 8 * i); + __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); + __m128 re2 = _mm_load_ps(p2 + 8 * i); + __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); + + __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2)); + __m128 ip_im = _mm_sub_ps(_mm_mul_ps(re1, im2), _mm_mul_ps(im1, re2)); + + double re = detail::HorizontalSumSSE(ip_re); + double im = detail::HorizontalSumSSE(ip_im); + + return std::complex{re, im}; + }; + + using Op = std::plus>; + return Base::for_.RunReduce( + MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get()); + } + + double RealInnerProduct(const State& state1, const State& state2) const { + if (state1.num_qubits() != state2.num_qubits()) { + return std::nan(""); + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p1, const fp_type* p2) -> double { + __m128 re1 = _mm_load_ps(p1 + 8 * i); + __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); + __m128 re2 = _mm_load_ps(p2 + 8 * i); + __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); + + __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2)); + + return detail::HorizontalSumSSE(ip_re); + }; + + using Op = std::plus; + return Base::for_.RunReduce( + MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get()); + } + + template + 
std::vector Sample( + const State& state, uint64_t num_samples, unsigned seed) const { + std::vector bitstrings; + + if (num_samples > 0) { + double norm = 0; + uint64_t size = MinSize(state.num_qubits()) / 8; + const fp_type* p = state.get(); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 4; ++j) { + double re = p[8 * k + j]; + double im = p[8 * k + 4 + j]; + norm += re * re + im * im; + } + } + + auto rs = GenerateRandomValues(num_samples, seed, norm); + + uint64_t m = 0; + double csum = 0; + bitstrings.reserve(num_samples); + + for (uint64_t k = 0; k < size; ++k) { + for (unsigned j = 0; j < 4; ++j) { + double re = p[8 * k + j]; + double im = p[8 * k + 4 + j]; + csum += re * re + im * im; + while (rs[m] < csum && m < num_samples) { + bitstrings.emplace_back(4 * k + j); + ++m; + } + } + } + + for (; m < num_samples; ++m) { + bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); + } + } + + return bitstrings; + } + + using MeasurementResult = typename Base::MeasurementResult; + + void Collapse(const MeasurementResult& mr, State& state) const { + __m128 zero = _mm_set1_ps(0); + + auto f1 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask, + uint64_t bits, __m128 zero, const fp_type* p) -> double { + __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits)); + + __m128 re = _mm_load_ps(p + 8 * i); + __m128 im = _mm_load_ps(p + 8 * i + 4); + __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)); + + s1 = _mm_blendv_ps(zero, s1, ml); + + return detail::HorizontalSumSSE(s1); + }; + + using Op = std::plus; + double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 8, f1, + Op(), mr.mask, mr.bits, zero, + state.get()); + + __m128 renorm = _mm_set1_ps(1.0 / std::sqrt(norm)); + + auto f2 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask, + uint64_t bits, __m128 renorm, __m128 zero, fp_type* p) { + __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits)); + + __m128 re = _mm_load_ps(p + 
8 * i); + __m128 im = _mm_load_ps(p + 8 * i + 4); + + re = _mm_blendv_ps(zero, _mm_mul_ps(re, renorm), ml); + im = _mm_blendv_ps(zero, _mm_mul_ps(im, renorm), ml); + + _mm_store_ps(p + 8 * i, re); + _mm_store_ps(p + 8 * i + 4, im); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f2, + mr.mask, mr.bits, renorm, zero, state.get()); + } + + std::vector PartialNorms(const State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* p) -> double { + __m128 re = _mm_load_ps(p + 8 * i); + __m128 im = _mm_load_ps(p + 8 * i + 4); + __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)); + + return detail::HorizontalSumSSE(s1); + }; + + using Op = std::plus; + return Base::for_.RunReduceP( + MinSize(state.num_qubits()) / 8, f, Op(), state.get()); + } + + uint64_t FindMeasuredBits( + unsigned m, double r, uint64_t mask, const State& state) const { + double csum = 0; + + uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 8, m); + uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 8, m); + + const fp_type* p = state.get(); + + for (uint64_t k = k0; k < k1; ++k) { + for (uint64_t j = 0; j < 4; ++j) { + auto re = p[8 * k + j]; + auto im = p[8 * k + 4 + j]; + csum += re * re + im * im; + if (r < csum) { + return (4 * k + j) & mask; + } + } + } + + // Return the last bitstring in the unlikely case of underflow. + return (4 * k1 - 1) & mask; + } +}; + +} // namespace qsim + +#endif // STATESPACE_SSE_H_ diff --git a/tpls/qsim/umux.h b/tpls/qsim/umux.h new file mode 100644 index 0000000..83b951b --- /dev/null +++ b/tpls/qsim/umux.h @@ -0,0 +1,52 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UMUX_H_ +#define UMUX_H_ + +/* + * Aliases qsim::unitary::UnitaryCalculator to exactly one implementation, + * picked by the first compiler macro that matches: __AVX512F__, then + * __AVX2__, then __SSE4_1__; otherwise unitary_calculator_basic.h is used. + */ + +#ifdef __AVX512F__ +# include "unitary_calculator_avx512.h" + namespace qsim { + namespace unitary { + template + using UnitaryCalculator = UnitaryCalculatorAVX512; + } + } +#elif __AVX2__ +# include "unitary_calculator_avx.h" + namespace qsim { + namespace unitary { + template + using UnitaryCalculator = UnitaryCalculatorAVX; + } + } +#elif __SSE4_1__ +# include "unitary_calculator_sse.h" + namespace qsim { + namespace unitary { + template + using UnitaryCalculator = UnitaryCalculatorSSE; + } + } +#else +# include "unitary_calculator_basic.h" + namespace qsim { + namespace unitary { + template + using UnitaryCalculator = UnitaryCalculatorBasic; + } + } +#endif + +#endif // UMUX_H_ diff --git a/tpls/qsim/unitary_calculator_avx.h b/tpls/qsim/unitary_calculator_avx.h new file mode 100644 index 0000000..5e566ca --- /dev/null +++ b/tpls/qsim/unitary_calculator_avx.h @@ -0,0 +1,1028 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef UNITARY_CALCULATOR_AVX_H_ +#define UNITARY_CALCULATOR_AVX_H_ + +#include + +#include +#include +#include +#include + +#include "simulator.h" +#include "unitaryspace_avx.h" + +namespace qsim { +namespace unitary { + +/** + * Quantum circuit unitary calculator with AVX vectorization. + */ +template +class UnitaryCalculatorAVX final : public SimulatorBase { + public: + using UnitarySpace = UnitarySpaceAVX; + using Unitary = typename UnitarySpace::Unitary; + using fp_type = typename UnitarySpace::fp_type; + + using StateSpace = UnitarySpace; + using State = Unitary; + + /** + * Constructs the calculator; args are handed to the for_ member. + * NOTE(review): args are passed on as lvalues (no std::forward) - confirm + * that this is intended. + */ + template + explicit UnitaryCalculatorAVX(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using AVX instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + * Only 1 to 6 target qubits are dispatched; any other count takes the + * default branch and leaves state unchanged. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 1: + if (qs[0] > 2) { + ApplyGateH<1>(qs, matrix, state); + } else { + ApplyGateL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 2) { + ApplyGateH<2>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<1, 1>(qs, matrix, state); + } else { + ApplyGateL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 2) { + ApplyGateH<3>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<2, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<1, 2>(qs, matrix, state); + } else { + ApplyGateL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 2) { + ApplyGateH<4>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<3, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<2, 2>(qs, matrix, state); + } else { + ApplyGateL<1, 3>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 2) { + ApplyGateH<5>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<4, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<3, 2>(qs, matrix, state); + } else { + ApplyGateL<2, 3>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 2) { + ApplyGateH<6>(qs, matrix, state); + } else if (qs[1] > 2) { + ApplyGateL<5, 1>(qs, matrix, state); + } else if (qs[2] > 2) { + ApplyGateL<4, 2>(qs, matrix, state); + } else { + ApplyGateL<3, 3>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using AVX instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + * With no control qubits the call is forwarded to ApplyGate. Otherwise only + * 1 to 4 target qubits are dispatched; any other count takes the default + * branch and leaves state unchanged. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ // Assume cqs[0] < cqs[1] < cqs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 1: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 2: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 3: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 4: + if (qs[0] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); + } 
+ } else if (qs[1] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 2) { + if (cqs[0] > 2) { + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 2) { + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 8; + } + + private: + + /** + * Kernel naming, as used by the two dispatch methods above: the H kernels + * take gates whose lowest target qubit has index > 2, the L kernels take + * gates whose lowest target qubit has index <= 2. Each kernel has two + * builds: one addressing memory with _pdep_u64 (when __BMI2__ is defined) + * and one using the ms/xss tables filled by FillIndices. + */ + +#ifdef __BMI2__ + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + 
_mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + auto m = GetMasks1(qs); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, size, raw_size, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + 
L)]; + + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); + + unsigned k = 3 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm256_load_ps(p0 + p); + is[k] = _mm256_load_ps(p0 + p + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256 w[1 << (1 + 2 * H)]; + + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + const __m256i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm256_load_ps(p0 + p); + is[k2] = _mm256_load_ps(p0 + p + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm256_store_ps(p0 + p, rn); + _mm256_store_ps(p0 + p + 8, in); + } + }; + + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size 
= UnitarySpace::MinRowSize(state.num_qubits()); + + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); + } + } + +#else // __BMI2__ + + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn 
= _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, const __m256i* idx, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = 
_mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); + + FillIndices(state.num_qubits(), qs, ms, xss); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 ru, iu, rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_mul_ps(rs[0], ru); + in = _mm256_mul_ps(rs[0], iu); + rn = _mm256_fnmadd_ps(is[0], iu, rn); + in = _mm256_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = 
_mm256_set1_ps(v[j]); + iu = _mm256_set1_ps(v[j + 1]); + rn = _mm256_fmadd_ps(rs[l], ru, rn); + in = _mm256_fmadd_ps(rs[l], iu, in); + rn = _mm256_fnmadd_ps(is[l], iu, rn); + in = _mm256_fmadd_ps(is[l], ru, in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m256 rn, in; + __m256 rs[hsize], is[hsize]; + + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm256_load_ps(p0 + xss[k]); + is[k] = _mm256_load_ps(p0 + xss[k] + 8); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], 
in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256 w[1 << (1 + 2 * H)]; + + auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, const __m256i* idx, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m256 rn, in; + __m256 rs[gsize], is[gsize]; + + uint64_t r = 8 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm256_load_ps(p0 + xss[k]); + is[k2] = _mm256_load_ps(p0 + xss[k] + 8); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); + is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = 
_mm256_mul_ps(rs[0], w[j]); + in = _mm256_mul_ps(rs[0], w[j + 1]); + rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm256_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm256_fmadd_ps(rs[l], w[j], rn); + in = _mm256_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm256_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + _mm256_store_ps(p0 + xss[k], rn); + _mm256_store_ps(p0 + xss[k] + 8, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m256i idx[1 << L]; + __m256 w[1 << (1 + 2 * H + L)]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 3 + H; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + if (CH) { + auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size * size2, f, w, ms, xss, m.cvalsh, + m.cmaskh, idx, size, raw_size, state.get()); + } else { + auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size * size2, f, w, ms, xss, m.cvalsh, + m.cmaskh, idx, size, raw_size, state.get()); + } + } + +#endif // __BMI2__ + + /** + * Fills idx[0] .. idx[lsize - 2] (lsize = 1 << L) with 8-lane index vectors + * for _mm256_permutevar8x32_ps, computed with MaskedAdd from qmaskl. + */ + template + static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) { + constexpr unsigned lsize = 1 << L; + + for (unsigned i = 0; i < lsize - 1; ++i) { + unsigned p[8]; + + for (unsigned j = 0; j < 8; ++j) { + p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); + } + + idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); + } + } + + For for_; +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARY_CALCULATOR_AVX_H_ diff --git 
a/tpls/qsim/unitary_calculator_avx512.h b/tpls/qsim/unitary_calculator_avx512.h new file mode 100644 index 0000000..8105367 --- /dev/null +++ b/tpls/qsim/unitary_calculator_avx512.h @@ -0,0 +1,644 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARY_CALCULATOR_AVX512_H_ +#define UNITARY_CALCULATOR_AVX512_H_ + +#include + +#include +#include +#include +#include + +#include "simulator.h" +#include "unitaryspace_avx512.h" + +namespace qsim { +namespace unitary { + +/** + * Quantum circuit unitary calculator with AVX512 vectorization. + */ +template +class UnitaryCalculatorAVX512 final : public SimulatorBase { + public: + using UnitarySpace = UnitarySpaceAVX512; + using Unitary = typename UnitarySpace::Unitary; + using fp_type = typename UnitarySpace::fp_type; + + using StateSpace = UnitarySpace; + using State = Unitary; + + template + explicit UnitaryCalculatorAVX512(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ + switch (qs.size()) { + case 1: + if (qs[0] > 3) { + ApplyGateH<1>(qs, matrix, state); + } else { + ApplyGateL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 3) { + ApplyGateH<2>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<1, 1>(qs, matrix, state); + } else { + ApplyGateL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 3) { + ApplyGateH<3>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<2, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<1, 2>(qs, matrix, state); + } else { + ApplyGateL<0, 3>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 3) { + ApplyGateH<4>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<3, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<2, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<1, 3>(qs, matrix, state); + } else { + ApplyGateL<0, 4>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 3) { + ApplyGateH<5>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<4, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<3, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<2, 3>(qs, matrix, state); + } else { + ApplyGateL<1, 4>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 3) { + ApplyGateH<6>(qs, matrix, state); + } else if (qs[1] > 3) { + ApplyGateL<5, 1>(qs, matrix, state); + } else if (qs[2] > 3) { + ApplyGateL<4, 2>(qs, matrix, state); + } else if (qs[3] > 3) { + ApplyGateL<3, 3>(qs, matrix, state); + } else { + ApplyGateL<2, 4>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using AVX512 instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. 
+ */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + // Assume cqs[0] < cqs[1] < cqs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 1: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 2: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 3: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); + } + } + 
break; + case 4: + if (qs[0] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[2] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); + } + } else if (qs[3] > 3) { + if (cqs[0] > 3) { + ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 3) { + ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * @return The size of SIMD register if applicable. 
+ */ + static unsigned SIMDRegisterSize() { + return 16; + } + + private: + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 ru, iu, rn, in; + __m512 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + auto m = GetMasks1(qs); + + unsigned k = 4 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, size, raw_size, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m512 rn, in; + __m512 rs[gsize], is[gsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks2(qs); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 4 + H; + unsigned n = 
state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 ru, iu, rn, in; + __m512 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_mul_ps(rs[0], ru); + in = _mm512_mul_ps(rs[0], iu); + rn = _mm512_fnmadd_ps(is[0], iu, rn); + in = _mm512_fmadd_ps(is[0], ru, in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm512_set1_ps(v[j]); + iu = _mm512_set1_ps(v[j + 1]); + rn = _mm512_fmadd_ps(rs[l], ru, rn); + in = _mm512_fmadd_ps(rs[l], iu, in); + rn = _mm512_fnmadd_ps(is[l], iu, rn); + in = _mm512_fmadd_ps(is[l], ru, in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); + + unsigned k = 4 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m512 rn, in; + __m512 rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k] = _mm512_load_ps(p0 + p); + is[k] = _mm512_load_ps(p0 + p + 16); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512 w[1 << (1 + 2 * H)]; + + auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned k = 4 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, + uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, + const __m512i* idx, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m512 rn, in; + __m512 rs[gsize], is[gsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + uint64_t p = _pdep_u64(k, qmaskh); + + rs[k2] = _mm512_load_ps(p0 + p); + is[k2] = _mm512_load_ps(p0 + p + 16); + + for (unsigned l = 1; l < lsize; ++l) { + rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); + is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm512_mul_ps(rs[0], w[j]); + in = _mm512_mul_ps(rs[0], w[j + 1]); + rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); + in = _mm512_fmadd_ps(is[0], w[j], in); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm512_fmadd_ps(rs[l], w[j], rn); + in = _mm512_fmadd_ps(rs[l], w[j + 1], in); + rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); + in = _mm512_fmadd_ps(is[l], w[j], in); + + j += 2; + } + + uint64_t p = _pdep_u64(k, qmaskh); + + _mm512_store_ps(p0 + p, rn); + _mm512_store_ps(p0 + p + 16, in); + } + }; + + __m512i idx[1 << L]; + __m512 w[1 << (1 + 2 * H + L)]; + + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = 
UnitarySpace::MinRowSize(state.num_qubits()); + + if (CH) { + auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 4 + H + cqs.size(); + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); + } else { + auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); + FillPermutationIndices(m.qmaskl, idx); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 4 + H + cqs.size() - m.cl; + unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + + for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, + m.cvalsh, idx, size, raw_size, state.get()); + } + } + + template + static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) { + constexpr unsigned lsize = 1 << L; + + for (unsigned i = 0; i < lsize; ++i) { + unsigned p[16]; + + for (unsigned j = 0; j < 16; ++j) { + p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); + } + + idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], + p[9], p[8], p[7], p[6], p[5], p[4], + p[3], p[2], p[1], p[0]); + } + } + + For for_; +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARY_CALCULATOR_AVX512_H_ diff --git a/tpls/qsim/unitary_calculator_basic.h b/tpls/qsim/unitary_calculator_basic.h new file mode 100644 index 0000000..6b1821a --- /dev/null +++ b/tpls/qsim/unitary_calculator_basic.h @@ -0,0 +1,259 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARY_CALCULATOR_BASIC_H_ +#define UNITARY_CALCULATOR_BASIC_H_ + +#include +#include +#include +#include + +#include "simulator.h" +#include "unitaryspace_basic.h" + +namespace qsim { +namespace unitary { + +/** + * Quantum circuit unitary calculator without vectorization. + */ +template +class UnitaryCalculatorBasic final : public SimulatorBase { + public: + using UnitarySpace = UnitarySpaceBasic; + using Unitary = typename UnitarySpace::Unitary; + using fp_type = typename UnitarySpace::fp_type; + + using StateSpace = UnitarySpace; + using State = Unitary; + + template + explicit UnitaryCalculatorBasic(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using non-vectorized instructions. + * @param qs Indices of the qubits affected by this gate. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 1: + ApplyGateH<1>(qs, matrix, state); + break; + case 2: + ApplyGateH<2>(qs, matrix, state); + break; + case 3: + ApplyGateH<3>(qs, matrix, state); + break; + case 4: + ApplyGateH<4>(qs, matrix, state); + break; + case 5: + ApplyGateH<5>(qs, matrix, state); + break; + case 6: + ApplyGateH<6>(qs, matrix, state); + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using non-vectorized instructions. 
+ * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 1: + ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state); + break; + case 2: + ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state); + break; + case 3: + ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state); + break; + case 4: + ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state); + break; + default: + // Not implemented. + break; + } + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 1; + } + + private: + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + fp_type rn, in; + fp_type rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; 
+ + j += 2; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); + } + + template + void ApplyControlledGateH(const std::vector& qs, + const std::vector& cqs, + uint64_t cvals, const fp_type* matrix, + State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + fp_type rn, in; + fp_type rs[hsize], is[hsize]; + + uint64_t r = i % size; + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) == cvalsh) { + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = *(p0 + xss[k]); + is[k] = *(p0 + xss[k] + 1); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = rs[0] * v[j] - is[0] * v[j + 1]; + in = rs[0] * v[j + 1] + is[0] * v[j]; + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn += rs[l] * v[j] - is[l] * v[j + 1]; + in += rs[l] * v[j + 1] + is[l] * v[j]; + + j += 2; + } + + *(p0 + xss[k]) = rn; + *(p0 + xss[k] + 1) = in; + } + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + + unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); + } + + For for_; +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARY_CALCULATOR_BASIC_H_ diff --git a/tpls/qsim/unitary_calculator_sse.h b/tpls/qsim/unitary_calculator_sse.h new file mode 100644 index 0000000..a3c3f2e --- /dev/null +++ b/tpls/qsim/unitary_calculator_sse.h @@ -0,0 +1,639 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARY_CALCULATOR_SSE_H_ +#define UNITARY_CALCULATOR_SSE_H_ + +#include + +#include +#include +#include +#include + +#include "simulator.h" +#include "unitaryspace_sse.h" + +namespace qsim { +namespace unitary { + +/** + * Quantum circuit unitary calculator with SSE vectorization. + */ +template +class UnitaryCalculatorSSE final : public SimulatorBase { + public: + using UnitarySpace = UnitarySpaceSSE; + using Unitary = typename UnitarySpace::Unitary; + using fp_type = typename UnitarySpace::fp_type; + + using StateSpace = UnitarySpace; + using State = Unitary; + + template + explicit UnitaryCalculatorSSE(ForArgs&&... args) : for_(args...) {} + + /** + * Applies a gate using SSE instructions. + * @param qs Indices of the qubits affected by this gate. 
+ * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyGate(const std::vector& qs, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . + + switch (qs.size()) { + case 1: + if (qs[0] > 1) { + ApplyGateH<1>(qs, matrix, state); + } else { + ApplyGateL<0, 1>(qs, matrix, state); + } + break; + case 2: + if (qs[0] > 1) { + ApplyGateH<2>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<1, 1>(qs, matrix, state); + } else { + ApplyGateL<0, 2>(qs, matrix, state); + } + break; + case 3: + if (qs[0] > 1) { + ApplyGateH<3>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<2, 1>(qs, matrix, state); + } else { + ApplyGateL<1, 2>(qs, matrix, state); + } + break; + case 4: + if (qs[0] > 1) { + ApplyGateH<4>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<3, 1>(qs, matrix, state); + } else { + ApplyGateL<2, 2>(qs, matrix, state); + } + break; + case 5: + if (qs[0] > 1) { + ApplyGateH<5>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<4, 1>(qs, matrix, state); + } else { + ApplyGateL<3, 2>(qs, matrix, state); + } + break; + case 6: + if (qs[0] > 1) { + ApplyGateH<6>(qs, matrix, state); + } else if (qs[1] > 1) { + ApplyGateL<5, 1>(qs, matrix, state); + } else { + ApplyGateL<4, 2>(qs, matrix, state); + } + break; + default: + // Not implemented. + break; + } + } + + /** + * Applies a controlled gate using SSE instructions. + * @param qs Indices of the qubits affected by this gate. + * @param cqs Indices of control qubits. + * @param cvals Bit mask of control qubit values. + * @param matrix Matrix representation of the gate to be applied. + * @param state The state of the system, to be updated by this method. + */ + void ApplyControlledGate(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + // Assume qs[0] < qs[1] < qs[2] < ... . 
+ // Assume cqs[0] < cqs[1] < cqs[2] < ... . + + if (cqs.size() == 0) { + ApplyGate(qs, matrix, state); + return; + } + + switch (qs.size()) { + case 1: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 1) { + ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 2: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 1) { + ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 3: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); + } + } else { + if (cqs[0] > 1) { + ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + case 4: + if (qs[0] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); + } + } else if (qs[1] > 1) { + if (cqs[0] > 1) { + ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); + } 
+ } else { + if (cqs[0] > 1) { + ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); + } else { + ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); + } + } + break; + default: + // Not implemented. + break; + } + } + + /** + * @return The size of SIMD register if applicable. + */ + static unsigned SIMDRegisterSize() { + return 4; + } + + private: + template + void ApplyGateH(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t size, + uint64_t row_size, fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m128 ru, iu, rn, in; + __m128 rs[hsize], is[hsize]; + + uint64_t r = 4 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_mul_ps(rs[0], ru); + in = _mm_mul_ps(rs[0], iu); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); + } + + template + void ApplyGateL(const std::vector& qs, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, unsigned q0, + uint64_t size, uint64_t row_size, fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m128 rn, in; + __m128 rs[gsize], is[gsize]; + + uint64_t r = 4 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); + + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm_mul_ps(rs[0], w[j]); + in = _mm_mul_ps(rs[0], w[j + 1]); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H + L)]; + + auto m = GetMasks11(qs); + + FillIndices(state.num_qubits(), qs, ms, xss); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, w, ms, xss, qs[0], size, raw_size, state.get()); + } + + template + void ApplyControlledGateHH(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m128 ru, iu, rn, in; + __m128 rs[hsize], is[hsize]; + + uint64_t r = 4 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_mul_ps(rs[0], ru); + in = _mm_mul_ps(rs[0], iu); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + ru = _mm_set1_ps(v[j]); + iu = _mm_set1_ps(v[j + 1]); + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); + in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + + auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateHL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned hsize = 1 << H; + + __m128 rn, in; + __m128 rs[hsize], is[hsize]; + + uint64_t r = 4 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + rs[k] = _mm_load_ps(p0 + xss[k]); + is[k] = _mm_load_ps(p0 + xss[k] + 4); + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm_mul_ps(rs[0], w[j]); + in = _mm_mul_ps(rs[0], w[j + 1]); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); + + j += 2; + + for (unsigned l = 1; l < hsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H)]; + + auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals); + FillIndices(state.num_qubits(), qs, ms, xss); + FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + for_.Run(size * size2, f, + w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); + } + + template + void ApplyControlledGateL(const std::vector& qs, + const std::vector& cqs, uint64_t cvals, + const fp_type* matrix, State& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, + const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, + uint64_t cmaskh, unsigned q0, uint64_t size, uint64_t row_size, + fp_type* rstate) { + constexpr unsigned gsize = 1 << (H + L); + constexpr unsigned hsize = 1 << H; + constexpr unsigned lsize = 1 << L; + + __m128 rn, in; + __m128 rs[gsize], is[gsize]; + + uint64_t r = 4 * (i % size); + uint64_t s = i / size; + + uint64_t t = r & ms[0]; + for (unsigned j = 1; j <= H; ++j) { + r *= 2; + t |= r & ms[j]; + } + + if ((t & cmaskh) != cvalsh) return; + + auto p0 = rstate + row_size * s + 2 * t; + + for (unsigned k = 0; k < hsize; ++k) { + unsigned k2 = lsize * k; + + rs[k2] = _mm_load_ps(p0 + xss[k]); + is[k2] = _mm_load_ps(p0 + xss[k] + 4); + + if (L == 1) { + rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) + : _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) + : _mm_shuffle_ps(is[k2], is[k2], 78); + } else if (L == 2) { + rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); + is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); + rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); + is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); + rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); + is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); + } + } + + uint64_t j = 0; + + for (unsigned k = 0; k < hsize; ++k) { + rn = _mm_mul_ps(rs[0], w[j]); + in = _mm_mul_ps(rs[0], w[j + 1]); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); + + j += 2; + + for (unsigned l = 1; l < gsize; ++l) { + rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); + in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); + rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); + in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); + + j += 2; + } + + _mm_store_ps(p0 + xss[k], rn); + _mm_store_ps(p0 + xss[k] + 4, in); + } + }; + + uint64_t ms[H + 1]; + uint64_t xss[1 << H]; + __m128 w[1 << (1 + 2 * H + L)]; + + FillIndices(state.num_qubits(), qs, ms, xss); + + unsigned k = 2 + H; + unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; + uint64_t size = uint64_t{1} << n; + uint64_t size2 = uint64_t{1} << state.num_qubits(); + uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); + + if (CH) { + auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); + FillMatrix(m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size * size2, f, w, ms, xss, + m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get()); + } else { + auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); + FillControlledMatrixL( + m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); + + for_.Run(size * size2, f, w, ms, xss, + m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get()); + } + } + + For for_; +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARY_CALCULATOR_SSE_H_ diff --git a/tpls/qsim/unitaryspace.h b/tpls/qsim/unitaryspace.h new file mode 100644 index 0000000..b5e2691 --- /dev/null +++ b/tpls/qsim/unitaryspace.h @@ -0,0 +1,65 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARYSPACE_H_ +#define UNITARYSPACE_H_ + +#include + +namespace qsim { + +namespace unitary { + +/** + * Abstract class containing routines for general unitary matrix manipulations. + * "AVX", "AVX512", "Basic", and "SSE" implementations are provided. + */ +template class VectorSpace, typename... 
VSTypeParams> +class UnitarySpace : public VectorSpace { + private: + using Base = VectorSpace; + + public: + using fp_type = typename Base::fp_type; + using Unitary = typename Base::Vector; + + template + UnitarySpace(ForArgs&&... args) : Base(args...) {} + + static Unitary CreateUnitary(unsigned num_qubits) { + return Base::Create(num_qubits); + } + + static Unitary CreateUnitary(fp_type* p, unsigned num_qubits) { + return Base::Create(p, num_qubits); + } + + static Unitary NullUnitary() { + return Base::Null(); + } + + static uint64_t Size(unsigned num_qubits) { + return uint64_t{1} << num_qubits; + }; + + void CopyUnitary(const Unitary& src, Unitary& dest) const { + Base::Copy(src, dest); + } +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARYSPACE_H_ diff --git a/tpls/qsim/unitaryspace_avx.h b/tpls/qsim/unitaryspace_avx.h new file mode 100644 index 0000000..c1ec59d --- /dev/null +++ b/tpls/qsim/unitaryspace_avx.h @@ -0,0 +1,112 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARYSPACE_AVX_H_ +#define UNITARYSPACE_AVX_H_ + +#include + +#include +#include +#include +#include + +#include "unitaryspace.h" +#include "vectorspace.h" + +namespace qsim { + +namespace unitary { + +/** + * Object containing context and routines for unitary manipulations. + * Unitary is a vectorized sequence of eight real components followed by eight + * imaginary components. 
Eight single-precison floating numbers can be loaded + * into an AVX register. + */ +template +struct UnitarySpaceAVX : + public UnitarySpace, VectorSpace, For, float> { + private: + using Base = UnitarySpace, + qsim::VectorSpace, For, float>; + + public: + using Unitary = typename Base::Unitary; + using fp_type = typename Base::fp_type; + + template + explicit UnitarySpaceAVX(ForArgs&&... args) : Base(args...) {} + + static uint64_t MinRowSize(unsigned num_qubits) { + return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits)); + }; + + static uint64_t MinSize(unsigned num_qubits) { + return Base::Size(num_qubits) * MinRowSize(num_qubits); + }; + + void SetAllZeros(Unitary& state) const { + __m256 val0 = _mm256_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) { + _mm256_store_ps(p + 16 * i, val); + _mm256_store_ps(p + 16 * i + 8, val); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get()); + } + + void SetIdentity(Unitary& state) { + SetAllZeros(state); + + auto f = [](unsigned n, unsigned m, uint64_t i, + uint64_t row_size, fp_type* p) { + p[row_size * i + (16 * (i / 8)) + (i % 8)] = 1; + }; + + uint64_t size = Base::Size(state.num_qubits()); + uint64_t row_size = MinRowSize(state.num_qubits()); + Base::for_.Run(size, f, row_size, state.get()); + } + + static std::complex GetEntry(const Unitary& state, + uint64_t i, uint64_t j) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (16 * (j / 8)) + (j % 8); + return std::complex(state.get()[row_size * i + k], + state.get()[row_size * i + k + 8]); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, + const std::complex& ampl) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (16 * (j / 8)) + (j % 8); + state.get()[row_size * i + k] = std::real(ampl); + state.get()[row_size * i + k + 8] = std::imag(ampl); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, + 
fp_type im) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (16 * (j / 8)) + (j % 8); + state.get()[row_size * i + k] = re; + state.get()[row_size * i + k + 8] = im; + } +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARYSPACE_AVX_H_ diff --git a/tpls/qsim/unitaryspace_avx512.h b/tpls/qsim/unitaryspace_avx512.h new file mode 100644 index 0000000..4c23dc9 --- /dev/null +++ b/tpls/qsim/unitaryspace_avx512.h @@ -0,0 +1,112 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARYSPACE_AVX512_H_ +#define UNITARYSPACE_AVX512_H_ + +#include + +#include +#include +#include +#include + +#include "unitaryspace.h" +#include "vectorspace.h" + +namespace qsim { + +namespace unitary { + +/** + * Object containing context and routines for unitary manipulations. + * State is a vectorized sequence of sixteen real components followed by + * sixteen imaginary components. Sixteen single-precison floating numbers can + * be loaded into an AVX512 register. + */ +template +struct UnitarySpaceAVX512 : + public UnitarySpace, VectorSpace, For, float> { + private: + using Base = UnitarySpace, + qsim::VectorSpace, For, float>; + + public: + using Unitary = typename Base::Unitary; + using fp_type = typename Base::fp_type; + + template + explicit UnitarySpaceAVX512(ForArgs&&... args) : Base(args...) 
{} + + static uint64_t MinRowSize(unsigned num_qubits) { + return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits)); + }; + + static uint64_t MinSize(unsigned num_qubits) { + return Base::Size(num_qubits) * MinRowSize(num_qubits); + }; + + void SetAllZeros(Unitary& state) const { + __m512 val0 = _mm512_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) { + _mm512_store_ps(p + 32 * i, val0); + _mm512_store_ps(p + 32 * i + 16, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get()); + } + + void SetIdentity(Unitary& state) { + SetAllZeros(state); + + auto f = [](unsigned n, unsigned m, uint64_t i, + uint64_t row_size, fp_type* p) { + p[row_size * i + (32 * (i / 16)) + (i % 16)] = 1; + }; + + uint64_t size = Base::Size(state.num_qubits()); + uint64_t row_size = MinRowSize(state.num_qubits()); + Base::for_.Run(size, f, row_size, state.get()); + } + + static std::complex GetEntry(const Unitary& state, + uint64_t i, uint64_t j) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (32 * (j / 16)) + (j % 16); + return std::complex(state.get()[row_size * i + k], + state.get()[row_size * i + k + 16]); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, + const std::complex& ampl) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (32 * (j / 16)) + (j % 16); + state.get()[row_size * i + k] = std::real(ampl); + state.get()[row_size * i + k + 16] = std::imag(ampl); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, + fp_type im) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (32 * (j / 16)) + (j % 16); + state.get()[row_size * i + k] = re; + state.get()[row_size * i + k + 16] = im; + } +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARYSPACE_AVX512_H_ diff --git a/tpls/qsim/unitaryspace_basic.h b/tpls/qsim/unitaryspace_basic.h new file mode 100644 index 0000000..2db14b6 --- 
/dev/null +++ b/tpls/qsim/unitaryspace_basic.h @@ -0,0 +1,103 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARYSPACE_BASIC_H_ +#define UNITARYSPACE_BASIC_H_ + +#include +#include +#include + +#include "unitaryspace.h" +#include "vectorspace.h" + +namespace qsim { + +namespace unitary { + +/** + * Object containing context and routines for unitary manipulations. + * Unitary is a non-vectorized sequence of one real amplitude followed by + * one imaginary amplitude. + */ +template +struct UnitarySpaceBasic + : public UnitarySpace, VectorSpace, For, FP> { + private: + using Base = UnitarySpace, + qsim::VectorSpace, For, FP>; + + public: + using Unitary = typename Base::Unitary; + using fp_type = typename Base::fp_type; + + template + explicit UnitarySpaceBasic(ForArgs&&... args) : Base(args...) 
{} + + static uint64_t MinRowSize(unsigned num_qubits) { + return 2 * (uint64_t{1} << num_qubits); + }; + + static uint64_t MinSize(unsigned num_qubits) { + return Base::Size(num_qubits) * MinRowSize(num_qubits); + }; + + void SetAllZeros(Unitary& state) const { + auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { + p[2 * i + 0] = 0; + p[2 * i + 1] = 0; + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get()); + } + + void SetIdentity(Unitary& state) { + SetAllZeros(state); + + auto f = [](unsigned n, unsigned m, uint64_t i, + uint64_t row_size, fp_type* p) { + p[row_size * i + 2 * i] = 1; + }; + + uint64_t size = Base::Size(state.num_qubits()); + uint64_t row_size = MinRowSize(state.num_qubits()); + Base::for_.Run(size, f, row_size, state.get()); + } + + static std::complex GetEntry(const Unitary& state, + uint64_t i, uint64_t j) { + uint64_t row_size = MinRowSize(state.num_qubits()); + return std::complex(state.get()[row_size * i + 2 * j], + state.get()[row_size * i + 2 * j + 1]); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, + const std::complex& ampl) { + uint64_t row_size = MinRowSize(state.num_qubits()); + state.get()[row_size * i + 2 * j] = std::real(ampl); + state.get()[row_size * i + 2 * j + 1] = std::imag(ampl); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, + fp_type re, fp_type im) { + uint64_t row_size = MinRowSize(state.num_qubits()); + state.get()[row_size * i + 2 * j] = re; + state.get()[row_size * i + 2 * j + 1] = im; + } +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARYSPACE_BASIC_H_ diff --git a/tpls/qsim/unitaryspace_sse.h b/tpls/qsim/unitaryspace_sse.h new file mode 100644 index 0000000..f3762fb --- /dev/null +++ b/tpls/qsim/unitaryspace_sse.h @@ -0,0 +1,112 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UNITARYSPACE_SSE_H_ +#define UNITARYSPACE_SSE_H_ + +#include + +#include +#include +#include +#include + +#include "unitaryspace.h" +#include "vectorspace.h" + +namespace qsim { + +namespace unitary { + +/** + * Object containing context and routines for unitary manipulations. + * Unitary is a vectorized sequence of four real components followed by four + * imaginary components. Four single-precison floating numbers can be loaded + * into an SSE register. + */ +template +struct UnitarySpaceSSE : + public UnitarySpace, VectorSpace, For, float> { + private: + using Base = UnitarySpace, + qsim::VectorSpace, For, float>; + + public: + using Unitary = typename Base::Unitary; + using fp_type = typename Base::fp_type; + + template + explicit UnitarySpaceSSE(ForArgs&&... args) : Base(args...) 
{} + + static uint64_t MinRowSize(unsigned num_qubits) { + return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits)); + }; + + static uint64_t MinSize(unsigned num_qubits) { + return Base::Size(num_qubits) * MinRowSize(num_qubits); + }; + + void SetAllZeros(Unitary& state) const { + __m128 val0 = _mm_setzero_ps(); + + auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) { + _mm_store_ps(p + 8 * i, val0); + _mm_store_ps(p + 8 * i + 4, val0); + }; + + Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get()); + } + + void SetIdentity(Unitary& state) { + SetAllZeros(state); + + auto f = [](unsigned n, unsigned m, uint64_t i, + uint64_t row_size, fp_type* p) { + p[row_size * i + (8 * (i / 4)) + (i % 4)] = 1; + }; + + uint64_t size = Base::Size(state.num_qubits()); + uint64_t row_size = MinRowSize(state.num_qubits()); + Base::for_.Run(size, f, row_size, state.get()); + } + + static std::complex GetEntry(const Unitary& state, + uint64_t i, uint64_t j) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (8 * (j / 4)) + (j % 4); + return std::complex(state.get()[row_size * i + k], + state.get()[row_size * i + k + 4]); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, + const std::complex& ampl) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (8 * (j / 4)) + (j % 4); + state.get()[row_size * i + k] = std::real(ampl); + state.get()[row_size * i + k + 4] = std::imag(ampl); + } + + static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, + fp_type im) { + uint64_t row_size = MinRowSize(state.num_qubits()); + uint64_t k = (8 * (j / 4)) + (j % 4); + state.get()[row_size * i + k] = re; + state.get()[row_size * i + k + 4] = im; + } +}; + +} // namespace unitary +} // namespace qsim + +#endif // UNITARYSPACE_SSE_H_ diff --git a/tpls/qsim/util.h b/tpls/qsim/util.h new file mode 100644 index 0000000..726a019 --- /dev/null +++ b/tpls/qsim/util.h @@ -0,0 +1,89 @@ +// 
Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTIL_H_ +#define UTIL_H_ + +#include +#include +#include +#include +#include +#include +#include + +namespace qsim { + +template +inline void SplitString( + const std::string& str, char delim, Container& words) { + words.resize(0); + + std::string word; + std::stringstream ss(str); + + while (std::getline(ss, word, delim)) { + words.push_back(std::move(word)); + } +} + +template +inline void SplitString( + const std::string& str, char delim, Op op, Container& words) { + words.resize(0); + + std::string word; + std::stringstream ss(str); + + while (std::getline(ss, word, delim)) { + words.push_back(op(word)); + } +} + +inline double GetTime() { + using namespace std::chrono; + steady_clock::duration since_epoch = steady_clock::now().time_since_epoch(); + return double(since_epoch.count() * steady_clock::period::num) + / steady_clock::period::den; +} + +template +inline DistrRealType RandomValue(RGen& rgen, DistrRealType max_value) { + std::uniform_real_distribution distr(0.0, max_value); + return distr(rgen); +} + +template +inline std::vector GenerateRandomValues( + uint64_t num_samples, unsigned seed, DistrRealType max_value) { + std::vector rs; + rs.reserve(num_samples + 1); + + std::mt19937 rgen(seed); + std::uniform_real_distribution distr(0.0, max_value); + + for (uint64_t i = 0; i < num_samples; ++i) { + rs.emplace_back(distr(rgen)); + } + + 
std::sort(rs.begin(), rs.end()); + // Populate the final element to prevent sanitizer errors. + rs.emplace_back(max_value); + + return rs; +} + +} // namespace qsim + +#endif // UTIL_H_ diff --git a/tpls/qsim/util_cpu.h b/tpls/qsim/util_cpu.h new file mode 100644 index 0000000..8e02425 --- /dev/null +++ b/tpls/qsim/util_cpu.h @@ -0,0 +1,43 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTIL_CPU_H_ +#define UTIL_CPU_H_ + +#ifdef __SSE2__ +# include +#endif + +namespace qsim { + +// This function sets flush-to-zero and denormals-are-zeros MXCSR control +// flags. This prevents rare cases of performance slowdown potentially at +// the cost of a tiny precision loss. +inline void SetFlushToZeroAndDenormalsAreZeros() { +#ifdef __SSE2__ + _mm_setcsr(_mm_getcsr() | 0x8040); +#endif +} + +// This function clears flush-to-zero and denormals-are-zeros MXCSR control +// flags. +inline void ClearFlushToZeroAndDenormalsAreZeros() { +#ifdef __SSE2__ + _mm_setcsr(_mm_getcsr() & ~unsigned{0x8040}); +#endif +} + +} // namespace qsim + +#endif // UTIL_CPU_H_ diff --git a/tpls/qsim/util_cuda.h b/tpls/qsim/util_cuda.h new file mode 100644 index 0000000..5d8cb5d --- /dev/null +++ b/tpls/qsim/util_cuda.h @@ -0,0 +1,128 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef UTIL_CUDA_H_ +#define UTIL_CUDA_H_ + +#ifdef __NVCC__ + #include +#elif __HIP__ + #include +#endif + +#include + +#include "io.h" + +namespace qsim { + +#define ErrorCheck(code) { ErrorAssert((code), __FILE__, __LINE__); } + +inline void ErrorAssert(cudaError_t code, const char* file, unsigned line) { + if (code != cudaSuccess) { + IO::errorf("CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line); + exit(code); + } +} + +template +struct Complex { + __host__ __device__ __forceinline__ Complex() {} + + __host__ __device__ __forceinline__ Complex(const T& re) : re(re), im(0) {} + + __host__ __device__ __forceinline__ Complex(const T& re, const T& im) + : re(re), im(im) {} + + template + __host__ __device__ __forceinline__ Complex& operator=( + const Complex& r) { + re = r.re; + im = r.im; + + return *this; + } + + T re; + T im; +}; + +template +__host__ __device__ __forceinline__ Complex operator+( + const Complex& l, const Complex& r) { + return Complex(l.re + r.re, l.im + r.im); +} + +template +__host__ __device__ __forceinline__ Complex operator+( + const Complex& l, const Complex& r) { + return Complex(l.re + r.re, l.im + r.im); +} + +template +struct Scalar { + using type = T; +}; + +template +struct Scalar> { + using type = T; +}; + +template +struct Plus { + template + __device__ __forceinline__ T operator()(const T& v1, const U& v2) const { + return v1 + v2; + } +}; + +template +struct Product { + __device__ __forceinline__ Complex operator()( + const T& re1, const T& im1, const T& re2, const T& im2) const { + return 
Complex(re1 * re2 + im1 * im2, re1 * im2 - im1 * re2); + } +}; + +template +struct RealProduct { + __device__ __forceinline__ T operator()( + const T& re1, const T& im1, const T& re2, const T& im2) const { + return re1 * re2 + im1 * im2; + } +}; + +template +__device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) { + for (unsigned i = warp_size / 2; i > 0; i /= 2) { + val = op(val, __shfl_down_sync(0xffffffff, val, i)); + } + + return val; +} + +template +__device__ __forceinline__ Complex WarpReduce(Complex val, Op op) { + for (unsigned i = warp_size / 2; i > 0; i /= 2) { + val.re = op(val.re, __shfl_down_sync(0xffffffff, val.re, i)); + val.im = op(val.im, __shfl_down_sync(0xffffffff, val.im, i)); + } + + return val; +} + +} // namespace qsim + +#endif // UTIL_CUDA_H_ diff --git a/tpls/qsim/util_custatevec.h b/tpls/qsim/util_custatevec.h new file mode 100644 index 0000000..36f29ef --- /dev/null +++ b/tpls/qsim/util_custatevec.h @@ -0,0 +1,44 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef UTIL_CUSTATEVEC_H_ +#define UTIL_CUSTATEVEC_H_ + +#include +#include + +#include "io.h" +#include "util_cuda.h" + +namespace qsim { + +inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) { + if (code != CUBLAS_STATUS_SUCCESS) { + IO::errorf("cuBLAS error %i: %s %d\n", code, file, line); + exit(code); + } +} + +inline void ErrorAssert( + custatevecStatus_t code, const char* file, unsigned line) { + if (code != CUSTATEVEC_STATUS_SUCCESS) { + IO::errorf("custatevec error: %s %s %d\n", + custatevecGetErrorString(code), file, line); + exit(code); + } +} + +} // namespace qsim + +#endif // UTIL_CUSTATEVEC_H_ diff --git a/tpls/qsim/vectorspace.h b/tpls/qsim/vectorspace.h new file mode 100644 index 0000000..7b33a53 --- /dev/null +++ b/tpls/qsim/vectorspace.h @@ -0,0 +1,185 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef VECTORSPACE_H_ +#define VECTORSPACE_H_ + +#ifdef _WIN32 + #include +#endif + +#include +#include +#include +#include + +namespace qsim { + +namespace detail { + +inline void do_not_free(void*) {} + +inline void free(void* ptr) { +#ifdef _WIN32 + _aligned_free(ptr); +#else + ::free(ptr); +#endif +} + +} // namespace detail + +// Routines for vector manipulations. 
+template +class VectorSpace { + public: + using fp_type = FP; + + private: + using Pointer = std::unique_ptr; + + public: + class Vector { + public: + Vector() = delete; + + Vector(Pointer&& ptr, unsigned num_qubits) + : ptr_(std::move(ptr)), num_qubits_(num_qubits) {} + + fp_type* get() { + return ptr_.get(); + } + + const fp_type* get() const { + return ptr_.get(); + } + + fp_type* release() { + num_qubits_ = 0; + return ptr_.release(); + } + + unsigned num_qubits() const { + return num_qubits_; + } + + bool requires_copy_to_host() const { + return false; + } + + private: + Pointer ptr_; + unsigned num_qubits_; + }; + + template + VectorSpace(ForArgs&&... args) : for_(args...) {} + + static Vector Create(unsigned num_qubits) { + auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); + #ifdef _WIN32 + Pointer ptr{(fp_type*) _aligned_malloc(size, 64), &detail::free}; + return Vector{std::move(ptr), ptr.get() != nullptr ? num_qubits : 0}; + #else + void* p = nullptr; + if (posix_memalign(&p, 64, size) == 0) { + return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; + } else { + return Null(); + } + #endif + } + + // It is the client's responsibility to make sure that p has at least + // Impl::MinSize(num_qubits) elements. 
+ static Vector Create(fp_type* p, unsigned num_qubits) { + return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; + } + + static Vector Null() { + return Vector{Pointer{nullptr, &detail::free}, 0}; + } + + static bool IsNull(const Vector& vec) { + return vec.get() == nullptr; + } + + static void Free(fp_type* ptr) { + detail::free(ptr); + } + + bool Copy(const Vector& src, Vector& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* src, fp_type* dest) { + dest[i] = src[i]; + }; + + for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest.get()); + + return true; + } + + // It is the client's responsibility to make sure that dest has at least + // Impl::MinSize(src.num_qubits()) elements. + bool Copy(const Vector& src, fp_type* dest) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* src, fp_type* dest) { + dest[i] = src[i]; + }; + + for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest); + + return true; + } + + // It is the client's responsibility to make sure that src has at least + // Impl::MinSize(dest.num_qubits()) elements. + bool Copy(const fp_type* src, Vector& dest) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* src, fp_type* dest) { + dest[i] = src[i]; + }; + + for_.Run(Impl::MinSize(dest.num_qubits()), f, src, dest.get()); + + return true; + } + + // It is the client's responsibility to make sure that src has at least + // min(size, Impl::MinSize(dest.num_qubits())) elements. 
+ bool Copy(const fp_type* src, uint64_t size, Vector& dest) const { + auto f = [](unsigned n, unsigned m, uint64_t i, + const fp_type* src, fp_type* dest) { + dest[i] = src[i]; + }; + + size = std::min(size, Impl::MinSize(dest.num_qubits())); + for_.Run(size, f, src, dest.get()); + + return true; + } + + void DeviceSync() {} + + protected: + For for_; +}; + +} // namespace qsim + +#endif // VECTORSPACE_H_ diff --git a/tpls/qsim/vectorspace_cuda.h b/tpls/qsim/vectorspace_cuda.h new file mode 100644 index 0000000..fd91553 --- /dev/null +++ b/tpls/qsim/vectorspace_cuda.h @@ -0,0 +1,172 @@ +// Copyright 2019 Google LLC. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef VECTORSPACE_CUDA_H_ +#define VECTORSPACE_CUDA_H_ + +#ifdef __NVCC__ + #include + #include +#elif __HIP__ + #include + #include "cuda2hip.h" +#endif + +#include +#include + +namespace qsim { + +namespace detail { + +inline void do_not_free(void*) {} + +inline void free(void* ptr) { + ErrorCheck(cudaFree(ptr)); +} + +} // namespace detail + +// Routines for vector manipulations. 
+template +class VectorSpaceCUDA { + public: + using fp_type = FP; + + private: + using Pointer = std::unique_ptr; + + public: + class Vector { + public: + Vector() = delete; + + Vector(Pointer&& ptr, unsigned num_qubits) + : ptr_(std::move(ptr)), num_qubits_(num_qubits) {} + + fp_type* get() { + return ptr_.get(); + } + + const fp_type* get() const { + return ptr_.get(); + } + + fp_type* release() { + num_qubits_ = 0; + return ptr_.release(); + } + + unsigned num_qubits() const { + return num_qubits_; + } + + bool requires_copy_to_host() const { + return true; + } + + private: + Pointer ptr_; + unsigned num_qubits_; + }; + + template + VectorSpaceCUDA(Args&&... args) {} + + static Vector Create(unsigned num_qubits) { + fp_type* p; + auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); + auto rc = cudaMalloc(&p, size); + + if (rc == cudaSuccess) { + return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; + } else { + return Null(); + } + } + + // It is the client's responsibility to make sure that p has at least + // Impl::MinSize(num_qubits) elements. + static Vector Create(fp_type* p, unsigned num_qubits) { + return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; + } + + static Vector Null() { + return Vector{Pointer{nullptr, &detail::free}, 0}; + } + + static bool IsNull(const Vector& vector) { + return vector.get() == nullptr; + } + + static void Free(fp_type* ptr) { + detail::free(ptr); + } + + bool Copy(const Vector& src, Vector& dest) const { + if (src.num_qubits() != dest.num_qubits()) { + return false; + } + + ErrorCheck( + cudaMemcpy(dest.get(), src.get(), + sizeof(fp_type) * Impl::MinSize(src.num_qubits()), + cudaMemcpyDeviceToDevice)); + + return true; + } + + // It is the client's responsibility to make sure that dest has at least + // Impl::MinSize(src.num_qubits()) elements. 
+ bool Copy(const Vector& src, fp_type* dest) const { + ErrorCheck( + cudaMemcpy(dest, src.get(), + sizeof(fp_type) * Impl::MinSize(src.num_qubits()), + cudaMemcpyDeviceToHost)); + + return true; + } + + // It is the client's responsibility to make sure that src has at least + // Impl::MinSize(dest.num_qubits()) elements. + bool Copy(const fp_type* src, Vector& dest) const { + ErrorCheck( + cudaMemcpy(dest.get(), src, + sizeof(fp_type) * Impl::MinSize(dest.num_qubits()), + cudaMemcpyHostToDevice)); + + return true; + } + + // It is the client's responsibility to make sure that src has at least + // min(size, Impl::MinSize(dest.num_qubits())) elements. + bool Copy(const fp_type* src, uint64_t size, Vector& dest) const { + size = std::min(size, Impl::MinSize(dest.num_qubits())); + ErrorCheck( + cudaMemcpy(dest.get(), src, + sizeof(fp_type) * size, + cudaMemcpyHostToDevice)); + return true; + } + + void DeviceSync() { + ErrorCheck(cudaDeviceSynchronize()); + } + + protected: +}; + +} // namespace qsim + +#endif // VECTORSPACE_CUDA_H_ From 177aaa4b5516bedc7942c91480d293d330397460 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Tue, 5 Nov 2024 22:05:03 -0500 Subject: [PATCH 02/64] Remove redudant qsim folder which was accidentally copied into the top level --- qsim/bits.h | 106 -- qsim/bitstring.h | 97 -- qsim/channel.h | 149 --- qsim/channels_cirq.h | 471 --------- qsim/channels_qsim.h | 117 --- qsim/circuit.h | 36 - qsim/circuit_noisy.h | 108 -- qsim/circuit_qsim_parser.h | 442 -------- qsim/cuda2hip.h | 61 -- qsim/expect.h | 148 --- qsim/formux.h | 30 - qsim/fuser.h | 225 ---- qsim/fuser_basic.h | 411 -------- qsim/fuser_mqubit.h | 1095 -------------------- qsim/gate.h | 216 ---- qsim/gate_appl.h | 231 ----- qsim/gates_cirq.h | 1640 ------------------------------ qsim/gates_qsim.h | 661 ------------ qsim/hybrid.h | 612 ----------- qsim/io.h | 44 - qsim/io_file.h | 71 -- qsim/matrix.h | 296 ------ qsim/mps_simulator.h | 246 ----- 
qsim/mps_statespace.h | 597 ----------- qsim/parfor.h | 123 --- qsim/qtrajectory.h | 435 -------- qsim/run_qsim.h | 262 ----- qsim/run_qsimh.h | 120 --- qsim/seqfor.h | 68 -- qsim/simmux.h | 44 - qsim/simmux_gpu.h | 30 - qsim/simulator.h | 516 ---------- qsim/simulator_avx.h | 1363 ------------------------- qsim/simulator_avx512.h | 846 --------------- qsim/simulator_basic.h | 349 ------- qsim/simulator_cuda.h | 923 ----------------- qsim/simulator_cuda_kernels.h | 683 ------------- qsim/simulator_custatevec.h | 209 ---- qsim/simulator_sse.h | 864 ---------------- qsim/statespace.h | 145 --- qsim/statespace_avx.h | 497 --------- qsim/statespace_avx512.h | 448 -------- qsim/statespace_basic.h | 300 ------ qsim/statespace_cuda.h | 470 --------- qsim/statespace_cuda_kernels.h | 355 ------- qsim/statespace_custatevec.h | 376 ------- qsim/statespace_sse.h | 462 --------- qsim/umux.h | 52 - qsim/unitary_calculator_avx.h | 1028 ------------------- qsim/unitary_calculator_avx512.h | 644 ------------ qsim/unitary_calculator_basic.h | 259 ----- qsim/unitary_calculator_sse.h | 639 ------------ qsim/unitaryspace.h | 65 -- qsim/unitaryspace_avx.h | 112 -- qsim/unitaryspace_avx512.h | 112 -- qsim/unitaryspace_basic.h | 103 -- qsim/unitaryspace_sse.h | 112 -- qsim/util.h | 89 -- qsim/util_cpu.h | 43 - qsim/util_cuda.h | 128 --- qsim/util_custatevec.h | 44 - qsim/vectorspace.h | 185 ---- qsim/vectorspace_cuda.h | 172 ---- 63 files changed, 21785 deletions(-) delete mode 100644 qsim/bits.h delete mode 100644 qsim/bitstring.h delete mode 100644 qsim/channel.h delete mode 100644 qsim/channels_cirq.h delete mode 100644 qsim/channels_qsim.h delete mode 100644 qsim/circuit.h delete mode 100644 qsim/circuit_noisy.h delete mode 100644 qsim/circuit_qsim_parser.h delete mode 100644 qsim/cuda2hip.h delete mode 100644 qsim/expect.h delete mode 100644 qsim/formux.h delete mode 100644 qsim/fuser.h delete mode 100644 qsim/fuser_basic.h delete mode 100644 qsim/fuser_mqubit.h delete mode 100644 
qsim/gate.h delete mode 100644 qsim/gate_appl.h delete mode 100644 qsim/gates_cirq.h delete mode 100644 qsim/gates_qsim.h delete mode 100644 qsim/hybrid.h delete mode 100644 qsim/io.h delete mode 100644 qsim/io_file.h delete mode 100644 qsim/matrix.h delete mode 100644 qsim/mps_simulator.h delete mode 100644 qsim/mps_statespace.h delete mode 100644 qsim/parfor.h delete mode 100644 qsim/qtrajectory.h delete mode 100644 qsim/run_qsim.h delete mode 100644 qsim/run_qsimh.h delete mode 100644 qsim/seqfor.h delete mode 100644 qsim/simmux.h delete mode 100644 qsim/simmux_gpu.h delete mode 100644 qsim/simulator.h delete mode 100644 qsim/simulator_avx.h delete mode 100644 qsim/simulator_avx512.h delete mode 100644 qsim/simulator_basic.h delete mode 100644 qsim/simulator_cuda.h delete mode 100644 qsim/simulator_cuda_kernels.h delete mode 100644 qsim/simulator_custatevec.h delete mode 100644 qsim/simulator_sse.h delete mode 100644 qsim/statespace.h delete mode 100644 qsim/statespace_avx.h delete mode 100644 qsim/statespace_avx512.h delete mode 100644 qsim/statespace_basic.h delete mode 100644 qsim/statespace_cuda.h delete mode 100644 qsim/statespace_cuda_kernels.h delete mode 100644 qsim/statespace_custatevec.h delete mode 100644 qsim/statespace_sse.h delete mode 100644 qsim/umux.h delete mode 100644 qsim/unitary_calculator_avx.h delete mode 100644 qsim/unitary_calculator_avx512.h delete mode 100644 qsim/unitary_calculator_basic.h delete mode 100644 qsim/unitary_calculator_sse.h delete mode 100644 qsim/unitaryspace.h delete mode 100644 qsim/unitaryspace_avx.h delete mode 100644 qsim/unitaryspace_avx512.h delete mode 100644 qsim/unitaryspace_basic.h delete mode 100644 qsim/unitaryspace_sse.h delete mode 100644 qsim/util.h delete mode 100644 qsim/util_cpu.h delete mode 100644 qsim/util_cuda.h delete mode 100644 qsim/util_custatevec.h delete mode 100644 qsim/vectorspace.h delete mode 100644 qsim/vectorspace_cuda.h diff --git a/qsim/bits.h b/qsim/bits.h deleted file mode 100644 
index 080c866..0000000 --- a/qsim/bits.h +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef BITS_H_ -#define BITS_H_ - -#include - -#ifdef __BMI2__ - -#include - -#include - -namespace qsim { -namespace bits { - -inline uint32_t ExpandBits(uint32_t bits, unsigned n, uint32_t mask) { - return _pdep_u32(bits, mask); -} - -inline uint64_t ExpandBits(uint64_t bits, unsigned n, uint64_t mask) { - return _pdep_u64(bits, mask); -} - -inline uint32_t CompressBits(uint32_t bits, unsigned n, uint32_t mask) { - return _pext_u32(bits, mask); -} - -inline uint64_t CompressBits(uint64_t bits, unsigned n, uint64_t mask) { - return _pext_u64(bits, mask); -} - -} // namespace bits -} // namespace qsim - -#else // __BMI2__ - -namespace qsim { -namespace bits { - -template -inline Integer ExpandBits(Integer bits, unsigned n, Integer mask) { - Integer ebits = 0; - unsigned k = 0; - - for (unsigned i = 0; i < n; ++i) { - if ((mask >> i) & 1) { - ebits |= ((bits >> k) & 1) << i; - ++k; - } - } - - return ebits; -} - -template -inline Integer CompressBits(Integer bits, unsigned n, Integer mask) { - Integer sbits = 0; - unsigned k = 0; - - for (unsigned i = 0; i < n; ++i) { - if ((mask >> i) & 1) { - sbits |= ((bits >> i) & 1) << k; - ++k; - } - } - - return sbits; -} - -} // namespace bits -} // namespace qsim - -#endif // __BMI2__ - -namespace qsim { -namespace bits { - -template -inline 
Integer PermuteBits( - Integer bits, unsigned n, const std::vector& perm) { - Integer pbits = 0; - - for (unsigned i = 0; i < n; ++i) { - pbits |= ((bits >> i) & 1) << perm[i]; - } - - return pbits; -} - -} // namespace bits -} // namespace qsim - -#endif // BITS_H_ diff --git a/qsim/bitstring.h b/qsim/bitstring.h deleted file mode 100644 index b95584b..0000000 --- a/qsim/bitstring.h +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef BITSTRING_H_ -#define BITSTRING_H_ - -#include -#include -#include -#include - -namespace qsim { - -using Bitstring = uint64_t; - -/** - * Reads bitstrings (representing initialized or measured states of qubits) - * from a provided stream object and stores them in a vector. - * @param num_qubits Number of qubits represented in each bitstring. - * @param provider Source of bitstrings; only used for error reporting. - * @param fs The stream to read bitstrings from. - * @param bitstrings Output vector of bitstrings. On success, this will contain - * all bitstrings read in from 'fs'. - * @return True if reading succeeded; false otherwise. - */ -template -bool BitstringsFromStream(unsigned num_qubits, const std::string& provider, - Stream& fs, std::vector& bitstrings) { - bitstrings.resize(0); - bitstrings.reserve(100000); - - // Bitstrings are in text format. One bitstring per line. 
- - do { - char buf[128]; - fs.getline(buf, 128); - - if (fs) { - Bitstring b{0}; - - unsigned p = 0; - while (p < 128 && (buf[p] == '0' || buf[p] == '1')) { - b |= uint64_t(buf[p] - '0') << p; - ++p; - } - - if (p != num_qubits) { - IO::errorf("wrong bitstring length in %s: " - "got %u; should be %u.\n", provider.c_str(), p, num_qubits); - bitstrings.resize(0); - return false; - } - - bitstrings.push_back(b); - } - } while (fs); - - return true; -} - -/** - * Reads bitstrings (representing initialized or measured states of qubits) - * from the given file and stores them in a vector. - * @param num_qubits Number of qubits represented in each bitstring. - * @param file The name of the file to read bitstrings from. - * @param bitstrings Output vector of bitstrings. On success, this will contain - * all bitstrings read in from 'file'. - * @return True if reading succeeded; false otherwise. - */ -template -inline bool BitstringsFromFile(unsigned num_qubits, const std::string& file, - std::vector& bitstrings) { - auto fs = IO::StreamFromFile(file); - - if (!fs) { - return false; - } else { - bool rc = BitstringsFromStream(num_qubits, file, fs, bitstrings); - IO::CloseStream(fs); - return rc; - } -} - -} // namespace qsim - -#endif // BITSTRING_H_ diff --git a/qsim/channel.h b/qsim/channel.h deleted file mode 100644 index 372a174..0000000 --- a/qsim/channel.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CHANNEL_H_ -#define CHANNEL_H_ - -#include -#include - -#include "gate.h" -#include "matrix.h" - -namespace qsim { - -/** - * Kraus operator. - */ -template -struct KrausOperator { - using fp_type = typename Gate::fp_type; - - enum Kind { - kNormal = 0, - kMeasurement = gate::kMeasurement, - }; - - /** - * Kraus operator type; - */ - Kind kind; - - /** - * If true, the Kraus operator is a unitary operator times a constant. - */ - bool unitary; - - /** - * Lower bound on Kraus operator probability. - */ - double prob; - - /** - * Sequence of operations that represent the Kraus operator. This can be just - * one operation. - */ - std::vector ops; - - /** - * Product of K^\dagger and K. This can be empty if unitary = true. - */ - Matrix kd_k; - - /** - * Qubits kd_k acts on. This can be empty if unitary = true. - */ - std::vector qubits; - - /** - * Calculates the product of "K^\dagger K". Sets qubits "K^\dagger K" acts on. 
- */ - void CalculateKdKMatrix() { - if (ops.size() == 1) { - kd_k = ops[0].matrix; - MatrixDaggerMultiply(ops[0].qubits.size(), ops[0].matrix, kd_k); - qubits = ops[0].qubits; - } else if (ops.size() > 1) { - std::set qubit_map; - - for (const auto& op : ops) { - for (unsigned q : op.qubits) { - qubit_map.insert(q); - } - } - - unsigned num_qubits = qubit_map.size(); - - qubits.resize(0); - qubits.reserve(num_qubits); - - for (auto it = qubit_map.begin(); it != qubit_map.end(); ++it) { - qubits.push_back(*it); - } - - MatrixIdentity(unsigned{1} << num_qubits, kd_k); - - for (const auto& op : ops) { - if (op.qubits.size() == num_qubits) { - MatrixMultiply(num_qubits, op.matrix, kd_k); - } else { - unsigned mask = 0; - - for (auto q : op.qubits) { - for (unsigned i = 0; i < num_qubits; ++i) { - if (q == qubits[i]) { - mask |= unsigned{1} << i; - break; - } - } - } - - MatrixMultiply(mask, op.qubits.size(), op.matrix, num_qubits, kd_k); - } - } - - auto m = kd_k; - MatrixDaggerMultiply(num_qubits, m, kd_k); - } - } -}; - -/** - * Quantum channel. - */ -template -using Channel = std::vector>; - -/** - * Makes a channel from the gate. - * @param time The time to place the channel at. - * @param gate The input gate. - * @return The output channel. - */ -template -Channel MakeChannelFromGate(unsigned time, const Gate& gate) { - auto normal = KrausOperator::kNormal; - auto measurement = KrausOperator::kMeasurement; - - auto kind = gate.kind == gate::kMeasurement ? measurement : normal; - - Channel channel = {{kind, true, 1, {gate}}}; - channel[0].ops[0].time = time; - - return channel; -} - -} // namespace qsim - -#endif // CHANNEL_H_ diff --git a/qsim/channels_cirq.h b/qsim/channels_cirq.h deleted file mode 100644 index 69f1df9..0000000 --- a/qsim/channels_cirq.h +++ /dev/null @@ -1,471 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CHANNELS_CIRQ_H_ -#define CHANNELS_CIRQ_H_ - -#include -#include -#include - -#include "channel.h" -#include "gates_cirq.h" - -namespace qsim { - -namespace Cirq { - -template -using Channel = qsim::Channel>; - -/** - * Asymmetric depolarizing channel factory. - */ -template -struct AsymmetricDepolarizingChannel { - static constexpr char name[] = "asymmetric_depolarize"; - - AsymmetricDepolarizingChannel(double p_x, double p_y, double p_z) - : p_x(p_x), p_y(p_y), p_z(p_z) {} - - static Channel Create(unsigned time, unsigned q, - double p_x, double p_y, double p_z) { - double p1 = 1 - p_x - p_y - p_z; - - auto normal = KrausOperator>::kNormal; - - return {{normal, 1, p1, {}}, - {normal, 1, p_x, {X::Create(time, q)}}, - {normal, 1, p_y, {Y::Create(time, q)}}, - {normal, 1, p_z, {Z::Create(time, q)}}}; - } - - static Channel Create(unsigned time, - const std::vector& qubits, - double p_x, double p_y, double p_z) { - double p1 = 1 - p_x - p_y - p_z; - - auto normal = KrausOperator>::kNormal; - - uint64_t size = uint64_t{1} << (2 * qubits.size()); - - Channel channel; - channel.reserve(size); - - for (uint64_t i = 0; i < size; ++i) { - channel.push_back({normal, 1, 0, {}}); - auto& kop = channel.back(); - - kop.ops.reserve(qubits.size()); - - double prob = 1; - - for (unsigned q = 0; q < qubits.size(); ++q) { - unsigned pauli_index = (i >> (2 * q)) & 3; - - switch (pauli_index) { - case 0: - prob *= p1; 
- break; - case 1: - prob *= p_x; - kop.ops.push_back(X::Create(time, q)); - break; - case 2: - prob *= p_y; - kop.ops.push_back(Y::Create(time, q)); - break; - case 3: - prob *= p_z; - kop.ops.push_back(Z::Create(time, q)); - break; - } - } - - kop.prob = prob; - } - - return channel; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, p_x, p_y, p_z); - } - - Channel Create( - unsigned time, const std::vector& qubits) const { - return Create(time, qubits, p_x, p_y, p_z); - } - - double p_x = 0; - double p_y = 0; - double p_z = 0; -}; - -/** - * Returns an asymmetric depolarizing channel factory object. - */ -template -inline AsymmetricDepolarizingChannel asymmetric_depolarize( - double p_x, double p_y, double p_z) { - return AsymmetricDepolarizingChannel(p_x, p_y, p_z); -} - -/** - * Depolarizing channel factory. - */ -template -struct DepolarizingChannel { - static constexpr char name[] = "depolarize"; - - DepolarizingChannel(double p) : p(p) {} - - static Channel Create(unsigned time, unsigned q, double p) { - double p1 = 1 - p; - double p2 = p / 3; - - auto normal = KrausOperator>::kNormal; - - return {{normal, 1, p1, {}}, - {normal, 1, p2, {X::Create(time, q)}}, - {normal, 1, p2, {Y::Create(time, q)}}, - {normal, 1, p2, {Z::Create(time, q)}}}; - } - - static Channel Create( - unsigned time, const std::vector& qubits, double p) { - double p1 = 1 - p; - double p2 = p / 3; - - auto normal = KrausOperator>::kNormal; - - uint64_t size = uint64_t{1} << (2 * qubits.size()); - - Channel channel; - channel.reserve(size); - - for (uint64_t i = 0; i < size; ++i) { - channel.push_back({normal, 1, 0, {}}); - auto& kop = channel.back(); - - kop.ops.reserve(qubits.size()); - - double prob = 1; - - for (unsigned q = 0; q < qubits.size(); ++q) { - unsigned pauli_index = (i >> (2 * q)) & 3; - - switch (pauli_index) { - case 0: - prob *= p1; - break; - case 1: - prob *= p2; - kop.ops.push_back(X::Create(time, q)); - break; - case 2: - prob *= p2; - 
kop.ops.push_back(Y::Create(time, q)); - break; - case 3: - prob *= p2; - kop.ops.push_back(Z::Create(time, q)); - break; - } - } - - kop.prob = prob; - } - - return channel; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, p); - } - - Channel Create( - unsigned time, const std::vector& qubits) const { - return Create(time, qubits, p); - } - - double p = 0; -}; - -/** - * Returns a depolarizing channel factory object. - */ -template -inline DepolarizingChannel depolarize(double p) { - return DepolarizingChannel(p); -} - -/** - * Generalized amplitude damping channel factory. - */ -template -struct GeneralizedAmplitudeDampingChannel { - static constexpr char name[] = "generalized_amplitude_damp"; - - GeneralizedAmplitudeDampingChannel(double p, double gamma) - : p(p), gamma(gamma) {} - - static Channel Create( - unsigned time, unsigned q, double p, double gamma) { - double p1 = p * (1 - gamma); - double p2 = (1 - p) * (1 - gamma); - double p3 = 0; - - fp_type t1 = std::sqrt(p); - fp_type r1 = std::sqrt(p * (1 - gamma)); - fp_type s1 = std::sqrt(p * gamma); - fp_type t2 = std::sqrt(1 - p); - fp_type r2 = std::sqrt((1 - p) * (1 - gamma)); - fp_type s2 = std::sqrt((1 - p) * gamma); - - using M = Cirq::MatrixGate1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, p1, - {M::Create(time, q, {t1, 0, 0, 0, 0, 0, r1, 0})}, - {t1 * t1, 0, 0, 0, 0, 0, r1 * r1, 0}, {q}, - }, - {normal, 0, p2, - {M::Create(time, q, {r2, 0, 0, 0, 0, 0, t2, 0})}, - {r2 * r2, 0, 0, 0, 0, 0, t2 * t2, 0}, {q}, - }, - {normal, 0, p3, - {M::Create(time, q, {0, 0, s1, 0, 0, 0, 0, 0})}, - {0, 0, 0, 0, 0, 0, s1 * s1, 0}, {q}, - }, - {normal, 0, p3, - {M::Create(time, q, {0, 0, 0, 0, s2, 0, 0, 0})}, - {s2 * s2, 0, 0, 0, 0, 0, 0, 0}, {q}, - }, - }; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, p, gamma); - } - - double p = 1; - double gamma = 0; -}; - -/** - * Returns a generalized amplitude damping channel factory 
object. - */ -template -inline GeneralizedAmplitudeDampingChannel generalized_amplitude_damp( - double p, double gamma) { - return GeneralizedAmplitudeDampingChannel(p, gamma); -} - -/** - * Amplitude damping channel factory. - */ -template -struct AmplitudeDampingChannel { - static constexpr char name[] = "amplitude_damp"; - - AmplitudeDampingChannel(double gamma) : gamma(gamma) {} - - static Channel Create(unsigned time, unsigned q, double gamma) { - double p1 = 1 - gamma; - double p2 = 0; - - fp_type r = std::sqrt(p1); - fp_type s = std::sqrt(gamma); - - using M = Cirq::MatrixGate1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, p1, - {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, - {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, - }, - {normal, 0, p2, - {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}, - {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, - }, - }; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, gamma); - } - - double gamma = 0; -}; - -/** - * Returns an amplitude damping channel factory object. - */ -template -inline AmplitudeDampingChannel amplitude_damp(double gamma) { - return AmplitudeDampingChannel(gamma); -} - -/** - * Phase damping channel factory. 
- */ -template -struct PhaseDampingChannel { - static constexpr char name[] = "phase_dump"; - - PhaseDampingChannel(double gamma) : gamma(gamma) {} - - static Channel Create(unsigned time, unsigned q, double gamma) { - double p1 = 1 - gamma; - double p2 = 0; - - fp_type r = std::sqrt(p1); - fp_type s = std::sqrt(gamma); - - using M = Cirq::MatrixGate1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, p1, - {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, - {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, - }, - {normal, 0, p2, - {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})}, - {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, - }, - }; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, gamma); - } - - double gamma = 0; -}; - -/** - * Returns a phase damping channel factory object. - */ -template -inline PhaseDampingChannel phase_damp(double gamma) { - return PhaseDampingChannel(gamma); -} - -/** - * Reset channel factory. - */ -template -struct ResetChannel { - static constexpr char name[] = "reset"; - - static Channel Create(unsigned time, unsigned q) { - using M = Cirq::MatrixGate1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, 0, - {M::Create(time, q, {1, 0, 0, 0, 0, 0, 0, 0})}, - {1, 0, 0, 0, 0, 0, 0, 0}, {q}, - }, - {normal, 0, 0, - {M::Create(time, q, {0, 0, 1, 0, 0, 0, 0, 0})}, - {0, 0, 0, 0, 0, 0, 1, 0}, {q}, - }, - }; - } -}; - -/** - * Returns a reset channel factory object. - */ -template -inline ResetChannel reset() { - return ResetChannel(); -} - -/** - * Phase flip channel factory. 
- */ -template -struct PhaseFlipChannel { - static constexpr char name[] = "phase_flip"; - - PhaseFlipChannel(double p) : p(p) {} - - static Channel Create(unsigned time, unsigned q, double p) { - double p1 = 1 - p; - double p2 = p; - - auto normal = KrausOperator>::kNormal; - - return {{normal, 1, p1, {}}, - {normal, 1, p2, {Z::Create(time, q)}} - }; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, p); - } - - double p = 0; -}; - -/** - * Returns a phase flip channel factory object. - */ -template -inline PhaseFlipChannel phase_flip(double p) { - return PhaseFlipChannel(p); -} - -/** - * Bit flip channel factory. - */ -template -struct BitFlipChannel { - static constexpr char name[] = "bit_flip"; - - BitFlipChannel(double p) : p(p) {} - - static Channel Create(unsigned time, unsigned q, double p) { - double p1 = 1 - p; - double p2 = p; - - auto normal = KrausOperator>::kNormal; - - return {{normal, 1, p1, {}}, - {normal, 1, p2, {X::Create(time, q)}} - }; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, p); - } - - double p = 0; -}; - -/** - * Returns a bit flip channel factory object. - */ -template -inline BitFlipChannel bit_flip(double p) { - return BitFlipChannel(p); -} - -} // namesapce Cirq - -} // namespace qsim - -#endif // CHANNELS_CIRQ_H_ diff --git a/qsim/channels_qsim.h b/qsim/channels_qsim.h deleted file mode 100644 index 5c07bcc..0000000 --- a/qsim/channels_qsim.h +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CHANNELS_QSIM_H_ -#define CHANNELS_QSIM_H_ - -#include -#include -#include - -#include "channel.h" -#include "gates_qsim.h" - -namespace qsim { - -/** - * Amplitude damping channel factory. - */ -template -struct AmplitudeDampingChannel { - AmplitudeDampingChannel(double gamma) : gamma(gamma) {} - - static Channel> Create( - unsigned time, unsigned q, double gamma) { - double p1 = 1 - gamma; - double p2 = 0; - - fp_type r = std::sqrt(p1); - fp_type s = std::sqrt(gamma); - - using M = GateMatrix1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, p1, - {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, - {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, - }, - {normal, 0, p2, - {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}, - {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, - }, - }; - } - - Channel> Create(unsigned time, unsigned q) const { - return Create(time, q, gamma); - } - - double gamma = 0; -}; - -/** - * Returns an amplitude damping channel factory object. - */ -template -inline AmplitudeDampingChannel amplitude_damp(double gamma) { - return AmplitudeDampingChannel(gamma); -} - -/** - * Phase damping channel factory. 
- */ -template -struct PhaseDampingChannel { - PhaseDampingChannel(double gamma) : gamma(gamma) {} - - static Channel> Create( - unsigned time, unsigned q, double gamma) { - double p1 = 1 - gamma; - double p2 = 0; - - fp_type r = std::sqrt(p1); - fp_type s = std::sqrt(gamma); - - using M = GateMatrix1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, p1, - {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, - {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, - }, - {normal, 0, p2, - {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})}, - {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, - }, - }; - } - - Channel> Create(unsigned time, unsigned q) const { - return Create(time, q, gamma); - } - - double gamma = 0; -}; - -/** - * Returns a phase damping channel factory object. - */ -template -inline PhaseDampingChannel phase_damp(double gamma) { - return PhaseDampingChannel(gamma); -} - -} // namespace qsim - -#endif // CHANNELS_QSIM_H_ diff --git a/qsim/circuit.h b/qsim/circuit.h deleted file mode 100644 index 59018ee..0000000 --- a/qsim/circuit.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CIRCUIT_H_ -#define CIRCUIT_H_ - -#include - -namespace qsim { - -/** - * A collection of gates. This object is consumed by `QSim[h]Runner.Run()`. - */ -template -struct Circuit { - unsigned num_qubits; - /** - * The set of gates to be run. Gate times should be ordered. 
- */ - std::vector gates; -}; - -} // namespace qsim - -#endif // CIRCUIT_H_ diff --git a/qsim/circuit_noisy.h b/qsim/circuit_noisy.h deleted file mode 100644 index 40a228d..0000000 --- a/qsim/circuit_noisy.h +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CIRCUIT_NOISY_H_ -#define CIRCUIT_NOISY_H_ - -#include - -#include "circuit.h" -#include "channel.h" - -namespace qsim { - -/** - * Noisy circuit. - */ -template -struct NoisyCircuit { - unsigned num_qubits; - std::vector> channels; -}; - -template -using ncircuit_iterator = typename std::vector>::const_iterator; - -/** - * Makes a noisy circuit from the clean circuit. - * Channels are added after each qubit of each gate of the clean cicuit. - * Roughly equivalent to cirq.Circuit.with_noise. - * @param num_qubits The number of circuit qubits. - * @param gbeg, gend The iterator range [gbeg, gend) of circuit gates. - * @param A channel factory to construct channels. - * @return The output noisy circuit. 
- */ -template -inline NoisyCircuit MakeNoisy( - unsigned num_qubits, - typename std::vector::const_iterator gbeg, - typename std::vector::const_iterator gend, - const ChannelFactory& channel_factory) { - NoisyCircuit ncircuit; - - ncircuit.num_qubits = num_qubits; - ncircuit.channels.reserve(4 * std::size_t(gend - gbeg)); - - for (auto it = gbeg; it != gend; ++it) { - const auto& gate = *it; - - ncircuit.channels.push_back(MakeChannelFromGate(2 * gate.time, gate)); - - for (auto q : gate.qubits) { - ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q)); - } - - for (auto q : gate.controlled_by) { - ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q)); - } - } - - return ncircuit; -} - -/** - * Makes a noisy circuit from the clean circuit. - * Channels are added after each qubit of each gate of the clean cicuit. - * Roughly equivalent to cirq.Circuit.with_noise. - * @param num_qubits The number of circuit qubits. - * @param gates The circuit gates. - * @param A channel factory to construct channels. - * @return The output noisy circuit. - */ -template -inline NoisyCircuit MakeNoisy(unsigned num_qubits, - const std::vector& gates, - const ChannelFactory& channel_factory) { - return - MakeNoisy(num_qubits, gates.begin(), gates.end(), channel_factory); -} - -/** - * Makes a noisy circuit from the clean circuit. - * Channels are added after each qubit of each gate of the clean cicuit. - * Roughly equivalent to cirq.Circuit.with_noise. - * @param circuit The input cicuit. - * @param A channel factory to construct channels. - * @return The output noisy circuit. 
- */ -template -inline NoisyCircuit MakeNoisy(const Circuit& circuit, - const ChannelFactory& channel_factory) { - return MakeNoisy(circuit.num_qubits, circuit.gates.begin(), - circuit.gates.end(), channel_factory); -} - -} // namespace qsim - -#endif // CIRCUIT_NOISY_H_ diff --git a/qsim/circuit_qsim_parser.h b/qsim/circuit_qsim_parser.h deleted file mode 100644 index de7bd89..0000000 --- a/qsim/circuit_qsim_parser.h +++ /dev/null @@ -1,442 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CIRCUIT_QSIM_PARSER_H_ -#define CIRCUIT_QSIM_PARSER_H_ - -#include -#include -#include -#include -#include - -#include "circuit.h" -#include "gates_qsim.h" - -namespace qsim { - -/** - * Parser for the (deprecated) qsim file input format. - * The primary supported interface for designing circuits to simulate with qsim - * is Cirq, which relies on - * the Python-based qsimcirq interface. For C++ applications, Cirq gates can be - * explicitly constructed in code. - */ -template -class CircuitQsimParser final { - public: - /** - * Parses the given input stream into a Circuit object, following the rules - * defined in "docs/input_format.md". - * @param maxtime Maximum gate "time" to read operations for (inclusive). - * @param provider Circuit source; only used for error reporting. - * @param fs The stream to read the circuit from. - * @param circuit Output circuit object. 
If parsing is successful, this will - * contain the circuit defined in 'fs'. - * @return True if parsing succeeds; false otherwise. - */ - template - static bool FromStream(unsigned maxtime, const std::string& provider, - Stream& fs, Circuit>& circuit) { - circuit.num_qubits = 0; - - circuit.gates.resize(0); - circuit.gates.reserve(1024); - - unsigned k = 0; - - std::string line; - line.reserve(128); - - unsigned time; - std::string gate_name; - gate_name.reserve(16); - - unsigned max_time = 0; - unsigned prev_mea_time = 0; - - std::vector last_times; - - while (std::getline(fs, line)) { - ++k; - - if (line.size() == 0 || line[0] == '#') continue; - - std::stringstream ss(line); - - if (circuit.num_qubits == 0) { - ss >> circuit.num_qubits; - if (circuit.num_qubits == 0) { - IO::errorf("invalid number of qubits in %s in line %u.\n", - provider.c_str(), k); - return false; - } - - last_times.resize(circuit.num_qubits, unsigned(-1)); - - continue; - } - - ss >> time >> gate_name; - - if (!ss) { - InvalidGateError(provider, k); - return false; - } - - if (time > maxtime) { - break; - } - - if (gate_name == "c") { - if (!ParseControlledGate(ss, time, - circuit.num_qubits, circuit.gates)) { - InvalidGateError(provider, k); - return false; - } - } else if (!ParseGate(ss, time, circuit.num_qubits, - gate_name, circuit.gates)) { - InvalidGateError(provider, k); - return false; - } - - const auto& gate = circuit.gates.back(); - - if (time < prev_mea_time - || (gate.kind == gate::kMeasurement && time < max_time)) { - IO::errorf("gate crosses the time boundary set by measurement " - "gates in line %u in %s.\n", k, provider.c_str()); - return false; - } - - if (gate.kind == gate::kMeasurement) { - prev_mea_time = time; - } - - if (GateIsOutOfOrder(time, gate.qubits, last_times) - || GateIsOutOfOrder(time, gate.controlled_by, last_times)) { - IO::errorf("gate is out of time order in line %u in %s.\n", - k, provider.c_str()); - return false; - } - - if (time > max_time) { - 
max_time = time; - } - } - - return true; - } - - /** - * Parses the given file into a Circuit object, following the rules defined - * in "docs/input_format.md". - * @param maxtime Maximum gate "time" to read operations for (inclusive). - * @param file The name of the file to read the circuit from. - * @param circuit Output circuit object. If parsing is successful, this will - * contain the circuit defined in 'file'. - * @return True if parsing succeeds; false otherwise. - */ - template - static bool FromFile(unsigned maxtime, const std::string& file, - Circuit>& circuit) { - auto fs = IO::StreamFromFile(file); - - if (!fs) { - return false; - } else { - bool rc = FromStream(maxtime, file, fs, circuit); - IO::CloseStream(fs); - return rc; - } - } - - private: - static void InvalidGateError(const std::string& provider, unsigned line) { - IO::errorf("invalid gate in %s in line %u.\n", provider.c_str(), line); - } - - /** - * Checks formatting for a zero-qubit gate parsed from 'ss'. - * @param ss Input stream containing the gate specification. - */ - static bool ValidateGate(std::stringstream& ss) { - return ss && ss.peek() == std::stringstream::traits_type::eof(); - } - - /** - * Checks formatting for a single-qubit gate parsed from 'ss'. - * @param ss Input stream containing the gate specification. - * @param num_qubits Number of qubits, as defined at the start of the file. - * @param q0 Index of the affected qubit. - */ - static bool ValidateGate(std::stringstream& ss, - unsigned num_qubits, unsigned q0) { - return ss && ss.peek() == std::stringstream::traits_type::eof() - && q0 < num_qubits; - } - - /** - * Checks formatting for a two-qubit gate parsed from 'ss'. - * @param ss Input stream containing the gate specification. - * @param num_qubits Number of qubits, as defined at the start of the file. - * @param q0 Index of the first affected qubit. - * @param q1 Index of the second affected qubit. 
- */ - static bool ValidateGate(std::stringstream& ss, - unsigned num_qubits, unsigned q0, unsigned q1) { - return ss && ss.peek() == std::stringstream::traits_type::eof() - && q0 < num_qubits && q1 < num_qubits && q0 != q1; - } - - /** - * Checks formatting for a multiqubit gate parsed from 'ss'. - * @param ss Input stream containing the gate specification. - * @param num_qubits Number of qubits, as defined at the start of the file. - * @param qubits Indices of affected qubits. - */ - static bool ValidateGate(std::stringstream& ss, unsigned num_qubits, - const std::vector& qubits) { - return ss && ValidateQubits(num_qubits, qubits); - } - - static bool ValidateControlledGate( - unsigned num_qubits, const std::vector& qubits, - const std::vector& controlled_by) { - if (!ValidateQubits(num_qubits, controlled_by)) return false; - - std::size_t i = 0, j = 0; - - while (i < qubits.size() && j < controlled_by.size()) { - if (qubits[i] == controlled_by[j]) { - return false; - } else if (qubits[i] < controlled_by[j]) { - ++i; - } else { - ++j; - } - } - - return true; - } - - static bool ValidateQubits(unsigned num_qubits, - const std::vector& qubits) { - if (qubits.size() == 0 || qubits[0] >= num_qubits) return false; - - // qubits should be sorted. 
- - for (std::size_t i = 1; i < qubits.size(); ++i) { - if (qubits[i] >= num_qubits || qubits[i] == qubits[i - 1]) { - return false; - } - } - - return true; - } - - static bool GateIsOutOfOrder(unsigned time, - const std::vector& qubits, - std::vector& last_times) { - for (auto q : qubits) { - if (last_times[q] != unsigned(-1) && time <= last_times[q]) { - return true; - } - - last_times[q] = time; - } - - return false; - } - - template - static bool ParseGate(Stream& ss, unsigned time, unsigned num_qubits, - const std::string& gate_name, - std::vector& gates) { - unsigned q0, q1; - fp_type phi, theta; - - if (gate_name == "p") { - ss >> phi; - if (!ValidateGate(ss)) return false; - gates.push_back(GateGPh::Create(time, phi)); - } else if (gate_name == "id1") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateId1::Create(time, q0)); - } else if (gate_name == "h") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateHd::Create(time, q0)); - } else if (gate_name == "t") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateT::Create(time, q0)); - } else if (gate_name == "x") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateX::Create(time, q0)); - } else if (gate_name == "y") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateY::Create(time, q0)); - } else if (gate_name == "z") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateZ::Create(time, q0)); - } else if (gate_name == "x_1_2") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateX2::Create(time, q0)); - } else if (gate_name == "y_1_2") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateY2::Create(time, q0)); - } else if (gate_name == "rx") { - ss >> q0 >> phi; - if (!ValidateGate(ss, num_qubits, q0)) 
return false; - gates.push_back(GateRX::Create(time, q0, phi)); - } else if (gate_name == "ry") { - ss >> q0 >> phi; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateRY::Create(time, q0, phi)); - } else if (gate_name == "rz") { - ss >> q0 >> phi; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateRZ::Create(time, q0, phi)); - } else if (gate_name == "rxy") { - ss >> q0 >> theta >> phi; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateRXY::Create(time, q0, theta, phi)); - } else if (gate_name == "hz_1_2") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateHZ2::Create(time, q0)); - } else if (gate_name == "s") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateS::Create(time, q0)); - } else if (gate_name == "id2") { - ss >> q0 >> q1; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateId2::Create(time, q0, q1)); - } else if (gate_name == "cz") { - ss >> q0 >> q1; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateCZ::Create(time, q0, q1)); - } else if (gate_name == "cnot" || gate_name == "cx") { - ss >> q0 >> q1; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateCNot::Create(time, q0, q1)); - } else if (gate_name == "sw") { - ss >> q0 >> q1; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateSwap::Create(time, q0, q1)); - } else if (gate_name == "is") { - ss >> q0 >> q1; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateIS::Create(time, q0, q1)); - } else if (gate_name == "fs") { - ss >> q0 >> q1 >> theta >> phi; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateFS::Create(time, q0, q1, theta, phi)); - } else if (gate_name == "cp") { - ss >> q0 >> q1 >> phi; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - 
gates.push_back(GateCP::Create(time, q0, q1, phi)); - } else if (gate_name == "m") { - std::vector qubits; - qubits.reserve(num_qubits); - - while (ss.good()) { - ss >> q0; - if (ss) { - qubits.push_back(q0); - } else { - return false; - } - } - - gates.push_back(gate::Measurement>::Create( - time, std::move(qubits))); - - if (!ValidateQubits(num_qubits, gates.back().qubits)) return false; - } else { - return false; - } - - return true; - } - - template - static bool ParseControlledGate(Stream& ss, unsigned time, - unsigned num_qubits, - std::vector& gates) { - std::vector controlled_by; - controlled_by.reserve(64); - - std::string gate_name; - gate_name.reserve(16); - - while (1) { - while (ss.good()) { - if (!std::isblank(ss.get())) { - ss.unget(); - break; - } - } - - if (!ss.good()) { - return false; - } - - if (!std::isdigit(ss.peek())) { - break; - } else { - unsigned q; - ss >> q; - - if (!ss.good() || !std::isblank(ss.get())) { - return false; - } - - controlled_by.push_back(q); - } - } - - if (controlled_by.size() == 0) { - return false; - } - - ss >> gate_name; - - if (!ss.good() || !ParseGate(ss, time, - num_qubits, gate_name, gates)) { - return false; - } - - gates.back().ControlledBy(std::move(controlled_by)); - - if (!ValidateControlledGate(num_qubits, gates.back().qubits, - gates.back().controlled_by)) { - return false; - } - - return true; - } -}; - -} // namespace qsim - -#endif // CIRCUIT_QSIM_PARSER_H_ diff --git a/qsim/cuda2hip.h b/qsim/cuda2hip.h deleted file mode 100644 index da2d074..0000000 --- a/qsim/cuda2hip.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2023 Advanced Micro Devices, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_CUDA2HIP_H_ -#define SIMULATOR_CUDA2HIP_H_ - -#define cublasCaxpy hipblasCaxpy -#define cublasCdotc hipblasCdotc -#define cublasCreate hipblasCreate -#define cublasCscal hipblasCscal -#define cublasCsscal hipblasCsscal -#define cublasDestroy hipblasDestroy -#define cublasDznrm2 hipblasDznrm2 -#define cublasHandle_t hipblasHandle_t -#define cublasScnrm2 hipblasScnrm2 -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define cublasStatus_t hipblasStatus_t -#define cublasZaxpy hipblasZaxpy -#define cublasZdotc hipblasZdotc -#define cublasZdscal hipblasZdscal -#define cublasZscal hipblasZscal -#define cuCimagf hipCimagf -#define cuCimag hipCimag -#define cuComplex hipComplex -#define cuCrealf hipCrealf -#define cuCreal hipCreal -#define CUDA_C_32F HIPBLAS_C_32F -#define CUDA_C_64F HIPBLAS_C_64F -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaError_t hipError_t -#define cudaFree hipFree -#define cudaGetErrorString hipGetErrorString -#define cudaMalloc hipMalloc -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpy hipMemcpy -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemset hipMemset -#define cudaPeekAtLastError hipPeekAtLastError -#define cudaSuccess hipSuccess -#define cuDoubleComplex hipDoubleComplex - -template -__device__ __forceinline__ T __shfl_down_sync( - unsigned mask, T var, unsigned int delta, int width = warpSize) { - return __shfl_down(var, delta, width); -} 
- -#endif // SIMULATOR_CUDA2HIP_H_ diff --git a/qsim/expect.h b/qsim/expect.h deleted file mode 100644 index 518d516..0000000 --- a/qsim/expect.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef EXPECT_H_ -#define EXPECT_H_ - -#include - -#include "fuser.h" -#include "gate_appl.h" - -namespace qsim { - -template -struct OpString { - std::complex weight; - std::vector ops; -}; - -/** - * Computes the expectation value of the sum of operator strings (operator - * sequences). Operators can act on any qubits and they can be any supported - * gates. This function uses a temporary state vector. - * @param param Options for gate fusion. - * @param strings Operator strings. - * @param ss StateSpace object required to copy the state vector and compute - * inner products. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param state The state vector of the system. - * @param ket Temporary state vector. - * @return The computed expectation value. 
- */ -template -std::complex ExpectationValue( - const typename Fuser::Parameter& param, - const std::vector>& strings, - const typename Simulator::StateSpace& state_space, - const Simulator& simulator, const typename Simulator::State& state, - typename Simulator::State& ket) { - std::complex eval = 0; - - if (state_space.IsNull(ket) || ket.num_qubits() < state.num_qubits()) { - ket = state_space.Create(state.num_qubits()); - if (state_space.IsNull(ket)) { - IO::errorf("not enough memory: is the number of qubits too large?\n"); - return eval; - } - } - - for (const auto& str : strings) { - if (str.ops.size() == 0) { - eval += str.weight; - continue; - } - - state_space.Copy(state, ket); - - if (str.ops.size() == 1) { - const auto& op = str.ops[0]; - simulator.ApplyGate(op.qubits, op.matrix.data(), ket); - } else { - auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops); - if (fused_gates.size() == 0) { - eval = 0; - break; - } - - for (const auto& fgate : fused_gates) { - ApplyFusedGate(simulator, fgate, ket); - } - } - - eval += str.weight * state_space.InnerProduct(state, ket); - } - - return eval; -} - -/** - * Computes the expectation value of the sum of operator strings (operator - * sequences). Operators can act on any qubits and they can be any supported - * gates except for user-defined controlled gates. Computation is performed - * in place. No additional memory is allocated. The operator strings should - * act on no more than six qubits and they should be fusible into one gate. - * @param strings Operator strings. - * @param simulator Simulator object. Provides specific implementations for - * computing expectation values. - * @param state The state of the system. - * @return The computed expectation value. 
- */ -template -std::complex ExpectationValue( - const std::vector>& strings, - const Simulator& simulator, const typename Simulator::State& state) { - std::complex eval = 0; - - typename Fuser::Parameter param; - param.max_fused_size = 6; - for (const auto& str : strings) { - if (str.ops.size() == 0) { - eval += str.weight; - } else if (str.ops.size() == 1) { - const auto& op = str.ops[0]; - auto r = simulator.ExpectationValue(op.qubits, op.matrix.data(), state); - eval += str.weight * r; - } else { - auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops); - - if (fused_gates.size() != 1) { - IO::errorf("too many fused gates; " - "cannot compute the expectation value.\n"); - eval = 0; - break; - } - - const auto& fgate = fused_gates[0]; - - if (fgate.qubits.size() > 6) { - IO::errorf("operator string acts on too many qubits; " - "cannot compute the expectation value.\n"); - eval = 0; - break; - } - - auto r = simulator.ExpectationValue( - fgate.qubits, fgate.matrix.data(), state); - eval += str.weight * r; - } - } - - return eval; -} - -} // namespace qsim - -#endif // EXPECT_H_ diff --git a/qsim/formux.h b/qsim/formux.h deleted file mode 100644 index 4401e9b..0000000 --- a/qsim/formux.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef FORMUX_H_ -#define FORMUX_H_ - -#ifdef _OPENMP -# include "parfor.h" - namespace qsim { - using For = ParallelFor; - } -#else -# include "seqfor.h" - namespace qsim { - using For = SequentialFor; - } -#endif - -#endif // FORMUX_H_ diff --git a/qsim/fuser.h b/qsim/fuser.h deleted file mode 100644 index e4f3c3b..0000000 --- a/qsim/fuser.h +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FUSER_H_ -#define FUSER_H_ - -#include -#include - -#include "gate.h" -#include "matrix.h" - -namespace qsim { - -/** - * A collection of "fused" gates which can be multiplied together before being - * applied to the state vector. - */ -template -struct GateFused { - /** - * Kind of the first ("parent") gate. - */ - typename Gate::GateKind kind; - /** - * The time index of the first ("parent") gate. - */ - unsigned time; - /** - * A list of qubits these gates act upon. Control qubits for - * explicitly-controlled gates are excluded from this list. - */ - std::vector qubits; - /** - * Pointer to the first ("parent") gate. - */ - const Gate* parent; - /** - * Ordered list of component gates. - */ - std::vector gates; - /** - * Fused gate matrix. - */ - Matrix matrix; -}; - -/** - * A base class for fuser classes with some common functions. 
- */ -template -class Fuser { - protected: - using RGate = typename std::remove_pointer::type; - - static const RGate& GateToConstRef(const RGate& gate) { - return gate; - } - - static const RGate& GateToConstRef(const RGate* gate) { - return *gate; - } - - static std::vector MergeWithMeasurementTimes( - typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast, - const std::vector& times) { - std::vector epochs; - epochs.reserve(glast - gfirst + times.size()); - - std::size_t last = 0; - unsigned max_time = 0; - - for (auto gate_it = gfirst; gate_it < glast; ++gate_it) { - const auto& gate = GateToConstRef(*gate_it); - - if (gate.time > max_time) { - max_time = gate.time; - } - - if (epochs.size() > 0 && gate.time < epochs.back()) { - IO::errorf("gate crosses the time boundary.\n"); - epochs.resize(0); - return epochs; - } - - if (gate.kind == gate::kMeasurement) { - if (epochs.size() == 0 || epochs.back() < gate.time) { - if (!AddBoundary(gate.time, max_time, epochs)) { - epochs.resize(0); - return epochs; - } - } - } - - while (last < times.size() && times[last] <= gate.time) { - unsigned prev = times[last++]; - epochs.push_back(prev); - if (!AddBoundary(prev, max_time, epochs)) { - epochs.resize(0); - return epochs; - } - while (last < times.size() && times[last] <= prev) ++last; - } - } - - if (epochs.size() == 0 || epochs.back() < max_time) { - epochs.push_back(max_time); - } - - return epochs; - } - - template - static void FuseZeroQubitGates(const GateSeq0& gate_seq0, - Parent parent, std::size_t first, - std::vector& fused_gates) { - GateFused* fuse_to = nullptr; - - for (std::size_t i = first; i < fused_gates.size(); ++i) { - auto& fgate = fused_gates[i]; - - if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp - && fgate.parent->controlled_by.size() == 0 - && !fgate.parent->unfusible) { - fuse_to = &fgate; - break; - } - } - - if (fuse_to != nullptr) { - // Fuse zero-qubit gates with the first available 
fused gate. - for (const auto& g : gate_seq0) { - fuse_to->gates.push_back(parent(g)); - } - } else { - auto g0 = parent(gate_seq0[0]); - fused_gates.push_back({g0->kind, g0->time, {}, g0, {g0}, {}}); - - for (std::size_t i = 1; i < gate_seq0.size(); ++i) { - fused_gates.back().gates.push_back(parent(gate_seq0[i])); - } - } - } - - private: - static bool AddBoundary(unsigned time, unsigned max_time, - std::vector& boundaries) { - if (max_time > time) { - IO::errorf("gate crosses the time boundary.\n"); - return false; - } - - boundaries.push_back(time); - return true; - } -}; - -/** - * Multiplies component gate matrices of a fused gate. - * @param gate Fused gate. - */ -template -inline void CalculateFusedMatrix(FusedGate& gate) { - MatrixIdentity(unsigned{1} << gate.qubits.size(), gate.matrix); - - for (auto pgate : gate.gates) { - if (pgate->qubits.size() == 0) { - MatrixScalarMultiply(pgate->matrix[0], pgate->matrix[1], gate.matrix); - } else if (gate.qubits.size() == pgate->qubits.size()) { - MatrixMultiply(gate.qubits.size(), pgate->matrix, gate.matrix); - } else { - unsigned mask = 0; - - for (auto q : pgate->qubits) { - for (std::size_t i = 0; i < gate.qubits.size(); ++i) { - if (q == gate.qubits[i]) { - mask |= unsigned{1} << i; - break; - } - } - } - - MatrixMultiply(mask, pgate->qubits.size(), pgate->matrix, - gate.qubits.size(), gate.matrix); - } - } -} - -/** - * Multiplies component gate matrices for a range of fused gates. - * @param gbeg, gend The iterator range [gbeg, gend) of fused gates. - */ -template -inline void CalculateFusedMatrices(Iterator gbeg, Iterator gend) { - for (auto g = gbeg; g != gend; ++g) { - if (g->kind != gate::kMeasurement) { - CalculateFusedMatrix(*g); - } - } -} - -/** - * Multiplies component gate matrices for a vector of fused gates. - * @param gates The vector of fused gates. 
- */ -template -inline void CalculateFusedMatrices(std::vector& gates) { - CalculateFusedMatrices(gates.begin(), gates.end()); -} - -} // namespace qsim - -#endif // FUSER_H_ diff --git a/qsim/fuser_basic.h b/qsim/fuser_basic.h deleted file mode 100644 index 3191bd2..0000000 --- a/qsim/fuser_basic.h +++ /dev/null @@ -1,411 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FUSER_BASIC_H_ -#define FUSER_BASIC_H_ - -#include -#include -#include -#include - -#include "gate.h" -#include "fuser.h" - -namespace qsim { - -/** - * Stateless object with methods for aggregating `Gate`s into `GateFused`. - * Measurement gates with equal times are fused together. - * User-defined controlled gates (controlled_by.size() > 0) and gates acting on - * more than two qubits are not fused. - * The template parameter Gate can be Gate type or a pointer to Gate type. - * This class is deprecated. It is recommended to use MultiQubitGateFuser - * from fuser_mqubit.h. - */ -template -class BasicGateFuser final : public Fuser { - private: - using Base = Fuser; - using RGate = typename Base::RGate; - - public: - using GateFused = qsim::GateFused; - - /** - * User-specified parameters for gate fusion. - * BasicGateFuser does not use any parameters. - */ - struct Parameter { - unsigned verbosity = 0; - }; - - /** - * Stores sets of gates that can be applied together. Only one- and - * two-qubit gates will get fused. 
To respect specific time boundaries while - * fusing gates, use the other version of this method below. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gates The gates (or pointers to the gates) to be fused. - * Gate times of the gates that act on the same qubits should be ordered. - * Gates that are out of time order should not cross the time boundaries - * set by measurement gates. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates(const Parameter& param, - unsigned max_qubit1, - const std::vector& gates, - bool fuse_matrix = true) { - return FuseGates( - param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. Only one- and - * two-qubit gates will get fused. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gates The gates (or pointers to the gates) to be fused. - * Gate times of the gates that act on the same qubits should be ordered. - * Gates that are out of time order should not cross the time boundaries - * set by `times_to_split_at` or by measurement gates. - * @param times_to_split_at Ordered list of time steps (boundaries) at which - * to separate fused gates. Each element of the output will contain gates - * from a single 'window' in this list. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. 
- */ - static std::vector FuseGates( - const Parameter& param, - unsigned max_qubit1, const std::vector& gates, - const std::vector& times_to_split_at, - bool fuse_matrix = true) { - return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(), - times_to_split_at, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. Only one- and - * two-qubit gates will get fused. To respect specific time boundaries while - * fusing gates, use the other version of this method below. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates - * (or pointers to gates) in. Gate times of the gates that act on the same - * qubits should be ordered. Gates that are out of time order should not - * cross the time boundaries set by measurement gates. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates( - const Parameter& param, unsigned max_qubit1, - typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast, - bool fuse_matrix = true) { - return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. Only one- and - * two-qubit gates will get fused. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates - * (or pointers to gates) in. Gate times of the gates that act on the same - * qubits should be ordered. Gates that are out of time order should not - * cross the time boundaries set by `times_to_split_at` or by measurement - * gates. 
- * @param times_to_split_at Ordered list of time steps (boundaries) at which - * to separate fused gates. Each element of the output will contain gates - * from a single 'window' in this list. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates( - const Parameter& param, unsigned max_qubit1, - typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast, - const std::vector& times_to_split_at, - bool fuse_matrix = true) { - std::vector gates_fused; - - if (gfirst >= glast) return gates_fused; - - std::size_t num_gates = glast - gfirst; - - gates_fused.reserve(num_gates); - - // Merge with measurement gate times to separate fused gates at. - auto times = - Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at); - - // Map to keep track of measurement gates with equal times. - std::map> measurement_gates; - - // Sequence of top level gates the other gates get fused to. - std::vector gates_seq; - - // Sequence of zero-qubit gates. - std::vector gates_seq0; - - // Lattice of gates: qubits "hyperplane" and time direction. - std::vector> gates_lat(max_qubit1); - - // Current unfused gate. - auto gate_it = gfirst; - - std::size_t last_fused_gate_index = 0; - - for (std::size_t l = 0; l < times.size(); ++l) { - gates_seq.resize(0); - gates_seq.reserve(num_gates); - - gates_seq0.resize(0); - gates_seq0.reserve(num_gates); - - for (unsigned k = 0; k < max_qubit1; ++k) { - gates_lat[k].resize(0); - gates_lat[k].reserve(128); - } - - // Fill gates_seq and gates_lat in. 
- for (; gate_it < glast; ++gate_it) { - const auto& gate = Base::GateToConstRef(*gate_it); - - if (gate.time > times[l]) break; - - if (!ValidateGate(gate, max_qubit1, gates_lat)) { - gates_fused.resize(0); - return gates_fused; - } - - if (gate.kind == gate::kMeasurement) { - auto& mea_gates_at_time = measurement_gates[gate.time]; - if (mea_gates_at_time.size() == 0) { - gates_seq.push_back(&gate); - mea_gates_at_time.reserve(max_qubit1); - } - - mea_gates_at_time.push_back(&gate); - } else if (gate.controlled_by.size() > 0 || gate.qubits.size() > 2) { - for (auto q : gate.qubits) { - gates_lat[q].push_back(&gate); - } - for (auto q : gate.controlled_by) { - gates_lat[q].push_back(&gate); - } - gates_seq.push_back(&gate); - } else if (gate.qubits.size() == 1) { - gates_lat[gate.qubits[0]].push_back(&gate); - if (gate.unfusible) { - gates_seq.push_back(&gate); - } - } else if (gate.qubits.size() == 2) { - gates_lat[gate.qubits[0]].push_back(&gate); - gates_lat[gate.qubits[1]].push_back(&gate); - gates_seq.push_back(&gate); - } else { - gates_seq0.push_back(&gate); - } - } - - std::vector last(max_qubit1, 0); - - const RGate* delayed_measurement_gate = nullptr; - - // Fuse gates. - for (auto pgate : gates_seq) { - if (pgate->kind == gate::kMeasurement) { - delayed_measurement_gate = pgate; - } else if (pgate->qubits.size() > 2 - || pgate->controlled_by.size() > 0) { - // Multi-qubit or controlled gate. 
- - for (auto q : pgate->qubits) { - unsigned l = last[q]; - if (gates_lat[q][l] != pgate) { - last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused); - } - ++last[q]; - } - - for (auto q : pgate->controlled_by) { - unsigned l = last[q]; - if (gates_lat[q][l] != pgate) { - last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused); - } - ++last[q]; - } - - gates_fused.push_back({pgate->kind, pgate->time, pgate->qubits, - pgate, {pgate}, {}}); - } else if (pgate->qubits.size() == 1) { - unsigned q0 = pgate->qubits[0]; - - GateFused gate_f = {pgate->kind, pgate->time, {q0}, pgate, {}, {}}; - - last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates); - gate_f.gates.push_back(gates_lat[q0][last[q0]]); - last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates); - - gates_fused.push_back(std::move(gate_f)); - } else if (pgate->qubits.size() == 2) { - unsigned q0 = pgate->qubits[0]; - unsigned q1 = pgate->qubits[1]; - - if (Done(last[q0], pgate->time, gates_lat[q0])) continue; - - GateFused gate_f = - {pgate->kind, pgate->time, {q0, q1}, pgate, {}, {}}; - - do { - last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates); - last[q1] = Advance(last[q1], gates_lat[q1], gate_f.gates); - // Here gates_lat[q0][last[q0]] == gates_lat[q1][last[q1]]. - - gate_f.gates.push_back(gates_lat[q0][last[q0]]); - - last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates); - last[q1] = Advance(last[q1] + 1, gates_lat[q1], gate_f.gates); - } while (NextGate(last[q0], gates_lat[q0], last[q1], gates_lat[q1])); - - gates_fused.push_back(std::move(gate_f)); - } - } - - for (unsigned q = 0; q < max_qubit1; ++q) { - auto l = last[q]; - if (l == gates_lat[q].size()) continue; - - // Orphaned qubit. 
- AddOrphanedQubit(q, l, gates_lat, gates_fused); - } - - if (delayed_measurement_gate != nullptr) { - auto pgate = delayed_measurement_gate; - - const auto& mea_gates_at_time = measurement_gates[pgate->time]; - - GateFused gate_f = {pgate->kind, pgate->time, {}, pgate, {}, {}}; - gate_f.gates.reserve(mea_gates_at_time.size()); - - // Fuse measurement gates with equal times. - - for (const auto* pgate : mea_gates_at_time) { - gate_f.qubits.insert(gate_f.qubits.end(), - pgate->qubits.begin(), pgate->qubits.end()); - gate_f.gates.push_back(pgate); - } - - gates_fused.push_back(std::move(gate_f)); - } - - if (gates_seq0.size() != 0) { - Base::FuseZeroQubitGates(gates_seq0, [](const RGate* g) { return g; }, - last_fused_gate_index, gates_fused); - } - - if (gate_it == glast) break; - - last_fused_gate_index = gates_fused.size(); - } - - if (fuse_matrix) { - for (auto& gate_f : gates_fused) { - if (gate_f.kind != gate::kMeasurement && gate_f.kind != gate::kDecomp) { - CalculateFusedMatrix(gate_f); - } - } - } - - return gates_fused; - } - - private: - static unsigned Advance(unsigned k, const std::vector& wl, - std::vector& gates) { - while (k < wl.size() && wl[k]->qubits.size() == 1 - && wl[k]->controlled_by.size() == 0 && !wl[k]->unfusible) { - gates.push_back(wl[k++]); - } - - return k; - } - - static bool Done( - unsigned k, unsigned t, const std::vector& wl) { - return k >= wl.size() || wl[k]->time > t; - } - - static bool NextGate(unsigned k1, const std::vector& wl1, - unsigned k2, const std::vector& wl2) { - return k1 < wl1.size() && k2 < wl2.size() && wl1[k1] == wl2[k2] - && wl1[k1]->qubits.size() < 3 && wl1[k1]->controlled_by.size() == 0; - } - - template - static unsigned AddOrphanedQubit(unsigned q, unsigned k, - const GatesLat& gates_lat, - std::vector& gates_fused) { - auto pgate = gates_lat[q][k]; - - GateFused gate_f = {pgate->kind, pgate->time, {q}, pgate, {}, {}}; - gate_f.gates.push_back(pgate); - - k = Advance(k + 1, gates_lat[q], gate_f.gates); - - 
gates_fused.push_back(std::move(gate_f)); - - return k; - } - - template - static bool ValidateGate(const Gate2& gate, unsigned max_qubit1, - const GatesLat& gates_lat) { - for (unsigned q : gate.qubits) { - if (q >= max_qubit1) { - IO::errorf("fuser: gate qubit %u is out of range " - "(should be smaller than %u).\n", q, max_qubit1); - return false; - } - if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) { - IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); - return false; - } - } - - for (unsigned q : gate.controlled_by) { - if (q >= max_qubit1) { - IO::errorf("fuser: gate qubit %u is out of range " - "(should be smaller than %u).\n", q, max_qubit1); - return false; - } - if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) { - IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); - return false; - } - } - - return true; - } -}; - -} // namespace qsim - -#endif // FUSER_BASIC_H_ diff --git a/qsim/fuser_mqubit.h b/qsim/fuser_mqubit.h deleted file mode 100644 index c75b1a0..0000000 --- a/qsim/fuser_mqubit.h +++ /dev/null @@ -1,1095 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FUSER_MQUBIT_H_ -#define FUSER_MQUBIT_H_ - -#include -#include -#include -#include -#include -#include - -#include "gate.h" -#include "fuser.h" - -namespace qsim { - -/** - * Multi-qubit gate fuser. 
- * Measurement gates with equal times are fused together. - * User-defined controlled gates (controlled_by.size() > 0) are not fused. - * The template parameter Gate can be Gate type or a pointer to Gate type. - */ -template -class MultiQubitGateFuser final : public Fuser { - private: - using Base = Fuser; - using RGate = typename Base::RGate; - - // Auxillary classes and structs. - - // Manages doubly-linked lists. - template - class LinkManagerT { - public: - struct Link { - T val; - Link* next; - Link* prev; - }; - - explicit LinkManagerT(uint64_t size) { - links_.reserve(size); - } - - Link* AddBack(const T& t, Link* link) { - if (link == nullptr) { - links_.push_back({t, nullptr, nullptr}); - } else { - links_.push_back({t, link->next, link}); - link->next = &links_.back(); - } - - return &links_.back(); - } - - static void Delete(const Link* link) { - if (link->prev != nullptr) { - link->prev->next = link->next; - } - if (link->next != nullptr) { - link->next->prev = link->prev; - } - } - - private: - std::vector links_; - }; - - struct GateF; - - using LinkManager = LinkManagerT; - using Link = typename LinkManager::Link; - - // Intermediate representation of a fused gate. - struct GateF { - const RGate* parent; - std::vector qubits; - std::vector gates; // Gates that get fused to this gate. - std::vector links; // Gate "lattice" links. - uint64_t mask; // Qubit mask. - unsigned visited; - }; - - // Possible values for visited in GateF. - // Note that MakeGateSequence assignes values from kSecond to the number of - // gates in the sequence plus one, see below. - enum Visited { - kZero = 0, // Start value for "normal" gates. - kFirst = 1, // Value after the first pass for partially fused - // "normal" gates. - kSecond = 2, // Start value to assign values in MakeGateSequence. - kCompress = 99999997, // Used to compress links. - kMeaCnt = 99999998, // Start value for controlled or measurement gates. 
- kFinal = 99999999, // Value after the second pass for fused "normal" - // gates or for controlled and measurement gates. - }; - - struct Stat { - unsigned num_mea_gates = 0; - unsigned num_fused_mea_gates = 0; - unsigned num_fused_gates = 0; - unsigned num_controlled_gates = 0; - std::vector num_gates; - }; - - // Gate that is added to a sequence of gates to fuse together. - struct GateA { - GateF* gate; - std::vector qubits; // Added qubits. - std::vector links; // Added lattice links. - }; - - struct Scratch { - std::vector data; - std::vector prev1; - std::vector prev2; - std::vector next1; - std::vector next2; - std::vector longest_seq; - std::vector stack; - std::vector gates; - unsigned count = 0; - }; - - public: - using GateFused = qsim::GateFused; - - /** - * User-specified parameters for gate fusion. - */ - struct Parameter { - /** - * Maximum number of qubits in a fused gate. It can take values from 2 to - * 6 (0 and 1 are equivalent to 2). It is not recommended to use 5 or 6 as - * that might degrade performance for not very fast machines. - */ - unsigned max_fused_size = 2; - unsigned verbosity = 0; - }; - - /** - * Stores sets of gates that can be applied together. To respect specific - * time boundaries while fusing gates, use the other version of this method - * below. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gates The gates (or pointers to the gates) to be fused. - * Gate times of the gates that act on the same qubits should be ordered. - * Gates that are out of time order should not cross the time boundaries - * set by measurement gates. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. 
- */ - static std::vector FuseGates(const Parameter& param, - unsigned max_qubit1, - const std::vector& gates, - bool fuse_matrix = true) { - return FuseGates( - param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gates The gates (or pointers to the gates) to be fused. - * Gate times of the gates that act on the same qubits should be ordered. - * Gates that are out of time order should not cross the time boundaries - * set by `times_to_split_at` or by measurement gates. - * @param times_to_split_at Ordered list of time steps (boundaries) at which - * to separate fused gates. Each element of the output will contain gates - * from a single 'window' in this list. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates( - const Parameter& param, - unsigned max_qubit1, const std::vector& gates, - const std::vector& times_to_split_at, - bool fuse_matrix = true) { - return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(), - times_to_split_at, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. To respect specific - * time boundaries while fusing gates, use the other version of this method - * below. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates - * (or pointers to gates) in. Gate times of the gates that act on the same - * qubits should be ordered. Gates that are out of time order should not - * cross the time boundaries set by measurement gates. 
- * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates( - const Parameter& param, unsigned max_qubit1, - typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast, - bool fuse_matrix = true) { - return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates - * (or pointers to gates) in. Gate times of the gates that act on the same - * qubits should be ordered. Gates that are out of time order should not - * cross the time boundaries set by `times_to_split_at` or by measurement - * gates. - * @param times_to_split_at Ordered list of time steps (boundaries) at which - * to separate fused gates. Each element of the output will contain gates - * from a single 'window' in this list. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates( - const Parameter& param, unsigned max_qubit1, - typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast, - const std::vector& times_to_split_at, - bool fuse_matrix = true) { - std::vector fused_gates; - - if (gfirst >= glast) return fused_gates; - - std::size_t num_gates = glast - gfirst; - - fused_gates.reserve(num_gates); - - // Merge with measurement gate times to separate fused gates at. 
- auto epochs = - Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at); - - LinkManager link_manager(max_qubit1 * num_gates); - - // Auxillary data structures. - // Sequence of intermediate fused gates. - std::vector gates_seq; - // Gate "lattice". - std::vector gates_lat; - // Sequences of intermediate fused gates ordered by gate size. - std::vector> fgates(max_qubit1 + 1); - - gates_seq.reserve(num_gates); - gates_lat.reserve(max_qubit1); - - Scratch scratch; - - scratch.data.reserve(1024); - scratch.prev1.reserve(32); - scratch.prev2.reserve(32); - scratch.next1.reserve(32); - scratch.next2.reserve(32); - scratch.longest_seq.reserve(8); - scratch.stack.reserve(8); - - Stat stat; - stat.num_gates.resize(max_qubit1 + 1, 0); - - unsigned max_fused_size = std::min(unsigned{6}, param.max_fused_size); - max_fused_size = std::min(max_fused_size, max_qubit1); - - std::size_t last_fused_gate_index = 0; - auto gate_it = gfirst; - - // Iterate over epochs. - for (std::size_t l = 0; l < epochs.size(); ++l) { - gates_seq.resize(0); - gates_lat.resize(0); - gates_lat.resize(max_qubit1, nullptr); - - for (unsigned i = 0; i <= max_qubit1; ++i) { - fgates[i].resize(0); - } - - uint64_t max_gate_size = 0; - GateF* last_mea_gate = nullptr; - - // Iterate over input gates. - for (; gate_it < glast; ++gate_it) { - const auto& gate = Base::GateToConstRef(*gate_it); - - if (gate.time > epochs[l]) break; - - if (!ValidateGate(gate, max_qubit1, gates_lat)) { - fused_gates.resize(0); - return fused_gates; - } - - // Fill in auxillary data structures. - - if (gate.kind == gate::kMeasurement) { - // Measurement gate. 
- - if (last_mea_gate == nullptr - || last_mea_gate->parent->time != gate.time) { - gates_seq.push_back({&gate, {}, {}, {}, 0, kMeaCnt}); - last_mea_gate = &gates_seq.back(); - - last_mea_gate->qubits.reserve(max_qubit1); - last_mea_gate->links.reserve(max_qubit1); - - ++stat.num_fused_mea_gates; - } - - for (auto q : gate.qubits) { - last_mea_gate->qubits.push_back(q); - last_mea_gate->mask |= uint64_t{1} << q; - gates_lat[q] = link_manager.AddBack(last_mea_gate, gates_lat[q]); - last_mea_gate->links.push_back(gates_lat[q]); - } - - last_mea_gate->gates.push_back(&gate); - - ++stat.num_mea_gates; - } else { - gates_seq.push_back({&gate, {}, {}, {}, 0, kZero}); - auto& fgate = gates_seq.back(); - - if (gate.controlled_by.size() == 0) { - if (max_gate_size < gate.qubits.size()) { - max_gate_size = gate.qubits.size(); - } - - unsigned num_gate_qubits = gate.qubits.size(); - unsigned size = std::max(max_fused_size, num_gate_qubits); - - fgate.qubits.reserve(size); - fgate.links.reserve(size); - fgate.gates.reserve(4 * size); - fgate.links.reserve(size); - - if (fgates[num_gate_qubits].empty()) { - fgates[num_gate_qubits].reserve(num_gates); - } - fgates[num_gate_qubits].push_back(&fgate); - - ++stat.num_gates[num_gate_qubits]; - } else { - // Controlled gate. - // Controlled gates are not fused with other gates. 
- - uint64_t size = gate.qubits.size() + gate.controlled_by.size(); - - fgate.qubits.reserve(gate.qubits.size()); - fgate.links.reserve(size); - - fgate.visited = kMeaCnt; - fgate.gates.push_back(&gate); - - ++stat.num_controlled_gates; - } - - for (auto q : gate.qubits) { - fgate.qubits.push_back(q); - fgate.mask |= uint64_t{1} << q; - gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]); - fgate.links.push_back(gates_lat[q]); - } - - for (auto q : gate.controlled_by) { - fgate.mask |= uint64_t{1} << q; - gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]); - fgate.links.push_back(gates_lat[q]); - } - } - } - - // Fuse large gates with smaller gates. - FuseGates(max_gate_size, fgates); - - if (max_fused_size > 2) { - FuseGateSequences( - max_fused_size, max_qubit1, scratch, gates_seq, stat, fused_gates); - } else { - unsigned prev_time = 0; - - std::vector orphaned_gates; - orphaned_gates.reserve(max_qubit1); - - for (auto& fgate : gates_seq) { - if (fgate.gates.size() == 0) continue; - - if (prev_time != fgate.parent->time) { - if (orphaned_gates.size() > 0) { - FuseOrphanedGates( - max_fused_size, stat, orphaned_gates, fused_gates); - orphaned_gates.resize(0); - } - - prev_time = fgate.parent->time; - } - - if (fgate.qubits.size() == 1 && max_fused_size > 1 - && fgate.visited != kMeaCnt && !fgate.parent->unfusible) { - orphaned_gates.push_back(&fgate); - continue; - } - - // Assume fgate.qubits (gate.qubits) are sorted. 
- fused_gates.push_back({fgate.parent->kind, fgate.parent->time, - std::move(fgate.qubits), fgate.parent, - std::move(fgate.gates), {}}); - - if (fgate.visited != kMeaCnt) { - ++stat.num_fused_gates; - } - } - - if (orphaned_gates.size() > 0) { - FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); - } - } - - if (fgates[0].size() != 0) { - Base::FuseZeroQubitGates(fgates[0], - [](const GateF* g) { return g->parent; }, - last_fused_gate_index, fused_gates); - } - - last_fused_gate_index = fused_gates.size(); - } - - if (fuse_matrix) { - for (auto& fgate : fused_gates) { - if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp) { - CalculateFusedMatrix(fgate); - } - } - } - - PrintStat(param.verbosity, stat, fused_gates); - - return fused_gates; - } - - private: - // Fuse large gates with smaller gates. - static void FuseGates(uint64_t max_gate_size, - std::vector>& fgates) { - // Traverse gates in order of decreasing size. - for (uint64_t i = 0; i < max_gate_size; ++i) { - std::size_t pos = 0; - - for (auto fgate : fgates[max_gate_size - i]) { - if (fgate->visited > kZero) continue; - - fgates[max_gate_size - i][pos++] = fgate; - - fgate->visited = kFirst; - - FusePrev(0, *fgate); - fgate->gates.push_back(fgate->parent); - FuseNext(0, *fgate); - } - - fgates[max_gate_size - i].resize(pos); - } - } - - // Try to fuse gate sequences as follows. Gate time goes from bottom to top. - // Gates are fused either from left to right or from right to left. 
- // - // max_fused_size = 3: _- or -_ - // - // max_fused_size = 4: _-_ - // - // max_fused_size = 5: _-_- or -_-_ - // - // max_fused_size = 6: _-_-_ - static void FuseGateSequences(unsigned max_fused_size, - unsigned max_qubit1, Scratch& scratch, - std::vector& gates_seq, Stat& stat, - std::vector& fused_gates) { - unsigned prev_time = 0; - - std::vector orphaned_gates; - orphaned_gates.reserve(max_qubit1); - - for (auto& fgate : gates_seq) { - if (prev_time != fgate.parent->time) { - if (orphaned_gates.size() > 0) { - FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); - orphaned_gates.resize(0); - } - - prev_time = fgate.parent->time; - } - - if (fgate.visited == kFinal || fgate.gates.size() == 0) continue; - - if (fgate.visited == kMeaCnt || fgate.qubits.size() >= max_fused_size - || fgate.parent->unfusible) { - if (fgate.visited != kMeaCnt) { - ++stat.num_fused_gates; - } - - fgate.visited = kFinal; - - fused_gates.push_back({fgate.parent->kind, fgate.parent->time, - std::move(fgate.qubits), fgate.parent, - std::move(fgate.gates), {}}); - - continue; - } - - - if (fgate.qubits.size() == 1 && max_fused_size > 1) { - orphaned_gates.push_back(&fgate); - continue; - } - - scratch.data.resize(0); - scratch.gates.resize(0); - scratch.count = 0; - - MakeGateSequence(max_fused_size, scratch, fgate); - - if (scratch.gates.size() == 0) { - orphaned_gates.push_back(&fgate); - } else { - for (auto fgate : scratch.gates) { - std::sort(fgate->qubits.begin(), fgate->qubits.end()); - - fused_gates.push_back({fgate->parent->kind, fgate->parent->time, - std::move(fgate->qubits), fgate->parent, - std::move(fgate->gates), {}}); - - ++stat.num_fused_gates; - } - } - } - - if (orphaned_gates.size() > 0) { - FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); - } - } - - static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat, - std::vector& orphaned_gates, - std::vector& fused_gates) { - for (std::size_t i = 0; i < 
orphaned_gates.size(); ++i) { - auto ogate1 = orphaned_gates[i]; - - if (ogate1->visited == kFinal) continue; - - ogate1->visited = kFinal; - - for (std::size_t j = i + 1; j < orphaned_gates.size(); ++j) { - auto ogate2 = orphaned_gates[j]; - - if (ogate2->visited == kFinal) continue; - - unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size(); - - if (cur_size <= max_fused_size) { - ogate2->visited = kFinal; - - for (auto q : ogate2->qubits) { - ogate1->qubits.push_back(q); - ogate1->mask |= uint64_t{1} << q; - } - - for (auto l : ogate2->links) { - ogate1->links.push_back(l); - } - - for (auto gate : ogate2->gates) { - ogate1->gates.push_back(gate); - } - } - - if (cur_size == max_fused_size) { - break; - } - } - - FuseNext(1, *ogate1); - - std::sort(ogate1->qubits.begin(), ogate1->qubits.end()); - - fused_gates.push_back({ogate1->parent->kind, ogate1->parent->time, - std::move(ogate1->qubits), ogate1->parent, - std::move(ogate1->gates), {}}); - - ++stat.num_fused_gates; - } - } - - static void MakeGateSequence( - unsigned max_fused_size, Scratch& scratch, GateF& fgate) { - unsigned level = kSecond + scratch.count; - - FindLongestGateSequence(max_fused_size, level, scratch, fgate); - - auto longest_seq = scratch.longest_seq; - - if (longest_seq.size() == 1 && scratch.count == 0) { - fgate.visited = kFirst; - return; - } - - ++scratch.count; - - for (auto p : longest_seq) { - p->gate->visited = kCompress; - - for (auto q : p->qubits) { - fgate.qubits.push_back(q); - fgate.mask |= uint64_t{1} << q; - } - - for (auto l : p->links) { - fgate.links.push_back(l); - } - } - - // Compress links. 
- for (auto& link : fgate.links) { - while (link->prev != nullptr && link->prev->val->visited == kCompress) { - link = link->prev; - } - - while (link->next != nullptr && link->next->val->visited == kCompress) { - LinkManager::Delete(link->next); - } - } - - for (auto p : longest_seq) { - p->gate->visited = level; - } - - if (longest_seq.size() >= 3) { - AddGatesFromNext(longest_seq[2]->gate->gates, fgate); - } - - if (longest_seq.size() >= 5) { - AddGatesFromNext(longest_seq[4]->gate->gates, fgate); - } - - if (longest_seq.size() >= 2) { - // May call MakeGateSequence recursively. - AddGatesFromPrev(max_fused_size, *longest_seq[1]->gate, scratch, fgate); - } - - if (longest_seq.size() >= 4) { - // May call MakeGateSequence recursively. - AddGatesFromPrev(max_fused_size, *longest_seq[3]->gate, scratch, fgate); - } - - for (auto p : longest_seq) { - p->gate->visited = kFinal; - } - - FuseNext(1, fgate); - - scratch.gates.push_back(&fgate); - } - - static void AddGatesFromNext(std::vector& gates, GateF& fgate) { - for (auto gate : gates) { - fgate.gates.push_back(gate); - } - } - - static void AddGatesFromPrev(unsigned max_fused_size, const GateF& pfgate, - Scratch& scratch, GateF& fgate) { - for (auto gate : pfgate.gates) { - fgate.gates.push_back(gate); - } - - for (auto link : pfgate.links) { - if (link->prev == nullptr) continue; - - auto pgate = link->prev->val; - - if (pgate->visited == kFirst) { - MakeGateSequence(max_fused_size, scratch, *pgate); - } - } - } - - static void FindLongestGateSequence( - unsigned max_fused_size, unsigned level, Scratch& scratch, GateF& fgate) { - scratch.data.push_back({&fgate, {}, {}}); - - scratch.longest_seq.resize(0); - scratch.longest_seq.push_back(&scratch.data.back()); - - scratch.stack.resize(0); - scratch.stack.push_back(&scratch.data.back()); - - unsigned cur_size = fgate.qubits.size(); - fgate.visited = level; - - unsigned max_size = cur_size; - - GetNextAvailableGates(max_fused_size, cur_size, fgate, nullptr, - 
scratch.data, scratch.next1); - - for (auto n1 : scratch.next1) { - unsigned cur_size2 = cur_size + n1->qubits.size(); - if (cur_size2 > max_fused_size) continue; - - bool feasible = GetPrevAvailableGates(max_fused_size, cur_size, - level, *n1->gate, nullptr, - scratch.data, scratch.prev1); - - if (!feasible) continue; - - if (scratch.prev1.size() == 0 && max_fused_size > 3) continue; - - if (cur_size2 == max_fused_size) { - std::swap(scratch.longest_seq, scratch.stack); - scratch.longest_seq.push_back(n1); - return; - } - - Push(level, cur_size2, cur_size, max_size, scratch, n1); - - for (auto p1 : scratch.prev1) { - unsigned cur_size2 = cur_size + p1->qubits.size(); - - if (cur_size2 > max_fused_size) { - continue; - } else if (cur_size2 == max_fused_size) { - std::swap(scratch.longest_seq, scratch.stack); - scratch.longest_seq.push_back(p1); - return; - } - - Push(level, cur_size2, cur_size, max_size, scratch, p1); - - GetNextAvailableGates(max_fused_size, cur_size, *p1->gate, &fgate, - scratch.data, scratch.next2); - - for (auto n2 : scratch.next2) { - unsigned cur_size2 = cur_size + n2->qubits.size(); - if (cur_size2 > max_fused_size) continue; - - bool feasible = GetPrevAvailableGates(max_fused_size, cur_size, - level, *n2->gate, n1->gate, - scratch.data, scratch.prev2); - - if (!feasible) continue; - - if (cur_size2 == max_fused_size) { - std::swap(scratch.longest_seq, scratch.stack); - scratch.longest_seq.push_back(n2); - return; - } - - Push(level, cur_size2, cur_size, max_size, scratch, n2); - - for (auto p2 : scratch.prev2) { - unsigned cur_size2 = cur_size + p2->qubits.size(); - - if (cur_size2 > max_fused_size) { - continue; - } else if (cur_size2 == max_fused_size) { - std::swap(scratch.longest_seq, scratch.stack); - scratch.longest_seq.push_back(p2); - return; - } - - if (cur_size2 > max_size) { - scratch.stack.push_back(p2); - scratch.longest_seq = scratch.stack; - scratch.stack.pop_back(); - max_size = cur_size2; - } - } - - Pop(cur_size, scratch, 
n2); - } - - Pop(cur_size, scratch, p1); - } - - Pop(cur_size, scratch, n1); - } - } - - static void Push(unsigned level, unsigned cur_size2, unsigned& cur_size, - unsigned& max_size, Scratch& scratch, GateA* agate) { - agate->gate->visited = level; - cur_size = cur_size2; - scratch.stack.push_back(agate); - - if (cur_size > max_size) { - scratch.longest_seq = scratch.stack; - max_size = cur_size; - } - } - - static void Pop(unsigned& cur_size, Scratch& scratch, GateA* agate) { - agate->gate->visited = kFirst; - cur_size -= agate->qubits.size(); - scratch.stack.pop_back(); - } - - static void GetNextAvailableGates(unsigned max_fused_size, unsigned cur_size, - const GateF& pgate1, const GateF* pgate2, - std::vector& scratch, - std::vector& next_gates) { - next_gates.resize(0); - - for (auto link : pgate1.links) { - if (link->next == nullptr) continue; - - auto ngate = link->next->val; - - if (ngate->visited > kFirst || ngate->parent->unfusible) continue; - - GateA next = {ngate, {}, {}}; - next.qubits.reserve(8); - next.links.reserve(8); - - GetAddedQubits(pgate1, pgate2, *ngate, next); - - if (cur_size + next.qubits.size() > max_fused_size) continue; - - scratch.push_back(std::move(next)); - next_gates.push_back(&scratch.back()); - } - } - - static bool GetPrevAvailableGates(unsigned max_fused_size, - unsigned cur_size, unsigned level, - const GateF& ngate1, const GateF* ngate2, - std::vector& scratch, - std::vector& prev_gates) { - prev_gates.resize(0); - - for (auto link : ngate1.links) { - if (link->prev == nullptr) continue; - - auto pgate = link->prev->val; - - if (pgate->visited == kFinal || pgate->visited == level) continue; - - if (pgate->visited > kFirst || pgate->parent->unfusible) { - prev_gates.resize(0); - return false; - } - - GateA prev = {pgate, {}, {}}; - prev.qubits.reserve(8); - prev.links.reserve(8); - - GetAddedQubits(ngate1, ngate2, *pgate, prev); - - bool all_prev_visited = true; - - for (auto link : pgate->links) { - if (link->prev == 
nullptr) continue; - - if (link->prev->val->visited <= kMeaCnt) { - all_prev_visited = false; - break; - } - } - - if (!all_prev_visited) { - prev_gates.resize(0); - return false; - } - - if (cur_size + prev.qubits.size() > max_fused_size) continue; - - if (all_prev_visited) { - scratch.push_back(std::move(prev)); - prev_gates.push_back(&scratch.back()); - } - } - - return true; - } - - static void GetAddedQubits(const GateF& fgate0, const GateF* fgate1, - const GateF& fgate2, GateA& added) { - for (std::size_t i = 0; i < fgate2.qubits.size(); ++i) { - unsigned q2 = fgate2.qubits[i]; - - if (std::find(fgate0.qubits.begin(), fgate0.qubits.end(), q2) - != fgate0.qubits.end()) continue; - - if (fgate1 != nullptr - && std::find(fgate1->qubits.begin(), fgate1->qubits.end(), q2) - != fgate1->qubits.end()) continue; - - added.qubits.push_back(q2); - added.links.push_back(fgate2.links[i]); - } - } - - // Fuse smaller gates with fgate back in gate time. - static void FusePrev(unsigned pass, GateF& fgate) { - std::vector gates; - gates.reserve(fgate.gates.capacity()); - - auto neighbor = [](const Link* link) -> const Link* { - return link->prev; - }; - - FusePrevOrNext>(pass, neighbor, fgate, gates); - - for (auto it = gates.rbegin(); it != gates.rend(); ++it) { - fgate.gates.push_back(*it); - } - } - - // Fuse smaller gates with fgate forward in gate time. 
- static void FuseNext(unsigned pass, GateF& fgate) { - auto neighbor = [](const Link* link) -> const Link* { - return link->next; - }; - - FusePrevOrNext>(pass, neighbor, fgate, fgate.gates); - } - - template - static void FusePrevOrNext(unsigned pass, Neighbor neighb, GateF& fgate, - std::vector& gates) { - uint64_t bad_mask = 0; - auto links = fgate.links; - - bool may_have_gates_to_fuse = true; - - while (may_have_gates_to_fuse) { - may_have_gates_to_fuse = false; - - std::sort(links.begin(), links.end(), - [&neighb](const Link* l, const Link* r) -> bool { - auto ln = neighb(l); - auto rn = neighb(r); - - if (ln != nullptr && rn != nullptr) { - return R()(ln->val->parent->time, rn->val->parent->time); - } else { - // nullptrs are larger than everything else and - // equivalent among each other. - return ln != nullptr; - } - }); - - for (auto link : links) { - auto n = neighb(link); - - if (n == nullptr) continue; - - auto g = n->val; - - if (!QubitsAreIn(fgate.mask, g->mask) || (g->mask & bad_mask) != 0 - || g->visited > pass || g->parent->unfusible) { - bad_mask |= g->mask; - } else { - g->visited = pass == 0 ? 
kFirst : kFinal; - - if (pass == 0) { - gates.push_back(g->parent); - } else { - for (auto gate : g->gates) { - gates.push_back(gate); - } - } - - for (auto link : g->links) { - LinkManager::Delete(link); - } - - may_have_gates_to_fuse = true; - break; - } - } - } - } - - static bool QubitsAreIn(uint64_t mask0, uint64_t mask) { - return ((mask0 | mask) ^ mask0) == 0; - } - - static void PrintStat(unsigned verbosity, const Stat& stat, - const std::vector& fused_gates) { - if (verbosity < 3) return; - - if (stat.num_controlled_gates > 0) { - IO::messagef("%lu controlled gates\n", stat.num_controlled_gates); - } - - if (stat.num_mea_gates > 0) { - IO::messagef("%lu measurement gates", stat.num_mea_gates); - if (stat.num_fused_mea_gates == stat.num_mea_gates) { - IO::messagef("\n"); - } else { - IO::messagef(" are fused into %lu gates\n", stat.num_fused_mea_gates); - } - } - - bool first = true; - for (unsigned i = 1; i < stat.num_gates.size(); ++i) { - if (stat.num_gates[i] > 0) { - if (first) { - first = false; - } else { - IO::messagef(", "); - } - IO::messagef("%u %u-qubit", stat.num_gates[i], i); - } - } - - IO::messagef(" gates are fused into %lu gates\n", stat.num_fused_gates); - - if (verbosity < 5) return; - - IO::messagef("fused gate qubits:\n"); - for (const auto& g : fused_gates) { - IO::messagef("%6u ", g.parent->time); - if (g.parent->kind == gate::kMeasurement) { - IO::messagef("m"); - } else if (g.parent->controlled_by.size() > 0) { - IO::messagef("c"); - for (auto q : g.parent->controlled_by) { - IO::messagef("%3u", q); - } - IO::messagef(" t"); - } else { - IO::messagef(" "); - } - - for (auto q : g.qubits) { - IO::messagef("%3u", q); - } - IO::messagef("\n"); - } - } - - template - static bool ValidateGate(const Gate2& gate, unsigned max_qubit1, - const GatesLat& gates_lat) { - for (unsigned q : gate.qubits) { - if (q >= max_qubit1) { - IO::errorf("fuser: gate qubit %u is out of range " - "(should be smaller than %u).\n", q, max_qubit1); - return 
false; - } - if (gates_lat[q] != nullptr - && gate.time <= gates_lat[q]->val->parent->time) { - IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); - return false; - } - } - - for (unsigned q : gate.controlled_by) { - if (q >= max_qubit1) { - IO::errorf("fuser: gate qubit %u is out of range " - "(should be smaller than %u).\n", q, max_qubit1); - return false; - } - if (gates_lat[q] != nullptr - && gate.time <= gates_lat[q]->val->parent->time) { - IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); - return false; - } - } - - return true; - } -}; - -} // namespace qsim - -#endif // FUSER_MQUBIT_H_ diff --git a/qsim/gate.h b/qsim/gate.h deleted file mode 100644 index a457acb..0000000 --- a/qsim/gate.h +++ /dev/null @@ -1,216 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef GATE_H_ -#define GATE_H_ - -#include -#include -#include -#include - -#include "matrix.h" - -namespace qsim { - -namespace detail { - -template -inline void SortQubits(Gate& gate) { - for (std::size_t i = 1; i < gate.qubits.size(); ++i) { - if (gate.qubits[i - 1] > gate.qubits[i]) { - if (!GateDef::symmetric) { - auto perm = NormalToGateOrderPermutation(gate.qubits); - MatrixShuffle(perm, gate.qubits.size(), gate.matrix); - } - - gate.swapped = true; - std::sort(gate.qubits.begin(), gate.qubits.end()); - break; - } - } -} - -} // namespace detail - -template , typename Gate> -inline Gate& MakeControlledGate(Qubits&& controlled_by, Gate& gate) { - gate.controlled_by = std::forward(controlled_by); - gate.cmask = (uint64_t{1} << gate.controlled_by.size()) - 1; - - std::sort(gate.controlled_by.begin(), gate.controlled_by.end()); - - return gate; -} - -template , typename Gate> -inline Gate& MakeControlledGate(Qubits&& controlled_by, - const std::vector& control_values, - Gate& gate) { - // Assume controlled_by.size() == control_values.size(). - - bool sorted = true; - - for (std::size_t i = 1; i < controlled_by.size(); ++i) { - if (controlled_by[i - 1] > controlled_by[i]) { - sorted = false; - break; - } - } - - if (sorted) { - gate.controlled_by = std::forward(controlled_by); - gate.cmask = 0; - - for (std::size_t i = 0; i < control_values.size(); ++i) { - gate.cmask |= (control_values[i] & 1) << i; - } - } else { - struct ControlPair { - unsigned q; - unsigned v; - }; - - std::vector cpairs; - cpairs.reserve(controlled_by.size()); - - for (std::size_t i = 0; i < controlled_by.size(); ++i) { - cpairs.push_back({controlled_by[i], control_values[i]}); - } - - // Sort control qubits and control values. 
- std::sort(cpairs.begin(), cpairs.end(), - [](const ControlPair& l, const ControlPair& r) -> bool { - return l.q < r.q; - }); - - gate.cmask = 0; - gate.controlled_by.reserve(controlled_by.size()); - - for (std::size_t i = 0; i < cpairs.size(); ++i) { - gate.cmask |= (cpairs[i].v & 1) << i; - gate.controlled_by.push_back(cpairs[i].q); - } - } - - return gate; -} - -namespace gate { - -constexpr int kDecomp = 100001; // gate from Schmidt decomposition -constexpr int kMeasurement = 100002; // measurement gate - -} // namespace gate - -enum GateAnyKind { - kGateAny = -1, -}; - -/** - * A generic gate to make it easier to use qsim with external gate sets. - */ -template -struct Gate { - using fp_type = FP; - using GateKind = GK; - - GateKind kind; - unsigned time; - std::vector qubits; - std::vector controlled_by; - uint64_t cmask; - std::vector params; - Matrix matrix; - bool unfusible; // If true, the gate is fused as a parent. - bool swapped; // If true, the gate qubits are swapped to make qubits - // ordered in ascending order. This does not apply to - // control qubits of explicitly-controlled gates. 
- - template > - Gate&& ControlledBy(Qubits&& controlled_by) { - MakeControlledGate(std::forward(controlled_by), *this); - return std::move(*this); - } - - template > - Gate&& ControlledBy(Qubits&& controlled_by, - const std::vector& control_values) { - MakeControlledGate( - std::forward(controlled_by), control_values, *this); - return std::move(*this); - } -}; - -template , - typename M = Matrix> -inline Gate CreateGate(unsigned time, Qubits&& qubits, M&& matrix = {}, - std::vector&& params = {}) { - Gate gate = {GateDef::kind, time, std::forward(qubits), {}, 0, - std::move(params), std::forward(matrix), false, false}; - - if (GateDef::kind != gate::kMeasurement) { - switch (gate.qubits.size()) { - case 1: - break; - case 2: - if (gate.qubits[0] > gate.qubits[1]) { - gate.swapped = true; - std::swap(gate.qubits[0], gate.qubits[1]); - if (!GateDef::symmetric) { - MatrixShuffle({1, 0}, 2, gate.matrix); - } - } - break; - default: - detail::SortQubits(gate); - } - } - - return gate; -} - -namespace gate { - -/** - * A gate that simulates measurement of one or more qubits, collapsing the - * state vector and storing the measured results. - */ -template -struct Measurement { - using GateKind = typename Gate::GateKind; - - static constexpr GateKind kind = GateKind::kMeasurement; - static constexpr char name[] = "m"; - static constexpr bool symmetric = false; - - template > - static Gate Create(unsigned time, Qubits&& qubits) { - return CreateGate(time, std::forward(qubits)); - } -}; - -} // namespace gate - -template -using schmidt_decomp_type = std::vector>>; - -template -schmidt_decomp_type GetSchmidtDecomp( - GateKind kind, const std::vector& params); - -} // namespace qsim - -#endif // GATE_H_ diff --git a/qsim/gate_appl.h b/qsim/gate_appl.h deleted file mode 100644 index 8601e6f..0000000 --- a/qsim/gate_appl.h +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef GATE_APPL_H_ -#define GATE_APPL_H_ - -#include -#include - -#include "fuser.h" -#include "gate.h" -#include "matrix.h" - -namespace qsim { - -/** - * Applies the given gate to the simulator state. Ignores measurement gates. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param state The state of the system, to be updated by this method. - */ -template -inline void ApplyGate(const Simulator& simulator, const Gate& gate, - typename Simulator::State& state) { - if (gate.kind != gate::kMeasurement) { - if (gate.controlled_by.size() == 0) { - simulator.ApplyGate(gate.qubits, gate.matrix.data(), state); - } else { - simulator.ApplyControlledGate(gate.qubits, gate.controlled_by, - gate.cmask, gate.matrix.data(), state); - } - } -} - -/** - * Applies the given gate dagger to the simulator state. If the gate matrix is - * unitary then this is equivalent to applying the inverse gate. Ignores - * measurement gates. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param state The state of the system, to be updated by this method. 
- */ -template -inline void ApplyGateDagger(const Simulator& simulator, const Gate& gate, - typename Simulator::State& state) { - if (gate.kind != gate::kMeasurement) { - auto matrix = gate.matrix; - MatrixDagger(unsigned{1} << gate.qubits.size(), matrix); - - if (gate.controlled_by.size() == 0) { - simulator.ApplyGate(gate.qubits, matrix.data(), state); - } else { - simulator.ApplyControlledGate(gate.qubits, gate.controlled_by, - gate.cmask, matrix.data(), state); - } - } -} - -/** - * Applies the given gate to the simulator state. - * @param state_space StateSpace object required to perform measurements. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param rgen Random number generator to perform measurements. - * @param state The state of the system, to be updated by this method. - * @param mresults As an input parameter, this can be empty or this can - * contain the results of the previous measurements. If gate is a measurement - * gate then after a successful run, the measurement result will be added to - * this. - * @return True if the measurement performed successfully; false otherwise. - */ -template -inline bool ApplyGate( - const typename Simulator::StateSpace& state_space, - const Simulator& simulator, const Gate& gate, Rgen& rgen, - typename Simulator::State& state, - std::vector& mresults) { - if (gate.kind == gate::kMeasurement) { - auto measure_result = state_space.Measure(gate.qubits, rgen, state); - if (measure_result.valid) { - mresults.push_back(std::move(measure_result)); - } else { - return false; - } - } else { - ApplyGate(simulator, gate, state); - } - - return true; -} - -/** - * Applies the given gate to the simulator state, discarding measurement - * results. - * @param state_space StateSpace object required to perform measurements. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. 
- * @param gate The gate to be applied. - * @param rgen Random number generator to perform measurements. - * @param state The state of the system, to be updated by this method. - * @return True if the measurement performed successfully; false otherwise. - */ -template -inline bool ApplyGate(const typename Simulator::StateSpace& state_space, - const Simulator& simulator, const Gate& gate, Rgen& rgen, - typename Simulator::State& state) { - using MeasurementResult = typename Simulator::StateSpace::MeasurementResult; - std::vector discarded_results; - return - ApplyGate(state_space, simulator, gate, rgen, state, discarded_results); -} - -/** - * Applies the given fused gate to the simulator state. Ignores measurement - * gates. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param state The state of the system, to be updated by this method. - */ -template -inline void ApplyFusedGate(const Simulator& simulator, const Gate& gate, - typename Simulator::State& state) { - if (gate.kind != gate::kMeasurement) { - if (gate.parent->controlled_by.size() == 0) { - simulator.ApplyGate(gate.qubits, gate.matrix.data(), state); - } else { - simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by, - gate.parent->cmask, gate.matrix.data(), - state); - } - } -} - -/** - * Applies the given fused gate dagger to the simulator state. If the gate - * matrix is unitary then this is equivalent to applying the inverse gate. - * Ignores measurement gates. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param state The state of the system, to be updated by this method. 
- */ -template -inline void ApplyFusedGateDagger(const Simulator& simulator, const Gate& gate, - typename Simulator::State& state) { - if (gate.kind != gate::kMeasurement) { - auto matrix = gate.matrix; - MatrixDagger(unsigned{1} << gate.qubits.size(), matrix); - - if (gate.parent->controlled_by.size() == 0) { - simulator.ApplyGate(gate.qubits, matrix.data(), state); - } else { - simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by, - gate.parent->cmask, matrix.data(), state); - } - } -} - -/** - * Applies the given fused gate to the simulator state. - * @param state_space StateSpace object required to perform measurements. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param rgen Random number generator to perform measurements. - * @param state The state of the system, to be updated by this method. - * @param mresults As an input parameter, this can be empty or this can - * contain the results of the previous measurements. If gate is a measurement - * gate then after a successful run, the measurement result will be added to - * this. - * @return True if the measurement performed successfully; false otherwise. - */ -template -inline bool ApplyFusedGate( - const typename Simulator::StateSpace& state_space, - const Simulator& simulator, const Gate& gate, Rgen& rgen, - typename Simulator::State& state, - std::vector& mresults) { - if (gate.kind == gate::kMeasurement) { - auto measure_result = state_space.Measure(gate.qubits, rgen, state); - if (measure_result.valid) { - mresults.push_back(std::move(measure_result)); - } else { - return false; - } - } else { - ApplyFusedGate(simulator, gate, state); - } - - return true; -} - -/** - * Applies the given fused gate to the simulator state, discarding measurement - * results. - * @param state_space StateSpace object required to perform measurements. - * @param simulator Simulator object. 
Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param rgen Random number generator to perform measurements. - * @param state The state of the system, to be updated by this method. - * @return True if the measurement performed successfully; false otherwise. - */ -template -inline bool ApplyFusedGate(const typename Simulator::StateSpace& state_space, - const Simulator& simulator, const Gate& gate, - Rgen& rgen, typename Simulator::State& state) { - using MeasurementResult = typename Simulator::StateSpace::MeasurementResult; - std::vector discarded_results; - return ApplyFusedGate( - state_space, simulator, gate, rgen, state, discarded_results); -} - -} // namespace qsim - -#endif // GATE_APPL_H_ diff --git a/qsim/gates_cirq.h b/qsim/gates_cirq.h deleted file mode 100644 index d767959..0000000 --- a/qsim/gates_cirq.h +++ /dev/null @@ -1,1640 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef GATES_CIRQ_H_ -#define GATES_CIRQ_H_ - -#include -#include -#include -#include - -#include "gate.h" -#include "matrix.h" - -namespace qsim { - -namespace Cirq { - -enum GateKind { - kI1 = 0, // One-qubit identity gate. - kI2, // Two-qubit identity gate. - kI, // Multi-qubit identity gate. 
- kXPowGate, - kYPowGate, - kZPowGate, - kHPowGate, - kCZPowGate, - kCXPowGate, - krx, - kry, - krz, - kH, - kS, - kCZ, - kCX, - kT, - kX, - kY, - kZ, - kPhasedXPowGate, - kPhasedXZGate, - kXXPowGate, - kYYPowGate, - kZZPowGate, - kXX, - kYY, - kZZ, - kSwapPowGate, - kISwapPowGate, - kriswap, - kSWAP, - kISWAP, - kPhasedISwapPowGate, - kgivens, - kFSimGate, - kTwoQubitDiagonalGate, - kThreeQubitDiagonalGate, - kCCZPowGate, - kCCXPowGate, - kCSwapGate, - kCCZ, - kCCX, - kMatrixGate1, // One-qubit matrix gate. - kMatrixGate2, // Two-qubit matrix gate. - kMatrixGate, // Multi-qubit matrix gate. - kGlobalPhaseGate, - kDecomp = gate::kDecomp, - kMeasurement = gate::kMeasurement, -}; - -template -using GateCirq = Gate; - -constexpr double h_double = 0.5; -constexpr double pi_double = 3.14159265358979323846264338327950288; -constexpr double is2_double = 0.7071067811865475; - -// Gates from cirq/ops/global_phase_op.py: - -/** - * The global phase gate. - */ -template -struct GlobalPhaseGate { - static constexpr GateKind kind = kGlobalPhaseGate; - static constexpr char name[] = "GlobalPhaseGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, fp_type phi) { - return Create(time, std::cos(phi), std::sin(phi)); - } - - static GateCirq Create(unsigned time, fp_type cp, fp_type sp) { - return CreateGate, GlobalPhaseGate>( - time, {}, {cp, sp}, {cp, sp}); - } -}; - -template -using global_phase_operation = GlobalPhaseGate; - -// Gates from cirq/ops/identity.py: - -/** - * A one-qubit identity gate. - */ -template -struct I1 { - static constexpr GateKind kind = kI1; - static constexpr char name[] = "I1"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, I1>( - time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0}); - } -}; - -/** - * A two-qubit identity gate. 
- */ -template -struct I2 { - static constexpr GateKind kind = kI2; - static constexpr char name[] = "I2"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, I2>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - }; - } -}; - -/** - * A multi-qubit identity gate. - */ -template -struct I { - static constexpr GateKind kind = kI; - static constexpr char name[] = "I"; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, - const std::vector& qubits) { - Matrix matrix; - MatrixIdentity(1 << qubits.size(), matrix); - return CreateGate, I>(time, qubits, std::move(matrix)); - } -}; - -// Gates form cirq/ops/common_gates.py: - -/** - * A gate that rotates around the X axis of the Bloch sphere. - * This is a generalization of the X gate. - */ -template -struct XPowGate { - static constexpr GateKind kind = kXPowGate; - static constexpr char name[] = "XPowGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); - - return CreateGate, XPowGate>( - time, {q0}, {c * gc, c * gs, s * gs, -s * gc, - s * gs, -s * gc, c * gc, c * gs}, - {exponent, global_shift}); - } -}; - -/** - * A gate that rotates around the Y axis of the Bloch sphere. - * This is a generalization of the Y gate. 
- */ -template -struct YPowGate { - static constexpr GateKind kind = kYPowGate; - static constexpr char name[] = "YPowGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); - - return CreateGate, YPowGate>( - time, {q0}, {c * gc, c * gs, -s * gc, -s * gs, - s * gc, s * gs, c * gc, c * gs}, {exponent, global_shift}); - } -}; - -/** - * A gate that rotates around the Z axis of the Bloch sphere. - * This is a generalization of the Z gate. - */ -template -struct ZPowGate { - static constexpr GateKind kind = kZPowGate; - static constexpr char name[] = "ZPowGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - - return CreateGate, ZPowGate>( - time, {q0}, {gc, gs, 0, 0, 0, 0, c * gc - s * gs, c * gs + s * gc}, - {exponent, global_shift}); - } -}; - -/** - * A gate that rotates around the X+Z axis of the Bloch sphere. - * This is a generalization of the Hadamard gate. 
- */ -template -struct HPowGate { - static constexpr GateKind kind = kHPowGate; - static constexpr char name[] = "HPowGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); - - fp_type a = s * gs * is2; - fp_type b = s * gc * is2; - - return CreateGate, HPowGate>( - time, {q0}, {c * gc + a, c * gs - b, a, -b, - a, -b, c * gc - a, c * gs + b}, {exponent, global_shift}); - } -}; - -/** - * A gate that applies a phase to the |11⟩ state of two qubits. - * This is a generalization of the CZ gate. - */ -template -struct CZPowGate { - static constexpr GateKind kind = kCZPowGate; - static constexpr char name[] = "CZPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec = std::cos(pi * exponent * (1 + global_shift)); - fp_type es = std::sin(pi * exponent * (1 + global_shift)); - - return CreateGate, CZPowGate>( - time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, gc, gs, 0, 0, 0, 0, - 0, 0, 0, 0, gc, gs, 0, 0, - 0, 0, 0, 0, 0, 0, ec, es}, {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec 
= std::cos(pi * exponent * (1 + global_shift)); - fp_type es = std::sin(pi * exponent * (1 + global_shift)); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {gc, gs, 0, 0, 0, 0, ec, es}}, - }; - } -}; - -/** - * A gate that applies a controlled power of an X gate. - * This is a generalization of the CX (or CNOT) gate. - */ -template -struct CXPowGate { - static constexpr GateKind kind = kCXPowGate; - static constexpr char name[] = "CXPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); - - // Matrix is in this form because the simulator uses inverse qubit order. 
- return CreateGate, CXPowGate>( - time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, c * ec, c * es, 0, 0, s * es, -s * ec, - 0, 0, 0, 0, gc, gs, 0, 0, - 0, 0, s * es, -s * ec, 0, 0, c * ec, c * es}, - {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {c * ec, c * es, s * es, -s * ec, - s * es, -s * ec, c * ec, c * es}}, - }; - } -}; - -/** - * The `(exponent = phi/pi, global_shift = -0.5)` instance of XPowGate. - * This is a generalization of the X gate with a fixed global phase. - * This is a function in Cirq. - */ -template -struct rx { - static constexpr GateKind kind = krx; - static constexpr char name[] = "rx"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { - fp_type c = std::cos(-0.5 * phi); - fp_type s = std::sin(-0.5 * phi); - - return CreateGate, rx>( - time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi}); - } -}; - -/** - * The `(exponent = phi/pi, global_shift = -0.5)` instance of YPowGate. - * This is a generalization of the Y gate with a fixed global phase. - * This is a function in Cirq. 
- */ -template -struct ry { - static constexpr GateKind kind = kry; - static constexpr char name[] = "ry"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { - fp_type c = std::cos(-0.5 * phi); - fp_type s = std::sin(-0.5 * phi); - - return CreateGate, ry>( - time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi}); - } -}; - -/** - * The `(exponent = phi/pi, global_shift = -0.5)` instance of ZPowGate. - * This is a generalization of the Z gate with a fixed global phase. - * This is a function in Cirq. - */ -template -struct rz { - static constexpr GateKind kind = krz; - static constexpr char name[] = "rz"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { - fp_type c = std::cos(-0.5 * phi); - fp_type s = std::sin(-0.5 * phi); - - return CreateGate, rz>( - time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of HPowGate. - * This is the canonical Hadamard (or H) gate. - */ -template -struct H { - static constexpr GateKind kind = kH; - static constexpr char name[] = "H"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, H>( - time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0}); - } -}; - -/** - * The `(exponent = 0.5, global_shift = 0)` instance of ZPowGate. - * This is the canonical S gate. 
- */ -template -struct S { - static constexpr GateKind kind = kS; - static constexpr char name[] = "S"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, S>( - time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1}); - } -}; - -/** - * The `(exponent = 0.25, global_shift = 0)` instance of ZPowGate. - * This is the canonical T gate. - */ -template -struct T { - static constexpr GateKind kind = kT; - static constexpr char name[] = "T"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, T>( - time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of CZPowGate. - * This is the canonical CZ gate. - */ -template -struct CZ { - static constexpr GateKind kind = kCZ; - static constexpr char name[] = "CZ"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, CZ>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, -1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}}, - }; - } -}; - -template -using CNotPowGate = CXPowGate; - -/** - * The `(exponent = 1, global_shift = 0)` instance of CXPowGate. - * This is the canonical CX (or CNOT) gate. 
- */ -template -struct CX { - static constexpr GateKind kind = kCX; - static constexpr char name[] = "kCX"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, CX>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}}, - }; - } -}; - -template -using CNOT = CX; - -// Gates from cirq/ops/pauli_gates.py: - -/** - * The `(exponent = 1, global_shift = 0)` instance of XPowGate. - * This is the canonical Pauli X gate. - */ -template -struct X : public XPowGate { - static constexpr GateKind kind = kX; - static constexpr char name[] = "X"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, X>( - time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of YPowGate. - * This is the canonical Pauli Y gate. - */ -template -struct Y : public YPowGate { - static constexpr GateKind kind = kY; - static constexpr char name[] = "Y"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, Y>( - time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of ZPowGate. - * This is the canonical Pauli Z gate. 
- */ -template -struct Z : public ZPowGate { - static constexpr GateKind kind = kZ; - static constexpr char name[] = "Z"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, Z>( - time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0}); - } -}; - -// Gates from cirq/ops/phased_x_gate.py: - -/** - * An XPowGate conjugated by ZPowGate%s. - * Equivalent to the circuit `───Z^-p───X^t───Z^p───`. - */ -template -struct PhasedXPowGate { - static constexpr GateKind kind = kPhasedXPowGate; - static constexpr char name[] = "PhasedXPowGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type phase_exponent, fp_type exponent = 1, - fp_type global_shift = 0) { - fp_type pc = std::cos(pi * phase_exponent); - fp_type ps = std::sin(pi * phase_exponent); - fp_type ec = std::cos(pi * exponent); - fp_type es = std::sin(pi * exponent); - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - - fp_type ar = 0.5 * ((1 + ec) * gc - es * gs); - fp_type ai = 0.5 * ((1 + ec) * gs + es * gc); - fp_type br = -0.5 * ((-1 + ec) * gc - es * gs); - fp_type bi = -0.5 * ((-1 + ec) * gs + es * gc); - - return CreateGate, PhasedXPowGate>( - time, {q0}, {ar, ai, pc * br + ps * bi, pc * bi - ps * br, - pc * br - ps * bi, pc * bi + ps * br, ar, ai}, - {phase_exponent, exponent, global_shift}); - } -}; - -// Gates from cirq/ops/phased_x_z_gate.py: - -/** - * A PhasedXPowGate followed by a ZPowGate. - * Equivalent to the circuit `───Z^(-a)──X^x──Z^a───Z^z───`. 
- */ -template -struct PhasedXZGate { - static constexpr GateKind kind = kPhasedXZGate; - static constexpr char name[] = "PhasedXZGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type x_exponent, fp_type z_exponent, - fp_type axis_phase_exponent) { - fp_type xc = std::cos(pi * x_exponent); - fp_type xs = std::sin(pi * x_exponent); - fp_type zc = std::cos(pi * z_exponent); - fp_type zs = std::sin(pi * z_exponent); - fp_type ac = std::cos(pi * axis_phase_exponent); - fp_type as = std::sin(pi * axis_phase_exponent); - - fp_type br = 0.5 * (1 + xc); - fp_type bi = 0.5 * xs; - fp_type cr = -0.5 * (-1 + xc); - fp_type ci = -0.5 * xs; - fp_type dr = ac * zc - as * zs; - fp_type di = ac * zs + as * zc; - - return CreateGate, PhasedXZGate>( - time, {q0}, {br, bi, ac * cr + as * ci, ac * ci - as * cr, - dr * cr - di * ci, dr * ci + di * cr, - zc * br - zs * bi, zc * bi + zs * br}, - {x_exponent, z_exponent, axis_phase_exponent}); - } -}; - -// Gates from cirq/ops/parity_gates.py: - -/** - * The tensor product of two X gates, possibly raised to an exponent. 
- */ -template -struct XXPowGate { - static constexpr GateKind kind = kXXPowGate; - static constexpr char name[] = "XXPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type ic = 0.5 * ((1 + c) * gc - s * gs); - fp_type is = 0.5 * ((1 + c) * gs + s * gc); - fp_type xc = 0.5 * ((1 - c) * gc + s * gs); - fp_type xs = 0.5 * ((1 - c) * gs - s * gc); - - return CreateGate, XXPowGate>( - time, {q0, q1}, {ic, is, 0, 0, 0, 0, xc, xs, - 0, 0, ic, is, xc, xs, 0, 0, - 0, 0, xc, xs, ic, is, 0, 0, - xc, xs, 0, 0, 0, 0, ic, is}, {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type ic = 0.5 * ((1 + c) * gc - s * gs); - fp_type is = 0.5 * ((1 + c) * gs + s * gc); - fp_type xc = 0.5 * ((1 - c) * gc + s * gs); - fp_type xs = 0.5 * ((1 - c) * gs - s * gc); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, - {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, xc, xs, xc, xs, 0, 0}}, - }; - } -}; - -/** - * The tensor product of two Y gates, possibly raised to an exponent. 
- */ -template -struct YYPowGate { - static constexpr GateKind kind = kYYPowGate; - static constexpr char name[] = "YYPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type ic = 0.5 * ((1 + c) * gc - s * gs); - fp_type is = 0.5 * ((1 + c) * gs + s * gc); - fp_type yc = 0.5 * ((1 - c) * gc + s * gs); - fp_type ys = 0.5 * ((1 - c) * gs - s * gc); - - return CreateGate, YYPowGate>( - time, {q0, q1}, {ic, is, 0, 0, 0, 0, -yc, -ys, - 0, 0, ic, is, yc, ys, 0, 0, - 0, 0, yc, ys, ic, is, 0, 0, - -yc, -ys, 0, 0, 0, 0, ic, is}, - {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type ic = 0.5 * ((1 + c) * gc - s * gs); - fp_type is = 0.5 * ((1 + c) * gs + s * gc); - fp_type yc = 0.5 * ((1 - c) * gc + s * gs); - fp_type ys = 0.5 * ((1 - c) * gs - s * gc); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, - {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, ys, -yc, -ys, yc, 0, 0}}, - }; - } -}; - -/** - * The tensor product of two Z gates, possibly raised to an exponent. 
- */ -template -struct ZZPowGate { - static constexpr GateKind kind = kZZPowGate; - static constexpr char name[] = "ZZPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type zc = std::cos(pi * exponent * (1 + global_shift)); - fp_type zs = std::sin(pi * exponent * (1 + global_shift)); - - return CreateGate, ZZPowGate>( - time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, zc, zs, 0, 0, 0, 0, - 0, 0, 0, 0, zc, zs, 0, 0, - 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type ic = 0.5 * ((1 + c) * gc - s * gs); - fp_type is = 0.5 * ((1 + c) * gs + s * gc); - fp_type zc = 0.5 * ((1 - c) * gc + s * gs); - fp_type zs = 0.5 * ((1 - c) * gs - s * gc); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, - {{1, 0, 0, 0, 0, 0, -1, 0}, {zc, zs, 0, 0, 0, 0, -zc, -zs}}, - }; - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of XXPowGate. - * This is the tensor product of two X gates. 
- */ -template -struct XX { - static constexpr GateKind kind = kXX; - static constexpr char name[] = "XX"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, XX>( - time, {q0, q1}, {0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, 1, 0, 1, 0, 0, 0}}, - }; - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of YYPowGate. - * This is the tensor product of two Y gates. - */ -template -struct YY { - static constexpr GateKind kind = kYY; - static constexpr char name[] = "YY"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, YY>( - time, {q0, q1}, {0, 0, 0, 0, 0, 0, -1, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - -1, 0, 0, 0, 0, 0, 0, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, 0, -1, 0, 1, 0, 0}}, - }; - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of ZZPowGate. - * This is the tensor product of two Z gates. 
- */ -template -struct ZZ { - static constexpr GateKind kind = kZZ; - static constexpr char name[] = "ZZ"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, ZZ>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, -1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, -1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, -1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}}, - }; - } -}; - -// Gates from cirq/ops/swap_gates.py: - -/** - * The SWAP gate, possibly raised to a power. Exchanges qubits. - */ -template -struct SwapPowGate { - static constexpr GateKind kind = kSwapPowGate; - static constexpr char name[] = "SwapPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type h = static_cast(h_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); - - return CreateGate, SwapPowGate>( - time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, c * ec, c * es, s * es, -s * ec, 0, 0, - 0, 0, s * es, -s * ec, c * ec, c * es, 0, 0, - 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type ec = 
std::cos(pi * exponent * (0.5 + global_shift)); - fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); - - return schmidt_decomp_type{ - {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * ec, gs + c * es, 0, 0, - 0, 0, gc + c * ec, gs + c * es}}, - {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * es, -s * ec, - s * es, -s * ec, 0, 0}}, - {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, -s * ec, -s * es, - s * ec, s * es, 0, 0}}, - {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * ec, gs - c * es, 0, 0, - 0, 0, -gc + c * ec, -gs + c * es}}, - }; - } -}; - -/** - * Rotates the |01⟩ vs |10⟩ subspace of two qubits around its Bloch X-axis. - * This is a generalization of the ISWAP gate. - */ -template -struct ISwapPowGate { - static constexpr GateKind kind = kISwapPowGate; - static constexpr char name[] = "ISwapPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type h = static_cast(h_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - - return CreateGate, ISwapPowGate>( - time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, c * gc, c * gs, -s * gs, s * gc, 0, 0, - 0, 0, -s * gs, s * gc, c * gc, c * gs, 0, 0, - 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - - return schmidt_decomp_type{ - {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * gc, gs + c * gs, 0, 0, - 0, 0, gc + c * gc, gs + c * gs}}, - {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, -s * 
gs, s * gc, - -s * gs, s * gc, 0, 0}}, - {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * gc, s * gs, - -s * gc, -s * gs, 0, 0}}, - {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * gc, gs - c * gs, 0, 0, - 0, 0, -gc + c * gc, -gs + c * gs}}, - }; - } -}; - -/** - * The `(exponent = 2*phi/pi, global_shift = 0)` instance of ISwapPowGate. - * This is a generalization of the ISWAP gate with a fixed global phase of zero. - * This is a function in Cirq. - */ -template -struct riswap { - static constexpr GateKind kind = kriswap; - static constexpr char name[] = "riswap"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type h = static_cast(h_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type phi) { - fp_type c = std::cos(phi); - fp_type s = std::sin(phi); - - return CreateGate, riswap>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, c, 0, 0, s, 0, 0, - 0, 0, 0, s, c, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}, {phi}); - } - - static schmidt_decomp_type SchmidtDecomp(fp_type phi) { - fp_type c = std::cos(phi); - fp_type s = std::sin(phi); - - return schmidt_decomp_type{ - {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}}, - {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, 0, s, 0, s, 0, 0}}, - {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}}, - {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}}, - }; - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of SwapPowGate. - * This is the canonical SWAP gate. 
- */ -template -struct SWAP { - static constexpr GateKind kind = kSWAP; - static constexpr char name[] = "SWAP"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, SWAP>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, - {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}}, - {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}}, - {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, - }; - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of ISwapPowGate. - * This is the canonical ISWAP gate. - */ -template -struct ISWAP { - static constexpr GateKind kind = kISWAP; - static constexpr char name[] = "ISWAP"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type h = static_cast(h_double); - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, ISWAP>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, - {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}}, - {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}}, - {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, - }; - } -}; - -// Gates from cirq/ops/phased_iswap_gate.py: - -/** - * An ISwapPowGate conjugated by ZPowGate%s. 
- * Equivalent to the composition `(Z^-p ⊗ Z^p) ISWAP^t (Z^p ⊗ Z^-p)`. - */ -template -struct PhasedISwapPowGate { - static constexpr GateKind kind = kPhasedISwapPowGate; - static constexpr char name[] = "PhasedISwapPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type h = static_cast(h_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type phase_exponent = 0.25, - fp_type exponent = 1.0) { - fp_type fc = std::cos(2 * pi * phase_exponent); - fp_type fs = std::sin(2 * pi * phase_exponent); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, PhasedISwapPowGate>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, c, 0, s * fs, s * fc, 0, 0, - 0, 0, -s * fs, s * fc, c, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}, {phase_exponent, exponent}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type phase_exponent, fp_type exponent) { - fp_type fc = std::cos(2 * pi * phase_exponent); - fp_type fs = std::sin(2 * pi * phase_exponent); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - - return schmidt_decomp_type{ - {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}}, - {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * fs, s * fc, -s * fs, s * fc, 0, 0}}, - {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * fc, -s * fs, - -s * fc, -s * fs, 0, 0}}, - {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}}, - }; - } -}; - -/** - * The `(phase_exponent = 0.25, exponent = 2*phi/pi)` instance of - * PhasedISwapPowGate. - * This is the "Givens rotation" from numerical linear algebra. - * This is a function in Cirq. 
- */ -template -struct givens { - static constexpr GateKind kind = kgivens; - static constexpr char name[] = "givens"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type h = static_cast(h_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type phi) { - fp_type c = std::cos(phi); - fp_type s = std::sin(phi); - - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, givens>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, c, 0, s, 0, 0, 0, - 0, 0, -s, 0, c, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}, {phi}); - } - - static schmidt_decomp_type SchmidtDecomp(fp_type phi) { - fp_type c = std::cos(phi); - fp_type s = std::sin(phi); - - return schmidt_decomp_type{ - {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}}, - {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}}, - {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, 0, -s, 0, -s, 0, 0}}, - {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}}, - }; - } -}; - -// Gates from cirq/ops/fsim_gate.py: - -/** - * The fermionic simulation gate family. Contains all two-qubit interactions - * that preserve excitations, up to single-qubit rotations and global phase. 
- */ -template -struct FSimGate { - static constexpr GateKind kind = kFSimGate; - static constexpr char name[] = "FSimGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create( - unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) { - if (phi < 0) { - phi += 2 * 3.141592653589793; - } - - fp_type ct = std::cos(theta); - fp_type st = std::sin(theta); - fp_type cp = std::cos(phi); - fp_type sp = std::sin(phi); - - return CreateGate, FSimGate>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, ct, 0, 0, -st, 0, 0, - 0, 0, 0, -st, ct, 0, 0, 0, - 0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type theta, fp_type phi) { - fp_type ct = std::cos(theta); - fp_type st = std::sin(theta); - - fp_type cp2 = std::cos(0.5 * phi); - fp_type sp2 = std::sin(0.5 * phi); - fp_type cp4 = std::cos(0.25 * phi); - fp_type sp4 = std::sin(0.25 * phi); - - fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct)); - fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct)); - - fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct); - fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct); - - fp_type c0 = is2 * a0 * std::cos(p0); - fp_type s0 = is2 * a0 * std::sin(p0); - - fp_type c1 = is2 * a1 * std::cos(p1); - fp_type s1 = is2 * a1 * std::sin(p1); - - fp_type st2 = 0.5 * std::sqrt(st); - - fp_type a = cp4 * c0 - sp4 * s0; - fp_type b = cp4 * s0 + sp4 * c0; - fp_type c = cp4 * c0 + sp4 * s0; - fp_type d = cp4 * s0 - sp4 * c0; - - fp_type e = cp4 * c1 - sp4 * s1; - fp_type f = cp4 * s1 + sp4 * c1; - fp_type g = -(cp4 * c1 + sp4 * s1); - fp_type h = -(cp4 * s1 - sp4 * c1); - - return schmidt_decomp_type{ - {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}}, - {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}}, - {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}}, - 
{{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}}, - }; - } -}; - -// Gates from cirq/ops/two_qubit_diagonal_gate.py: - -/** - * A two-qubit diagonal gate. - */ -template -struct TwoQubitDiagonalGate { - static constexpr GateKind kind = kTwoQubitDiagonalGate; - static constexpr char name[] = "TwoQubitDiagonalGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, - const std::vector& angles) { - std::vector cs; - std::vector ss; - cs.reserve(4); - ss.reserve(4); - - for (std::size_t i = 0; i < angles.size(); ++i) { - cs.push_back(std::cos(angles[i])); - ss.push_back(std::sin(angles[i])); - } - - for (std::size_t i = angles.size(); i < 4; ++i) { - cs.push_back(1); - ss.push_back(0); - } - - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, TwoQubitDiagonalGate>( - time, {q0, q1}, {cs[0], ss[0], 0, 0, 0, 0, 0, 0, - 0, 0, cs[2], ss[2], 0, 0, 0, 0, - 0, 0, 0, 0, cs[1], ss[1], 0, 0, - 0, 0, 0, 0, 0, 0, cs[3], ss[3]}); - } -}; - -// Gates from cirq/ops/three_qubit_gates.py: - -/** - * A three-qubit diagonal gate. 
- */ -template -struct ThreeQubitDiagonalGate { - static constexpr GateKind kind = kThreeQubitDiagonalGate; - static constexpr char name[] = "ThreeQubitDiagonalGate"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2, - const std::vector& angles) { - std::vector cs; - std::vector ss; - cs.reserve(8); - ss.reserve(8); - - for (std::size_t i = 0; i < angles.size(); ++i) { - cs.push_back(std::cos(angles[i])); - ss.push_back(std::sin(angles[i])); - } - - for (std::size_t i = angles.size(); i < 8; ++i) { - cs.push_back(1); - ss.push_back(0); - } - - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, ThreeQubitDiagonalGate>( - time, {q0, q1, q2}, - {cs[0], ss[0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, cs[4], ss[4], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, cs[2], ss[2], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, cs[6], ss[6], 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, cs[1], ss[1], 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[5], ss[5], 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[3], ss[3], 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[7], ss[7]}); - } -}; - -/** - * A gate that applies a phase to the |111⟩ state of three qubits. - * This is a generalization of the CCZ gate. 
- */ -template -struct CCZPowGate { - static constexpr GateKind kind = kCCZPowGate; - static constexpr char name[] = "CCZPowGate"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec = std::cos(pi * exponent * (1 + global_shift)); - fp_type es = std::sin(pi * exponent * (1 + global_shift)); - - return CreateGate, CCZPowGate>( - time, {q0, q1, q2}, {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ec, es}, - {exponent, global_shift}); - } -}; - -/** - * A gate that applies a doubly-controlled power of an X gate. - * This is a generalization of the CCX (or CCNOT) gate. 
- */ -template -struct CCXPowGate { - static constexpr GateKind kind = kCCXPowGate; - static constexpr char name[] = "CCXPowGate"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); - - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, CCXPowGate>( - time, {q0, q1, q2}, - {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, c * ec, c * es, 0, 0, 0, 0, 0, 0, s * es, -s * ec, - 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, - 0, 0, 0, 0, 0, 0, s * es, -s * ec, 0, 0, 0, 0, 0, 0, c * ec, c * es}, - {exponent, global_shift}); - } -}; - -/** - * A controlled swap gate (the Fredkin gate). - */ -template -struct CSwapGate { - static constexpr GateKind kind = kCSwapGate; - static constexpr char name[] = "CSwapGate"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2) { - // Matrix is in this form because the simulator uses inverse qubit order. 
- return CreateGate, CSwapGate>( - time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of CCZPowGate. - * This is the canonical doubly-controlled Z gate. - */ -template -struct CCZ { - static constexpr GateKind kind = kCCZ; - static constexpr char name[] = "CCZ"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2) { - return CreateGate, CCZ>( - time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of CCXPowGate. - * This is the canonical doubly-controlled X gate (the TOFFOLI gate). - */ -template -struct CCX { - static constexpr GateKind kind = kCCX; - static constexpr char name[] = "CCX"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2) { - // Matrix is in this form because the simulator uses inverse qubit order. 
- return CreateGate, CCX>( - time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - } -}; - -template -using CCNotPowGate = CCXPowGate; - -template -using TOFFOLI = CCX; - -template -using CCNOT = CCX; - -template -using CSWAP = CSwapGate; - -template -using FREDKIN = CSwapGate; - -// Gates from cirq/ops/matrix_gates.py: - -/** - * A one-qubit gate defined entirely by its matrix. - */ -template -struct MatrixGate1 { - static constexpr GateKind kind = kMatrixGate1; - static constexpr char name[] = "MatrixGate1"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, - const Matrix& m) { - auto m2 = m; - return - CreateGate, MatrixGate1>(time, {q0}, std::move(m2)); - } -}; - -/** - * A two-qubit gate defined entirely by its matrix. - */ -template -struct MatrixGate2 { - static constexpr GateKind kind = kMatrixGate2; - static constexpr char name[] = "MatrixGate2"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - template > - static GateCirq Create( - unsigned time, unsigned q0, unsigned q1, M&& m) { - return CreateGate, MatrixGate2>(time, {q1, q0}, - std::forward(m)); - } -}; - -/** - * A multi-qubit gate defined entirely by its matrix. 
- */ -template -struct MatrixGate { - static constexpr GateKind kind = kMatrixGate; - static constexpr char name[] = "MatrixGate"; - static constexpr bool symmetric = false; - - template > - static GateCirq Create(unsigned time, - std::vector qubits, M&& m) { - std::reverse(qubits.begin(), qubits.end()); - return CreateGate, MatrixGate>(time, std::move(qubits), - std::forward(m)); - } -}; - -} // namesapce Cirq - -template -inline schmidt_decomp_type GetSchmidtDecomp( - Cirq::GateKind kind, const std::vector& params) { - switch (kind) { - case Cirq::kI2: - return Cirq::I2::SchmidtDecomp(); - case Cirq::kCZPowGate: - return Cirq::CZPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kCXPowGate: - return Cirq::CXPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kCZ: - return Cirq::CZ::SchmidtDecomp(); - case Cirq::kCX: - return Cirq::CX::SchmidtDecomp(); - case Cirq::kXXPowGate: - return Cirq::XXPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kYYPowGate: - return Cirq::YYPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kZZPowGate: - return Cirq::ZZPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kXX: - return Cirq::XX::SchmidtDecomp(); - case Cirq::kYY: - return Cirq::YY::SchmidtDecomp(); - case Cirq::kZZ: - return Cirq::ZZ::SchmidtDecomp(); - case Cirq::kSwapPowGate: - return Cirq::SwapPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kISwapPowGate: - return Cirq::ISwapPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kriswap: - return Cirq::riswap::SchmidtDecomp(params[0]); - case Cirq::kSWAP: - return Cirq::SWAP::SchmidtDecomp(); - case Cirq::kISWAP: - return Cirq::ISWAP::SchmidtDecomp(); - case Cirq::kPhasedISwapPowGate: - return Cirq::PhasedISwapPowGate::SchmidtDecomp( - params[0], params[1]); - case Cirq::kgivens: - return Cirq::givens::SchmidtDecomp(params[0]); - case Cirq::kFSimGate: - return Cirq::FSimGate::SchmidtDecomp(params[0], params[1]); - default: - // Single qubit gates of gates with 
unimplemented Schmidt decomposition. - return schmidt_decomp_type{}; - } -} - -} // namespace qsim - -#endif // GATES_CIRQ_H_ diff --git a/qsim/gates_qsim.h b/qsim/gates_qsim.h deleted file mode 100644 index 366c4f1..0000000 --- a/qsim/gates_qsim.h +++ /dev/null @@ -1,661 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef GATES_QSIM_H_ -#define GATES_QSIM_H_ - -#include -#include -#include - -#include "gate.h" - -namespace qsim { - -// Gate set implemented in qsim contains the following gates. -enum GateKind { - kGateId1 = 0, // one-qubit Id - kGateHd, // Hadamard - kGateT, // T - kGateX, // X - kGateY, // Y - kGateZ, // Z - kGateX2, // sqrt(X) - kGateY2, // sqrt(Y) - kGateRX, // X-rotation - kGateRY, // Y-rotation - kGateRZ, // Z-rotation - kGateRXY, // XY-rotation (rotation around arbitrary axis in the XY plane) - kGateHZ2, // pi / 2 rotation around the X + Y axis - kGateS, // S - kGateId2, // two-qubit Id - kGateCZ, // CZ - kGateCNot, // CNOT (CX) - kGateSwap, // swap - kGateIS, // iSwap - kGateFS, // fSim - kGateCP, // control phase - kGateMatrix1, // one-qubit matrix gate - kGateMatrix2, // two-qubit matrix gate - kGateGPh, // global phase gate - kDecomp = gate::kDecomp, - kMeasurement = gate::kMeasurement, -}; - -// Specialization of Gate (defined in gate.h) for the qsim gate set. 
-template -using GateQSim = Gate; - -constexpr double h_double = 0.5; -constexpr double is2_double = 0.7071067811865475; - -// Zero-qubit gates: - -/** - * The global phase gate. - */ -template -struct GateGPh { - static constexpr GateKind kind = kGateGPh; - static constexpr char name[] = "p"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, fp_type phi) { - return Create(time, std::cos(phi), std::sin(phi)); - } - - static GateQSim Create(unsigned time, fp_type cp, fp_type sp) { - return CreateGate, GateGPh>( - time, {}, {cp, sp}, {cp, sp}); - } -}; - -// One-qubit gates: - -/** - * The one-qubit identity gate. - */ -template -struct GateId1 { - static constexpr GateKind kind = kGateId1; - static constexpr char name[] = "id1"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateId1>( - time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0}); - } -}; - -/** - * The Hadamard gate. - */ -template -struct GateHd { - static constexpr GateKind kind = kGateHd; - static constexpr char name[] = "h"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateHd>( - time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0}); - } -}; - -/** - * The T gate, equivalent to `Z ^ 0.25`. - */ -template -struct GateT { - static constexpr GateKind kind = kGateT; - static constexpr char name[] = "t"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateT>( - time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2}); - } -}; - -/** - * The Pauli X (or "NOT") gate. 
- */ -template -struct GateX { - static constexpr GateKind kind = kGateX; - static constexpr char name[] = "x"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateX>( - time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0}); - } -}; - -/** - * The Pauli Y gate. - */ -template -struct GateY { - static constexpr GateKind kind = kGateY; - static constexpr char name[] = "y"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateY>( - time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0}); - } -}; - -/** - * The Pauli Z gate. - */ -template -struct GateZ { - static constexpr GateKind kind = kGateZ; - static constexpr char name[] = "z"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateZ>( - time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0}); - } -}; - -/** - * The "square root of X" gate. - */ -template -struct GateX2 { - static constexpr GateKind kind = kGateX2; - static constexpr char name[] = "x_1_2"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type h = static_cast(h_double); - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateX2>( - time, {q0}, {h, h, h, -h, h, -h, h, h}); - } -}; - -/** - * The "square root of Y" gate. 
- */ -template -struct GateY2 { - static constexpr GateKind kind = kGateY2; - static constexpr char name[] = "y_1_2"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type h = static_cast(h_double); - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateY2>( - time, {q0}, {h, h, -h, -h, h, h, h, h}); - } -}; - -/** - * A gate that rotates around the X axis of the Bloch sphere. - * This is a generalization of the X gate. - */ -template -struct GateRX { - static constexpr GateKind kind = kGateRX; - static constexpr char name[] = "rx"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, fp_type phi) { - fp_type phi2 = -0.5 * phi; - fp_type c = std::cos(phi2); - fp_type s = std::sin(phi2); - - return CreateGate, GateRX>( - time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi}); - } -}; - -/** - * A gate that rotates around the Y axis of the Bloch sphere. - * This is a generalization of the Y gate. - */ -template -struct GateRY { - static constexpr GateKind kind = kGateRY; - static constexpr char name[] = "ry"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, fp_type phi) { - fp_type phi2 = -0.5 * phi; - fp_type c = std::cos(phi2); - fp_type s = std::sin(phi2); - - return CreateGate, GateRY>( - time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi}); - } -}; - -/** - * A gate that rotates around the Z axis of the Bloch sphere. - * This is a generalization of the Z gate. 
- */ -template -struct GateRZ { - static constexpr GateKind kind = kGateRZ; - static constexpr char name[] = "rz"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, fp_type phi) { - fp_type phi2 = -0.5 * phi; - fp_type c = std::cos(phi2); - fp_type s = std::sin(phi2); - - return CreateGate, GateRZ>( - time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi}); - } -}; - -/** - * A gate that rotates around an arbitrary axis in the XY-plane. - */ -template -struct GateRXY { - static constexpr GateKind kind = kGateRXY; - static constexpr char name[] = "rxy"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create( - unsigned time, unsigned q0, fp_type theta, fp_type phi) { - fp_type phi2 = -0.5 * phi; - fp_type cp = std::cos(phi2); - fp_type sp = std::sin(phi2); - fp_type ct = std::cos(theta) * sp; - fp_type st = std::sin(theta) * sp; - - return CreateGate, GateRXY>( - time, {q0}, {cp, 0, st, ct, -st, ct, cp, 0}, {theta, phi}); - } -}; - -/** - * A pi / 2 rotation around the X + Y axis. - */ -template -struct GateHZ2 { - static constexpr GateKind kind = kGateHZ2; - static constexpr char name[] = "hz_1_2"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type h = static_cast(h_double); - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateHZ2>( - time, {q0}, {h, h, 0, -is2, is2, 0, h, h}); - } -}; - -/** - * The S gate, equivalent to "square root of Z". 
- */ -template -struct GateS { - static constexpr GateKind kind = kGateS; - static constexpr char name[] = "s"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateS>( - time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1}); - } -}; - -/** - * A one-qubit gate defined entirely by its matrix. - */ -template -struct GateMatrix1 { - static constexpr GateKind kind = kGateMatrix1; - static constexpr char name[] = "mat1"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, - const Matrix& m) { - auto m2 = m; - return - CreateGate, GateMatrix1>(time, {q0}, std::move(m2)); - } -}; - -// Two-qubit gates: - -/** - * The two-qubit identity gate. - */ -template -struct GateId2 { - static constexpr GateKind kind = kGateId2; - static constexpr char name[] = "id2"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, GateId2>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - }; - } -}; - -/** - * The controlled-Z (CZ) gate. 
- */ -template -struct GateCZ { - static constexpr GateKind kind = kGateCZ; - static constexpr char name[] = "cz"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, GateCZ>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, -1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}}, - }; - } -}; - -/** - * The controlled-X (CX or CNOT) gate. - */ -template -struct GateCNot { - static constexpr GateKind kind = kGateCNot; - static constexpr char name[] = "cnot"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, GateCNot>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}}, - }; - } -}; - -/** - * The SWAP gate. Exchanges two qubits. 
- */ -template -struct GateSwap { - static constexpr GateKind kind = kGateSwap; - static constexpr char name[] = "sw"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, GateSwap>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, - {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}}, - {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}}, - {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, - }; - } -}; - -/** - * The ISWAP gate. - */ -template -struct GateIS { - static constexpr GateKind kind = kGateIS; - static constexpr char name[] = "is"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type h = static_cast(h_double); - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, GateIS>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, - {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}}, - {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}}, - {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, - }; - } -}; - -/** - * The fermionic simulation (FSim) gate family. Contains all two-qubit - * interactions that preserve excitations, up to single-qubit rotations and - * global phase. 
- */ -template -struct GateFS { - static constexpr GateKind kind = kGateFS; - static constexpr char name[] = "fs"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create( - unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) { - if (phi < 0) { - phi += 2 * 3.141592653589793; - } - - fp_type ct = std::cos(theta); - fp_type st = std::sin(theta); - fp_type cp = std::cos(phi); - fp_type sp = std::sin(phi); - - return CreateGate, GateFS>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, ct, 0, 0, -st, 0, 0, - 0, 0, 0, -st, ct, 0, 0, 0, - 0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type theta, fp_type phi) { - fp_type ct = std::cos(theta); - fp_type st = std::sin(theta); - - fp_type cp2 = std::cos(0.5 * phi); - fp_type sp2 = std::sin(0.5 * phi); - fp_type cp4 = std::cos(0.25 * phi); - fp_type sp4 = std::sin(0.25 * phi); - - fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct)); - fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct)); - - fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct); - fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct); - - fp_type c0 = is2 * a0 * std::cos(p0); - fp_type s0 = is2 * a0 * std::sin(p0); - - fp_type c1 = is2 * a1 * std::cos(p1); - fp_type s1 = is2 * a1 * std::sin(p1); - - fp_type st2 = 0.5 * std::sqrt(st); - - fp_type a = cp4 * c0 - sp4 * s0; - fp_type b = cp4 * s0 + sp4 * c0; - fp_type c = cp4 * c0 + sp4 * s0; - fp_type d = cp4 * s0 - sp4 * c0; - - fp_type e = cp4 * c1 - sp4 * s1; - fp_type f = cp4 * s1 + sp4 * c1; - fp_type g = -(cp4 * c1 + sp4 * s1); - fp_type h = -(cp4 * s1 - sp4 * c1); - - return schmidt_decomp_type{ - {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}}, - {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}}, - {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}}, - {{e, f, 0, 0, 
0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}}, - }; - } -}; - -/** - * The controlled phase gate. A generalized version of GateCZ. - */ -template -struct GateCP { - static constexpr GateKind kind = kGateCP; - static constexpr char name[] = "cp"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateQSim Create( - unsigned time, unsigned q0, unsigned q1, fp_type phi) { - fp_type cp = std::cos(phi); - fp_type sp = std::sin(phi); - - return CreateGate, GateCP>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, cp, -sp}, {phi}); - } - - static schmidt_decomp_type SchmidtDecomp(fp_type phi) { - fp_type cp = std::cos(phi); - fp_type sp = std::sin(phi); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, cp, -sp}}, - }; - } -}; - -/** - * A two-qubit gate defined entirely by its matrix. - */ -template -struct GateMatrix2 { - static constexpr GateKind kind = kGateMatrix2; - static constexpr char name[] = "mat2"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - template > - static GateQSim Create( - unsigned time, unsigned q0, unsigned q1, M&& m) { - return CreateGate, GateMatrix2>(time, {q1, q0}, - std::forward(m)); - } - - static schmidt_decomp_type SchmidtDecomp(fp_type phi) { - // Not implemented. 
- return schmidt_decomp_type{}; - } -}; - -template -inline schmidt_decomp_type GetSchmidtDecomp( - GateKind kind, const std::vector& params) { - switch (kind) { - case kGateId2: - return GateId2::SchmidtDecomp(); - case kGateCZ: - return GateCZ::SchmidtDecomp(); - case kGateCNot: - return GateCNot::SchmidtDecomp(); - case kGateSwap: - return GateSwap::SchmidtDecomp(); - case kGateIS: - return GateIS::SchmidtDecomp(); - case kGateFS: - return GateFS::SchmidtDecomp(params[0], params[1]); - case kGateCP: - return GateCP::SchmidtDecomp(params[0]); - default: - // Single qubit gates: empty Schmidt decomposition. - return schmidt_decomp_type{}; - } -} - -} // namespace qsim - -#endif // GATES_QSIM_H_ diff --git a/qsim/hybrid.h b/qsim/hybrid.h deleted file mode 100644 index 44fad5b..0000000 --- a/qsim/hybrid.h +++ /dev/null @@ -1,612 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HYBRID_H_ -#define HYBRID_H_ - -#include -#include -#include -#include - -#include "gate.h" -#include "gate_appl.h" - -namespace qsim { - -/** - * Hybrid Feynman-Schrodinger simulator. - */ -template class FuserT, typename For> -struct HybridSimulator final { - public: - using Gate = GateT; - using GateKind = typename Gate::GateKind; - using fp_type = typename Gate::fp_type; - - private: - // Note that one can use "struct GateHybrid : public Gate {" in C++17. 
- struct GateHybrid { - using GateKind = HybridSimulator::GateKind; - using fp_type = HybridSimulator::fp_type; - - GateKind kind; - unsigned time; - std::vector qubits; - std::vector controlled_by; - uint64_t cmask; - std::vector params; - Matrix matrix; - bool unfusible; - bool swapped; - - const Gate* parent; - unsigned id; - }; - - struct GateX { - GateHybrid* decomposed0; - GateHybrid* decomposed1; - schmidt_decomp_type schmidt_decomp; - unsigned schmidt_bits; - unsigned swapped; - }; - - public: - using Fuser = FuserT; - using GateFused = typename Fuser::GateFused; - - /** - * Contextual data for hybrid simulation. - */ - struct HybridData { - /** - * List of gates on the "0" side of the cut. - */ - std::vector gates0; - /** - * List of gates on the "1" side of the cut. - */ - std::vector gates1; - /** - * List of gates on the cut. - */ - std::vector gatexs; - /** - * Global qubit index to local qubit index map. - */ - std::vector qubit_map; - /** - * Number of qubits on the "0" side of the cut. - */ - unsigned num_qubits0; - /** - * Number of qubits on the "1" side of the cut. - */ - unsigned num_qubits1; - /** - * Number of gates on the cut. - */ - unsigned num_gatexs; - }; - - /** - * User-specified parameters for gate fusion and hybrid simulation. - */ - struct Parameter : public Fuser::Parameter { - /** - * Fixed bitstring indicating values to assign to Schmidt decomposition - * indices of prefix gates. - */ - uint64_t prefix; - /** - * Number of gates on the cut that are part of the prefix. Indices of these - * gates are assigned the value indicated by `prefix`. - */ - unsigned num_prefix_gatexs; - /** - * Number of gates on the cut that are part of the root. All gates that are - * not part of the prefix or root are part of the suffix. - */ - unsigned num_root_gatexs; - unsigned num_threads; - }; - - template - explicit HybridSimulator(Args&&... args) : for_(args...) 
{} - - /** - * Splits the lattice into two parts, using Schmidt decomposition for gates - * on the cut. - * @param parts Lattice sections to be simulated. - * @param gates List of all gates in the circuit. - * @param hd Output data with split parts. - * @return True if the splitting done successfully; false otherwise. - */ - static bool SplitLattice(const std::vector& parts, - const std::vector& gates, HybridData& hd) { - hd.num_gatexs = 0; - hd.num_qubits0 = 0; - hd.num_qubits1 = 0; - - hd.gates0.reserve(gates.size()); - hd.gates1.reserve(gates.size()); - hd.qubit_map.reserve(parts.size()); - - unsigned count0 = 0; - unsigned count1 = 0; - - // Global qubit index to local qubit index map. - for (std::size_t i = 0; i < parts.size(); ++i) { - parts[i] == 0 ? ++hd.num_qubits0 : ++hd.num_qubits1; - hd.qubit_map.push_back(parts[i] == 0 ? count0++ : count1++); - } - - // Split the lattice. - for (const auto& gate : gates) { - if (gate.kind == gate::kMeasurement) { - IO::errorf("measurement gates are not suported by qsimh.\n"); - return false; - } - - if (gate.controlled_by.size() > 0) { - IO::errorf("controlled gates are not suported by qsimh.\n"); - return false; - } - - switch (gate.qubits.size()) { - case 1: // Single qubit gates. - switch (parts[gate.qubits[0]]) { - case 0: - hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time, - {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix, - false, false, nullptr, 0}); - break; - case 1: - hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time, - {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix, - false, false, nullptr, 0}); - break; - } - break; - case 2: // Two qubit gates. - { - switch ((parts[gate.qubits[1]] << 1) | parts[gate.qubits[0]]) { - case 0: // Both qubits in part 0. 
- hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time, - {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]}, - {}, 0, gate.params, gate.matrix, false, gate.swapped, - nullptr, 0}); - break; - case 1: // Gate on the cut, qubit 0 in part 1, qubit 1 in part 0. - hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, - {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {}, - true, gate.swapped, &gate, hd.num_gatexs}); - hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, - {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {}, - true, gate.swapped, &gate, hd.num_gatexs}); - - ++hd.num_gatexs; - break; - case 2: // Gate on the cut, qubit 0 in part 0, qubit 1 in part 1. - hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, - {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {}, - true, gate.swapped, &gate, hd.num_gatexs}); - hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, - {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {}, - true, gate.swapped, &gate, hd.num_gatexs}); - - ++hd.num_gatexs; - break; - case 3: // Both qubits in part 1. - hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time, - {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]}, - {}, 0, gate.params, gate.matrix, false, gate.swapped, - nullptr, 0}); - break; - } - } - break; - default: - IO::errorf("multi-qubit gates are not suported by qsimh.\n"); - return false; - } - } - - auto compare = [](const GateHybrid& l, const GateHybrid& r) -> bool { - return l.time < r.time || (l.time == r.time && - (l.parent < r.parent || (l.parent == r.parent && l.id < r.id))); - }; - - // Sort gates. - std::sort(hd.gates0.begin(), hd.gates0.end(), compare); - std::sort(hd.gates1.begin(), hd.gates1.end(), compare); - - hd.gatexs.reserve(hd.num_gatexs); - - // Get Schmidt matrices. 
- for (auto& gate0 : hd.gates0) { - if (gate0.parent != nullptr) { - auto d = GetSchmidtDecomp(gate0.parent->kind, gate0.parent->params); - if (d.size() == 0) { - IO::errorf("no Schmidt decomposition for gate kind %u.\n", - gate0.parent->kind); - return false; - } - - unsigned schmidt_bits = SchmidtBits(d.size()); - if (schmidt_bits > 2) { - IO::errorf("Schmidt rank is too large for gate kind %u.\n", - gate0.parent->kind); - return false; - } - - unsigned swapped = parts[gate0.parent->qubits[0]]; - if (gate0.parent->swapped) swapped = 1 - swapped; - hd.gatexs.emplace_back(GateX{&gate0, nullptr, std::move(d), - schmidt_bits, swapped}); - } - } - - unsigned count = 0; - for (auto& gate1 : hd.gates1) { - if (gate1.parent != nullptr) { - hd.gatexs[count++].decomposed1 = &gate1; - } - } - - for (auto& gatex : hd.gatexs) { - if (gatex.schmidt_decomp.size() == 1) { - FillSchmidtMatrices(0, gatex); - } - } - - return true; - } - - /** - * Runs the hybrid simulator on a sectioned lattice. - * @param param Options for parallelism and logging. Also specifies the size - * of the 'prefix' and 'root' sections of the lattice. - * @param factory Object to create simulators and state spaces. - * @param hd Container object for gates on the boundary between lattice - * sections. - * @param parts Lattice sections to be simulated. - * @param fgates0 List of gates from one section of the lattice. - * @param fgates1 List of gates from the other section of the lattice. - * @param bitstrings List of output states to simulate, as bitstrings. - * @param results Output vector of amplitudes. After a successful run, this - * will be populated with amplitudes for each state in 'bitstrings'. - * @return True if the simulation completed successfully; false otherwise. 
- */ - template - bool Run(const Parameter& param, const Factory& factory, - HybridData& hd, const std::vector& parts, - const std::vector& fgates0, - const std::vector& fgates1, - const std::vector& bitstrings, Results& results) const { - using Simulator = typename Factory::Simulator; - using StateSpace = typename Simulator::StateSpace; - using State = typename StateSpace::State; - - unsigned num_p_gates = param.num_prefix_gatexs; - unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; - - auto bits = CountSchmidtBits(param, hd.gatexs); - - uint64_t rmax = uint64_t{1} << bits.num_r_bits; - uint64_t smax = uint64_t{1} << bits.num_s_bits; - - auto loc0 = CheckpointLocations(param, fgates0); - auto loc1 = CheckpointLocations(param, fgates1); - - struct Index { - unsigned i0; - unsigned i1; - }; - - std::vector indices; - indices.reserve(bitstrings.size()); - - // Bitstring indices for part 0 and part 1. TODO: optimize. - for (const auto& bitstring : bitstrings) { - Index index{0, 0}; - - for (uint64_t i = 0; i < hd.qubit_map.size(); ++i) { - unsigned m = ((bitstring >> i) & 1) << hd.qubit_map[i]; - parts[i] ? index.i1 |= m : index.i0 |= m; - } - - indices.push_back(index); - } - - StateSpace state_space = factory.CreateStateSpace(); - - State* rstate0; - State* rstate1; - - State state0p = state_space.Null(); - State state1p = state_space.Null(); - State state0r = state_space.Null(); - State state1r = state_space.Null(); - State state0s = state_space.Null(); - State state1s = state_space.Null(); - - // Create states. 
- - if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, true, - state0p, state1p, rstate0, rstate1)) { - return false; - } - - if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, rmax > 1, - state0r, state1r, rstate0, rstate1)) { - return false; - } - - if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, smax > 1, - state0s, state1s, rstate0, rstate1)) { - return false; - } - - state_space.SetStateZero(state0p); - state_space.SetStateZero(state1p); - - Simulator simulator = factory.CreateSimulator(); - - std::vector prev(hd.num_gatexs, unsigned(-1)); - - // param.prefix encodes the prefix path. - unsigned gatex_index = SetSchmidtMatrices( - 0, num_p_gates, param.prefix, prev, hd.gatexs); - - if (gatex_index == 0) { - // Apply gates before the first checkpoint. - ApplyGates(fgates0, 0, loc0[0], simulator, state0p); - ApplyGates(fgates1, 0, loc1[0], simulator, state1p); - } else { - IO::errorf("invalid prefix %lu for prefix gate index %u.\n", - param.prefix, gatex_index - 1); - return false; - } - - // Branch over root gates on the cut. r encodes the root path. - for (uint64_t r = 0; r < rmax; ++r) { - if (rmax > 1) { - state_space.Copy(state0p, state0r); - state_space.Copy(state1p, state1r); - } - - if (SetSchmidtMatrices(num_p_gates, num_pr_gates, - r, prev, hd.gatexs) == 0) { - // Apply gates before the second checkpoint. - ApplyGates(fgates0, loc0[0], loc0[1], simulator, state0r); - ApplyGates(fgates1, loc1[0], loc1[1], simulator, state1r); - } else { - continue; - } - - // Branch over suffix gates on the cut. s encodes the suffix path. - for (uint64_t s = 0; s < smax; ++s) { - if (smax > 1) { - state_space.Copy(rmax > 1 ? state0r : state0p, state0s); - state_space.Copy(rmax > 1 ? state1r : state1p, state1s); - } - - if (SetSchmidtMatrices(num_pr_gates, hd.num_gatexs, - s, prev, hd.gatexs) == 0) { - // Apply the rest of the gates. 
- ApplyGates(fgates0, loc0[1], fgates0.size(), simulator, state0s); - ApplyGates(fgates1, loc1[1], fgates1.size(), simulator, state1s); - } else { - continue; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const StateSpace& state_space, - const State& state0, const State& state1, - const std::vector& indices, Results& results) { - // TODO: make it faster for the CUDA state space. - auto a0 = state_space.GetAmpl(state0, indices[i].i0); - auto a1 = state_space.GetAmpl(state1, indices[i].i1); - results[i] += a0 * a1; - }; - - // Collect results. - for_.Run(results.size(), f, - state_space, *rstate0, *rstate1, indices, results); - } - } - - return true; - } - - private: - /** - * Identifies when to save "checkpoints" of the simulation state. These allow - * runs with different cut-index values to reuse parts of the simulation. - * @param param Options for parallelism and logging. Also specifies the size - * of the 'prefix' and 'root' sections of the lattice. - * @param fgates Set of gates for which to find checkpoint locations. - * @return A pair of numbers specifying how many gates to apply before the - * first and second checkpoints, respectively. - */ - static std::array CheckpointLocations( - const Parameter& param, const std::vector& fgates) { - std::array loc{0, 0}; - - unsigned num_decomposed = 0; - unsigned num_p_gates = param.num_prefix_gatexs; - unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; - - for (std::size_t i = 0; i < fgates.size(); ++i) { - for (auto gate: fgates[i].gates) { - if (gate->parent != nullptr) { - ++num_decomposed; - // There should be only one decomposed gate in fused gate. 
- break; - } - } - - if (num_decomposed <= num_p_gates) { - loc[0] = i + 1; - } - - if (num_decomposed <= num_pr_gates) { - loc[1] = i + 1; - } - } - - return loc; - } - - struct Bits { - unsigned num_p_bits; - unsigned num_r_bits; - unsigned num_s_bits; - }; - - static Bits CountSchmidtBits( - const Parameter& param, const std::vector& gatexs) { - Bits bits{0, 0, 0}; - - unsigned num_p_gates = param.num_prefix_gatexs; - unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; - - for (std::size_t i = 0; i < gatexs.size(); ++i) { - const auto& gatex = gatexs[i]; - if (i < num_p_gates) { - bits.num_p_bits += gatex.schmidt_bits; - } else if (i < num_pr_gates) { - bits.num_r_bits += gatex.schmidt_bits; - } else { - bits.num_s_bits += gatex.schmidt_bits; - } - } - - return bits; - } - - static unsigned SetSchmidtMatrices(std::size_t i0, std::size_t i1, - uint64_t path, - std::vector& prev_k, - std::vector& gatexs) { - unsigned shift_length = 0; - - for (std::size_t i = i0; i < i1; ++i) { - const auto& gatex = gatexs[i]; - - if (gatex.schmidt_bits == 0) { - // Continue if gatex has Schmidt rank 1. - continue; - } - - unsigned k = (path >> shift_length) & ((1 << gatex.schmidt_bits) - 1); - shift_length += gatex.schmidt_bits; - - if (k != prev_k[i]) { - if (k >= gatex.schmidt_decomp.size()) { - // Invalid path. Returns gatex index plus one to report error in case - // of invalid prefix. 
- return i + 1; - } - - FillSchmidtMatrices(k, gatex); - - prev_k[i] = k; - } - } - - return 0; - } - - static void FillSchmidtMatrices(unsigned k, const GateX& gatex) { - unsigned part0 = gatex.swapped; - unsigned part1 = 1 - part0; - { - gatex.decomposed0->matrix.resize(gatex.schmidt_decomp[k][part0].size()); - auto begin = gatex.schmidt_decomp[k][part0].begin(); - auto end = gatex.schmidt_decomp[k][part0].end(); - std::copy(begin, end, gatex.decomposed0->matrix.begin()); - } - { - gatex.decomposed1->matrix.resize(gatex.schmidt_decomp[k][part1].size()); - auto begin = gatex.schmidt_decomp[k][part1].begin(); - auto end = gatex.schmidt_decomp[k][part1].end(); - std::copy(begin, end, gatex.decomposed1->matrix.begin()); - } - } - - template - static void ApplyGates(const std::vector& gates, - std::size_t i0, std::size_t i1, - const Simulator& simulator, - typename Simulator::State& state) { - for (std::size_t i = i0; i < i1; ++i) { - if (gates[i].matrix.size() > 0) { - ApplyFusedGate(simulator, gates[i], state); - } else { - auto gate = gates[i]; - CalculateFusedMatrix(gate); - ApplyFusedGate(simulator, gate, state); - } - } - } - - static unsigned SchmidtBits(unsigned size) { - switch (size) { - case 1: - return 0; - case 2: - return 1; - case 3: - return 2; - case 4: - return 2; - default: - // Not supported. 
- return 42; - } - } - - template - static bool CreateStates(unsigned num_qubits0,unsigned num_qubits1, - const StateSpace& state_space, bool create, - typename StateSpace::State& state0, - typename StateSpace::State& state1, - typename StateSpace::State* (&rstate0), - typename StateSpace::State* (&rstate1)) { - if (create) { - state0 = state_space.Create(num_qubits0); - state1 = state_space.Create(num_qubits1); - - if (state_space.IsNull(state0) || state_space.IsNull(state1)) { - IO::errorf("not enough memory: is the number of qubits too large?\n"); - return false; - } - - rstate0 = &state0; - rstate1 = &state1; - } - - return true; - } - - For for_; -}; - -} // namespace qsim - -#endif // HYBRID_H_ diff --git a/qsim/io.h b/qsim/io.h deleted file mode 100644 index 3b26c7c..0000000 --- a/qsim/io.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef IO_H_ -#define IO_H_ - -#include -#include - -namespace qsim { - -/** - * Controller for output logs. - */ -struct IO { - static void errorf(const char* format, ...) { - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - } - - static void messagef(const char* format, ...) 
{ - va_list args; - va_start(args, format); - vprintf(format, args); - va_end(args); - } -}; - -} // namespace qsim - -#endif // IO_H_ diff --git a/qsim/io_file.h b/qsim/io_file.h deleted file mode 100644 index 3cfac12..0000000 --- a/qsim/io_file.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef IO_FILE_H_ -#define IO_FILE_H_ - -#include -#include -#include - -#include "io.h" - -namespace qsim { - -/** - * Controller for output logs with methods for writing to file. 
- */ -struct IOFile : public IO { - static std::ifstream StreamFromFile(const std::string& file) { - std::ifstream fs; - fs.open(file); - if (!fs) { - errorf("cannot open %s for reading.\n", file.c_str()); - } - return fs; - } - - static void CloseStream(std::ifstream& fs) { - fs.close(); - } - - static bool WriteToFile( - const std::string& file, const std::string& content) { - return WriteToFile(file, content.data(), content.size()); - } - - static bool WriteToFile( - const std::string& file, const void* data, uint64_t size) { - auto fs = std::fstream(file, std::ios::out | std::ios::binary); - - if (!fs) { - errorf("cannot open %s for writing.\n", file.c_str()); - return false; - } else { - fs.write((const char*) data, size); - if (!fs) { - errorf("cannot write to %s.\n", file.c_str()); - return false; - } - - fs.close(); - } - - return true; - } -}; - -} // namespace qsim - -#endif // IO_FILE_H_ diff --git a/qsim/matrix.h b/qsim/matrix.h deleted file mode 100644 index a3c2640..0000000 --- a/qsim/matrix.h +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MATRIX_H_ -#define MATRIX_H_ - -#include -#include -#include - -#include "bits.h" - -namespace qsim { - -/** - * Gate matrix type. Matrices are stored as vectors. 
The matrix elements are - * accessed as real(m[i][j]) <- vector[2 * (n * i + j)] and - * imag(m[i][j]) <- vector[2 * (n * i + j) + 1], where n is the number of rows - * or columns (n = 2^q, where q is the number of gate qubits). - */ -template -using Matrix = std::vector; - -/** - * Sets all matrix elements to zero. - * @m Matrix to be cleared. - */ -template -inline void MatrixClear(Matrix& m) { - for (unsigned i = 0; i < m.size(); ++i) { - m[i] = 0; - } -} - -/** - * Sets an identity matrix. - * @n Number of matrix rows (columns). - * @m Output identity matrix. - */ -template -inline void MatrixIdentity(unsigned n, Matrix& m) { - m.resize(2 * n * n); - - MatrixClear(m); - - for (unsigned i = 0; i < n; ++i) { - m[2 * (n * i + i)] = 1; - } -} - -/** - * Multiplies two gate matrices of equal size: m2 = m1 m2. - * @q Number of gate qubits. The number of matrix rows (columns) is 2^q. - * @m1 Matrix m1. - * @m2 Input matrix m2. Output product of matrices m2 = m1 m2. - */ -template -inline void MatrixMultiply( - unsigned q, const Matrix& m1, Matrix& m2) { - Matrix mt = m2; - unsigned n = unsigned{1} << q; - - for (unsigned i = 0; i < n; ++i) { - for (unsigned j = 0; j < n; ++j) { - fp_type2 re = 0; - fp_type2 im = 0; - - for (unsigned k = 0; k < n; ++k) { - fp_type2 r1 = m1[2 * (n * i + k)]; - fp_type2 i1 = m1[2 * (n * i + k) + 1]; - fp_type2 r2 = mt[2 * (n * k + j)]; - fp_type2 i2 = mt[2 * (n * k + j) + 1]; - - re += r1 * r2 - i1 * i2; - im += r1 * i2 + i1 * r2; - } - - m2[2 * (n * i + j)] = re; - m2[2 * (n * i + j) + 1] = im; - } - } -} - -/** - * Multiplies two gate matrices of equal size: m2 = m1^\dagger m2. - * @q Number of gate qubits. The number of matrix rows (columns) is 2^q. - * @m1 Matrix m1. - * @m2 Input matrix m2. Output product of matrices m2 = m1 m2. 
- */ -template -inline void MatrixDaggerMultiply( - unsigned q, const Matrix& m1, Matrix& m2) { - Matrix mt = m2; - unsigned n = unsigned{1} << q; - - for (unsigned i = 0; i < n; ++i) { - for (unsigned j = 0; j < n; ++j) { - fp_type2 re = 0; - fp_type2 im = 0; - - for (unsigned k = 0; k < n; ++k) { - fp_type2 r1 = m1[2 * (n * k + i)]; - fp_type2 i1 = m1[2 * (n * k + i) + 1]; - fp_type2 r2 = mt[2 * (n * k + j)]; - fp_type2 i2 = mt[2 * (n * k + j) + 1]; - - re += r1 * r2 + i1 * i2; - im += r1 * i2 - i1 * r2; - } - - m2[2 * (n * i + j)] = re; - m2[2 * (n * i + j) + 1] = im; - } - } -} - -/** - * Multiplies two gate matrices: m2 = m1 m2. The size of m1 should not exceed - * the size of m2. - * @mask1 Qubit mask that specifies the subset of qubits m1 acts on. - * @q1 Number of gate qubits. The number of matrix rows (columns) is 2^q1. - * @m1 Matrix m1. - * @q2 Number of gate qubits. The number of matrix rows (columns) is 2^q2. - * @m2 Input matrix m2. Output product of matrices m2 = m1 m2. - */ -template -inline void MatrixMultiply(unsigned mask1, - unsigned q1, const Matrix& m1, - unsigned q2, Matrix& m2) { - if (q1 == q2) { - MatrixMultiply(q1, m1, m2); - } else { - Matrix mt = m2; - unsigned n1 = unsigned{1} << q1; - unsigned n2 = unsigned{1} << q2; - - for (unsigned i = 0; i < n2; ++i) { - unsigned si = bits::CompressBits(i, q2, mask1); - - for (unsigned j = 0; j < n2; ++j) { - fp_type2 re = 0; - fp_type2 im = 0; - - for (unsigned k = 0; k < n1; ++k) { - unsigned ek = bits::ExpandBits(k, q2, mask1) + (i & ~mask1); - - fp_type2 r1 = m1[2 * (n1 * si + k)]; - fp_type2 i1 = m1[2 * (n1 * si + k) + 1]; - fp_type2 r2 = mt[2 * (n2 * ek + j)]; - fp_type2 i2 = mt[2 * (n2 * ek + j) + 1]; - - re += r1 * r2 - i1 * i2; - im += r1 * i2 + i1 * r2; - } - - m2[2 * (n2 * i + j)] = re; - m2[2 * (n2 * i + j) + 1] = im; - } - } - } -} - -/** - * Multiply a matrix by a real scalar value. - * @c Scalar value. - * @m Input matrix to be multiplied. Output matrix. 
- */ -template -inline void MatrixScalarMultiply(fp_type1 c, Matrix& m) { - for (unsigned i = 0; i < m.size(); ++i) { - m[i] *= c; - } -} - -/** - * Multiply a matrix by a complex scalar value. - * @re Real part of scalar value. - * @im Imaginary part of scalar value. - * @m Input matrix to be multiplied. Output matrix. - */ -template -inline void MatrixScalarMultiply( - fp_type1 re, fp_type1 im, Matrix& m) { - for (unsigned i = 0; i < m.size() / 2; ++i) { - fp_type2 re0 = m[2 * i + 0]; - fp_type2 im0 = m[2 * i + 1]; - m[2 * i + 0] = re * re0 - im * im0; - m[2 * i + 1] = re * im0 + im * re0; - } -} - -/** - * Daggers a matrix. - * @n Number of matrix rows (columns). - * @m Input matrix. Output matrix. - */ -template -inline void MatrixDagger(unsigned n, Matrix& m) { - for (unsigned i = 0; i < n; ++i) { - m[2 * (n * i + i) + 1] = -m[2 * (n * i + i) + 1]; - - for (unsigned j = i + 1; j < n; ++j) { - std::swap(m[2 * (n * i + j)], m[2 * (n * j + i)]); - fp_type t = m[2 * (n * i + j) + 1]; - m[2 * (n * i + j) + 1] = -m[2 * (n * j + i) + 1]; - m[2 * (n * j + i) + 1] = -t; - } - } -} - -/** - * Gets a permutation to rearrange qubits from "normal" order to "gate" - * order. Qubits are ordered in increasing order for "normal" order. - * Qubits are ordered arbitrarily for "gate" order. Returns an empty vector - * if the qubits are in "normal" order. - * @qubits Qubit indices in "gate" order. - * @return Permutation as a vector. 
- */ -inline std::vector NormalToGateOrderPermutation( - const std::vector& qubits) { - std::vector perm; - - bool normal_order = true; - - for (std::size_t i = 1; i < qubits.size(); ++i) { - if (qubits[i] < qubits[i - 1]) { - normal_order = false; - break; - } - } - - if (!normal_order) { - struct QI { - unsigned q; - unsigned index; - }; - - std::vector qis; - qis.reserve(qubits.size()); - - for (std::size_t i = 0; i < qubits.size(); ++i) { - qis.push_back({qubits[i], unsigned(i)}); - } - - std::sort(qis.begin(), qis.end(), [](const QI& l, const QI& r) { - return l.q < r.q; - }); - - perm.reserve(qubits.size()); - - for (std::size_t i = 0; i < qubits.size(); ++i) { - perm.push_back(qis[i].index); - } - } - - return perm; -} - -/** - * Shuffles the gate matrix elements to get the matrix that acts on qubits - * that are in "normal" order (in increasing orger). - * @perm Permutation to rearrange qubits from "normal" order to "gate" order. - * @q Number of gate qubits. The number of matrix rows (columns) is 2^q. - * @m Input matrix. Output shuffled matrix. - */ -template -inline void MatrixShuffle(const std::vector& perm, - unsigned q, Matrix& m) { - Matrix mt = m; - unsigned n = unsigned{1} << q; - - for (unsigned i = 0; i < n; ++i) { - unsigned pi = bits::PermuteBits(i, q, perm); - for (unsigned j = 0; j < n; ++j) { - unsigned pj = bits::PermuteBits(j, q, perm); - - m[2 * (n * i + j)] = mt[2 * (n * pi + pj)]; - m[2 * (n * i + j) + 1] = mt[2 * (n * pi + pj) + 1]; - } - } -} - -} // namespace qsim - -#endif // MATRIX_H_ diff --git a/qsim/mps_simulator.h b/qsim/mps_simulator.h deleted file mode 100644 index 8fbcbae..0000000 --- a/qsim/mps_simulator.h +++ /dev/null @@ -1,246 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MPS_SIMULATOR_H_ -#define MPS_SIMULATOR_H_ - -// For templates will take care of parallelization. -#define EIGEN_DONT_PARALLELIZE 1 - -#include -#include -#include -#include -#include - -#include "../eigen/Eigen/Dense" -#include "../eigen/Eigen/SVD" -#include "mps_statespace.h" - -namespace qsim { - -namespace mps { - -/** - * Truncated Matrix Product State (MPS) circuit simulator w/ vectorization. - */ -template -class MPSSimulator final { - public: - using MPSStateSpace_ = MPSStateSpace; - using State = typename MPSStateSpace_::MPS; - using fp_type = typename MPSStateSpace_::fp_type; - - using Complex = std::complex; - using Matrix = - Eigen::Matrix; - using ConstMatrixMap = Eigen::Map; - using MatrixMap = Eigen::Map; - - using OneQubitMatrix = Eigen::Matrix; - using ConstOneQubitMap = Eigen::Map; - - // Note: ForArgs are currently unused. - template - explicit MPSSimulator(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using non-vectorized instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, const fp_type* matrix, - State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 1: - ApplyGate1(qs, matrix, state); - break; - case 2: - ApplyGate2(qs, matrix, state); - break; - // case 3: - // ApplyGate3(qs, matrix, state); - // break; - // case 4: - // ApplyGate4(qs, matrix, state); - // break; - // case 5: - // ApplyGate5(qs, matrix, state); - // break; - // case 6: - // ApplyGate6(qs, matrix, state); - // break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using eigen3 operations w/ instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, - const fp_type* matrix, State& state) const { - // TODO. - } - - /** - * Computes the expectation value of an operator using eigen3 operations - * w/ vectorized instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - // TODO. 
- return std::complex(-10., -10.); - } - - private: - void ApplyGate1(const std::vector& qs, const fp_type* matrix, - State& state) const { - if (qs[0] == state.num_qubits() - 1) { - Apply1Right(qs, matrix, state); - } else { - Apply1LeftOrInterior(qs, matrix, state); - } - } - - void Apply1LeftOrInterior(const std::vector& qs, - const fp_type* matrix, State& state) const { - fp_type* raw_state = state.get(); - const auto bond_dim = state.bond_dim(); - const auto l_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); - const auto r_offset = MPSStateSpace_::GetBlockOffset(state, qs[0] + 1); - const auto end = MPSStateSpace_::Size(state); - ConstOneQubitMap gate_matrix((Complex*) matrix); - MatrixMap scratch_block((Complex*)(raw_state + end), 2, bond_dim); - - for (unsigned block_sep = l_offset; block_sep < r_offset; - block_sep += 4 * bond_dim) { - fp_type* cur_block = raw_state + block_sep; - ConstMatrixMap mps_block((Complex*) cur_block, 2, bond_dim); - scratch_block.noalias() = gate_matrix * mps_block; - memcpy(cur_block, raw_state + end, sizeof(fp_type) * bond_dim * 4); - } - } - - void Apply1Right(const std::vector& qs, const fp_type* matrix, - State& state) const { - fp_type* raw_state = state.get(); - const auto bond_dim = state.bond_dim(); - const auto offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); - const auto end = MPSStateSpace_::Size(state); - ConstOneQubitMap gate_matrix((Complex*) matrix); - ConstMatrixMap mps_block((Complex*)(raw_state + offset), bond_dim, 2); - MatrixMap scratch_block((Complex*)(raw_state + end), bond_dim, 2); - scratch_block.noalias() = mps_block * gate_matrix.transpose(); - memcpy(raw_state + offset, raw_state + end, sizeof(fp_type) * bond_dim * 4); - } - - void ApplyGate2(const std::vector& qs, const fp_type* matrix, - State& state) const { - // TODO: micro-benchmark this function and improve performance. 
- const auto bond_dim = state.bond_dim(); - const auto num_qubits = state.num_qubits(); - fp_type* raw_state = state.get(); - - const auto i_dim = (qs[0] == 0) ? 1 : bond_dim; - const auto j_dim = 2; - const auto k_dim = bond_dim; - const auto l_dim = 2; - const auto m_dim = (qs[1] == num_qubits - 1) ? 1 : bond_dim; - - const auto b_0_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); - const auto b_1_offset = MPSStateSpace_::GetBlockOffset(state, qs[1]); - const auto end = MPSStateSpace_::Size(state); - - MatrixMap block_0((Complex*)(raw_state + b_0_offset), i_dim * j_dim, k_dim); - MatrixMap block_1((Complex*)(raw_state + b_1_offset), k_dim, l_dim * m_dim); - - // Merge both blocks into scratch space. - MatrixMap scratch_c((Complex*)(raw_state + end), i_dim * j_dim, l_dim * m_dim); - scratch_c.noalias() = block_0 * block_1; - - // Transpose inner dims in-place. - MatrixMap scratch_c_t((Complex*)(raw_state + end), i_dim * j_dim * l_dim, m_dim); - for (unsigned i = 0; i < i_dim * j_dim * l_dim; i += 4) { - scratch_c_t.row(i + 1).swap(scratch_c_t.row(i + 2)); - } - - // Transpose gate matrix and place in 3rd (last) scratch block. - const auto scratch3_offset = end + 8 * bond_dim * bond_dim; - ConstMatrixMap gate_matrix((Complex*) matrix, 4, 4); - MatrixMap gate_matrix_transpose((Complex*)(raw_state + scratch3_offset), 4, 4); - gate_matrix_transpose = gate_matrix.transpose(); - gate_matrix_transpose.col(1).swap(gate_matrix_transpose.col(2)); - - // Contract gate and merged block tensors, placing result in B0B1. - for (unsigned i = 0; i < i_dim; ++i) { - fp_type* src_block = raw_state + end + i * 8 * m_dim; - fp_type* dest_block = raw_state + b_0_offset + i * 8 * m_dim; - MatrixMap block_b0b1((Complex*) dest_block, 4, m_dim); - ConstMatrixMap scratch_c_i((Complex*) src_block, 4, m_dim); - // [i, np, m] = [np, lj] * [i, lj, m] - block_b0b1.noalias() = gate_matrix_transpose * scratch_c_i; - } - - // SVD B0B1. 
- MatrixMap full_b0b1((Complex*)(raw_state + b_0_offset), 2 * i_dim, 2 * m_dim); - Eigen::BDCSVD svd(full_b0b1, Eigen::ComputeThinU | Eigen::ComputeThinV); - const auto p = std::min(2 * i_dim, 2 * m_dim); - - // Place U in scratch to truncate and then B0. - MatrixMap svd_u((Complex*)(raw_state + end), 2 * i_dim, p); - svd_u.noalias() = svd.matrixU(); - block_0.fill(Complex(0, 0)); - const auto keep_cols = (svd_u.cols() > bond_dim) ? bond_dim : svd_u.cols(); - block_0.block(0, 0, svd_u.rows(), keep_cols).noalias() = - svd_u(Eigen::indexing::all, Eigen::seq(0, keep_cols - 1)); - - // Place row product of S V into scratch to truncate and then B1. - MatrixMap svd_v((Complex*)(raw_state + end), p, 2 * m_dim); - MatrixMap s_vector((Complex*)(raw_state + end + 8 * bond_dim * bond_dim), p, 1); - svd_v.noalias() = svd.matrixV().adjoint(); - s_vector.noalias() = svd.singularValues(); - block_1.fill(Complex(0, 0)); - const auto keep_rows = (svd_v.rows() > bond_dim) ? bond_dim : svd_v.rows(); - const auto row_seq = Eigen::seq(0, keep_rows - 1); - for (unsigned i = 0; i < keep_rows; ++i) { - svd_v.row(i) *= s_vector(i); - } - block_1.block(0, 0, keep_rows, svd_v.cols()).noalias() = - svd_v(row_seq, Eigen::indexing::all); - } - - For for_; -}; - -} // namespace mps -} // namespace qsim - -#endif // MPS_SIMULATOR_H_ diff --git a/qsim/mps_statespace.h b/qsim/mps_statespace.h deleted file mode 100644 index 9b3acf3..0000000 --- a/qsim/mps_statespace.h +++ /dev/null @@ -1,597 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MPS_STATESPACE_H_ -#define MPS_STATESPACE_H_ - -// For templates will take care of parallelization. -#define EIGEN_DONT_PARALLELIZE 1 - -#ifdef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include - -#include "../eigen/Eigen/Dense" -#include "../eigen/unsupported/Eigen/CXX11/Tensor" - -namespace qsim { - -namespace mps { - -namespace detail { - -inline void do_not_free(void*) {} - -inline void free(void* ptr) { -#ifdef _WIN32 - _aligned_free(ptr); -#else - ::free(ptr); -#endif -} - -} // namespace detail - -/** - * Class containing context and routines for fixed bond dimension - * truncated Matrix Product State (MPS) simulation. - */ -template -class MPSStateSpace { - private: - public: - using fp_type = FP; - using Pointer = std::unique_ptr; - - using Complex = std::complex; - using Matrix = - Eigen::Matrix; - using ConstMatrixMap = Eigen::Map; - using MatrixMap = Eigen::Map; - - // Store MPS tensors with the following shape: - // [2, bond_dim], [bond_dim, 2, bond_dim], ... , [bond_dim, 2]. 
- class MPS { - public: - MPS() = delete; - - MPS(Pointer&& ptr, unsigned num_qubits, unsigned bond_dim) - : ptr_(std::move(ptr)), num_qubits_(num_qubits), bond_dim_(bond_dim) {} - - fp_type* get() { return ptr_.get(); } - - const fp_type* get() const { return ptr_.get(); } - - fp_type* release() { - num_qubits_ = 0; - return ptr_.release(); - } - - unsigned num_qubits() const { return num_qubits_; } - - unsigned bond_dim() const { return bond_dim_; } - - private: - Pointer ptr_; - unsigned num_qubits_; - unsigned bond_dim_; - }; - - // Note: ForArgs are currently unused. - template - MPSStateSpace(ForArgs&&... args) : for_(args...) {} - - // Requires num_qubits >= 2 and bond_dim >= 2. - static MPS Create(unsigned num_qubits, unsigned bond_dim) { - auto end_sizes = 2 * 4 * bond_dim; - auto internal_sizes = 4 * bond_dim * bond_dim * (num_qubits + 1); - // Use three extra "internal style" blocks past the end of the - // working allocation for scratch space. Needed for gate - // application. - auto size = sizeof(fp_type) * (end_sizes + internal_sizes); - -#ifdef _WIN32 - Pointer ptr{(fp_type*)_aligned_malloc(size, 64), &detail::free}; - bool is_null = ptr.get() != nullptr; - return MPS{std::move(ptr), is_null ? num_qubits : 0, - is_null ? bond_dim : 0}; -#else - void* p = nullptr; - if (posix_memalign(&p, 64, size) == 0) { - return MPS{Pointer{(fp_type*)p, &detail::free}, num_qubits, bond_dim}; - } else { - return MPS{Pointer{nullptr, &detail::free}, 0, 0}; - } -#endif - } - - static unsigned Size(const MPS& state) { - auto end_sizes = 2 * 4 * state.bond_dim(); - auto internal_sizes = 4 * state.bond_dim() * state.bond_dim(); - return end_sizes + internal_sizes * (state.num_qubits() - 2); - } - - static unsigned RawSize(const MPS& state) { - return sizeof(fp_type) * Size(state); - } - - // Get the pointer offset to the beginning of an MPS block. 
- static unsigned GetBlockOffset(const MPS& state, unsigned i) { - if (i == 0) { - return 0; - } - return 4 * state.bond_dim() * (1 + state.bond_dim() * (i - 1)); - } - - // Copies the state contents of one MPS to another. - // Ignores scratch data. - static bool Copy(const MPS& src, MPS& dest) { - if ((src.num_qubits() != dest.num_qubits()) || - src.bond_dim() != dest.bond_dim()) { - return false; - } - auto size = RawSize(src); - memcpy(dest.get(), src.get(), size); - return true; - } - - // Set the MPS to the |0> state. - static void SetStateZero(MPS& state) { - auto size = Size(state); - memset(state.get(), 0, sizeof(fp_type) * size); - auto block_size = 4 * state.bond_dim() * state.bond_dim(); - state.get()[0] = 1.0; - for (unsigned i = 4 * state.bond_dim(); i < size; i += block_size) { - state.get()[i] = 1.0; - } - } - - // Computes Re{} for two equal sized MPS. - // Requires: state1.bond_dim() == state2.bond_dim() && - // state1.num_qubits() == state2.num_qubits() - static fp_type RealInnerProduct(MPS& state1, MPS& state2) { - return InnerProduct(state1, state2).real(); - } - - // Computes for two equal sized MPS. - // Requires: state1.bond_dim() == state2.bond_dim() && - // state1.num_qubits() == state2.num_qubits() - static std::complex InnerProduct(MPS& state1, MPS& state2) { - const auto num_qubits = state1.num_qubits(); - const auto bond_dim = state1.bond_dim(); - const auto end = Size(state1); - auto offset = 0; - fp_type* state1_raw = state1.get(); - fp_type* state2_raw = state2.get(); - - // Contract leftmost blocks together, store result in state1 scratch. 
- ConstMatrixMap top((Complex*)state2_raw, 2, bond_dim); - ConstMatrixMap bot((Complex*)state1_raw, 2, bond_dim); - MatrixMap partial_contract((Complex*)(state1_raw + end), bond_dim, - bond_dim); - MatrixMap partial_contract2( - (Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), bond_dim, - 2 * bond_dim); - partial_contract.noalias() = top.adjoint() * bot; - - // Contract all internal blocks together. - for (unsigned i = 1; i < num_qubits - 1; ++i) { - offset = GetBlockOffset(state1, i); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), - bond_dim, 2 * bond_dim); - - // Merge bot into left boundary merged tensor. - new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, - 2 * bond_dim); - partial_contract2.noalias() = partial_contract * bot; - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), - 2 * bond_dim, bond_dim); - - // Merge top into partial_contract2. - new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, - bond_dim); - partial_contract.noalias() = top.adjoint() * partial_contract2; - } - - // Contract rightmost bottom block. - offset = GetBlockOffset(state1, num_qubits - 1); - new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, 2); - new (&partial_contract2) MatrixMap( - (Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), bond_dim, 2); - partial_contract2.noalias() = partial_contract * bot; - - // Contract rightmost top block. - new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, 1); - new (&partial_contract) MatrixMap((Complex*)(state1_raw + end), 1, 1); - new (&partial_contract2) - MatrixMap((Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), - 2 * bond_dim, 1); - partial_contract.noalias() = top.adjoint() * partial_contract2; - - return partial_contract(0, 0); - } - - // Compute the 2x2 1-RDM of state on index. Result written to rdm. 
- // Requires: scratch and rdm to be allocated. - static void ReduceDensityMatrix(MPS& state, MPS& scratch, int index, - fp_type* rdm) { - const auto num_qubits = state.num_qubits(); - const auto bond_dim = state.bond_dim(); - const auto end = Size(state); - const bool last_index = (index == num_qubits - 1); - const auto right_dim = (last_index ? 1 : bond_dim); - auto offset = 0; - fp_type* state_raw = state.get(); - fp_type* scratch_raw = scratch.get(); - fp_type* state_raw_workspace = state_raw + end + 2 * bond_dim * bond_dim; - fp_type* scratch_raw_workspace = - scratch_raw + end + 2 * bond_dim * bond_dim; - - Copy(state, scratch); - - // Contract leftmost blocks together, store result in state scratch. - ConstMatrixMap top((Complex*)scratch_raw, 2, bond_dim); - ConstMatrixMap bot((Complex*)state_raw, 2, bond_dim); - MatrixMap partial_contract((Complex*)(state_raw + end), bond_dim, bond_dim); - MatrixMap partial_contract2((Complex*)(state_raw_workspace), bond_dim, - 2 * bond_dim); - - partial_contract.setZero(); - partial_contract(0, 0) = 1; - if (index > 0) { - partial_contract.noalias() = top.adjoint() * bot; - } - - // Contract all internal blocks together. - for (unsigned i = 1; i < index; ++i) { - offset = GetBlockOffset(state, i); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state_raw_workspace), bond_dim, 2 * bond_dim); - - // Merge bot into left boundary merged tensor. - new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, - 2 * bond_dim); - partial_contract2.noalias() = partial_contract * bot; - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state_raw_workspace), 2 * bond_dim, bond_dim); - - // Merge top into partial_contract2. - new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, - bond_dim); - partial_contract.noalias() = top.adjoint() * partial_contract2; - } - - // The [bond_dim, bond_dim] block in state_raw now contains the contraction - // up to, but not including index. 
- // Contract rightmost blocks. - offset = GetBlockOffset(state, num_qubits - 1); - new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, 2); - new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2); - new (&partial_contract) - MatrixMap((Complex*)(scratch_raw + end), bond_dim, bond_dim); - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim); - - partial_contract.setZero(); - partial_contract(0, 0) = 1; - if (index < num_qubits - 1) { - partial_contract.noalias() = top * bot.adjoint(); - } - - for (unsigned i = num_qubits - 2; i > index; --i) { - offset = GetBlockOffset(state, i); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw_workspace), 2 * bond_dim, bond_dim); - - // Merge bot into left boundary merged tensor. - new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), 2 * bond_dim, - bond_dim); - partial_contract2.noalias() = bot * partial_contract.adjoint(); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim); - - // Merge top into partial_contract2. - new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, - 2 * bond_dim); - // [bd, bd] = [bd, 2bd] @ [bd, 2bd] - partial_contract.noalias() = top * partial_contract2.adjoint(); - } - - // The [bond_dim, bond_dim] block in scratch_raw now contains the - // contraction down from the end, but not including the index. Begin final - // contraction steps. - - // Get leftmost [bd, bd] contraction and contract with top. 
- - offset = GetBlockOffset(state, index); - new (&partial_contract) - MatrixMap((Complex*)(state_raw + end), bond_dim, bond_dim); - new (&top) - ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2 * right_dim); - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * right_dim); - partial_contract2.noalias() = partial_contract * top.conjugate(); - // copy the bottom contraction scratch_raw to state_raw to save space. - memcpy(state_raw + end, scratch_raw + end, - bond_dim * bond_dim * 2 * sizeof(fp_type)); - - // Contract top again for correct shape. - fp_type* contract3_target = (last_index ? rdm : scratch_raw); - MatrixMap partial_contract3((Complex*)contract3_target, 2 * right_dim, - 2 * right_dim); - partial_contract3.noalias() = top.transpose() * partial_contract2; - - // If we are contracting the last index, all the needed transforms are done. - if (last_index) { - return; - } - - // Conduct final tensor contraction operations. Cannot be easily compiled to - // matmul. - const Eigen::TensorMap> - t_4d((Complex*)scratch_raw, 2, bond_dim, 2, bond_dim); - const Eigen::TensorMap> - t_2d((Complex*)(state_raw + end), bond_dim, bond_dim); - - const Eigen::array, 2> product_dims = { - Eigen::IndexPair(1, 0), - Eigen::IndexPair(3, 1), - }; - Eigen::TensorMap> out( - (Complex*)rdm, 2, 2); - out = t_4d.contract(t_2d, product_dims); - } - - // Draw a single bitstring sample from state using scratch and scratch2 - // as working space. - static void SampleOnce(MPS& state, MPS& scratch, MPS& scratch2, - std::mt19937* random_gen, std::vector* sample) { - // TODO: carefully profile with perf and optimize temp storage - // locations for cache friendliness. 
- const auto bond_dim = state.bond_dim(); - const auto num_qubits = state.num_qubits(); - const auto end = Size(state); - const auto left_frontier_offset = GetBlockOffset(state, num_qubits + 1); - std::default_random_engine generator; - fp_type* state_raw = state.get(); - fp_type* scratch_raw = scratch.get(); - fp_type* scratch2_raw = scratch2.get(); - fp_type rdm[8]; - - sample->reserve(num_qubits); - Copy(state, scratch); - Copy(state, scratch2); - - // Store prefix contractions in scratch2. - auto offset = GetBlockOffset(state, num_qubits - 1); - ConstMatrixMap top((Complex*)(state_raw + offset), bond_dim, 2); - ConstMatrixMap bot((Complex*)(scratch_raw + offset), bond_dim, 2); - MatrixMap partial_contract((Complex*)(scratch2_raw + offset), bond_dim, - bond_dim); - MatrixMap partial_contract2((Complex*)(scratch_raw + end), bond_dim, - 2 * bond_dim); - partial_contract.noalias() = top * bot.adjoint(); - - for (unsigned i = num_qubits - 2; i > 0; --i) { - offset = GetBlockOffset(state, i); - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw + end), 2 * bond_dim, bond_dim); - - // Merge bot into left boundary merged tensor. - new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, - bond_dim); - partial_contract2.noalias() = bot * partial_contract.adjoint(); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw + end), bond_dim, 2 * bond_dim); - - // Merge top into partial_contract2. - new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, - 2 * bond_dim); - - // merge into partial_contract -> scracth2_raw. - new (&partial_contract) - MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); - partial_contract.noalias() = top * partial_contract2.adjoint(); - } - - // Compute RDM-0 and draw first sample. 
- offset = GetBlockOffset(state, 1); - new (&top) ConstMatrixMap((Complex*)state_raw, 2, bond_dim); - new (&bot) ConstMatrixMap((Complex*)scratch_raw, 2, bond_dim); - new (&partial_contract) - MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw + end), 2, bond_dim); - - partial_contract2.noalias() = bot * partial_contract.adjoint(); - - new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2); - partial_contract.noalias() = top * partial_contract2.adjoint(); - auto p0 = rdm[0] / (rdm[0] + rdm[6]); - std::bernoulli_distribution distribution(1 - p0); - auto bit_val = distribution(*random_gen); - sample->push_back(bit_val); - - // collapse state. - new (&partial_contract) MatrixMap((Complex*)scratch_raw, 2, bond_dim); - partial_contract.row(!bit_val).setZero(); - - // Prepare left contraction frontier. - new (&partial_contract2) MatrixMap( - (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); - partial_contract2.noalias() = - partial_contract.transpose() * partial_contract.conjugate(); - - // Compute RDM-i and draw internal tensor samples. - for (unsigned i = 1; i < num_qubits - 1; i++) { - // Get leftmost [bd, bd] contraction and contract with top. - offset = GetBlockOffset(state, i); - new (&partial_contract) MatrixMap( - (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); - new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, - 2 * bond_dim); - new (&partial_contract2) - MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim); - partial_contract2.noalias() = partial_contract * top.conjugate(); - - // Contract top again for correct shape. - MatrixMap partial_contract3((Complex*)(scratch_raw + end), 2 * bond_dim, - 2 * bond_dim); - partial_contract3.noalias() = top.transpose() * partial_contract2; - - // Conduct final tensor contraction operations. Cannot be easily compiled - // to matmul. 
Perf reports shows only ~6% of runtime spent here on large - // systems. - offset = GetBlockOffset(state, i + 1); - const Eigen::TensorMap> - t_4d((Complex*)(scratch_raw + end), 2, bond_dim, 2, bond_dim); - const Eigen::TensorMap> - t_2d((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); - - const Eigen::array, 2> product_dims = { - Eigen::IndexPair(1, 0), - Eigen::IndexPair(3, 1), - }; - Eigen::TensorMap> out( - (Complex*)rdm, 2, 2); - out = t_4d.contract(t_2d, product_dims); - - // Sample bit and collapse state. - p0 = rdm[0] / (rdm[0] + rdm[6]); - distribution = std::bernoulli_distribution(1 - p0); - bit_val = distribution(*random_gen); - - sample->push_back(bit_val); - offset = GetBlockOffset(state, i); - new (&partial_contract) - MatrixMap((Complex*)(scratch_raw + offset), bond_dim * 2, bond_dim); - for (unsigned j = !bit_val; j < 2 * bond_dim; j += 2) { - partial_contract.row(j).setZero(); - } - - // Update left frontier. - new (&partial_contract) MatrixMap( - (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim); - - // Merge bot into left boundary merged tensor. - new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, - 2 * bond_dim); - partial_contract2.noalias() = partial_contract * bot.conjugate(); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state_raw + end), 2 * bond_dim, bond_dim); - - // Merge top into partial_contract2. - new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, - bond_dim); - partial_contract.noalias() = top.transpose() * partial_contract2; - } - - // Compute RDM-(n-1) and sample. 
- offset = GetBlockOffset(state, num_qubits - 1); - new (&partial_contract2) - MatrixMap((Complex*)(state_raw + end), bond_dim, 2); - - new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2); - partial_contract2.noalias() = partial_contract * top.conjugate(); - new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2); - partial_contract.noalias() = top.transpose() * partial_contract2; - - p0 = rdm[0] / (rdm[0] + rdm[6]); - distribution = std::bernoulli_distribution(1 - p0); - bit_val = distribution(*random_gen); - sample->push_back(bit_val); - } - - // Draw num_samples bitstring samples from state and store the result - // bit vectors in results. Uses scratch and scratch2 as workspace. - static void Sample(MPS& state, MPS& scratch, MPS& scratch2, - unsigned num_samples, unsigned seed, - std::vector>* results) { - std::mt19937 rand_source(seed); - results->reserve(num_samples); - for (unsigned i = 0; i < num_samples; i++) { - SampleOnce(state, scratch, scratch2, &rand_source, &(*results)[i]); - } - } - - // Testing only. Convert the MPS to a wavefunction under "normal" ordering. - // Requires: wf be allocated beforehand with bond_dim * 2 ^ num_qubits -1 - // memory. - static void ToWaveFunction(MPS& state, fp_type* wf) { - const auto bond_dim = state.bond_dim(); - const auto num_qubits = state.num_qubits(); - fp_type* raw_state = state.get(); - - ConstMatrixMap accum = ConstMatrixMap((Complex*)(raw_state), 2, bond_dim); - ConstMatrixMap next_block = ConstMatrixMap(nullptr, 0, 0); - MatrixMap result2 = MatrixMap(nullptr, 0, 0); - auto offset = 0; - auto result2_size = 2; - - for (unsigned i = 1; i < num_qubits - 1; i++) { - offset = GetBlockOffset(state, i); - // use of new does not trigger any expensive operations. - new (&next_block) ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, - 2 * bond_dim); - new (&result2) MatrixMap((Complex*)(wf), result2_size, 2 * bond_dim); - - // temp variable used since result2 and accum point to same memory. 
- result2 = accum * next_block; - result2_size *= 2; - new (&accum) ConstMatrixMap((Complex*)(wf), result2_size, bond_dim); - } - offset = GetBlockOffset(state, num_qubits - 1); - new (&next_block) - ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, 2); - new (&result2) MatrixMap((Complex*)(wf), result2_size, 2); - result2 = accum * next_block; - } - - protected: - For for_; -}; - -} // namespace mps -} // namespace qsim - -#endif // MPS_STATESPACE_H_ diff --git a/qsim/parfor.h b/qsim/parfor.h deleted file mode 100644 index 8a3a4d6..0000000 --- a/qsim/parfor.h +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef PARFOR_H_ -#define PARFOR_H_ - -#include - -#include -#include -#include - -namespace qsim { - -/** - * Helper struct for executing for-loops in parallel across multiple threads. - */ -template -struct ParallelForT { - explicit ParallelForT(unsigned num_threads) : num_threads(num_threads) {} - - // GetIndex0 and GetIndex1 are useful when we need to know how work was - // divided between threads, for instance, for reusing partial sums obtained - // by RunReduceP. - uint64_t GetIndex0(uint64_t size, unsigned thread_id) const { - return size >= MIN_SIZE ? size * thread_id / num_threads : 0; - } - - uint64_t GetIndex1(uint64_t size, unsigned thread_id) const { - return size >= MIN_SIZE ? 
size * (thread_id + 1) / num_threads : size; - } - - template - void Run(uint64_t size, Function&& func, Args&&... args) const { - if (num_threads > 1 && size >= MIN_SIZE) { - #pragma omp parallel num_threads(num_threads) - { - unsigned n = omp_get_num_threads(); - unsigned m = omp_get_thread_num(); - - uint64_t i0 = GetIndex0(size, m); - uint64_t i1 = GetIndex1(size, m); - - for (uint64_t i = i0; i < i1; ++i) { - func(n, m, i, args...); - } - } - } else { - for (uint64_t i = 0; i < size; ++i) { - func(1, 0, i, args...); - } - } - } - - template - std::vector RunReduceP( - uint64_t size, Function&& func, Op&& op, Args&&... args) const { - std::vector partial_results; - - if (num_threads > 1 && size >= MIN_SIZE) { - partial_results.resize(num_threads, 0); - - #pragma omp parallel num_threads(num_threads) - { - unsigned n = omp_get_num_threads(); - unsigned m = omp_get_thread_num(); - - uint64_t i0 = GetIndex0(size, m); - uint64_t i1 = GetIndex1(size, m); - - typename Op::result_type partial_result = 0; - - for (uint64_t i = i0; i < i1; ++i) { - partial_result = op(partial_result, func(n, m, i, args...)); - } - - partial_results[m] = partial_result; - } - } else if (num_threads > 0) { - typename Op::result_type result = 0; - for (uint64_t i = 0; i < size; ++i) { - result = op(result, func(1, 0, i, args...)); - } - - partial_results.resize(1, result); - } - - return partial_results; - } - - template - typename Op::result_type RunReduce(uint64_t size, Function&& func, - Op&& op, Args&&... 
args) const { - auto partial_results = RunReduceP(size, func, std::move(op), args...); - - typename Op::result_type result = 0; - - for (auto partial_result : partial_results) { - result = op(result, partial_result); - } - - return result; - } - - unsigned num_threads; -}; - -using ParallelFor = ParallelForT<1024>; - -} // namespace qsim - -#endif // PARFOR_H_ diff --git a/qsim/qtrajectory.h b/qsim/qtrajectory.h deleted file mode 100644 index 1da6692..0000000 --- a/qsim/qtrajectory.h +++ /dev/null @@ -1,435 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef QTRAJECTORY_H_ -#define QTRAJECTORY_H_ - -#include -#include -#include -#include -#include - -#include "circuit_noisy.h" -#include "gate.h" -#include "gate_appl.h" - -namespace qsim { - -/** - * Quantum trajectory simulator. - */ -template class FuserT, typename Simulator, - typename RGen = std::mt19937> -class QuantumTrajectorySimulator { - public: - using Fuser = FuserT; - using StateSpace = typename Simulator::StateSpace; - using State = typename Simulator::State; - using MeasurementResult = typename StateSpace::MeasurementResult; - - /** - * User-specified parameters for the simulator. - */ - struct Parameter : public Fuser::Parameter { - /** - * If true, collect statistics of sampled Kraus operator indices. - */ - bool collect_kop_stat = false; - /** - * If true, collect statistics of measured bitstrings. 
- */ - bool collect_mea_stat = false; - /** - * If true, normalize the state vector before performing measurements. - */ - bool normalize_before_mea_gates = true; - /** - * If false, do not apply deferred operators after the main loop for - * the "primary" noise trajectory, that is the trajectory in which - * the primary (the first operators in their respective channels) Kraus - * operators are sampled for each channel and there are no measurements - * in the computational basis. This can be used to speed up simulations - * of circuits with weak noise and without measurements by reusing - * the primary trajectory results. There is an additional condition for - * RunBatch. In this case, the deferred operators after the main loop are - * still applied for the first occurence of the primary trajectory. - * The primary Kraus operators should have the highest sampling - * probabilities to achieve the highest speedup. - * - * It is the client's responsibility to collect the primary trajectory - * results and to reuse them. - */ - bool apply_last_deferred_ops = true; - }; - - /** - * Struct with statistics to populate by RunBatch and RunOnce methods. - */ - struct Stat { - /** - * Indices of sampled Kraus operator indices and/or measured bitstrings. - */ - std::vector samples; - /** - * True if the "primary" noise trajectory is sampled, false otherwise. - */ - bool primary; - }; - - /** - * Runs the given noisy circuit performing repetitions. Each repetition is - * seeded by repetition ID. - * @param param Options for the quantum trajectory simulator. - * @param circuit The noisy circuit to be simulated. - * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions. - * @param state_space StateSpace object required to manipulate state vector. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param measure Function that performs measurements (in the sense of - * computing expectation values, etc). 
This function should have three - * required parameters [repetition ID (uint64_t), final state vector - * (const State&), statistics of sampled Kraus operator indices and/or - * measured bitstrings (const Stat&)] and any number of optional parameters. - * @param args Optional arguments for the 'measure' function. - * @return True if the simulation completed successfully; false otherwise. - */ - template - static bool RunBatch(const Parameter& param, - const NoisyCircuit& circuit, - uint64_t r0, uint64_t r1, const StateSpace& state_space, - const Simulator& simulator, MeasurementFunc&& measure, - Args&&... args) { - return RunBatch(param, circuit.num_qubits, circuit.channels.begin(), - circuit.channels.end(), r0, r1, state_space, simulator, - measure, args...); - } - - /** - * Runs the given noisy circuit performing repetitions. Each repetition is - * seeded by repetition ID. - * @param param Options for the quantum trajectory simulator. - * @param num_qubits The number of qubits acted on by the circuit. - * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit. - * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions. - * @param state_space StateSpace object required to manipulate state vector. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param measure Function that performs measurements (in the sense of - * computing expectation values, etc). This function should have three - * required parameters [repetition ID (uint64_t), final state vector - * (const State&), statistics of sampled Kraus operator indices and/or - * measured bitstrings (const Stat&)] and any number of optional parameters. - * @param args Optional arguments for the 'measure' function. - * @return True if the simulation completed successfully; false otherwise. 
- */ - template - static bool RunBatch(const Parameter& param, unsigned num_qubits, - ncircuit_iterator cbeg, - ncircuit_iterator cend, - uint64_t r0, uint64_t r1, const StateSpace& state_space, - const Simulator& simulator, MeasurementFunc&& measure, - Args&&... args) { - std::vector gates; - gates.reserve(4 * std::size_t(cend - cbeg)); - - State state = state_space.Null(); - - Stat stat; - bool had_primary_realization = false; - - for (uint64_t r = r0; r < r1; ++r) { - if (!state_space.IsNull(state)) { - state_space.SetStateZero(state); - } - - bool apply_last_deferred_ops = - param.apply_last_deferred_ops || !had_primary_realization; - - if (!RunIteration(param, apply_last_deferred_ops, num_qubits, cbeg, cend, - r, state_space, simulator, gates, state, stat)) { - return false; - } - - if (stat.primary && !had_primary_realization) { - had_primary_realization = true; - } - - measure(r, state, stat, args...); - } - - return true; - } - - /** - * Runs the given noisy circuit one time. - * @param param Options for the quantum trajectory simulator. - * @param circuit The noisy circuit to be simulated. - * @param r The repetition ID. The random number generator is seeded by 'r'. - * @param state_space StateSpace object required to manipulate state vector. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param state The state of the system, to be updated by this method. - * @param stat Statistics of sampled Kraus operator indices and/or measured - * bitstrings, to be populated by this method. - * @return True if the simulation completed successfully; false otherwise. 
- */ - static bool RunOnce(const Parameter& param, - const NoisyCircuit& circuit, uint64_t r, - const StateSpace& state_space, const Simulator& simulator, - State& state, Stat& stat) { - return RunOnce(param, circuit.num_qubits, circuit.channels.begin(), - circuit.channels.end(), r, state_space, simulator, - state, stat); - } - - /** - * Runs the given noisy circuit one time. - * @param param Options for the quantum trajectory simulator. - * @param num_qubits The number of qubits acted on by the circuit. - * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit. - * @param circuit The noisy circuit to be simulated. - * @param r The repetition ID. The random number generator is seeded by 'r'. - * @param state_space StateSpace object required to manipulate state vector. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param state The state of the system, to be updated by this method. - * @param stat Statistics of sampled Kraus operator indices and/or measured - * bitstrings, to be populated by this method. - * @return True if the simulation completed successfully; false otherwise. 
- */ - static bool RunOnce(const Parameter& param, unsigned num_qubits, - ncircuit_iterator cbeg, - ncircuit_iterator cend, - uint64_t r, const StateSpace& state_space, - const Simulator& simulator, State& state, Stat& stat) { - std::vector gates; - gates.reserve(4 * std::size_t(cend - cbeg)); - - if (!RunIteration(param, param.apply_last_deferred_ops, num_qubits, cbeg, - cend, r, state_space, simulator, gates, state, stat)) { - return false; - } - - return true; - } - - private: - static bool RunIteration(const Parameter& param, - bool apply_last_deferred_ops, unsigned num_qubits, - ncircuit_iterator cbeg, - ncircuit_iterator cend, - uint64_t rep, const StateSpace& state_space, - const Simulator& simulator, - std::vector& gates, - State& state, Stat& stat) { - if (param.collect_kop_stat || param.collect_mea_stat) { - stat.samples.reserve(std::size_t(cend - cbeg)); - stat.samples.resize(0); - } - - if (state_space.IsNull(state)) { - state = CreateState(num_qubits, state_space); - if (state_space.IsNull(state)) { - return false; - } - - state_space.SetStateZero(state); - } - - gates.resize(0); - - RGen rgen(rep); - std::uniform_real_distribution distr(0.0, 1.0); - - bool unitary = true; - stat.primary = true; - - for (auto it = cbeg; it != cend; ++it) { - const auto& channel = *it; - - if (channel.size() == 0) continue; - - if (channel[0].kind == gate::kMeasurement) { - // Measurement channel. - - if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { - return false; - } - - bool normalize = !unitary && param.normalize_before_mea_gates; - NormalizeState(normalize, state_space, unitary, state); - - auto mresult = ApplyMeasurementGate(state_space, channel[0].ops[0], - rgen, state); - - if (!mresult.valid) { - return false; - } - - CollectStat(param.collect_mea_stat, mresult.bits, stat); - - stat.primary = false; - - continue; - } - - // "Normal" channel. 
- - double r = distr(rgen); - double cp = 0; - - // Perform sampling of Kraus operators using probability bounds. - for (std::size_t i = 0; i < channel.size(); ++i) { - const auto& kop = channel[i]; - - cp += kop.prob; - - if (r < cp) { - DeferOps(kop.ops, gates); - CollectStat(param.collect_kop_stat, i, stat); - - unitary = unitary && kop.unitary; - - break; - } - } - - if (r < cp) continue; - - if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { - return false; - } - - NormalizeState(!unitary, state_space, unitary, state); - - double max_prob = 0; - std::size_t max_prob_index = 0; - - // Perform sampling of Kraus operators using norms of updated states. - for (std::size_t i = 0; i < channel.size(); ++i) { - const auto& kop = channel[i]; - - if (kop.unitary) continue; - - double prob = std::real( - simulator.ExpectationValue(kop.qubits, kop.kd_k.data(), state)); - - if (prob > max_prob) { - max_prob = prob; - max_prob_index = i; - } - - cp += prob - kop.prob; - - if (r < cp || i == channel.size() - 1) { - // Sample ith Kraus operator if r < cp - // Sample the highest probability Kraus operator if r is greater - // than the sum of all probablities due to round-off errors. - uint64_t k = r < cp ? 
i : max_prob_index; - - DeferOps(channel[k].ops, gates); - CollectStat(param.collect_kop_stat, k, stat); - - unitary = false; - - break; - } - } - } - - if (apply_last_deferred_ops || !stat.primary) { - if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { - return false; - } - - NormalizeState(!unitary, state_space, unitary, state); - } - - return true; - } - - static State CreateState(unsigned num_qubits, const StateSpace& state_space) { - auto state = state_space.Create(num_qubits); - if (state_space.IsNull(state)) { - IO::errorf("not enough memory: is the number of qubits too large?\n"); - return state_space.Null(); - } - - return state; - } - - static bool ApplyDeferredOps( - const Parameter& param, unsigned num_qubits, const Simulator& simulator, - std::vector& gates, State& state) { - if (gates.size() > 0) { - auto fgates = Fuser::FuseGates(param, num_qubits, gates); - - gates.resize(0); - - if (fgates.size() == 0) { - return false; - } - - for (const auto& fgate : fgates) { - ApplyFusedGate(simulator, fgate, state); - } - } - - return true; - } - - static MeasurementResult ApplyMeasurementGate( - const StateSpace& state_space, const Gate& gate, - RGen& rgen, State& state) { - auto result = state_space.Measure(gate.qubits, rgen, state); - - if (!result.valid) { - IO::errorf("measurement failed.\n"); - } - - return result; - } - - static void DeferOps( - const std::vector& ops, std::vector& gates) { - for (const auto& op : ops) { - gates.push_back(&op); - } - } - - static void CollectStat(bool collect_stat, uint64_t i, Stat& stat) { - if (collect_stat) { - stat.samples.push_back(i); - } - - if (i != 0) { - stat.primary = false; - } - } - - static void NormalizeState(bool normalize, const StateSpace& state_space, - bool& flag, State& state) { - if (normalize) { - double a = 1.0 / std::sqrt(state_space.Norm(state)); - state_space.Multiply(a, state); - flag = true; - } - } -}; - -} // namespace qsim - -#endif // QTRAJECTORY_H_ diff --git 
a/qsim/run_qsim.h b/qsim/run_qsim.h deleted file mode 100644 index 3752915..0000000 --- a/qsim/run_qsim.h +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef RUN_QSIM_H_ -#define RUN_QSIM_H_ - -#include -#include -#include - -#include "gate.h" -#include "gate_appl.h" -#include "util.h" - -namespace qsim { - -/** - * Helper struct for running qsim. - */ -template -struct QSimRunner final { - public: - using Simulator = typename Factory::Simulator; - using StateSpace = typename Simulator::StateSpace; - using State = typename StateSpace::State; - using MeasurementResult = typename StateSpace::MeasurementResult; - - /** - * User-specified parameters for gate fusion and simulation. - */ - struct Parameter : public Fuser::Parameter { - /** - * Random number generator seed to apply measurement gates. - */ - uint64_t seed; - }; - - /** - * Runs the given circuit, only measuring at the end. - * @param param Options for gate fusion, parallelism and logging. - * @param factory Object to create simulators and state spaces. - * @param circuit The circuit to be simulated. - * @param measure Function that performs measurements (in the sense of - * computing expectation values, etc). - * @return True if the simulation completed successfully; false otherwise. 
- */ - template - static bool Run(const Parameter& param, const Factory& factory, - const Circuit& circuit, MeasurementFunc measure) { - return Run(param, factory, {circuit.gates.back().time}, circuit, measure); - } - - /** - * Runs the given circuit, measuring at user-specified times. - * @param param Options for gate fusion, parallelism and logging. - * @param factory Object to create simulators and state spaces. - * @param times_to_measure_at Time steps at which to perform measurements. - * @param circuit The circuit to be simulated. - * @param measure Function that performs measurements (in the sense of - * computing expectation values, etc). - * @return True if the simulation completed successfully; false otherwise. - */ - template - static bool Run(const Parameter& param, const Factory& factory, - const std::vector& times_to_measure_at, - const Circuit& circuit, MeasurementFunc measure) { - double t0 = 0.0; - double t1 = 0.0; - - if (param.verbosity > 1) { - t0 = GetTime(); - } - - RGen rgen(param.seed); - - StateSpace state_space = factory.CreateStateSpace(); - - auto state = state_space.Create(circuit.num_qubits); - if (state_space.IsNull(state)) { - IO::errorf("not enough memory: is the number of qubits too large?\n"); - return false; - } - - state_space.SetStateZero(state); - Simulator simulator = factory.CreateSimulator(); - - if (param.verbosity > 1) { - t1 = GetTime(); - IO::messagef("init time is %g seconds.\n", t1 - t0); - t0 = GetTime(); - } - - auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits, - circuit.gates, times_to_measure_at); - - if (fused_gates.size() == 0 && circuit.gates.size() > 0) { - return false; - } - - if (param.verbosity > 1) { - t1 = GetTime(); - IO::messagef("fuse time is %g seconds.\n", t1 - t0); - } - - if (param.verbosity > 0) { - t0 = GetTime(); - } - - unsigned cur_time_index = 0; - - // Apply fused gates. 
- for (std::size_t i = 0; i < fused_gates.size(); ++i) { - if (param.verbosity > 3) { - t1 = GetTime(); - } - - if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, - state)) { - IO::errorf("measurement failed.\n"); - return false; - } - - if (param.verbosity > 3) { - state_space.DeviceSync(); - double t2 = GetTime(); - IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); - } - - unsigned t = times_to_measure_at[cur_time_index]; - - if (i == fused_gates.size() - 1 || t < fused_gates[i + 1].time) { - // Call back to perform measurements. - measure(cur_time_index, state_space, state); - ++cur_time_index; - } - } - - if (param.verbosity > 0) { - state_space.DeviceSync(); - double t2 = GetTime(); - IO::messagef("time is %g seconds.\n", t2 - t0); - } - - return true; - } - - /** - * Runs the given circuit and make the final state available to the caller, - * recording the result of any intermediate measurements in the circuit. - * @param param Options for gate fusion, parallelism and logging. - * @param factory Object to create simulators and state spaces. - * @param circuit The circuit to be simulated. - * @param state As an input parameter, this should contain the initial state - * of the system. After a successful run, it will be populated with the - * final state of the system. - * @param measure_results As an input parameter, this should be empty. - * After a successful run, this will contain all measurements results from - * the run, ordered by time and qubit index. - * @return True if the simulation completed successfully; false otherwise. 
- */ - template - static bool Run(const Parameter& param, const Factory& factory, - const Circuit& circuit, State& state, - std::vector& measure_results) { - double t0 = 0.0; - double t1 = 0.0; - - if (param.verbosity > 1) { - t0 = GetTime(); - } - - RGen rgen(param.seed); - - StateSpace state_space = factory.CreateStateSpace(); - Simulator simulator = factory.CreateSimulator(); - - if (param.verbosity > 1) { - t1 = GetTime(); - IO::messagef("init time is %g seconds.\n", t1 - t0); - t0 = GetTime(); - } - - auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits, - circuit.gates); - - if (fused_gates.size() == 0 && circuit.gates.size() > 0) { - return false; - } - - measure_results.reserve(fused_gates.size()); - - if (param.verbosity > 1) { - t1 = GetTime(); - IO::messagef("fuse time is %g seconds.\n", t1 - t0); - } - - if (param.verbosity > 0) { - t0 = GetTime(); - } - - // Apply fused gates. - for (std::size_t i = 0; i < fused_gates.size(); ++i) { - if (param.verbosity > 3) { - t1 = GetTime(); - } - - if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, state, - measure_results)) { - IO::errorf("measurement failed.\n"); - return false; - } - - if (param.verbosity > 3) { - state_space.DeviceSync(); - double t2 = GetTime(); - IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); - } - } - - if (param.verbosity > 0) { - state_space.DeviceSync(); - double t2 = GetTime(); - IO::messagef("simu time is %g seconds.\n", t2 - t0); - } - - return true; - } - - /** - * Runs the given circuit and make the final state available to the caller, - * discarding the result of any intermediate measurements in the circuit. - * @param param Options for gate fusion, parallelism and logging. - * @param factory Object to create simulators and state spaces. - * @param circuit The circuit to be simulated. - * @param state As an input parameter, this should contain the initial state - * of the system. 
After a successful run, it will be populated with the - * final state of the system. - * @return True if the simulation completed successfully; false otherwise. - */ - template - static bool Run(const Parameter& param, const Factory& factory, - const Circuit& circuit, State& state) { - std::vector discarded_results; - return Run(param, factory, circuit, state, discarded_results); - } -}; - -} // namespace qsim - -#endif // RUN_QSIM_H_ diff --git a/qsim/run_qsimh.h b/qsim/run_qsimh.h deleted file mode 100644 index c1534d3..0000000 --- a/qsim/run_qsimh.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef RUN_QSIMH_H_ -#define RUN_QSIMH_H_ - -#include -#include - -#include "hybrid.h" -#include "util.h" - -namespace qsim { - -/** - * Helper struct for running qsimh. - */ -template -struct QSimHRunner final { - using Gate = typename HybridSimulator::Gate; - using fp_type = typename HybridSimulator::fp_type; - - using Parameter = typename HybridSimulator::Parameter; - using HybridData = typename HybridSimulator::HybridData; - using Fuser = typename HybridSimulator::Fuser; - - /** - * Evaluates the amplitudes for a given circuit and set of output states. - * @param param Options for gate fusion, parallelism and logging. Also - * specifies the size of the 'prefix' and 'root' sections of the lattice. - * @param factory Object to create simulators and state spaces. 
- * @param circuit The circuit to be simulated. - * @param parts Lattice sections to be simulated. - * @param bitstrings List of output states to simulate, as bitstrings. - * @param results Output vector of amplitudes. After a successful run, this - * will be populated with amplitudes for each state in 'bitstrings'. - * @return True if the simulation completed successfully; false otherwise. - */ - template - static bool Run(const Parameter& param, const Factory& factory, - const Circuit& circuit, const std::vector& parts, - const std::vector& bitstrings, - std::vector>& results) { - if (circuit.num_qubits != parts.size()) { - IO::errorf("parts size is not equal to the number of qubits."); - return false; - } - - double t0 = 0.0; - - if (param.verbosity > 0) { - t0 = GetTime(); - } - - HybridData hd; - bool rc = HybridSimulator::SplitLattice(parts, circuit.gates, hd); - - if (!rc) { - return false; - } - - if (hd.num_gatexs < param.num_prefix_gatexs + param.num_root_gatexs) { - IO::errorf("error: num_prefix_gates (%u) plus num_root gates (%u) is " - "greater than num_gates_on_the_cut (%u).\n", - param.num_prefix_gatexs, param.num_root_gatexs, - hd.num_gatexs); - return false; - } - - if (param.verbosity > 0) { - PrintInfo(param, hd); - } - - auto fgates0 = Fuser::FuseGates(param, hd.num_qubits0, hd.gates0); - if (fgates0.size() == 0 && hd.gates0.size() > 0) { - return false; - } - - auto fgates1 = Fuser::FuseGates(param, hd.num_qubits1, hd.gates1); - if (fgates1.size() == 0 && hd.gates1.size() > 0) { - return false; - } - - rc = HybridSimulator(param.num_threads).Run( - param, factory, hd, parts, fgates0, fgates1, bitstrings, results); - - if (rc && param.verbosity > 0) { - double t1 = GetTime(); - IO::messagef("time elapsed %g seconds.\n", t1 - t0); - } - - return rc; - } - - private: - static void PrintInfo(const Parameter& param, const HybridData& hd) { - unsigned num_suffix_gates = - hd.num_gatexs - param.num_prefix_gatexs - param.num_root_gatexs; - - 
IO::messagef("part 0: %u, part 1: %u\n", hd.num_qubits0, hd.num_qubits1); - IO::messagef("%u gates on the cut\n", hd.num_gatexs); - IO::messagef("breakup: %up+%ur+%us\n", param.num_prefix_gatexs, - param.num_root_gatexs, num_suffix_gates); - } -}; - -} // namespace qsim - -#endif // RUN_QSIM_H_ diff --git a/qsim/seqfor.h b/qsim/seqfor.h deleted file mode 100644 index 3ebf07c..0000000 --- a/qsim/seqfor.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SEQFOR_H_ -#define SEQFOR_H_ - -#include -#include -#include - -namespace qsim { - -/** - * Helper struct for executing for loops in series. - */ -struct SequentialFor { - explicit SequentialFor(unsigned num_threads) {} - - // SequentialFor does not have any state. So all its methods can be static. - - static uint64_t GetIndex0(uint64_t size, unsigned thread_id) { - return 0; - } - - static uint64_t GetIndex1(uint64_t size, unsigned thread_id) { - return size; - } - - template - static void Run(uint64_t size, Function&& func, Args&&... args) { - for (uint64_t i = 0; i < size; ++i) { - func(1, 0, i, args...); - } - } - - template - static std::vector RunReduceP( - uint64_t size, Function&& func, Op&& op, Args&&... 
args) { - typename Op::result_type result = 0; - - for (uint64_t i = 0; i < size; ++i) { - result = op(result, func(1, 0, i, args...)); - } - - return std::vector(1, result); - } - - template - static typename Op::result_type RunReduce(uint64_t size, Function&& func, - Op&& op, Args&&... args) { - return RunReduceP(size, func, std::move(op), args...)[0]; - } -}; - -} // namespace qsim - -#endif // SEQFOR_H_ diff --git a/qsim/simmux.h b/qsim/simmux.h deleted file mode 100644 index d3c4074..0000000 --- a/qsim/simmux.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMMUX_H_ -#define SIMMUX_H_ - -#ifdef __AVX512F__ -# include "simulator_avx512.h" - namespace qsim { - template - using Simulator = SimulatorAVX512; - } -#elif __AVX2__ -# include "simulator_avx.h" - namespace qsim { - template - using Simulator = SimulatorAVX; - } -#elif __SSE4_1__ -# include "simulator_sse.h" - namespace qsim { - template - using Simulator = SimulatorSSE; - } -#else -# include "simulator_basic.h" - namespace qsim { - template - using Simulator = SimulatorBasic; - } -#endif - -#endif // SIMMUX_H_ diff --git a/qsim/simmux_gpu.h b/qsim/simmux_gpu.h deleted file mode 100644 index 1f0bb59..0000000 --- a/qsim/simmux_gpu.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2023 Google LLC. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMMUX_GPU_H_ -#define SIMMUX_GPU_H_ - -#ifdef __CUSTATEVEC__ -# include "simulator_custatevec.h" - namespace qsim { - using SimulatorGpu = SimulatorCuStateVec<>; - } -#else -# include "simulator_cuda.h" - namespace qsim { - using SimulatorGpu = SimulatorCUDA<>; - } -#endif - -#endif // SIMMUX_GPU_H_ diff --git a/qsim/simulator.h b/qsim/simulator.h deleted file mode 100644 index eff5441..0000000 --- a/qsim/simulator.h +++ /dev/null @@ -1,516 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_H_ -#define SIMULATOR_H_ - -#include - -#include "bits.h" - -namespace qsim { - -/** - * Base class for simulator classes. - */ -class SimulatorBase { - protected: - // The follwoing template parameters are used for functions below. - // H - the number of high (target) qubits. - // L - the number of low (target) qubits. 
- // R - SIMD register width in floats. - - // Fills the table of masks (ms) that is used to calculate base state indices - // and the table of offset indices (xss) that is used to access the state - // vector entries in matrix-vector multiplication functions. This function is - // used in simulator_basic.h, simulator_sse.h and simulator_avx.h (no bmi2 - // version). - template - static void FillIndices(unsigned num_qubits, const std::vector& qs, - uint64_t* ms, uint64_t* xss) { - constexpr unsigned hsize = 1 << H; - - if (H == 0) { - ms[0] = uint64_t(-1); - xss[0] = 0; - } else { - uint64_t xs[H + 1]; - - xs[0] = uint64_t{1} << (qs[L] + 1); - ms[0] = (uint64_t{1} << qs[L]) - 1; - for (unsigned i = 1; i < H; ++i) { - xs[i] = uint64_t{1} << (qs[L + i] + 1); - ms[i] = ((uint64_t{1} << qs[L + i]) - 1) ^ (xs[i - 1] - 1); - } - ms[H] = ((uint64_t{1} << num_qubits) - 1) ^ (xs[H - 1] - 1); - - for (unsigned i = 0; i < hsize; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < H; ++k) { - a += xs[k] * ((i >> k) & 1); - } - xss[i] = a; - } - } - } - - // Fills gate matrix entries for gates with low qubits. - template - static void FillMatrix(unsigned qmaskl, const fp_type* matrix, fp_type* w) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - constexpr unsigned rsize = 1 << R; - - unsigned s = 0; - - for (unsigned i = 0; i < hsize; ++i) { - for (unsigned j = 0; j < gsize; ++j) { - unsigned p0 = 2 * i * lsize * gsize + 2 * lsize * (j / lsize); - - for (unsigned k = 0; k < rsize; ++k) { - unsigned l = bits::CompressBits(k, R, qmaskl); - unsigned p = p0 + 2 * (gsize * l + (j + l) % lsize); - - w[s + 0] = matrix[p]; - w[s + rsize] = matrix[p + 1]; - - ++s; - } - - s += rsize; - } - } - } - - // Fills gate matrix entries for controlled gates with high target qubits - // and low control qubits. 
- template - static void FillControlledMatrixH(uint64_t cvalsl, uint64_t cmaskl, - const fp_type* matrix, fp_type* w) { - constexpr unsigned hsize = 1 << H; - constexpr unsigned rsize = 1 << R; - - unsigned s = 0; - - for (unsigned i = 0; i < hsize; ++i) { - for (unsigned j = 0; j < hsize; ++j) { - unsigned p = hsize * i + j; - fp_type v = i == j ? 1 : 0; - - for (unsigned k = 0; k < rsize; ++k) { - w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v; - w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0; - - ++s; - } - - s += rsize; - } - } - } - - // Fills gate matrix entries for controlled gates with low target qubits - // and low control qubits. - template - static void FillControlledMatrixL(uint64_t cvalsl, uint64_t cmaskl, - unsigned qmaskl, const fp_type* matrix, - fp_type* w) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - constexpr unsigned rsize = 1 << R; - - unsigned s = 0; - - for (unsigned i = 0; i < hsize; ++i) { - for (unsigned j = 0; j < gsize; ++j) { - unsigned p0 = i * lsize * gsize + lsize * (j / lsize); - - for (unsigned k = 0; k < rsize; ++k) { - unsigned l = bits::CompressBits(k, R, qmaskl); - unsigned p = p0 + gsize * l + (j + l) % lsize; - - fp_type v = p / gsize == p % gsize ? 1 : 0; - - w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v; - w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0; - - ++s; - } - - s += rsize; - } - } - } - -/* - The GetMasks* functions below provide various masks and related values. - GetMasks1, GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6 are - used in simulator_avx.h (BMI2 version) and in simulator_avx512.h. GetMasks7, - GetMasks8, GetMasks9, GetMasks10 and GetMasks11 are used in simulator_avx.h - (no BMI2 version) and in simulator_sse.h. - - imaskh - inverted mask of high qubits (high control and target qubits). - qmaskh - mask of high qubits (high target qubits). 
- cvalsh - control bit values of high control qubits placed in correct - positions. - cvalsl - control bit values of low control qubits placed in correct positions. - cmaskh - mask of high control qubits. - cmaskl - mask of low control qubits. - qmaskl - mask of low qubits (low target qubits). - cl - the number of low control qubits. - - Note that imaskh, qmaskh and cvalsh are multiplied by two in GetMasks1, - GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6. -*/ - - struct Masks1 { - uint64_t imaskh; - uint64_t qmaskh; - }; - - template - static Masks1 GetMasks1(const std::vector& qs) { - uint64_t qmaskh = 0; - - for (unsigned i = 0; i < H; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh}; - } - - struct Masks2 { - uint64_t imaskh; - uint64_t qmaskh; - unsigned qmaskl; - }; - - template - static Masks2 GetMasks2(const std::vector& qs) { - uint64_t qmaskh = 0; - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - for (unsigned i = L; i < H + L; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh, qmaskl}; - } - - struct Masks3 { - uint64_t imaskh; - uint64_t qmaskh; - uint64_t cvalsh; - }; - - template - static Masks3 GetMasks3(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - uint64_t qmaskh = 0; - uint64_t cmaskh = 0; - - for (unsigned i = 0; i < H; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - for (auto q : cqs) { - cmaskh |= uint64_t{1} << q; - } - - uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); - - uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); - - return {2 * maskh, 2 * qmaskh, 2 * cvalsh}; - } - - struct Masks4 { - uint64_t imaskh; - uint64_t qmaskh; - uint64_t cvalsh; - uint64_t cvalsl; - uint64_t cmaskl; - unsigned cl; - }; - - template - static Masks4 GetMasks4(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t 
cvals) { - unsigned cl = 0; - uint64_t qmaskh = 0; - uint64_t cmaskh = 0; - uint64_t cmaskl = 0; - - for (unsigned i = 0; i < H; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - for (auto q : cqs) { - if (q >= R) { - cmaskh |= uint64_t{1} << q; - } else { - ++cl; - cmaskl |= uint64_t{1} << q; - } - } - - uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); - uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); - - uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); - - return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, cl}; - } - - struct Masks5 { - uint64_t imaskh; - uint64_t qmaskh; - uint64_t cvalsh; - unsigned qmaskl; - }; - - template - static Masks5 GetMasks5(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - uint64_t qmaskh = 0; - uint64_t cmaskh = 0; - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - for (unsigned i = L; i < H + L; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - for (auto q : cqs) { - cmaskh |= uint64_t{1} << q; - } - - uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); - - uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); - - return {2 * maskh, 2 * qmaskh, 2 * cvalsh, qmaskl}; - } - - struct Masks6 { - uint64_t imaskh; - uint64_t qmaskh; - uint64_t cvalsh; - uint64_t cvalsl; - uint64_t cmaskl; - unsigned qmaskl; - unsigned cl; - }; - - template - static Masks6 GetMasks6(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - unsigned cl = 0; - uint64_t qmaskh = 0; - uint64_t cmaskh = 0; - uint64_t cmaskl = 0; - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - for (unsigned i = L; i < H + L; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - for (auto q : cqs) { - if (q >= R) { - cmaskh |= uint64_t{1} << q; - } else { - ++cl; - cmaskl |= uint64_t{1} << q; - } - } - - uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, 
cmaskh); - uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); - - uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); - - return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, qmaskl, cl}; - } - - struct Masks7 { - uint64_t cvalsh; - uint64_t cmaskh; - }; - - static Masks7 GetMasks7(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - uint64_t cmaskh = 0; - - for (auto q : cqs) { - cmaskh |= uint64_t{1} << q; - } - - uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); - - return {cvalsh, cmaskh}; - } - - struct Masks8 { - uint64_t cvalsh; - uint64_t cmaskh; - uint64_t cvalsl; - uint64_t cmaskl; - }; - - template - static Masks8 GetMasks8(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - unsigned cl = 0; - uint64_t cmaskh = 0; - uint64_t cmaskl = 0; - - for (auto q : cqs) { - if (q >= R) { - cmaskh |= uint64_t{1} << q; - } else { - ++cl; - cmaskl |= uint64_t{1} << q; - } - } - - uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); - uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); - - return {cvalsh, cmaskh, cvalsl, cmaskl}; - } - - struct Masks9 { - uint64_t cvalsh; - uint64_t cmaskh; - unsigned qmaskl; - }; - - template - static Masks9 GetMasks9(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - uint64_t cmaskh = 0; - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - for (auto q : cqs) { - cmaskh |= uint64_t{1} << q; - } - - uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); - - return {cvalsh, cmaskh, qmaskl}; - } - - struct Masks10 { - uint64_t cvalsh; - uint64_t cmaskh; - uint64_t cvalsl; - uint64_t cmaskl; - unsigned qmaskl; - }; - - template - static Masks10 GetMasks10(unsigned num_qubits, - const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - unsigned cl = 0; - uint64_t cmaskh = 0; - uint64_t cmaskl 
= 0; - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - for (auto q : cqs) { - if (q >= R) { - cmaskh |= uint64_t{1} << q; - } else { - ++cl; - cmaskl |= uint64_t{1} << q; - } - } - - uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); - uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); - - return {cvalsh, cmaskh, cvalsl, cmaskl, qmaskl}; - } - - struct Masks11 { - unsigned qmaskl; - }; - - template - static Masks11 GetMasks11(const std::vector& qs) { - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - return {qmaskl}; - } - - template - static unsigned MaskedAdd( - unsigned a, unsigned b, unsigned mask, unsigned lsize) { - unsigned c = bits::CompressBits(a, R, mask); - return bits::ExpandBits((c + b) % lsize, R, mask); - } -}; - -template <> -inline void SimulatorBase::FillIndices<0, 1>(unsigned num_qubits, - const std::vector& qs, - uint64_t* ms, uint64_t* xss) { - ms[0] = -1; - xss[0] = 0; -} - -template <> -inline void SimulatorBase::FillIndices<0, 2>(unsigned num_qubits, - const std::vector& qs, - uint64_t* ms, uint64_t* xss) { - ms[0] = -1; - xss[0] = 0; -} - -template <> -inline void SimulatorBase::FillIndices<0, 3>(unsigned num_qubits, - const std::vector& qs, - uint64_t* ms, uint64_t* xss) { - ms[0] = -1; - xss[0] = 0; -} - -} // namespace qsim - -#endif // SIMULATOR_H_ diff --git a/qsim/simulator_avx.h b/qsim/simulator_avx.h deleted file mode 100644 index 9742849..0000000 --- a/qsim/simulator_avx.h +++ /dev/null @@ -1,1363 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_AVX_H_ -#define SIMULATOR_AVX_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "statespace_avx.h" - -namespace qsim { - -/** - * Quantum circuit simulator with AVX vectorization. - */ -template -class SimulatorAVX final : public SimulatorBase { - public: - using StateSpace = StateSpaceAVX; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - template - explicit SimulatorAVX(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using AVX instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 0: - ApplyGateH<0>(qs, matrix, state); - break; - case 1: - if (qs[0] > 2) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 2) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 2) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<2, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<1, 2>(qs, matrix, state); - } else { - ApplyGateL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 2) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<3, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<2, 2>(qs, matrix, state); - } else { - ApplyGateL<1, 3>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 2) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<4, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<3, 2>(qs, matrix, state); - } else { - ApplyGateL<2, 3>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 2) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<5, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<4, 2>(qs, matrix, state); - } else { - ApplyGateL<3, 3>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using AVX instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. 
- */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - // Assume cqs[0] < cqs[1] < cqs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 0: - if (cqs[0] > 2) { - ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state); - } - break; - case 1: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { 
- if (cqs[0] > 2) { - ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 4: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Computes the expectation value of an operator using AVX instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 1: - if (qs[0] > 2) { - return ExpectationValueH<1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 2) { - return ExpectationValueH<2>(qs, matrix, state); - } else if (qs[1] > 2) { - return ExpectationValueL<1, 1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 2) { - return ExpectationValueH<3>(qs, matrix, state); - } else if (qs[1] > 2) { - return ExpectationValueL<2, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - return ExpectationValueL<1, 2>(qs, matrix, state); - } else { - return ExpectationValueL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 2) { - return ExpectationValueH<4>(qs, matrix, state); - } else if (qs[1] > 2) { - return ExpectationValueL<3, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - return ExpectationValueL<2, 2>(qs, matrix, state); - } else { - return ExpectationValueL<1, 3>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 2) { - return ExpectationValueH<5>(qs, matrix, state); - } else if (qs[1] > 2) { - return ExpectationValueL<4, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - return ExpectationValueL<3, 2>(qs, matrix, state); - } else { - return ExpectationValueL<2, 3>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 2) { - return ExpectationValueH<6>(qs, matrix, state); - } else if (qs[1] > 2) { - return ExpectationValueL<5, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - return ExpectationValueL<4, 2>(qs, matrix, state); - } else { - return ExpectationValueL<3, 3>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - - return 0; - } - - /** - * @return The size of SIMD register if applicable. 
- */ - static unsigned SIMDRegisterSize() { - return 8; - } - - private: -#ifdef __BMI2__ - - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - auto m = GetMasks1(qs); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm256_load_ps(p0 + p); - is[k2] = _mm256_load_ps(p0 + p + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks2(qs); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - - unsigned k = 3 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 rn, in; - __m256 rs[hsize], is[hsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256 w[1 << (1 + 2 * H)]; - - auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - const __m256i* idx, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm256_load_ps(p0 + p); - is[k2] = _mm256_load_ps(p0 + p + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - if (CH) { - auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H + cqs.size(); - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); - } else { - auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); - } - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - auto m = GetMasks1(qs); - 
- unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return - for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get()); - } - - template - std::complex ExpectationValueL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, - const fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm256_load_ps(p0 + p); - is[k2] = _mm256_load_ps(p0 + p + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - unsigned m = lsize * k; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks2(qs); - 
FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return - for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get()); - } - -#else // __BMI2__ - - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, const __m256i* idx, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - rs[k2] = _mm256_load_ps(p0 + xss[k]); - is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 rn, in; - __m256 rs[hsize], is[hsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256 w[1 << (1 + 2 * H)]; - - auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, const __m256i* idx, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm256_load_ps(p0 + xss[k]); - is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - if (CH) { - auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get()); - } else { - auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get()); - } - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in)); - - re += detail::HorizontalSumAVX(v_re); - im += 
detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); - } - - template - std::complex ExpectationValueL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, const __m256i* idx, - const fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm256_load_ps(p0 + xss[k]); - is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - unsigned m = lsize * k; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - 
re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, state.get()); - } - -#endif // __BMI2__ - - template - static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) { - constexpr unsigned lsize = 1 << L; - - for (unsigned i = 0; i < lsize - 1; ++i) { - unsigned p[8]; - - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - } - - For for_; -}; - -} // namespace qsim - -#endif // SIMULATOR_AVX_H_ diff --git a/qsim/simulator_avx512.h b/qsim/simulator_avx512.h deleted file mode 100644 index 21a2e9d..0000000 --- a/qsim/simulator_avx512.h +++ /dev/null @@ -1,846 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef SIMULATOR_AVX512_H_ -#define SIMULATOR_AVX512_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "statespace_avx512.h" - -namespace qsim { - -/** - * Quantum circuit simulator with AVX512 vectorization. - */ -template -class SimulatorAVX512 final : public SimulatorBase { - public: - using StateSpace = StateSpaceAVX512; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - template - explicit SimulatorAVX512(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using AVX512 instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 0: - ApplyGateH<0>(qs, matrix, state); - break; - case 1: - if (qs[0] > 3) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 3) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 3) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<2, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<1, 2>(qs, matrix, state); - } else { - ApplyGateL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 3) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<3, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<2, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<1, 3>(qs, matrix, state); - } else { - ApplyGateL<0, 4>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 3) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 
3) { - ApplyGateL<4, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<3, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<2, 3>(qs, matrix, state); - } else { - ApplyGateL<1, 4>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 3) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<5, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<4, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<3, 3>(qs, matrix, state); - } else { - ApplyGateL<2, 4>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using AVX512 instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - // Assume cqs[0] < cqs[1] < cqs[2] < ... . 
- - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 0: - if (cqs[0] > 3) { - ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state); - } - break; - case 1: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 4: - if (qs[0] > 3) { - if (cqs[0] > 3) { - 
ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[3] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Computes the expectation value of an operator using AVX512 instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 1: - if (qs[0] > 3) { - return ExpectationValueH<1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 3) { - return ExpectationValueH<2>(qs, matrix, state); - } else if (qs[1] > 3) { - return ExpectationValueL<1, 1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 3) { - return ExpectationValueH<3>(qs, matrix, state); - } else if (qs[1] > 3) { - return ExpectationValueL<2, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - return ExpectationValueL<1, 2>(qs, matrix, state); - } else { - return ExpectationValueL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 3) { - return ExpectationValueH<4>(qs, matrix, state); - } else if (qs[1] > 3) { - return ExpectationValueL<3, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - return ExpectationValueL<2, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - return ExpectationValueL<1, 3>(qs, matrix, state); - } else { - return ExpectationValueL<0, 4>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 3) { - return ExpectationValueH<5>(qs, matrix, state); - } else if (qs[1] > 3) { - return ExpectationValueL<4, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - return ExpectationValueL<3, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - return ExpectationValueL<2, 3>(qs, matrix, state); - } else { - return ExpectationValueL<1, 4>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 3) { - return ExpectationValueH<6>(qs, matrix, state); - } else if (qs[1] > 3) { - return ExpectationValueL<5, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - return ExpectationValueL<4, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - return ExpectationValueL<3, 3>(qs, matrix, state); - } else { - return ExpectationValueL<2, 4>(qs, matrix, state); - } - break; - default: - // Not implemented. 
- break; - } - - return 0; - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 16; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 ru, iu, rn, in; - __m512 rs[hsize], is[hsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[l], ru, rn); - in = _mm512_fmadd_ps(rs[l], iu, in); - rn = _mm512_fnmadd_ps(is[l], iu, rn); - in = _mm512_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - auto m = GetMasks1(qs); - - unsigned k = 4 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m512 rn, in; - __m512 rs[gsize], is[gsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm512_load_ps(p0 + p); - is[k2] = _mm512_load_ps(p0 + p + 16); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); - is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512i idx[1 << L]; - __m512 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks2(qs); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 4 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 ru, iu, rn, in; - __m512 rs[hsize], is[hsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[l], ru, rn); - in = _mm512_fmadd_ps(rs[l], iu, in); - rn = _mm512_fnmadd_ps(is[l], iu, rn); - in = _mm512_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - - unsigned k = 4 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 rn, in; - __m512 rs[hsize], is[hsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512 w[1 << (1 + 2 * H)]; - - auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned r = 4 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - const __m512i* idx, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m512 rn, in; - __m512 rs[gsize], is[gsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm512_load_ps(p0 + p); - is[k2] = _mm512_load_ps(p0 + p + 16); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); - is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512i idx[1 << L]; - __m512 w[1 << (1 + 2 * H + L)]; - - if (CH) { - auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 4 + H + cqs.size(); - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); - } else { - auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 4 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); - } - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 ru, iu, rn, in; - __m512 rs[hsize], is[hsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[l], ru, rn); - in = _mm512_fmadd_ps(rs[l], iu, in); - rn = _mm512_fnmadd_ps(is[l], iu, rn); - in = _mm512_fmadd_ps(is[l], ru, in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[k], in, _mm512_mul_ps(rs[k], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[k], rn, _mm512_mul_ps(rs[k], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - auto m = 
GetMasks1(qs); - - unsigned k = 4 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return - for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get()); - } - - template - std::complex ExpectationValueL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, - const fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m512 rn, in; - __m512 rs[gsize], is[gsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm512_load_ps(p0 + p); - is[k2] = _mm512_load_ps(p0 + p + 16); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); - is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); - } - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - unsigned m = lsize * k; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - __m512i idx[1 << L]; - __m512 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks2(qs); - 
FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 4 + H; - unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return - for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get()); - } - - template - static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) { - constexpr unsigned lsize = 1 << L; - - for (unsigned i = 0; i < lsize; ++i) { - unsigned p[16]; - - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - } - - For for_; -}; - -} // namespace qsim - -#endif // SIMULATOR_AVX512_H_ diff --git a/qsim/simulator_basic.h b/qsim/simulator_basic.h deleted file mode 100644 index 752eeb5..0000000 --- a/qsim/simulator_basic.h +++ /dev/null @@ -1,349 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_BASIC_H_ -#define SIMULATOR_BASIC_H_ - -#include -#include -#include -#include - -#include "simulator.h" -#include "statespace_basic.h" - -namespace qsim { - -/** - * Quantum circuit simulator without vectorization. 
- */ -template -class SimulatorBasic final : public SimulatorBase { - public: - using StateSpace = StateSpaceBasic; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - template - explicit SimulatorBasic(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using non-vectorized instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 0: - ApplyGateH<0>(qs, matrix, state); - break; - case 1: - ApplyGateH<1>(qs, matrix, state); - break; - case 2: - ApplyGateH<2>(qs, matrix, state); - break; - case 3: - ApplyGateH<3>(qs, matrix, state); - break; - case 4: - ApplyGateH<4>(qs, matrix, state); - break; - case 5: - ApplyGateH<5>(qs, matrix, state); - break; - case 6: - ApplyGateH<6>(qs, matrix, state); - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using non-vectorized instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 0: - ApplyControlledGateH<0>(qs, cqs, cvals, matrix, state); - break; - case 1: - ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state); - break; - case 2: - ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state); - break; - case 3: - ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state); - break; - case 4: - ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state); - break; - default: - // Not implemented. - break; - } - } - - /** - * Computes the expectation value of an operator using non-vectorized - * instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 1: - return ExpectationValueH<1>(qs, matrix, state); - break; - case 2: - return ExpectationValueH<2>(qs, matrix, state); - break; - case 3: - return ExpectationValueH<3>(qs, matrix, state); - break; - case 4: - return ExpectationValueH<4>(qs, matrix, state); - break; - case 5: - return ExpectationValueH<5>(qs, matrix, state); - break; - case 6: - return ExpectationValueH<6>(qs, matrix, state); - break; - default: - // Not implemented. - break; - } - - return 0; - } - - /** - * @return The size of SIMD register if applicable. 
- */ - static unsigned SIMDRegisterSize() { - return 1; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - fp_type rn, in; - fp_type rs[hsize], is[hsize]; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 1); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn += rs[l] * v[j] - is[l] * v[j + 1]; - in += rs[l] * v[j + 1] + is[l] * v[j]; - - j += 2; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 1) = in; - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, state.get()); - } - - template - void ApplyControlledGateH(const std::vector& qs, - const std::vector& cqs, - uint64_t cvals, const fp_type* matrix, - State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t cvalsh, uint64_t cmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - fp_type rn, in; - fp_type rs[hsize], is[hsize]; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) == cvalsh) { - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 1); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn += rs[l] * v[j] - is[l] * v[j + 1]; - in += rs[l] * v[j + 1] + is[l] * v[j]; - - j += 2; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 1) = in; - } - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - - unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - fp_type rn, in; - fp_type rs[hsize], is[hsize]; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 1); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn += rs[l] * v[j] - is[l] * v[j + 1]; - in += rs[l] * v[j + 1] + is[l] * v[j]; - - j += 2; - } - - re += rs[k] * rn + is[k] * in; - im += rs[k] * in - is[k] * rn; - } - - return std::complex{re, im}; - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); - } - - For for_; -}; - -} // namespace qsim - -#endif // SIMULATOR_BASIC_H_ diff --git a/qsim/simulator_cuda.h b/qsim/simulator_cuda.h deleted file mode 100644 index 5743bea..0000000 --- a/qsim/simulator_cuda.h +++ /dev/null @@ -1,923 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_CUDA_H_ -#define SIMULATOR_CUDA_H_ - -#include "simulator_cuda_kernels.h" - -#include -#include -#include -#include -#include - -#include "bits.h" -#include "statespace_cuda.h" - -namespace qsim { - -/** - * Quantum circuit simulator with GPU vectorization. - */ -template -class SimulatorCUDA final { - private: - using idx_type = uint64_t; - using Complex = qsim::Complex; - - // The maximum buffer size for indices and gate matrices. - // The maximum gate matrix size (for 6-qubit gates) is - // 2 * 2^6 * 2^6 * sizeof(FP) = 8192 * sizeof(FP). The maximum index size is - // 128 * sizeof(idx_type) + 96 * sizeof(unsigned). - static constexpr unsigned max_buf_size = 8192 * sizeof(FP) - + 128 * sizeof(idx_type) + 96 * sizeof(unsigned); - - public: - using StateSpace = StateSpaceCUDA; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - SimulatorCUDA() : scratch_(nullptr), scratch_size_(0) { - ErrorCheck(cudaMalloc(&d_ws, max_buf_size)); - } - - ~SimulatorCUDA() { - ErrorCheck(cudaFree(d_ws)); - - if (scratch_ != nullptr) { - ErrorCheck(cudaFree(scratch_)); - } - } - - /** - * Applies a gate using CUDA instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - if (qs.size() == 0) { - ApplyGateH<0>(qs, matrix, state); - } else if (qs[0] > 4) { - switch (qs.size()) { - case 1: - ApplyGateH<1>(qs, matrix, state); - break; - case 2: - ApplyGateH<2>(qs, matrix, state); - break; - case 3: - ApplyGateH<3>(qs, matrix, state); - break; - case 4: - ApplyGateH<4>(qs, matrix, state); - break; - case 5: - ApplyGateH<5>(qs, matrix, state); - break; - case 6: - ApplyGateH<6>(qs, matrix, state); - break; - default: - // Not implemented. - break; - } - } else { - switch (qs.size()) { - case 1: - ApplyGateL<1>(qs, matrix, state); - break; - case 2: - ApplyGateL<2>(qs, matrix, state); - break; - case 3: - ApplyGateL<3>(qs, matrix, state); - break; - case 4: - ApplyGateL<4>(qs, matrix, state); - break; - case 5: - ApplyGateL<5>(qs, matrix, state); - break; - case 6: - ApplyGateL<6>(qs, matrix, state); - break; - default: - // Not implemented. - break; - } - } - } - - /** - * Applies a controlled gate using CUDA instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - if (cqs[0] < 5) { - switch (qs.size()) { - case 0: - ApplyControlledGateL<0>(qs, cqs, cvals, matrix, state); - break; - case 1: - ApplyControlledGateL<1>(qs, cqs, cvals, matrix, state); - break; - case 2: - ApplyControlledGateL<2>(qs, cqs, cvals, matrix, state); - break; - case 3: - ApplyControlledGateL<3>(qs, cqs, cvals, matrix, state); - break; - case 4: - ApplyControlledGateL<4>(qs, cqs, cvals, matrix, state); - break; - default: - // Not implemented. 
- break; - } - } else { - if (qs.size() == 0) { - ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); - } else if (qs[0] > 4) { - switch (qs.size()) { - case 1: - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - break; - case 2: - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - break; - case 3: - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - break; - case 4: - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - break; - default: - // Not implemented. - break; - } - } else { - switch (qs.size()) { - case 1: - ApplyControlledGateLH<1>(qs, cqs, cvals, matrix, state); - break; - case 2: - ApplyControlledGateLH<2>(qs, cqs, cvals, matrix, state); - break; - case 3: - ApplyControlledGateLH<3>(qs, cqs, cvals, matrix, state); - break; - case 4: - ApplyControlledGateLH<4>(qs, cqs, cvals, matrix, state); - break; - default: - // Not implemented. - break; - } - } - } - } - - /** - * Computes the expectation value of an operator using CUDA instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - if (qs[0] > 4) { - switch (qs.size()) { - case 1: - return ExpectationValueH<1>(qs, matrix, state); - case 2: - return ExpectationValueH<2>(qs, matrix, state); - case 3: - return ExpectationValueH<3>(qs, matrix, state); - case 4: - return ExpectationValueH<4>(qs, matrix, state); - case 5: - return ExpectationValueH<5>(qs, matrix, state); - case 6: - return ExpectationValueH<6>(qs, matrix, state); - default: - // Not implemented. 
- break; - } - } else { - switch (qs.size()) { - case 1: - return ExpectationValueL<1>(qs, matrix, state); - case 2: - return ExpectationValueL<2>(qs, matrix, state); - case 3: - return ExpectationValueL<3>(qs, matrix, state); - case 4: - return ExpectationValueL<4>(qs, matrix, state); - case 5: - return ExpectationValueL<5>(qs, matrix, state); - case 6: - return ExpectationValueL<6>(qs, matrix, state); - default: - // Not implemented. - break; - } - } - - return 0; - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 32; - } - - private: - // The following indices are used in kernels. - // xss - indices to access the state vector entries in global memory. - // ms - masks to access the state vector entries in global memory. - // tis - indices to access the state vector entries in shared memory - // in the presence of low gate qubits. - // qis - indices to access the state vector entries in shared memory - // in the presence of low gate qubits. - // cis - additional indices to access the state vector entries in global - // memory in the presence of low control qubits. 
- - template - struct IndicesH { - static constexpr unsigned gsize = 1 << G; - static constexpr unsigned matrix_size = 2 * gsize * gsize * sizeof(fp_type); - static constexpr unsigned xss_size = 32 * sizeof(idx_type) * (1 + (G == 6)); - static constexpr unsigned ms_size = 32 * sizeof(idx_type); - static constexpr unsigned xss_offs = matrix_size; - static constexpr unsigned ms_offs = xss_offs + xss_size; - static constexpr unsigned buf_size = ms_offs + ms_size; - - IndicesH(char* p) - : xss((idx_type*) (p + xss_offs)), ms((idx_type*) (p + ms_offs)) {} - - idx_type* xss; - idx_type* ms; - }; - - template - struct IndicesL : public IndicesH { - using Base = IndicesH; - static constexpr unsigned qis_size = 32 * sizeof(unsigned) * (1 + (G == 6)); - static constexpr unsigned tis_size = 32 * sizeof(unsigned); - static constexpr unsigned qis_offs = Base::buf_size; - static constexpr unsigned tis_offs = qis_offs + qis_size; - static constexpr unsigned buf_size = tis_offs + tis_size; - - IndicesL(char* p) - : Base(p), qis((unsigned*) (p + qis_offs)), - tis((unsigned*) (p + tis_offs)) {} - - unsigned* qis; - unsigned* tis; - }; - - template - struct IndicesLC : public IndicesL { - using Base = IndicesL; - static constexpr unsigned cis_size = 32 * sizeof(idx_type); - static constexpr unsigned cis_offs = Base::buf_size; - static constexpr unsigned buf_size = cis_offs + cis_size; - - IndicesLC(char* p) : Base(p), cis((idx_type*) (p + cis_offs)) {} - - idx_type* cis; - }; - - struct DataC { - idx_type cvalsh; - unsigned num_aqs; - unsigned num_effective_qs; - unsigned remaining_low_cqs; - }; - - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesH h_i(h_ws); - GetIndicesH(num_qubits, qs, qs.size(), h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + G; - unsigned n 
= num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; - unsigned threads = 64U; - unsigned blocks = std::max(1U, size / 2); - - IndicesH d_i(d_ws); - - ApplyGateH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesL h_i(h_ws); - auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + num_effective_qs; - unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; - unsigned threads = 32; - unsigned blocks = size; - - IndicesL d_i(d_ws); - - ApplyGateL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, - 1 << num_effective_qs, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, idx_type cvals, - const fp_type* matrix, State& state) const { - unsigned aqs[64]; - idx_type cmaskh = 0; - unsigned num_qubits = state.num_qubits(); - - IndicesH h_i(h_ws); - - unsigned num_aqs = GetHighQubits(qs, 0, cqs, 0, 0, cmaskh, aqs); - GetMs(num_qubits, aqs, num_aqs, h_i.ms); - GetXss(num_qubits, qs, qs.size(), h_i.xss); - - idx_type cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + G + cqs.size(); - unsigned n = num_qubits > k ? 
num_qubits - k : 0; - unsigned size = unsigned{1} << n; - unsigned threads = 64U; - unsigned blocks = std::max(1U, size / 2); - - IndicesH d_i(d_ws); - - ApplyControlledGateH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); - } - - template - void ApplyControlledGateLH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesL h_i(h_ws); - auto d = GetIndicesLC(num_qubits, qs, cqs, cvals, h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + G + cqs.size(); - unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; - unsigned threads = 32; - unsigned blocks = size; - - IndicesL d_i(d_ws); - - ApplyControlledGateLH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, - d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesLC h_i(h_ws); - auto d = GetIndicesLCL(num_qubits, qs, cqs, cvals, h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + G + cqs.size(); - unsigned n = num_qubits > k ? 
num_qubits - k : 0; - unsigned size = unsigned{1} << n; - unsigned threads = 32; - unsigned blocks = size; - - IndicesLC d_i(d_ws); - - ApplyControlledGateL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, - d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, - 1 << (5 - d.remaining_low_cqs), state.get()); - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesH h_i(h_ws); - GetIndicesH(num_qubits, qs, qs.size(), h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + G; - unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; - - unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); - unsigned threads = 64U; - unsigned blocks = std::max(1U, (size / 2) >> s); - unsigned num_iterations_per_block = 1 << s; - - constexpr unsigned m = 16; - - Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); - Complex* d_res2 = d_res1 + blocks; - - IndicesH d_i(d_ws); - - ExpectationValueH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, - state.get(), Plus(), d_res1); - - double mul = size == 1 ? 0.5 : 1.0; - - return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); - } - - template - std::complex ExpectationValueL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesL h_i(h_ws); - auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + num_effective_qs; - unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; - - unsigned s = std::min(n >= 13 ? 
n - 13 : 0, 5U); - unsigned threads = 32; - unsigned blocks = size >> s; - unsigned num_iterations_per_block = 1 << s; - - constexpr unsigned m = 16; - - Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); - Complex* d_res2 = d_res1 + blocks; - - IndicesL d_i(d_ws); - - ExpectationValueL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, - num_iterations_per_block, state.get(), Plus(), d_res1); - - double mul = double(1 << (5 + num_effective_qs - G)) / 32; - - return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); - } - - template - std::complex ExpectationValueReduceFinal( - unsigned blocks, double mul, - const Complex* d_res1, Complex* d_res2) const { - Complex res2[m]; - - if (blocks <= 16) { - ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex), - cudaMemcpyDeviceToHost)); - } else { - unsigned threads2 = std::min(1024U, blocks); - unsigned blocks2 = std::min(m, blocks / threads2); - - unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2)); - unsigned bytes = threads2 * sizeof(Complex); - - Reduce2Kernel<<>>( - dblocks, blocks, Plus(), Plus(), d_res1, d_res2); - - ErrorCheck(cudaMemcpy(res2, d_res2, blocks2 * sizeof(Complex), - cudaMemcpyDeviceToHost)); - - blocks = blocks2; - } - - double re = 0; - double im = 0; - - for (unsigned i = 0; i < blocks; ++i) { - re += res2[i].re; - im += res2[i].im; - } - - return {mul * re, mul * im}; - } - - template - unsigned GetHighQubits(const std::vector& qs, unsigned qi, - const std::vector& cqs, unsigned ci, - unsigned ai, idx_type& cmaskh, AQ& aqs) const { - while (1) { - if (qi < qs.size() && (ci == cqs.size() || qs[qi] < cqs[ci])) { - aqs[ai++] = qs[qi++]; - } else if (ci < cqs.size()) { - cmaskh |= idx_type{1} << cqs[ci]; - aqs[ai++] = cqs[ci++]; - } else { - break; - } - } - - return ai; - } - - template - void GetMs(unsigned num_qubits, const QS& qs, unsigned qs_size, - idx_type* ms) const { - if (qs_size == 0) { - ms[0] = idx_type(-1); - } else { - 
idx_type xs = idx_type{1} << (qs[0] + 1); - ms[0] = (idx_type{1} << qs[0]) - 1; - for (unsigned i = 1; i < qs_size; ++i) { - ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs - 1); - xs = idx_type{1} << (qs[i] + 1); - } - ms[qs_size] = ((idx_type{1} << num_qubits) - 1) ^ (xs - 1); - } - } - - template - void GetXss(unsigned num_qubits, const QS& qs, unsigned qs_size, - idx_type* xss) const { - if (qs_size == 0) { - xss[0] = 0; - } else { - unsigned g = qs_size; - unsigned gsize = 1 << qs_size; - - idx_type xs[64]; - - xs[0] = idx_type{1} << (qs[0] + 1); - for (unsigned i = 1; i < g; ++i) { - xs[i] = idx_type{1} << (qs[i] + 1); - } - - for (unsigned i = 0; i < gsize; ++i) { - idx_type a = 0; - for (unsigned k = 0; k < g; ++k) { - a += xs[k] * ((i >> k) & 1); - } - xss[i] = a; - } - } - } - - template - void GetIndicesH(unsigned num_qubits, const qs_type& qs, unsigned qs_size, - IndicesH& indices) const { - if (qs_size == 0) { - indices.ms[0] = idx_type(-1); - indices.xss[0] = 0; - } else { - unsigned g = qs_size; - unsigned gsize = 1 << qs_size; - - idx_type xs[64]; - - xs[0] = idx_type{1} << (qs[0] + 1); - indices.ms[0] = (idx_type{1} << qs[0]) - 1; - for (unsigned i = 1; i < g; ++i) { - xs[i] = idx_type{1} << (qs[i] + 1); - indices.ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs[i - 1] - 1); - } - indices.ms[g] = ((idx_type{1} << num_qubits) - 1) ^ (xs[g - 1] - 1); - - for (unsigned i = 0; i < gsize; ++i) { - idx_type a = 0; - for (unsigned k = 0; k < g; ++k) { - a += xs[k] * ((i >> k) & 1); - } - indices.xss[i] = a; - } - } - } - - template - void GetIndicesL(unsigned num_effective_qs, unsigned qmask, - IndicesL& indices) const { - for (unsigned i = num_effective_qs + 1; i < (G + 1); ++i) { - indices.ms[i] = 0; - } - - for (unsigned i = (1 << num_effective_qs); i < indices.gsize; ++i) { - indices.xss[i] = 0; - } - - for (unsigned i = 0; i < indices.gsize; ++i) { - indices.qis[i] = bits::ExpandBits(i, 5 + num_effective_qs, qmask); - } - - unsigned tmask = ((1 << (5 + 
num_effective_qs)) - 1) ^ qmask; - for (unsigned i = 0; i < 32; ++i) { - indices.tis[i] = bits::ExpandBits(i, 5 + num_effective_qs, tmask); - } - } - - template - unsigned GetIndicesL(unsigned num_qubits, const std::vector& qs, - IndicesL& indices) const { - unsigned eqs[32]; - - unsigned qmaskh = 0; - unsigned qmaskl = 0; - - unsigned qi = 0; - - while (qi < qs.size() && qs[qi] < 5) { - qmaskl |= 1 << qs[qi++]; - } - - unsigned nq = std::max(5U, num_qubits); - unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - - unsigned l = 0; - unsigned ei = 0; - unsigned num_low_qs = qi; - - if (qs.size() == num_low_qs) { - while (ei < num_effective_qs && l++ < num_low_qs) { - eqs[ei] = ei + 5; - ++ei; - } - } else { - while (ei < num_effective_qs && l < num_low_qs) { - unsigned ei5 = ei + 5; - eqs[ei] = ei5; - if (qi < qs.size() && qs[qi] == ei5) { - ++qi; - qmaskh |= 1 << ei5; - } else { - ++l; - } - ++ei; - } - - while (ei < num_effective_qs) { - eqs[ei] = qs[qi++]; - qmaskh |= 1 << (ei + 5); - ++ei; - } - } - - GetIndicesH(num_qubits, eqs, num_effective_qs, indices); - GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - - return num_effective_qs; - } - - template - DataC GetIndicesLC(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - IndicesL& indices) const { - unsigned aqs[64]; - unsigned eqs[32]; - - unsigned qmaskh = 0; - unsigned qmaskl = 0; - idx_type cmaskh = 0; - - unsigned qi = 0; - - while (qi < qs.size() && qs[qi] < 5) { - qmaskl |= 1 << qs[qi++]; - } - - unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); - unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - - unsigned l = 0; - unsigned ai = 5; - unsigned ci = 0; - unsigned ei = 0; - unsigned num_low_qs = qi; - - while (ai < num_qubits && l < num_low_qs) { - aqs[ai - 5] = ai; - if (qi < qs.size() && qs[qi] == ai) { - ++qi; - eqs[ei++] = ai; - qmaskh |= 1 << (ai - ci); - } else if (ci < cqs.size() && cqs[ci] == ai) { - ++ci; 
- cmaskh |= idx_type{1} << ai; - } else { - ++l; - eqs[ei++] = ai; - } - ++ai; - } - - unsigned i = ai; - unsigned j = qi; - - while (ei < num_effective_qs) { - eqs[ei++] = qs[j++]; - qmaskh |= 1 << (i++ - ci); - } - - unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); - GetMs(num_qubits, aqs, num_aqs, indices.ms); - GetXss(num_qubits, eqs, num_effective_qs, indices.xss); - GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - - idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); - - return {cvalsh, num_aqs, num_effective_qs}; - } - - template - DataC GetIndicesLCL(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - IndicesLC& indices) const { - unsigned aqs[64]; - unsigned eqs[32]; - - unsigned qmaskh = 0; - unsigned qmaskl = 0; - idx_type cmaskh = 0; - idx_type cmaskl = 0; - idx_type cis_mask = 0; - - unsigned qi = 0; - unsigned ci = 0; - - for (unsigned k = 0; k < 5; ++k) { - if (qi < qs.size() && qs[qi] == k) { - qmaskl |= 1 << (k - ci); - ++qi; - } else if (ci < cqs.size() && cqs[ci] == k) { - cmaskl |= idx_type{1} << k; - ++ci; - } - } - - unsigned num_low_qs = qi; - unsigned num_low_cqs = ci; - - unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); - unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - - unsigned l = 0; - unsigned ai = 5; - unsigned ei = 0; - unsigned num_low = num_low_qs + num_low_cqs; - unsigned remaining_low_cqs = num_low_cqs; - unsigned effective_low_qs = num_low_qs; - unsigned highest_cis_bit = 0; - - while (ai < num_qubits && l < num_low) { - aqs[ai - 5] = ai; - if (qi < qs.size() && qs[qi] == ai) { - ++qi; - if ((ai - ci) > 4) { - eqs[ei++] = ai; - qmaskh |= 1 << (ai - ci); - } else { - highest_cis_bit = ai; - cis_mask |= idx_type{1} << ai; - qmaskl |= 1 << (ai - ci); - --remaining_low_cqs; - ++effective_low_qs; - } - } else if (ci < cqs.size() && cqs[ci] == ai) { - ++ci; - cmaskh |= idx_type{1} << ai; - } else { - ++l; - if 
(remaining_low_cqs == 0) { - eqs[ei++] = ai; - } else { - highest_cis_bit = ai; - cis_mask |= idx_type{1} << ai; - --remaining_low_cqs; - } - } - ++ai; - } - - unsigned i = ai; - unsigned j = effective_low_qs; - - while (ei < num_effective_qs) { - eqs[ei++] = qs[j++]; - qmaskh |= 1 << (i++ - ci); - } - - unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); - GetMs(num_qubits, aqs, num_aqs, indices.ms); - GetXss(num_qubits, eqs, num_effective_qs, indices.xss); - GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - - idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); - idx_type cvalsl = bits::ExpandBits(idx_type(cvals), 5, cmaskl); - - cis_mask |= 31 ^ cmaskl; - highest_cis_bit = highest_cis_bit < 5 ? 5 : highest_cis_bit; - for (idx_type i = 0; i < 32; ++i) { - auto c = bits::ExpandBits(i, highest_cis_bit + 1, cis_mask); - indices.cis[i] = 2 * (c & 0xffffffe0) | (c & 0x1f) | cvalsl; - } - - return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs}; - } - - - void* AllocScratch(uint64_t size) const { - if (size > scratch_size_) { - if (scratch_ != nullptr) { - ErrorCheck(cudaFree(scratch_)); - } - - ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); - - const_cast(scratch_size_) = size; - } - - return scratch_; - } - - char* d_ws; - char h_ws0[max_buf_size]; - char* h_ws = (char*) h_ws0; - - void* scratch_; - uint64_t scratch_size_; -}; - -} // namespace qsim - -#endif // SIMULATOR_CUDA_H_ diff --git a/qsim/simulator_cuda_kernels.h b/qsim/simulator_cuda_kernels.h deleted file mode 100644 index e21a9d6..0000000 --- a/qsim/simulator_cuda_kernels.h +++ /dev/null @@ -1,683 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_CUDA_KERNELS_H_ -#define SIMULATOR_CUDA_KERNELS_H_ - -#ifdef __NVCC__ - #include - #include - - #include "util_cuda.h" -#elif __HIP__ - #include - #include "cuda2hip.h" -#endif - -namespace qsim { - -template -__global__ void ApplyGateH_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, - const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { - // blockDim.x must be equal to 64. - - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned rows = - G < 4 ? gsize : (sizeof(fp_type) == 4 ? - (G < 6 ? gsize : 32) : (G < 5 ? 
8 : 16)); - - fp_type rs[gsize], is[gsize]; - - __shared__ idx_type xss[64]; - __shared__ fp_type v[2 * gsize * rows]; - - if (threadIdx.x < gsize) { - xss[threadIdx.x] = xss0[threadIdx.x]; - } - - if (G <= 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - __syncthreads(); - - idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j <= G; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - auto p0 = rstate + 2 * ii + threadIdx.x % 32; - - for (unsigned k = 0; k < gsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 32); - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0) { - __syncthreads(); - - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - - __syncthreads(); - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 32) = in; - } - } -} - -template -__global__ void ApplyGateL_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, - const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, - const unsigned* __restrict__ tis, unsigned esize, - fp_type* __restrict__ rstate) { - // blockDim.x must be equal to 32. - - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned - rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? - (G < 5 ? gsize : 8) : (G < 6 ? 
8 : 4)); - - fp_type rs[gsize], is[gsize]; - - __shared__ fp_type v[2 * gsize * rows]; - __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; - - if (G < 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - idx_type i = 32 * idx_type{blockIdx.x}; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j <= G; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - auto p0 = rstate + 2 * ii + threadIdx.x; - - for (unsigned k = 0; k < gsize; ++k) { - rs0[threadIdx.x][k] = *(p0 + xss[k]); - is0[threadIdx.x][k] = *(p0 + xss[k] + 32); - } - - for (unsigned k = 0; k < gsize; ++k) { - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs[k] = rs0[m][n]; - is[k] = is0[m][n]; - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0) { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs0[m][n] = rn; - is0[m][n] = in; - } - } - - for (unsigned k = 0; k < esize; ++k) { - *(p0 + xss[k]) = rs0[threadIdx.x][k]; - *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; - } -} - -template -__global__ void ApplyControlledGateH_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, - const idx_type* __restrict__ mss, unsigned num_mss, idx_type cvalsh, - fp_type* __restrict__ rstate) { - // blockDim.x must be equal to 64. 
- - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned rows = - G < 4 ? gsize : (sizeof(fp_type) == 4 ? - (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16)); - - fp_type rs[gsize], is[gsize]; - - __shared__ idx_type xss[64]; - __shared__ fp_type v[2 * gsize * rows]; - - if (threadIdx.x < gsize) { - xss[threadIdx.x] = xss0[threadIdx.x]; - } - - if (G <= 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - __syncthreads(); - - idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j < num_mss; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - ii |= cvalsh; - - auto p0 = rstate + 2 * ii + threadIdx.x % 32; - - for (unsigned k = 0; k < gsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 32); - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0) { - __syncthreads(); - - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - - __syncthreads(); - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 32) = in; - } - } -} - -template -__global__ void ApplyControlledGateLH_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, - const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, - const unsigned* __restrict__ tis, unsigned num_mss, idx_type cvalsh, - unsigned esize, fp_type* __restrict__ rstate) { - // blockDim.x must be equal to 32. 
- - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned - rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? - (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4)); - - fp_type rs[gsize], is[gsize]; - - __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; - __shared__ fp_type v[2 * gsize * rows]; - - idx_type i = 32 * idx_type{blockIdx.x}; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j < num_mss; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - ii |= cvalsh; - - auto p0 = rstate + 2 * ii + threadIdx.x; - - for (unsigned k = 0; k < gsize; ++k) { - rs0[threadIdx.x][k] = *(p0 + xss[k]); - is0[threadIdx.x][k] = *(p0 + xss[k] + 32); - } - - if (G < 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - for (unsigned k = 0; k < gsize; ++k) { - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs[k] = rs0[m][n]; - is[k] = is0[m][n]; - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0) { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs0[m][n] = rn; - is0[m][n] = in; - } - } - - for (unsigned k = 0; k < esize; ++k) { - *(p0 + xss[k]) = rs0[threadIdx.x][k]; - *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; - } -} - -template -__global__ void ApplyControlledGateL_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, - 
const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, - const unsigned* __restrict__ tis, const idx_type* __restrict__ cis, - unsigned num_mss, idx_type cvalsh, unsigned esize, unsigned rwthreads, - fp_type* __restrict__ rstate) { - // blockDim.x must be equal to 32. - - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned - rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? - (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4)); - - fp_type rs[gsize], is[gsize]; - - __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; - __shared__ fp_type v[2 * gsize * rows]; - - idx_type i = 32 * idx_type{blockIdx.x}; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j < num_mss; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - ii |= cvalsh; - - auto p0 = rstate + 2 * ii + cis[threadIdx.x]; - - if (threadIdx.x < rwthreads) { - for (unsigned k = 0; k < gsize; ++k) { - rs0[threadIdx.x][k] = *(p0 + xss[k]); - is0[threadIdx.x][k] = *(p0 + xss[k] + 32); - } - } - - if (G < 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - for (unsigned k = 0; k < gsize; ++k) { - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs[k] = rs0[m][n]; - is[k] = is0[m][n]; - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0) { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - 
unsigned n = i / 32; - - rs0[m][n] = rn; - is0[m][n] = in; - } - } - - if (threadIdx.x < rwthreads) { - for (unsigned k = 0; k < esize; ++k) { - *(p0 + xss[k]) = rs0[threadIdx.x][k]; - *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; - } - } -} - -template -__global__ void ExpectationValueH_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, - const idx_type* __restrict__ mss, unsigned num_iterations_per_block, - const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { - // blockDim.x must be equal to 64. - - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned rows = - G < 5 ? gsize : (sizeof(fp_type) == 4 ? (G < 6 ? 4 : 8) : 8); - - fp_type rs[gsize], is[gsize]; - - __shared__ idx_type xss[64]; - __shared__ fp_type v[2 * gsize * rows]; - - if (threadIdx.x < gsize) { - xss[threadIdx.x] = xss0[threadIdx.x]; - } - - if (G <= 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - __syncthreads(); - - double re = 0; - double im = 0; - - for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter; - - idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j <= G; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - auto p0 = rstate + 2 * ii + threadIdx.x % 32; - - for (unsigned k = 0; k < gsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 32); - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0 || iter > 0) { - __syncthreads(); - - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - - __syncthreads(); - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); 
++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - re += rs[k] * rn; - re += is[k] * in; - im += rs[k] * in; - im -= is[k] * rn; - } - } - } - - __shared__ cfp_type partial1[64]; - __shared__ cfp_type partial2[2]; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (threadIdx.x % 32 == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - if (threadIdx.x == 0) { - result[blockIdx.x].re = partial2[0].re + partial2[1].re; - result[blockIdx.x].im = partial2[0].im + partial2[1].im; - } -} - -template -__global__ void ExpectationValueL_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, - const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, - const unsigned* __restrict__ tis, unsigned num_iterations_per_block, - const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { - // blockDim.x must be equal to 32. - - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned rows = G < 5 ? gsize : (sizeof(fp_type) == 4 ? - (G < 6 ? 4 : 2) : (G < 6 ? 
2 : 1)); - - fp_type rs[gsize], is[gsize]; - - __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; - __shared__ fp_type v[2 * gsize * rows]; - - if (G < 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - double re = 0; - double im = 0; - - for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter); - idx_type ii = i & mss[0]; - for (unsigned j = 1; j <= G; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - auto p0 = rstate + 2 * ii + threadIdx.x; - - for (unsigned k = 0; k < gsize; ++k) { - rs0[threadIdx.x][k] = *(p0 + xss[k]); - is0[threadIdx.x][k] = *(p0 + xss[k] + 32); - } - - for (unsigned k = 0; k < gsize; ++k) { - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs[k] = rs0[m][n]; - is[k] = is0[m][n]; - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0 || iter > 0) { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - re += rs[k] * rn; - re += is[k] * in; - im += rs[k] * in; - im -= is[k] * rn; - } - } - } - - __shared__ cfp_type partial[32]; - - partial[threadIdx.x].re = re; - partial[threadIdx.x].im = im; - - auto val = WarpReduce(partial[threadIdx.x], op); - - if (threadIdx.x == 0) { - result[blockIdx.x].re = val.re; - result[blockIdx.x].im = val.im; - } -} - -} // namespace qsim - -#endif // SIMULATOR_CUDA_KERNELS_H_ diff --git a/qsim/simulator_custatevec.h b/qsim/simulator_custatevec.h deleted 
file mode 100644 index 40d1902..0000000 --- a/qsim/simulator_custatevec.h +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_CUSTATEVEC_H_ -#define SIMULATOR_CUSTATEVEC_H_ - -#include -#include -#include - -#include -#include -#include - -#include "io.h" -#include "statespace_custatevec.h" -#include "util_custatevec.h" - -namespace qsim { - -/** - * Quantum circuit simulator using the NVIDIA cuStateVec library. - */ -template -class SimulatorCuStateVec final { - public: - using StateSpace = StateSpaceCuStateVec; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - static constexpr auto kStateType = StateSpace::kStateType; - static constexpr auto kMatrixType = StateSpace::kMatrixType; - static constexpr auto kExpectType = StateSpace::kExpectType; - static constexpr auto kComputeType = StateSpace::kComputeType; - static constexpr auto kMatrixLayout = StateSpace::kMatrixLayout; - - explicit SimulatorCuStateVec(const cublasHandle_t& cublas_handle, - const custatevecHandle_t& custatevec_handle) - : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle), - workspace_(nullptr), workspace_size_(0) {} - - ~SimulatorCuStateVec() { - ErrorCheck(cudaFree(workspace_)); - } - - /** - * Applies a gate using the NVIDIA cuStateVec library. - * @param qs Indices of the qubits affected by this gate. 
- * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - if (qs.size() == 0) { - uint64_t size = uint64_t{1} << state.num_qubits(); - - if (StateSpace::is_float) { - cuComplex a = {matrix[0], matrix[1]}; - auto p = (cuComplex*) state.get(); - ErrorCheck(cublasCscal(cublas_handle_, size, &a, p, 1)); - } else { - cuDoubleComplex a = {matrix[0], matrix[1]}; - auto p = (cuDoubleComplex*) state.get(); - ErrorCheck(cublasZscal(cublas_handle_, size, &a, p, 1)); - } - } else { - auto workspace_size = ApplyGateWorkSpaceSize( - state.num_qubits(), qs.size(), 0, matrix); - AllocWorkSpace(workspace_size); - - ErrorCheck(custatevecApplyMatrix( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0, - (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0, - kComputeType, workspace_, workspace_size)); - } - } - - /** - * Applies a controlled gate using the NVIDIA cuStateVec library. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. 
- */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, - const fp_type* matrix, State& state) const { - if (qs.size() == 0) { - IO::errorf( - "error: controlled global phase gate is not implemented %s %d\n", - __FILE__, __LINE__); - exit(1); - } else { - std::vector control_bits; - control_bits.reserve(cqs.size()); - - for (std::size_t i = 0; i < cqs.size(); ++i) { - control_bits.push_back((cmask >> i) & 1); - } - - auto workspace_size = ApplyGateWorkSpaceSize( - state.num_qubits(), qs.size(), cqs.size(), matrix); - AllocWorkSpace(workspace_size); - - ErrorCheck(custatevecApplyMatrix( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0, - (int32_t*) qs.data(), qs.size(), - (int32_t*) cqs.data(), control_bits.data(), cqs.size(), - kComputeType, workspace_, workspace_size)); - } - } - - /** - * Computes the expectation value of an operator using the NVIDIA cuStateVec - * library. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto workspace_size = ExpectationValueWorkSpaceSize( - state.num_qubits(), qs.size(), matrix); - AllocWorkSpace(workspace_size); - - cuDoubleComplex eval; - - ErrorCheck(custatevecComputeExpectation( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), &eval, kExpectType, nullptr, matrix, - kMatrixType, kMatrixLayout, (int32_t*) qs.data(), qs.size(), - kComputeType, workspace_, workspace_size)); - - return {cuCreal(eval), cuCimag(eval)}; - } - - /** - * @return The size of SIMD register if applicable. 
- */ - static unsigned SIMDRegisterSize() { - return 32; - } - - private: - size_t ApplyGateWorkSpaceSize( - unsigned num_qubits, unsigned num_targets, unsigned num_controls, - const fp_type* matrix) const { - size_t size; - - ErrorCheck(custatevecApplyMatrixGetWorkspaceSize( - custatevec_handle_, kStateType, num_qubits, matrix, - kMatrixType, kMatrixLayout, 0, num_targets, num_controls, - kComputeType, &size)); - - return size; - } - - size_t ExpectationValueWorkSpaceSize( - unsigned num_qubits, unsigned num_targets, const fp_type* matrix) const { - size_t size; - - ErrorCheck(custatevecComputeExpectationGetWorkspaceSize( - custatevec_handle_, kStateType, num_qubits, matrix, - kMatrixType, kMatrixLayout, num_targets, kComputeType, - &size)); - - return size; - } - - void* AllocWorkSpace(size_t size) const { - if (size > workspace_size_) { - if (workspace_ != nullptr) { - ErrorCheck(cudaFree(workspace_)); - } - - ErrorCheck(cudaMalloc(const_cast(&workspace_), size)); - - const_cast(workspace_size_) = size; - } - - return workspace_; - } - - const cublasHandle_t cublas_handle_; - const custatevecHandle_t custatevec_handle_; - - void* workspace_; - size_t workspace_size_; -}; - -} // namespace qsim - -#endif // SIMULATOR_CUSTATEVEC_H_ diff --git a/qsim/simulator_sse.h b/qsim/simulator_sse.h deleted file mode 100644 index 5256c53..0000000 --- a/qsim/simulator_sse.h +++ /dev/null @@ -1,864 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_SSE_H_ -#define SIMULATOR_SSE_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "statespace_sse.h" - -namespace qsim { - -/** - * Quantum circuit simulator with SSE vectorization. - */ -template -class SimulatorSSE final : public SimulatorBase { - public: - using StateSpace = StateSpaceSSE; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - template - explicit SimulatorSSE(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using SSE instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 0: - ApplyGateH<0>(qs, matrix, state); - break; - case 1: - if (qs[0] > 1) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 1) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 1) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<2, 1>(qs, matrix, state); - } else { - ApplyGateL<1, 2>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 1) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<3, 1>(qs, matrix, state); - } else { - ApplyGateL<2, 2>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 1) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<4, 1>(qs, matrix, state); - } else { - ApplyGateL<3, 2>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 1) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<5, 1>(qs, matrix, state); - } else { - ApplyGateL<4, 2>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using SSE instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - // Assume cqs[0] < cqs[1] < cqs[2] < ... . 
- - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 0: - if (cqs[0] > 1) { - ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state); - } - break; - case 1: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 4: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - 
ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Computes the expectation value of an operator using SSE instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 1: - if (qs[0] > 1) { - return ExpectationValueH<1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 1) { - return ExpectationValueH<2>(qs, matrix, state); - } else if (qs[1] > 1) { - return ExpectationValueL<1, 1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 1) { - return ExpectationValueH<3>(qs, matrix, state); - } else if (qs[1] > 1) { - return ExpectationValueL<2, 1>(qs, matrix, state); - } else { - return ExpectationValueL<1, 2>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 1) { - return ExpectationValueH<4>(qs, matrix, state); - } else if (qs[1] > 1) { - return ExpectationValueL<3, 1>(qs, matrix, state); - } else { - return ExpectationValueL<2, 2>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 1) { - return ExpectationValueH<5>(qs, matrix, state); - } else if (qs[1] > 1) { - return ExpectationValueL<4, 1>(qs, matrix, state); - } else { - return ExpectationValueL<3, 2>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 1) { - return ExpectationValueH<6>(qs, matrix, state); 
- } else if (qs[1] > 1) { - return ExpectationValueL<5, 1>(qs, matrix, state); - } else { - return ExpectationValueL<4, 2>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - - return 0; - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 4; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 ru, iu, rn, in; - __m128 rs[hsize], is[hsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m128 rn, in; - __m128 rs[gsize], is[gsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm_load_ps(p0 + xss[k]); - is[k2] = _mm_load_ps(p0 + xss[k] + 4); - - if (L == 1) { - rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) - : _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177) - : _mm_shuffle_ps(is[k2], is[k2], 78); - } else if (L == 2) { - rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); - is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); - rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); - rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); - is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - 
uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, qs[0], state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 ru, iu, rn, in; - __m128 rs[hsize], is[hsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 rn, in; - __m128 rs[hsize], is[hsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H)]; - - auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned r = 2 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, unsigned q0, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m128 rn, in; - __m128 rs[gsize], is[gsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm_load_ps(p0 + xss[k]); - is[k2] = _mm_load_ps(p0 + xss[k] + 4); - - if (L == 1) { - rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) - : _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) - : _mm_shuffle_ps(is[k2], is[k2], 78); - } else if (L == 2) { - rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); - is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); - rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); - rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); - is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H + L)]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned r = 2 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - if (CH) { - auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get()); - } else { - auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get()); - } - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 ru, iu, rn, in; - __m128 rs[hsize], is[hsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); - - j += 2; - } - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[k], rn), _mm_mul_ps(is[k], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[k], in), _mm_mul_ps(is[k], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - 
}; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); - } - - template - std::complex ExpectationValueL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, unsigned q0, - const fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m128 rn, in; - __m128 rs[gsize], is[gsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm_load_ps(p0 + xss[k]); - is[k2] = _mm_load_ps(p0 + xss[k] + 4); - - if (L == 1) { - rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) - : _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) - : _mm_shuffle_ps(is[k2], is[k2], 78); - } else if (L == 2) { - rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); - is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); - rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); - rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); - is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); - } - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - unsigned m = lsize * k; - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], state.get()); - } - - For for_; -}; - -} // namespace qsim - -#endif // SIMULATOR_SSE_H_ diff --git a/qsim/statespace.h b/qsim/statespace.h deleted file mode 100644 index 2b0c9af..0000000 --- a/qsim/statespace.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_H_ -#define STATESPACE_H_ - -#include -#include -#include -#include - -#include "util.h" - -namespace qsim { - -/** - * Abstract class containing context and routines for general state-vector - * manipulations. "AVX", "AVX512", "Basic", and "SSE" implementations are - * provided. - */ -template class VectorSpace, typename... VSTypeParams> -class StateSpace : public VectorSpace { - private: - using Base = VectorSpace; - - public: - using fp_type = typename Base::fp_type; - using State = typename Base::Vector; - - /** - * The observed state from a Measurement gate. - */ - struct MeasurementResult { - /** - * A bitmask of all qubits measured in this result. In this format, if the - * qubit at index `i` is measured, the `i`th bit of `mask` is a one. - */ - uint64_t mask; - /** - * A bitwise representation of the measured states. In this format, the - * qubit at index `i` is represented by the `i`th bit of `bits`. - * If `valid` is true, `mask` has already been applied to this field - * (i.e. `bits == bits & mask`). - */ - uint64_t bits; - /** - * Observed states of the measured qubits. This vector only includes qubits - * specified by the associated Measurement gate. - */ - std::vector bitstring; - /** - * Validation bit. If this is false, the measurement failed and all other - * fields of the result are invalid. - */ - bool valid; - }; - - template - StateSpace(Args&&... 
args) : Base(args...) {} - - double Norm(const State& state) const { - auto partial_norms = static_cast(*this).PartialNorms(state); - - double norm = partial_norms[0]; - for (std::size_t i = 1; i < partial_norms.size(); ++i) { - norm += partial_norms[i]; - } - - return norm; - } - - template - MeasurementResult Measure(const std::vector& qubits, - RGen& rgen, State& state) const { - auto result = - static_cast(*this).VirtualMeasure(qubits, rgen, state); - - if (result.valid) { - static_cast(*this).Collapse(result, state); - } - - return result; - } - - template - MeasurementResult VirtualMeasure(const std::vector& qubits, - RGen& rgen, const State& state) const { - MeasurementResult result; - - result.valid = true; - result.mask = 0; - - for (auto q : qubits) { - if (q >= state.num_qubits()) { - result.valid = false; - return result; - } - - result.mask |= uint64_t{1} << q; - } - - auto partial_norms = static_cast(*this).PartialNorms(state); - - for (std::size_t i = 1; i < partial_norms.size(); ++i) { - partial_norms[i] += partial_norms[i - 1]; - } - - auto norm = partial_norms.back(); - auto r = RandomValue(rgen, norm); - - unsigned m = 0; - while (r > partial_norms[m]) ++m; - if (m > 0) { - r -= partial_norms[m - 1]; - } - - result.bits = static_cast(*this).FindMeasuredBits( - m, r, result.mask, state); - - result.bitstring.reserve(qubits.size()); - result.bitstring.resize(0); - - for (auto q : qubits) { - result.bitstring.push_back((result.bits >> q) & 1); - } - - return result; - } -}; - -} // namespace qsim - -#endif // STATESPACE_H_ diff --git a/qsim/statespace_avx.h b/qsim/statespace_avx.h deleted file mode 100644 index 876058b..0000000 --- a/qsim/statespace_avx.h +++ /dev/null @@ -1,497 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_AVX_H_ -#define STATESPACE_AVX_H_ - -#include - -#include -#include -#include -#include -#include - -#include "statespace.h" -#include "util.h" -#include "vectorspace.h" - -namespace qsim { - -namespace detail { - -inline __m256i GetZeroMaskAVX(uint64_t i, uint64_t mask, uint64_t bits) { - __m256i s1 = _mm256_setr_epi64x(i + 0, i + 2, i + 4, i + 6); - __m256i s2 = _mm256_setr_epi64x(i + 1, i + 3, i + 5, i + 7); - __m256i ma = _mm256_set1_epi64x(mask); - __m256i bi = _mm256_set1_epi64x(bits); - - s1 = _mm256_and_si256(s1, ma); - s2 = _mm256_and_si256(s2, ma); - - s1 = _mm256_cmpeq_epi64(s1, bi); - s2 = _mm256_cmpeq_epi64(s2, bi); - - return _mm256_blend_epi32(s1, s2, 170); // 10101010 -} - -inline double HorizontalSumAVX(__m256 s) { - __m128 l = _mm256_castps256_ps128(s); - __m128 h = _mm256_extractf128_ps(s, 1); - __m128 s1 = _mm_add_ps(h, l); - __m128 s1s = _mm_movehdup_ps(s1); - __m128 s2 = _mm_add_ps(s1, s1s); - - return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2))); -} - -} // namespace detail - -/** - * Object containing context and routines for AVX state-vector manipulations. - * State is a vectorized sequence of eight real components followed by eight - * imaginary components. Eight single-precison floating numbers can be loaded - * into an AVX register. 
- */ -template -class StateSpaceAVX : - public StateSpace, VectorSpace, For, float> { - private: - using Base = StateSpace, qsim::VectorSpace, For, float>; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - template - explicit StateSpaceAVX(ForArgs&&... args) : Base(args...) {} - - static uint64_t MinSize(unsigned num_qubits) { - return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits)); - }; - - void InternalToNormalOrder(State& state) const { - if (state.num_qubits() == 1) { - fp_type* s = state.get(); - - s[2] = s[1]; - s[1] = s[8]; - s[3] = s[9]; - - for (uint64_t i = 4; i < 16; ++i) { - s[i] = 0; - } - } else if (state.num_qubits() == 2) { - fp_type* s = state.get(); - - s[6] = s[3]; - s[4] = s[2]; - s[2] = s[1]; - s[1] = s[8]; - s[3] = s[9]; - s[5] = s[10]; - s[7] = s[11]; - - for (uint64_t i = 8; i < 16; ++i) { - s[i] = 0; - } - } else { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - fp_type* s = p + 16 * i; - - fp_type re[7]; - fp_type im[7]; - - for (uint64_t i = 0; i < 7; ++i) { - re[i] = s[i + 1]; - im[i] = s[i + 8]; - } - - for (uint64_t i = 0; i < 7; ++i) { - s[2 * i + 1] = im[i]; - s[2 * i + 2] = re[i]; - } - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get()); - } - } - - void NormalToInternalOrder(State& state) const { - if (state.num_qubits() == 1) { - fp_type* s = state.get(); - - s[8] = s[1]; - s[1] = s[2]; - s[9] = s[3]; - - for (uint64_t i = 2; i < 8; ++i) { - s[i] = 0; - s[i + 8] = 0; - } - } else if (state.num_qubits() == 2) { - fp_type* s = state.get(); - - s[8] = s[1]; - s[9] = s[3]; - s[10] = s[5]; - s[11] = s[7]; - s[1] = s[2]; - s[2] = s[4]; - s[3] = s[6]; - - for (uint64_t i = 4; i < 8; ++i) { - s[i] = 0; - s[i + 8] = 0; - } - } else { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - fp_type* s = p + 16 * i; - - fp_type re[7]; - fp_type im[7]; - - for (uint64_t i = 0; i < 7; ++i) { - im[i] = s[2 * i + 1]; - re[i] = s[2 * i + 2]; - } 
- - for (uint64_t i = 0; i < 7; ++i) { - s[i + 1] = re[i]; - s[i + 8] = im[i]; - } - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get()); - } - } - - void SetAllZeros(State& state) const { - __m256 val0 = _mm256_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) { - _mm256_store_ps(p + 16 * i, val); - _mm256_store_ps(p + 16 * i + 8, val); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get()); - } - - // Uniform superposition. - void SetStateUniform(State& state) const { - __m256 val0 = _mm256_setzero_ps(); - __m256 valu; - - fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); - - switch (state.num_qubits()) { - case 1: - valu = _mm256_set_ps(0, 0, 0, 0, 0, 0, v, v); - break; - case 2: - valu = _mm256_set_ps(0, 0, 0, 0, v, v, v, v); - break; - default: - valu = _mm256_set1_ps(v); - break; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - __m256& val0, __m256 valu, fp_type* p) { - _mm256_store_ps(p + 16 * i, valu); - _mm256_store_ps(p + 16 * i + 8, val0); - }; - - Base::for_.Run( - MinSize(state.num_qubits()) / 16, f, val0, valu, state.get()); - } - - // |0> state. - void SetStateZero(State& state) const { - SetAllZeros(state); - state.get()[0] = 1; - } - - static std::complex GetAmpl(const State& state, uint64_t i) { - uint64_t k = (16 * (i / 8)) + (i % 8); - return std::complex(state.get()[k], state.get()[k + 8]); - } - - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - uint64_t k = (16 * (i / 8)) + (i % 8); - state.get()[k] = std::real(ampl); - state.get()[k + 8] = std::imag(ampl); - } - - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - uint64_t k = (16 * (i / 8)) + (i % 8); - state.get()[k] = re; - state.get()[k + 8] = im; - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. 
- void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - __m256 re_reg = _mm256_set1_ps(re); - __m256 im_reg = _mm256_set1_ps(im); - - __m256i exclude_reg = _mm256_setzero_si256(); - if (exclude) { - exclude_reg = _mm256_cmpeq_epi32(exclude_reg, exclude_reg); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, - uint64_t bitsv, __m256 re_n, __m256 im_n, __m256i exclude_n, - fp_type* p) { - __m256 ml = _mm256_castsi256_ps(_mm256_xor_si256( - detail::GetZeroMaskAVX(8 * i, maskv, bitsv), exclude_n)); - - __m256 re = _mm256_load_ps(p + 16 * i); - __m256 im = _mm256_load_ps(p + 16 * i + 8); - - re = _mm256_blendv_ps(re, re_n, ml); - im = _mm256_blendv_ps(im, im_n, ml); - - _mm256_store_ps(p + 16 * i, re); - _mm256_store_ps(p + 16 * i + 8, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, mask, bits, re_reg, - im_reg, exclude_reg, state.get()); - } - - // Does the equivalent of dest += src elementwise. 
- bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, fp_type* p2) { - __m256 re1 = _mm256_load_ps(p1 + 16 * i); - __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); - __m256 re2 = _mm256_load_ps(p2 + 16 * i); - __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); - - _mm256_store_ps(p2 + 16 * i, _mm256_add_ps(re1, re2)); - _mm256_store_ps(p2 + 16 * i + 8, _mm256_add_ps(im1, im2)); - }; - - Base::for_.Run(MinSize(src.num_qubits()) / 16, f, src.get(), dest.get()); - - return true; - } - - // Does the equivalent of state *= a elementwise. - void Multiply(fp_type a, State& state) const { - __m256 r = _mm256_set1_ps(a); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m256 r, fp_type* p) { - __m256 re = _mm256_load_ps(p + 16 * i); - __m256 im = _mm256_load_ps(p + 16 * i + 8); - - re = _mm256_mul_ps(re, r); - im = _mm256_mul_ps(im, r); - - _mm256_store_ps(p + 16 * i, re); - _mm256_store_ps(p + 16 * i + 8, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, r, state.get()); - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> std::complex { - __m256 re1 = _mm256_load_ps(p1 + 16 * i); - __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); - __m256 re2 = _mm256_load_ps(p2 + 16 * i); - __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); - - __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2)); - __m256 ip_im = _mm256_fnmadd_ps(im1, re2, _mm256_mul_ps(re1, im2)); - - double re = detail::HorizontalSumAVX(ip_re); - double im = detail::HorizontalSumAVX(ip_im); - - return std::complex{re, im}; - }; - - using Op = std::plus>; - return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f, - Op(), state1.get(), state2.get()); - } - 
- double RealInnerProduct(const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> double { - __m256 re1 = _mm256_load_ps(p1 + 16 * i); - __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); - __m256 re2 = _mm256_load_ps(p2 + 16 * i); - __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); - - __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2)); - - return detail::HorizontalSumAVX(ip_re); - }; - - using Op = std::plus; - return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f, - Op(), state1.get(), state2.get()); - } - - template - std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - double norm = 0; - uint64_t size = MinSize(state.num_qubits()) / 16; - const fp_type* p = state.get(); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 8; ++j) { - double re = p[16 * k + j]; - double im = p[16 * k + 8 + j]; - norm += re * re + im * im; - } - } - - auto rs = GenerateRandomValues(num_samples, seed, norm); - - uint64_t m = 0; - double csum = 0; - bitstrings.reserve(num_samples); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 8; ++j) { - double re = p[16 * k + j]; - double im = p[16 * k + 8 + j]; - csum += re * re + im * im; - while (rs[m] < csum && m < num_samples) { - bitstrings.emplace_back(8 * k + j); - ++m; - } - } - } - - for (; m < num_samples; ++m) { - bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); - } - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - void Collapse(const MeasurementResult& mr, State& state) const { - auto f1 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, const fp_type* p) -> double { - __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits); - - __m256 re = 
_mm256_maskload_ps(p + 16 * i, ml); - __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml); - __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re)); - - return detail::HorizontalSumAVX(s1); - }; - - using Op = std::plus; - double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 16, f1, - Op(), mr.mask, mr.bits, state.get()); - - __m256 renorm = _mm256_set1_ps(1.0 / std::sqrt(norm)); - - auto f2 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, __m256 renorm, fp_type* p) { - __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits); - - __m256 re = _mm256_maskload_ps(p + 16 * i, ml); - __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml); - - re = _mm256_mul_ps(re, renorm); - im = _mm256_mul_ps(im, renorm); - - _mm256_store_ps(p + 16 * i, re); - _mm256_store_ps(p + 16 * i + 8, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f2, - mr.mask, mr.bits, renorm, state.get()); - } - - std::vector PartialNorms(const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p) -> double { - __m256 re = _mm256_load_ps(p + 16 * i); - __m256 im = _mm256_load_ps(p + 16 * i + 8); - __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re)); - - return detail::HorizontalSumAVX(s1); - }; - - using Op = std::plus; - return Base::for_.RunReduceP( - MinSize(state.num_qubits()) / 16, f, Op(), state.get()); - } - - uint64_t FindMeasuredBits( - unsigned m, double r, uint64_t mask, const State& state) const { - double csum = 0; - - uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 16, m); - uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 16, m); - - const fp_type* p = state.get(); - - for (uint64_t k = k0; k < k1; ++k) { - for (uint64_t j = 0; j < 8; ++j) { - auto re = p[16 * k + j]; - auto im = p[16 * k + j + 8]; - csum += re * re + im * im; - if (r < csum) { - return (8 * k + j) & mask; - } - } - } - - // Return the last bitstring in the unlikely case of underflow. 
- return (8 * k1 - 1) & mask; - } -}; - -} // namespace qsim - -#endif // STATESPACE_AVX_H_ diff --git a/qsim/statespace_avx512.h b/qsim/statespace_avx512.h deleted file mode 100644 index 879fd89..0000000 --- a/qsim/statespace_avx512.h +++ /dev/null @@ -1,448 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_AVX512_H_ -#define STATESPACE_AVX512_H_ - -#include - -#include -#include -#include -#include -#include - -#include "statespace.h" -#include "util.h" -#include "vectorspace.h" - -namespace qsim { - -namespace detail { - -inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) { - __m512i s1 = _mm512_setr_epi64( - i + 0, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7); - __m512i s2 = _mm512_setr_epi64( - i + 8, i + 9, i + 10, i + 11, i + 12, i + 13, i + 14, i + 15); - __m512i ma = _mm512_set1_epi64(mask); - __m512i bi = _mm512_set1_epi64(bits); - - s1 = _mm512_and_si512(s1, ma); - s2 = _mm512_and_si512(s2, ma); - - unsigned m1 = _mm512_cmpeq_epu64_mask(s1, bi); - unsigned m2 = _mm512_cmpeq_epu64_mask(s2, bi); - - return (m2 << 8) | m1; -} - -inline double HorizontalSumAVX(__m256 s) { - __m128 l = _mm256_castps256_ps128(s); - __m128 h = _mm256_extractf128_ps(s, 1); - __m128 s1 = _mm_add_ps(h, l); - __m128 s1s = _mm_movehdup_ps(s1); - __m128 s2 = _mm_add_ps(s1, s1s); - - return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2))); -} - -inline double 
HorizontalSumAVX512(__m512 s) { - __m256 l = _mm512_castps512_ps256(s); - __m512d sd = _mm512_castps_pd(s); - __m256d hd = _mm512_extractf64x4_pd(sd, 1); - __m256 h = _mm256_castpd_ps(hd); - __m256 p = _mm256_add_ps(h, l); - - return HorizontalSumAVX(p); -} - -} // namespace detail - -/** - * Object containing context and routines for AVX state-vector manipulations. - * State is a vectorized sequence of sixteen real components followed by - * sixteen imaginary components. Sixteen single-precison floating numbers can - * be loaded into an AVX512 register. - */ -template -class StateSpaceAVX512 : - public StateSpace, VectorSpace, For, float> { - private: - using Base = StateSpace, qsim::VectorSpace, For, float>; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - template - explicit StateSpaceAVX512(ForArgs&&... args) : Base(args...) {} - - static uint64_t MinSize(unsigned num_qubits) { - return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits)); - }; - - void InternalToNormalOrder(State& state) const { - __m512i idx1 = _mm512_setr_epi32( - 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - __m512i idx2 = _mm512_setr_epi32( - 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - - auto f = [](unsigned n, unsigned m, uint64_t i, - __m512i idx1, __m512i idx2, fp_type* p) { - __m512 v1 = _mm512_load_ps(p + 32 * i); - __m512 v2 = _mm512_load_ps(p + 32 * i + 16); - - _mm512_store_ps(p + 32 * i, _mm512_permutex2var_ps(v1, idx1, v2)); - _mm512_store_ps(p + 32 * i + 16, _mm512_permutex2var_ps(v1, idx2, v2)); - }; - - Base::for_.Run( - MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get()); - } - - void NormalToInternalOrder(State& state) const { - __m512i idx1 = _mm512_setr_epi32( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); - __m512i idx2 = _mm512_setr_epi32( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); - - auto f = [](unsigned n, unsigned m, uint64_t i, - __m512i 
idx1, __m512i idx2, fp_type* p) { - __m512 re = _mm512_load_ps(p + 32 * i); - __m512 im = _mm512_load_ps(p + 32 * i + 16); - - _mm512_store_ps(p + 32 * i, _mm512_permutex2var_ps(re, idx1, im)); - _mm512_store_ps(p + 32 * i + 16, _mm512_permutex2var_ps(re, idx2, im)); - }; - - Base::for_.Run( - MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get()); - } - - void SetAllZeros(State& state) const { - __m512 val0 = _mm512_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) { - _mm512_store_ps(p + 32 * i, val0); - _mm512_store_ps(p + 32 * i + 16, val0); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get()); - } - - // Uniform superposition. - void SetStateUniform(State& state) const { - __m512 val0 = _mm512_setzero_ps(); - __m512 valu; - - fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); - - switch (state.num_qubits()) { - case 1: - valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v); - break; - case 2: - valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v); - break; - case 3: - valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v, v, v, v, v); - break; - default: - valu = _mm512_set1_ps(v); - break; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const __m512& val0, const __m512& valu, fp_type* p) { - _mm512_store_ps(p + 32 * i, valu); - _mm512_store_ps(p + 32 * i + 16, val0); - }; - - Base::for_.Run( - MinSize(state.num_qubits()) / 32, f, val0, valu, state.get()); - } - - // |0> state. 
- void SetStateZero(State& state) const { - SetAllZeros(state); - state.get()[0] = 1; - } - - static std::complex GetAmpl(const State& state, uint64_t i) { - uint64_t p = (32 * (i / 16)) + (i % 16); - return std::complex(state.get()[p], state.get()[p + 16]); - } - - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - uint64_t p = (32 * (i / 16)) + (i % 16); - state.get()[p] = std::real(ampl); - state.get()[p + 16] = std::imag(ampl); - } - - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - uint64_t p = (32 * (i / 16)) + (i % 16); - state.get()[p] = re; - state.get()[p + 16] = im; - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - __m512 re_reg = _mm512_set1_ps(re); - __m512 im_reg = _mm512_set1_ps(im); - - __mmask16 exclude_n = exclude ? 
0xffff : 0; - - auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, - uint64_t bitsv, __m512 re_n, __m512 im_n, __mmask16 exclude_n, - fp_type* p) { - __m512 re = _mm512_load_ps(p + 32 * i); - __m512 im = _mm512_load_ps(p + 32 * i + 16); - - __mmask16 ml = - detail::GetZeroMaskAVX512(16 * i, maskv, bitsv) ^ exclude_n; - - re = _mm512_mask_blend_ps(ml, re, re_n); - im = _mm512_mask_blend_ps(ml, im, im_n); - - _mm512_store_ps(p + 32 * i, re); - _mm512_store_ps(p + 32 * i + 16, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 32, f, mask, bits, - re_reg, im_reg, exclude_n, state.get()); - } - - // Does the equivalent of dest += src elementwise. - bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, fp_type* p2) { - __m512 re1 = _mm512_load_ps(p1 + 32 * i); - __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); - __m512 re2 = _mm512_load_ps(p2 + 32 * i); - __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); - - _mm512_store_ps(p2 + 32 * i, _mm512_add_ps(re1, re2)); - _mm512_store_ps(p2 + 32 * i + 16, _mm512_add_ps(im1, im2)); - }; - - Base::for_.Run(MinSize(src.num_qubits()) / 32, f, src.get(), dest.get()); - - return true; - } - - // Does the equivalent of state *= a elementwise. 
- void Multiply(fp_type a, State& state) const { - __m512 r = _mm512_set1_ps(a); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m512 r, fp_type* p) { - __m512 re = _mm512_load_ps(p + 32 * i); - __m512 im = _mm512_load_ps(p + 32 * i + 16); - - _mm512_store_ps(p + 32 * i, _mm512_mul_ps(re, r)); - _mm512_store_ps(p + 32 * i + 16, _mm512_mul_ps(im, r)); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 32, f, r, state.get()); - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> std::complex { - __m512 re1 = _mm512_load_ps(p1 + 32 * i); - __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); - __m512 re2 = _mm512_load_ps(p2 + 32 * i); - __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); - - __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2)); - __m512 ip_im = _mm512_fnmadd_ps(im1, re2, _mm512_mul_ps(re1, im2)); - - double re = detail::HorizontalSumAVX512(ip_re); - double im = detail::HorizontalSumAVX512(ip_im); - - return std::complex{re, im}; - }; - - using Op = std::plus>; - return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f, - Op(), state1.get(), state2.get()); - } - - double RealInnerProduct(const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> double { - __m512 re1 = _mm512_load_ps(p1 + 32 * i); - __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); - __m512 re2 = _mm512_load_ps(p2 + 32 * i); - __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); - - __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2)); - - return detail::HorizontalSumAVX512(ip_re); - }; - - using Op = std::plus; - return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f, - Op(), 
state1.get(), state2.get()); - } - - template - std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - double norm = 0; - uint64_t size = MinSize(state.num_qubits()) / 32; - const fp_type* p = state.get(); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 16; ++j) { - double re = p[32 * k + j]; - double im = p[32 * k + 16 + j]; - norm += re * re + im * im; - } - } - - auto rs = GenerateRandomValues(num_samples, seed, norm); - - uint64_t m = 0; - double csum = 0; - bitstrings.reserve(num_samples); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 16; ++j) { - double re = p[32 * k + j]; - double im = p[32 * k + 16 + j]; - csum += re * re + im * im; - while (rs[m] < csum && m < num_samples) { - bitstrings.emplace_back(16 * k + j); - ++m; - } - } - } - - for (; m < num_samples; ++m) { - bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); - } - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - void Collapse(const MeasurementResult& mr, State& state) const { - auto f1 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, const fp_type* p) -> double { - __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits); - - __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i); - __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16); - __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re)); - - return detail::HorizontalSumAVX512(s1); - }; - - using Op = std::plus; - double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 32, f1, - Op(), mr.mask, mr.bits, state.get()); - - __m512 renorm = _mm512_set1_ps(1.0 / std::sqrt(norm)); - - auto f2 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, __m512 renorm, fp_type* p) { - __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits); - - __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i); - __m512 im = 
_mm512_maskz_load_ps(ml, p + 32 * i + 16); - - re = _mm512_mul_ps(re, renorm); - im = _mm512_mul_ps(im, renorm); - - _mm512_store_ps(p + 32 * i, re); - _mm512_store_ps(p + 32 * i + 16, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 32, f2, - mr.mask, mr.bits, renorm, state.get()); - } - - std::vector PartialNorms(const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p) -> double { - __m512 re = _mm512_load_ps(p + 32 * i); - __m512 im = _mm512_load_ps(p + 32 * i + 16); - __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re)); - - return detail::HorizontalSumAVX512(s1); - }; - - using Op = std::plus; - return Base::for_.RunReduceP( - MinSize(state.num_qubits()) / 32, f, Op(), state.get()); - } - - uint64_t FindMeasuredBits( - unsigned m, double r, uint64_t mask, const State& state) const { - double csum = 0; - - uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 32, m); - uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 32, m); - - const fp_type* p = state.get(); - - for (uint64_t k = k0; k < k1; ++k) { - for (uint64_t j = 0; j < 16; ++j) { - auto re = p[32 * k + j]; - auto im = p[32 * k + j + 16]; - csum += re * re + im * im; - if (r < csum) { - return (16 * k + j) & mask; - } - } - } - - // Return the last bitstring in the unlikely case of underflow. - return (16 * k1 - 1) & mask; - } -}; - -} // namespace qsim - -#endif // STATESPACE_AVX512_H_ diff --git a/qsim/statespace_basic.h b/qsim/statespace_basic.h deleted file mode 100644 index 6468483..0000000 --- a/qsim/statespace_basic.h +++ /dev/null @@ -1,300 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_BASIC_H_ -#define STATESPACE_BASIC_H_ - -#include -#include -#include -#include - -#include "statespace.h" -#include "util.h" -#include "vectorspace.h" - -namespace qsim { - -/** - * Object containing context and routines for unoptimized state-vector - * manipulations. State is a non-vectorized sequence of one real amplitude - * followed by one imaginary amplitude. - */ -template -class StateSpaceBasic : - public StateSpace, VectorSpace, For, FP> { - private: - using Base = StateSpace, qsim::VectorSpace, For, FP>; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - template - explicit StateSpaceBasic(ForArgs&&... args) : Base(args...) {} - - static uint64_t MinSize(unsigned num_qubits) { - return 2 * (uint64_t{1} << num_qubits); - }; - - void InternalToNormalOrder(State& state) const {} - - void NormalToInternalOrder(State& state) const {} - - void SetAllZeros(State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - p[2 * i] = 0; - p[2 * i + 1] = 0; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get()); - } - - // Uniform superposition. - void SetStateUniform(State& state) const { - fp_type val = fp_type{1} / std::sqrt(uint64_t{1} << state.num_qubits()); - - auto f = [](unsigned n, unsigned m, uint64_t i, - fp_type val, fp_type* p) { - p[2 * i] = val; - p[2 * i + 1] = 0; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f, val, state.get()); - } - - // |0> state. 
- void SetStateZero(State& state) const { - SetAllZeros(state); - state.get()[0] = 1; - } - - static std::complex GetAmpl(const State& state, uint64_t i) { - uint64_t p = 2 * i; - return std::complex(state.get()[p], state.get()[p + 1]); - } - - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - uint64_t p = 2 * i; - state.get()[p] = std::real(ampl); - state.get()[p + 1] = std::imag(ampl); - } - - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - uint64_t p = 2 * i; - state.get()[p] = re; - state.get()[p + 1] = im; - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, - uint64_t bitsv, fp_type re_n, fp_type im_n, bool excludev, - fp_type* p) { - auto s = p + 2 * i; - bool in_mask = (i & maskv) == bitsv; - in_mask ^= excludev; - s[0] = in_mask ? re_n : s[0]; - s[1] = in_mask ? im_n : s[1]; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f, mask, bits, re, im, - exclude, state.get()); - } - - // Does the equivalent of dest += src elementwise. 
- bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, fp_type* p2) { - p2[2 * i] += p1[2 * i]; - p2[2 * i + 1] += p1[2 * i + 1]; - }; - - Base::for_.Run(MinSize(src.num_qubits()) / 2, f, src.get(), dest.get()); - - return true; - } - - // Does the equivalent of state *= a elementwise. - void Multiply(fp_type a, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type a, fp_type* p) { - p[2 * i] *= a; - p[2 * i + 1] *= a; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f, a, state.get()); - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> std::complex { - auto s1 = p1 + 2 * i; - auto s2 = p2 + 2 * i; - - double re = s1[0] * s2[0] + s1[1] * s2[1]; - double im = s1[0] * s2[1] - s1[1] * s2[0]; - - return std::complex{re, im}; - }; - - using Op = std::plus>; - return Base::for_.RunReduce( - MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get()); - } - - double RealInnerProduct(const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> double { - auto s1 = p1 + 2 * i; - auto s2 = p2 + 2 * i; - - return s1[0] * s2[0] + s1[1] * s2[1]; - }; - - using Op = std::plus; - return Base::for_.RunReduce( - MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get()); - } - - template - std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - double norm = 0; - uint64_t size = MinSize(state.num_qubits()) / 2; - - const fp_type* p = 
state.get(); - - for (uint64_t k = 0; k < size; ++k) { - double re = p[2 * k]; - double im = p[2 * k + 1]; - norm += re * re + im * im; - } - - auto rs = GenerateRandomValues(num_samples, seed, norm); - - uint64_t m = 0; - double csum = 0; - bitstrings.reserve(num_samples); - - for (uint64_t k = 0; k < size; ++k) { - double re = p[2 * k]; - double im = p[2 * k + 1]; - csum += re * re + im * im; - while (rs[m] < csum && m < num_samples) { - bitstrings.emplace_back(k); - ++m; - } - } - - for (; m < num_samples; ++m) { - bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); - } - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - void Collapse(const MeasurementResult& mr, State& state) const { - auto f1 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, const fp_type* p) -> double { - auto s = p + 2 * i; - return (i & mask) == bits ? s[0] * s[0] + s[1] * s[1] : 0; - }; - - using Op = std::plus; - double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 2, f1, - Op(), mr.mask, mr.bits, state.get()); - - double renorm = 1.0 / std::sqrt(norm); - - auto f2 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, fp_type renorm, fp_type* p) { - auto s = p + 2 * i; - bool not_zero = (i & mask) == bits; - - s[0] = not_zero ? s[0] * renorm : 0; - s[1] = not_zero ? 
s[1] * renorm : 0; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f2, - mr.mask, mr.bits, renorm, state.get()); - } - - std::vector PartialNorms(const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p) -> double { - auto s = p + 2 * i; - return s[0] * s[0] + s[1] * s[1]; - }; - - using Op = std::plus; - return Base::for_.RunReduceP( - MinSize(state.num_qubits()) / 2, f, Op(), state.get()); - } - - uint64_t FindMeasuredBits( - unsigned m, double r, uint64_t mask, const State& state) const { - double csum = 0; - - uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 2, m); - uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 2, m); - - const fp_type* p = state.get(); - - for (uint64_t k = k0; k < k1; ++k) { - auto re = p[2 * k]; - auto im = p[2 * k + 1]; - csum += re * re + im * im; - if (r < csum) { - return k & mask; - } - } - - // Return the last bitstring in the unlikely case of underflow. - return (k1 - 1) & mask; - } -}; - -} // namespace qsim - -#endif // STATESPACE_BASIC_H_ diff --git a/qsim/statespace_cuda.h b/qsim/statespace_cuda.h deleted file mode 100644 index 660db07..0000000 --- a/qsim/statespace_cuda.h +++ /dev/null @@ -1,470 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef STATESPACE_CUDA_H_ -#define STATESPACE_CUDA_H_ - -#ifdef __NVCC__ - #include -#elif __HIP__ - #include - #include "cuda2hip.h" -#endif - -#include -#include -#include - -#include "statespace.h" -#include "statespace_cuda_kernels.h" -#include "vectorspace_cuda.h" -#include "util_cuda.h" - -namespace qsim { - -/** - * Object containing context and routines for CUDA state-vector manipulations. - * State is a vectorized sequence of 32 real components followed by 32 - * imaginary components. 32 floating numbers can be proccessed in parallel by - * a single warp. It is not recommended to use `GetAmpl` and `SetAmpl`. - */ -template -class StateSpaceCUDA : - public StateSpace, VectorSpaceCUDA, FP> { - private: - using Base = StateSpace, qsim::VectorSpaceCUDA, FP>; - - protected: - struct Grid { - unsigned threads; - unsigned dblocks; - unsigned blocks; - }; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - struct Parameter { - /** - * The number of threads per block. - * Should be 2 to the power of k, where k is in the range [5,10]. - */ - unsigned num_threads = 512; - /** - * The number of data blocks. Each thread processes num_dblocks data - * blocks in reductions (norms, inner products, etc). 
- */ - unsigned num_dblocks = 16; - }; - - explicit StateSpaceCUDA(const Parameter& param) - : param_(param), scratch_(nullptr), scratch_size_(0) {} - - virtual ~StateSpaceCUDA() { - if (scratch_ != nullptr) { - ErrorCheck(cudaFree(scratch_)); - } - } - - static uint64_t MinSize(unsigned num_qubits) { - return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits)); - }; - - void InternalToNormalOrder(State& state) const { - uint64_t size = MinSize(state.num_qubits()) / 2; - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - unsigned bytes = 2 * threads * sizeof(fp_type); - - InternalToNormalOrderKernel<<>>(state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void NormalToInternalOrder(State& state) const { - uint64_t size = MinSize(state.num_qubits()) / 2; - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - unsigned bytes = 2 * threads * sizeof(fp_type); - - NormalToInternalOrderKernel<<>>(state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void SetAllZeros(State& state) const { - ErrorCheck(cudaMemset(state.get(), 0, - MinSize(state.num_qubits()) * sizeof(fp_type))); - } - - // Uniform superposition. - void SetStateUniform(State& state) const { - uint64_t size = MinSize(state.num_qubits()) / 2; - uint64_t hsize = uint64_t{1} << state.num_qubits(); - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - - fp_type v = double{1} / std::sqrt(hsize); - - SetStateUniformKernel<<>>(v, hsize, state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - // |0> state. 
- void SetStateZero(State& state) const { - SetAllZeros(state); - fp_type one[1] = {1}; - ErrorCheck( - cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // It is not recommended to use this function. - static std::complex GetAmpl(const State& state, uint64_t i) { - fp_type re, im; - auto p = state.get() + 64 * (i / 32) + i % 32; - ErrorCheck(cudaMemcpy(&re, p, sizeof(fp_type), cudaMemcpyDeviceToHost)); - ErrorCheck( - cudaMemcpy(&im, p + 32, sizeof(fp_type), cudaMemcpyDeviceToHost)); - return std::complex(re, im); - } - - // It is not recommended to use this function. - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - fp_type re = std::real(ampl); - fp_type im = std::imag(ampl); - auto p = state.get() + 64 * (i / 32) + i % 32; - ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // It is not recommended to use this function. - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - auto p = state.get() + 64 * (i / 32) + i % 32; - ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. 
- void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - uint64_t size = MinSize(state.num_qubits()) / 2; - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - - BulkSetAmplKernel<<>>( - mask, bits, re, im, exclude, state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - // Does the equivalent of dest += src elementwise. - bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - uint64_t size = MinSize(src.num_qubits()); - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - - AddKernel<<>>(src.get(), dest.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - return true; - } - - // Does the equivalent of state *= a elementwise. - void Multiply(fp_type a, State& state) const { - uint64_t size = MinSize(state.num_qubits()); - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - - MultiplyKernel<<>>(a, state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - using C = Complex; - auto r = Reduce>(state1, state2); - - return {r.re, r.im}; - } - - double RealInnerProduct(const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - return Reduce>(state1, state2); - } - - double Norm(const State& state) const { - return Reduce>(state, state); - } - - template - std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 
2); - unsigned bytes = g1.threads * sizeof(double); - - unsigned scratch_size = (g1.blocks + 1) * sizeof(double) - + num_samples * (sizeof(uint64_t) + sizeof(DistrRealType)); - - void* scratch = AllocScratch(scratch_size); - - double* d_res2 = (double*) scratch; - double* d_res1 = d_res2 + 1; - uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks); - DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples); - - auto op1 = RealProduct(); - auto op2 = Plus(); - - Reduce1Kernel<<>>( - g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - double norm; - - if (g1.blocks == 1) { - ErrorCheck( - cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost)); - } else { - Grid g2 = GetGrid2(g1.blocks); - unsigned bytes = g2.threads * sizeof(double); - - auto op3 = Plus(); - - Reduce2Kernel<<>>( - g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&norm, d_res2, sizeof(double), cudaMemcpyDeviceToHost)); - } - - // TODO: generate random values on the device. 
- auto rs = GenerateRandomValues(num_samples, seed, norm); - - ErrorCheck(cudaMemcpy(d_rs, rs.data(), - num_samples * sizeof(DistrRealType), - cudaMemcpyHostToDevice)); - - SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples, - d_rs, d_res1, state.get(), d_bitstrings); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - bitstrings.resize(num_samples, 0); - - ErrorCheck(cudaMemcpy(bitstrings.data(), d_bitstrings, - num_samples * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - void Collapse(const MeasurementResult& mr, State& state) const { - using Op = RealProduct; - double r = Reduce(mr.mask, mr.bits, state, state); - fp_type renorm = 1 / std::sqrt(r); - - uint64_t size = MinSize(state.num_qubits()) / 2; - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - - CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - std::vector PartialNorms(const State& state) const { - Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); - - unsigned scratch_size = g.blocks * sizeof(double); - unsigned bytes = g.threads * sizeof(double); - - double* d_res = (double*) AllocScratch(scratch_size); - - auto op1 = RealProduct(); - auto op2 = Plus(); - - Reduce1Kernel<<>>( - g.dblocks, op1, op2, op2, state.get(), state.get(), d_res); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - std::vector norms(g.blocks); - - ErrorCheck( - cudaMemcpy(norms.data(), d_res, scratch_size, cudaMemcpyDeviceToHost)); - - return norms; - } - - uint64_t FindMeasuredBits( - unsigned m, double r, uint64_t mask, const State& state) const { - Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); - - uint64_t res; - uint64_t* d_res = (uint64_t*) AllocScratch(sizeof(uint64_t)); - - FindMeasuredBitsKernel<<<1, 
g.threads>>>( - m, g.dblocks, r, state.get(), d_res); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&res, d_res, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - - return res & mask; - } - - protected: - Parameter param_; - - void* AllocScratch(uint64_t size) const { - if (size > scratch_size_) { - if (scratch_ != nullptr) { - ErrorCheck(cudaFree(scratch_)); - } - - ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); - - const_cast(scratch_size_) = size; - } - - return scratch_; - } - - Grid GetGrid1(uint64_t size) const { - Grid grid; - - grid.threads = std::min(size, uint64_t{param_.num_threads}); - grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks}); - grid.blocks = size / (grid.threads * grid.dblocks); - - return grid; - } - - Grid GetGrid2(unsigned size) const { - Grid grid; - - grid.threads = std::min(param_.num_threads, std::max(32U, size)); - grid.dblocks = std::max(1U, size / grid.threads); - grid.blocks = 1; - - return grid; - } - - template - FP2 Reduce(const State& state1, const State& state2) const { - return Reduce(0, 0, state1, state2); - } - - template - FP2 Reduce(uint64_t mask, uint64_t bits, - const State& state1, const State& state2) const { - uint64_t size = MinSize(state1.num_qubits()) / 2; - - Grid g1 = GetGrid1(size); - unsigned bytes = g1.threads * sizeof(FP1); - - FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2)); - FP2* d_res1 = d_res2 + 1; - - auto op1 = Op(); - auto op2 = Plus(); - auto op3 = Plus::type>(); - - if (mask == 0) { - Reduce1Kernel<<>>( - g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1); - } else { - Reduce1MaskedKernel<<>>( - g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(), - d_res1); - } - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - FP2 result; - - if (g1.blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost)); - } else { - 
Grid g2 = GetGrid2(g1.blocks); - unsigned bytes = g2.threads * sizeof(FP2); - - auto op2 = Plus(); - auto op3 = Plus::type>(); - - Reduce2Kernel<<>>( - g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, d_res2, sizeof(FP2), cudaMemcpyDeviceToHost)); - } - - return result; - } - - private: - void* scratch_; - uint64_t scratch_size_; -}; - -} // namespace qsim - -#endif // STATESPACE_CUDA_H_ diff --git a/qsim/statespace_cuda_kernels.h b/qsim/statespace_cuda_kernels.h deleted file mode 100644 index b54ebca..0000000 --- a/qsim/statespace_cuda_kernels.h +++ /dev/null @@ -1,355 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef STATESPACE_CUDA_KERNELS_H_ -#define STATESPACE_CUDA_KERNELS_H_ - -#ifdef __NVCC__ - #include -#elif __HIP__ - #include - #include "cuda2hip.h" -#endif - -#include "util_cuda.h" - -namespace qsim { - -namespace detail { - -template -__device__ __forceinline__ FP1 BlockReduce1( - uint64_t n, Op1 op1, Op2 op2, Op3 op3, const FP2* s1, const FP2* s2) { - extern __shared__ float shared[]; - FP1* partial1 = (FP1*) shared; - - unsigned tid = threadIdx.x; - unsigned warp = threadIdx.x / warp_size; - unsigned lane = threadIdx.x % warp_size; - - uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; - uint64_t k1 = k0 + 2 * n * blockDim.x; - - FP1 r; - - r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]); - while ((k0 += 2 * blockDim.x) < k1) { - r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size])); - } - - partial1[tid] = r; - - __shared__ FP1 partial2[warp_size]; - - if (tid < warp_size) { - partial2[tid] = 0; - } - - __syncthreads(); - - FP1 val = WarpReduce(partial1[tid], op3); - - if (lane == 0) { - partial2[warp] = val; - } - - __syncthreads(); - - FP1 result = 0; - - if (tid < warp_size) { - result = WarpReduce(partial2[tid], op3); - } - - return result; -} - -template -__device__ __forceinline__ FP1 BlockReduce1Masked( - uint64_t n, uint64_t mask, uint64_t bits, Op1 op1, Op2 op2, Op3 op3, - const FP2* s1, const FP2* s2) { - extern __shared__ float shared[]; - FP1* partial1 = (FP1*) shared; - - unsigned tid = threadIdx.x; - unsigned warp = threadIdx.x / warp_size; - unsigned lane = threadIdx.x % warp_size; - - uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; - uint64_t k1 = k0 + 2 * n * blockDim.x; - - FP1 r = 0; - - if (((k0 + lane) / 2 & mask) == bits) { - r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]); - } - while ((k0 += 2 * blockDim.x) < k1) { - if (((k0 + lane) / 2 & mask) == bits) { - r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size])); - } - } - - 
partial1[tid] = r; - - __shared__ FP1 partial2[warp_size]; - - if (tid < warp_size) { - partial2[tid] = 0; - } - - __syncthreads(); - - FP1 val = WarpReduce(partial1[tid], op3); - - if (lane == 0) { - partial2[warp] = val; - } - - __syncthreads(); - - FP1 result = 0; - - if (tid < warp_size) { - result = WarpReduce(partial2[tid], op3); - } - - return result; -} - -template -__device__ __forceinline__ FP1 BlockReduce2( - uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s) { - extern __shared__ float shared[]; - FP1* partial1 = (FP1*) shared; - - unsigned tid = threadIdx.x; - uint64_t k0 = n * blockIdx.x * blockDim.x + tid; - uint64_t k1 = k0 + n * blockDim.x; - - FP1 r = 0; - - if (tid < size) { - r = s[k0]; - while ((k0 += blockDim.x) < k1) { - r = op2(r, s[k0]); - } - } - - partial1[tid] = r; - - __shared__ FP1 partial2[warp_size]; - - if (tid < warp_size) { - partial2[tid] = 0; - } - - __syncthreads(); - - FP1 val = WarpReduce(partial1[tid], op3); - - if (threadIdx.x % warp_size == 0) { - partial2[threadIdx.x / warp_size] = val; - } - - __syncthreads(); - - FP1 result = 0; - - if (tid < warp_size) { - result = WarpReduce(partial2[tid], op3); - } - - return result; -} - -} // namespace detail - -template -__global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3, - const FP2* s1, const FP2* s2, FP3* result) { - FP1 sum = detail::BlockReduce1(n, op1, op2, op3, s1, s2); - - if (threadIdx.x == 0) { - result[blockIdx.x] = sum; - } -} - -template -__global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits, - Op1 op1, Op2 op2, Op3 op3, - const FP2* s1, const FP2* s2, FP3* result) { - FP1 sum = - detail::BlockReduce1Masked(n, mask, bits, op1, op2, op3, s1, s2); - - if (threadIdx.x == 0) { - result[blockIdx.x] = sum; - } -} - -template -__global__ void Reduce2Kernel( - uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s, FP3* result) { - FP1 sum = detail::BlockReduce2(n, size, op2, op3, s); - - if (threadIdx.x == 0) { - 
result[blockIdx.x] = sum; - } -} - -template -__global__ void InternalToNormalOrderKernel(FP* state) { - unsigned lane = threadIdx.x % warp_size; - unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; - - extern __shared__ float shared[]; - FP* buf = (FP*) shared; - - buf[l] = state[k]; - buf[l + warp_size] = state[k + warp_size]; - - __syncthreads(); - - state[k + lane] = buf[l]; - state[k + lane + 1] = buf[l + warp_size]; -} - -template -__global__ void NormalToInternalOrderKernel(FP* state) { - unsigned lane = threadIdx.x % warp_size; - unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; - - extern __shared__ float shared[]; - FP* buf = (FP*) shared; - - buf[l] = state[k]; - buf[l + warp_size] = state[k + warp_size]; - - __syncthreads(); - - state[k] = buf[l + lane]; - state[k + warp_size] = buf[l + lane + 1]; -} - -template -__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { - unsigned lane = threadIdx.x % warp_size; - uint64_t k = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane; - - state[k] = lane < size ? 
v : 0; - state[k + warp_size] = 0; -} - -template -__global__ void AddKernel(const FP* state1, FP* state2) { - uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; - state2[k] += state1[k]; -} - -template -__global__ void MultiplyKernel(FP a, FP* state) { - uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; - state[k] *= a; -} - -template -__global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { - uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; - uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; - - if ((k1 & mask) == bits) { - state[k2] *= r; - state[k2 + warp_size] *= r; - } else { - state[k2] = 0; - state[k2 + warp_size] = 0; - } -} - -template -__global__ void BulkSetAmplKernel( - uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) { - uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; - uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; - - bool set = ((k1 & mask) == bits) ^ exclude; - - if (set) { - state[k2] = re; - state[k2 + warp_size] = im; - } -} - -template -__global__ void SampleKernel(unsigned num_blocks, - uint64_t n, uint64_t num_samples, - const FP1* rs, const FP2* ps, const FP3* state, - uint64_t *bitstrings) { - // Use just one thread. This can be somewhat slow. - if (threadIdx.x == 0) { - uint64_t m = 0; - double csum = 0; - - for (unsigned block_id = 0; block_id < num_blocks; ++block_id) { - uint64_t km = n * blockDim.x; - uint64_t k0 = block_id * km; - - for (uint64_t k = 0; k < km; ++k) { - uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32; - FP3 re = state[l]; - FP3 im = state[l + warp_size]; - csum += re * re + im * im; - while (rs[m] < csum && m < num_samples) { - bitstrings[m++] = k0 + k; - } - } - } - } -} - -template -__global__ void FindMeasuredBitsKernel( - uint64_t block_id, uint64_t n, double r, const FP* state, uint64_t* res) { - // Use just one thread. This can be somewhat slow, however, this is - // more or less consistent with CPU implementations. 
- if (threadIdx.x == 0) { - double csum = 0; - uint64_t km = n * blockDim.x; - uint64_t k0 = block_id * km; - - for (uint64_t k = 0; k < km; ++k) { - uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32; - FP re = state[l]; - FP im = state[l + warp_size]; - csum += re * re + im * im; - if (r < csum) { - *res = k0 + k; - return; - } - } - - *res = k0 + n * blockDim.x - 1; - } -} - -} // namespace qsim - -#endif // STATESPACE_CUDA_KERNELS_H_ diff --git a/qsim/statespace_custatevec.h b/qsim/statespace_custatevec.h deleted file mode 100644 index f2f5de1..0000000 --- a/qsim/statespace_custatevec.h +++ /dev/null @@ -1,376 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_CUSTATEVEC_H_ -#define STATESPACE_CUSTATEVEC_H_ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "statespace.h" -#include "util_custatevec.h" -#include "vectorspace_cuda.h" - -namespace qsim { - -namespace detail { - -template -__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { - uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; - - if (k < size) { - state[2 * k] = v; - state[2 * k + 1] = 0; - } -} - -} // namespace detail - -/** - * Object containing context and routines for cuStateVec state-vector - * manipulations. It is not recommended to use `GetAmpl` and `SetAmpl`. 
- */ -template -class StateSpaceCuStateVec : - public StateSpace, VectorSpaceCUDA, FP> { - private: - using Base = StateSpace, qsim::VectorSpaceCUDA, FP>; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - static constexpr auto is_float = std::is_same::value; - - static constexpr auto kStateType = is_float ? CUDA_C_32F : CUDA_C_64F; - static constexpr auto kMatrixType = kStateType; - static constexpr auto kExpectType = CUDA_C_64F; - static constexpr auto kComputeType = - is_float ? CUSTATEVEC_COMPUTE_32F : CUSTATEVEC_COMPUTE_64F; - static constexpr auto kMatrixLayout = CUSTATEVEC_MATRIX_LAYOUT_ROW; - - explicit StateSpaceCuStateVec(const cublasHandle_t& cublas_handle, - const custatevecHandle_t& custatevec_handle) - : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle), - workspace_(nullptr), workspace_size_(0) {} - - virtual ~StateSpaceCuStateVec() { - if (workspace_ != nullptr) { - ErrorCheck(cudaFree(workspace_)); - } - } - - static uint64_t MinSize(unsigned num_qubits) { - return 2 * (uint64_t{1} << num_qubits); - }; - - void InternalToNormalOrder(State& state) const { - } - - void NormalToInternalOrder(State& state) const { - } - - void SetAllZeros(State& state) const { - ErrorCheck(cudaMemset(state.get(), 0, - MinSize(state.num_qubits()) * sizeof(fp_type))); - } - - // Uniform superposition. - void SetStateUniform(State& state) const { - uint64_t size = uint64_t{1} << state.num_qubits(); - - unsigned threads = size < 256 ? size : 256; - unsigned blocks = size / threads; - - fp_type v = double{1} / std::sqrt(size); - - detail::SetStateUniformKernel<<>>(v, size, state.get()); - ErrorCheck(cudaPeekAtLastError()); - } - - // |0> state. - void SetStateZero(State& state) const { - SetAllZeros(state); - fp_type one[1] = {1}; - ErrorCheck( - cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // It is not recommended to use this function. 
- static std::complex GetAmpl(const State& state, uint64_t i) { - fp_type a[2]; - auto p = state.get() + 2 * i; - ErrorCheck(cudaMemcpy(a, p, 2 * sizeof(fp_type), cudaMemcpyDeviceToHost)); - return std::complex(a[0], a[1]); - } - - // It is not recommended to use this function. - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - fp_type a[2] = {std::real(ampl), std::imag(ampl)}; - auto p = state.get() + 2 * i; - ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // It is not recommended to use this function. - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - fp_type a[2] = {re, im}; - auto p = state.get() + 2 * i; - ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - // Not implemented. - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - // Not implemented. - } - - // Does the equivalent of dest += src elementwise. 
- bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - uint64_t size = uint64_t{1} << src.num_qubits(); - - if (is_float) { - cuComplex a = {1.0, 0.0}; - auto p1 = (const cuComplex*) src.get(); - auto p2 = (cuComplex*) dest.get(); - ErrorCheck(cublasCaxpy(cublas_handle_, size, &a, p1, 1, p2, 1)); - } else { - cuDoubleComplex a = {1.0, 0.0}; - auto p1 = (const cuDoubleComplex*) src.get(); - auto p2 = (cuDoubleComplex*) dest.get(); - ErrorCheck(cublasZaxpy(cublas_handle_, size, &a, p1, 1, p2, 1)); - } - - return true; - } - - // Does the equivalent of state *= a elementwise. - void Multiply(fp_type a, State& state) const { - uint64_t size = uint64_t{1} << state.num_qubits(); - - if (is_float) { - float a1 = a; - auto p = (cuComplex*) state.get(); - ErrorCheck(cublasCsscal(cublas_handle_, size, &a1, p, 1)); - } else { - double a1 = a; - auto p = (cuDoubleComplex*) state.get(); - ErrorCheck(cublasZdscal(cublas_handle_, size, &a1, p, 1)); - } - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - uint64_t size = uint64_t{1} << state1.num_qubits(); - - if (is_float) { - cuComplex result; - auto p1 = (const cuComplex*) state1.get(); - auto p2 = (const cuComplex*) state2.get(); - ErrorCheck(cublasCdotc(cublas_handle_, size, p1, 1, p2, 1, &result)); - return {cuCrealf(result), cuCimagf(result)}; - } else { - cuDoubleComplex result; - auto p1 = (const cuDoubleComplex*) state1.get(); - auto p2 = (const cuDoubleComplex*) state2.get(); - ErrorCheck(cublasZdotc(cublas_handle_, size, p1, 1, p2, 1, &result)); - return {cuCreal(result), cuCimag(result)}; - } - } - - double RealInnerProduct(const State& state1, const State& state2) const { - return std::real(InnerProduct(state1, state2)); - } - - double Norm(const State& state) const { - uint64_t size = uint64_t{1} << state.num_qubits(); - - if (is_float) 
{ - float result; - auto p = (const cuComplex*) state.get(); - ErrorCheck(cublasScnrm2(cublas_handle_, size, p, 1, &result)); - return result * result; - } else { - double result; - auto p = (const cuDoubleComplex*) state.get(); - ErrorCheck(cublasDznrm2(cublas_handle_, size, p, 1, &result)); - return result * result; - } - } - - template - std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - auto rs = GenerateRandomValues(num_samples, seed, 1.0); - - size_t workspace_size; - custatevecSamplerDescriptor_t sampler; - - ErrorCheck(custatevecSamplerCreate( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), &sampler, num_samples, - &workspace_size)); - - AllocWorkSpace(workspace_size); - - ErrorCheck(custatevecSamplerPreprocess( - custatevec_handle_, sampler, workspace_, workspace_size)); - - std::vector bitstrings0(num_samples); - std::vector bitordering; - - bitordering.reserve(state.num_qubits()); - for (unsigned i = 0; i < state.num_qubits(); ++i) { - bitordering.push_back(i); - } - - ErrorCheck(custatevecSamplerSample( - custatevec_handle_, sampler, bitstrings0.data(), - bitordering.data(), state.num_qubits(), rs.data(), - num_samples, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER)); - - bitstrings.reserve(num_samples); - for (unsigned i = 0; i < num_samples; ++i) { - bitstrings.push_back(bitstrings0[i]); - } - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - template - MeasurementResult Measure(const std::vector& qubits, - RGen& rgen, State& state, - bool no_collapse = false) const { - auto r = RandomValue(rgen, 1.0); - - MeasurementResult result; - - result.valid = true; - result.mask = 0; - result.bits = 0; - result.bitstring.resize(qubits.size(), 0); - - for (auto q : qubits) { - if (q >= state.num_qubits()) { - result.valid = false; - return result; - } - - result.mask |= uint64_t{1} << q; - } - - auto collapse = 
no_collapse ? - CUSTATEVEC_COLLAPSE_NONE : CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO; - - ErrorCheck(custatevecBatchMeasure( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), (int*) result.bitstring.data(), - (int*) qubits.data(), qubits.size(), r, collapse)); - - for (std::size_t i = 0; i < result.bitstring.size(); ++i) { - result.bits |= result.bitstring[i] << qubits[i]; - } - - return result; - } - - template - MeasurementResult VirtualMeasure(const std::vector& qubits, - RGen& rgen, const State& state) const { - return Measure(qubits, rgen, const_cast(state), true); - } - - void Collapse(const MeasurementResult& mr, State& state) const { - unsigned count = 0; - - std::vector bitstring; - std::vector bitordering; - - bitstring.reserve(state.num_qubits()); - bitordering.reserve(state.num_qubits()); - - for (unsigned i = 0; i < state.num_qubits(); ++i) { - if (((mr.mask >> i) & 1) != 0) { - bitstring.push_back((mr.bits >> i) & 1); - bitordering.push_back(i); - ++count; - } - } - - ErrorCheck(custatevecCollapseByBitString( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), bitstring.data(), bitordering.data(), - count, 1.0)); - - // TODO: do we need the following? - double norm = Norm(state); - Multiply(1.0 / std::sqrt(norm), state); - } - - private: - void* AllocWorkSpace(size_t size) const { - if (size > workspace_size_) { - if (workspace_ != nullptr) { - ErrorCheck(cudaFree(workspace_)); - } - - ErrorCheck(cudaMalloc(const_cast(&workspace_), size)); - - const_cast(workspace_size_) = size; - } - - return workspace_; - } - - const cublasHandle_t cublas_handle_; - const custatevecHandle_t custatevec_handle_; - - void* workspace_; - size_t workspace_size_; -}; - -} // namespace qsim - -#endif // STATESPACE_CUSTATEVEC_H_ diff --git a/qsim/statespace_sse.h b/qsim/statespace_sse.h deleted file mode 100644 index cf41a09..0000000 --- a/qsim/statespace_sse.h +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright 2019 Google LLC. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_SSE_H_ -#define STATESPACE_SSE_H_ - -#include - -#include -#include -#include -#include -#include - -#include "statespace.h" -#include "util.h" -#include "vectorspace.h" - -namespace qsim { - -namespace detail { - -inline __m128i GetZeroMaskSSE(uint64_t i, uint64_t mask, uint64_t bits) { - __m128i s1 = _mm_set_epi64x(i + 2, i + 0); - __m128i s2 = _mm_set_epi64x(i + 3, i + 1); - __m128i ma = _mm_set1_epi64x(mask); - __m128i bi = _mm_set1_epi64x(bits); - - s1 = _mm_and_si128(s1, ma); - s2 = _mm_and_si128(s2, ma); - - s1 = _mm_cmpeq_epi64(s1, bi); - s2 = _mm_cmpeq_epi64(s2, bi); - - return _mm_blend_epi16(s1, s2, 204); // 11001100 -} - -inline double HorizontalSumSSE(__m128 s) { - __m128 ss = _mm_movehdup_ps(s); - __m128 s1 = _mm_add_ps(s, ss); - - return _mm_cvtss_f32(_mm_add_ss(s1, _mm_movehl_ps(ss, s1))); -} - -} // namespace detail - -/** - * Object containing context and routines for SSE state-vector manipulations. - * State is a vectorized sequence of four real components followed by four - * imaginary components. Four single-precison floating numbers can be loaded - * into an SSE register. 
- */ -template -class StateSpaceSSE : - public StateSpace, VectorSpace, For, float> { - private: - using Base = StateSpace, qsim::VectorSpace, For, float>; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - template - explicit StateSpaceSSE(ForArgs&&... args) : Base(args...) {} - - static uint64_t MinSize(unsigned num_qubits) { - return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits)); - }; - - void InternalToNormalOrder(State& state) const { - if (state.num_qubits() == 1) { - auto s = state.get(); - - s[2] = s[1]; - s[1] = s[4]; - s[3] = s[5]; - - for (uint64_t i = 4; i < 8; ++i) { - s[i] = 0; - } - } else { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - auto s = p + 8 * i; - - fp_type re[3]; - fp_type im[3]; - - for (uint64_t i = 0; i < 3; ++i) { - re[i] = s[i + 1]; - im[i] = s[i + 4]; - } - - for (uint64_t i = 0; i < 3; ++i) { - s[2 * i + 1] = im[i]; - s[2 * i + 2] = re[i]; - } - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get()); - } - } - - void NormalToInternalOrder(State& state) const { - if (state.num_qubits() == 1) { - auto s = state.get(); - - s[4] = s[1]; - s[1] = s[2]; - s[5] = s[3]; - - s[2] = 0; - s[3] = 0; - s[6] = 0; - s[7] = 0; - } else { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - auto s = p + 8 * i; - - fp_type re[3]; - fp_type im[3]; - - for (uint64_t i = 0; i < 3; ++i) { - im[i] = s[2 * i + 1]; - re[i] = s[2 * i + 2]; - } - - for (uint64_t i = 0; i < 3; ++i) { - s[i + 1] = re[i]; - s[i + 4] = im[i]; - } - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get()); - } - } - - void SetAllZeros(State& state) const { - __m128 val0 = _mm_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) { - _mm_store_ps(p + 8 * i, val0); - _mm_store_ps(p + 8 * i + 4, val0); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get()); - } - - // Uniform superposition. 
- void SetStateUniform(State& state) const { - __m128 val0 = _mm_setzero_ps(); - __m128 valu; - - fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); - - if (state.num_qubits() == 1) { - valu = _mm_set_ps(0, 0, v, v); - } else { - valu = _mm_set1_ps(v); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - __m128 val0, __m128 valu, fp_type* p) { - _mm_store_ps(p + 8 * i, valu); - _mm_store_ps(p + 8 * i + 4, val0); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, valu, state.get()); - } - - // |0> state. - void SetStateZero(State& state) const { - SetAllZeros(state); - state.get()[0] = 1; - } - - static std::complex GetAmpl(const State& state, uint64_t i) { - uint64_t p = (8 * (i / 4)) + (i % 4); - return std::complex(state.get()[p], state.get()[p + 4]); - } - - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - uint64_t p = (8 * (i / 4)) + (i % 4); - state.get()[p] = std::real(ampl); - state.get()[p + 4] = std::imag(ampl); - } - - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - uint64_t p = (8 * (i / 4)) + (i % 4); - state.get()[p] = re; - state.get()[p + 4] = im; - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val)); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. 
- void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - __m128 re_reg = _mm_set1_ps(re); - __m128 im_reg = _mm_set1_ps(im); - __m128i exclude_reg = _mm_setzero_si128(); - if (exclude) { - exclude_reg = _mm_cmpeq_epi32(exclude_reg, exclude_reg); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, - uint64_t bitsv, __m128 re_n, __m128 im_n, __m128i exclude_n, - fp_type* p) { - __m128 ml = _mm_castsi128_ps(_mm_xor_si128( - detail::GetZeroMaskSSE(4 * i, maskv, bitsv), exclude_n)); - - __m128 re = _mm_load_ps(p + 8 * i); - __m128 im = _mm_load_ps(p + 8 * i + 4); - - re = _mm_blendv_ps(re, re_n, ml); - im = _mm_blendv_ps(im, im_n, ml); - - _mm_store_ps(p + 8 * i, re); - _mm_store_ps(p + 8 * i + 4, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, mask, bits, re_reg, - im_reg, exclude_reg, state.get()); - } - - // Does the equivalent of dest += src elementwise. - bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, fp_type* p2) { - __m128 re1 = _mm_load_ps(p1 + 8 * i); - __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); - __m128 re2 = _mm_load_ps(p2 + 8 * i); - __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); - - _mm_store_ps(p2 + 8 * i, _mm_add_ps(re1, re2)); - _mm_store_ps(p2 + 8 * i + 4, _mm_add_ps(im1, im2)); - }; - - Base::for_.Run(MinSize(src.num_qubits()) / 8, f, src.get(), dest.get()); - - return true; - } - - // Does the equivalent of state *= a elementwise. 
- void Multiply(fp_type a, State& state) const { - __m128 r = _mm_set1_ps(a); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m128 r, fp_type* p) { - __m128 re = _mm_load_ps(p + 8 * i); - __m128 im = _mm_load_ps(p + 8 * i + 4); - - re = _mm_mul_ps(re, r); - im = _mm_mul_ps(im, r); - - _mm_store_ps(p + 8 * i, re); - _mm_store_ps(p + 8 * i + 4, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, r, state.get()); - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> std::complex { - __m128 re1 = _mm_load_ps(p1 + 8 * i); - __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); - __m128 re2 = _mm_load_ps(p2 + 8 * i); - __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); - - __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2)); - __m128 ip_im = _mm_sub_ps(_mm_mul_ps(re1, im2), _mm_mul_ps(im1, re2)); - - double re = detail::HorizontalSumSSE(ip_re); - double im = detail::HorizontalSumSSE(ip_im); - - return std::complex{re, im}; - }; - - using Op = std::plus>; - return Base::for_.RunReduce( - MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get()); - } - - double RealInnerProduct(const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> double { - __m128 re1 = _mm_load_ps(p1 + 8 * i); - __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); - __m128 re2 = _mm_load_ps(p2 + 8 * i); - __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); - - __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2)); - - return detail::HorizontalSumSSE(ip_re); - }; - - using Op = std::plus; - return Base::for_.RunReduce( - MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get()); - } - - template - 
std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - double norm = 0; - uint64_t size = MinSize(state.num_qubits()) / 8; - const fp_type* p = state.get(); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 4; ++j) { - double re = p[8 * k + j]; - double im = p[8 * k + 4 + j]; - norm += re * re + im * im; - } - } - - auto rs = GenerateRandomValues(num_samples, seed, norm); - - uint64_t m = 0; - double csum = 0; - bitstrings.reserve(num_samples); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 4; ++j) { - double re = p[8 * k + j]; - double im = p[8 * k + 4 + j]; - csum += re * re + im * im; - while (rs[m] < csum && m < num_samples) { - bitstrings.emplace_back(4 * k + j); - ++m; - } - } - } - - for (; m < num_samples; ++m) { - bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); - } - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - void Collapse(const MeasurementResult& mr, State& state) const { - __m128 zero = _mm_set1_ps(0); - - auto f1 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask, - uint64_t bits, __m128 zero, const fp_type* p) -> double { - __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits)); - - __m128 re = _mm_load_ps(p + 8 * i); - __m128 im = _mm_load_ps(p + 8 * i + 4); - __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)); - - s1 = _mm_blendv_ps(zero, s1, ml); - - return detail::HorizontalSumSSE(s1); - }; - - using Op = std::plus; - double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 8, f1, - Op(), mr.mask, mr.bits, zero, - state.get()); - - __m128 renorm = _mm_set1_ps(1.0 / std::sqrt(norm)); - - auto f2 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask, - uint64_t bits, __m128 renorm, __m128 zero, fp_type* p) { - __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits)); - - __m128 re = _mm_load_ps(p + 
8 * i); - __m128 im = _mm_load_ps(p + 8 * i + 4); - - re = _mm_blendv_ps(zero, _mm_mul_ps(re, renorm), ml); - im = _mm_blendv_ps(zero, _mm_mul_ps(im, renorm), ml); - - _mm_store_ps(p + 8 * i, re); - _mm_store_ps(p + 8 * i + 4, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f2, - mr.mask, mr.bits, renorm, zero, state.get()); - } - - std::vector PartialNorms(const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p) -> double { - __m128 re = _mm_load_ps(p + 8 * i); - __m128 im = _mm_load_ps(p + 8 * i + 4); - __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)); - - return detail::HorizontalSumSSE(s1); - }; - - using Op = std::plus; - return Base::for_.RunReduceP( - MinSize(state.num_qubits()) / 8, f, Op(), state.get()); - } - - uint64_t FindMeasuredBits( - unsigned m, double r, uint64_t mask, const State& state) const { - double csum = 0; - - uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 8, m); - uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 8, m); - - const fp_type* p = state.get(); - - for (uint64_t k = k0; k < k1; ++k) { - for (uint64_t j = 0; j < 4; ++j) { - auto re = p[8 * k + j]; - auto im = p[8 * k + 4 + j]; - csum += re * re + im * im; - if (r < csum) { - return (4 * k + j) & mask; - } - } - } - - // Return the last bitstring in the unlikely case of underflow. - return (4 * k1 - 1) & mask; - } -}; - -} // namespace qsim - -#endif // STATESPACE_SSE_H_ diff --git a/qsim/umux.h b/qsim/umux.h deleted file mode 100644 index 83b951b..0000000 --- a/qsim/umux.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UMUX_H_ -#define UMUX_H_ - -#ifdef __AVX512F__ -# include "unitary_calculator_avx512.h" - namespace qsim { - namespace unitary { - template - using UnitaryCalculator = UnitaryCalculatorAVX512; - } - } -#elif __AVX2__ -# include "unitary_calculator_avx.h" - namespace qsim { - namespace unitary { - template - using UnitaryCalculator = UnitaryCalculatorAVX; - } - } -#elif __SSE4_1__ -# include "unitary_calculator_sse.h" - namespace qsim { - namespace unitary { - template - using UnitaryCalculator = UnitaryCalculatorSSE; - } - } -#else -# include "unitary_calculator_basic.h" - namespace qsim { - namespace unitary { - template - using UnitaryCalculator = UnitaryCalculatorBasic; - } - } -#endif - -#endif // UMUX_H_ diff --git a/qsim/unitary_calculator_avx.h b/qsim/unitary_calculator_avx.h deleted file mode 100644 index 5e566ca..0000000 --- a/qsim/unitary_calculator_avx.h +++ /dev/null @@ -1,1028 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UNITARY_CALCULATOR_AVX_H_ -#define UNITARY_CALCULATOR_AVX_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "unitaryspace_avx.h" - -namespace qsim { -namespace unitary { - -/** - * Quantum circuit unitary calculator with AVX vectorization. - */ -template -class UnitaryCalculatorAVX final : public SimulatorBase { - public: - using UnitarySpace = UnitarySpaceAVX; - using Unitary = typename UnitarySpace::Unitary; - using fp_type = typename UnitarySpace::fp_type; - - using StateSpace = UnitarySpace; - using State = Unitary; - - template - explicit UnitaryCalculatorAVX(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using AVX instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 1: - if (qs[0] > 2) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 2) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 2) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<2, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<1, 2>(qs, matrix, state); - } else { - ApplyGateL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 2) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<3, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<2, 2>(qs, matrix, state); - } else { - ApplyGateL<1, 3>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 2) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<4, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<3, 2>(qs, matrix, state); - } else { - ApplyGateL<2, 3>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 2) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<5, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<4, 2>(qs, matrix, state); - } else { - ApplyGateL<3, 3>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using AVX instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- // Assume cqs[0] < cqs[1] < cqs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 1: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 4: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } 
- } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 8; - } - - private: - -#ifdef __BMI2__ - - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - 
_mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - auto m = GetMasks1(qs); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, m.imaskh, m.qmaskh, size, raw_size, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm256_load_ps(p0 + p); - is[k2] = _mm256_load_ps(p0 + p + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + 
L)]; - - auto m = GetMasks2(qs); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - - unsigned k = 3 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256 w[1 << (1 + 2 * H)]; - - auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm256_load_ps(p0 + p); - is[k2] = _mm256_load_ps(p0 + p + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size 
= UnitarySpace::MinRowSize(state.num_qubits()); - - if (CH) { - auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, - m.cvalsh, idx, size, raw_size, state.get()); - } else { - auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, - m.cvalsh, idx, size, raw_size, state.get()); - } - } - -#else // __BMI2__ - - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = 8 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn 
= _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, const __m256i* idx, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - uint64_t r = 8 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - rs[k2] = _mm256_load_ps(p0 + xss[k]); - is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = 
_mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = 8 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = 
_mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = 8 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], 
in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256 w[1 << (1 + 2 * H)]; - - auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, const __m256i* idx, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - uint64_t r = 8 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm256_load_ps(p0 + xss[k]); - is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = 
_mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - if (CH) { - auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size * size2, f, w, ms, xss, m.cvalsh, - m.cmaskh, idx, size, raw_size, state.get()); - } else { - auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size * size2, f, w, ms, xss, m.cvalsh, - m.cmaskh, idx, size, raw_size, state.get()); - } - } - -#endif // __BMI2__ - - template - static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) { - constexpr unsigned lsize = 1 << L; - - for (unsigned i = 0; i < lsize - 1; ++i) { - unsigned p[8]; - - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - } - - For for_; -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARY_CALCULATOR_AVX_H_ diff --git 
a/qsim/unitary_calculator_avx512.h b/qsim/unitary_calculator_avx512.h deleted file mode 100644 index 8105367..0000000 --- a/qsim/unitary_calculator_avx512.h +++ /dev/null @@ -1,644 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARY_CALCULATOR_AVX512_H_ -#define UNITARY_CALCULATOR_AVX512_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "unitaryspace_avx512.h" - -namespace qsim { -namespace unitary { - -/** - * Quantum circuit unitary calculator with AVX512 vectorization. - */ -template -class UnitaryCalculatorAVX512 final : public SimulatorBase { - public: - using UnitarySpace = UnitarySpaceAVX512; - using Unitary = typename UnitarySpace::Unitary; - using fp_type = typename UnitarySpace::fp_type; - - using StateSpace = UnitarySpace; - using State = Unitary; - - template - explicit UnitaryCalculatorAVX512(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using AVX512 instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 1: - if (qs[0] > 3) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 3) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 3) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<2, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<1, 2>(qs, matrix, state); - } else { - ApplyGateL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 3) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<3, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<2, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<1, 3>(qs, matrix, state); - } else { - ApplyGateL<0, 4>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 3) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<4, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<3, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<2, 3>(qs, matrix, state); - } else { - ApplyGateL<1, 4>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 3) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<5, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<4, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<3, 3>(qs, matrix, state); - } else { - ApplyGateL<2, 4>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using AVX512 instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. 
- */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - // Assume cqs[0] < cqs[1] < cqs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 1: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - 
break; - case 4: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[3] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * @return The size of SIMD register if applicable. 
- */ - static unsigned SIMDRegisterSize() { - return 16; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 ru, iu, rn, in; - __m512 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[l], ru, rn); - in = _mm512_fmadd_ps(rs[l], iu, in); - rn = _mm512_fnmadd_ps(is[l], iu, rn); - in = _mm512_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - auto m = GetMasks1(qs); - - unsigned k = 4 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, m.imaskh, m.qmaskh, size, raw_size, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m512 rn, in; - __m512 rs[gsize], is[gsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm512_load_ps(p0 + p); - is[k2] = _mm512_load_ps(p0 + p + 16); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); - is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512i idx[1 << L]; - __m512 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks2(qs); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 4 + H; - unsigned n = 
state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 ru, iu, rn, in; - __m512 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[l], ru, rn); - in = _mm512_fmadd_ps(rs[l], iu, in); - rn = _mm512_fnmadd_ps(is[l], iu, rn); - in = _mm512_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - - unsigned k = 4 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 rn, in; - __m512 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512 w[1 << (1 + 2 * H)]; - - auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned k = 4 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m512 rn, in; - __m512 rs[gsize], is[gsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm512_load_ps(p0 + p); - is[k2] = _mm512_load_ps(p0 + p + 16); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); - is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512i idx[1 << L]; - __m512 w[1 << (1 + 2 * H + L)]; - - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = 
UnitarySpace::MinRowSize(state.num_qubits()); - - if (CH) { - auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 4 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, - m.cvalsh, idx, size, raw_size, state.get()); - } else { - auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 4 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, - m.cvalsh, idx, size, raw_size, state.get()); - } - } - - template - static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) { - constexpr unsigned lsize = 1 << L; - - for (unsigned i = 0; i < lsize; ++i) { - unsigned p[16]; - - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - } - - For for_; -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARY_CALCULATOR_AVX512_H_ diff --git a/qsim/unitary_calculator_basic.h b/qsim/unitary_calculator_basic.h deleted file mode 100644 index 6b1821a..0000000 --- a/qsim/unitary_calculator_basic.h +++ /dev/null @@ -1,259 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARY_CALCULATOR_BASIC_H_ -#define UNITARY_CALCULATOR_BASIC_H_ - -#include -#include -#include -#include - -#include "simulator.h" -#include "unitaryspace_basic.h" - -namespace qsim { -namespace unitary { - -/** - * Quantum circuit unitary calculator without vectorization. - */ -template -class UnitaryCalculatorBasic final : public SimulatorBase { - public: - using UnitarySpace = UnitarySpaceBasic; - using Unitary = typename UnitarySpace::Unitary; - using fp_type = typename UnitarySpace::fp_type; - - using StateSpace = UnitarySpace; - using State = Unitary; - - template - explicit UnitaryCalculatorBasic(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using non-vectorized instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 1: - ApplyGateH<1>(qs, matrix, state); - break; - case 2: - ApplyGateH<2>(qs, matrix, state); - break; - case 3: - ApplyGateH<3>(qs, matrix, state); - break; - case 4: - ApplyGateH<4>(qs, matrix, state); - break; - case 5: - ApplyGateH<5>(qs, matrix, state); - break; - case 6: - ApplyGateH<6>(qs, matrix, state); - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using non-vectorized instructions. 
- * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 1: - ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state); - break; - case 2: - ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state); - break; - case 3: - ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state); - break; - case 4: - ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state); - break; - default: - // Not implemented. - break; - } - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 1; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - fp_type rn, in; - fp_type rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 1); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn += rs[l] * v[j] - is[l] * v[j + 1]; - in += rs[l] * v[j + 1] + is[l] * v[j]; 
- - j += 2; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 1) = in; - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); - } - - template - void ApplyControlledGateH(const std::vector& qs, - const std::vector& cqs, - uint64_t cvals, const fp_type* matrix, - State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - fp_type rn, in; - fp_type rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) == cvalsh) { - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 1); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn += rs[l] * v[j] - is[l] * v[j + 1]; - in += rs[l] * v[j + 1] + is[l] * v[j]; - - j += 2; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 1) = in; - } - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - - unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); - } - - For for_; -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARY_CALCULATOR_BASIC_H_ diff --git a/qsim/unitary_calculator_sse.h b/qsim/unitary_calculator_sse.h deleted file mode 100644 index a3c3f2e..0000000 --- a/qsim/unitary_calculator_sse.h +++ /dev/null @@ -1,639 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARY_CALCULATOR_SSE_H_ -#define UNITARY_CALCULATOR_SSE_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "unitaryspace_sse.h" - -namespace qsim { -namespace unitary { - -/** - * Quantum circuit unitary calculator with SSE vectorization. - */ -template -class UnitaryCalculatorSSE final : public SimulatorBase { - public: - using UnitarySpace = UnitarySpaceSSE; - using Unitary = typename UnitarySpace::Unitary; - using fp_type = typename UnitarySpace::fp_type; - - using StateSpace = UnitarySpace; - using State = Unitary; - - template - explicit UnitaryCalculatorSSE(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using SSE instructions. - * @param qs Indices of the qubits affected by this gate. 
- * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 1: - if (qs[0] > 1) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 1) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 1) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<2, 1>(qs, matrix, state); - } else { - ApplyGateL<1, 2>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 1) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<3, 1>(qs, matrix, state); - } else { - ApplyGateL<2, 2>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 1) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<4, 1>(qs, matrix, state); - } else { - ApplyGateL<3, 2>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 1) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<5, 1>(qs, matrix, state); - } else { - ApplyGateL<4, 2>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using SSE instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- // Assume cqs[0] < cqs[1] < cqs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 1: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 4: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } 
- } else { - if (cqs[0] > 1) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 4; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 ru, iu, rn, in; - __m128 rs[hsize], is[hsize]; - - uint64_t r = 4 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, unsigned q0, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m128 rn, in; - __m128 rs[gsize], is[gsize]; - - uint64_t r = 4 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm_load_ps(p0 + xss[k]); - is[k2] = _mm_load_ps(p0 + xss[k] + 4); - - if (L == 1) { - rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) - : _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) - : _mm_shuffle_ps(is[k2], is[k2], 78); - } else if (L == 2) { - rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); - is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); - rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); - rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); - is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, qs[0], size, raw_size, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 ru, iu, rn, in; - __m128 rs[hsize], is[hsize]; - - uint64_t r = 4 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 rn, in; - __m128 rs[hsize], is[hsize]; - - uint64_t r = 4 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H)]; - - auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m128 rn, in; - __m128 rs[gsize], is[gsize]; - - uint64_t r = 4 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm_load_ps(p0 + xss[k]); - is[k2] = _mm_load_ps(p0 + xss[k] + 4); - - if (L == 1) { - rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) - : _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) - : _mm_shuffle_ps(is[k2], is[k2], 78); - } else if (L == 2) { - rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); - is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); - rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); - rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); - is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H + L)]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - if (CH) { - auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size * size2, f, w, ms, xss, - m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get()); - } else { - auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size * size2, f, w, ms, xss, - m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get()); - } - } - - For for_; -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARY_CALCULATOR_SSE_H_ diff --git a/qsim/unitaryspace.h b/qsim/unitaryspace.h deleted file mode 100644 index b5e2691..0000000 --- a/qsim/unitaryspace.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARYSPACE_H_ -#define UNITARYSPACE_H_ - -#include - -namespace qsim { - -namespace unitary { - -/** - * Abstract class containing routines for general unitary matrix manipulations. - * "AVX", "AVX512", "Basic", and "SSE" implementations are provided. - */ -template class VectorSpace, typename... 
VSTypeParams> -class UnitarySpace : public VectorSpace { - private: - using Base = VectorSpace; - - public: - using fp_type = typename Base::fp_type; - using Unitary = typename Base::Vector; - - template - UnitarySpace(ForArgs&&... args) : Base(args...) {} - - static Unitary CreateUnitary(unsigned num_qubits) { - return Base::Create(num_qubits); - } - - static Unitary CreateUnitary(fp_type* p, unsigned num_qubits) { - return Base::Create(p, num_qubits); - } - - static Unitary NullUnitary() { - return Base::Null(); - } - - static uint64_t Size(unsigned num_qubits) { - return uint64_t{1} << num_qubits; - }; - - void CopyUnitary(const Unitary& src, Unitary& dest) const { - Base::Copy(src, dest); - } -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARYSPACE_H_ diff --git a/qsim/unitaryspace_avx.h b/qsim/unitaryspace_avx.h deleted file mode 100644 index c1ec59d..0000000 --- a/qsim/unitaryspace_avx.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARYSPACE_AVX_H_ -#define UNITARYSPACE_AVX_H_ - -#include - -#include -#include -#include -#include - -#include "unitaryspace.h" -#include "vectorspace.h" - -namespace qsim { - -namespace unitary { - -/** - * Object containing context and routines for unitary manipulations. - * Unitary is a vectorized sequence of eight real components followed by eight - * imaginary components. 
Eight single-precison floating numbers can be loaded - * into an AVX register. - */ -template -struct UnitarySpaceAVX : - public UnitarySpace, VectorSpace, For, float> { - private: - using Base = UnitarySpace, - qsim::VectorSpace, For, float>; - - public: - using Unitary = typename Base::Unitary; - using fp_type = typename Base::fp_type; - - template - explicit UnitarySpaceAVX(ForArgs&&... args) : Base(args...) {} - - static uint64_t MinRowSize(unsigned num_qubits) { - return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits)); - }; - - static uint64_t MinSize(unsigned num_qubits) { - return Base::Size(num_qubits) * MinRowSize(num_qubits); - }; - - void SetAllZeros(Unitary& state) const { - __m256 val0 = _mm256_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) { - _mm256_store_ps(p + 16 * i, val); - _mm256_store_ps(p + 16 * i + 8, val); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get()); - } - - void SetIdentity(Unitary& state) { - SetAllZeros(state); - - auto f = [](unsigned n, unsigned m, uint64_t i, - uint64_t row_size, fp_type* p) { - p[row_size * i + (16 * (i / 8)) + (i % 8)] = 1; - }; - - uint64_t size = Base::Size(state.num_qubits()); - uint64_t row_size = MinRowSize(state.num_qubits()); - Base::for_.Run(size, f, row_size, state.get()); - } - - static std::complex GetEntry(const Unitary& state, - uint64_t i, uint64_t j) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (16 * (j / 8)) + (j % 8); - return std::complex(state.get()[row_size * i + k], - state.get()[row_size * i + k + 8]); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, - const std::complex& ampl) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (16 * (j / 8)) + (j % 8); - state.get()[row_size * i + k] = std::real(ampl); - state.get()[row_size * i + k + 8] = std::imag(ampl); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, - 
fp_type im) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (16 * (j / 8)) + (j % 8); - state.get()[row_size * i + k] = re; - state.get()[row_size * i + k + 8] = im; - } -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARYSPACE_AVX_H_ diff --git a/qsim/unitaryspace_avx512.h b/qsim/unitaryspace_avx512.h deleted file mode 100644 index 4c23dc9..0000000 --- a/qsim/unitaryspace_avx512.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARYSPACE_AVX512_H_ -#define UNITARYSPACE_AVX512_H_ - -#include - -#include -#include -#include -#include - -#include "unitaryspace.h" -#include "vectorspace.h" - -namespace qsim { - -namespace unitary { - -/** - * Object containing context and routines for unitary manipulations. - * State is a vectorized sequence of sixteen real components followed by - * sixteen imaginary components. Sixteen single-precison floating numbers can - * be loaded into an AVX512 register. - */ -template -struct UnitarySpaceAVX512 : - public UnitarySpace, VectorSpace, For, float> { - private: - using Base = UnitarySpace, - qsim::VectorSpace, For, float>; - - public: - using Unitary = typename Base::Unitary; - using fp_type = typename Base::fp_type; - - template - explicit UnitarySpaceAVX512(ForArgs&&... args) : Base(args...) 
{} - - static uint64_t MinRowSize(unsigned num_qubits) { - return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits)); - }; - - static uint64_t MinSize(unsigned num_qubits) { - return Base::Size(num_qubits) * MinRowSize(num_qubits); - }; - - void SetAllZeros(Unitary& state) const { - __m512 val0 = _mm512_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) { - _mm512_store_ps(p + 32 * i, val0); - _mm512_store_ps(p + 32 * i + 16, val0); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get()); - } - - void SetIdentity(Unitary& state) { - SetAllZeros(state); - - auto f = [](unsigned n, unsigned m, uint64_t i, - uint64_t row_size, fp_type* p) { - p[row_size * i + (32 * (i / 16)) + (i % 16)] = 1; - }; - - uint64_t size = Base::Size(state.num_qubits()); - uint64_t row_size = MinRowSize(state.num_qubits()); - Base::for_.Run(size, f, row_size, state.get()); - } - - static std::complex GetEntry(const Unitary& state, - uint64_t i, uint64_t j) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (32 * (j / 16)) + (j % 16); - return std::complex(state.get()[row_size * i + k], - state.get()[row_size * i + k + 16]); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, - const std::complex& ampl) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (32 * (j / 16)) + (j % 16); - state.get()[row_size * i + k] = std::real(ampl); - state.get()[row_size * i + k + 16] = std::imag(ampl); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, - fp_type im) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (32 * (j / 16)) + (j % 16); - state.get()[row_size * i + k] = re; - state.get()[row_size * i + k + 16] = im; - } -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARYSPACE_AVX512_H_ diff --git a/qsim/unitaryspace_basic.h b/qsim/unitaryspace_basic.h deleted file mode 100644 index 2db14b6..0000000 --- 
a/qsim/unitaryspace_basic.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARYSPACE_BASIC_H_ -#define UNITARYSPACE_BASIC_H_ - -#include -#include -#include - -#include "unitaryspace.h" -#include "vectorspace.h" - -namespace qsim { - -namespace unitary { - -/** - * Object containing context and routines for unitary manipulations. - * Unitary is a non-vectorized sequence of one real amplitude followed by - * one imaginary amplitude. - */ -template -struct UnitarySpaceBasic - : public UnitarySpace, VectorSpace, For, FP> { - private: - using Base = UnitarySpace, - qsim::VectorSpace, For, FP>; - - public: - using Unitary = typename Base::Unitary; - using fp_type = typename Base::fp_type; - - template - explicit UnitarySpaceBasic(ForArgs&&... args) : Base(args...) 
{} - - static uint64_t MinRowSize(unsigned num_qubits) { - return 2 * (uint64_t{1} << num_qubits); - }; - - static uint64_t MinSize(unsigned num_qubits) { - return Base::Size(num_qubits) * MinRowSize(num_qubits); - }; - - void SetAllZeros(Unitary& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - p[2 * i + 0] = 0; - p[2 * i + 1] = 0; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get()); - } - - void SetIdentity(Unitary& state) { - SetAllZeros(state); - - auto f = [](unsigned n, unsigned m, uint64_t i, - uint64_t row_size, fp_type* p) { - p[row_size * i + 2 * i] = 1; - }; - - uint64_t size = Base::Size(state.num_qubits()); - uint64_t row_size = MinRowSize(state.num_qubits()); - Base::for_.Run(size, f, row_size, state.get()); - } - - static std::complex GetEntry(const Unitary& state, - uint64_t i, uint64_t j) { - uint64_t row_size = MinRowSize(state.num_qubits()); - return std::complex(state.get()[row_size * i + 2 * j], - state.get()[row_size * i + 2 * j + 1]); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, - const std::complex& ampl) { - uint64_t row_size = MinRowSize(state.num_qubits()); - state.get()[row_size * i + 2 * j] = std::real(ampl); - state.get()[row_size * i + 2 * j + 1] = std::imag(ampl); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, - fp_type re, fp_type im) { - uint64_t row_size = MinRowSize(state.num_qubits()); - state.get()[row_size * i + 2 * j] = re; - state.get()[row_size * i + 2 * j + 1] = im; - } -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARYSPACE_BASIC_H_ diff --git a/qsim/unitaryspace_sse.h b/qsim/unitaryspace_sse.h deleted file mode 100644 index f3762fb..0000000 --- a/qsim/unitaryspace_sse.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARYSPACE_SSE_H_ -#define UNITARYSPACE_SSE_H_ - -#include - -#include -#include -#include -#include - -#include "unitaryspace.h" -#include "vectorspace.h" - -namespace qsim { - -namespace unitary { - -/** - * Object containing context and routines for unitary manipulations. - * Unitary is a vectorized sequence of four real components followed by four - * imaginary components. Four single-precison floating numbers can be loaded - * into an SSE register. - */ -template -struct UnitarySpaceSSE : - public UnitarySpace, VectorSpace, For, float> { - private: - using Base = UnitarySpace, - qsim::VectorSpace, For, float>; - - public: - using Unitary = typename Base::Unitary; - using fp_type = typename Base::fp_type; - - template - explicit UnitarySpaceSSE(ForArgs&&... args) : Base(args...) 
{} - - static uint64_t MinRowSize(unsigned num_qubits) { - return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits)); - }; - - static uint64_t MinSize(unsigned num_qubits) { - return Base::Size(num_qubits) * MinRowSize(num_qubits); - }; - - void SetAllZeros(Unitary& state) const { - __m128 val0 = _mm_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) { - _mm_store_ps(p + 8 * i, val0); - _mm_store_ps(p + 8 * i + 4, val0); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get()); - } - - void SetIdentity(Unitary& state) { - SetAllZeros(state); - - auto f = [](unsigned n, unsigned m, uint64_t i, - uint64_t row_size, fp_type* p) { - p[row_size * i + (8 * (i / 4)) + (i % 4)] = 1; - }; - - uint64_t size = Base::Size(state.num_qubits()); - uint64_t row_size = MinRowSize(state.num_qubits()); - Base::for_.Run(size, f, row_size, state.get()); - } - - static std::complex GetEntry(const Unitary& state, - uint64_t i, uint64_t j) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (8 * (j / 4)) + (j % 4); - return std::complex(state.get()[row_size * i + k], - state.get()[row_size * i + k + 4]); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, - const std::complex& ampl) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (8 * (j / 4)) + (j % 4); - state.get()[row_size * i + k] = std::real(ampl); - state.get()[row_size * i + k + 4] = std::imag(ampl); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, - fp_type im) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (8 * (j / 4)) + (j % 4); - state.get()[row_size * i + k] = re; - state.get()[row_size * i + k + 4] = im; - } -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARYSPACE_SSE_H_ diff --git a/qsim/util.h b/qsim/util.h deleted file mode 100644 index 726a019..0000000 --- a/qsim/util.h +++ /dev/null @@ -1,89 +0,0 @@ -// Copyright 2019 
Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTIL_H_ -#define UTIL_H_ - -#include -#include -#include -#include -#include -#include -#include - -namespace qsim { - -template -inline void SplitString( - const std::string& str, char delim, Container& words) { - words.resize(0); - - std::string word; - std::stringstream ss(str); - - while (std::getline(ss, word, delim)) { - words.push_back(std::move(word)); - } -} - -template -inline void SplitString( - const std::string& str, char delim, Op op, Container& words) { - words.resize(0); - - std::string word; - std::stringstream ss(str); - - while (std::getline(ss, word, delim)) { - words.push_back(op(word)); - } -} - -inline double GetTime() { - using namespace std::chrono; - steady_clock::duration since_epoch = steady_clock::now().time_since_epoch(); - return double(since_epoch.count() * steady_clock::period::num) - / steady_clock::period::den; -} - -template -inline DistrRealType RandomValue(RGen& rgen, DistrRealType max_value) { - std::uniform_real_distribution distr(0.0, max_value); - return distr(rgen); -} - -template -inline std::vector GenerateRandomValues( - uint64_t num_samples, unsigned seed, DistrRealType max_value) { - std::vector rs; - rs.reserve(num_samples + 1); - - std::mt19937 rgen(seed); - std::uniform_real_distribution distr(0.0, max_value); - - for (uint64_t i = 0; i < num_samples; ++i) { - rs.emplace_back(distr(rgen)); - } - - 
std::sort(rs.begin(), rs.end()); - // Populate the final element to prevent sanitizer errors. - rs.emplace_back(max_value); - - return rs; -} - -} // namespace qsim - -#endif // UTIL_H_ diff --git a/qsim/util_cpu.h b/qsim/util_cpu.h deleted file mode 100644 index 8e02425..0000000 --- a/qsim/util_cpu.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTIL_CPU_H_ -#define UTIL_CPU_H_ - -#ifdef __SSE2__ -# include -#endif - -namespace qsim { - -// This function sets flush-to-zero and denormals-are-zeros MXCSR control -// flags. This prevents rare cases of performance slowdown potentially at -// the cost of a tiny precision loss. -inline void SetFlushToZeroAndDenormalsAreZeros() { -#ifdef __SSE2__ - _mm_setcsr(_mm_getcsr() | 0x8040); -#endif -} - -// This function clears flush-to-zero and denormals-are-zeros MXCSR control -// flags. -inline void ClearFlushToZeroAndDenormalsAreZeros() { -#ifdef __SSE2__ - _mm_setcsr(_mm_getcsr() & ~unsigned{0x8040}); -#endif -} - -} // namespace qsim - -#endif // UTIL_CPU_H_ diff --git a/qsim/util_cuda.h b/qsim/util_cuda.h deleted file mode 100644 index 5d8cb5d..0000000 --- a/qsim/util_cuda.h +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTIL_CUDA_H_ -#define UTIL_CUDA_H_ - -#ifdef __NVCC__ - #include -#elif __HIP__ - #include -#endif - -#include - -#include "io.h" - -namespace qsim { - -#define ErrorCheck(code) { ErrorAssert((code), __FILE__, __LINE__); } - -inline void ErrorAssert(cudaError_t code, const char* file, unsigned line) { - if (code != cudaSuccess) { - IO::errorf("CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line); - exit(code); - } -} - -template -struct Complex { - __host__ __device__ __forceinline__ Complex() {} - - __host__ __device__ __forceinline__ Complex(const T& re) : re(re), im(0) {} - - __host__ __device__ __forceinline__ Complex(const T& re, const T& im) - : re(re), im(im) {} - - template - __host__ __device__ __forceinline__ Complex& operator=( - const Complex& r) { - re = r.re; - im = r.im; - - return *this; - } - - T re; - T im; -}; - -template -__host__ __device__ __forceinline__ Complex operator+( - const Complex& l, const Complex& r) { - return Complex(l.re + r.re, l.im + r.im); -} - -template -__host__ __device__ __forceinline__ Complex operator+( - const Complex& l, const Complex& r) { - return Complex(l.re + r.re, l.im + r.im); -} - -template -struct Scalar { - using type = T; -}; - -template -struct Scalar> { - using type = T; -}; - -template -struct Plus { - template - __device__ __forceinline__ T operator()(const T& v1, const U& v2) const { - return v1 + v2; - } -}; - -template -struct Product { - __device__ __forceinline__ Complex operator()( - const T& re1, const T& im1, const T& re2, const T& im2) const { - return 
Complex(re1 * re2 + im1 * im2, re1 * im2 - im1 * re2); - } -}; - -template -struct RealProduct { - __device__ __forceinline__ T operator()( - const T& re1, const T& im1, const T& re2, const T& im2) const { - return re1 * re2 + im1 * im2; - } -}; - -template -__device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) { - for (unsigned i = warp_size / 2; i > 0; i /= 2) { - val = op(val, __shfl_down_sync(0xffffffff, val, i)); - } - - return val; -} - -template -__device__ __forceinline__ Complex WarpReduce(Complex val, Op op) { - for (unsigned i = warp_size / 2; i > 0; i /= 2) { - val.re = op(val.re, __shfl_down_sync(0xffffffff, val.re, i)); - val.im = op(val.im, __shfl_down_sync(0xffffffff, val.im, i)); - } - - return val; -} - -} // namespace qsim - -#endif // UTIL_CUDA_H_ diff --git a/qsim/util_custatevec.h b/qsim/util_custatevec.h deleted file mode 100644 index 36f29ef..0000000 --- a/qsim/util_custatevec.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTIL_CUSTATEVEC_H_ -#define UTIL_CUSTATEVEC_H_ - -#include -#include - -#include "io.h" -#include "util_cuda.h" - -namespace qsim { - -inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) { - if (code != CUBLAS_STATUS_SUCCESS) { - IO::errorf("cuBLAS error %i: %s %d\n", code, file, line); - exit(code); - } -} - -inline void ErrorAssert( - custatevecStatus_t code, const char* file, unsigned line) { - if (code != CUSTATEVEC_STATUS_SUCCESS) { - IO::errorf("custatevec error: %s %s %d\n", - custatevecGetErrorString(code), file, line); - exit(code); - } -} - -} // namespace qsim - -#endif // UTIL_CUSTATEVEC_H_ diff --git a/qsim/vectorspace.h b/qsim/vectorspace.h deleted file mode 100644 index 7b33a53..0000000 --- a/qsim/vectorspace.h +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef VECTORSPACE_H_ -#define VECTORSPACE_H_ - -#ifdef _WIN32 - #include -#endif - -#include -#include -#include -#include - -namespace qsim { - -namespace detail { - -inline void do_not_free(void*) {} - -inline void free(void* ptr) { -#ifdef _WIN32 - _aligned_free(ptr); -#else - ::free(ptr); -#endif -} - -} // namespace detail - -// Routines for vector manipulations. 
-template -class VectorSpace { - public: - using fp_type = FP; - - private: - using Pointer = std::unique_ptr; - - public: - class Vector { - public: - Vector() = delete; - - Vector(Pointer&& ptr, unsigned num_qubits) - : ptr_(std::move(ptr)), num_qubits_(num_qubits) {} - - fp_type* get() { - return ptr_.get(); - } - - const fp_type* get() const { - return ptr_.get(); - } - - fp_type* release() { - num_qubits_ = 0; - return ptr_.release(); - } - - unsigned num_qubits() const { - return num_qubits_; - } - - bool requires_copy_to_host() const { - return false; - } - - private: - Pointer ptr_; - unsigned num_qubits_; - }; - - template - VectorSpace(ForArgs&&... args) : for_(args...) {} - - static Vector Create(unsigned num_qubits) { - auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); - #ifdef _WIN32 - Pointer ptr{(fp_type*) _aligned_malloc(size, 64), &detail::free}; - return Vector{std::move(ptr), ptr.get() != nullptr ? num_qubits : 0}; - #else - void* p = nullptr; - if (posix_memalign(&p, 64, size) == 0) { - return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; - } else { - return Null(); - } - #endif - } - - // It is the client's responsibility to make sure that p has at least - // Impl::MinSize(num_qubits) elements. 
- static Vector Create(fp_type* p, unsigned num_qubits) { - return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; - } - - static Vector Null() { - return Vector{Pointer{nullptr, &detail::free}, 0}; - } - - static bool IsNull(const Vector& vec) { - return vec.get() == nullptr; - } - - static void Free(fp_type* ptr) { - detail::free(ptr); - } - - bool Copy(const Vector& src, Vector& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* src, fp_type* dest) { - dest[i] = src[i]; - }; - - for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest.get()); - - return true; - } - - // It is the client's responsibility to make sure that dest has at least - // Impl::MinSize(src.num_qubits()) elements. - bool Copy(const Vector& src, fp_type* dest) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* src, fp_type* dest) { - dest[i] = src[i]; - }; - - for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest); - - return true; - } - - // It is the client's responsibility to make sure that src has at least - // Impl::MinSize(dest.num_qubits()) elements. - bool Copy(const fp_type* src, Vector& dest) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* src, fp_type* dest) { - dest[i] = src[i]; - }; - - for_.Run(Impl::MinSize(dest.num_qubits()), f, src, dest.get()); - - return true; - } - - // It is the client's responsibility to make sure that src has at least - // min(size, Impl::MinSize(dest.num_qubits())) elements. 
- bool Copy(const fp_type* src, uint64_t size, Vector& dest) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* src, fp_type* dest) { - dest[i] = src[i]; - }; - - size = std::min(size, Impl::MinSize(dest.num_qubits())); - for_.Run(size, f, src, dest.get()); - - return true; - } - - void DeviceSync() {} - - protected: - For for_; -}; - -} // namespace qsim - -#endif // VECTORSPACE_H_ diff --git a/qsim/vectorspace_cuda.h b/qsim/vectorspace_cuda.h deleted file mode 100644 index fd91553..0000000 --- a/qsim/vectorspace_cuda.h +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef VECTORSPACE_CUDA_H_ -#define VECTORSPACE_CUDA_H_ - -#ifdef __NVCC__ - #include - #include -#elif __HIP__ - #include - #include "cuda2hip.h" -#endif - -#include -#include - -namespace qsim { - -namespace detail { - -inline void do_not_free(void*) {} - -inline void free(void* ptr) { - ErrorCheck(cudaFree(ptr)); -} - -} // namespace detail - -// Routines for vector manipulations. 
-template -class VectorSpaceCUDA { - public: - using fp_type = FP; - - private: - using Pointer = std::unique_ptr; - - public: - class Vector { - public: - Vector() = delete; - - Vector(Pointer&& ptr, unsigned num_qubits) - : ptr_(std::move(ptr)), num_qubits_(num_qubits) {} - - fp_type* get() { - return ptr_.get(); - } - - const fp_type* get() const { - return ptr_.get(); - } - - fp_type* release() { - num_qubits_ = 0; - return ptr_.release(); - } - - unsigned num_qubits() const { - return num_qubits_; - } - - bool requires_copy_to_host() const { - return true; - } - - private: - Pointer ptr_; - unsigned num_qubits_; - }; - - template - VectorSpaceCUDA(Args&&... args) {} - - static Vector Create(unsigned num_qubits) { - fp_type* p; - auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); - auto rc = cudaMalloc(&p, size); - - if (rc == cudaSuccess) { - return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; - } else { - return Null(); - } - } - - // It is the client's responsibility to make sure that p has at least - // Impl::MinSize(num_qubits) elements. - static Vector Create(fp_type* p, unsigned num_qubits) { - return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; - } - - static Vector Null() { - return Vector{Pointer{nullptr, &detail::free}, 0}; - } - - static bool IsNull(const Vector& vector) { - return vector.get() == nullptr; - } - - static void Free(fp_type* ptr) { - detail::free(ptr); - } - - bool Copy(const Vector& src, Vector& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - ErrorCheck( - cudaMemcpy(dest.get(), src.get(), - sizeof(fp_type) * Impl::MinSize(src.num_qubits()), - cudaMemcpyDeviceToDevice)); - - return true; - } - - // It is the client's responsibility to make sure that dest has at least - // Impl::MinSize(src.num_qubits()) elements. 
- bool Copy(const Vector& src, fp_type* dest) const { - ErrorCheck( - cudaMemcpy(dest, src.get(), - sizeof(fp_type) * Impl::MinSize(src.num_qubits()), - cudaMemcpyDeviceToHost)); - - return true; - } - - // It is the client's responsibility to make sure that src has at least - // Impl::MinSize(dest.num_qubits()) elements. - bool Copy(const fp_type* src, Vector& dest) const { - ErrorCheck( - cudaMemcpy(dest.get(), src, - sizeof(fp_type) * Impl::MinSize(dest.num_qubits()), - cudaMemcpyHostToDevice)); - - return true; - } - - // It is the client's responsibility to make sure that src has at least - // min(size, Impl::MinSize(dest.num_qubits())) elements. - bool Copy(const fp_type* src, uint64_t size, Vector& dest) const { - size = std::min(size, Impl::MinSize(dest.num_qubits())); - ErrorCheck( - cudaMemcpy(dest.get(), src, - sizeof(fp_type) * size, - cudaMemcpyHostToDevice)); - return true; - } - - void DeviceSync() { - ErrorCheck(cudaDeviceSynchronize()); - } - - protected: -}; - -} // namespace qsim - -#endif // VECTORSPACE_CUDA_H_ From 39223c35d01f8a6a16433bbe86fdbfe2d0e9564c Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Tue, 5 Nov 2024 23:04:56 -0500 Subject: [PATCH 03/64] Clean up results printing and output from Vicente --- app/qir-qsim.cc | 14 ++++++++++++++ src/qirqsim/BufferManager.cc | 15 ++++++++++++++- src/qirqsim/BufferManager.hh | 3 +++ src/qirqsim/qsimDefaultRuntime.cc | 10 ++++------ src/qirqsim/qsimDefaultRuntime.hh | 13 ++++++++----- src/qirqsim/qsimQuantum.cc | 16 +++++++++++++++- src/qirqsim/qsimQuantum.hh | 5 ++++- 7 files changed, 62 insertions(+), 14 deletions(-) diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc index 809b686..75f1612 100644 --- a/app/qir-qsim.cc +++ b/app/qir-qsim.cc @@ -54,6 +54,20 @@ void run(std::string const& filename, for (int i = 0; i < num_shots; i++){ execute(sim, *rt); } + + std::cout << std::endl; + std::cout << "Measurement output:" << std::endl; + std::cout << 
"-------------------" << std::endl; + std::cout << "Number of shots: " << num_shots << std::endl; + std::cout << "Number of qubits: " << sim.num_qubits() << std::endl; + + for(int q_index = 0; q_index < sim.num_qubits(); q_index++){ + int value_0 = 0; + int value_1 = 0; + if (auto value = sim.manager.getBufferValue("q"+std::to_string(q_index), "0"); value.has_value()){ value_0 = value.value();} + if (auto value = sim.manager.getBufferValue("q"+std::to_string(q_index), "1"); value.has_value()){ value_1 = value.value();} + std::cout << "q" << q_index << " {0: " << value_0 << "," << " 1: " << value_1 << "}\n"; + } } //---------------------------------------------------------------------------// diff --git a/src/qirqsim/BufferManager.cc b/src/qirqsim/BufferManager.cc index 2e6f646..46931d9 100644 --- a/src/qirqsim/BufferManager.cc +++ b/src/qirqsim/BufferManager.cc @@ -23,6 +23,11 @@ void BufferManager::updateBuffer(const std::string& qubit, const std::string& st buffer[{qubit, state}] = value + current_frequency; } +void BufferManager::updateBuffer(const std::string& key, const int& value) { + // Insert or update the key-value pair in the buffer + simple_buffer[key] = value; +} + std::optional BufferManager::getBufferValue(const std::string& qubit, const std::string& state) const { std::pair searchKey = {qubit, state}; auto it = buffer.find(searchKey); @@ -30,4 +35,12 @@ std::optional BufferManager::getBufferValue(const std::string& qubit, const return it->second; // Key found } return std::nullopt; // Key not found -} \ No newline at end of file +} + +std::optional BufferManager::getBufferValue(const std::string& key) const { + auto it = simple_buffer.find(key); + if (it != simple_buffer.end()) { + return it->second; // Key found + } + return std::nullopt; // Key not found +} diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh index dc03846..efb3800 100644 --- a/src/qirqsim/BufferManager.hh +++ b/src/qirqsim/BufferManager.hh @@ -32,14 +32,17 @@ 
public: // Method to update the buffer with a key-value pair void updateBuffer(const std::string& qubit, const std::string& state, const int& value); + void updateBuffer(const std::string& key, const int& value); // Retrieve buffer value for storage or evaluation std::optional getBufferValue(const std::string& qubit, const std::string& state) const; + std::optional getBufferValue(const std::string& key) const; private: // Dictionary to store key-value pairs std::unordered_map, int, pair_hash> buffer; + std::unordered_map simple_buffer; }; #endif // BUFFER_MANAGER_H diff --git a/src/qirqsim/qsimDefaultRuntime.cc b/src/qirqsim/qsimDefaultRuntime.cc index 955959d..339703a 100644 --- a/src/qirqsim/qsimDefaultRuntime.cc +++ b/src/qirqsim/qsimDefaultRuntime.cc @@ -57,15 +57,13 @@ void qsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag) void qsimDefaultRuntime::result_record_output(Result r, OptionalCString tag) { // Access values through the getter - // TODO: This prints results 'every time' result_record_output is called. 
Maybe enough to only print the 'final time' + // This prints results every time result_record_output is called + // Can comment out if only want to see final results - if (auto value = sim_.manager.getBufferValue("q"+std::to_string(r.value), "0"); value.has_value()) { - std::cout << "q" << std::to_string(r.value) << " |0> freq: " << value.value() << "\n"; + if (auto value = sim_.manager.getBufferValue("q"+std::to_string(r.value)); value.has_value()) { + std::cout << "q" << std::to_string(r.value) << " : " << value.value() << "\n"; } - if (auto value = sim_.manager.getBufferValue("q"+std::to_string(r.value), "1"); value.has_value()) { - std::cout << "q" << std::to_string(r.value) << " |1> freq: " << value.value() << "\n"; - } } } // namespace qiree diff --git a/src/qirqsim/qsimDefaultRuntime.hh b/src/qirqsim/qsimDefaultRuntime.hh index 70dfdd4..26f06ab 100644 --- a/src/qirqsim/qsimDefaultRuntime.hh +++ b/src/qirqsim/qsimDefaultRuntime.hh @@ -17,11 +17,14 @@ namespace qiree * * Example for three qubits: * \code - * q0 |0> freq: 509 - * q0 |1> freq: 515 - * q1 |0> freq: 509 - * q1 |1> freq: 515 - * q2 |1> freq: 1024 + * Measurement output: + * ------------------- + * Number of shots: 1024 + * Number of qubits: 3 + * q0 {0: 542, 1: 482} + * q1 {0: 521, 1: 503} + * q2 {0: 0, 1: 1024} + * * \endcode */ diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/qsimQuantum.cc index 81f40ef..74d510d 100644 --- a/src/qirqsim/qsimQuantum.cc +++ b/src/qirqsim/qsimQuantum.cc @@ -68,6 +68,7 @@ qsimQuantum::State qsimQuantum::init_state_space() { //check if StateSpace is th /* Prepare to build a quantum circuit for an entry point */ + void qsimQuantum::set_up(EntryPointAttrs const& attrs) { QIREE_VALIDATE(attrs.required_num_qubits > 0, << "input is not a quantum program"); @@ -78,13 +79,20 @@ void qsimQuantum::set_up(EntryPointAttrs const& attrs) { state_ = std::make_shared(init_state_space()); // Set the state space? Maybe. 
q_circuit.num_qubits = num_qubits_; // Allocate the number of qubits in the circuit execution_time = 0; // Initialize execution time - + static unsigned int rep = 0; + rep++; + this->repCount(rep); } //---------------------------------------------------------------------------// /* Complete an execution */ + +void qsimQuantum::repCount(int rep) { + repetition = rep; +} + void qsimQuantum::tear_down() { q_circuit = {}; q_circuit.num_qubits = num_qubits_; @@ -95,6 +103,7 @@ void qsimQuantum::tear_down() { /* Reset the qubit */ + void qsimQuantum::reset(Qubit q) { q.value=0; } @@ -103,6 +112,7 @@ void qsimQuantum::reset(Qubit q) { /* Read the value of a result. This utilizes the new BufferManager. */ + QState qsimQuantum::read_result(Result r) { std::string q_index_string = std::to_string(r.value); @@ -113,8 +123,10 @@ QState qsimQuantum::read_result(Result r) std::string stringResult = std::to_string(bitResult); if (stringResult == "1"){ manager.updateBuffer("q"+q_index_string, "1", 1); + manager.updateBuffer("q"+q_index_string, 1); } else{ manager.updateBuffer("q"+q_index_string, "0", 1); + manager.updateBuffer("q"+q_index_string, 0); } } else { qsim::IO::errorf("Unexpected measurement results encountered."); @@ -127,6 +139,7 @@ QState qsimQuantum::read_result(Result r) Map a qubit to a result index (TODO: find how to link the classical register to the quantum register in qsim) */ + void qsimQuantum::mz(Qubit q, Result r) { //we don't classical register yet. QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in the set of qubits, e.g., what happens if q=5 and qubits are {2,3,4,5}, q is less than num_qubits but not it is in the set of qubits. // Add measurement instruction @@ -139,6 +152,7 @@ void qsimQuantum::mz(Qubit q, Result r) { //we don't classical register yet. /* Quantum Instruction Mapping */ + // 1. 
Entangling gates void qsimQuantum::cx(Qubit q1, Qubit q2) { q_circuit.gates.push_back( diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/qsimQuantum.hh index e720e8c..cfdfc4b 100644 --- a/src/qirqsim/qsimQuantum.hh +++ b/src/qirqsim/qsimQuantum.hh @@ -135,8 +135,11 @@ namespace qiree qsim::Circuit> get_circuit() const { return q_circuit; } // Get the state space State const& get_state() const { return *state_; } - // update the buffer + // Update the buffer BufferManager manager; + // Number of repetitions + int repetition; + void repCount(int rep); private: //// TYPES //// From c699ae48cf8a82566e0582853a15efd3d3682e14 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Tue, 5 Nov 2024 23:05:52 -0500 Subject: [PATCH 04/64] Add dyanamic BV example from Vicente --- examples/dynamicbv.ll | 101 ++++++++++++++++++++++++++++++++++++++++++ examples/teleport.ll | 2 +- 2 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 examples/dynamicbv.ll diff --git a/examples/dynamicbv.ll b/examples/dynamicbv.ll new file mode 100644 index 0000000..6d48157 --- /dev/null +++ b/examples/dynamicbv.ll @@ -0,0 +1,101 @@ +; ModuleID = 'dynamicbv' +source_filename = "dynamicbv" + +; ModuleID = 'BernsteinVazirani' +source_filename = "bv_algorithm" + +%Qubit = type opaque +%Result = type opaque + +define void @main() #0 { +entry: + ; Initialize qubits + call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; set ancillary qubit + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; + + + ; Apply CNOT for bit '1' + call void @__quantum__qis__cnot__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*)) ; kickback phase on q0 + call void @__quantum__qis__h__body(%Qubit* null) ; correcting eigenvalue + + ; Mid-circuit measurement + call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the first 
bit + call i1 @__quantum__qis__read_result__body(%Result* null) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; just to reset ancillary qubit + call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*)) + + ; Output the results + call void @__quantum__rt__array_record_output(i64 2, i8* null) + call void @__quantum__rt__result_record_output(%Result* null, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) + + ; Initialize qubits + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; set ancillary qubit + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; + call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit + + ; Apply Identiry for bit '0' + ; Nothing + + ; Mid-circuit measurement + call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the first bit + call i1 @__quantum__qis__read_result__body(%Result* null) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; just to reset ancillary qubit + call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*)) + + ; Output the results + call void @__quantum__rt__array_record_output(i64 2, i8* null) + call void @__quantum__rt__result_record_output(%Result* null, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) + + ; Initialize qubits + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; set ancillary qubit + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; + call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit + + ; Apply CNOT for bit '1' + call void @__quantum__qis__cnot__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*)) ; kickback phase on q0 + call void 
@__quantum__qis__h__body(%Qubit* null) ; correcting eigenvalue + + ; Mid-circuit measurement + call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the first bit + call i1 @__quantum__qis__read_result__body(%Result* null) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; just to reset ancillary qubit + call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*)) + + ; Output the results + call void @__quantum__rt__array_record_output(i64 2, i8* null) + call void @__quantum__rt__result_record_output(%Result* null, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) + + ret void +} + +; Declaration of quantum operations +declare void @__quantum__qis__h__body(%Qubit*) +declare void @__quantum__qis__x__body(%Qubit*) +declare void @__quantum__qis__cnot__body(%Qubit*, %Qubit*) +declare void @__quantum__qis__mz__body(%Qubit*, %Result*) +declare i1 @__quantum__qis__read_result__body(%Result*) + +; Quantum runtime functions for managing qubits and results +declare %Qubit* @__quantum__rt__qubit_allocate() +declare %Result* @__quantum__rt__result_allocate() +declare void @__quantum__rt__qubit_release(%Qubit*) +declare void @__quantum__rt__result_release(%Result*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) +declare void @__quantum__rt__array_record_output(i64, i8*) + + + +attributes #0 = { "entry_point" "num_required_qubits"="2" "num_required_results"="2" "output_labeling_schema" "qir_profiles"="custom" } +attributes #1 = { "irreversible" } + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = !{i32 1, !"qir_major_version", i32 1} +!1 = !{i32 7, !"qir_minor_version", i32 0} +!2 = !{i32 1, !"dynamic_qubit_management", i1 false} +!3 = !{i32 1, !"dynamic_result_management", i1 false} + diff --git a/examples/teleport.ll b/examples/teleport.ll index 184359f..6fcb74e 100644 --- 
a/examples/teleport.ll +++ b/examples/teleport.ll @@ -38,7 +38,7 @@ else2: ; preds = %continue continue3: ; preds = %else2, %then1 call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) - %2 = call i2 @__quantum__qis__read_result__body(%Result* inttoptr (i64 2 to %Result*)) + %2 = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 2 to %Result*)) call void @__quantum__rt__array_record_output(i64 3, i8* null) call void @__quantum__rt__result_record_output(%Result* null, i8* null) call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) From 65054f280877512ae590a5e31247478971c08c06 Mon Sep 17 00:00:00 2001 From: Seth R Johnson Date: Fri, 22 Nov 2024 13:57:21 -0500 Subject: [PATCH 05/64] Revert changes to upstream and remove tpls --- CMakeLists.txt | 36 +- app/CMakeLists.txt | 51 +- cmake/FindLLVM.cmake | 8 - tpls/qsim/bits.h | 106 -- tpls/qsim/bitstring.h | 97 -- tpls/qsim/channel.h | 149 --- tpls/qsim/channels_cirq.h | 471 ------- tpls/qsim/channels_qsim.h | 117 -- tpls/qsim/circuit.h | 36 - tpls/qsim/circuit_noisy.h | 108 -- tpls/qsim/circuit_qsim_parser.h | 442 ------- tpls/qsim/cuda2hip.h | 61 - tpls/qsim/expect.h | 148 --- tpls/qsim/formux.h | 30 - tpls/qsim/fuser.h | 225 ---- tpls/qsim/fuser_basic.h | 411 ------- tpls/qsim/fuser_mqubit.h | 1095 ----------------- tpls/qsim/gate.h | 216 ---- tpls/qsim/gate_appl.h | 231 ---- tpls/qsim/gates_cirq.h | 1640 ------------------------- tpls/qsim/gates_qsim.h | 661 ---------- tpls/qsim/hybrid.h | 612 --------- tpls/qsim/io.h | 44 - tpls/qsim/io_file.h | 71 -- tpls/qsim/matrix.h | 296 ----- tpls/qsim/mps_simulator.h | 246 ---- tpls/qsim/mps_statespace.h | 597 --------- tpls/qsim/parfor.h | 123 -- tpls/qsim/qtrajectory.h | 435 ------- tpls/qsim/run_qsim.h | 262 ---- tpls/qsim/run_qsimh.h | 120 -- tpls/qsim/seqfor.h | 68 - tpls/qsim/simmux.h | 44 - tpls/qsim/simmux_gpu.h | 30 - tpls/qsim/simulator.h | 516 -------- 
tpls/qsim/simulator_avx.h | 1363 -------------------- tpls/qsim/simulator_avx512.h | 846 ------------- tpls/qsim/simulator_basic.h | 349 ------ tpls/qsim/simulator_cuda.h | 923 -------------- tpls/qsim/simulator_cuda_kernels.h | 683 ---------- tpls/qsim/simulator_custatevec.h | 209 ---- tpls/qsim/simulator_sse.h | 864 ------------- tpls/qsim/statespace.h | 145 --- tpls/qsim/statespace_avx.h | 497 -------- tpls/qsim/statespace_avx512.h | 448 ------- tpls/qsim/statespace_basic.h | 300 ----- tpls/qsim/statespace_cuda.h | 470 ------- tpls/qsim/statespace_cuda_kernels.h | 355 ------ tpls/qsim/statespace_custatevec.h | 376 ------ tpls/qsim/statespace_sse.h | 462 ------- tpls/qsim/umux.h | 52 - tpls/qsim/unitary_calculator_avx.h | 1028 ---------------- tpls/qsim/unitary_calculator_avx512.h | 644 ---------- tpls/qsim/unitary_calculator_basic.h | 259 ---- tpls/qsim/unitary_calculator_sse.h | 639 ---------- tpls/qsim/unitaryspace.h | 65 - tpls/qsim/unitaryspace_avx.h | 112 -- tpls/qsim/unitaryspace_avx512.h | 112 -- tpls/qsim/unitaryspace_basic.h | 103 -- tpls/qsim/unitaryspace_sse.h | 112 -- tpls/qsim/util.h | 89 -- tpls/qsim/util_cpu.h | 43 - tpls/qsim/util_cuda.h | 128 -- tpls/qsim/util_custatevec.h | 44 - tpls/qsim/vectorspace.h | 185 --- tpls/qsim/vectorspace_cuda.h | 172 --- 66 files changed, 5 insertions(+), 21875 deletions(-) delete mode 100644 tpls/qsim/bits.h delete mode 100644 tpls/qsim/bitstring.h delete mode 100644 tpls/qsim/channel.h delete mode 100644 tpls/qsim/channels_cirq.h delete mode 100644 tpls/qsim/channels_qsim.h delete mode 100644 tpls/qsim/circuit.h delete mode 100644 tpls/qsim/circuit_noisy.h delete mode 100644 tpls/qsim/circuit_qsim_parser.h delete mode 100644 tpls/qsim/cuda2hip.h delete mode 100644 tpls/qsim/expect.h delete mode 100644 tpls/qsim/formux.h delete mode 100644 tpls/qsim/fuser.h delete mode 100644 tpls/qsim/fuser_basic.h delete mode 100644 tpls/qsim/fuser_mqubit.h delete mode 100644 tpls/qsim/gate.h delete mode 100644 
tpls/qsim/gate_appl.h delete mode 100644 tpls/qsim/gates_cirq.h delete mode 100644 tpls/qsim/gates_qsim.h delete mode 100644 tpls/qsim/hybrid.h delete mode 100644 tpls/qsim/io.h delete mode 100644 tpls/qsim/io_file.h delete mode 100644 tpls/qsim/matrix.h delete mode 100644 tpls/qsim/mps_simulator.h delete mode 100644 tpls/qsim/mps_statespace.h delete mode 100644 tpls/qsim/parfor.h delete mode 100644 tpls/qsim/qtrajectory.h delete mode 100644 tpls/qsim/run_qsim.h delete mode 100644 tpls/qsim/run_qsimh.h delete mode 100644 tpls/qsim/seqfor.h delete mode 100644 tpls/qsim/simmux.h delete mode 100644 tpls/qsim/simmux_gpu.h delete mode 100644 tpls/qsim/simulator.h delete mode 100644 tpls/qsim/simulator_avx.h delete mode 100644 tpls/qsim/simulator_avx512.h delete mode 100644 tpls/qsim/simulator_basic.h delete mode 100644 tpls/qsim/simulator_cuda.h delete mode 100644 tpls/qsim/simulator_cuda_kernels.h delete mode 100644 tpls/qsim/simulator_custatevec.h delete mode 100644 tpls/qsim/simulator_sse.h delete mode 100644 tpls/qsim/statespace.h delete mode 100644 tpls/qsim/statespace_avx.h delete mode 100644 tpls/qsim/statespace_avx512.h delete mode 100644 tpls/qsim/statespace_basic.h delete mode 100644 tpls/qsim/statespace_cuda.h delete mode 100644 tpls/qsim/statespace_cuda_kernels.h delete mode 100644 tpls/qsim/statespace_custatevec.h delete mode 100644 tpls/qsim/statespace_sse.h delete mode 100644 tpls/qsim/umux.h delete mode 100644 tpls/qsim/unitary_calculator_avx.h delete mode 100644 tpls/qsim/unitary_calculator_avx512.h delete mode 100644 tpls/qsim/unitary_calculator_basic.h delete mode 100644 tpls/qsim/unitary_calculator_sse.h delete mode 100644 tpls/qsim/unitaryspace.h delete mode 100644 tpls/qsim/unitaryspace_avx.h delete mode 100644 tpls/qsim/unitaryspace_avx512.h delete mode 100644 tpls/qsim/unitaryspace_basic.h delete mode 100644 tpls/qsim/unitaryspace_sse.h delete mode 100644 tpls/qsim/util.h delete mode 100644 tpls/qsim/util_cpu.h delete mode 100644 
tpls/qsim/util_cuda.h delete mode 100644 tpls/qsim/util_custatevec.h delete mode 100644 tpls/qsim/vectorspace.h delete mode 100644 tpls/qsim/vectorspace_cuda.h diff --git a/CMakeLists.txt b/CMakeLists.txt index a536e86..bd57739 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,15 +43,10 @@ qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS}) # Assertion handling option(QIREE_DEBUG "Enable runtime assertions" ON) -# Enforce mutual exclusivity -if(QIREE_USE_XACC) - set(QIREE_USE_QSIM OFF CACHE BOOL "Build qsim interface" FORCE) - message(STATUS "QIREE_USE_XACC is ON, setting QIREE_USE_QSIM to OFF.") -elseif(QIREE_USE_QSIM) - set(QIREE_USE_XACC OFF CACHE BOOL "Build XACC interface" FORCE) - message(STATUS "QIREE_USE_QSIM is ON, setting QIREE_USE_XACC to OFF.") -endif() +qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS}) +# Assertion handling +option(QIREE_DEBUG "Enable runtime assertions" ON) #----------------------------------------------------------------------------# # CMAKE INTRINSIC OPTIONS @@ -185,31 +180,6 @@ if(QIREE_BUILD_TESTS) add_subdirectory(test) endif() -#----------------------------------------------------------------------------# -# OPENMP -#----------------------------------------------------------------------------# - -# Manually set OpenMP flags for macOS with libomp -if(APPLE) - set(OpenMP_C_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/include") - set(OpenMP_CXX_FLAGS "-Xpreprocessor -fopenmp -I/opt/homebrew/include") - set(OpenMP_C_LIB_NAMES "omp") - set(OpenMP_CXX_LIB_NAMES "omp") - set(OpenMP_omp_LIBRARY "/opt/homebrew/lib/libomp.dylib") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") - link_directories("/opt/homebrew/lib") -endif() - -# Now try to find OpenMP -find_package(OpenMP REQUIRED) - -if(OpenMP_FOUND) - message(STATUS "OpenMP found") -else() - message(FATAL_ERROR "OpenMP support is required but was not found.") -endif() - 
#----------------------------------------------------------------------------# # APPLICATIONS AND BINARIES #----------------------------------------------------------------------------# diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index fb78caa..ea7589a 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -6,16 +6,15 @@ include(FetchContent) FetchContent_Declare( + # Command Line Parser for C++ programs cli11_proj QUIET - GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git # Command Line Parser for C++ programs + GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git GIT_TAG f4d0731cebb123ff0ace712c099dffbcd2c58e5a # v2.4.1 ) FetchContent_MakeAvailable(cli11_proj) -# Conditionally add XACC-based executable - if(QIREE_USE_XACC) qiree_add_executable(qir-xacc qir-xacc.cc @@ -26,50 +25,4 @@ if(QIREE_USE_XACC) ) endif() -# Conditionally download and configure qsim library - -if(QIREE_USE_QSIM) - FetchContent_Declare( - qsim_lib - GIT_REPOSITORY https://github.com/quantumlib/qsim.git - GIT_TAG master # Use a specific commit/tag if needed - ) - - FetchContent_GetProperties(qsim_lib) - - if(NOT qsim_lib_POPULATED) - FetchContent_MakeAvailable(qsim_lib) - - # Copy header files to tpls/qsim - file(MAKE_DIRECTORY ${CMAKE_SOURCE_DIR}/tpls/qsim) - message(STATUS "Copying qsim headers to ${CMAKE_SOURCE_DIR}/tpls/qsim") - file(GLOB qsim_headers "${qsim_lib_SOURCE_DIR}/lib/*.h") - file(COPY ${qsim_headers} DESTINATION ${CMAKE_SOURCE_DIR}/tpls/qsim) - endif() - - find_package(OpenMP REQUIRED) - - if(OpenMP_CXX_FOUND) - target_link_libraries(qirqsim PUBLIC OpenMP::OpenMP_CXX) - endif() - # Collect source files for the qsim library - #file(GLOB SRC "${CMAKE_SOURCE_DIR}/src/qirqsim/*.cc") - - # Add qsim library with the correct include directories - - #add_library(qsim SHARED ${SRC}) - #target_include_directories(qsim - # PUBLIC - # ${CMAKE_SOURCE_DIR}/tpls/qsim # qsim headers - # ${CMAKE_SOURCE_DIR}/tpls/qsim/lib # Additional qsim headers if needed - # ) - - # Add the 
qir-qsim executable and link it with qsim - qiree_add_executable(qir-qsim qir-qsim.cc) - target_link_libraries(qir-qsim - PUBLIC QIREE::qiree QIREE::qirqsim - PRIVATE CLI11::CLI11 - ) -endif() - #-----------------------------------------------------------------------------# diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake index 1994269..f363f67 100644 --- a/cmake/FindLLVM.cmake +++ b/cmake/FindLLVM.cmake @@ -28,12 +28,6 @@ This module will set the following variables if found: include(FindPackageHandleStandardArgs) -# Check if the system is macOS -if(APPLE) - # Set LLVM_DIR to the Homebrew location if using macOS - set(LLVM_DIR "/opt/homebrew/opt/llvm/lib/cmake/llvm" CACHE PATH "Path to LLVM on macOS") -endif() - find_package(LLVM QUIET CONFIG) find_package_handle_standard_args(LLVM CONFIG_MODE) @@ -42,8 +36,6 @@ if(LLVM_FOUND) target_include_directories(LLVM::headers SYSTEM INTERFACE "${LLVM_INCLUDE_DIRS}" ) -else() - message(WARNING "Could not find LLVM. Make sure LLVM is installed and LLVM_DIR is set.") endif() #-----------------------------------------------------------------------------# diff --git a/tpls/qsim/bits.h b/tpls/qsim/bits.h deleted file mode 100644 index 080c866..0000000 --- a/tpls/qsim/bits.h +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef BITS_H_ -#define BITS_H_ - -#include - -#ifdef __BMI2__ - -#include - -#include - -namespace qsim { -namespace bits { - -inline uint32_t ExpandBits(uint32_t bits, unsigned n, uint32_t mask) { - return _pdep_u32(bits, mask); -} - -inline uint64_t ExpandBits(uint64_t bits, unsigned n, uint64_t mask) { - return _pdep_u64(bits, mask); -} - -inline uint32_t CompressBits(uint32_t bits, unsigned n, uint32_t mask) { - return _pext_u32(bits, mask); -} - -inline uint64_t CompressBits(uint64_t bits, unsigned n, uint64_t mask) { - return _pext_u64(bits, mask); -} - -} // namespace bits -} // namespace qsim - -#else // __BMI2__ - -namespace qsim { -namespace bits { - -template -inline Integer ExpandBits(Integer bits, unsigned n, Integer mask) { - Integer ebits = 0; - unsigned k = 0; - - for (unsigned i = 0; i < n; ++i) { - if ((mask >> i) & 1) { - ebits |= ((bits >> k) & 1) << i; - ++k; - } - } - - return ebits; -} - -template -inline Integer CompressBits(Integer bits, unsigned n, Integer mask) { - Integer sbits = 0; - unsigned k = 0; - - for (unsigned i = 0; i < n; ++i) { - if ((mask >> i) & 1) { - sbits |= ((bits >> i) & 1) << k; - ++k; - } - } - - return sbits; -} - -} // namespace bits -} // namespace qsim - -#endif // __BMI2__ - -namespace qsim { -namespace bits { - -template -inline Integer PermuteBits( - Integer bits, unsigned n, const std::vector& perm) { - Integer pbits = 0; - - for (unsigned i = 0; i < n; ++i) { - pbits |= ((bits >> i) & 1) << perm[i]; - } - - return pbits; -} - -} // namespace bits -} // namespace qsim - -#endif // BITS_H_ diff --git a/tpls/qsim/bitstring.h b/tpls/qsim/bitstring.h deleted file mode 100644 index b95584b..0000000 --- a/tpls/qsim/bitstring.h +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef BITSTRING_H_ -#define BITSTRING_H_ - -#include -#include -#include -#include - -namespace qsim { - -using Bitstring = uint64_t; - -/** - * Reads bitstrings (representing initialized or measured states of qubits) - * from a provided stream object and stores them in a vector. - * @param num_qubits Number of qubits represented in each bitstring. - * @param provider Source of bitstrings; only used for error reporting. - * @param fs The stream to read bitstrings from. - * @param bitstrings Output vector of bitstrings. On success, this will contain - * all bitstrings read in from 'fs'. - * @return True if reading succeeded; false otherwise. - */ -template -bool BitstringsFromStream(unsigned num_qubits, const std::string& provider, - Stream& fs, std::vector& bitstrings) { - bitstrings.resize(0); - bitstrings.reserve(100000); - - // Bitstrings are in text format. One bitstring per line. - - do { - char buf[128]; - fs.getline(buf, 128); - - if (fs) { - Bitstring b{0}; - - unsigned p = 0; - while (p < 128 && (buf[p] == '0' || buf[p] == '1')) { - b |= uint64_t(buf[p] - '0') << p; - ++p; - } - - if (p != num_qubits) { - IO::errorf("wrong bitstring length in %s: " - "got %u; should be %u.\n", provider.c_str(), p, num_qubits); - bitstrings.resize(0); - return false; - } - - bitstrings.push_back(b); - } - } while (fs); - - return true; -} - -/** - * Reads bitstrings (representing initialized or measured states of qubits) - * from the given file and stores them in a vector. - * @param num_qubits Number of qubits represented in each bitstring. 
- * @param file The name of the file to read bitstrings from. - * @param bitstrings Output vector of bitstrings. On success, this will contain - * all bitstrings read in from 'file'. - * @return True if reading succeeded; false otherwise. - */ -template -inline bool BitstringsFromFile(unsigned num_qubits, const std::string& file, - std::vector& bitstrings) { - auto fs = IO::StreamFromFile(file); - - if (!fs) { - return false; - } else { - bool rc = BitstringsFromStream(num_qubits, file, fs, bitstrings); - IO::CloseStream(fs); - return rc; - } -} - -} // namespace qsim - -#endif // BITSTRING_H_ diff --git a/tpls/qsim/channel.h b/tpls/qsim/channel.h deleted file mode 100644 index 372a174..0000000 --- a/tpls/qsim/channel.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CHANNEL_H_ -#define CHANNEL_H_ - -#include -#include - -#include "gate.h" -#include "matrix.h" - -namespace qsim { - -/** - * Kraus operator. - */ -template -struct KrausOperator { - using fp_type = typename Gate::fp_type; - - enum Kind { - kNormal = 0, - kMeasurement = gate::kMeasurement, - }; - - /** - * Kraus operator type; - */ - Kind kind; - - /** - * If true, the Kraus operator is a unitary operator times a constant. - */ - bool unitary; - - /** - * Lower bound on Kraus operator probability. - */ - double prob; - - /** - * Sequence of operations that represent the Kraus operator. 
This can be just - * one operation. - */ - std::vector ops; - - /** - * Product of K^\dagger and K. This can be empty if unitary = true. - */ - Matrix kd_k; - - /** - * Qubits kd_k acts on. This can be empty if unitary = true. - */ - std::vector qubits; - - /** - * Calculates the product of "K^\dagger K". Sets qubits "K^\dagger K" acts on. - */ - void CalculateKdKMatrix() { - if (ops.size() == 1) { - kd_k = ops[0].matrix; - MatrixDaggerMultiply(ops[0].qubits.size(), ops[0].matrix, kd_k); - qubits = ops[0].qubits; - } else if (ops.size() > 1) { - std::set qubit_map; - - for (const auto& op : ops) { - for (unsigned q : op.qubits) { - qubit_map.insert(q); - } - } - - unsigned num_qubits = qubit_map.size(); - - qubits.resize(0); - qubits.reserve(num_qubits); - - for (auto it = qubit_map.begin(); it != qubit_map.end(); ++it) { - qubits.push_back(*it); - } - - MatrixIdentity(unsigned{1} << num_qubits, kd_k); - - for (const auto& op : ops) { - if (op.qubits.size() == num_qubits) { - MatrixMultiply(num_qubits, op.matrix, kd_k); - } else { - unsigned mask = 0; - - for (auto q : op.qubits) { - for (unsigned i = 0; i < num_qubits; ++i) { - if (q == qubits[i]) { - mask |= unsigned{1} << i; - break; - } - } - } - - MatrixMultiply(mask, op.qubits.size(), op.matrix, num_qubits, kd_k); - } - } - - auto m = kd_k; - MatrixDaggerMultiply(num_qubits, m, kd_k); - } - } -}; - -/** - * Quantum channel. - */ -template -using Channel = std::vector>; - -/** - * Makes a channel from the gate. - * @param time The time to place the channel at. - * @param gate The input gate. - * @return The output channel. - */ -template -Channel MakeChannelFromGate(unsigned time, const Gate& gate) { - auto normal = KrausOperator::kNormal; - auto measurement = KrausOperator::kMeasurement; - - auto kind = gate.kind == gate::kMeasurement ? 
measurement : normal; - - Channel channel = {{kind, true, 1, {gate}}}; - channel[0].ops[0].time = time; - - return channel; -} - -} // namespace qsim - -#endif // CHANNEL_H_ diff --git a/tpls/qsim/channels_cirq.h b/tpls/qsim/channels_cirq.h deleted file mode 100644 index 69f1df9..0000000 --- a/tpls/qsim/channels_cirq.h +++ /dev/null @@ -1,471 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CHANNELS_CIRQ_H_ -#define CHANNELS_CIRQ_H_ - -#include -#include -#include - -#include "channel.h" -#include "gates_cirq.h" - -namespace qsim { - -namespace Cirq { - -template -using Channel = qsim::Channel>; - -/** - * Asymmetric depolarizing channel factory. 
- */ -template -struct AsymmetricDepolarizingChannel { - static constexpr char name[] = "asymmetric_depolarize"; - - AsymmetricDepolarizingChannel(double p_x, double p_y, double p_z) - : p_x(p_x), p_y(p_y), p_z(p_z) {} - - static Channel Create(unsigned time, unsigned q, - double p_x, double p_y, double p_z) { - double p1 = 1 - p_x - p_y - p_z; - - auto normal = KrausOperator>::kNormal; - - return {{normal, 1, p1, {}}, - {normal, 1, p_x, {X::Create(time, q)}}, - {normal, 1, p_y, {Y::Create(time, q)}}, - {normal, 1, p_z, {Z::Create(time, q)}}}; - } - - static Channel Create(unsigned time, - const std::vector& qubits, - double p_x, double p_y, double p_z) { - double p1 = 1 - p_x - p_y - p_z; - - auto normal = KrausOperator>::kNormal; - - uint64_t size = uint64_t{1} << (2 * qubits.size()); - - Channel channel; - channel.reserve(size); - - for (uint64_t i = 0; i < size; ++i) { - channel.push_back({normal, 1, 0, {}}); - auto& kop = channel.back(); - - kop.ops.reserve(qubits.size()); - - double prob = 1; - - for (unsigned q = 0; q < qubits.size(); ++q) { - unsigned pauli_index = (i >> (2 * q)) & 3; - - switch (pauli_index) { - case 0: - prob *= p1; - break; - case 1: - prob *= p_x; - kop.ops.push_back(X::Create(time, q)); - break; - case 2: - prob *= p_y; - kop.ops.push_back(Y::Create(time, q)); - break; - case 3: - prob *= p_z; - kop.ops.push_back(Z::Create(time, q)); - break; - } - } - - kop.prob = prob; - } - - return channel; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, p_x, p_y, p_z); - } - - Channel Create( - unsigned time, const std::vector& qubits) const { - return Create(time, qubits, p_x, p_y, p_z); - } - - double p_x = 0; - double p_y = 0; - double p_z = 0; -}; - -/** - * Returns an asymmetric depolarizing channel factory object. 
- */ -template -inline AsymmetricDepolarizingChannel asymmetric_depolarize( - double p_x, double p_y, double p_z) { - return AsymmetricDepolarizingChannel(p_x, p_y, p_z); -} - -/** - * Depolarizing channel factory. - */ -template -struct DepolarizingChannel { - static constexpr char name[] = "depolarize"; - - DepolarizingChannel(double p) : p(p) {} - - static Channel Create(unsigned time, unsigned q, double p) { - double p1 = 1 - p; - double p2 = p / 3; - - auto normal = KrausOperator>::kNormal; - - return {{normal, 1, p1, {}}, - {normal, 1, p2, {X::Create(time, q)}}, - {normal, 1, p2, {Y::Create(time, q)}}, - {normal, 1, p2, {Z::Create(time, q)}}}; - } - - static Channel Create( - unsigned time, const std::vector& qubits, double p) { - double p1 = 1 - p; - double p2 = p / 3; - - auto normal = KrausOperator>::kNormal; - - uint64_t size = uint64_t{1} << (2 * qubits.size()); - - Channel channel; - channel.reserve(size); - - for (uint64_t i = 0; i < size; ++i) { - channel.push_back({normal, 1, 0, {}}); - auto& kop = channel.back(); - - kop.ops.reserve(qubits.size()); - - double prob = 1; - - for (unsigned q = 0; q < qubits.size(); ++q) { - unsigned pauli_index = (i >> (2 * q)) & 3; - - switch (pauli_index) { - case 0: - prob *= p1; - break; - case 1: - prob *= p2; - kop.ops.push_back(X::Create(time, q)); - break; - case 2: - prob *= p2; - kop.ops.push_back(Y::Create(time, q)); - break; - case 3: - prob *= p2; - kop.ops.push_back(Z::Create(time, q)); - break; - } - } - - kop.prob = prob; - } - - return channel; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, p); - } - - Channel Create( - unsigned time, const std::vector& qubits) const { - return Create(time, qubits, p); - } - - double p = 0; -}; - -/** - * Returns a depolarizing channel factory object. - */ -template -inline DepolarizingChannel depolarize(double p) { - return DepolarizingChannel(p); -} - -/** - * Generalized amplitude damping channel factory. 
- */ -template -struct GeneralizedAmplitudeDampingChannel { - static constexpr char name[] = "generalized_amplitude_damp"; - - GeneralizedAmplitudeDampingChannel(double p, double gamma) - : p(p), gamma(gamma) {} - - static Channel Create( - unsigned time, unsigned q, double p, double gamma) { - double p1 = p * (1 - gamma); - double p2 = (1 - p) * (1 - gamma); - double p3 = 0; - - fp_type t1 = std::sqrt(p); - fp_type r1 = std::sqrt(p * (1 - gamma)); - fp_type s1 = std::sqrt(p * gamma); - fp_type t2 = std::sqrt(1 - p); - fp_type r2 = std::sqrt((1 - p) * (1 - gamma)); - fp_type s2 = std::sqrt((1 - p) * gamma); - - using M = Cirq::MatrixGate1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, p1, - {M::Create(time, q, {t1, 0, 0, 0, 0, 0, r1, 0})}, - {t1 * t1, 0, 0, 0, 0, 0, r1 * r1, 0}, {q}, - }, - {normal, 0, p2, - {M::Create(time, q, {r2, 0, 0, 0, 0, 0, t2, 0})}, - {r2 * r2, 0, 0, 0, 0, 0, t2 * t2, 0}, {q}, - }, - {normal, 0, p3, - {M::Create(time, q, {0, 0, s1, 0, 0, 0, 0, 0})}, - {0, 0, 0, 0, 0, 0, s1 * s1, 0}, {q}, - }, - {normal, 0, p3, - {M::Create(time, q, {0, 0, 0, 0, s2, 0, 0, 0})}, - {s2 * s2, 0, 0, 0, 0, 0, 0, 0}, {q}, - }, - }; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, p, gamma); - } - - double p = 1; - double gamma = 0; -}; - -/** - * Returns a generalized amplitude damping channel factory object. - */ -template -inline GeneralizedAmplitudeDampingChannel generalized_amplitude_damp( - double p, double gamma) { - return GeneralizedAmplitudeDampingChannel(p, gamma); -} - -/** - * Amplitude damping channel factory. 
- */ -template -struct AmplitudeDampingChannel { - static constexpr char name[] = "amplitude_damp"; - - AmplitudeDampingChannel(double gamma) : gamma(gamma) {} - - static Channel Create(unsigned time, unsigned q, double gamma) { - double p1 = 1 - gamma; - double p2 = 0; - - fp_type r = std::sqrt(p1); - fp_type s = std::sqrt(gamma); - - using M = Cirq::MatrixGate1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, p1, - {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, - {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, - }, - {normal, 0, p2, - {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}, - {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, - }, - }; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, gamma); - } - - double gamma = 0; -}; - -/** - * Returns an amplitude damping channel factory object. - */ -template -inline AmplitudeDampingChannel amplitude_damp(double gamma) { - return AmplitudeDampingChannel(gamma); -} - -/** - * Phase damping channel factory. - */ -template -struct PhaseDampingChannel { - static constexpr char name[] = "phase_dump"; - - PhaseDampingChannel(double gamma) : gamma(gamma) {} - - static Channel Create(unsigned time, unsigned q, double gamma) { - double p1 = 1 - gamma; - double p2 = 0; - - fp_type r = std::sqrt(p1); - fp_type s = std::sqrt(gamma); - - using M = Cirq::MatrixGate1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, p1, - {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, - {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, - }, - {normal, 0, p2, - {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})}, - {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, - }, - }; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, gamma); - } - - double gamma = 0; -}; - -/** - * Returns a phase damping channel factory object. - */ -template -inline PhaseDampingChannel phase_damp(double gamma) { - return PhaseDampingChannel(gamma); -} - -/** - * Reset channel factory. 
- */ -template -struct ResetChannel { - static constexpr char name[] = "reset"; - - static Channel Create(unsigned time, unsigned q) { - using M = Cirq::MatrixGate1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, 0, - {M::Create(time, q, {1, 0, 0, 0, 0, 0, 0, 0})}, - {1, 0, 0, 0, 0, 0, 0, 0}, {q}, - }, - {normal, 0, 0, - {M::Create(time, q, {0, 0, 1, 0, 0, 0, 0, 0})}, - {0, 0, 0, 0, 0, 0, 1, 0}, {q}, - }, - }; - } -}; - -/** - * Returns a reset channel factory object. - */ -template -inline ResetChannel reset() { - return ResetChannel(); -} - -/** - * Phase flip channel factory. - */ -template -struct PhaseFlipChannel { - static constexpr char name[] = "phase_flip"; - - PhaseFlipChannel(double p) : p(p) {} - - static Channel Create(unsigned time, unsigned q, double p) { - double p1 = 1 - p; - double p2 = p; - - auto normal = KrausOperator>::kNormal; - - return {{normal, 1, p1, {}}, - {normal, 1, p2, {Z::Create(time, q)}} - }; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, p); - } - - double p = 0; -}; - -/** - * Returns a phase flip channel factory object. - */ -template -inline PhaseFlipChannel phase_flip(double p) { - return PhaseFlipChannel(p); -} - -/** - * Bit flip channel factory. - */ -template -struct BitFlipChannel { - static constexpr char name[] = "bit_flip"; - - BitFlipChannel(double p) : p(p) {} - - static Channel Create(unsigned time, unsigned q, double p) { - double p1 = 1 - p; - double p2 = p; - - auto normal = KrausOperator>::kNormal; - - return {{normal, 1, p1, {}}, - {normal, 1, p2, {X::Create(time, q)}} - }; - } - - Channel Create(unsigned time, unsigned q) const { - return Create(time, q, p); - } - - double p = 0; -}; - -/** - * Returns a bit flip channel factory object. 
- */ -template -inline BitFlipChannel bit_flip(double p) { - return BitFlipChannel(p); -} - -} // namesapce Cirq - -} // namespace qsim - -#endif // CHANNELS_CIRQ_H_ diff --git a/tpls/qsim/channels_qsim.h b/tpls/qsim/channels_qsim.h deleted file mode 100644 index 5c07bcc..0000000 --- a/tpls/qsim/channels_qsim.h +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CHANNELS_QSIM_H_ -#define CHANNELS_QSIM_H_ - -#include -#include -#include - -#include "channel.h" -#include "gates_qsim.h" - -namespace qsim { - -/** - * Amplitude damping channel factory. - */ -template -struct AmplitudeDampingChannel { - AmplitudeDampingChannel(double gamma) : gamma(gamma) {} - - static Channel> Create( - unsigned time, unsigned q, double gamma) { - double p1 = 1 - gamma; - double p2 = 0; - - fp_type r = std::sqrt(p1); - fp_type s = std::sqrt(gamma); - - using M = GateMatrix1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, p1, - {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, - {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, - }, - {normal, 0, p2, - {M::Create(time, q, {0, 0, s, 0, 0, 0, 0, 0})}, - {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, - }, - }; - } - - Channel> Create(unsigned time, unsigned q) const { - return Create(time, q, gamma); - } - - double gamma = 0; -}; - -/** - * Returns an amplitude damping channel factory object. 
- */ -template -inline AmplitudeDampingChannel amplitude_damp(double gamma) { - return AmplitudeDampingChannel(gamma); -} - -/** - * Phase damping channel factory. - */ -template -struct PhaseDampingChannel { - PhaseDampingChannel(double gamma) : gamma(gamma) {} - - static Channel> Create( - unsigned time, unsigned q, double gamma) { - double p1 = 1 - gamma; - double p2 = 0; - - fp_type r = std::sqrt(p1); - fp_type s = std::sqrt(gamma); - - using M = GateMatrix1; - auto normal = KrausOperator>::kNormal; - - return {{normal, 0, p1, - {M::Create(time, q, {1, 0, 0, 0, 0, 0, r, 0})}, - {1, 0, 0, 0, 0, 0, r * r, 0}, {q}, - }, - {normal, 0, p2, - {M::Create(time, q, {0, 0, 0, 0, 0, 0, s, 0})}, - {0, 0, 0, 0, 0, 0, s * s, 0}, {q}, - }, - }; - } - - Channel> Create(unsigned time, unsigned q) const { - return Create(time, q, gamma); - } - - double gamma = 0; -}; - -/** - * Returns a phase damping channel factory object. - */ -template -inline PhaseDampingChannel phase_damp(double gamma) { - return PhaseDampingChannel(gamma); -} - -} // namespace qsim - -#endif // CHANNELS_QSIM_H_ diff --git a/tpls/qsim/circuit.h b/tpls/qsim/circuit.h deleted file mode 100644 index 59018ee..0000000 --- a/tpls/qsim/circuit.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CIRCUIT_H_ -#define CIRCUIT_H_ - -#include - -namespace qsim { - -/** - * A collection of gates. 
This object is consumed by `QSim[h]Runner.Run()`. - */ -template -struct Circuit { - unsigned num_qubits; - /** - * The set of gates to be run. Gate times should be ordered. - */ - std::vector gates; -}; - -} // namespace qsim - -#endif // CIRCUIT_H_ diff --git a/tpls/qsim/circuit_noisy.h b/tpls/qsim/circuit_noisy.h deleted file mode 100644 index 40a228d..0000000 --- a/tpls/qsim/circuit_noisy.h +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CIRCUIT_NOISY_H_ -#define CIRCUIT_NOISY_H_ - -#include - -#include "circuit.h" -#include "channel.h" - -namespace qsim { - -/** - * Noisy circuit. - */ -template -struct NoisyCircuit { - unsigned num_qubits; - std::vector> channels; -}; - -template -using ncircuit_iterator = typename std::vector>::const_iterator; - -/** - * Makes a noisy circuit from the clean circuit. - * Channels are added after each qubit of each gate of the clean cicuit. - * Roughly equivalent to cirq.Circuit.with_noise. - * @param num_qubits The number of circuit qubits. - * @param gbeg, gend The iterator range [gbeg, gend) of circuit gates. - * @param A channel factory to construct channels. - * @return The output noisy circuit. 
- */ -template -inline NoisyCircuit MakeNoisy( - unsigned num_qubits, - typename std::vector::const_iterator gbeg, - typename std::vector::const_iterator gend, - const ChannelFactory& channel_factory) { - NoisyCircuit ncircuit; - - ncircuit.num_qubits = num_qubits; - ncircuit.channels.reserve(4 * std::size_t(gend - gbeg)); - - for (auto it = gbeg; it != gend; ++it) { - const auto& gate = *it; - - ncircuit.channels.push_back(MakeChannelFromGate(2 * gate.time, gate)); - - for (auto q : gate.qubits) { - ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q)); - } - - for (auto q : gate.controlled_by) { - ncircuit.channels.push_back(channel_factory.Create(2 * gate.time + 1, q)); - } - } - - return ncircuit; -} - -/** - * Makes a noisy circuit from the clean circuit. - * Channels are added after each qubit of each gate of the clean cicuit. - * Roughly equivalent to cirq.Circuit.with_noise. - * @param num_qubits The number of circuit qubits. - * @param gates The circuit gates. - * @param A channel factory to construct channels. - * @return The output noisy circuit. - */ -template -inline NoisyCircuit MakeNoisy(unsigned num_qubits, - const std::vector& gates, - const ChannelFactory& channel_factory) { - return - MakeNoisy(num_qubits, gates.begin(), gates.end(), channel_factory); -} - -/** - * Makes a noisy circuit from the clean circuit. - * Channels are added after each qubit of each gate of the clean cicuit. - * Roughly equivalent to cirq.Circuit.with_noise. - * @param circuit The input cicuit. - * @param A channel factory to construct channels. - * @return The output noisy circuit. 
- */ -template -inline NoisyCircuit MakeNoisy(const Circuit& circuit, - const ChannelFactory& channel_factory) { - return MakeNoisy(circuit.num_qubits, circuit.gates.begin(), - circuit.gates.end(), channel_factory); -} - -} // namespace qsim - -#endif // CIRCUIT_NOISY_H_ diff --git a/tpls/qsim/circuit_qsim_parser.h b/tpls/qsim/circuit_qsim_parser.h deleted file mode 100644 index de7bd89..0000000 --- a/tpls/qsim/circuit_qsim_parser.h +++ /dev/null @@ -1,442 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef CIRCUIT_QSIM_PARSER_H_ -#define CIRCUIT_QSIM_PARSER_H_ - -#include -#include -#include -#include -#include - -#include "circuit.h" -#include "gates_qsim.h" - -namespace qsim { - -/** - * Parser for the (deprecated) qsim file input format. - * The primary supported interface for designing circuits to simulate with qsim - * is Cirq, which relies on - * the Python-based qsimcirq interface. For C++ applications, Cirq gates can be - * explicitly constructed in code. - */ -template -class CircuitQsimParser final { - public: - /** - * Parses the given input stream into a Circuit object, following the rules - * defined in "docs/input_format.md". - * @param maxtime Maximum gate "time" to read operations for (inclusive). - * @param provider Circuit source; only used for error reporting. - * @param fs The stream to read the circuit from. - * @param circuit Output circuit object. 
If parsing is successful, this will - * contain the circuit defined in 'fs'. - * @return True if parsing succeeds; false otherwise. - */ - template - static bool FromStream(unsigned maxtime, const std::string& provider, - Stream& fs, Circuit>& circuit) { - circuit.num_qubits = 0; - - circuit.gates.resize(0); - circuit.gates.reserve(1024); - - unsigned k = 0; - - std::string line; - line.reserve(128); - - unsigned time; - std::string gate_name; - gate_name.reserve(16); - - unsigned max_time = 0; - unsigned prev_mea_time = 0; - - std::vector last_times; - - while (std::getline(fs, line)) { - ++k; - - if (line.size() == 0 || line[0] == '#') continue; - - std::stringstream ss(line); - - if (circuit.num_qubits == 0) { - ss >> circuit.num_qubits; - if (circuit.num_qubits == 0) { - IO::errorf("invalid number of qubits in %s in line %u.\n", - provider.c_str(), k); - return false; - } - - last_times.resize(circuit.num_qubits, unsigned(-1)); - - continue; - } - - ss >> time >> gate_name; - - if (!ss) { - InvalidGateError(provider, k); - return false; - } - - if (time > maxtime) { - break; - } - - if (gate_name == "c") { - if (!ParseControlledGate(ss, time, - circuit.num_qubits, circuit.gates)) { - InvalidGateError(provider, k); - return false; - } - } else if (!ParseGate(ss, time, circuit.num_qubits, - gate_name, circuit.gates)) { - InvalidGateError(provider, k); - return false; - } - - const auto& gate = circuit.gates.back(); - - if (time < prev_mea_time - || (gate.kind == gate::kMeasurement && time < max_time)) { - IO::errorf("gate crosses the time boundary set by measurement " - "gates in line %u in %s.\n", k, provider.c_str()); - return false; - } - - if (gate.kind == gate::kMeasurement) { - prev_mea_time = time; - } - - if (GateIsOutOfOrder(time, gate.qubits, last_times) - || GateIsOutOfOrder(time, gate.controlled_by, last_times)) { - IO::errorf("gate is out of time order in line %u in %s.\n", - k, provider.c_str()); - return false; - } - - if (time > max_time) { - 
max_time = time; - } - } - - return true; - } - - /** - * Parses the given file into a Circuit object, following the rules defined - * in "docs/input_format.md". - * @param maxtime Maximum gate "time" to read operations for (inclusive). - * @param file The name of the file to read the circuit from. - * @param circuit Output circuit object. If parsing is successful, this will - * contain the circuit defined in 'file'. - * @return True if parsing succeeds; false otherwise. - */ - template - static bool FromFile(unsigned maxtime, const std::string& file, - Circuit>& circuit) { - auto fs = IO::StreamFromFile(file); - - if (!fs) { - return false; - } else { - bool rc = FromStream(maxtime, file, fs, circuit); - IO::CloseStream(fs); - return rc; - } - } - - private: - static void InvalidGateError(const std::string& provider, unsigned line) { - IO::errorf("invalid gate in %s in line %u.\n", provider.c_str(), line); - } - - /** - * Checks formatting for a zero-qubit gate parsed from 'ss'. - * @param ss Input stream containing the gate specification. - */ - static bool ValidateGate(std::stringstream& ss) { - return ss && ss.peek() == std::stringstream::traits_type::eof(); - } - - /** - * Checks formatting for a single-qubit gate parsed from 'ss'. - * @param ss Input stream containing the gate specification. - * @param num_qubits Number of qubits, as defined at the start of the file. - * @param q0 Index of the affected qubit. - */ - static bool ValidateGate(std::stringstream& ss, - unsigned num_qubits, unsigned q0) { - return ss && ss.peek() == std::stringstream::traits_type::eof() - && q0 < num_qubits; - } - - /** - * Checks formatting for a two-qubit gate parsed from 'ss'. - * @param ss Input stream containing the gate specification. - * @param num_qubits Number of qubits, as defined at the start of the file. - * @param q0 Index of the first affected qubit. - * @param q1 Index of the second affected qubit. 
- */ - static bool ValidateGate(std::stringstream& ss, - unsigned num_qubits, unsigned q0, unsigned q1) { - return ss && ss.peek() == std::stringstream::traits_type::eof() - && q0 < num_qubits && q1 < num_qubits && q0 != q1; - } - - /** - * Checks formatting for a multiqubit gate parsed from 'ss'. - * @param ss Input stream containing the gate specification. - * @param num_qubits Number of qubits, as defined at the start of the file. - * @param qubits Indices of affected qubits. - */ - static bool ValidateGate(std::stringstream& ss, unsigned num_qubits, - const std::vector& qubits) { - return ss && ValidateQubits(num_qubits, qubits); - } - - static bool ValidateControlledGate( - unsigned num_qubits, const std::vector& qubits, - const std::vector& controlled_by) { - if (!ValidateQubits(num_qubits, controlled_by)) return false; - - std::size_t i = 0, j = 0; - - while (i < qubits.size() && j < controlled_by.size()) { - if (qubits[i] == controlled_by[j]) { - return false; - } else if (qubits[i] < controlled_by[j]) { - ++i; - } else { - ++j; - } - } - - return true; - } - - static bool ValidateQubits(unsigned num_qubits, - const std::vector& qubits) { - if (qubits.size() == 0 || qubits[0] >= num_qubits) return false; - - // qubits should be sorted. 
- - for (std::size_t i = 1; i < qubits.size(); ++i) { - if (qubits[i] >= num_qubits || qubits[i] == qubits[i - 1]) { - return false; - } - } - - return true; - } - - static bool GateIsOutOfOrder(unsigned time, - const std::vector& qubits, - std::vector& last_times) { - for (auto q : qubits) { - if (last_times[q] != unsigned(-1) && time <= last_times[q]) { - return true; - } - - last_times[q] = time; - } - - return false; - } - - template - static bool ParseGate(Stream& ss, unsigned time, unsigned num_qubits, - const std::string& gate_name, - std::vector& gates) { - unsigned q0, q1; - fp_type phi, theta; - - if (gate_name == "p") { - ss >> phi; - if (!ValidateGate(ss)) return false; - gates.push_back(GateGPh::Create(time, phi)); - } else if (gate_name == "id1") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateId1::Create(time, q0)); - } else if (gate_name == "h") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateHd::Create(time, q0)); - } else if (gate_name == "t") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateT::Create(time, q0)); - } else if (gate_name == "x") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateX::Create(time, q0)); - } else if (gate_name == "y") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateY::Create(time, q0)); - } else if (gate_name == "z") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateZ::Create(time, q0)); - } else if (gate_name == "x_1_2") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateX2::Create(time, q0)); - } else if (gate_name == "y_1_2") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateY2::Create(time, q0)); - } else if (gate_name == "rx") { - ss >> q0 >> phi; - if (!ValidateGate(ss, num_qubits, q0)) 
return false; - gates.push_back(GateRX::Create(time, q0, phi)); - } else if (gate_name == "ry") { - ss >> q0 >> phi; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateRY::Create(time, q0, phi)); - } else if (gate_name == "rz") { - ss >> q0 >> phi; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateRZ::Create(time, q0, phi)); - } else if (gate_name == "rxy") { - ss >> q0 >> theta >> phi; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateRXY::Create(time, q0, theta, phi)); - } else if (gate_name == "hz_1_2") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateHZ2::Create(time, q0)); - } else if (gate_name == "s") { - ss >> q0; - if (!ValidateGate(ss, num_qubits, q0)) return false; - gates.push_back(GateS::Create(time, q0)); - } else if (gate_name == "id2") { - ss >> q0 >> q1; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateId2::Create(time, q0, q1)); - } else if (gate_name == "cz") { - ss >> q0 >> q1; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateCZ::Create(time, q0, q1)); - } else if (gate_name == "cnot" || gate_name == "cx") { - ss >> q0 >> q1; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateCNot::Create(time, q0, q1)); - } else if (gate_name == "sw") { - ss >> q0 >> q1; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateSwap::Create(time, q0, q1)); - } else if (gate_name == "is") { - ss >> q0 >> q1; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateIS::Create(time, q0, q1)); - } else if (gate_name == "fs") { - ss >> q0 >> q1 >> theta >> phi; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - gates.push_back(GateFS::Create(time, q0, q1, theta, phi)); - } else if (gate_name == "cp") { - ss >> q0 >> q1 >> phi; - if (!ValidateGate(ss, num_qubits, q0, q1)) return false; - 
gates.push_back(GateCP::Create(time, q0, q1, phi)); - } else if (gate_name == "m") { - std::vector qubits; - qubits.reserve(num_qubits); - - while (ss.good()) { - ss >> q0; - if (ss) { - qubits.push_back(q0); - } else { - return false; - } - } - - gates.push_back(gate::Measurement>::Create( - time, std::move(qubits))); - - if (!ValidateQubits(num_qubits, gates.back().qubits)) return false; - } else { - return false; - } - - return true; - } - - template - static bool ParseControlledGate(Stream& ss, unsigned time, - unsigned num_qubits, - std::vector& gates) { - std::vector controlled_by; - controlled_by.reserve(64); - - std::string gate_name; - gate_name.reserve(16); - - while (1) { - while (ss.good()) { - if (!std::isblank(ss.get())) { - ss.unget(); - break; - } - } - - if (!ss.good()) { - return false; - } - - if (!std::isdigit(ss.peek())) { - break; - } else { - unsigned q; - ss >> q; - - if (!ss.good() || !std::isblank(ss.get())) { - return false; - } - - controlled_by.push_back(q); - } - } - - if (controlled_by.size() == 0) { - return false; - } - - ss >> gate_name; - - if (!ss.good() || !ParseGate(ss, time, - num_qubits, gate_name, gates)) { - return false; - } - - gates.back().ControlledBy(std::move(controlled_by)); - - if (!ValidateControlledGate(num_qubits, gates.back().qubits, - gates.back().controlled_by)) { - return false; - } - - return true; - } -}; - -} // namespace qsim - -#endif // CIRCUIT_QSIM_PARSER_H_ diff --git a/tpls/qsim/cuda2hip.h b/tpls/qsim/cuda2hip.h deleted file mode 100644 index da2d074..0000000 --- a/tpls/qsim/cuda2hip.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2023 Advanced Micro Devices, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_CUDA2HIP_H_ -#define SIMULATOR_CUDA2HIP_H_ - -#define cublasCaxpy hipblasCaxpy -#define cublasCdotc hipblasCdotc -#define cublasCreate hipblasCreate -#define cublasCscal hipblasCscal -#define cublasCsscal hipblasCsscal -#define cublasDestroy hipblasDestroy -#define cublasDznrm2 hipblasDznrm2 -#define cublasHandle_t hipblasHandle_t -#define cublasScnrm2 hipblasScnrm2 -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define cublasStatus_t hipblasStatus_t -#define cublasZaxpy hipblasZaxpy -#define cublasZdotc hipblasZdotc -#define cublasZdscal hipblasZdscal -#define cublasZscal hipblasZscal -#define cuCimagf hipCimagf -#define cuCimag hipCimag -#define cuComplex hipComplex -#define cuCrealf hipCrealf -#define cuCreal hipCreal -#define CUDA_C_32F HIPBLAS_C_32F -#define CUDA_C_64F HIPBLAS_C_64F -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaError_t hipError_t -#define cudaFree hipFree -#define cudaGetErrorString hipGetErrorString -#define cudaMalloc hipMalloc -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpy hipMemcpy -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemset hipMemset -#define cudaPeekAtLastError hipPeekAtLastError -#define cudaSuccess hipSuccess -#define cuDoubleComplex hipDoubleComplex - -template -__device__ __forceinline__ T __shfl_down_sync( - unsigned mask, T var, unsigned int delta, int width = warpSize) { - return __shfl_down(var, delta, width); -} 
- -#endif // SIMULATOR_CUDA2HIP_H_ diff --git a/tpls/qsim/expect.h b/tpls/qsim/expect.h deleted file mode 100644 index 518d516..0000000 --- a/tpls/qsim/expect.h +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef EXPECT_H_ -#define EXPECT_H_ - -#include - -#include "fuser.h" -#include "gate_appl.h" - -namespace qsim { - -template -struct OpString { - std::complex weight; - std::vector ops; -}; - -/** - * Computes the expectation value of the sum of operator strings (operator - * sequences). Operators can act on any qubits and they can be any supported - * gates. This function uses a temporary state vector. - * @param param Options for gate fusion. - * @param strings Operator strings. - * @param ss StateSpace object required to copy the state vector and compute - * inner products. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param state The state vector of the system. - * @param ket Temporary state vector. - * @return The computed expectation value. 
- */ -template -std::complex ExpectationValue( - const typename Fuser::Parameter& param, - const std::vector>& strings, - const typename Simulator::StateSpace& state_space, - const Simulator& simulator, const typename Simulator::State& state, - typename Simulator::State& ket) { - std::complex eval = 0; - - if (state_space.IsNull(ket) || ket.num_qubits() < state.num_qubits()) { - ket = state_space.Create(state.num_qubits()); - if (state_space.IsNull(ket)) { - IO::errorf("not enough memory: is the number of qubits too large?\n"); - return eval; - } - } - - for (const auto& str : strings) { - if (str.ops.size() == 0) { - eval += str.weight; - continue; - } - - state_space.Copy(state, ket); - - if (str.ops.size() == 1) { - const auto& op = str.ops[0]; - simulator.ApplyGate(op.qubits, op.matrix.data(), ket); - } else { - auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops); - if (fused_gates.size() == 0) { - eval = 0; - break; - } - - for (const auto& fgate : fused_gates) { - ApplyFusedGate(simulator, fgate, ket); - } - } - - eval += str.weight * state_space.InnerProduct(state, ket); - } - - return eval; -} - -/** - * Computes the expectation value of the sum of operator strings (operator - * sequences). Operators can act on any qubits and they can be any supported - * gates except for user-defined controlled gates. Computation is performed - * in place. No additional memory is allocated. The operator strings should - * act on no more than six qubits and they should be fusible into one gate. - * @param strings Operator strings. - * @param simulator Simulator object. Provides specific implementations for - * computing expectation values. - * @param state The state of the system. - * @return The computed expectation value. 
- */ -template -std::complex ExpectationValue( - const std::vector>& strings, - const Simulator& simulator, const typename Simulator::State& state) { - std::complex eval = 0; - - typename Fuser::Parameter param; - param.max_fused_size = 6; - for (const auto& str : strings) { - if (str.ops.size() == 0) { - eval += str.weight; - } else if (str.ops.size() == 1) { - const auto& op = str.ops[0]; - auto r = simulator.ExpectationValue(op.qubits, op.matrix.data(), state); - eval += str.weight * r; - } else { - auto fused_gates = Fuser::FuseGates(param, state.num_qubits(), str.ops); - - if (fused_gates.size() != 1) { - IO::errorf("too many fused gates; " - "cannot compute the expectation value.\n"); - eval = 0; - break; - } - - const auto& fgate = fused_gates[0]; - - if (fgate.qubits.size() > 6) { - IO::errorf("operator string acts on too many qubits; " - "cannot compute the expectation value.\n"); - eval = 0; - break; - } - - auto r = simulator.ExpectationValue( - fgate.qubits, fgate.matrix.data(), state); - eval += str.weight * r; - } - } - - return eval; -} - -} // namespace qsim - -#endif // EXPECT_H_ diff --git a/tpls/qsim/formux.h b/tpls/qsim/formux.h deleted file mode 100644 index 4401e9b..0000000 --- a/tpls/qsim/formux.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef FORMUX_H_ -#define FORMUX_H_ - -#ifdef _OPENMP -# include "parfor.h" - namespace qsim { - using For = ParallelFor; - } -#else -# include "seqfor.h" - namespace qsim { - using For = SequentialFor; - } -#endif - -#endif // FORMUX_H_ diff --git a/tpls/qsim/fuser.h b/tpls/qsim/fuser.h deleted file mode 100644 index e4f3c3b..0000000 --- a/tpls/qsim/fuser.h +++ /dev/null @@ -1,225 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FUSER_H_ -#define FUSER_H_ - -#include -#include - -#include "gate.h" -#include "matrix.h" - -namespace qsim { - -/** - * A collection of "fused" gates which can be multiplied together before being - * applied to the state vector. - */ -template -struct GateFused { - /** - * Kind of the first ("parent") gate. - */ - typename Gate::GateKind kind; - /** - * The time index of the first ("parent") gate. - */ - unsigned time; - /** - * A list of qubits these gates act upon. Control qubits for - * explicitly-controlled gates are excluded from this list. - */ - std::vector qubits; - /** - * Pointer to the first ("parent") gate. - */ - const Gate* parent; - /** - * Ordered list of component gates. - */ - std::vector gates; - /** - * Fused gate matrix. - */ - Matrix matrix; -}; - -/** - * A base class for fuser classes with some common functions. 
- */ -template -class Fuser { - protected: - using RGate = typename std::remove_pointer::type; - - static const RGate& GateToConstRef(const RGate& gate) { - return gate; - } - - static const RGate& GateToConstRef(const RGate* gate) { - return *gate; - } - - static std::vector MergeWithMeasurementTimes( - typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast, - const std::vector& times) { - std::vector epochs; - epochs.reserve(glast - gfirst + times.size()); - - std::size_t last = 0; - unsigned max_time = 0; - - for (auto gate_it = gfirst; gate_it < glast; ++gate_it) { - const auto& gate = GateToConstRef(*gate_it); - - if (gate.time > max_time) { - max_time = gate.time; - } - - if (epochs.size() > 0 && gate.time < epochs.back()) { - IO::errorf("gate crosses the time boundary.\n"); - epochs.resize(0); - return epochs; - } - - if (gate.kind == gate::kMeasurement) { - if (epochs.size() == 0 || epochs.back() < gate.time) { - if (!AddBoundary(gate.time, max_time, epochs)) { - epochs.resize(0); - return epochs; - } - } - } - - while (last < times.size() && times[last] <= gate.time) { - unsigned prev = times[last++]; - epochs.push_back(prev); - if (!AddBoundary(prev, max_time, epochs)) { - epochs.resize(0); - return epochs; - } - while (last < times.size() && times[last] <= prev) ++last; - } - } - - if (epochs.size() == 0 || epochs.back() < max_time) { - epochs.push_back(max_time); - } - - return epochs; - } - - template - static void FuseZeroQubitGates(const GateSeq0& gate_seq0, - Parent parent, std::size_t first, - std::vector& fused_gates) { - GateFused* fuse_to = nullptr; - - for (std::size_t i = first; i < fused_gates.size(); ++i) { - auto& fgate = fused_gates[i]; - - if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp - && fgate.parent->controlled_by.size() == 0 - && !fgate.parent->unfusible) { - fuse_to = &fgate; - break; - } - } - - if (fuse_to != nullptr) { - // Fuse zero-qubit gates with the first available 
fused gate. - for (const auto& g : gate_seq0) { - fuse_to->gates.push_back(parent(g)); - } - } else { - auto g0 = parent(gate_seq0[0]); - fused_gates.push_back({g0->kind, g0->time, {}, g0, {g0}, {}}); - - for (std::size_t i = 1; i < gate_seq0.size(); ++i) { - fused_gates.back().gates.push_back(parent(gate_seq0[i])); - } - } - } - - private: - static bool AddBoundary(unsigned time, unsigned max_time, - std::vector& boundaries) { - if (max_time > time) { - IO::errorf("gate crosses the time boundary.\n"); - return false; - } - - boundaries.push_back(time); - return true; - } -}; - -/** - * Multiplies component gate matrices of a fused gate. - * @param gate Fused gate. - */ -template -inline void CalculateFusedMatrix(FusedGate& gate) { - MatrixIdentity(unsigned{1} << gate.qubits.size(), gate.matrix); - - for (auto pgate : gate.gates) { - if (pgate->qubits.size() == 0) { - MatrixScalarMultiply(pgate->matrix[0], pgate->matrix[1], gate.matrix); - } else if (gate.qubits.size() == pgate->qubits.size()) { - MatrixMultiply(gate.qubits.size(), pgate->matrix, gate.matrix); - } else { - unsigned mask = 0; - - for (auto q : pgate->qubits) { - for (std::size_t i = 0; i < gate.qubits.size(); ++i) { - if (q == gate.qubits[i]) { - mask |= unsigned{1} << i; - break; - } - } - } - - MatrixMultiply(mask, pgate->qubits.size(), pgate->matrix, - gate.qubits.size(), gate.matrix); - } - } -} - -/** - * Multiplies component gate matrices for a range of fused gates. - * @param gbeg, gend The iterator range [gbeg, gend) of fused gates. - */ -template -inline void CalculateFusedMatrices(Iterator gbeg, Iterator gend) { - for (auto g = gbeg; g != gend; ++g) { - if (g->kind != gate::kMeasurement) { - CalculateFusedMatrix(*g); - } - } -} - -/** - * Multiplies component gate matrices for a vector of fused gates. - * @param gates The vector of fused gates. 
- */ -template -inline void CalculateFusedMatrices(std::vector& gates) { - CalculateFusedMatrices(gates.begin(), gates.end()); -} - -} // namespace qsim - -#endif // FUSER_H_ diff --git a/tpls/qsim/fuser_basic.h b/tpls/qsim/fuser_basic.h deleted file mode 100644 index 3191bd2..0000000 --- a/tpls/qsim/fuser_basic.h +++ /dev/null @@ -1,411 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FUSER_BASIC_H_ -#define FUSER_BASIC_H_ - -#include -#include -#include -#include - -#include "gate.h" -#include "fuser.h" - -namespace qsim { - -/** - * Stateless object with methods for aggregating `Gate`s into `GateFused`. - * Measurement gates with equal times are fused together. - * User-defined controlled gates (controlled_by.size() > 0) and gates acting on - * more than two qubits are not fused. - * The template parameter Gate can be Gate type or a pointer to Gate type. - * This class is deprecated. It is recommended to use MultiQubitGateFuser - * from fuser_mqubit.h. - */ -template -class BasicGateFuser final : public Fuser { - private: - using Base = Fuser; - using RGate = typename Base::RGate; - - public: - using GateFused = qsim::GateFused; - - /** - * User-specified parameters for gate fusion. - * BasicGateFuser does not use any parameters. - */ - struct Parameter { - unsigned verbosity = 0; - }; - - /** - * Stores sets of gates that can be applied together. 
Only one- and - * two-qubit gates will get fused. To respect specific time boundaries while - * fusing gates, use the other version of this method below. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gates The gates (or pointers to the gates) to be fused. - * Gate times of the gates that act on the same qubits should be ordered. - * Gates that are out of time order should not cross the time boundaries - * set by measurement gates. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates(const Parameter& param, - unsigned max_qubit1, - const std::vector& gates, - bool fuse_matrix = true) { - return FuseGates( - param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. Only one- and - * two-qubit gates will get fused. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gates The gates (or pointers to the gates) to be fused. - * Gate times of the gates that act on the same qubits should be ordered. - * Gates that are out of time order should not cross the time boundaries - * set by `times_to_split_at` or by measurement gates. - * @param times_to_split_at Ordered list of time steps (boundaries) at which - * to separate fused gates. Each element of the output will contain gates - * from a single 'window' in this list. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. 
- */ - static std::vector FuseGates( - const Parameter& param, - unsigned max_qubit1, const std::vector& gates, - const std::vector& times_to_split_at, - bool fuse_matrix = true) { - return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(), - times_to_split_at, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. Only one- and - * two-qubit gates will get fused. To respect specific time boundaries while - * fusing gates, use the other version of this method below. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates - * (or pointers to gates) in. Gate times of the gates that act on the same - * qubits should be ordered. Gates that are out of time order should not - * cross the time boundaries set by measurement gates. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates( - const Parameter& param, unsigned max_qubit1, - typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast, - bool fuse_matrix = true) { - return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. Only one- and - * two-qubit gates will get fused. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates - * (or pointers to gates) in. Gate times of the gates that act on the same - * qubits should be ordered. Gates that are out of time order should not - * cross the time boundaries set by `times_to_split_at` or by measurement - * gates. 
- * @param times_to_split_at Ordered list of time steps (boundaries) at which - * to separate fused gates. Each element of the output will contain gates - * from a single 'window' in this list. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates( - const Parameter& param, unsigned max_qubit1, - typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast, - const std::vector& times_to_split_at, - bool fuse_matrix = true) { - std::vector gates_fused; - - if (gfirst >= glast) return gates_fused; - - std::size_t num_gates = glast - gfirst; - - gates_fused.reserve(num_gates); - - // Merge with measurement gate times to separate fused gates at. - auto times = - Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at); - - // Map to keep track of measurement gates with equal times. - std::map> measurement_gates; - - // Sequence of top level gates the other gates get fused to. - std::vector gates_seq; - - // Sequence of zero-qubit gates. - std::vector gates_seq0; - - // Lattice of gates: qubits "hyperplane" and time direction. - std::vector> gates_lat(max_qubit1); - - // Current unfused gate. - auto gate_it = gfirst; - - std::size_t last_fused_gate_index = 0; - - for (std::size_t l = 0; l < times.size(); ++l) { - gates_seq.resize(0); - gates_seq.reserve(num_gates); - - gates_seq0.resize(0); - gates_seq0.reserve(num_gates); - - for (unsigned k = 0; k < max_qubit1; ++k) { - gates_lat[k].resize(0); - gates_lat[k].reserve(128); - } - - // Fill gates_seq and gates_lat in. 
- for (; gate_it < glast; ++gate_it) { - const auto& gate = Base::GateToConstRef(*gate_it); - - if (gate.time > times[l]) break; - - if (!ValidateGate(gate, max_qubit1, gates_lat)) { - gates_fused.resize(0); - return gates_fused; - } - - if (gate.kind == gate::kMeasurement) { - auto& mea_gates_at_time = measurement_gates[gate.time]; - if (mea_gates_at_time.size() == 0) { - gates_seq.push_back(&gate); - mea_gates_at_time.reserve(max_qubit1); - } - - mea_gates_at_time.push_back(&gate); - } else if (gate.controlled_by.size() > 0 || gate.qubits.size() > 2) { - for (auto q : gate.qubits) { - gates_lat[q].push_back(&gate); - } - for (auto q : gate.controlled_by) { - gates_lat[q].push_back(&gate); - } - gates_seq.push_back(&gate); - } else if (gate.qubits.size() == 1) { - gates_lat[gate.qubits[0]].push_back(&gate); - if (gate.unfusible) { - gates_seq.push_back(&gate); - } - } else if (gate.qubits.size() == 2) { - gates_lat[gate.qubits[0]].push_back(&gate); - gates_lat[gate.qubits[1]].push_back(&gate); - gates_seq.push_back(&gate); - } else { - gates_seq0.push_back(&gate); - } - } - - std::vector last(max_qubit1, 0); - - const RGate* delayed_measurement_gate = nullptr; - - // Fuse gates. - for (auto pgate : gates_seq) { - if (pgate->kind == gate::kMeasurement) { - delayed_measurement_gate = pgate; - } else if (pgate->qubits.size() > 2 - || pgate->controlled_by.size() > 0) { - // Multi-qubit or controlled gate. 
- - for (auto q : pgate->qubits) { - unsigned l = last[q]; - if (gates_lat[q][l] != pgate) { - last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused); - } - ++last[q]; - } - - for (auto q : pgate->controlled_by) { - unsigned l = last[q]; - if (gates_lat[q][l] != pgate) { - last[q] = AddOrphanedQubit(q, l, gates_lat, gates_fused); - } - ++last[q]; - } - - gates_fused.push_back({pgate->kind, pgate->time, pgate->qubits, - pgate, {pgate}, {}}); - } else if (pgate->qubits.size() == 1) { - unsigned q0 = pgate->qubits[0]; - - GateFused gate_f = {pgate->kind, pgate->time, {q0}, pgate, {}, {}}; - - last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates); - gate_f.gates.push_back(gates_lat[q0][last[q0]]); - last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates); - - gates_fused.push_back(std::move(gate_f)); - } else if (pgate->qubits.size() == 2) { - unsigned q0 = pgate->qubits[0]; - unsigned q1 = pgate->qubits[1]; - - if (Done(last[q0], pgate->time, gates_lat[q0])) continue; - - GateFused gate_f = - {pgate->kind, pgate->time, {q0, q1}, pgate, {}, {}}; - - do { - last[q0] = Advance(last[q0], gates_lat[q0], gate_f.gates); - last[q1] = Advance(last[q1], gates_lat[q1], gate_f.gates); - // Here gates_lat[q0][last[q0]] == gates_lat[q1][last[q1]]. - - gate_f.gates.push_back(gates_lat[q0][last[q0]]); - - last[q0] = Advance(last[q0] + 1, gates_lat[q0], gate_f.gates); - last[q1] = Advance(last[q1] + 1, gates_lat[q1], gate_f.gates); - } while (NextGate(last[q0], gates_lat[q0], last[q1], gates_lat[q1])); - - gates_fused.push_back(std::move(gate_f)); - } - } - - for (unsigned q = 0; q < max_qubit1; ++q) { - auto l = last[q]; - if (l == gates_lat[q].size()) continue; - - // Orphaned qubit. 
- AddOrphanedQubit(q, l, gates_lat, gates_fused); - } - - if (delayed_measurement_gate != nullptr) { - auto pgate = delayed_measurement_gate; - - const auto& mea_gates_at_time = measurement_gates[pgate->time]; - - GateFused gate_f = {pgate->kind, pgate->time, {}, pgate, {}, {}}; - gate_f.gates.reserve(mea_gates_at_time.size()); - - // Fuse measurement gates with equal times. - - for (const auto* pgate : mea_gates_at_time) { - gate_f.qubits.insert(gate_f.qubits.end(), - pgate->qubits.begin(), pgate->qubits.end()); - gate_f.gates.push_back(pgate); - } - - gates_fused.push_back(std::move(gate_f)); - } - - if (gates_seq0.size() != 0) { - Base::FuseZeroQubitGates(gates_seq0, [](const RGate* g) { return g; }, - last_fused_gate_index, gates_fused); - } - - if (gate_it == glast) break; - - last_fused_gate_index = gates_fused.size(); - } - - if (fuse_matrix) { - for (auto& gate_f : gates_fused) { - if (gate_f.kind != gate::kMeasurement && gate_f.kind != gate::kDecomp) { - CalculateFusedMatrix(gate_f); - } - } - } - - return gates_fused; - } - - private: - static unsigned Advance(unsigned k, const std::vector& wl, - std::vector& gates) { - while (k < wl.size() && wl[k]->qubits.size() == 1 - && wl[k]->controlled_by.size() == 0 && !wl[k]->unfusible) { - gates.push_back(wl[k++]); - } - - return k; - } - - static bool Done( - unsigned k, unsigned t, const std::vector& wl) { - return k >= wl.size() || wl[k]->time > t; - } - - static bool NextGate(unsigned k1, const std::vector& wl1, - unsigned k2, const std::vector& wl2) { - return k1 < wl1.size() && k2 < wl2.size() && wl1[k1] == wl2[k2] - && wl1[k1]->qubits.size() < 3 && wl1[k1]->controlled_by.size() == 0; - } - - template - static unsigned AddOrphanedQubit(unsigned q, unsigned k, - const GatesLat& gates_lat, - std::vector& gates_fused) { - auto pgate = gates_lat[q][k]; - - GateFused gate_f = {pgate->kind, pgate->time, {q}, pgate, {}, {}}; - gate_f.gates.push_back(pgate); - - k = Advance(k + 1, gates_lat[q], gate_f.gates); - - 
gates_fused.push_back(std::move(gate_f)); - - return k; - } - - template - static bool ValidateGate(const Gate2& gate, unsigned max_qubit1, - const GatesLat& gates_lat) { - for (unsigned q : gate.qubits) { - if (q >= max_qubit1) { - IO::errorf("fuser: gate qubit %u is out of range " - "(should be smaller than %u).\n", q, max_qubit1); - return false; - } - if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) { - IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); - return false; - } - } - - for (unsigned q : gate.controlled_by) { - if (q >= max_qubit1) { - IO::errorf("fuser: gate qubit %u is out of range " - "(should be smaller than %u).\n", q, max_qubit1); - return false; - } - if (!gates_lat[q].empty() && gate.time <= gates_lat[q].back()->time) { - IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); - return false; - } - } - - return true; - } -}; - -} // namespace qsim - -#endif // FUSER_BASIC_H_ diff --git a/tpls/qsim/fuser_mqubit.h b/tpls/qsim/fuser_mqubit.h deleted file mode 100644 index c75b1a0..0000000 --- a/tpls/qsim/fuser_mqubit.h +++ /dev/null @@ -1,1095 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef FUSER_MQUBIT_H_ -#define FUSER_MQUBIT_H_ - -#include -#include -#include -#include -#include -#include - -#include "gate.h" -#include "fuser.h" - -namespace qsim { - -/** - * Multi-qubit gate fuser. 
- * Measurement gates with equal times are fused together. - * User-defined controlled gates (controlled_by.size() > 0) are not fused. - * The template parameter Gate can be Gate type or a pointer to Gate type. - */ -template -class MultiQubitGateFuser final : public Fuser { - private: - using Base = Fuser; - using RGate = typename Base::RGate; - - // Auxillary classes and structs. - - // Manages doubly-linked lists. - template - class LinkManagerT { - public: - struct Link { - T val; - Link* next; - Link* prev; - }; - - explicit LinkManagerT(uint64_t size) { - links_.reserve(size); - } - - Link* AddBack(const T& t, Link* link) { - if (link == nullptr) { - links_.push_back({t, nullptr, nullptr}); - } else { - links_.push_back({t, link->next, link}); - link->next = &links_.back(); - } - - return &links_.back(); - } - - static void Delete(const Link* link) { - if (link->prev != nullptr) { - link->prev->next = link->next; - } - if (link->next != nullptr) { - link->next->prev = link->prev; - } - } - - private: - std::vector links_; - }; - - struct GateF; - - using LinkManager = LinkManagerT; - using Link = typename LinkManager::Link; - - // Intermediate representation of a fused gate. - struct GateF { - const RGate* parent; - std::vector qubits; - std::vector gates; // Gates that get fused to this gate. - std::vector links; // Gate "lattice" links. - uint64_t mask; // Qubit mask. - unsigned visited; - }; - - // Possible values for visited in GateF. - // Note that MakeGateSequence assignes values from kSecond to the number of - // gates in the sequence plus one, see below. - enum Visited { - kZero = 0, // Start value for "normal" gates. - kFirst = 1, // Value after the first pass for partially fused - // "normal" gates. - kSecond = 2, // Start value to assign values in MakeGateSequence. - kCompress = 99999997, // Used to compress links. - kMeaCnt = 99999998, // Start value for controlled or measurement gates. 
- kFinal = 99999999, // Value after the second pass for fused "normal" - // gates or for controlled and measurement gates. - }; - - struct Stat { - unsigned num_mea_gates = 0; - unsigned num_fused_mea_gates = 0; - unsigned num_fused_gates = 0; - unsigned num_controlled_gates = 0; - std::vector num_gates; - }; - - // Gate that is added to a sequence of gates to fuse together. - struct GateA { - GateF* gate; - std::vector qubits; // Added qubits. - std::vector links; // Added lattice links. - }; - - struct Scratch { - std::vector data; - std::vector prev1; - std::vector prev2; - std::vector next1; - std::vector next2; - std::vector longest_seq; - std::vector stack; - std::vector gates; - unsigned count = 0; - }; - - public: - using GateFused = qsim::GateFused; - - /** - * User-specified parameters for gate fusion. - */ - struct Parameter { - /** - * Maximum number of qubits in a fused gate. It can take values from 2 to - * 6 (0 and 1 are equivalent to 2). It is not recommended to use 5 or 6 as - * that might degrade performance for not very fast machines. - */ - unsigned max_fused_size = 2; - unsigned verbosity = 0; - }; - - /** - * Stores sets of gates that can be applied together. To respect specific - * time boundaries while fusing gates, use the other version of this method - * below. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gates The gates (or pointers to the gates) to be fused. - * Gate times of the gates that act on the same qubits should be ordered. - * Gates that are out of time order should not cross the time boundaries - * set by measurement gates. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. 
- */ - static std::vector FuseGates(const Parameter& param, - unsigned max_qubit1, - const std::vector& gates, - bool fuse_matrix = true) { - return FuseGates( - param, max_qubit1, gates.cbegin(), gates.cend(), {}, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gates The gates (or pointers to the gates) to be fused. - * Gate times of the gates that act on the same qubits should be ordered. - * Gates that are out of time order should not cross the time boundaries - * set by `times_to_split_at` or by measurement gates. - * @param times_to_split_at Ordered list of time steps (boundaries) at which - * to separate fused gates. Each element of the output will contain gates - * from a single 'window' in this list. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates( - const Parameter& param, - unsigned max_qubit1, const std::vector& gates, - const std::vector& times_to_split_at, - bool fuse_matrix = true) { - return FuseGates(param, max_qubit1, gates.cbegin(), gates.cend(), - times_to_split_at, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. To respect specific - * time boundaries while fusing gates, use the other version of this method - * below. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates - * (or pointers to gates) in. Gate times of the gates that act on the same - * qubits should be ordered. Gates that are out of time order should not - * cross the time boundaries set by measurement gates. 
- * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates( - const Parameter& param, unsigned max_qubit1, - typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast, - bool fuse_matrix = true) { - return FuseGates(param, max_qubit1, gfirst, glast, {}, fuse_matrix); - } - - /** - * Stores sets of gates that can be applied together. - * @param param Options for gate fusion. - * @param max_qubit1 The maximum qubit index (plus one) acted on by 'gates'. - * @param gfirst, glast The iterator range [gfirst, glast) to fuse gates - * (or pointers to gates) in. Gate times of the gates that act on the same - * qubits should be ordered. Gates that are out of time order should not - * cross the time boundaries set by `times_to_split_at` or by measurement - * gates. - * @param times_to_split_at Ordered list of time steps (boundaries) at which - * to separate fused gates. Each element of the output will contain gates - * from a single 'window' in this list. - * @param fuse_matrix If true, multiply gate matrices together. - * @return A vector of fused gate objects. Each element is a set of gates - * acting on a specific pair of qubits which can be applied as a group. - */ - static std::vector FuseGates( - const Parameter& param, unsigned max_qubit1, - typename std::vector::const_iterator gfirst, - typename std::vector::const_iterator glast, - const std::vector& times_to_split_at, - bool fuse_matrix = true) { - std::vector fused_gates; - - if (gfirst >= glast) return fused_gates; - - std::size_t num_gates = glast - gfirst; - - fused_gates.reserve(num_gates); - - // Merge with measurement gate times to separate fused gates at. 
- auto epochs = - Base::MergeWithMeasurementTimes(gfirst, glast, times_to_split_at); - - LinkManager link_manager(max_qubit1 * num_gates); - - // Auxillary data structures. - // Sequence of intermediate fused gates. - std::vector gates_seq; - // Gate "lattice". - std::vector gates_lat; - // Sequences of intermediate fused gates ordered by gate size. - std::vector> fgates(max_qubit1 + 1); - - gates_seq.reserve(num_gates); - gates_lat.reserve(max_qubit1); - - Scratch scratch; - - scratch.data.reserve(1024); - scratch.prev1.reserve(32); - scratch.prev2.reserve(32); - scratch.next1.reserve(32); - scratch.next2.reserve(32); - scratch.longest_seq.reserve(8); - scratch.stack.reserve(8); - - Stat stat; - stat.num_gates.resize(max_qubit1 + 1, 0); - - unsigned max_fused_size = std::min(unsigned{6}, param.max_fused_size); - max_fused_size = std::min(max_fused_size, max_qubit1); - - std::size_t last_fused_gate_index = 0; - auto gate_it = gfirst; - - // Iterate over epochs. - for (std::size_t l = 0; l < epochs.size(); ++l) { - gates_seq.resize(0); - gates_lat.resize(0); - gates_lat.resize(max_qubit1, nullptr); - - for (unsigned i = 0; i <= max_qubit1; ++i) { - fgates[i].resize(0); - } - - uint64_t max_gate_size = 0; - GateF* last_mea_gate = nullptr; - - // Iterate over input gates. - for (; gate_it < glast; ++gate_it) { - const auto& gate = Base::GateToConstRef(*gate_it); - - if (gate.time > epochs[l]) break; - - if (!ValidateGate(gate, max_qubit1, gates_lat)) { - fused_gates.resize(0); - return fused_gates; - } - - // Fill in auxillary data structures. - - if (gate.kind == gate::kMeasurement) { - // Measurement gate. 
- - if (last_mea_gate == nullptr - || last_mea_gate->parent->time != gate.time) { - gates_seq.push_back({&gate, {}, {}, {}, 0, kMeaCnt}); - last_mea_gate = &gates_seq.back(); - - last_mea_gate->qubits.reserve(max_qubit1); - last_mea_gate->links.reserve(max_qubit1); - - ++stat.num_fused_mea_gates; - } - - for (auto q : gate.qubits) { - last_mea_gate->qubits.push_back(q); - last_mea_gate->mask |= uint64_t{1} << q; - gates_lat[q] = link_manager.AddBack(last_mea_gate, gates_lat[q]); - last_mea_gate->links.push_back(gates_lat[q]); - } - - last_mea_gate->gates.push_back(&gate); - - ++stat.num_mea_gates; - } else { - gates_seq.push_back({&gate, {}, {}, {}, 0, kZero}); - auto& fgate = gates_seq.back(); - - if (gate.controlled_by.size() == 0) { - if (max_gate_size < gate.qubits.size()) { - max_gate_size = gate.qubits.size(); - } - - unsigned num_gate_qubits = gate.qubits.size(); - unsigned size = std::max(max_fused_size, num_gate_qubits); - - fgate.qubits.reserve(size); - fgate.links.reserve(size); - fgate.gates.reserve(4 * size); - fgate.links.reserve(size); - - if (fgates[num_gate_qubits].empty()) { - fgates[num_gate_qubits].reserve(num_gates); - } - fgates[num_gate_qubits].push_back(&fgate); - - ++stat.num_gates[num_gate_qubits]; - } else { - // Controlled gate. - // Controlled gates are not fused with other gates. 
- - uint64_t size = gate.qubits.size() + gate.controlled_by.size(); - - fgate.qubits.reserve(gate.qubits.size()); - fgate.links.reserve(size); - - fgate.visited = kMeaCnt; - fgate.gates.push_back(&gate); - - ++stat.num_controlled_gates; - } - - for (auto q : gate.qubits) { - fgate.qubits.push_back(q); - fgate.mask |= uint64_t{1} << q; - gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]); - fgate.links.push_back(gates_lat[q]); - } - - for (auto q : gate.controlled_by) { - fgate.mask |= uint64_t{1} << q; - gates_lat[q] = link_manager.AddBack(&fgate, gates_lat[q]); - fgate.links.push_back(gates_lat[q]); - } - } - } - - // Fuse large gates with smaller gates. - FuseGates(max_gate_size, fgates); - - if (max_fused_size > 2) { - FuseGateSequences( - max_fused_size, max_qubit1, scratch, gates_seq, stat, fused_gates); - } else { - unsigned prev_time = 0; - - std::vector orphaned_gates; - orphaned_gates.reserve(max_qubit1); - - for (auto& fgate : gates_seq) { - if (fgate.gates.size() == 0) continue; - - if (prev_time != fgate.parent->time) { - if (orphaned_gates.size() > 0) { - FuseOrphanedGates( - max_fused_size, stat, orphaned_gates, fused_gates); - orphaned_gates.resize(0); - } - - prev_time = fgate.parent->time; - } - - if (fgate.qubits.size() == 1 && max_fused_size > 1 - && fgate.visited != kMeaCnt && !fgate.parent->unfusible) { - orphaned_gates.push_back(&fgate); - continue; - } - - // Assume fgate.qubits (gate.qubits) are sorted. 
- fused_gates.push_back({fgate.parent->kind, fgate.parent->time, - std::move(fgate.qubits), fgate.parent, - std::move(fgate.gates), {}}); - - if (fgate.visited != kMeaCnt) { - ++stat.num_fused_gates; - } - } - - if (orphaned_gates.size() > 0) { - FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); - } - } - - if (fgates[0].size() != 0) { - Base::FuseZeroQubitGates(fgates[0], - [](const GateF* g) { return g->parent; }, - last_fused_gate_index, fused_gates); - } - - last_fused_gate_index = fused_gates.size(); - } - - if (fuse_matrix) { - for (auto& fgate : fused_gates) { - if (fgate.kind != gate::kMeasurement && fgate.kind != gate::kDecomp) { - CalculateFusedMatrix(fgate); - } - } - } - - PrintStat(param.verbosity, stat, fused_gates); - - return fused_gates; - } - - private: - // Fuse large gates with smaller gates. - static void FuseGates(uint64_t max_gate_size, - std::vector>& fgates) { - // Traverse gates in order of decreasing size. - for (uint64_t i = 0; i < max_gate_size; ++i) { - std::size_t pos = 0; - - for (auto fgate : fgates[max_gate_size - i]) { - if (fgate->visited > kZero) continue; - - fgates[max_gate_size - i][pos++] = fgate; - - fgate->visited = kFirst; - - FusePrev(0, *fgate); - fgate->gates.push_back(fgate->parent); - FuseNext(0, *fgate); - } - - fgates[max_gate_size - i].resize(pos); - } - } - - // Try to fuse gate sequences as follows. Gate time goes from bottom to top. - // Gates are fused either from left to right or from right to left. 
- // - // max_fused_size = 3: _- or -_ - // - // max_fused_size = 4: _-_ - // - // max_fused_size = 5: _-_- or -_-_ - // - // max_fused_size = 6: _-_-_ - static void FuseGateSequences(unsigned max_fused_size, - unsigned max_qubit1, Scratch& scratch, - std::vector& gates_seq, Stat& stat, - std::vector& fused_gates) { - unsigned prev_time = 0; - - std::vector orphaned_gates; - orphaned_gates.reserve(max_qubit1); - - for (auto& fgate : gates_seq) { - if (prev_time != fgate.parent->time) { - if (orphaned_gates.size() > 0) { - FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); - orphaned_gates.resize(0); - } - - prev_time = fgate.parent->time; - } - - if (fgate.visited == kFinal || fgate.gates.size() == 0) continue; - - if (fgate.visited == kMeaCnt || fgate.qubits.size() >= max_fused_size - || fgate.parent->unfusible) { - if (fgate.visited != kMeaCnt) { - ++stat.num_fused_gates; - } - - fgate.visited = kFinal; - - fused_gates.push_back({fgate.parent->kind, fgate.parent->time, - std::move(fgate.qubits), fgate.parent, - std::move(fgate.gates), {}}); - - continue; - } - - - if (fgate.qubits.size() == 1 && max_fused_size > 1) { - orphaned_gates.push_back(&fgate); - continue; - } - - scratch.data.resize(0); - scratch.gates.resize(0); - scratch.count = 0; - - MakeGateSequence(max_fused_size, scratch, fgate); - - if (scratch.gates.size() == 0) { - orphaned_gates.push_back(&fgate); - } else { - for (auto fgate : scratch.gates) { - std::sort(fgate->qubits.begin(), fgate->qubits.end()); - - fused_gates.push_back({fgate->parent->kind, fgate->parent->time, - std::move(fgate->qubits), fgate->parent, - std::move(fgate->gates), {}}); - - ++stat.num_fused_gates; - } - } - } - - if (orphaned_gates.size() > 0) { - FuseOrphanedGates(max_fused_size, stat, orphaned_gates, fused_gates); - } - } - - static void FuseOrphanedGates(unsigned max_fused_size, Stat& stat, - std::vector& orphaned_gates, - std::vector& fused_gates) { - for (std::size_t i = 0; i < 
orphaned_gates.size(); ++i) { - auto ogate1 = orphaned_gates[i]; - - if (ogate1->visited == kFinal) continue; - - ogate1->visited = kFinal; - - for (std::size_t j = i + 1; j < orphaned_gates.size(); ++j) { - auto ogate2 = orphaned_gates[j]; - - if (ogate2->visited == kFinal) continue; - - unsigned cur_size = ogate1->qubits.size() + ogate2->qubits.size(); - - if (cur_size <= max_fused_size) { - ogate2->visited = kFinal; - - for (auto q : ogate2->qubits) { - ogate1->qubits.push_back(q); - ogate1->mask |= uint64_t{1} << q; - } - - for (auto l : ogate2->links) { - ogate1->links.push_back(l); - } - - for (auto gate : ogate2->gates) { - ogate1->gates.push_back(gate); - } - } - - if (cur_size == max_fused_size) { - break; - } - } - - FuseNext(1, *ogate1); - - std::sort(ogate1->qubits.begin(), ogate1->qubits.end()); - - fused_gates.push_back({ogate1->parent->kind, ogate1->parent->time, - std::move(ogate1->qubits), ogate1->parent, - std::move(ogate1->gates), {}}); - - ++stat.num_fused_gates; - } - } - - static void MakeGateSequence( - unsigned max_fused_size, Scratch& scratch, GateF& fgate) { - unsigned level = kSecond + scratch.count; - - FindLongestGateSequence(max_fused_size, level, scratch, fgate); - - auto longest_seq = scratch.longest_seq; - - if (longest_seq.size() == 1 && scratch.count == 0) { - fgate.visited = kFirst; - return; - } - - ++scratch.count; - - for (auto p : longest_seq) { - p->gate->visited = kCompress; - - for (auto q : p->qubits) { - fgate.qubits.push_back(q); - fgate.mask |= uint64_t{1} << q; - } - - for (auto l : p->links) { - fgate.links.push_back(l); - } - } - - // Compress links. 
- for (auto& link : fgate.links) { - while (link->prev != nullptr && link->prev->val->visited == kCompress) { - link = link->prev; - } - - while (link->next != nullptr && link->next->val->visited == kCompress) { - LinkManager::Delete(link->next); - } - } - - for (auto p : longest_seq) { - p->gate->visited = level; - } - - if (longest_seq.size() >= 3) { - AddGatesFromNext(longest_seq[2]->gate->gates, fgate); - } - - if (longest_seq.size() >= 5) { - AddGatesFromNext(longest_seq[4]->gate->gates, fgate); - } - - if (longest_seq.size() >= 2) { - // May call MakeGateSequence recursively. - AddGatesFromPrev(max_fused_size, *longest_seq[1]->gate, scratch, fgate); - } - - if (longest_seq.size() >= 4) { - // May call MakeGateSequence recursively. - AddGatesFromPrev(max_fused_size, *longest_seq[3]->gate, scratch, fgate); - } - - for (auto p : longest_seq) { - p->gate->visited = kFinal; - } - - FuseNext(1, fgate); - - scratch.gates.push_back(&fgate); - } - - static void AddGatesFromNext(std::vector& gates, GateF& fgate) { - for (auto gate : gates) { - fgate.gates.push_back(gate); - } - } - - static void AddGatesFromPrev(unsigned max_fused_size, const GateF& pfgate, - Scratch& scratch, GateF& fgate) { - for (auto gate : pfgate.gates) { - fgate.gates.push_back(gate); - } - - for (auto link : pfgate.links) { - if (link->prev == nullptr) continue; - - auto pgate = link->prev->val; - - if (pgate->visited == kFirst) { - MakeGateSequence(max_fused_size, scratch, *pgate); - } - } - } - - static void FindLongestGateSequence( - unsigned max_fused_size, unsigned level, Scratch& scratch, GateF& fgate) { - scratch.data.push_back({&fgate, {}, {}}); - - scratch.longest_seq.resize(0); - scratch.longest_seq.push_back(&scratch.data.back()); - - scratch.stack.resize(0); - scratch.stack.push_back(&scratch.data.back()); - - unsigned cur_size = fgate.qubits.size(); - fgate.visited = level; - - unsigned max_size = cur_size; - - GetNextAvailableGates(max_fused_size, cur_size, fgate, nullptr, - 
scratch.data, scratch.next1); - - for (auto n1 : scratch.next1) { - unsigned cur_size2 = cur_size + n1->qubits.size(); - if (cur_size2 > max_fused_size) continue; - - bool feasible = GetPrevAvailableGates(max_fused_size, cur_size, - level, *n1->gate, nullptr, - scratch.data, scratch.prev1); - - if (!feasible) continue; - - if (scratch.prev1.size() == 0 && max_fused_size > 3) continue; - - if (cur_size2 == max_fused_size) { - std::swap(scratch.longest_seq, scratch.stack); - scratch.longest_seq.push_back(n1); - return; - } - - Push(level, cur_size2, cur_size, max_size, scratch, n1); - - for (auto p1 : scratch.prev1) { - unsigned cur_size2 = cur_size + p1->qubits.size(); - - if (cur_size2 > max_fused_size) { - continue; - } else if (cur_size2 == max_fused_size) { - std::swap(scratch.longest_seq, scratch.stack); - scratch.longest_seq.push_back(p1); - return; - } - - Push(level, cur_size2, cur_size, max_size, scratch, p1); - - GetNextAvailableGates(max_fused_size, cur_size, *p1->gate, &fgate, - scratch.data, scratch.next2); - - for (auto n2 : scratch.next2) { - unsigned cur_size2 = cur_size + n2->qubits.size(); - if (cur_size2 > max_fused_size) continue; - - bool feasible = GetPrevAvailableGates(max_fused_size, cur_size, - level, *n2->gate, n1->gate, - scratch.data, scratch.prev2); - - if (!feasible) continue; - - if (cur_size2 == max_fused_size) { - std::swap(scratch.longest_seq, scratch.stack); - scratch.longest_seq.push_back(n2); - return; - } - - Push(level, cur_size2, cur_size, max_size, scratch, n2); - - for (auto p2 : scratch.prev2) { - unsigned cur_size2 = cur_size + p2->qubits.size(); - - if (cur_size2 > max_fused_size) { - continue; - } else if (cur_size2 == max_fused_size) { - std::swap(scratch.longest_seq, scratch.stack); - scratch.longest_seq.push_back(p2); - return; - } - - if (cur_size2 > max_size) { - scratch.stack.push_back(p2); - scratch.longest_seq = scratch.stack; - scratch.stack.pop_back(); - max_size = cur_size2; - } - } - - Pop(cur_size, scratch, 
n2); - } - - Pop(cur_size, scratch, p1); - } - - Pop(cur_size, scratch, n1); - } - } - - static void Push(unsigned level, unsigned cur_size2, unsigned& cur_size, - unsigned& max_size, Scratch& scratch, GateA* agate) { - agate->gate->visited = level; - cur_size = cur_size2; - scratch.stack.push_back(agate); - - if (cur_size > max_size) { - scratch.longest_seq = scratch.stack; - max_size = cur_size; - } - } - - static void Pop(unsigned& cur_size, Scratch& scratch, GateA* agate) { - agate->gate->visited = kFirst; - cur_size -= agate->qubits.size(); - scratch.stack.pop_back(); - } - - static void GetNextAvailableGates(unsigned max_fused_size, unsigned cur_size, - const GateF& pgate1, const GateF* pgate2, - std::vector& scratch, - std::vector& next_gates) { - next_gates.resize(0); - - for (auto link : pgate1.links) { - if (link->next == nullptr) continue; - - auto ngate = link->next->val; - - if (ngate->visited > kFirst || ngate->parent->unfusible) continue; - - GateA next = {ngate, {}, {}}; - next.qubits.reserve(8); - next.links.reserve(8); - - GetAddedQubits(pgate1, pgate2, *ngate, next); - - if (cur_size + next.qubits.size() > max_fused_size) continue; - - scratch.push_back(std::move(next)); - next_gates.push_back(&scratch.back()); - } - } - - static bool GetPrevAvailableGates(unsigned max_fused_size, - unsigned cur_size, unsigned level, - const GateF& ngate1, const GateF* ngate2, - std::vector& scratch, - std::vector& prev_gates) { - prev_gates.resize(0); - - for (auto link : ngate1.links) { - if (link->prev == nullptr) continue; - - auto pgate = link->prev->val; - - if (pgate->visited == kFinal || pgate->visited == level) continue; - - if (pgate->visited > kFirst || pgate->parent->unfusible) { - prev_gates.resize(0); - return false; - } - - GateA prev = {pgate, {}, {}}; - prev.qubits.reserve(8); - prev.links.reserve(8); - - GetAddedQubits(ngate1, ngate2, *pgate, prev); - - bool all_prev_visited = true; - - for (auto link : pgate->links) { - if (link->prev == 
nullptr) continue; - - if (link->prev->val->visited <= kMeaCnt) { - all_prev_visited = false; - break; - } - } - - if (!all_prev_visited) { - prev_gates.resize(0); - return false; - } - - if (cur_size + prev.qubits.size() > max_fused_size) continue; - - if (all_prev_visited) { - scratch.push_back(std::move(prev)); - prev_gates.push_back(&scratch.back()); - } - } - - return true; - } - - static void GetAddedQubits(const GateF& fgate0, const GateF* fgate1, - const GateF& fgate2, GateA& added) { - for (std::size_t i = 0; i < fgate2.qubits.size(); ++i) { - unsigned q2 = fgate2.qubits[i]; - - if (std::find(fgate0.qubits.begin(), fgate0.qubits.end(), q2) - != fgate0.qubits.end()) continue; - - if (fgate1 != nullptr - && std::find(fgate1->qubits.begin(), fgate1->qubits.end(), q2) - != fgate1->qubits.end()) continue; - - added.qubits.push_back(q2); - added.links.push_back(fgate2.links[i]); - } - } - - // Fuse smaller gates with fgate back in gate time. - static void FusePrev(unsigned pass, GateF& fgate) { - std::vector gates; - gates.reserve(fgate.gates.capacity()); - - auto neighbor = [](const Link* link) -> const Link* { - return link->prev; - }; - - FusePrevOrNext>(pass, neighbor, fgate, gates); - - for (auto it = gates.rbegin(); it != gates.rend(); ++it) { - fgate.gates.push_back(*it); - } - } - - // Fuse smaller gates with fgate forward in gate time. 
- static void FuseNext(unsigned pass, GateF& fgate) { - auto neighbor = [](const Link* link) -> const Link* { - return link->next; - }; - - FusePrevOrNext>(pass, neighbor, fgate, fgate.gates); - } - - template - static void FusePrevOrNext(unsigned pass, Neighbor neighb, GateF& fgate, - std::vector& gates) { - uint64_t bad_mask = 0; - auto links = fgate.links; - - bool may_have_gates_to_fuse = true; - - while (may_have_gates_to_fuse) { - may_have_gates_to_fuse = false; - - std::sort(links.begin(), links.end(), - [&neighb](const Link* l, const Link* r) -> bool { - auto ln = neighb(l); - auto rn = neighb(r); - - if (ln != nullptr && rn != nullptr) { - return R()(ln->val->parent->time, rn->val->parent->time); - } else { - // nullptrs are larger than everything else and - // equivalent among each other. - return ln != nullptr; - } - }); - - for (auto link : links) { - auto n = neighb(link); - - if (n == nullptr) continue; - - auto g = n->val; - - if (!QubitsAreIn(fgate.mask, g->mask) || (g->mask & bad_mask) != 0 - || g->visited > pass || g->parent->unfusible) { - bad_mask |= g->mask; - } else { - g->visited = pass == 0 ? 
kFirst : kFinal; - - if (pass == 0) { - gates.push_back(g->parent); - } else { - for (auto gate : g->gates) { - gates.push_back(gate); - } - } - - for (auto link : g->links) { - LinkManager::Delete(link); - } - - may_have_gates_to_fuse = true; - break; - } - } - } - } - - static bool QubitsAreIn(uint64_t mask0, uint64_t mask) { - return ((mask0 | mask) ^ mask0) == 0; - } - - static void PrintStat(unsigned verbosity, const Stat& stat, - const std::vector& fused_gates) { - if (verbosity < 3) return; - - if (stat.num_controlled_gates > 0) { - IO::messagef("%lu controlled gates\n", stat.num_controlled_gates); - } - - if (stat.num_mea_gates > 0) { - IO::messagef("%lu measurement gates", stat.num_mea_gates); - if (stat.num_fused_mea_gates == stat.num_mea_gates) { - IO::messagef("\n"); - } else { - IO::messagef(" are fused into %lu gates\n", stat.num_fused_mea_gates); - } - } - - bool first = true; - for (unsigned i = 1; i < stat.num_gates.size(); ++i) { - if (stat.num_gates[i] > 0) { - if (first) { - first = false; - } else { - IO::messagef(", "); - } - IO::messagef("%u %u-qubit", stat.num_gates[i], i); - } - } - - IO::messagef(" gates are fused into %lu gates\n", stat.num_fused_gates); - - if (verbosity < 5) return; - - IO::messagef("fused gate qubits:\n"); - for (const auto& g : fused_gates) { - IO::messagef("%6u ", g.parent->time); - if (g.parent->kind == gate::kMeasurement) { - IO::messagef("m"); - } else if (g.parent->controlled_by.size() > 0) { - IO::messagef("c"); - for (auto q : g.parent->controlled_by) { - IO::messagef("%3u", q); - } - IO::messagef(" t"); - } else { - IO::messagef(" "); - } - - for (auto q : g.qubits) { - IO::messagef("%3u", q); - } - IO::messagef("\n"); - } - } - - template - static bool ValidateGate(const Gate2& gate, unsigned max_qubit1, - const GatesLat& gates_lat) { - for (unsigned q : gate.qubits) { - if (q >= max_qubit1) { - IO::errorf("fuser: gate qubit %u is out of range " - "(should be smaller than %u).\n", q, max_qubit1); - return 
false; - } - if (gates_lat[q] != nullptr - && gate.time <= gates_lat[q]->val->parent->time) { - IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); - return false; - } - } - - for (unsigned q : gate.controlled_by) { - if (q >= max_qubit1) { - IO::errorf("fuser: gate qubit %u is out of range " - "(should be smaller than %u).\n", q, max_qubit1); - return false; - } - if (gates_lat[q] != nullptr - && gate.time <= gates_lat[q]->val->parent->time) { - IO::errorf("fuser: gate at time %u is out of time order.\n", gate.time); - return false; - } - } - - return true; - } -}; - -} // namespace qsim - -#endif // FUSER_MQUBIT_H_ diff --git a/tpls/qsim/gate.h b/tpls/qsim/gate.h deleted file mode 100644 index a457acb..0000000 --- a/tpls/qsim/gate.h +++ /dev/null @@ -1,216 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef GATE_H_ -#define GATE_H_ - -#include -#include -#include -#include - -#include "matrix.h" - -namespace qsim { - -namespace detail { - -template -inline void SortQubits(Gate& gate) { - for (std::size_t i = 1; i < gate.qubits.size(); ++i) { - if (gate.qubits[i - 1] > gate.qubits[i]) { - if (!GateDef::symmetric) { - auto perm = NormalToGateOrderPermutation(gate.qubits); - MatrixShuffle(perm, gate.qubits.size(), gate.matrix); - } - - gate.swapped = true; - std::sort(gate.qubits.begin(), gate.qubits.end()); - break; - } - } -} - -} // namespace detail - -template , typename Gate> -inline Gate& MakeControlledGate(Qubits&& controlled_by, Gate& gate) { - gate.controlled_by = std::forward(controlled_by); - gate.cmask = (uint64_t{1} << gate.controlled_by.size()) - 1; - - std::sort(gate.controlled_by.begin(), gate.controlled_by.end()); - - return gate; -} - -template , typename Gate> -inline Gate& MakeControlledGate(Qubits&& controlled_by, - const std::vector& control_values, - Gate& gate) { - // Assume controlled_by.size() == control_values.size(). - - bool sorted = true; - - for (std::size_t i = 1; i < controlled_by.size(); ++i) { - if (controlled_by[i - 1] > controlled_by[i]) { - sorted = false; - break; - } - } - - if (sorted) { - gate.controlled_by = std::forward(controlled_by); - gate.cmask = 0; - - for (std::size_t i = 0; i < control_values.size(); ++i) { - gate.cmask |= (control_values[i] & 1) << i; - } - } else { - struct ControlPair { - unsigned q; - unsigned v; - }; - - std::vector cpairs; - cpairs.reserve(controlled_by.size()); - - for (std::size_t i = 0; i < controlled_by.size(); ++i) { - cpairs.push_back({controlled_by[i], control_values[i]}); - } - - // Sort control qubits and control values. 
- std::sort(cpairs.begin(), cpairs.end(), - [](const ControlPair& l, const ControlPair& r) -> bool { - return l.q < r.q; - }); - - gate.cmask = 0; - gate.controlled_by.reserve(controlled_by.size()); - - for (std::size_t i = 0; i < cpairs.size(); ++i) { - gate.cmask |= (cpairs[i].v & 1) << i; - gate.controlled_by.push_back(cpairs[i].q); - } - } - - return gate; -} - -namespace gate { - -constexpr int kDecomp = 100001; // gate from Schmidt decomposition -constexpr int kMeasurement = 100002; // measurement gate - -} // namespace gate - -enum GateAnyKind { - kGateAny = -1, -}; - -/** - * A generic gate to make it easier to use qsim with external gate sets. - */ -template -struct Gate { - using fp_type = FP; - using GateKind = GK; - - GateKind kind; - unsigned time; - std::vector qubits; - std::vector controlled_by; - uint64_t cmask; - std::vector params; - Matrix matrix; - bool unfusible; // If true, the gate is fused as a parent. - bool swapped; // If true, the gate qubits are swapped to make qubits - // ordered in ascending order. This does not apply to - // control qubits of explicitly-controlled gates. 
- - template > - Gate&& ControlledBy(Qubits&& controlled_by) { - MakeControlledGate(std::forward(controlled_by), *this); - return std::move(*this); - } - - template > - Gate&& ControlledBy(Qubits&& controlled_by, - const std::vector& control_values) { - MakeControlledGate( - std::forward(controlled_by), control_values, *this); - return std::move(*this); - } -}; - -template , - typename M = Matrix> -inline Gate CreateGate(unsigned time, Qubits&& qubits, M&& matrix = {}, - std::vector&& params = {}) { - Gate gate = {GateDef::kind, time, std::forward(qubits), {}, 0, - std::move(params), std::forward(matrix), false, false}; - - if (GateDef::kind != gate::kMeasurement) { - switch (gate.qubits.size()) { - case 1: - break; - case 2: - if (gate.qubits[0] > gate.qubits[1]) { - gate.swapped = true; - std::swap(gate.qubits[0], gate.qubits[1]); - if (!GateDef::symmetric) { - MatrixShuffle({1, 0}, 2, gate.matrix); - } - } - break; - default: - detail::SortQubits(gate); - } - } - - return gate; -} - -namespace gate { - -/** - * A gate that simulates measurement of one or more qubits, collapsing the - * state vector and storing the measured results. - */ -template -struct Measurement { - using GateKind = typename Gate::GateKind; - - static constexpr GateKind kind = GateKind::kMeasurement; - static constexpr char name[] = "m"; - static constexpr bool symmetric = false; - - template > - static Gate Create(unsigned time, Qubits&& qubits) { - return CreateGate(time, std::forward(qubits)); - } -}; - -} // namespace gate - -template -using schmidt_decomp_type = std::vector>>; - -template -schmidt_decomp_type GetSchmidtDecomp( - GateKind kind, const std::vector& params); - -} // namespace qsim - -#endif // GATE_H_ diff --git a/tpls/qsim/gate_appl.h b/tpls/qsim/gate_appl.h deleted file mode 100644 index 8601e6f..0000000 --- a/tpls/qsim/gate_appl.h +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef GATE_APPL_H_ -#define GATE_APPL_H_ - -#include -#include - -#include "fuser.h" -#include "gate.h" -#include "matrix.h" - -namespace qsim { - -/** - * Applies the given gate to the simulator state. Ignores measurement gates. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param state The state of the system, to be updated by this method. - */ -template -inline void ApplyGate(const Simulator& simulator, const Gate& gate, - typename Simulator::State& state) { - if (gate.kind != gate::kMeasurement) { - if (gate.controlled_by.size() == 0) { - simulator.ApplyGate(gate.qubits, gate.matrix.data(), state); - } else { - simulator.ApplyControlledGate(gate.qubits, gate.controlled_by, - gate.cmask, gate.matrix.data(), state); - } - } -} - -/** - * Applies the given gate dagger to the simulator state. If the gate matrix is - * unitary then this is equivalent to applying the inverse gate. Ignores - * measurement gates. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param state The state of the system, to be updated by this method. 
- */ -template -inline void ApplyGateDagger(const Simulator& simulator, const Gate& gate, - typename Simulator::State& state) { - if (gate.kind != gate::kMeasurement) { - auto matrix = gate.matrix; - MatrixDagger(unsigned{1} << gate.qubits.size(), matrix); - - if (gate.controlled_by.size() == 0) { - simulator.ApplyGate(gate.qubits, matrix.data(), state); - } else { - simulator.ApplyControlledGate(gate.qubits, gate.controlled_by, - gate.cmask, matrix.data(), state); - } - } -} - -/** - * Applies the given gate to the simulator state. - * @param state_space StateSpace object required to perform measurements. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param rgen Random number generator to perform measurements. - * @param state The state of the system, to be updated by this method. - * @param mresults As an input parameter, this can be empty or this can - * contain the results of the previous measurements. If gate is a measurement - * gate then after a successful run, the measurement result will be added to - * this. - * @return True if the measurement performed successfully; false otherwise. - */ -template -inline bool ApplyGate( - const typename Simulator::StateSpace& state_space, - const Simulator& simulator, const Gate& gate, Rgen& rgen, - typename Simulator::State& state, - std::vector& mresults) { - if (gate.kind == gate::kMeasurement) { - auto measure_result = state_space.Measure(gate.qubits, rgen, state); - if (measure_result.valid) { - mresults.push_back(std::move(measure_result)); - } else { - return false; - } - } else { - ApplyGate(simulator, gate, state); - } - - return true; -} - -/** - * Applies the given gate to the simulator state, discarding measurement - * results. - * @param state_space StateSpace object required to perform measurements. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. 
- * @param gate The gate to be applied. - * @param rgen Random number generator to perform measurements. - * @param state The state of the system, to be updated by this method. - * @return True if the measurement performed successfully; false otherwise. - */ -template -inline bool ApplyGate(const typename Simulator::StateSpace& state_space, - const Simulator& simulator, const Gate& gate, Rgen& rgen, - typename Simulator::State& state) { - using MeasurementResult = typename Simulator::StateSpace::MeasurementResult; - std::vector discarded_results; - return - ApplyGate(state_space, simulator, gate, rgen, state, discarded_results); -} - -/** - * Applies the given fused gate to the simulator state. Ignores measurement - * gates. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param state The state of the system, to be updated by this method. - */ -template -inline void ApplyFusedGate(const Simulator& simulator, const Gate& gate, - typename Simulator::State& state) { - if (gate.kind != gate::kMeasurement) { - if (gate.parent->controlled_by.size() == 0) { - simulator.ApplyGate(gate.qubits, gate.matrix.data(), state); - } else { - simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by, - gate.parent->cmask, gate.matrix.data(), - state); - } - } -} - -/** - * Applies the given fused gate dagger to the simulator state. If the gate - * matrix is unitary then this is equivalent to applying the inverse gate. - * Ignores measurement gates. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param state The state of the system, to be updated by this method. 
- */ -template -inline void ApplyFusedGateDagger(const Simulator& simulator, const Gate& gate, - typename Simulator::State& state) { - if (gate.kind != gate::kMeasurement) { - auto matrix = gate.matrix; - MatrixDagger(unsigned{1} << gate.qubits.size(), matrix); - - if (gate.parent->controlled_by.size() == 0) { - simulator.ApplyGate(gate.qubits, matrix.data(), state); - } else { - simulator.ApplyControlledGate(gate.qubits, gate.parent->controlled_by, - gate.parent->cmask, matrix.data(), state); - } - } -} - -/** - * Applies the given fused gate to the simulator state. - * @param state_space StateSpace object required to perform measurements. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param rgen Random number generator to perform measurements. - * @param state The state of the system, to be updated by this method. - * @param mresults As an input parameter, this can be empty or this can - * contain the results of the previous measurements. If gate is a measurement - * gate then after a successful run, the measurement result will be added to - * this. - * @return True if the measurement performed successfully; false otherwise. - */ -template -inline bool ApplyFusedGate( - const typename Simulator::StateSpace& state_space, - const Simulator& simulator, const Gate& gate, Rgen& rgen, - typename Simulator::State& state, - std::vector& mresults) { - if (gate.kind == gate::kMeasurement) { - auto measure_result = state_space.Measure(gate.qubits, rgen, state); - if (measure_result.valid) { - mresults.push_back(std::move(measure_result)); - } else { - return false; - } - } else { - ApplyFusedGate(simulator, gate, state); - } - - return true; -} - -/** - * Applies the given fused gate to the simulator state, discarding measurement - * results. - * @param state_space StateSpace object required to perform measurements. - * @param simulator Simulator object. 
Provides specific implementations for - * applying gates. - * @param gate The gate to be applied. - * @param rgen Random number generator to perform measurements. - * @param state The state of the system, to be updated by this method. - * @return True if the measurement performed successfully; false otherwise. - */ -template -inline bool ApplyFusedGate(const typename Simulator::StateSpace& state_space, - const Simulator& simulator, const Gate& gate, - Rgen& rgen, typename Simulator::State& state) { - using MeasurementResult = typename Simulator::StateSpace::MeasurementResult; - std::vector discarded_results; - return ApplyFusedGate( - state_space, simulator, gate, rgen, state, discarded_results); -} - -} // namespace qsim - -#endif // GATE_APPL_H_ diff --git a/tpls/qsim/gates_cirq.h b/tpls/qsim/gates_cirq.h deleted file mode 100644 index d767959..0000000 --- a/tpls/qsim/gates_cirq.h +++ /dev/null @@ -1,1640 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef GATES_CIRQ_H_ -#define GATES_CIRQ_H_ - -#include -#include -#include -#include - -#include "gate.h" -#include "matrix.h" - -namespace qsim { - -namespace Cirq { - -enum GateKind { - kI1 = 0, // One-qubit identity gate. - kI2, // Two-qubit identity gate. - kI, // Multi-qubit identity gate. 
- kXPowGate, - kYPowGate, - kZPowGate, - kHPowGate, - kCZPowGate, - kCXPowGate, - krx, - kry, - krz, - kH, - kS, - kCZ, - kCX, - kT, - kX, - kY, - kZ, - kPhasedXPowGate, - kPhasedXZGate, - kXXPowGate, - kYYPowGate, - kZZPowGate, - kXX, - kYY, - kZZ, - kSwapPowGate, - kISwapPowGate, - kriswap, - kSWAP, - kISWAP, - kPhasedISwapPowGate, - kgivens, - kFSimGate, - kTwoQubitDiagonalGate, - kThreeQubitDiagonalGate, - kCCZPowGate, - kCCXPowGate, - kCSwapGate, - kCCZ, - kCCX, - kMatrixGate1, // One-qubit matrix gate. - kMatrixGate2, // Two-qubit matrix gate. - kMatrixGate, // Multi-qubit matrix gate. - kGlobalPhaseGate, - kDecomp = gate::kDecomp, - kMeasurement = gate::kMeasurement, -}; - -template -using GateCirq = Gate; - -constexpr double h_double = 0.5; -constexpr double pi_double = 3.14159265358979323846264338327950288; -constexpr double is2_double = 0.7071067811865475; - -// Gates from cirq/ops/global_phase_op.py: - -/** - * The global phase gate. - */ -template -struct GlobalPhaseGate { - static constexpr GateKind kind = kGlobalPhaseGate; - static constexpr char name[] = "GlobalPhaseGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, fp_type phi) { - return Create(time, std::cos(phi), std::sin(phi)); - } - - static GateCirq Create(unsigned time, fp_type cp, fp_type sp) { - return CreateGate, GlobalPhaseGate>( - time, {}, {cp, sp}, {cp, sp}); - } -}; - -template -using global_phase_operation = GlobalPhaseGate; - -// Gates from cirq/ops/identity.py: - -/** - * A one-qubit identity gate. - */ -template -struct I1 { - static constexpr GateKind kind = kI1; - static constexpr char name[] = "I1"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, I1>( - time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0}); - } -}; - -/** - * A two-qubit identity gate. 
- */ -template -struct I2 { - static constexpr GateKind kind = kI2; - static constexpr char name[] = "I2"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, I2>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - }; - } -}; - -/** - * A multi-qubit identity gate. - */ -template -struct I { - static constexpr GateKind kind = kI; - static constexpr char name[] = "I"; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, - const std::vector& qubits) { - Matrix matrix; - MatrixIdentity(1 << qubits.size(), matrix); - return CreateGate, I>(time, qubits, std::move(matrix)); - } -}; - -// Gates form cirq/ops/common_gates.py: - -/** - * A gate that rotates around the X axis of the Bloch sphere. - * This is a generalization of the X gate. - */ -template -struct XPowGate { - static constexpr GateKind kind = kXPowGate; - static constexpr char name[] = "XPowGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); - - return CreateGate, XPowGate>( - time, {q0}, {c * gc, c * gs, s * gs, -s * gc, - s * gs, -s * gc, c * gc, c * gs}, - {exponent, global_shift}); - } -}; - -/** - * A gate that rotates around the Y axis of the Bloch sphere. - * This is a generalization of the Y gate. 
- */ -template -struct YPowGate { - static constexpr GateKind kind = kYPowGate; - static constexpr char name[] = "YPowGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); - - return CreateGate, YPowGate>( - time, {q0}, {c * gc, c * gs, -s * gc, -s * gs, - s * gc, s * gs, c * gc, c * gs}, {exponent, global_shift}); - } -}; - -/** - * A gate that rotates around the Z axis of the Bloch sphere. - * This is a generalization of the Z gate. - */ -template -struct ZPowGate { - static constexpr GateKind kind = kZPowGate; - static constexpr char name[] = "ZPowGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - - return CreateGate, ZPowGate>( - time, {q0}, {gc, gs, 0, 0, 0, 0, c * gc - s * gs, c * gs + s * gc}, - {exponent, global_shift}); - } -}; - -/** - * A gate that rotates around the X+Z axis of the Bloch sphere. - * This is a generalization of the Hadamard gate. 
- */ -template -struct HPowGate { - static constexpr GateKind kind = kHPowGate; - static constexpr char name[] = "HPowGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type gs = std::sin(pi * exponent * (0.5 + global_shift)); - - fp_type a = s * gs * is2; - fp_type b = s * gc * is2; - - return CreateGate, HPowGate>( - time, {q0}, {c * gc + a, c * gs - b, a, -b, - a, -b, c * gc - a, c * gs + b}, {exponent, global_shift}); - } -}; - -/** - * A gate that applies a phase to the |11⟩ state of two qubits. - * This is a generalization of the CZ gate. - */ -template -struct CZPowGate { - static constexpr GateKind kind = kCZPowGate; - static constexpr char name[] = "CZPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec = std::cos(pi * exponent * (1 + global_shift)); - fp_type es = std::sin(pi * exponent * (1 + global_shift)); - - return CreateGate, CZPowGate>( - time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, gc, gs, 0, 0, 0, 0, - 0, 0, 0, 0, gc, gs, 0, 0, - 0, 0, 0, 0, 0, 0, ec, es}, {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec 
= std::cos(pi * exponent * (1 + global_shift)); - fp_type es = std::sin(pi * exponent * (1 + global_shift)); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {gc, gs, 0, 0, 0, 0, ec, es}}, - }; - } -}; - -/** - * A gate that applies a controlled power of an X gate. - * This is a generalization of the CX (or CNOT) gate. - */ -template -struct CXPowGate { - static constexpr GateKind kind = kCXPowGate; - static constexpr char name[] = "CXPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); - - // Matrix is in this form because the simulator uses inverse qubit order. 
- return CreateGate, CXPowGate>( - time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, c * ec, c * es, 0, 0, s * es, -s * ec, - 0, 0, 0, 0, gc, gs, 0, 0, - 0, 0, s * es, -s * ec, 0, 0, c * ec, c * es}, - {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {gc, gs, 0, 0, 0, 0, gc, gs}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {c * ec, c * es, s * es, -s * ec, - s * es, -s * ec, c * ec, c * es}}, - }; - } -}; - -/** - * The `(exponent = phi/pi, global_shift = -0.5)` instance of XPowGate. - * This is a generalization of the X gate with a fixed global phase. - * This is a function in Cirq. - */ -template -struct rx { - static constexpr GateKind kind = krx; - static constexpr char name[] = "rx"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { - fp_type c = std::cos(-0.5 * phi); - fp_type s = std::sin(-0.5 * phi); - - return CreateGate, rx>( - time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi}); - } -}; - -/** - * The `(exponent = phi/pi, global_shift = -0.5)` instance of YPowGate. - * This is a generalization of the Y gate with a fixed global phase. - * This is a function in Cirq. 
- */ -template -struct ry { - static constexpr GateKind kind = kry; - static constexpr char name[] = "ry"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { - fp_type c = std::cos(-0.5 * phi); - fp_type s = std::sin(-0.5 * phi); - - return CreateGate, ry>( - time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi}); - } -}; - -/** - * The `(exponent = phi/pi, global_shift = -0.5)` instance of ZPowGate. - * This is a generalization of the Z gate with a fixed global phase. - * This is a function in Cirq. - */ -template -struct rz { - static constexpr GateKind kind = krz; - static constexpr char name[] = "rz"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, fp_type phi) { - fp_type c = std::cos(-0.5 * phi); - fp_type s = std::sin(-0.5 * phi); - - return CreateGate, rz>( - time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of HPowGate. - * This is the canonical Hadamard (or H) gate. - */ -template -struct H { - static constexpr GateKind kind = kH; - static constexpr char name[] = "H"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, H>( - time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0}); - } -}; - -/** - * The `(exponent = 0.5, global_shift = 0)` instance of ZPowGate. - * This is the canonical S gate. 
- */ -template -struct S { - static constexpr GateKind kind = kS; - static constexpr char name[] = "S"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, S>( - time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1}); - } -}; - -/** - * The `(exponent = 0.25, global_shift = 0)` instance of ZPowGate. - * This is the canonical T gate. - */ -template -struct T { - static constexpr GateKind kind = kT; - static constexpr char name[] = "T"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, T>( - time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of CZPowGate. - * This is the canonical CZ gate. - */ -template -struct CZ { - static constexpr GateKind kind = kCZ; - static constexpr char name[] = "CZ"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, CZ>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, -1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}}, - }; - } -}; - -template -using CNotPowGate = CXPowGate; - -/** - * The `(exponent = 1, global_shift = 0)` instance of CXPowGate. - * This is the canonical CX (or CNOT) gate. 
- */ -template -struct CX { - static constexpr GateKind kind = kCX; - static constexpr char name[] = "kCX"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, CX>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}}, - }; - } -}; - -template -using CNOT = CX; - -// Gates from cirq/ops/pauli_gates.py: - -/** - * The `(exponent = 1, global_shift = 0)` instance of XPowGate. - * This is the canonical Pauli X gate. - */ -template -struct X : public XPowGate { - static constexpr GateKind kind = kX; - static constexpr char name[] = "X"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, X>( - time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of YPowGate. - * This is the canonical Pauli Y gate. - */ -template -struct Y : public YPowGate { - static constexpr GateKind kind = kY; - static constexpr char name[] = "Y"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, Y>( - time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of ZPowGate. - * This is the canonical Pauli Z gate. 
- */ -template -struct Z : public ZPowGate { - static constexpr GateKind kind = kZ; - static constexpr char name[] = "Z"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0) { - return CreateGate, Z>( - time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0}); - } -}; - -// Gates from cirq/ops/phased_x_gate.py: - -/** - * An XPowGate conjugated by ZPowGate%s. - * Equivalent to the circuit `───Z^-p───X^t───Z^p───`. - */ -template -struct PhasedXPowGate { - static constexpr GateKind kind = kPhasedXPowGate; - static constexpr char name[] = "PhasedXPowGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type phase_exponent, fp_type exponent = 1, - fp_type global_shift = 0) { - fp_type pc = std::cos(pi * phase_exponent); - fp_type ps = std::sin(pi * phase_exponent); - fp_type ec = std::cos(pi * exponent); - fp_type es = std::sin(pi * exponent); - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - - fp_type ar = 0.5 * ((1 + ec) * gc - es * gs); - fp_type ai = 0.5 * ((1 + ec) * gs + es * gc); - fp_type br = -0.5 * ((-1 + ec) * gc - es * gs); - fp_type bi = -0.5 * ((-1 + ec) * gs + es * gc); - - return CreateGate, PhasedXPowGate>( - time, {q0}, {ar, ai, pc * br + ps * bi, pc * bi - ps * br, - pc * br - ps * bi, pc * bi + ps * br, ar, ai}, - {phase_exponent, exponent, global_shift}); - } -}; - -// Gates from cirq/ops/phased_x_z_gate.py: - -/** - * A PhasedXPowGate followed by a ZPowGate. - * Equivalent to the circuit `───Z^(-a)──X^x──Z^a───Z^z───`. 
- */ -template -struct PhasedXZGate { - static constexpr GateKind kind = kPhasedXZGate; - static constexpr char name[] = "PhasedXZGate"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, - fp_type x_exponent, fp_type z_exponent, - fp_type axis_phase_exponent) { - fp_type xc = std::cos(pi * x_exponent); - fp_type xs = std::sin(pi * x_exponent); - fp_type zc = std::cos(pi * z_exponent); - fp_type zs = std::sin(pi * z_exponent); - fp_type ac = std::cos(pi * axis_phase_exponent); - fp_type as = std::sin(pi * axis_phase_exponent); - - fp_type br = 0.5 * (1 + xc); - fp_type bi = 0.5 * xs; - fp_type cr = -0.5 * (-1 + xc); - fp_type ci = -0.5 * xs; - fp_type dr = ac * zc - as * zs; - fp_type di = ac * zs + as * zc; - - return CreateGate, PhasedXZGate>( - time, {q0}, {br, bi, ac * cr + as * ci, ac * ci - as * cr, - dr * cr - di * ci, dr * ci + di * cr, - zc * br - zs * bi, zc * bi + zs * br}, - {x_exponent, z_exponent, axis_phase_exponent}); - } -}; - -// Gates from cirq/ops/parity_gates.py: - -/** - * The tensor product of two X gates, possibly raised to an exponent. 
- */ -template -struct XXPowGate { - static constexpr GateKind kind = kXXPowGate; - static constexpr char name[] = "XXPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type ic = 0.5 * ((1 + c) * gc - s * gs); - fp_type is = 0.5 * ((1 + c) * gs + s * gc); - fp_type xc = 0.5 * ((1 - c) * gc + s * gs); - fp_type xs = 0.5 * ((1 - c) * gs - s * gc); - - return CreateGate, XXPowGate>( - time, {q0, q1}, {ic, is, 0, 0, 0, 0, xc, xs, - 0, 0, ic, is, xc, xs, 0, 0, - 0, 0, xc, xs, ic, is, 0, 0, - xc, xs, 0, 0, 0, 0, ic, is}, {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type ic = 0.5 * ((1 + c) * gc - s * gs); - fp_type is = 0.5 * ((1 + c) * gs + s * gc); - fp_type xc = 0.5 * ((1 - c) * gc + s * gs); - fp_type xs = 0.5 * ((1 - c) * gs - s * gc); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, - {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, xc, xs, xc, xs, 0, 0}}, - }; - } -}; - -/** - * The tensor product of two Y gates, possibly raised to an exponent. 
- */ -template -struct YYPowGate { - static constexpr GateKind kind = kYYPowGate; - static constexpr char name[] = "YYPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type ic = 0.5 * ((1 + c) * gc - s * gs); - fp_type is = 0.5 * ((1 + c) * gs + s * gc); - fp_type yc = 0.5 * ((1 - c) * gc + s * gs); - fp_type ys = 0.5 * ((1 - c) * gs - s * gc); - - return CreateGate, YYPowGate>( - time, {q0, q1}, {ic, is, 0, 0, 0, 0, -yc, -ys, - 0, 0, ic, is, yc, ys, 0, 0, - 0, 0, yc, ys, ic, is, 0, 0, - -yc, -ys, 0, 0, 0, 0, ic, is}, - {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type ic = 0.5 * ((1 + c) * gc - s * gs); - fp_type is = 0.5 * ((1 + c) * gs + s * gc); - fp_type yc = 0.5 * ((1 - c) * gc + s * gs); - fp_type ys = 0.5 * ((1 - c) * gs - s * gc); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, - {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, ys, -yc, -ys, yc, 0, 0}}, - }; - } -}; - -/** - * The tensor product of two Z gates, possibly raised to an exponent. 
- */ -template -struct ZZPowGate { - static constexpr GateKind kind = kZZPowGate; - static constexpr char name[] = "ZZPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type zc = std::cos(pi * exponent * (1 + global_shift)); - fp_type zs = std::sin(pi * exponent * (1 + global_shift)); - - return CreateGate, ZZPowGate>( - time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, zc, zs, 0, 0, 0, 0, - 0, 0, 0, 0, zc, zs, 0, 0, - 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent); - fp_type s = std::sin(pi * exponent); - fp_type ic = 0.5 * ((1 + c) * gc - s * gs); - fp_type is = 0.5 * ((1 + c) * gs + s * gc); - fp_type zc = 0.5 * ((1 - c) * gc + s * gs); - fp_type zs = 0.5 * ((1 - c) * gs - s * gc); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 1, 0}, {ic, is, 0, 0, 0, 0, ic, is}}, - {{1, 0, 0, 0, 0, 0, -1, 0}, {zc, zs, 0, 0, 0, 0, -zc, -zs}}, - }; - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of XXPowGate. - * This is the tensor product of two X gates. 
- */ -template -struct XX { - static constexpr GateKind kind = kXX; - static constexpr char name[] = "XX"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, XX>( - time, {q0, q1}, {0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 1, 0, 0, 0, 0, 0, 0, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{0, 0, 1, 0, 1, 0, 0, 0}, {0, 0, 1, 0, 1, 0, 0, 0}}, - }; - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of YYPowGate. - * This is the tensor product of two Y gates. - */ -template -struct YY { - static constexpr GateKind kind = kYY; - static constexpr char name[] = "YY"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, YY>( - time, {q0, q1}, {0, 0, 0, 0, 0, 0, -1, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - -1, 0, 0, 0, 0, 0, 0, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{0, 0, 0, -1, 0, 1, 0, 0}, {0, 0, 0, -1, 0, 1, 0, 0}}, - }; - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of ZZPowGate. - * This is the tensor product of two Z gates. 
- */ -template -struct ZZ { - static constexpr GateKind kind = kZZ; - static constexpr char name[] = "ZZ"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, ZZ>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, -1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, -1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, -1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}}, - }; - } -}; - -// Gates from cirq/ops/swap_gates.py: - -/** - * The SWAP gate, possibly raised to a power. Exchanges qubits. - */ -template -struct SwapPowGate { - static constexpr GateKind kind = kSwapPowGate; - static constexpr char name[] = "SwapPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type h = static_cast(h_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); - - return CreateGate, SwapPowGate>( - time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, c * ec, c * es, s * es, -s * ec, 0, 0, - 0, 0, s * es, -s * ec, c * ec, c * es, 0, 0, - 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type ec = 
std::cos(pi * exponent * (0.5 + global_shift)); - fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); - - return schmidt_decomp_type{ - {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * ec, gs + c * es, 0, 0, - 0, 0, gc + c * ec, gs + c * es}}, - {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * es, -s * ec, - s * es, -s * ec, 0, 0}}, - {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, -s * ec, -s * es, - s * ec, s * es, 0, 0}}, - {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * ec, gs - c * es, 0, 0, - 0, 0, -gc + c * ec, -gs + c * es}}, - }; - } -}; - -/** - * Rotates the |01⟩ vs |10⟩ subspace of two qubits around its Bloch X-axis. - * This is a generalization of the ISWAP gate. - */ -template -struct ISwapPowGate { - static constexpr GateKind kind = kISwapPowGate; - static constexpr char name[] = "ISwapPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type h = static_cast(h_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - - return CreateGate, ISwapPowGate>( - time, {q0, q1}, {gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, c * gc, c * gs, -s * gs, s * gc, 0, 0, - 0, 0, -s * gs, s * gc, c * gc, c * gs, 0, 0, - 0, 0, 0, 0, 0, 0, gc, gs}, {exponent, global_shift}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type exponent, fp_type global_shift) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - - return schmidt_decomp_type{ - {{h, 0, 0, 0, 0, 0, h, 0}, {gc + c * gc, gs + c * gs, 0, 0, - 0, 0, gc + c * gc, gs + c * gs}}, - {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, -s * 
gs, s * gc, - -s * gs, s * gc, 0, 0}}, - {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * gc, s * gs, - -s * gc, -s * gs, 0, 0}}, - {{h, 0, 0, 0, 0, 0, -h, 0}, {gc - c * gc, gs - c * gs, 0, 0, - 0, 0, -gc + c * gc, -gs + c * gs}}, - }; - } -}; - -/** - * The `(exponent = 2*phi/pi, global_shift = 0)` instance of ISwapPowGate. - * This is a generalization of the ISWAP gate with a fixed global phase of zero. - * This is a function in Cirq. - */ -template -struct riswap { - static constexpr GateKind kind = kriswap; - static constexpr char name[] = "riswap"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type h = static_cast(h_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type phi) { - fp_type c = std::cos(phi); - fp_type s = std::sin(phi); - - return CreateGate, riswap>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, c, 0, 0, s, 0, 0, - 0, 0, 0, s, c, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}, {phi}); - } - - static schmidt_decomp_type SchmidtDecomp(fp_type phi) { - fp_type c = std::cos(phi); - fp_type s = std::sin(phi); - - return schmidt_decomp_type{ - {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}}, - {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, 0, s, 0, s, 0, 0}}, - {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}}, - {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}}, - }; - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of SwapPowGate. - * This is the canonical SWAP gate. 
- */ -template -struct SWAP { - static constexpr GateKind kind = kSWAP; - static constexpr char name[] = "SWAP"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, SWAP>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, - {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}}, - {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}}, - {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, - }; - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of ISwapPowGate. - * This is the canonical ISWAP gate. - */ -template -struct ISWAP { - static constexpr GateKind kind = kISWAP; - static constexpr char name[] = "ISWAP"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type h = static_cast(h_double); - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, ISWAP>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, - {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}}, - {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}}, - {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, - }; - } -}; - -// Gates from cirq/ops/phased_iswap_gate.py: - -/** - * An ISwapPowGate conjugated by ZPowGate%s. 
- * Equivalent to the composition `(Z^-p ⊗ Z^p) ISWAP^t (Z^p ⊗ Z^-p)`. - */ -template -struct PhasedISwapPowGate { - static constexpr GateKind kind = kPhasedISwapPowGate; - static constexpr char name[] = "PhasedISwapPowGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type h = static_cast(h_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type phase_exponent = 0.25, - fp_type exponent = 1.0) { - fp_type fc = std::cos(2 * pi * phase_exponent); - fp_type fs = std::sin(2 * pi * phase_exponent); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, PhasedISwapPowGate>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, c, 0, s * fs, s * fc, 0, 0, - 0, 0, -s * fs, s * fc, c, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}, {phase_exponent, exponent}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type phase_exponent, fp_type exponent) { - fp_type fc = std::cos(2 * pi * phase_exponent); - fp_type fs = std::sin(2 * pi * phase_exponent); - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - - return schmidt_decomp_type{ - {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}}, - {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s * fs, s * fc, -s * fs, s * fc, 0, 0}}, - {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, s * fc, -s * fs, - -s * fc, -s * fs, 0, 0}}, - {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}}, - }; - } -}; - -/** - * The `(phase_exponent = 0.25, exponent = 2*phi/pi)` instance of - * PhasedISwapPowGate. - * This is the "Givens rotation" from numerical linear algebra. - * This is a function in Cirq. 
- */ -template -struct givens { - static constexpr GateKind kind = kgivens; - static constexpr char name[] = "givens"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - static constexpr fp_type h = static_cast(h_double); - - static GateCirq Create(unsigned time, unsigned q0, unsigned q1, - fp_type phi) { - fp_type c = std::cos(phi); - fp_type s = std::sin(phi); - - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, givens>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, c, 0, s, 0, 0, 0, - 0, 0, -s, 0, c, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}, {phi}); - } - - static schmidt_decomp_type SchmidtDecomp(fp_type phi) { - fp_type c = std::cos(phi); - fp_type s = std::sin(phi); - - return schmidt_decomp_type{ - {{h, 0, 0, 0, 0, 0, h, 0}, {1 + c, 0, 0, 0, 0, 0, 1 + c, 0}}, - {{0, 0, h, 0, h, 0, 0, 0}, {0, 0, s, 0, -s, 0, 0, 0}}, - {{0, 0, 0, -h, 0, h, 0, 0}, {0, 0, 0, -s, 0, -s, 0, 0}}, - {{h, 0, 0, 0, 0, 0, -h, 0}, {1 - c, 0, 0, 0, 0, 0, -1 + c, 0}}, - }; - } -}; - -// Gates from cirq/ops/fsim_gate.py: - -/** - * The fermionic simulation gate family. Contains all two-qubit interactions - * that preserve excitations, up to single-qubit rotations and global phase. 
- */ -template -struct FSimGate { - static constexpr GateKind kind = kFSimGate; - static constexpr char name[] = "FSimGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateCirq Create( - unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) { - if (phi < 0) { - phi += 2 * 3.141592653589793; - } - - fp_type ct = std::cos(theta); - fp_type st = std::sin(theta); - fp_type cp = std::cos(phi); - fp_type sp = std::sin(phi); - - return CreateGate, FSimGate>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, ct, 0, 0, -st, 0, 0, - 0, 0, 0, -st, ct, 0, 0, 0, - 0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type theta, fp_type phi) { - fp_type ct = std::cos(theta); - fp_type st = std::sin(theta); - - fp_type cp2 = std::cos(0.5 * phi); - fp_type sp2 = std::sin(0.5 * phi); - fp_type cp4 = std::cos(0.25 * phi); - fp_type sp4 = std::sin(0.25 * phi); - - fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct)); - fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct)); - - fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct); - fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct); - - fp_type c0 = is2 * a0 * std::cos(p0); - fp_type s0 = is2 * a0 * std::sin(p0); - - fp_type c1 = is2 * a1 * std::cos(p1); - fp_type s1 = is2 * a1 * std::sin(p1); - - fp_type st2 = 0.5 * std::sqrt(st); - - fp_type a = cp4 * c0 - sp4 * s0; - fp_type b = cp4 * s0 + sp4 * c0; - fp_type c = cp4 * c0 + sp4 * s0; - fp_type d = cp4 * s0 - sp4 * c0; - - fp_type e = cp4 * c1 - sp4 * s1; - fp_type f = cp4 * s1 + sp4 * c1; - fp_type g = -(cp4 * c1 + sp4 * s1); - fp_type h = -(cp4 * s1 - sp4 * c1); - - return schmidt_decomp_type{ - {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}}, - {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}}, - {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}}, - 
{{e, f, 0, 0, 0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}}, - }; - } -}; - -// Gates from cirq/ops/two_qubit_diagonal_gate.py: - -/** - * A two-qubit diagonal gate. - */ -template -struct TwoQubitDiagonalGate { - static constexpr GateKind kind = kTwoQubitDiagonalGate; - static constexpr char name[] = "TwoQubitDiagonalGate"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, - const std::vector& angles) { - std::vector cs; - std::vector ss; - cs.reserve(4); - ss.reserve(4); - - for (std::size_t i = 0; i < angles.size(); ++i) { - cs.push_back(std::cos(angles[i])); - ss.push_back(std::sin(angles[i])); - } - - for (std::size_t i = angles.size(); i < 4; ++i) { - cs.push_back(1); - ss.push_back(0); - } - - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, TwoQubitDiagonalGate>( - time, {q0, q1}, {cs[0], ss[0], 0, 0, 0, 0, 0, 0, - 0, 0, cs[2], ss[2], 0, 0, 0, 0, - 0, 0, 0, 0, cs[1], ss[1], 0, 0, - 0, 0, 0, 0, 0, 0, cs[3], ss[3]}); - } -}; - -// Gates from cirq/ops/three_qubit_gates.py: - -/** - * A three-qubit diagonal gate. 
- */ -template -struct ThreeQubitDiagonalGate { - static constexpr GateKind kind = kThreeQubitDiagonalGate; - static constexpr char name[] = "ThreeQubitDiagonalGate"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2, - const std::vector& angles) { - std::vector cs; - std::vector ss; - cs.reserve(8); - ss.reserve(8); - - for (std::size_t i = 0; i < angles.size(); ++i) { - cs.push_back(std::cos(angles[i])); - ss.push_back(std::sin(angles[i])); - } - - for (std::size_t i = angles.size(); i < 8; ++i) { - cs.push_back(1); - ss.push_back(0); - } - - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, ThreeQubitDiagonalGate>( - time, {q0, q1, q2}, - {cs[0], ss[0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, cs[4], ss[4], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, cs[2], ss[2], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, cs[6], ss[6], 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, cs[1], ss[1], 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[5], ss[5], 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[3], ss[3], 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, cs[7], ss[7]}); - } -}; - -/** - * A gate that applies a phase to the |111⟩ state of three qubits. - * This is a generalization of the CCZ gate. 
- */ -template -struct CCZPowGate { - static constexpr GateKind kind = kCCZPowGate; - static constexpr char name[] = "CCZPowGate"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2, - fp_type exponent, fp_type global_shift = 0) { - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec = std::cos(pi * exponent * (1 + global_shift)); - fp_type es = std::sin(pi * exponent * (1 + global_shift)); - - return CreateGate, CCZPowGate>( - time, {q0, q1, q2}, {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ec, es}, - {exponent, global_shift}); - } -}; - -/** - * A gate that applies a doubly-controlled power of an X gate. - * This is a generalization of the CCX (or CCNOT) gate. 
- */ -template -struct CCXPowGate { - static constexpr GateKind kind = kCCXPowGate; - static constexpr char name[] = "CCXPowGate"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2, - fp_type exponent, fp_type global_shift = 0) { - fp_type c = std::cos(pi * exponent * 0.5); - fp_type s = std::sin(pi * exponent * 0.5); - fp_type gc = std::cos(pi * exponent * global_shift); - fp_type gs = std::sin(pi * exponent * global_shift); - fp_type ec = std::cos(pi * exponent * (0.5 + global_shift)); - fp_type es = std::sin(pi * exponent * (0.5 + global_shift)); - - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, CCXPowGate>( - time, {q0, q1, q2}, - {gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, c * ec, c * es, 0, 0, 0, 0, 0, 0, s * es, -s * ec, - 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, gc, gs, 0, 0, - 0, 0, 0, 0, 0, 0, s * es, -s * ec, 0, 0, 0, 0, 0, 0, c * ec, c * es}, - {exponent, global_shift}); - } -}; - -/** - * A controlled swap gate (the Fredkin gate). - */ -template -struct CSwapGate { - static constexpr GateKind kind = kCSwapGate; - static constexpr char name[] = "CSwapGate"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2) { - // Matrix is in this form because the simulator uses inverse qubit order. 
- return CreateGate, CSwapGate>( - time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of CCZPowGate. - * This is the canonical doubly-controlled Z gate. - */ -template -struct CCZ { - static constexpr GateKind kind = kCCZ; - static constexpr char name[] = "CCZ"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = true; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2) { - return CreateGate, CCZ>( - time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0}); - } -}; - -/** - * The `(exponent = 1, global_shift = 0)` instance of CCXPowGate. - * This is the canonical doubly-controlled X gate (the TOFFOLI gate). - */ -template -struct CCX { - static constexpr GateKind kind = kCCX; - static constexpr char name[] = "CCX"; - static constexpr unsigned num_qubits = 3; - static constexpr bool symmetric = false; - - static constexpr fp_type pi = static_cast(pi_double); - - static GateCirq Create(unsigned time, - unsigned q0, unsigned q1, unsigned q2) { - // Matrix is in this form because the simulator uses inverse qubit order. 
- return CreateGate, CCX>( - time, {q0, q1, q2}, {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - } -}; - -template -using CCNotPowGate = CCXPowGate; - -template -using TOFFOLI = CCX; - -template -using CCNOT = CCX; - -template -using CSWAP = CSwapGate; - -template -using FREDKIN = CSwapGate; - -// Gates from cirq/ops/matrix_gates.py: - -/** - * A one-qubit gate defined entirely by its matrix. - */ -template -struct MatrixGate1 { - static constexpr GateKind kind = kMatrixGate1; - static constexpr char name[] = "MatrixGate1"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateCirq Create(unsigned time, unsigned q0, - const Matrix& m) { - auto m2 = m; - return - CreateGate, MatrixGate1>(time, {q0}, std::move(m2)); - } -}; - -/** - * A two-qubit gate defined entirely by its matrix. - */ -template -struct MatrixGate2 { - static constexpr GateKind kind = kMatrixGate2; - static constexpr char name[] = "MatrixGate2"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - template > - static GateCirq Create( - unsigned time, unsigned q0, unsigned q1, M&& m) { - return CreateGate, MatrixGate2>(time, {q1, q0}, - std::forward(m)); - } -}; - -/** - * A multi-qubit gate defined entirely by its matrix. 
- */ -template -struct MatrixGate { - static constexpr GateKind kind = kMatrixGate; - static constexpr char name[] = "MatrixGate"; - static constexpr bool symmetric = false; - - template > - static GateCirq Create(unsigned time, - std::vector qubits, M&& m) { - std::reverse(qubits.begin(), qubits.end()); - return CreateGate, MatrixGate>(time, std::move(qubits), - std::forward(m)); - } -}; - -} // namesapce Cirq - -template -inline schmidt_decomp_type GetSchmidtDecomp( - Cirq::GateKind kind, const std::vector& params) { - switch (kind) { - case Cirq::kI2: - return Cirq::I2::SchmidtDecomp(); - case Cirq::kCZPowGate: - return Cirq::CZPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kCXPowGate: - return Cirq::CXPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kCZ: - return Cirq::CZ::SchmidtDecomp(); - case Cirq::kCX: - return Cirq::CX::SchmidtDecomp(); - case Cirq::kXXPowGate: - return Cirq::XXPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kYYPowGate: - return Cirq::YYPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kZZPowGate: - return Cirq::ZZPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kXX: - return Cirq::XX::SchmidtDecomp(); - case Cirq::kYY: - return Cirq::YY::SchmidtDecomp(); - case Cirq::kZZ: - return Cirq::ZZ::SchmidtDecomp(); - case Cirq::kSwapPowGate: - return Cirq::SwapPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kISwapPowGate: - return Cirq::ISwapPowGate::SchmidtDecomp(params[0], params[1]); - case Cirq::kriswap: - return Cirq::riswap::SchmidtDecomp(params[0]); - case Cirq::kSWAP: - return Cirq::SWAP::SchmidtDecomp(); - case Cirq::kISWAP: - return Cirq::ISWAP::SchmidtDecomp(); - case Cirq::kPhasedISwapPowGate: - return Cirq::PhasedISwapPowGate::SchmidtDecomp( - params[0], params[1]); - case Cirq::kgivens: - return Cirq::givens::SchmidtDecomp(params[0]); - case Cirq::kFSimGate: - return Cirq::FSimGate::SchmidtDecomp(params[0], params[1]); - default: - // Single qubit gates of gates with 
unimplemented Schmidt decomposition. - return schmidt_decomp_type{}; - } -} - -} // namespace qsim - -#endif // GATES_CIRQ_H_ diff --git a/tpls/qsim/gates_qsim.h b/tpls/qsim/gates_qsim.h deleted file mode 100644 index 366c4f1..0000000 --- a/tpls/qsim/gates_qsim.h +++ /dev/null @@ -1,661 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef GATES_QSIM_H_ -#define GATES_QSIM_H_ - -#include -#include -#include - -#include "gate.h" - -namespace qsim { - -// Gate set implemented in qsim contains the following gates. -enum GateKind { - kGateId1 = 0, // one-qubit Id - kGateHd, // Hadamard - kGateT, // T - kGateX, // X - kGateY, // Y - kGateZ, // Z - kGateX2, // sqrt(X) - kGateY2, // sqrt(Y) - kGateRX, // X-rotation - kGateRY, // Y-rotation - kGateRZ, // Z-rotation - kGateRXY, // XY-rotation (rotation around arbitrary axis in the XY plane) - kGateHZ2, // pi / 2 rotation around the X + Y axis - kGateS, // S - kGateId2, // two-qubit Id - kGateCZ, // CZ - kGateCNot, // CNOT (CX) - kGateSwap, // swap - kGateIS, // iSwap - kGateFS, // fSim - kGateCP, // control phase - kGateMatrix1, // one-qubit matrix gate - kGateMatrix2, // two-qubit matrix gate - kGateGPh, // global phase gate - kDecomp = gate::kDecomp, - kMeasurement = gate::kMeasurement, -}; - -// Specialization of Gate (defined in gate.h) for the qsim gate set. 
-template -using GateQSim = Gate; - -constexpr double h_double = 0.5; -constexpr double is2_double = 0.7071067811865475; - -// Zero-qubit gates: - -/** - * The global phase gate. - */ -template -struct GateGPh { - static constexpr GateKind kind = kGateGPh; - static constexpr char name[] = "p"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, fp_type phi) { - return Create(time, std::cos(phi), std::sin(phi)); - } - - static GateQSim Create(unsigned time, fp_type cp, fp_type sp) { - return CreateGate, GateGPh>( - time, {}, {cp, sp}, {cp, sp}); - } -}; - -// One-qubit gates: - -/** - * The one-qubit identity gate. - */ -template -struct GateId1 { - static constexpr GateKind kind = kGateId1; - static constexpr char name[] = "id1"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateId1>( - time, {q0}, {1, 0, 0, 0, 0, 0, 1, 0}); - } -}; - -/** - * The Hadamard gate. - */ -template -struct GateHd { - static constexpr GateKind kind = kGateHd; - static constexpr char name[] = "h"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateHd>( - time, {q0}, {is2, 0, is2, 0, is2, 0, -is2, 0}); - } -}; - -/** - * The T gate, equivalent to `Z ^ 0.25`. - */ -template -struct GateT { - static constexpr GateKind kind = kGateT; - static constexpr char name[] = "t"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateT>( - time, {q0}, {1, 0, 0, 0, 0, 0, is2, is2}); - } -}; - -/** - * The Pauli X (or "NOT") gate. 
- */ -template -struct GateX { - static constexpr GateKind kind = kGateX; - static constexpr char name[] = "x"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateX>( - time, {q0}, {0, 0, 1, 0, 1, 0, 0, 0}); - } -}; - -/** - * The Pauli Y gate. - */ -template -struct GateY { - static constexpr GateKind kind = kGateY; - static constexpr char name[] = "y"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateY>( - time, {q0}, {0, 0, 0, -1, 0, 1, 0, 0}); - } -}; - -/** - * The Pauli Z gate. - */ -template -struct GateZ { - static constexpr GateKind kind = kGateZ; - static constexpr char name[] = "z"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateZ>( - time, {q0}, {1, 0, 0, 0, 0, 0, -1, 0}); - } -}; - -/** - * The "square root of X" gate. - */ -template -struct GateX2 { - static constexpr GateKind kind = kGateX2; - static constexpr char name[] = "x_1_2"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type h = static_cast(h_double); - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateX2>( - time, {q0}, {h, h, h, -h, h, -h, h, h}); - } -}; - -/** - * The "square root of Y" gate. 
- */ -template -struct GateY2 { - static constexpr GateKind kind = kGateY2; - static constexpr char name[] = "y_1_2"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type h = static_cast(h_double); - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateY2>( - time, {q0}, {h, h, -h, -h, h, h, h, h}); - } -}; - -/** - * A gate that rotates around the X axis of the Bloch sphere. - * This is a generalization of the X gate. - */ -template -struct GateRX { - static constexpr GateKind kind = kGateRX; - static constexpr char name[] = "rx"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, fp_type phi) { - fp_type phi2 = -0.5 * phi; - fp_type c = std::cos(phi2); - fp_type s = std::sin(phi2); - - return CreateGate, GateRX>( - time, {q0}, {c, 0, 0, s, 0, s, c, 0}, {phi}); - } -}; - -/** - * A gate that rotates around the Y axis of the Bloch sphere. - * This is a generalization of the Y gate. - */ -template -struct GateRY { - static constexpr GateKind kind = kGateRY; - static constexpr char name[] = "ry"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, fp_type phi) { - fp_type phi2 = -0.5 * phi; - fp_type c = std::cos(phi2); - fp_type s = std::sin(phi2); - - return CreateGate, GateRY>( - time, {q0}, {c, 0, s, 0, -s, 0, c, 0}, {phi}); - } -}; - -/** - * A gate that rotates around the Z axis of the Bloch sphere. - * This is a generalization of the Z gate. 
- */ -template -struct GateRZ { - static constexpr GateKind kind = kGateRZ; - static constexpr char name[] = "rz"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, fp_type phi) { - fp_type phi2 = -0.5 * phi; - fp_type c = std::cos(phi2); - fp_type s = std::sin(phi2); - - return CreateGate, GateRZ>( - time, {q0}, {c, s, 0, 0, 0, 0, c, -s}, {phi}); - } -}; - -/** - * A gate that rotates around an arbitrary axis in the XY-plane. - */ -template -struct GateRXY { - static constexpr GateKind kind = kGateRXY; - static constexpr char name[] = "rxy"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create( - unsigned time, unsigned q0, fp_type theta, fp_type phi) { - fp_type phi2 = -0.5 * phi; - fp_type cp = std::cos(phi2); - fp_type sp = std::sin(phi2); - fp_type ct = std::cos(theta) * sp; - fp_type st = std::sin(theta) * sp; - - return CreateGate, GateRXY>( - time, {q0}, {cp, 0, st, ct, -st, ct, cp, 0}, {theta, phi}); - } -}; - -/** - * A pi / 2 rotation around the X + Y axis. - */ -template -struct GateHZ2 { - static constexpr GateKind kind = kGateHZ2; - static constexpr char name[] = "hz_1_2"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static constexpr fp_type h = static_cast(h_double); - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateHZ2>( - time, {q0}, {h, h, 0, -is2, is2, 0, h, h}); - } -}; - -/** - * The S gate, equivalent to "square root of Z". 
- */ -template -struct GateS { - static constexpr GateKind kind = kGateS; - static constexpr char name[] = "s"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0) { - return CreateGate, GateS>( - time, {q0}, {1, 0, 0, 0, 0, 0, 0, 1}); - } -}; - -/** - * A one-qubit gate defined entirely by its matrix. - */ -template -struct GateMatrix1 { - static constexpr GateKind kind = kGateMatrix1; - static constexpr char name[] = "mat1"; - static constexpr unsigned num_qubits = 1; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, - const Matrix& m) { - auto m2 = m; - return - CreateGate, GateMatrix1>(time, {q0}, std::move(m2)); - } -}; - -// Two-qubit gates: - -/** - * The two-qubit identity gate. - */ -template -struct GateId2 { - static constexpr GateKind kind = kGateId2; - static constexpr char name[] = "id2"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, GateId2>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - }; - } -}; - -/** - * The controlled-Z (CZ) gate. 
- */ -template -struct GateCZ { - static constexpr GateKind kind = kGateCZ; - static constexpr char name[] = "cz"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, GateCZ>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, -1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, -1, 0}}, - }; - } -}; - -/** - * The controlled-X (CX or CNOT) gate. - */ -template -struct GateCNot { - static constexpr GateKind kind = kGateCNot; - static constexpr char name[] = "cnot"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { - // Matrix is in this form because the simulator uses inverse qubit order. - return CreateGate, GateCNot>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {0, 0, 1, 0, 1, 0, 0, 0}}, - }; - } -}; - -/** - * The SWAP gate. Exchanges two qubits. 
- */ -template -struct GateSwap { - static constexpr GateKind kind = kGateSwap; - static constexpr char name[] = "sw"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, GateSwap>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, - {{0, 0, is2, 0, is2, 0, 0, 0}, {0, 0, is2, 0, is2, 0, 0, 0}}, - {{0, 0, 0, -is2, 0, is2, 0, 0}, {0, 0, 0, -is2, 0, is2, 0, 0}}, - {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, - }; - } -}; - -/** - * The ISWAP gate. - */ -template -struct GateIS { - static constexpr GateKind kind = kGateIS; - static constexpr char name[] = "is"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type h = static_cast(h_double); - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create(unsigned time, unsigned q0, unsigned q1) { - return CreateGate, GateIS>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0}); - } - - static schmidt_decomp_type SchmidtDecomp() { - return schmidt_decomp_type{ - {{is2, 0, 0, 0, 0, 0, is2, 0}, {is2, 0, 0, 0, 0, 0, is2, 0}}, - {{0, 0, h, h, h, h, 0, 0}, {0, 0, h, h, h, h, 0, 0}}, - {{0, 0, h, -h, -h, h, 0, 0}, {0, 0, h, -h, -h, h, 0, 0}}, - {{is2, 0, 0, 0, 0, 0, -is2, 0}, {is2, 0, 0, 0, 0, 0, -is2, 0}}, - }; - } -}; - -/** - * The fermionic simulation (FSim) gate family. Contains all two-qubit - * interactions that preserve excitations, up to single-qubit rotations and - * global phase. 
- */ -template -struct GateFS { - static constexpr GateKind kind = kGateFS; - static constexpr char name[] = "fs"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static constexpr fp_type is2 = static_cast(is2_double); - - static GateQSim Create( - unsigned time, unsigned q0, unsigned q1, fp_type theta, fp_type phi) { - if (phi < 0) { - phi += 2 * 3.141592653589793; - } - - fp_type ct = std::cos(theta); - fp_type st = std::sin(theta); - fp_type cp = std::cos(phi); - fp_type sp = std::sin(phi); - - return CreateGate, GateFS>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, ct, 0, 0, -st, 0, 0, - 0, 0, 0, -st, ct, 0, 0, 0, - 0, 0, 0, 0, 0, 0, cp, -sp}, {theta, phi}); - } - - static schmidt_decomp_type SchmidtDecomp( - fp_type theta, fp_type phi) { - fp_type ct = std::cos(theta); - fp_type st = std::sin(theta); - - fp_type cp2 = std::cos(0.5 * phi); - fp_type sp2 = std::sin(0.5 * phi); - fp_type cp4 = std::cos(0.25 * phi); - fp_type sp4 = std::sin(0.25 * phi); - - fp_type a0 = std::sqrt(std::sqrt(1 + 2 * ct * cp2 + ct * ct)); - fp_type a1 = std::sqrt(std::sqrt(1 - 2 * ct * cp2 + ct * ct)); - - fp_type p0 = 0.5 * std::atan2(-sp2, cp2 + ct); - fp_type p1 = 0.5 * std::atan2(-sp2, cp2 - ct); - - fp_type c0 = is2 * a0 * std::cos(p0); - fp_type s0 = is2 * a0 * std::sin(p0); - - fp_type c1 = is2 * a1 * std::cos(p1); - fp_type s1 = is2 * a1 * std::sin(p1); - - fp_type st2 = 0.5 * std::sqrt(st); - - fp_type a = cp4 * c0 - sp4 * s0; - fp_type b = cp4 * s0 + sp4 * c0; - fp_type c = cp4 * c0 + sp4 * s0; - fp_type d = cp4 * s0 - sp4 * c0; - - fp_type e = cp4 * c1 - sp4 * s1; - fp_type f = cp4 * s1 + sp4 * c1; - fp_type g = -(cp4 * c1 + sp4 * s1); - fp_type h = -(cp4 * s1 - sp4 * c1); - - return schmidt_decomp_type{ - {{a, b, 0, 0, 0, 0, c, d}, {a, b, 0, 0, 0, 0, c, d}}, - {{0, 0, st2, -st2, st2, -st2, 0, 0}, {0, 0, st2, -st2, st2, -st2, 0, 0}}, - {{0, 0, -st2, -st2, st2, st2, 0, 0}, {0, 0, -st2, -st2, st2, st2, 0, 0}}, - {{e, f, 0, 0, 
0, 0, g, h}, {e, f, 0, 0, 0, 0, g, h}}, - }; - } -}; - -/** - * The controlled phase gate. A generalized version of GateCZ. - */ -template -struct GateCP { - static constexpr GateKind kind = kGateCP; - static constexpr char name[] = "cp"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = true; - - static GateQSim Create( - unsigned time, unsigned q0, unsigned q1, fp_type phi) { - fp_type cp = std::cos(phi); - fp_type sp = std::sin(phi); - - return CreateGate, GateCP>( - time, {q0, q1}, {1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 0, cp, -sp}, {phi}); - } - - static schmidt_decomp_type SchmidtDecomp(fp_type phi) { - fp_type cp = std::cos(phi); - fp_type sp = std::sin(phi); - - return schmidt_decomp_type{ - {{1, 0, 0, 0, 0, 0, 0, 0}, {1, 0, 0, 0, 0, 0, 1, 0}}, - {{0, 0, 0, 0, 0, 0, 1, 0}, {1, 0, 0, 0, 0, 0, cp, -sp}}, - }; - } -}; - -/** - * A two-qubit gate defined entirely by its matrix. - */ -template -struct GateMatrix2 { - static constexpr GateKind kind = kGateMatrix2; - static constexpr char name[] = "mat2"; - static constexpr unsigned num_qubits = 2; - static constexpr bool symmetric = false; - - template > - static GateQSim Create( - unsigned time, unsigned q0, unsigned q1, M&& m) { - return CreateGate, GateMatrix2>(time, {q1, q0}, - std::forward(m)); - } - - static schmidt_decomp_type SchmidtDecomp(fp_type phi) { - // Not implemented. 
- return schmidt_decomp_type{}; - } -}; - -template -inline schmidt_decomp_type GetSchmidtDecomp( - GateKind kind, const std::vector& params) { - switch (kind) { - case kGateId2: - return GateId2::SchmidtDecomp(); - case kGateCZ: - return GateCZ::SchmidtDecomp(); - case kGateCNot: - return GateCNot::SchmidtDecomp(); - case kGateSwap: - return GateSwap::SchmidtDecomp(); - case kGateIS: - return GateIS::SchmidtDecomp(); - case kGateFS: - return GateFS::SchmidtDecomp(params[0], params[1]); - case kGateCP: - return GateCP::SchmidtDecomp(params[0]); - default: - // Single qubit gates: empty Schmidt decomposition. - return schmidt_decomp_type{}; - } -} - -} // namespace qsim - -#endif // GATES_QSIM_H_ diff --git a/tpls/qsim/hybrid.h b/tpls/qsim/hybrid.h deleted file mode 100644 index 44fad5b..0000000 --- a/tpls/qsim/hybrid.h +++ /dev/null @@ -1,612 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef HYBRID_H_ -#define HYBRID_H_ - -#include -#include -#include -#include - -#include "gate.h" -#include "gate_appl.h" - -namespace qsim { - -/** - * Hybrid Feynman-Schrodinger simulator. - */ -template class FuserT, typename For> -struct HybridSimulator final { - public: - using Gate = GateT; - using GateKind = typename Gate::GateKind; - using fp_type = typename Gate::fp_type; - - private: - // Note that one can use "struct GateHybrid : public Gate {" in C++17. 
- struct GateHybrid { - using GateKind = HybridSimulator::GateKind; - using fp_type = HybridSimulator::fp_type; - - GateKind kind; - unsigned time; - std::vector qubits; - std::vector controlled_by; - uint64_t cmask; - std::vector params; - Matrix matrix; - bool unfusible; - bool swapped; - - const Gate* parent; - unsigned id; - }; - - struct GateX { - GateHybrid* decomposed0; - GateHybrid* decomposed1; - schmidt_decomp_type schmidt_decomp; - unsigned schmidt_bits; - unsigned swapped; - }; - - public: - using Fuser = FuserT; - using GateFused = typename Fuser::GateFused; - - /** - * Contextual data for hybrid simulation. - */ - struct HybridData { - /** - * List of gates on the "0" side of the cut. - */ - std::vector gates0; - /** - * List of gates on the "1" side of the cut. - */ - std::vector gates1; - /** - * List of gates on the cut. - */ - std::vector gatexs; - /** - * Global qubit index to local qubit index map. - */ - std::vector qubit_map; - /** - * Number of qubits on the "0" side of the cut. - */ - unsigned num_qubits0; - /** - * Number of qubits on the "1" side of the cut. - */ - unsigned num_qubits1; - /** - * Number of gates on the cut. - */ - unsigned num_gatexs; - }; - - /** - * User-specified parameters for gate fusion and hybrid simulation. - */ - struct Parameter : public Fuser::Parameter { - /** - * Fixed bitstring indicating values to assign to Schmidt decomposition - * indices of prefix gates. - */ - uint64_t prefix; - /** - * Number of gates on the cut that are part of the prefix. Indices of these - * gates are assigned the value indicated by `prefix`. - */ - unsigned num_prefix_gatexs; - /** - * Number of gates on the cut that are part of the root. All gates that are - * not part of the prefix or root are part of the suffix. - */ - unsigned num_root_gatexs; - unsigned num_threads; - }; - - template - explicit HybridSimulator(Args&&... args) : for_(args...) 
{} - - /** - * Splits the lattice into two parts, using Schmidt decomposition for gates - * on the cut. - * @param parts Lattice sections to be simulated. - * @param gates List of all gates in the circuit. - * @param hd Output data with split parts. - * @return True if the splitting done successfully; false otherwise. - */ - static bool SplitLattice(const std::vector& parts, - const std::vector& gates, HybridData& hd) { - hd.num_gatexs = 0; - hd.num_qubits0 = 0; - hd.num_qubits1 = 0; - - hd.gates0.reserve(gates.size()); - hd.gates1.reserve(gates.size()); - hd.qubit_map.reserve(parts.size()); - - unsigned count0 = 0; - unsigned count1 = 0; - - // Global qubit index to local qubit index map. - for (std::size_t i = 0; i < parts.size(); ++i) { - parts[i] == 0 ? ++hd.num_qubits0 : ++hd.num_qubits1; - hd.qubit_map.push_back(parts[i] == 0 ? count0++ : count1++); - } - - // Split the lattice. - for (const auto& gate : gates) { - if (gate.kind == gate::kMeasurement) { - IO::errorf("measurement gates are not suported by qsimh.\n"); - return false; - } - - if (gate.controlled_by.size() > 0) { - IO::errorf("controlled gates are not suported by qsimh.\n"); - return false; - } - - switch (gate.qubits.size()) { - case 1: // Single qubit gates. - switch (parts[gate.qubits[0]]) { - case 0: - hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time, - {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix, - false, false, nullptr, 0}); - break; - case 1: - hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time, - {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, gate.matrix, - false, false, nullptr, 0}); - break; - } - break; - case 2: // Two qubit gates. - { - switch ((parts[gate.qubits[1]] << 1) | parts[gate.qubits[0]]) { - case 0: // Both qubits in part 0. 
- hd.gates0.emplace_back(GateHybrid{gate.kind, gate.time, - {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]}, - {}, 0, gate.params, gate.matrix, false, gate.swapped, - nullptr, 0}); - break; - case 1: // Gate on the cut, qubit 0 in part 1, qubit 1 in part 0. - hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, - {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {}, - true, gate.swapped, &gate, hd.num_gatexs}); - hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, - {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {}, - true, gate.swapped, &gate, hd.num_gatexs}); - - ++hd.num_gatexs; - break; - case 2: // Gate on the cut, qubit 0 in part 0, qubit 1 in part 1. - hd.gates0.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, - {hd.qubit_map[gate.qubits[0]]}, {}, 0, gate.params, {}, - true, gate.swapped, &gate, hd.num_gatexs}); - hd.gates1.emplace_back(GateHybrid{GateKind::kDecomp, gate.time, - {hd.qubit_map[gate.qubits[1]]}, {}, 0, gate.params, {}, - true, gate.swapped, &gate, hd.num_gatexs}); - - ++hd.num_gatexs; - break; - case 3: // Both qubits in part 1. - hd.gates1.emplace_back(GateHybrid{gate.kind, gate.time, - {hd.qubit_map[gate.qubits[0]], hd.qubit_map[gate.qubits[1]]}, - {}, 0, gate.params, gate.matrix, false, gate.swapped, - nullptr, 0}); - break; - } - } - break; - default: - IO::errorf("multi-qubit gates are not suported by qsimh.\n"); - return false; - } - } - - auto compare = [](const GateHybrid& l, const GateHybrid& r) -> bool { - return l.time < r.time || (l.time == r.time && - (l.parent < r.parent || (l.parent == r.parent && l.id < r.id))); - }; - - // Sort gates. - std::sort(hd.gates0.begin(), hd.gates0.end(), compare); - std::sort(hd.gates1.begin(), hd.gates1.end(), compare); - - hd.gatexs.reserve(hd.num_gatexs); - - // Get Schmidt matrices. 
- for (auto& gate0 : hd.gates0) { - if (gate0.parent != nullptr) { - auto d = GetSchmidtDecomp(gate0.parent->kind, gate0.parent->params); - if (d.size() == 0) { - IO::errorf("no Schmidt decomposition for gate kind %u.\n", - gate0.parent->kind); - return false; - } - - unsigned schmidt_bits = SchmidtBits(d.size()); - if (schmidt_bits > 2) { - IO::errorf("Schmidt rank is too large for gate kind %u.\n", - gate0.parent->kind); - return false; - } - - unsigned swapped = parts[gate0.parent->qubits[0]]; - if (gate0.parent->swapped) swapped = 1 - swapped; - hd.gatexs.emplace_back(GateX{&gate0, nullptr, std::move(d), - schmidt_bits, swapped}); - } - } - - unsigned count = 0; - for (auto& gate1 : hd.gates1) { - if (gate1.parent != nullptr) { - hd.gatexs[count++].decomposed1 = &gate1; - } - } - - for (auto& gatex : hd.gatexs) { - if (gatex.schmidt_decomp.size() == 1) { - FillSchmidtMatrices(0, gatex); - } - } - - return true; - } - - /** - * Runs the hybrid simulator on a sectioned lattice. - * @param param Options for parallelism and logging. Also specifies the size - * of the 'prefix' and 'root' sections of the lattice. - * @param factory Object to create simulators and state spaces. - * @param hd Container object for gates on the boundary between lattice - * sections. - * @param parts Lattice sections to be simulated. - * @param fgates0 List of gates from one section of the lattice. - * @param fgates1 List of gates from the other section of the lattice. - * @param bitstrings List of output states to simulate, as bitstrings. - * @param results Output vector of amplitudes. After a successful run, this - * will be populated with amplitudes for each state in 'bitstrings'. - * @return True if the simulation completed successfully; false otherwise. 
- */ - template - bool Run(const Parameter& param, const Factory& factory, - HybridData& hd, const std::vector& parts, - const std::vector& fgates0, - const std::vector& fgates1, - const std::vector& bitstrings, Results& results) const { - using Simulator = typename Factory::Simulator; - using StateSpace = typename Simulator::StateSpace; - using State = typename StateSpace::State; - - unsigned num_p_gates = param.num_prefix_gatexs; - unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; - - auto bits = CountSchmidtBits(param, hd.gatexs); - - uint64_t rmax = uint64_t{1} << bits.num_r_bits; - uint64_t smax = uint64_t{1} << bits.num_s_bits; - - auto loc0 = CheckpointLocations(param, fgates0); - auto loc1 = CheckpointLocations(param, fgates1); - - struct Index { - unsigned i0; - unsigned i1; - }; - - std::vector indices; - indices.reserve(bitstrings.size()); - - // Bitstring indices for part 0 and part 1. TODO: optimize. - for (const auto& bitstring : bitstrings) { - Index index{0, 0}; - - for (uint64_t i = 0; i < hd.qubit_map.size(); ++i) { - unsigned m = ((bitstring >> i) & 1) << hd.qubit_map[i]; - parts[i] ? index.i1 |= m : index.i0 |= m; - } - - indices.push_back(index); - } - - StateSpace state_space = factory.CreateStateSpace(); - - State* rstate0; - State* rstate1; - - State state0p = state_space.Null(); - State state1p = state_space.Null(); - State state0r = state_space.Null(); - State state1r = state_space.Null(); - State state0s = state_space.Null(); - State state1s = state_space.Null(); - - // Create states. 
- - if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, true, - state0p, state1p, rstate0, rstate1)) { - return false; - } - - if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, rmax > 1, - state0r, state1r, rstate0, rstate1)) { - return false; - } - - if (!CreateStates(hd.num_qubits0, hd.num_qubits1, state_space, smax > 1, - state0s, state1s, rstate0, rstate1)) { - return false; - } - - state_space.SetStateZero(state0p); - state_space.SetStateZero(state1p); - - Simulator simulator = factory.CreateSimulator(); - - std::vector prev(hd.num_gatexs, unsigned(-1)); - - // param.prefix encodes the prefix path. - unsigned gatex_index = SetSchmidtMatrices( - 0, num_p_gates, param.prefix, prev, hd.gatexs); - - if (gatex_index == 0) { - // Apply gates before the first checkpoint. - ApplyGates(fgates0, 0, loc0[0], simulator, state0p); - ApplyGates(fgates1, 0, loc1[0], simulator, state1p); - } else { - IO::errorf("invalid prefix %lu for prefix gate index %u.\n", - param.prefix, gatex_index - 1); - return false; - } - - // Branch over root gates on the cut. r encodes the root path. - for (uint64_t r = 0; r < rmax; ++r) { - if (rmax > 1) { - state_space.Copy(state0p, state0r); - state_space.Copy(state1p, state1r); - } - - if (SetSchmidtMatrices(num_p_gates, num_pr_gates, - r, prev, hd.gatexs) == 0) { - // Apply gates before the second checkpoint. - ApplyGates(fgates0, loc0[0], loc0[1], simulator, state0r); - ApplyGates(fgates1, loc1[0], loc1[1], simulator, state1r); - } else { - continue; - } - - // Branch over suffix gates on the cut. s encodes the suffix path. - for (uint64_t s = 0; s < smax; ++s) { - if (smax > 1) { - state_space.Copy(rmax > 1 ? state0r : state0p, state0s); - state_space.Copy(rmax > 1 ? state1r : state1p, state1s); - } - - if (SetSchmidtMatrices(num_pr_gates, hd.num_gatexs, - s, prev, hd.gatexs) == 0) { - // Apply the rest of the gates. 
- ApplyGates(fgates0, loc0[1], fgates0.size(), simulator, state0s); - ApplyGates(fgates1, loc1[1], fgates1.size(), simulator, state1s); - } else { - continue; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const StateSpace& state_space, - const State& state0, const State& state1, - const std::vector& indices, Results& results) { - // TODO: make it faster for the CUDA state space. - auto a0 = state_space.GetAmpl(state0, indices[i].i0); - auto a1 = state_space.GetAmpl(state1, indices[i].i1); - results[i] += a0 * a1; - }; - - // Collect results. - for_.Run(results.size(), f, - state_space, *rstate0, *rstate1, indices, results); - } - } - - return true; - } - - private: - /** - * Identifies when to save "checkpoints" of the simulation state. These allow - * runs with different cut-index values to reuse parts of the simulation. - * @param param Options for parallelism and logging. Also specifies the size - * of the 'prefix' and 'root' sections of the lattice. - * @param fgates Set of gates for which to find checkpoint locations. - * @return A pair of numbers specifying how many gates to apply before the - * first and second checkpoints, respectively. - */ - static std::array CheckpointLocations( - const Parameter& param, const std::vector& fgates) { - std::array loc{0, 0}; - - unsigned num_decomposed = 0; - unsigned num_p_gates = param.num_prefix_gatexs; - unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; - - for (std::size_t i = 0; i < fgates.size(); ++i) { - for (auto gate: fgates[i].gates) { - if (gate->parent != nullptr) { - ++num_decomposed; - // There should be only one decomposed gate in fused gate. 
- break; - } - } - - if (num_decomposed <= num_p_gates) { - loc[0] = i + 1; - } - - if (num_decomposed <= num_pr_gates) { - loc[1] = i + 1; - } - } - - return loc; - } - - struct Bits { - unsigned num_p_bits; - unsigned num_r_bits; - unsigned num_s_bits; - }; - - static Bits CountSchmidtBits( - const Parameter& param, const std::vector& gatexs) { - Bits bits{0, 0, 0}; - - unsigned num_p_gates = param.num_prefix_gatexs; - unsigned num_pr_gates = num_p_gates + param.num_root_gatexs; - - for (std::size_t i = 0; i < gatexs.size(); ++i) { - const auto& gatex = gatexs[i]; - if (i < num_p_gates) { - bits.num_p_bits += gatex.schmidt_bits; - } else if (i < num_pr_gates) { - bits.num_r_bits += gatex.schmidt_bits; - } else { - bits.num_s_bits += gatex.schmidt_bits; - } - } - - return bits; - } - - static unsigned SetSchmidtMatrices(std::size_t i0, std::size_t i1, - uint64_t path, - std::vector& prev_k, - std::vector& gatexs) { - unsigned shift_length = 0; - - for (std::size_t i = i0; i < i1; ++i) { - const auto& gatex = gatexs[i]; - - if (gatex.schmidt_bits == 0) { - // Continue if gatex has Schmidt rank 1. - continue; - } - - unsigned k = (path >> shift_length) & ((1 << gatex.schmidt_bits) - 1); - shift_length += gatex.schmidt_bits; - - if (k != prev_k[i]) { - if (k >= gatex.schmidt_decomp.size()) { - // Invalid path. Returns gatex index plus one to report error in case - // of invalid prefix. 
- return i + 1; - } - - FillSchmidtMatrices(k, gatex); - - prev_k[i] = k; - } - } - - return 0; - } - - static void FillSchmidtMatrices(unsigned k, const GateX& gatex) { - unsigned part0 = gatex.swapped; - unsigned part1 = 1 - part0; - { - gatex.decomposed0->matrix.resize(gatex.schmidt_decomp[k][part0].size()); - auto begin = gatex.schmidt_decomp[k][part0].begin(); - auto end = gatex.schmidt_decomp[k][part0].end(); - std::copy(begin, end, gatex.decomposed0->matrix.begin()); - } - { - gatex.decomposed1->matrix.resize(gatex.schmidt_decomp[k][part1].size()); - auto begin = gatex.schmidt_decomp[k][part1].begin(); - auto end = gatex.schmidt_decomp[k][part1].end(); - std::copy(begin, end, gatex.decomposed1->matrix.begin()); - } - } - - template - static void ApplyGates(const std::vector& gates, - std::size_t i0, std::size_t i1, - const Simulator& simulator, - typename Simulator::State& state) { - for (std::size_t i = i0; i < i1; ++i) { - if (gates[i].matrix.size() > 0) { - ApplyFusedGate(simulator, gates[i], state); - } else { - auto gate = gates[i]; - CalculateFusedMatrix(gate); - ApplyFusedGate(simulator, gate, state); - } - } - } - - static unsigned SchmidtBits(unsigned size) { - switch (size) { - case 1: - return 0; - case 2: - return 1; - case 3: - return 2; - case 4: - return 2; - default: - // Not supported. 
- return 42; - } - } - - template - static bool CreateStates(unsigned num_qubits0,unsigned num_qubits1, - const StateSpace& state_space, bool create, - typename StateSpace::State& state0, - typename StateSpace::State& state1, - typename StateSpace::State* (&rstate0), - typename StateSpace::State* (&rstate1)) { - if (create) { - state0 = state_space.Create(num_qubits0); - state1 = state_space.Create(num_qubits1); - - if (state_space.IsNull(state0) || state_space.IsNull(state1)) { - IO::errorf("not enough memory: is the number of qubits too large?\n"); - return false; - } - - rstate0 = &state0; - rstate1 = &state1; - } - - return true; - } - - For for_; -}; - -} // namespace qsim - -#endif // HYBRID_H_ diff --git a/tpls/qsim/io.h b/tpls/qsim/io.h deleted file mode 100644 index 3b26c7c..0000000 --- a/tpls/qsim/io.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef IO_H_ -#define IO_H_ - -#include -#include - -namespace qsim { - -/** - * Controller for output logs. - */ -struct IO { - static void errorf(const char* format, ...) { - va_list args; - va_start(args, format); - vfprintf(stderr, format, args); - va_end(args); - } - - static void messagef(const char* format, ...) 
{ - va_list args; - va_start(args, format); - vprintf(format, args); - va_end(args); - } -}; - -} // namespace qsim - -#endif // IO_H_ diff --git a/tpls/qsim/io_file.h b/tpls/qsim/io_file.h deleted file mode 100644 index 3cfac12..0000000 --- a/tpls/qsim/io_file.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef IO_FILE_H_ -#define IO_FILE_H_ - -#include -#include -#include - -#include "io.h" - -namespace qsim { - -/** - * Controller for output logs with methods for writing to file. 
- */ -struct IOFile : public IO { - static std::ifstream StreamFromFile(const std::string& file) { - std::ifstream fs; - fs.open(file); - if (!fs) { - errorf("cannot open %s for reading.\n", file.c_str()); - } - return fs; - } - - static void CloseStream(std::ifstream& fs) { - fs.close(); - } - - static bool WriteToFile( - const std::string& file, const std::string& content) { - return WriteToFile(file, content.data(), content.size()); - } - - static bool WriteToFile( - const std::string& file, const void* data, uint64_t size) { - auto fs = std::fstream(file, std::ios::out | std::ios::binary); - - if (!fs) { - errorf("cannot open %s for writing.\n", file.c_str()); - return false; - } else { - fs.write((const char*) data, size); - if (!fs) { - errorf("cannot write to %s.\n", file.c_str()); - return false; - } - - fs.close(); - } - - return true; - } -}; - -} // namespace qsim - -#endif // IO_FILE_H_ diff --git a/tpls/qsim/matrix.h b/tpls/qsim/matrix.h deleted file mode 100644 index a3c2640..0000000 --- a/tpls/qsim/matrix.h +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MATRIX_H_ -#define MATRIX_H_ - -#include -#include -#include - -#include "bits.h" - -namespace qsim { - -/** - * Gate matrix type. Matrices are stored as vectors. 
The matrix elements are - * accessed as real(m[i][j]) <- vector[2 * (n * i + j)] and - * imag(m[i][j]) <- vector[2 * (n * i + j) + 1], where n is the number of rows - * or columns (n = 2^q, where q is the number of gate qubits). - */ -template -using Matrix = std::vector; - -/** - * Sets all matrix elements to zero. - * @m Matrix to be cleared. - */ -template -inline void MatrixClear(Matrix& m) { - for (unsigned i = 0; i < m.size(); ++i) { - m[i] = 0; - } -} - -/** - * Sets an identity matrix. - * @n Number of matrix rows (columns). - * @m Output identity matrix. - */ -template -inline void MatrixIdentity(unsigned n, Matrix& m) { - m.resize(2 * n * n); - - MatrixClear(m); - - for (unsigned i = 0; i < n; ++i) { - m[2 * (n * i + i)] = 1; - } -} - -/** - * Multiplies two gate matrices of equal size: m2 = m1 m2. - * @q Number of gate qubits. The number of matrix rows (columns) is 2^q. - * @m1 Matrix m1. - * @m2 Input matrix m2. Output product of matrices m2 = m1 m2. - */ -template -inline void MatrixMultiply( - unsigned q, const Matrix& m1, Matrix& m2) { - Matrix mt = m2; - unsigned n = unsigned{1} << q; - - for (unsigned i = 0; i < n; ++i) { - for (unsigned j = 0; j < n; ++j) { - fp_type2 re = 0; - fp_type2 im = 0; - - for (unsigned k = 0; k < n; ++k) { - fp_type2 r1 = m1[2 * (n * i + k)]; - fp_type2 i1 = m1[2 * (n * i + k) + 1]; - fp_type2 r2 = mt[2 * (n * k + j)]; - fp_type2 i2 = mt[2 * (n * k + j) + 1]; - - re += r1 * r2 - i1 * i2; - im += r1 * i2 + i1 * r2; - } - - m2[2 * (n * i + j)] = re; - m2[2 * (n * i + j) + 1] = im; - } - } -} - -/** - * Multiplies two gate matrices of equal size: m2 = m1^\dagger m2. - * @q Number of gate qubits. The number of matrix rows (columns) is 2^q. - * @m1 Matrix m1. - * @m2 Input matrix m2. Output product of matrices m2 = m1 m2. 
- */ -template -inline void MatrixDaggerMultiply( - unsigned q, const Matrix& m1, Matrix& m2) { - Matrix mt = m2; - unsigned n = unsigned{1} << q; - - for (unsigned i = 0; i < n; ++i) { - for (unsigned j = 0; j < n; ++j) { - fp_type2 re = 0; - fp_type2 im = 0; - - for (unsigned k = 0; k < n; ++k) { - fp_type2 r1 = m1[2 * (n * k + i)]; - fp_type2 i1 = m1[2 * (n * k + i) + 1]; - fp_type2 r2 = mt[2 * (n * k + j)]; - fp_type2 i2 = mt[2 * (n * k + j) + 1]; - - re += r1 * r2 + i1 * i2; - im += r1 * i2 - i1 * r2; - } - - m2[2 * (n * i + j)] = re; - m2[2 * (n * i + j) + 1] = im; - } - } -} - -/** - * Multiplies two gate matrices: m2 = m1 m2. The size of m1 should not exceed - * the size of m2. - * @mask1 Qubit mask that specifies the subset of qubits m1 acts on. - * @q1 Number of gate qubits. The number of matrix rows (columns) is 2^q1. - * @m1 Matrix m1. - * @q2 Number of gate qubits. The number of matrix rows (columns) is 2^q2. - * @m2 Input matrix m2. Output product of matrices m2 = m1 m2. - */ -template -inline void MatrixMultiply(unsigned mask1, - unsigned q1, const Matrix& m1, - unsigned q2, Matrix& m2) { - if (q1 == q2) { - MatrixMultiply(q1, m1, m2); - } else { - Matrix mt = m2; - unsigned n1 = unsigned{1} << q1; - unsigned n2 = unsigned{1} << q2; - - for (unsigned i = 0; i < n2; ++i) { - unsigned si = bits::CompressBits(i, q2, mask1); - - for (unsigned j = 0; j < n2; ++j) { - fp_type2 re = 0; - fp_type2 im = 0; - - for (unsigned k = 0; k < n1; ++k) { - unsigned ek = bits::ExpandBits(k, q2, mask1) + (i & ~mask1); - - fp_type2 r1 = m1[2 * (n1 * si + k)]; - fp_type2 i1 = m1[2 * (n1 * si + k) + 1]; - fp_type2 r2 = mt[2 * (n2 * ek + j)]; - fp_type2 i2 = mt[2 * (n2 * ek + j) + 1]; - - re += r1 * r2 - i1 * i2; - im += r1 * i2 + i1 * r2; - } - - m2[2 * (n2 * i + j)] = re; - m2[2 * (n2 * i + j) + 1] = im; - } - } - } -} - -/** - * Multiply a matrix by a real scalar value. - * @c Scalar value. - * @m Input matrix to be multiplied. Output matrix. 
- */ -template -inline void MatrixScalarMultiply(fp_type1 c, Matrix& m) { - for (unsigned i = 0; i < m.size(); ++i) { - m[i] *= c; - } -} - -/** - * Multiply a matrix by a complex scalar value. - * @re Real part of scalar value. - * @im Imaginary part of scalar value. - * @m Input matrix to be multiplied. Output matrix. - */ -template -inline void MatrixScalarMultiply( - fp_type1 re, fp_type1 im, Matrix& m) { - for (unsigned i = 0; i < m.size() / 2; ++i) { - fp_type2 re0 = m[2 * i + 0]; - fp_type2 im0 = m[2 * i + 1]; - m[2 * i + 0] = re * re0 - im * im0; - m[2 * i + 1] = re * im0 + im * re0; - } -} - -/** - * Daggers a matrix. - * @n Number of matrix rows (columns). - * @m Input matrix. Output matrix. - */ -template -inline void MatrixDagger(unsigned n, Matrix& m) { - for (unsigned i = 0; i < n; ++i) { - m[2 * (n * i + i) + 1] = -m[2 * (n * i + i) + 1]; - - for (unsigned j = i + 1; j < n; ++j) { - std::swap(m[2 * (n * i + j)], m[2 * (n * j + i)]); - fp_type t = m[2 * (n * i + j) + 1]; - m[2 * (n * i + j) + 1] = -m[2 * (n * j + i) + 1]; - m[2 * (n * j + i) + 1] = -t; - } - } -} - -/** - * Gets a permutation to rearrange qubits from "normal" order to "gate" - * order. Qubits are ordered in increasing order for "normal" order. - * Qubits are ordered arbitrarily for "gate" order. Returns an empty vector - * if the qubits are in "normal" order. - * @qubits Qubit indices in "gate" order. - * @return Permutation as a vector. 
- */ -inline std::vector NormalToGateOrderPermutation( - const std::vector& qubits) { - std::vector perm; - - bool normal_order = true; - - for (std::size_t i = 1; i < qubits.size(); ++i) { - if (qubits[i] < qubits[i - 1]) { - normal_order = false; - break; - } - } - - if (!normal_order) { - struct QI { - unsigned q; - unsigned index; - }; - - std::vector qis; - qis.reserve(qubits.size()); - - for (std::size_t i = 0; i < qubits.size(); ++i) { - qis.push_back({qubits[i], unsigned(i)}); - } - - std::sort(qis.begin(), qis.end(), [](const QI& l, const QI& r) { - return l.q < r.q; - }); - - perm.reserve(qubits.size()); - - for (std::size_t i = 0; i < qubits.size(); ++i) { - perm.push_back(qis[i].index); - } - } - - return perm; -} - -/** - * Shuffles the gate matrix elements to get the matrix that acts on qubits - * that are in "normal" order (in increasing orger). - * @perm Permutation to rearrange qubits from "normal" order to "gate" order. - * @q Number of gate qubits. The number of matrix rows (columns) is 2^q. - * @m Input matrix. Output shuffled matrix. - */ -template -inline void MatrixShuffle(const std::vector& perm, - unsigned q, Matrix& m) { - Matrix mt = m; - unsigned n = unsigned{1} << q; - - for (unsigned i = 0; i < n; ++i) { - unsigned pi = bits::PermuteBits(i, q, perm); - for (unsigned j = 0; j < n; ++j) { - unsigned pj = bits::PermuteBits(j, q, perm); - - m[2 * (n * i + j)] = mt[2 * (n * pi + pj)]; - m[2 * (n * i + j) + 1] = mt[2 * (n * pi + pj) + 1]; - } - } -} - -} // namespace qsim - -#endif // MATRIX_H_ diff --git a/tpls/qsim/mps_simulator.h b/tpls/qsim/mps_simulator.h deleted file mode 100644 index 8fbcbae..0000000 --- a/tpls/qsim/mps_simulator.h +++ /dev/null @@ -1,246 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MPS_SIMULATOR_H_ -#define MPS_SIMULATOR_H_ - -// For templates will take care of parallelization. -#define EIGEN_DONT_PARALLELIZE 1 - -#include -#include -#include -#include -#include - -#include "../eigen/Eigen/Dense" -#include "../eigen/Eigen/SVD" -#include "mps_statespace.h" - -namespace qsim { - -namespace mps { - -/** - * Truncated Matrix Product State (MPS) circuit simulator w/ vectorization. - */ -template -class MPSSimulator final { - public: - using MPSStateSpace_ = MPSStateSpace; - using State = typename MPSStateSpace_::MPS; - using fp_type = typename MPSStateSpace_::fp_type; - - using Complex = std::complex; - using Matrix = - Eigen::Matrix; - using ConstMatrixMap = Eigen::Map; - using MatrixMap = Eigen::Map; - - using OneQubitMatrix = Eigen::Matrix; - using ConstOneQubitMap = Eigen::Map; - - // Note: ForArgs are currently unused. - template - explicit MPSSimulator(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using non-vectorized instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, const fp_type* matrix, - State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 1: - ApplyGate1(qs, matrix, state); - break; - case 2: - ApplyGate2(qs, matrix, state); - break; - // case 3: - // ApplyGate3(qs, matrix, state); - // break; - // case 4: - // ApplyGate4(qs, matrix, state); - // break; - // case 5: - // ApplyGate5(qs, matrix, state); - // break; - // case 6: - // ApplyGate6(qs, matrix, state); - // break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using eigen3 operations w/ instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, - const fp_type* matrix, State& state) const { - // TODO. - } - - /** - * Computes the expectation value of an operator using eigen3 operations - * w/ vectorized instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - // TODO. 
- return std::complex(-10., -10.); - } - - private: - void ApplyGate1(const std::vector& qs, const fp_type* matrix, - State& state) const { - if (qs[0] == state.num_qubits() - 1) { - Apply1Right(qs, matrix, state); - } else { - Apply1LeftOrInterior(qs, matrix, state); - } - } - - void Apply1LeftOrInterior(const std::vector& qs, - const fp_type* matrix, State& state) const { - fp_type* raw_state = state.get(); - const auto bond_dim = state.bond_dim(); - const auto l_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); - const auto r_offset = MPSStateSpace_::GetBlockOffset(state, qs[0] + 1); - const auto end = MPSStateSpace_::Size(state); - ConstOneQubitMap gate_matrix((Complex*) matrix); - MatrixMap scratch_block((Complex*)(raw_state + end), 2, bond_dim); - - for (unsigned block_sep = l_offset; block_sep < r_offset; - block_sep += 4 * bond_dim) { - fp_type* cur_block = raw_state + block_sep; - ConstMatrixMap mps_block((Complex*) cur_block, 2, bond_dim); - scratch_block.noalias() = gate_matrix * mps_block; - memcpy(cur_block, raw_state + end, sizeof(fp_type) * bond_dim * 4); - } - } - - void Apply1Right(const std::vector& qs, const fp_type* matrix, - State& state) const { - fp_type* raw_state = state.get(); - const auto bond_dim = state.bond_dim(); - const auto offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); - const auto end = MPSStateSpace_::Size(state); - ConstOneQubitMap gate_matrix((Complex*) matrix); - ConstMatrixMap mps_block((Complex*)(raw_state + offset), bond_dim, 2); - MatrixMap scratch_block((Complex*)(raw_state + end), bond_dim, 2); - scratch_block.noalias() = mps_block * gate_matrix.transpose(); - memcpy(raw_state + offset, raw_state + end, sizeof(fp_type) * bond_dim * 4); - } - - void ApplyGate2(const std::vector& qs, const fp_type* matrix, - State& state) const { - // TODO: micro-benchmark this function and improve performance. 
- const auto bond_dim = state.bond_dim(); - const auto num_qubits = state.num_qubits(); - fp_type* raw_state = state.get(); - - const auto i_dim = (qs[0] == 0) ? 1 : bond_dim; - const auto j_dim = 2; - const auto k_dim = bond_dim; - const auto l_dim = 2; - const auto m_dim = (qs[1] == num_qubits - 1) ? 1 : bond_dim; - - const auto b_0_offset = MPSStateSpace_::GetBlockOffset(state, qs[0]); - const auto b_1_offset = MPSStateSpace_::GetBlockOffset(state, qs[1]); - const auto end = MPSStateSpace_::Size(state); - - MatrixMap block_0((Complex*)(raw_state + b_0_offset), i_dim * j_dim, k_dim); - MatrixMap block_1((Complex*)(raw_state + b_1_offset), k_dim, l_dim * m_dim); - - // Merge both blocks into scratch space. - MatrixMap scratch_c((Complex*)(raw_state + end), i_dim * j_dim, l_dim * m_dim); - scratch_c.noalias() = block_0 * block_1; - - // Transpose inner dims in-place. - MatrixMap scratch_c_t((Complex*)(raw_state + end), i_dim * j_dim * l_dim, m_dim); - for (unsigned i = 0; i < i_dim * j_dim * l_dim; i += 4) { - scratch_c_t.row(i + 1).swap(scratch_c_t.row(i + 2)); - } - - // Transpose gate matrix and place in 3rd (last) scratch block. - const auto scratch3_offset = end + 8 * bond_dim * bond_dim; - ConstMatrixMap gate_matrix((Complex*) matrix, 4, 4); - MatrixMap gate_matrix_transpose((Complex*)(raw_state + scratch3_offset), 4, 4); - gate_matrix_transpose = gate_matrix.transpose(); - gate_matrix_transpose.col(1).swap(gate_matrix_transpose.col(2)); - - // Contract gate and merged block tensors, placing result in B0B1. - for (unsigned i = 0; i < i_dim; ++i) { - fp_type* src_block = raw_state + end + i * 8 * m_dim; - fp_type* dest_block = raw_state + b_0_offset + i * 8 * m_dim; - MatrixMap block_b0b1((Complex*) dest_block, 4, m_dim); - ConstMatrixMap scratch_c_i((Complex*) src_block, 4, m_dim); - // [i, np, m] = [np, lj] * [i, lj, m] - block_b0b1.noalias() = gate_matrix_transpose * scratch_c_i; - } - - // SVD B0B1. 
- MatrixMap full_b0b1((Complex*)(raw_state + b_0_offset), 2 * i_dim, 2 * m_dim); - Eigen::BDCSVD svd(full_b0b1, Eigen::ComputeThinU | Eigen::ComputeThinV); - const auto p = std::min(2 * i_dim, 2 * m_dim); - - // Place U in scratch to truncate and then B0. - MatrixMap svd_u((Complex*)(raw_state + end), 2 * i_dim, p); - svd_u.noalias() = svd.matrixU(); - block_0.fill(Complex(0, 0)); - const auto keep_cols = (svd_u.cols() > bond_dim) ? bond_dim : svd_u.cols(); - block_0.block(0, 0, svd_u.rows(), keep_cols).noalias() = - svd_u(Eigen::indexing::all, Eigen::seq(0, keep_cols - 1)); - - // Place row product of S V into scratch to truncate and then B1. - MatrixMap svd_v((Complex*)(raw_state + end), p, 2 * m_dim); - MatrixMap s_vector((Complex*)(raw_state + end + 8 * bond_dim * bond_dim), p, 1); - svd_v.noalias() = svd.matrixV().adjoint(); - s_vector.noalias() = svd.singularValues(); - block_1.fill(Complex(0, 0)); - const auto keep_rows = (svd_v.rows() > bond_dim) ? bond_dim : svd_v.rows(); - const auto row_seq = Eigen::seq(0, keep_rows - 1); - for (unsigned i = 0; i < keep_rows; ++i) { - svd_v.row(i) *= s_vector(i); - } - block_1.block(0, 0, keep_rows, svd_v.cols()).noalias() = - svd_v(row_seq, Eigen::indexing::all); - } - - For for_; -}; - -} // namespace mps -} // namespace qsim - -#endif // MPS_SIMULATOR_H_ diff --git a/tpls/qsim/mps_statespace.h b/tpls/qsim/mps_statespace.h deleted file mode 100644 index 9b3acf3..0000000 --- a/tpls/qsim/mps_statespace.h +++ /dev/null @@ -1,597 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef MPS_STATESPACE_H_ -#define MPS_STATESPACE_H_ - -// For templates will take care of parallelization. -#define EIGEN_DONT_PARALLELIZE 1 - -#ifdef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include - -#include "../eigen/Eigen/Dense" -#include "../eigen/unsupported/Eigen/CXX11/Tensor" - -namespace qsim { - -namespace mps { - -namespace detail { - -inline void do_not_free(void*) {} - -inline void free(void* ptr) { -#ifdef _WIN32 - _aligned_free(ptr); -#else - ::free(ptr); -#endif -} - -} // namespace detail - -/** - * Class containing context and routines for fixed bond dimension - * truncated Matrix Product State (MPS) simulation. - */ -template -class MPSStateSpace { - private: - public: - using fp_type = FP; - using Pointer = std::unique_ptr; - - using Complex = std::complex; - using Matrix = - Eigen::Matrix; - using ConstMatrixMap = Eigen::Map; - using MatrixMap = Eigen::Map; - - // Store MPS tensors with the following shape: - // [2, bond_dim], [bond_dim, 2, bond_dim], ... , [bond_dim, 2]. 
- class MPS { - public: - MPS() = delete; - - MPS(Pointer&& ptr, unsigned num_qubits, unsigned bond_dim) - : ptr_(std::move(ptr)), num_qubits_(num_qubits), bond_dim_(bond_dim) {} - - fp_type* get() { return ptr_.get(); } - - const fp_type* get() const { return ptr_.get(); } - - fp_type* release() { - num_qubits_ = 0; - return ptr_.release(); - } - - unsigned num_qubits() const { return num_qubits_; } - - unsigned bond_dim() const { return bond_dim_; } - - private: - Pointer ptr_; - unsigned num_qubits_; - unsigned bond_dim_; - }; - - // Note: ForArgs are currently unused. - template - MPSStateSpace(ForArgs&&... args) : for_(args...) {} - - // Requires num_qubits >= 2 and bond_dim >= 2. - static MPS Create(unsigned num_qubits, unsigned bond_dim) { - auto end_sizes = 2 * 4 * bond_dim; - auto internal_sizes = 4 * bond_dim * bond_dim * (num_qubits + 1); - // Use three extra "internal style" blocks past the end of the - // working allocation for scratch space. Needed for gate - // application. - auto size = sizeof(fp_type) * (end_sizes + internal_sizes); - -#ifdef _WIN32 - Pointer ptr{(fp_type*)_aligned_malloc(size, 64), &detail::free}; - bool is_null = ptr.get() != nullptr; - return MPS{std::move(ptr), is_null ? num_qubits : 0, - is_null ? bond_dim : 0}; -#else - void* p = nullptr; - if (posix_memalign(&p, 64, size) == 0) { - return MPS{Pointer{(fp_type*)p, &detail::free}, num_qubits, bond_dim}; - } else { - return MPS{Pointer{nullptr, &detail::free}, 0, 0}; - } -#endif - } - - static unsigned Size(const MPS& state) { - auto end_sizes = 2 * 4 * state.bond_dim(); - auto internal_sizes = 4 * state.bond_dim() * state.bond_dim(); - return end_sizes + internal_sizes * (state.num_qubits() - 2); - } - - static unsigned RawSize(const MPS& state) { - return sizeof(fp_type) * Size(state); - } - - // Get the pointer offset to the beginning of an MPS block. 
- static unsigned GetBlockOffset(const MPS& state, unsigned i) { - if (i == 0) { - return 0; - } - return 4 * state.bond_dim() * (1 + state.bond_dim() * (i - 1)); - } - - // Copies the state contents of one MPS to another. - // Ignores scratch data. - static bool Copy(const MPS& src, MPS& dest) { - if ((src.num_qubits() != dest.num_qubits()) || - src.bond_dim() != dest.bond_dim()) { - return false; - } - auto size = RawSize(src); - memcpy(dest.get(), src.get(), size); - return true; - } - - // Set the MPS to the |0> state. - static void SetStateZero(MPS& state) { - auto size = Size(state); - memset(state.get(), 0, sizeof(fp_type) * size); - auto block_size = 4 * state.bond_dim() * state.bond_dim(); - state.get()[0] = 1.0; - for (unsigned i = 4 * state.bond_dim(); i < size; i += block_size) { - state.get()[i] = 1.0; - } - } - - // Computes Re{} for two equal sized MPS. - // Requires: state1.bond_dim() == state2.bond_dim() && - // state1.num_qubits() == state2.num_qubits() - static fp_type RealInnerProduct(MPS& state1, MPS& state2) { - return InnerProduct(state1, state2).real(); - } - - // Computes for two equal sized MPS. - // Requires: state1.bond_dim() == state2.bond_dim() && - // state1.num_qubits() == state2.num_qubits() - static std::complex InnerProduct(MPS& state1, MPS& state2) { - const auto num_qubits = state1.num_qubits(); - const auto bond_dim = state1.bond_dim(); - const auto end = Size(state1); - auto offset = 0; - fp_type* state1_raw = state1.get(); - fp_type* state2_raw = state2.get(); - - // Contract leftmost blocks together, store result in state1 scratch. 
- ConstMatrixMap top((Complex*)state2_raw, 2, bond_dim); - ConstMatrixMap bot((Complex*)state1_raw, 2, bond_dim); - MatrixMap partial_contract((Complex*)(state1_raw + end), bond_dim, - bond_dim); - MatrixMap partial_contract2( - (Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), bond_dim, - 2 * bond_dim); - partial_contract.noalias() = top.adjoint() * bot; - - // Contract all internal blocks together. - for (unsigned i = 1; i < num_qubits - 1; ++i) { - offset = GetBlockOffset(state1, i); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), - bond_dim, 2 * bond_dim); - - // Merge bot into left boundary merged tensor. - new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, - 2 * bond_dim); - partial_contract2.noalias() = partial_contract * bot; - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state1_raw + end + 2 * bond_dim * bond_dim), - 2 * bond_dim, bond_dim); - - // Merge top into partial_contract2. - new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, - bond_dim); - partial_contract.noalias() = top.adjoint() * partial_contract2; - } - - // Contract rightmost bottom block. - offset = GetBlockOffset(state1, num_qubits - 1); - new (&bot) ConstMatrixMap((Complex*)(state1_raw + offset), bond_dim, 2); - new (&partial_contract2) MatrixMap( - (Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), bond_dim, 2); - partial_contract2.noalias() = partial_contract * bot; - - // Contract rightmost top block. - new (&top) ConstMatrixMap((Complex*)(state2_raw + offset), 2 * bond_dim, 1); - new (&partial_contract) MatrixMap((Complex*)(state1_raw + end), 1, 1); - new (&partial_contract2) - MatrixMap((Complex*)(state1_raw + end + 4 * bond_dim * bond_dim), - 2 * bond_dim, 1); - partial_contract.noalias() = top.adjoint() * partial_contract2; - - return partial_contract(0, 0); - } - - // Compute the 2x2 1-RDM of state on index. Result written to rdm. 
- // Requires: scratch and rdm to be allocated. - static void ReduceDensityMatrix(MPS& state, MPS& scratch, int index, - fp_type* rdm) { - const auto num_qubits = state.num_qubits(); - const auto bond_dim = state.bond_dim(); - const auto end = Size(state); - const bool last_index = (index == num_qubits - 1); - const auto right_dim = (last_index ? 1 : bond_dim); - auto offset = 0; - fp_type* state_raw = state.get(); - fp_type* scratch_raw = scratch.get(); - fp_type* state_raw_workspace = state_raw + end + 2 * bond_dim * bond_dim; - fp_type* scratch_raw_workspace = - scratch_raw + end + 2 * bond_dim * bond_dim; - - Copy(state, scratch); - - // Contract leftmost blocks together, store result in state scratch. - ConstMatrixMap top((Complex*)scratch_raw, 2, bond_dim); - ConstMatrixMap bot((Complex*)state_raw, 2, bond_dim); - MatrixMap partial_contract((Complex*)(state_raw + end), bond_dim, bond_dim); - MatrixMap partial_contract2((Complex*)(state_raw_workspace), bond_dim, - 2 * bond_dim); - - partial_contract.setZero(); - partial_contract(0, 0) = 1; - if (index > 0) { - partial_contract.noalias() = top.adjoint() * bot; - } - - // Contract all internal blocks together. - for (unsigned i = 1; i < index; ++i) { - offset = GetBlockOffset(state, i); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state_raw_workspace), bond_dim, 2 * bond_dim); - - // Merge bot into left boundary merged tensor. - new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, - 2 * bond_dim); - partial_contract2.noalias() = partial_contract * bot; - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state_raw_workspace), 2 * bond_dim, bond_dim); - - // Merge top into partial_contract2. - new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, - bond_dim); - partial_contract.noalias() = top.adjoint() * partial_contract2; - } - - // The [bond_dim, bond_dim] block in state_raw now contains the contraction - // up to, but not including index. 
- // Contract rightmost blocks. - offset = GetBlockOffset(state, num_qubits - 1); - new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, 2); - new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2); - new (&partial_contract) - MatrixMap((Complex*)(scratch_raw + end), bond_dim, bond_dim); - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim); - - partial_contract.setZero(); - partial_contract(0, 0) = 1; - if (index < num_qubits - 1) { - partial_contract.noalias() = top * bot.adjoint(); - } - - for (unsigned i = num_qubits - 2; i > index; --i) { - offset = GetBlockOffset(state, i); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw_workspace), 2 * bond_dim, bond_dim); - - // Merge bot into left boundary merged tensor. - new (&bot) ConstMatrixMap((Complex*)(state_raw + offset), 2 * bond_dim, - bond_dim); - partial_contract2.noalias() = bot * partial_contract.adjoint(); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * bond_dim); - - // Merge top into partial_contract2. - new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, - 2 * bond_dim); - // [bd, bd] = [bd, 2bd] @ [bd, 2bd] - partial_contract.noalias() = top * partial_contract2.adjoint(); - } - - // The [bond_dim, bond_dim] block in scratch_raw now contains the - // contraction down from the end, but not including the index. Begin final - // contraction steps. - - // Get leftmost [bd, bd] contraction and contract with top. 
- - offset = GetBlockOffset(state, index); - new (&partial_contract) - MatrixMap((Complex*)(state_raw + end), bond_dim, bond_dim); - new (&top) - ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2 * right_dim); - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw_workspace), bond_dim, 2 * right_dim); - partial_contract2.noalias() = partial_contract * top.conjugate(); - // copy the bottom contraction scratch_raw to state_raw to save space. - memcpy(state_raw + end, scratch_raw + end, - bond_dim * bond_dim * 2 * sizeof(fp_type)); - - // Contract top again for correct shape. - fp_type* contract3_target = (last_index ? rdm : scratch_raw); - MatrixMap partial_contract3((Complex*)contract3_target, 2 * right_dim, - 2 * right_dim); - partial_contract3.noalias() = top.transpose() * partial_contract2; - - // If we are contracting the last index, all the needed transforms are done. - if (last_index) { - return; - } - - // Conduct final tensor contraction operations. Cannot be easily compiled to - // matmul. - const Eigen::TensorMap> - t_4d((Complex*)scratch_raw, 2, bond_dim, 2, bond_dim); - const Eigen::TensorMap> - t_2d((Complex*)(state_raw + end), bond_dim, bond_dim); - - const Eigen::array, 2> product_dims = { - Eigen::IndexPair(1, 0), - Eigen::IndexPair(3, 1), - }; - Eigen::TensorMap> out( - (Complex*)rdm, 2, 2); - out = t_4d.contract(t_2d, product_dims); - } - - // Draw a single bitstring sample from state using scratch and scratch2 - // as working space. - static void SampleOnce(MPS& state, MPS& scratch, MPS& scratch2, - std::mt19937* random_gen, std::vector* sample) { - // TODO: carefully profile with perf and optimize temp storage - // locations for cache friendliness. 
- const auto bond_dim = state.bond_dim(); - const auto num_qubits = state.num_qubits(); - const auto end = Size(state); - const auto left_frontier_offset = GetBlockOffset(state, num_qubits + 1); - std::default_random_engine generator; - fp_type* state_raw = state.get(); - fp_type* scratch_raw = scratch.get(); - fp_type* scratch2_raw = scratch2.get(); - fp_type rdm[8]; - - sample->reserve(num_qubits); - Copy(state, scratch); - Copy(state, scratch2); - - // Store prefix contractions in scratch2. - auto offset = GetBlockOffset(state, num_qubits - 1); - ConstMatrixMap top((Complex*)(state_raw + offset), bond_dim, 2); - ConstMatrixMap bot((Complex*)(scratch_raw + offset), bond_dim, 2); - MatrixMap partial_contract((Complex*)(scratch2_raw + offset), bond_dim, - bond_dim); - MatrixMap partial_contract2((Complex*)(scratch_raw + end), bond_dim, - 2 * bond_dim); - partial_contract.noalias() = top * bot.adjoint(); - - for (unsigned i = num_qubits - 2; i > 0; --i) { - offset = GetBlockOffset(state, i); - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw + end), 2 * bond_dim, bond_dim); - - // Merge bot into left boundary merged tensor. - new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, - bond_dim); - partial_contract2.noalias() = bot * partial_contract.adjoint(); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw + end), bond_dim, 2 * bond_dim); - - // Merge top into partial_contract2. - new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, - 2 * bond_dim); - - // merge into partial_contract -> scracth2_raw. - new (&partial_contract) - MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); - partial_contract.noalias() = top * partial_contract2.adjoint(); - } - - // Compute RDM-0 and draw first sample. 
- offset = GetBlockOffset(state, 1); - new (&top) ConstMatrixMap((Complex*)state_raw, 2, bond_dim); - new (&bot) ConstMatrixMap((Complex*)scratch_raw, 2, bond_dim); - new (&partial_contract) - MatrixMap((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); - new (&partial_contract2) - MatrixMap((Complex*)(scratch_raw + end), 2, bond_dim); - - partial_contract2.noalias() = bot * partial_contract.adjoint(); - - new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2); - partial_contract.noalias() = top * partial_contract2.adjoint(); - auto p0 = rdm[0] / (rdm[0] + rdm[6]); - std::bernoulli_distribution distribution(1 - p0); - auto bit_val = distribution(*random_gen); - sample->push_back(bit_val); - - // collapse state. - new (&partial_contract) MatrixMap((Complex*)scratch_raw, 2, bond_dim); - partial_contract.row(!bit_val).setZero(); - - // Prepare left contraction frontier. - new (&partial_contract2) MatrixMap( - (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); - partial_contract2.noalias() = - partial_contract.transpose() * partial_contract.conjugate(); - - // Compute RDM-i and draw internal tensor samples. - for (unsigned i = 1; i < num_qubits - 1; i++) { - // Get leftmost [bd, bd] contraction and contract with top. - offset = GetBlockOffset(state, i); - new (&partial_contract) MatrixMap( - (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); - new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, - 2 * bond_dim); - new (&partial_contract2) - MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim); - partial_contract2.noalias() = partial_contract * top.conjugate(); - - // Contract top again for correct shape. - MatrixMap partial_contract3((Complex*)(scratch_raw + end), 2 * bond_dim, - 2 * bond_dim); - partial_contract3.noalias() = top.transpose() * partial_contract2; - - // Conduct final tensor contraction operations. Cannot be easily compiled - // to matmul. 
Perf reports shows only ~6% of runtime spent here on large - // systems. - offset = GetBlockOffset(state, i + 1); - const Eigen::TensorMap> - t_4d((Complex*)(scratch_raw + end), 2, bond_dim, 2, bond_dim); - const Eigen::TensorMap> - t_2d((Complex*)(scratch2_raw + offset), bond_dim, bond_dim); - - const Eigen::array, 2> product_dims = { - Eigen::IndexPair(1, 0), - Eigen::IndexPair(3, 1), - }; - Eigen::TensorMap> out( - (Complex*)rdm, 2, 2); - out = t_4d.contract(t_2d, product_dims); - - // Sample bit and collapse state. - p0 = rdm[0] / (rdm[0] + rdm[6]); - distribution = std::bernoulli_distribution(1 - p0); - bit_val = distribution(*random_gen); - - sample->push_back(bit_val); - offset = GetBlockOffset(state, i); - new (&partial_contract) - MatrixMap((Complex*)(scratch_raw + offset), bond_dim * 2, bond_dim); - for (unsigned j = !bit_val; j < 2 * bond_dim; j += 2) { - partial_contract.row(j).setZero(); - } - - // Update left frontier. - new (&partial_contract) MatrixMap( - (Complex*)(scratch2_raw + left_frontier_offset), bond_dim, bond_dim); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state_raw + end), bond_dim, 2 * bond_dim); - - // Merge bot into left boundary merged tensor. - new (&bot) ConstMatrixMap((Complex*)(scratch_raw + offset), bond_dim, - 2 * bond_dim); - partial_contract2.noalias() = partial_contract * bot.conjugate(); - - // reshape: - new (&partial_contract2) - MatrixMap((Complex*)(state_raw + end), 2 * bond_dim, bond_dim); - - // Merge top into partial_contract2. - new (&top) ConstMatrixMap((Complex*)(scratch_raw + offset), 2 * bond_dim, - bond_dim); - partial_contract.noalias() = top.transpose() * partial_contract2; - } - - // Compute RDM-(n-1) and sample. 
- offset = GetBlockOffset(state, num_qubits - 1); - new (&partial_contract2) - MatrixMap((Complex*)(state_raw + end), bond_dim, 2); - - new (&top) ConstMatrixMap((Complex*)(state_raw + offset), bond_dim, 2); - partial_contract2.noalias() = partial_contract * top.conjugate(); - new (&partial_contract) MatrixMap((Complex*)rdm, 2, 2); - partial_contract.noalias() = top.transpose() * partial_contract2; - - p0 = rdm[0] / (rdm[0] + rdm[6]); - distribution = std::bernoulli_distribution(1 - p0); - bit_val = distribution(*random_gen); - sample->push_back(bit_val); - } - - // Draw num_samples bitstring samples from state and store the result - // bit vectors in results. Uses scratch and scratch2 as workspace. - static void Sample(MPS& state, MPS& scratch, MPS& scratch2, - unsigned num_samples, unsigned seed, - std::vector>* results) { - std::mt19937 rand_source(seed); - results->reserve(num_samples); - for (unsigned i = 0; i < num_samples; i++) { - SampleOnce(state, scratch, scratch2, &rand_source, &(*results)[i]); - } - } - - // Testing only. Convert the MPS to a wavefunction under "normal" ordering. - // Requires: wf be allocated beforehand with bond_dim * 2 ^ num_qubits -1 - // memory. - static void ToWaveFunction(MPS& state, fp_type* wf) { - const auto bond_dim = state.bond_dim(); - const auto num_qubits = state.num_qubits(); - fp_type* raw_state = state.get(); - - ConstMatrixMap accum = ConstMatrixMap((Complex*)(raw_state), 2, bond_dim); - ConstMatrixMap next_block = ConstMatrixMap(nullptr, 0, 0); - MatrixMap result2 = MatrixMap(nullptr, 0, 0); - auto offset = 0; - auto result2_size = 2; - - for (unsigned i = 1; i < num_qubits - 1; i++) { - offset = GetBlockOffset(state, i); - // use of new does not trigger any expensive operations. - new (&next_block) ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, - 2 * bond_dim); - new (&result2) MatrixMap((Complex*)(wf), result2_size, 2 * bond_dim); - - // temp variable used since result2 and accum point to same memory. 
- result2 = accum * next_block; - result2_size *= 2; - new (&accum) ConstMatrixMap((Complex*)(wf), result2_size, bond_dim); - } - offset = GetBlockOffset(state, num_qubits - 1); - new (&next_block) - ConstMatrixMap((Complex*)(raw_state + offset), bond_dim, 2); - new (&result2) MatrixMap((Complex*)(wf), result2_size, 2); - result2 = accum * next_block; - } - - protected: - For for_; -}; - -} // namespace mps -} // namespace qsim - -#endif // MPS_STATESPACE_H_ diff --git a/tpls/qsim/parfor.h b/tpls/qsim/parfor.h deleted file mode 100644 index 8a3a4d6..0000000 --- a/tpls/qsim/parfor.h +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef PARFOR_H_ -#define PARFOR_H_ - -#include - -#include -#include -#include - -namespace qsim { - -/** - * Helper struct for executing for-loops in parallel across multiple threads. - */ -template -struct ParallelForT { - explicit ParallelForT(unsigned num_threads) : num_threads(num_threads) {} - - // GetIndex0 and GetIndex1 are useful when we need to know how work was - // divided between threads, for instance, for reusing partial sums obtained - // by RunReduceP. - uint64_t GetIndex0(uint64_t size, unsigned thread_id) const { - return size >= MIN_SIZE ? size * thread_id / num_threads : 0; - } - - uint64_t GetIndex1(uint64_t size, unsigned thread_id) const { - return size >= MIN_SIZE ? 
size * (thread_id + 1) / num_threads : size; - } - - template - void Run(uint64_t size, Function&& func, Args&&... args) const { - if (num_threads > 1 && size >= MIN_SIZE) { - #pragma omp parallel num_threads(num_threads) - { - unsigned n = omp_get_num_threads(); - unsigned m = omp_get_thread_num(); - - uint64_t i0 = GetIndex0(size, m); - uint64_t i1 = GetIndex1(size, m); - - for (uint64_t i = i0; i < i1; ++i) { - func(n, m, i, args...); - } - } - } else { - for (uint64_t i = 0; i < size; ++i) { - func(1, 0, i, args...); - } - } - } - - template - std::vector RunReduceP( - uint64_t size, Function&& func, Op&& op, Args&&... args) const { - std::vector partial_results; - - if (num_threads > 1 && size >= MIN_SIZE) { - partial_results.resize(num_threads, 0); - - #pragma omp parallel num_threads(num_threads) - { - unsigned n = omp_get_num_threads(); - unsigned m = omp_get_thread_num(); - - uint64_t i0 = GetIndex0(size, m); - uint64_t i1 = GetIndex1(size, m); - - typename Op::result_type partial_result = 0; - - for (uint64_t i = i0; i < i1; ++i) { - partial_result = op(partial_result, func(n, m, i, args...)); - } - - partial_results[m] = partial_result; - } - } else if (num_threads > 0) { - typename Op::result_type result = 0; - for (uint64_t i = 0; i < size; ++i) { - result = op(result, func(1, 0, i, args...)); - } - - partial_results.resize(1, result); - } - - return partial_results; - } - - template - typename Op::result_type RunReduce(uint64_t size, Function&& func, - Op&& op, Args&&... 
args) const { - auto partial_results = RunReduceP(size, func, std::move(op), args...); - - typename Op::result_type result = 0; - - for (auto partial_result : partial_results) { - result = op(result, partial_result); - } - - return result; - } - - unsigned num_threads; -}; - -using ParallelFor = ParallelForT<1024>; - -} // namespace qsim - -#endif // PARFOR_H_ diff --git a/tpls/qsim/qtrajectory.h b/tpls/qsim/qtrajectory.h deleted file mode 100644 index 1da6692..0000000 --- a/tpls/qsim/qtrajectory.h +++ /dev/null @@ -1,435 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef QTRAJECTORY_H_ -#define QTRAJECTORY_H_ - -#include -#include -#include -#include -#include - -#include "circuit_noisy.h" -#include "gate.h" -#include "gate_appl.h" - -namespace qsim { - -/** - * Quantum trajectory simulator. - */ -template class FuserT, typename Simulator, - typename RGen = std::mt19937> -class QuantumTrajectorySimulator { - public: - using Fuser = FuserT; - using StateSpace = typename Simulator::StateSpace; - using State = typename Simulator::State; - using MeasurementResult = typename StateSpace::MeasurementResult; - - /** - * User-specified parameters for the simulator. - */ - struct Parameter : public Fuser::Parameter { - /** - * If true, collect statistics of sampled Kraus operator indices. - */ - bool collect_kop_stat = false; - /** - * If true, collect statistics of measured bitstrings. 
- */ - bool collect_mea_stat = false; - /** - * If true, normalize the state vector before performing measurements. - */ - bool normalize_before_mea_gates = true; - /** - * If false, do not apply deferred operators after the main loop for - * the "primary" noise trajectory, that is the trajectory in which - * the primary (the first operators in their respective channels) Kraus - * operators are sampled for each channel and there are no measurements - * in the computational basis. This can be used to speed up simulations - * of circuits with weak noise and without measurements by reusing - * the primary trajectory results. There is an additional condition for - * RunBatch. In this case, the deferred operators after the main loop are - * still applied for the first occurence of the primary trajectory. - * The primary Kraus operators should have the highest sampling - * probabilities to achieve the highest speedup. - * - * It is the client's responsibility to collect the primary trajectory - * results and to reuse them. - */ - bool apply_last_deferred_ops = true; - }; - - /** - * Struct with statistics to populate by RunBatch and RunOnce methods. - */ - struct Stat { - /** - * Indices of sampled Kraus operator indices and/or measured bitstrings. - */ - std::vector samples; - /** - * True if the "primary" noise trajectory is sampled, false otherwise. - */ - bool primary; - }; - - /** - * Runs the given noisy circuit performing repetitions. Each repetition is - * seeded by repetition ID. - * @param param Options for the quantum trajectory simulator. - * @param circuit The noisy circuit to be simulated. - * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions. - * @param state_space StateSpace object required to manipulate state vector. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param measure Function that performs measurements (in the sense of - * computing expectation values, etc). 
This function should have three - * required parameters [repetition ID (uint64_t), final state vector - * (const State&), statistics of sampled Kraus operator indices and/or - * measured bitstrings (const Stat&)] and any number of optional parameters. - * @param args Optional arguments for the 'measure' function. - * @return True if the simulation completed successfully; false otherwise. - */ - template - static bool RunBatch(const Parameter& param, - const NoisyCircuit& circuit, - uint64_t r0, uint64_t r1, const StateSpace& state_space, - const Simulator& simulator, MeasurementFunc&& measure, - Args&&... args) { - return RunBatch(param, circuit.num_qubits, circuit.channels.begin(), - circuit.channels.end(), r0, r1, state_space, simulator, - measure, args...); - } - - /** - * Runs the given noisy circuit performing repetitions. Each repetition is - * seeded by repetition ID. - * @param param Options for the quantum trajectory simulator. - * @param num_qubits The number of qubits acted on by the circuit. - * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit. - * @param r0, r1 The range of repetition IDs [r0, r1) to perform repetitions. - * @param state_space StateSpace object required to manipulate state vector. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param measure Function that performs measurements (in the sense of - * computing expectation values, etc). This function should have three - * required parameters [repetition ID (uint64_t), final state vector - * (const State&), statistics of sampled Kraus operator indices and/or - * measured bitstrings (const Stat&)] and any number of optional parameters. - * @param args Optional arguments for the 'measure' function. - * @return True if the simulation completed successfully; false otherwise. 
- */ - template - static bool RunBatch(const Parameter& param, unsigned num_qubits, - ncircuit_iterator cbeg, - ncircuit_iterator cend, - uint64_t r0, uint64_t r1, const StateSpace& state_space, - const Simulator& simulator, MeasurementFunc&& measure, - Args&&... args) { - std::vector gates; - gates.reserve(4 * std::size_t(cend - cbeg)); - - State state = state_space.Null(); - - Stat stat; - bool had_primary_realization = false; - - for (uint64_t r = r0; r < r1; ++r) { - if (!state_space.IsNull(state)) { - state_space.SetStateZero(state); - } - - bool apply_last_deferred_ops = - param.apply_last_deferred_ops || !had_primary_realization; - - if (!RunIteration(param, apply_last_deferred_ops, num_qubits, cbeg, cend, - r, state_space, simulator, gates, state, stat)) { - return false; - } - - if (stat.primary && !had_primary_realization) { - had_primary_realization = true; - } - - measure(r, state, stat, args...); - } - - return true; - } - - /** - * Runs the given noisy circuit one time. - * @param param Options for the quantum trajectory simulator. - * @param circuit The noisy circuit to be simulated. - * @param r The repetition ID. The random number generator is seeded by 'r'. - * @param state_space StateSpace object required to manipulate state vector. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param state The state of the system, to be updated by this method. - * @param stat Statistics of sampled Kraus operator indices and/or measured - * bitstrings, to be populated by this method. - * @return True if the simulation completed successfully; false otherwise. 
- */ - static bool RunOnce(const Parameter& param, - const NoisyCircuit& circuit, uint64_t r, - const StateSpace& state_space, const Simulator& simulator, - State& state, Stat& stat) { - return RunOnce(param, circuit.num_qubits, circuit.channels.begin(), - circuit.channels.end(), r, state_space, simulator, - state, stat); - } - - /** - * Runs the given noisy circuit one time. - * @param param Options for the quantum trajectory simulator. - * @param num_qubits The number of qubits acted on by the circuit. - * @param cbeg, cend The range of channels [cbeg, cend) to run the circuit. - * @param circuit The noisy circuit to be simulated. - * @param r The repetition ID. The random number generator is seeded by 'r'. - * @param state_space StateSpace object required to manipulate state vector. - * @param simulator Simulator object. Provides specific implementations for - * applying gates. - * @param state The state of the system, to be updated by this method. - * @param stat Statistics of sampled Kraus operator indices and/or measured - * bitstrings, to be populated by this method. - * @return True if the simulation completed successfully; false otherwise. 
- */ - static bool RunOnce(const Parameter& param, unsigned num_qubits, - ncircuit_iterator cbeg, - ncircuit_iterator cend, - uint64_t r, const StateSpace& state_space, - const Simulator& simulator, State& state, Stat& stat) { - std::vector gates; - gates.reserve(4 * std::size_t(cend - cbeg)); - - if (!RunIteration(param, param.apply_last_deferred_ops, num_qubits, cbeg, - cend, r, state_space, simulator, gates, state, stat)) { - return false; - } - - return true; - } - - private: - static bool RunIteration(const Parameter& param, - bool apply_last_deferred_ops, unsigned num_qubits, - ncircuit_iterator cbeg, - ncircuit_iterator cend, - uint64_t rep, const StateSpace& state_space, - const Simulator& simulator, - std::vector& gates, - State& state, Stat& stat) { - if (param.collect_kop_stat || param.collect_mea_stat) { - stat.samples.reserve(std::size_t(cend - cbeg)); - stat.samples.resize(0); - } - - if (state_space.IsNull(state)) { - state = CreateState(num_qubits, state_space); - if (state_space.IsNull(state)) { - return false; - } - - state_space.SetStateZero(state); - } - - gates.resize(0); - - RGen rgen(rep); - std::uniform_real_distribution distr(0.0, 1.0); - - bool unitary = true; - stat.primary = true; - - for (auto it = cbeg; it != cend; ++it) { - const auto& channel = *it; - - if (channel.size() == 0) continue; - - if (channel[0].kind == gate::kMeasurement) { - // Measurement channel. - - if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { - return false; - } - - bool normalize = !unitary && param.normalize_before_mea_gates; - NormalizeState(normalize, state_space, unitary, state); - - auto mresult = ApplyMeasurementGate(state_space, channel[0].ops[0], - rgen, state); - - if (!mresult.valid) { - return false; - } - - CollectStat(param.collect_mea_stat, mresult.bits, stat); - - stat.primary = false; - - continue; - } - - // "Normal" channel. 
- - double r = distr(rgen); - double cp = 0; - - // Perform sampling of Kraus operators using probability bounds. - for (std::size_t i = 0; i < channel.size(); ++i) { - const auto& kop = channel[i]; - - cp += kop.prob; - - if (r < cp) { - DeferOps(kop.ops, gates); - CollectStat(param.collect_kop_stat, i, stat); - - unitary = unitary && kop.unitary; - - break; - } - } - - if (r < cp) continue; - - if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { - return false; - } - - NormalizeState(!unitary, state_space, unitary, state); - - double max_prob = 0; - std::size_t max_prob_index = 0; - - // Perform sampling of Kraus operators using norms of updated states. - for (std::size_t i = 0; i < channel.size(); ++i) { - const auto& kop = channel[i]; - - if (kop.unitary) continue; - - double prob = std::real( - simulator.ExpectationValue(kop.qubits, kop.kd_k.data(), state)); - - if (prob > max_prob) { - max_prob = prob; - max_prob_index = i; - } - - cp += prob - kop.prob; - - if (r < cp || i == channel.size() - 1) { - // Sample ith Kraus operator if r < cp - // Sample the highest probability Kraus operator if r is greater - // than the sum of all probablities due to round-off errors. - uint64_t k = r < cp ? 
i : max_prob_index; - - DeferOps(channel[k].ops, gates); - CollectStat(param.collect_kop_stat, k, stat); - - unitary = false; - - break; - } - } - } - - if (apply_last_deferred_ops || !stat.primary) { - if (!ApplyDeferredOps(param, num_qubits, simulator, gates, state)) { - return false; - } - - NormalizeState(!unitary, state_space, unitary, state); - } - - return true; - } - - static State CreateState(unsigned num_qubits, const StateSpace& state_space) { - auto state = state_space.Create(num_qubits); - if (state_space.IsNull(state)) { - IO::errorf("not enough memory: is the number of qubits too large?\n"); - return state_space.Null(); - } - - return state; - } - - static bool ApplyDeferredOps( - const Parameter& param, unsigned num_qubits, const Simulator& simulator, - std::vector& gates, State& state) { - if (gates.size() > 0) { - auto fgates = Fuser::FuseGates(param, num_qubits, gates); - - gates.resize(0); - - if (fgates.size() == 0) { - return false; - } - - for (const auto& fgate : fgates) { - ApplyFusedGate(simulator, fgate, state); - } - } - - return true; - } - - static MeasurementResult ApplyMeasurementGate( - const StateSpace& state_space, const Gate& gate, - RGen& rgen, State& state) { - auto result = state_space.Measure(gate.qubits, rgen, state); - - if (!result.valid) { - IO::errorf("measurement failed.\n"); - } - - return result; - } - - static void DeferOps( - const std::vector& ops, std::vector& gates) { - for (const auto& op : ops) { - gates.push_back(&op); - } - } - - static void CollectStat(bool collect_stat, uint64_t i, Stat& stat) { - if (collect_stat) { - stat.samples.push_back(i); - } - - if (i != 0) { - stat.primary = false; - } - } - - static void NormalizeState(bool normalize, const StateSpace& state_space, - bool& flag, State& state) { - if (normalize) { - double a = 1.0 / std::sqrt(state_space.Norm(state)); - state_space.Multiply(a, state); - flag = true; - } - } -}; - -} // namespace qsim - -#endif // QTRAJECTORY_H_ diff --git 
a/tpls/qsim/run_qsim.h b/tpls/qsim/run_qsim.h deleted file mode 100644 index 3752915..0000000 --- a/tpls/qsim/run_qsim.h +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef RUN_QSIM_H_ -#define RUN_QSIM_H_ - -#include -#include -#include - -#include "gate.h" -#include "gate_appl.h" -#include "util.h" - -namespace qsim { - -/** - * Helper struct for running qsim. - */ -template -struct QSimRunner final { - public: - using Simulator = typename Factory::Simulator; - using StateSpace = typename Simulator::StateSpace; - using State = typename StateSpace::State; - using MeasurementResult = typename StateSpace::MeasurementResult; - - /** - * User-specified parameters for gate fusion and simulation. - */ - struct Parameter : public Fuser::Parameter { - /** - * Random number generator seed to apply measurement gates. - */ - uint64_t seed; - }; - - /** - * Runs the given circuit, only measuring at the end. - * @param param Options for gate fusion, parallelism and logging. - * @param factory Object to create simulators and state spaces. - * @param circuit The circuit to be simulated. - * @param measure Function that performs measurements (in the sense of - * computing expectation values, etc). - * @return True if the simulation completed successfully; false otherwise. 
- */ - template - static bool Run(const Parameter& param, const Factory& factory, - const Circuit& circuit, MeasurementFunc measure) { - return Run(param, factory, {circuit.gates.back().time}, circuit, measure); - } - - /** - * Runs the given circuit, measuring at user-specified times. - * @param param Options for gate fusion, parallelism and logging. - * @param factory Object to create simulators and state spaces. - * @param times_to_measure_at Time steps at which to perform measurements. - * @param circuit The circuit to be simulated. - * @param measure Function that performs measurements (in the sense of - * computing expectation values, etc). - * @return True if the simulation completed successfully; false otherwise. - */ - template - static bool Run(const Parameter& param, const Factory& factory, - const std::vector& times_to_measure_at, - const Circuit& circuit, MeasurementFunc measure) { - double t0 = 0.0; - double t1 = 0.0; - - if (param.verbosity > 1) { - t0 = GetTime(); - } - - RGen rgen(param.seed); - - StateSpace state_space = factory.CreateStateSpace(); - - auto state = state_space.Create(circuit.num_qubits); - if (state_space.IsNull(state)) { - IO::errorf("not enough memory: is the number of qubits too large?\n"); - return false; - } - - state_space.SetStateZero(state); - Simulator simulator = factory.CreateSimulator(); - - if (param.verbosity > 1) { - t1 = GetTime(); - IO::messagef("init time is %g seconds.\n", t1 - t0); - t0 = GetTime(); - } - - auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits, - circuit.gates, times_to_measure_at); - - if (fused_gates.size() == 0 && circuit.gates.size() > 0) { - return false; - } - - if (param.verbosity > 1) { - t1 = GetTime(); - IO::messagef("fuse time is %g seconds.\n", t1 - t0); - } - - if (param.verbosity > 0) { - t0 = GetTime(); - } - - unsigned cur_time_index = 0; - - // Apply fused gates. 
- for (std::size_t i = 0; i < fused_gates.size(); ++i) { - if (param.verbosity > 3) { - t1 = GetTime(); - } - - if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, - state)) { - IO::errorf("measurement failed.\n"); - return false; - } - - if (param.verbosity > 3) { - state_space.DeviceSync(); - double t2 = GetTime(); - IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); - } - - unsigned t = times_to_measure_at[cur_time_index]; - - if (i == fused_gates.size() - 1 || t < fused_gates[i + 1].time) { - // Call back to perform measurements. - measure(cur_time_index, state_space, state); - ++cur_time_index; - } - } - - if (param.verbosity > 0) { - state_space.DeviceSync(); - double t2 = GetTime(); - IO::messagef("time is %g seconds.\n", t2 - t0); - } - - return true; - } - - /** - * Runs the given circuit and make the final state available to the caller, - * recording the result of any intermediate measurements in the circuit. - * @param param Options for gate fusion, parallelism and logging. - * @param factory Object to create simulators and state spaces. - * @param circuit The circuit to be simulated. - * @param state As an input parameter, this should contain the initial state - * of the system. After a successful run, it will be populated with the - * final state of the system. - * @param measure_results As an input parameter, this should be empty. - * After a successful run, this will contain all measurements results from - * the run, ordered by time and qubit index. - * @return True if the simulation completed successfully; false otherwise. 
- */ - template - static bool Run(const Parameter& param, const Factory& factory, - const Circuit& circuit, State& state, - std::vector& measure_results) { - double t0 = 0.0; - double t1 = 0.0; - - if (param.verbosity > 1) { - t0 = GetTime(); - } - - RGen rgen(param.seed); - - StateSpace state_space = factory.CreateStateSpace(); - Simulator simulator = factory.CreateSimulator(); - - if (param.verbosity > 1) { - t1 = GetTime(); - IO::messagef("init time is %g seconds.\n", t1 - t0); - t0 = GetTime(); - } - - auto fused_gates = Fuser::FuseGates(param, circuit.num_qubits, - circuit.gates); - - if (fused_gates.size() == 0 && circuit.gates.size() > 0) { - return false; - } - - measure_results.reserve(fused_gates.size()); - - if (param.verbosity > 1) { - t1 = GetTime(); - IO::messagef("fuse time is %g seconds.\n", t1 - t0); - } - - if (param.verbosity > 0) { - t0 = GetTime(); - } - - // Apply fused gates. - for (std::size_t i = 0; i < fused_gates.size(); ++i) { - if (param.verbosity > 3) { - t1 = GetTime(); - } - - if (!ApplyFusedGate(state_space, simulator, fused_gates[i], rgen, state, - measure_results)) { - IO::errorf("measurement failed.\n"); - return false; - } - - if (param.verbosity > 3) { - state_space.DeviceSync(); - double t2 = GetTime(); - IO::messagef("gate %lu done in %g seconds.\n", i, t2 - t1); - } - } - - if (param.verbosity > 0) { - state_space.DeviceSync(); - double t2 = GetTime(); - IO::messagef("simu time is %g seconds.\n", t2 - t0); - } - - return true; - } - - /** - * Runs the given circuit and make the final state available to the caller, - * discarding the result of any intermediate measurements in the circuit. - * @param param Options for gate fusion, parallelism and logging. - * @param factory Object to create simulators and state spaces. - * @param circuit The circuit to be simulated. - * @param state As an input parameter, this should contain the initial state - * of the system. 
After a successful run, it will be populated with the - * final state of the system. - * @return True if the simulation completed successfully; false otherwise. - */ - template - static bool Run(const Parameter& param, const Factory& factory, - const Circuit& circuit, State& state) { - std::vector discarded_results; - return Run(param, factory, circuit, state, discarded_results); - } -}; - -} // namespace qsim - -#endif // RUN_QSIM_H_ diff --git a/tpls/qsim/run_qsimh.h b/tpls/qsim/run_qsimh.h deleted file mode 100644 index c1534d3..0000000 --- a/tpls/qsim/run_qsimh.h +++ /dev/null @@ -1,120 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef RUN_QSIMH_H_ -#define RUN_QSIMH_H_ - -#include -#include - -#include "hybrid.h" -#include "util.h" - -namespace qsim { - -/** - * Helper struct for running qsimh. - */ -template -struct QSimHRunner final { - using Gate = typename HybridSimulator::Gate; - using fp_type = typename HybridSimulator::fp_type; - - using Parameter = typename HybridSimulator::Parameter; - using HybridData = typename HybridSimulator::HybridData; - using Fuser = typename HybridSimulator::Fuser; - - /** - * Evaluates the amplitudes for a given circuit and set of output states. - * @param param Options for gate fusion, parallelism and logging. Also - * specifies the size of the 'prefix' and 'root' sections of the lattice. 
- * @param factory Object to create simulators and state spaces. - * @param circuit The circuit to be simulated. - * @param parts Lattice sections to be simulated. - * @param bitstrings List of output states to simulate, as bitstrings. - * @param results Output vector of amplitudes. After a successful run, this - * will be populated with amplitudes for each state in 'bitstrings'. - * @return True if the simulation completed successfully; false otherwise. - */ - template - static bool Run(const Parameter& param, const Factory& factory, - const Circuit& circuit, const std::vector& parts, - const std::vector& bitstrings, - std::vector>& results) { - if (circuit.num_qubits != parts.size()) { - IO::errorf("parts size is not equal to the number of qubits."); - return false; - } - - double t0 = 0.0; - - if (param.verbosity > 0) { - t0 = GetTime(); - } - - HybridData hd; - bool rc = HybridSimulator::SplitLattice(parts, circuit.gates, hd); - - if (!rc) { - return false; - } - - if (hd.num_gatexs < param.num_prefix_gatexs + param.num_root_gatexs) { - IO::errorf("error: num_prefix_gates (%u) plus num_root gates (%u) is " - "greater than num_gates_on_the_cut (%u).\n", - param.num_prefix_gatexs, param.num_root_gatexs, - hd.num_gatexs); - return false; - } - - if (param.verbosity > 0) { - PrintInfo(param, hd); - } - - auto fgates0 = Fuser::FuseGates(param, hd.num_qubits0, hd.gates0); - if (fgates0.size() == 0 && hd.gates0.size() > 0) { - return false; - } - - auto fgates1 = Fuser::FuseGates(param, hd.num_qubits1, hd.gates1); - if (fgates1.size() == 0 && hd.gates1.size() > 0) { - return false; - } - - rc = HybridSimulator(param.num_threads).Run( - param, factory, hd, parts, fgates0, fgates1, bitstrings, results); - - if (rc && param.verbosity > 0) { - double t1 = GetTime(); - IO::messagef("time elapsed %g seconds.\n", t1 - t0); - } - - return rc; - } - - private: - static void PrintInfo(const Parameter& param, const HybridData& hd) { - unsigned num_suffix_gates = - hd.num_gatexs 
- param.num_prefix_gatexs - param.num_root_gatexs; - - IO::messagef("part 0: %u, part 1: %u\n", hd.num_qubits0, hd.num_qubits1); - IO::messagef("%u gates on the cut\n", hd.num_gatexs); - IO::messagef("breakup: %up+%ur+%us\n", param.num_prefix_gatexs, - param.num_root_gatexs, num_suffix_gates); - } -}; - -} // namespace qsim - -#endif // RUN_QSIM_H_ diff --git a/tpls/qsim/seqfor.h b/tpls/qsim/seqfor.h deleted file mode 100644 index 3ebf07c..0000000 --- a/tpls/qsim/seqfor.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SEQFOR_H_ -#define SEQFOR_H_ - -#include -#include -#include - -namespace qsim { - -/** - * Helper struct for executing for loops in series. - */ -struct SequentialFor { - explicit SequentialFor(unsigned num_threads) {} - - // SequentialFor does not have any state. So all its methods can be static. - - static uint64_t GetIndex0(uint64_t size, unsigned thread_id) { - return 0; - } - - static uint64_t GetIndex1(uint64_t size, unsigned thread_id) { - return size; - } - - template - static void Run(uint64_t size, Function&& func, Args&&... args) { - for (uint64_t i = 0; i < size; ++i) { - func(1, 0, i, args...); - } - } - - template - static std::vector RunReduceP( - uint64_t size, Function&& func, Op&& op, Args&&... 
args) { - typename Op::result_type result = 0; - - for (uint64_t i = 0; i < size; ++i) { - result = op(result, func(1, 0, i, args...)); - } - - return std::vector(1, result); - } - - template - static typename Op::result_type RunReduce(uint64_t size, Function&& func, - Op&& op, Args&&... args) { - return RunReduceP(size, func, std::move(op), args...)[0]; - } -}; - -} // namespace qsim - -#endif // SEQFOR_H_ diff --git a/tpls/qsim/simmux.h b/tpls/qsim/simmux.h deleted file mode 100644 index d3c4074..0000000 --- a/tpls/qsim/simmux.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMMUX_H_ -#define SIMMUX_H_ - -#ifdef __AVX512F__ -# include "simulator_avx512.h" - namespace qsim { - template - using Simulator = SimulatorAVX512; - } -#elif __AVX2__ -# include "simulator_avx.h" - namespace qsim { - template - using Simulator = SimulatorAVX; - } -#elif __SSE4_1__ -# include "simulator_sse.h" - namespace qsim { - template - using Simulator = SimulatorSSE; - } -#else -# include "simulator_basic.h" - namespace qsim { - template - using Simulator = SimulatorBasic; - } -#endif - -#endif // SIMMUX_H_ diff --git a/tpls/qsim/simmux_gpu.h b/tpls/qsim/simmux_gpu.h deleted file mode 100644 index 1f0bb59..0000000 --- a/tpls/qsim/simmux_gpu.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright 2023 Google LLC. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMMUX_GPU_H_ -#define SIMMUX_GPU_H_ - -#ifdef __CUSTATEVEC__ -# include "simulator_custatevec.h" - namespace qsim { - using SimulatorGpu = SimulatorCuStateVec<>; - } -#else -# include "simulator_cuda.h" - namespace qsim { - using SimulatorGpu = SimulatorCUDA<>; - } -#endif - -#endif // SIMMUX_GPU_H_ diff --git a/tpls/qsim/simulator.h b/tpls/qsim/simulator.h deleted file mode 100644 index eff5441..0000000 --- a/tpls/qsim/simulator.h +++ /dev/null @@ -1,516 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_H_ -#define SIMULATOR_H_ - -#include - -#include "bits.h" - -namespace qsim { - -/** - * Base class for simulator classes. - */ -class SimulatorBase { - protected: - // The follwoing template parameters are used for functions below. - // H - the number of high (target) qubits. 
- // L - the number of low (target) qubits. - // R - SIMD register width in floats. - - // Fills the table of masks (ms) that is used to calculate base state indices - // and the table of offset indices (xss) that is used to access the state - // vector entries in matrix-vector multiplication functions. This function is - // used in simulator_basic.h, simulator_sse.h and simulator_avx.h (no bmi2 - // version). - template - static void FillIndices(unsigned num_qubits, const std::vector& qs, - uint64_t* ms, uint64_t* xss) { - constexpr unsigned hsize = 1 << H; - - if (H == 0) { - ms[0] = uint64_t(-1); - xss[0] = 0; - } else { - uint64_t xs[H + 1]; - - xs[0] = uint64_t{1} << (qs[L] + 1); - ms[0] = (uint64_t{1} << qs[L]) - 1; - for (unsigned i = 1; i < H; ++i) { - xs[i] = uint64_t{1} << (qs[L + i] + 1); - ms[i] = ((uint64_t{1} << qs[L + i]) - 1) ^ (xs[i - 1] - 1); - } - ms[H] = ((uint64_t{1} << num_qubits) - 1) ^ (xs[H - 1] - 1); - - for (unsigned i = 0; i < hsize; ++i) { - uint64_t a = 0; - for (uint64_t k = 0; k < H; ++k) { - a += xs[k] * ((i >> k) & 1); - } - xss[i] = a; - } - } - } - - // Fills gate matrix entries for gates with low qubits. - template - static void FillMatrix(unsigned qmaskl, const fp_type* matrix, fp_type* w) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - constexpr unsigned rsize = 1 << R; - - unsigned s = 0; - - for (unsigned i = 0; i < hsize; ++i) { - for (unsigned j = 0; j < gsize; ++j) { - unsigned p0 = 2 * i * lsize * gsize + 2 * lsize * (j / lsize); - - for (unsigned k = 0; k < rsize; ++k) { - unsigned l = bits::CompressBits(k, R, qmaskl); - unsigned p = p0 + 2 * (gsize * l + (j + l) % lsize); - - w[s + 0] = matrix[p]; - w[s + rsize] = matrix[p + 1]; - - ++s; - } - - s += rsize; - } - } - } - - // Fills gate matrix entries for controlled gates with high target qubits - // and low control qubits. 
- template - static void FillControlledMatrixH(uint64_t cvalsl, uint64_t cmaskl, - const fp_type* matrix, fp_type* w) { - constexpr unsigned hsize = 1 << H; - constexpr unsigned rsize = 1 << R; - - unsigned s = 0; - - for (unsigned i = 0; i < hsize; ++i) { - for (unsigned j = 0; j < hsize; ++j) { - unsigned p = hsize * i + j; - fp_type v = i == j ? 1 : 0; - - for (unsigned k = 0; k < rsize; ++k) { - w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v; - w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0; - - ++s; - } - - s += rsize; - } - } - } - - // Fills gate matrix entries for controlled gates with low target qubits - // and low control qubits. - template - static void FillControlledMatrixL(uint64_t cvalsl, uint64_t cmaskl, - unsigned qmaskl, const fp_type* matrix, - fp_type* w) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - constexpr unsigned rsize = 1 << R; - - unsigned s = 0; - - for (unsigned i = 0; i < hsize; ++i) { - for (unsigned j = 0; j < gsize; ++j) { - unsigned p0 = i * lsize * gsize + lsize * (j / lsize); - - for (unsigned k = 0; k < rsize; ++k) { - unsigned l = bits::CompressBits(k, R, qmaskl); - unsigned p = p0 + gsize * l + (j + l) % lsize; - - fp_type v = p / gsize == p % gsize ? 1 : 0; - - w[s] = cvalsl == (k & cmaskl) ? matrix[2 * p] : v; - w[s + rsize] = cvalsl == (k & cmaskl) ? matrix[2 * p + 1] : 0; - - ++s; - } - - s += rsize; - } - } - } - -/* - The GetMasks* functions below provide various masks and related values. - GetMasks1, GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6 are - used in simulator_avx.h (BMI2 version) and in simulator_avx512.h. GetMasks7, - GetMasks8, GetMasks9, GetMasks10 and GetMasks11 are used in simulator_avx.h - (no BMI2 version) and in simulator_sse.h. - - imaskh - inverted mask of high qubits (high control and target qubits). - qmaskh - mask of high qubits (high target qubits). 
- cvalsh - control bit values of high control qubits placed in correct - positions. - cvalsl - control bit values of low control qubits placed in correct positions. - cmaskh - mask of high control qubits. - cmaskl - mask of low control qubits. - qmaskl - mask of low qubits (low target qubits). - cl - the number of low control qubits. - - Note that imaskh, qmaskh and cvalsh are multiplied by two in GetMasks1, - GetMasks2, GetMasks3, GetMasks4, GetMasks5 and GetMasks6. -*/ - - struct Masks1 { - uint64_t imaskh; - uint64_t qmaskh; - }; - - template - static Masks1 GetMasks1(const std::vector& qs) { - uint64_t qmaskh = 0; - - for (unsigned i = 0; i < H; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh}; - } - - struct Masks2 { - uint64_t imaskh; - uint64_t qmaskh; - unsigned qmaskl; - }; - - template - static Masks2 GetMasks2(const std::vector& qs) { - uint64_t qmaskh = 0; - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - for (unsigned i = L; i < H + L; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - return {2 * (~qmaskh ^ ((1 << R) - 1)), 2 * qmaskh, qmaskl}; - } - - struct Masks3 { - uint64_t imaskh; - uint64_t qmaskh; - uint64_t cvalsh; - }; - - template - static Masks3 GetMasks3(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - uint64_t qmaskh = 0; - uint64_t cmaskh = 0; - - for (unsigned i = 0; i < H; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - for (auto q : cqs) { - cmaskh |= uint64_t{1} << q; - } - - uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); - - uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); - - return {2 * maskh, 2 * qmaskh, 2 * cvalsh}; - } - - struct Masks4 { - uint64_t imaskh; - uint64_t qmaskh; - uint64_t cvalsh; - uint64_t cvalsl; - uint64_t cmaskl; - unsigned cl; - }; - - template - static Masks4 GetMasks4(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t 
cvals) { - unsigned cl = 0; - uint64_t qmaskh = 0; - uint64_t cmaskh = 0; - uint64_t cmaskl = 0; - - for (unsigned i = 0; i < H; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - for (auto q : cqs) { - if (q >= R) { - cmaskh |= uint64_t{1} << q; - } else { - ++cl; - cmaskl |= uint64_t{1} << q; - } - } - - uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); - uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); - - uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); - - return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, cl}; - } - - struct Masks5 { - uint64_t imaskh; - uint64_t qmaskh; - uint64_t cvalsh; - unsigned qmaskl; - }; - - template - static Masks5 GetMasks5(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - uint64_t qmaskh = 0; - uint64_t cmaskh = 0; - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - for (unsigned i = L; i < H + L; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - for (auto q : cqs) { - cmaskh |= uint64_t{1} << q; - } - - uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); - - uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); - - return {2 * maskh, 2 * qmaskh, 2 * cvalsh, qmaskl}; - } - - struct Masks6 { - uint64_t imaskh; - uint64_t qmaskh; - uint64_t cvalsh; - uint64_t cvalsl; - uint64_t cmaskl; - unsigned qmaskl; - unsigned cl; - }; - - template - static Masks6 GetMasks6(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - unsigned cl = 0; - uint64_t qmaskh = 0; - uint64_t cmaskh = 0; - uint64_t cmaskl = 0; - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - for (unsigned i = L; i < H + L; ++i) { - qmaskh |= uint64_t{1} << qs[i]; - } - - for (auto q : cqs) { - if (q >= R) { - cmaskh |= uint64_t{1} << q; - } else { - ++cl; - cmaskl |= uint64_t{1} << q; - } - } - - uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, 
cmaskh); - uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); - - uint64_t maskh = ~(qmaskh | cmaskh) ^ ((1 << R) - 1); - - return {2 * maskh, 2 * qmaskh, 2 * cvalsh, cvalsl, cmaskl, qmaskl, cl}; - } - - struct Masks7 { - uint64_t cvalsh; - uint64_t cmaskh; - }; - - static Masks7 GetMasks7(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - uint64_t cmaskh = 0; - - for (auto q : cqs) { - cmaskh |= uint64_t{1} << q; - } - - uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); - - return {cvalsh, cmaskh}; - } - - struct Masks8 { - uint64_t cvalsh; - uint64_t cmaskh; - uint64_t cvalsl; - uint64_t cmaskl; - }; - - template - static Masks8 GetMasks8(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - unsigned cl = 0; - uint64_t cmaskh = 0; - uint64_t cmaskl = 0; - - for (auto q : cqs) { - if (q >= R) { - cmaskh |= uint64_t{1} << q; - } else { - ++cl; - cmaskl |= uint64_t{1} << q; - } - } - - uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); - uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); - - return {cvalsh, cmaskh, cvalsl, cmaskl}; - } - - struct Masks9 { - uint64_t cvalsh; - uint64_t cmaskh; - unsigned qmaskl; - }; - - template - static Masks9 GetMasks9(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - uint64_t cmaskh = 0; - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - for (auto q : cqs) { - cmaskh |= uint64_t{1} << q; - } - - uint64_t cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); - - return {cvalsh, cmaskh, qmaskl}; - } - - struct Masks10 { - uint64_t cvalsh; - uint64_t cmaskh; - uint64_t cvalsl; - uint64_t cmaskl; - unsigned qmaskl; - }; - - template - static Masks10 GetMasks10(unsigned num_qubits, - const std::vector& qs, - const std::vector& cqs, uint64_t cvals) { - unsigned cl = 0; - uint64_t cmaskh = 0; - uint64_t cmaskl 
= 0; - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - for (auto q : cqs) { - if (q >= R) { - cmaskh |= uint64_t{1} << q; - } else { - ++cl; - cmaskl |= uint64_t{1} << q; - } - } - - uint64_t cvalsh = bits::ExpandBits(cvals >> cl, num_qubits, cmaskh); - uint64_t cvalsl = bits::ExpandBits(cvals & ((1 << cl) - 1), R, cmaskl); - - return {cvalsh, cmaskh, cvalsl, cmaskl, qmaskl}; - } - - struct Masks11 { - unsigned qmaskl; - }; - - template - static Masks11 GetMasks11(const std::vector& qs) { - unsigned qmaskl = 0; - - for (unsigned i = 0; i < L; ++i) { - qmaskl |= 1 << qs[i]; - } - - return {qmaskl}; - } - - template - static unsigned MaskedAdd( - unsigned a, unsigned b, unsigned mask, unsigned lsize) { - unsigned c = bits::CompressBits(a, R, mask); - return bits::ExpandBits((c + b) % lsize, R, mask); - } -}; - -template <> -inline void SimulatorBase::FillIndices<0, 1>(unsigned num_qubits, - const std::vector& qs, - uint64_t* ms, uint64_t* xss) { - ms[0] = -1; - xss[0] = 0; -} - -template <> -inline void SimulatorBase::FillIndices<0, 2>(unsigned num_qubits, - const std::vector& qs, - uint64_t* ms, uint64_t* xss) { - ms[0] = -1; - xss[0] = 0; -} - -template <> -inline void SimulatorBase::FillIndices<0, 3>(unsigned num_qubits, - const std::vector& qs, - uint64_t* ms, uint64_t* xss) { - ms[0] = -1; - xss[0] = 0; -} - -} // namespace qsim - -#endif // SIMULATOR_H_ diff --git a/tpls/qsim/simulator_avx.h b/tpls/qsim/simulator_avx.h deleted file mode 100644 index 9742849..0000000 --- a/tpls/qsim/simulator_avx.h +++ /dev/null @@ -1,1363 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_AVX_H_ -#define SIMULATOR_AVX_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "statespace_avx.h" - -namespace qsim { - -/** - * Quantum circuit simulator with AVX vectorization. - */ -template -class SimulatorAVX final : public SimulatorBase { - public: - using StateSpace = StateSpaceAVX; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - template - explicit SimulatorAVX(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using AVX instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 0: - ApplyGateH<0>(qs, matrix, state); - break; - case 1: - if (qs[0] > 2) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 2) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 2) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<2, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<1, 2>(qs, matrix, state); - } else { - ApplyGateL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 2) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<3, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<2, 2>(qs, matrix, state); - } else { - ApplyGateL<1, 3>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 2) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<4, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<3, 2>(qs, matrix, state); - } else { - ApplyGateL<2, 3>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 2) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<5, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<4, 2>(qs, matrix, state); - } else { - ApplyGateL<3, 3>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using AVX instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. 
- */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - // Assume cqs[0] < cqs[1] < cqs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 0: - if (cqs[0] > 2) { - ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state); - } - break; - case 1: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { 
- if (cqs[0] > 2) { - ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 4: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Computes the expectation value of an operator using AVX instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 1: - if (qs[0] > 2) { - return ExpectationValueH<1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 2) { - return ExpectationValueH<2>(qs, matrix, state); - } else if (qs[1] > 2) { - return ExpectationValueL<1, 1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 2) { - return ExpectationValueH<3>(qs, matrix, state); - } else if (qs[1] > 2) { - return ExpectationValueL<2, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - return ExpectationValueL<1, 2>(qs, matrix, state); - } else { - return ExpectationValueL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 2) { - return ExpectationValueH<4>(qs, matrix, state); - } else if (qs[1] > 2) { - return ExpectationValueL<3, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - return ExpectationValueL<2, 2>(qs, matrix, state); - } else { - return ExpectationValueL<1, 3>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 2) { - return ExpectationValueH<5>(qs, matrix, state); - } else if (qs[1] > 2) { - return ExpectationValueL<4, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - return ExpectationValueL<3, 2>(qs, matrix, state); - } else { - return ExpectationValueL<2, 3>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 2) { - return ExpectationValueH<6>(qs, matrix, state); - } else if (qs[1] > 2) { - return ExpectationValueL<5, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - return ExpectationValueL<4, 2>(qs, matrix, state); - } else { - return ExpectationValueL<3, 3>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - - return 0; - } - - /** - * @return The size of SIMD register if applicable. 
- */ - static unsigned SIMDRegisterSize() { - return 8; - } - - private: -#ifdef __BMI2__ - - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - auto m = GetMasks1(qs); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm256_load_ps(p0 + p); - is[k2] = _mm256_load_ps(p0 + p + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks2(qs); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - - unsigned k = 3 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 rn, in; - __m256 rs[hsize], is[hsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256 w[1 << (1 + 2 * H)]; - - auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - const __m256i* idx, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm256_load_ps(p0 + p); - is[k2] = _mm256_load_ps(p0 + p + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - if (CH) { - auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H + cqs.size(); - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); - } else { - auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); - } - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - auto m = GetMasks1(qs); - 
- unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return - for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get()); - } - - template - std::complex ExpectationValueL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, - const fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm256_load_ps(p0 + p); - is[k2] = _mm256_load_ps(p0 + p + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - unsigned m = lsize * k; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks2(qs); - 
FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return - for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get()); - } - -#else // __BMI2__ - - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, const __m256i* idx, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - rs[k2] = _mm256_load_ps(p0 + xss[k]); - is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, idx, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 rn, in; - __m256 rs[hsize], is[hsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256 w[1 << (1 + 2 * H)]; - - auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, const __m256i* idx, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm256_load_ps(p0 + xss[k]); - is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - if (CH) { - auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get()); - } else { - auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, idx, state.get()); - } - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - __m256 v_re = _mm256_fmadd_ps(is[k], in, _mm256_mul_ps(rs[k], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[k], rn, _mm256_mul_ps(rs[k], in)); - - re += detail::HorizontalSumAVX(v_re); - im += 
detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); - } - - template - std::complex ExpectationValueL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, const __m256i* idx, - const fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - i *= 8; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm256_load_ps(p0 + xss[k]); - is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - unsigned m = lsize * k; - - __m256 v_re = _mm256_fmadd_ps(is[m], in, _mm256_mul_ps(rs[m], rn)); - __m256 v_im = _mm256_fnmadd_ps(is[m], rn, _mm256_mul_ps(rs[m], in)); - - 
re += detail::HorizontalSumAVX(v_re); - im += detail::HorizontalSumAVX(v_im); - } - - return std::complex{re, im}; - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 3 + H; - unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, idx, state.get()); - } - -#endif // __BMI2__ - - template - static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) { - constexpr unsigned lsize = 1 << L; - - for (unsigned i = 0; i < lsize - 1; ++i) { - unsigned p[8]; - - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - } - - For for_; -}; - -} // namespace qsim - -#endif // SIMULATOR_AVX_H_ diff --git a/tpls/qsim/simulator_avx512.h b/tpls/qsim/simulator_avx512.h deleted file mode 100644 index 21a2e9d..0000000 --- a/tpls/qsim/simulator_avx512.h +++ /dev/null @@ -1,846 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef SIMULATOR_AVX512_H_ -#define SIMULATOR_AVX512_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "statespace_avx512.h" - -namespace qsim { - -/** - * Quantum circuit simulator with AVX512 vectorization. - */ -template -class SimulatorAVX512 final : public SimulatorBase { - public: - using StateSpace = StateSpaceAVX512; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - template - explicit SimulatorAVX512(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using AVX512 instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 0: - ApplyGateH<0>(qs, matrix, state); - break; - case 1: - if (qs[0] > 3) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 3) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 3) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<2, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<1, 2>(qs, matrix, state); - } else { - ApplyGateL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 3) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<3, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<2, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<1, 3>(qs, matrix, state); - } else { - ApplyGateL<0, 4>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 3) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 
3) { - ApplyGateL<4, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<3, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<2, 3>(qs, matrix, state); - } else { - ApplyGateL<1, 4>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 3) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<5, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<4, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<3, 3>(qs, matrix, state); - } else { - ApplyGateL<2, 4>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using AVX512 instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - // Assume cqs[0] < cqs[1] < cqs[2] < ... . 
- - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 0: - if (cqs[0] > 3) { - ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state); - } - break; - case 1: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 4: - if (qs[0] > 3) { - if (cqs[0] > 3) { - 
ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[3] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Computes the expectation value of an operator using AVX512 instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 1: - if (qs[0] > 3) { - return ExpectationValueH<1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 3) { - return ExpectationValueH<2>(qs, matrix, state); - } else if (qs[1] > 3) { - return ExpectationValueL<1, 1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 3) { - return ExpectationValueH<3>(qs, matrix, state); - } else if (qs[1] > 3) { - return ExpectationValueL<2, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - return ExpectationValueL<1, 2>(qs, matrix, state); - } else { - return ExpectationValueL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 3) { - return ExpectationValueH<4>(qs, matrix, state); - } else if (qs[1] > 3) { - return ExpectationValueL<3, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - return ExpectationValueL<2, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - return ExpectationValueL<1, 3>(qs, matrix, state); - } else { - return ExpectationValueL<0, 4>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 3) { - return ExpectationValueH<5>(qs, matrix, state); - } else if (qs[1] > 3) { - return ExpectationValueL<4, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - return ExpectationValueL<3, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - return ExpectationValueL<2, 3>(qs, matrix, state); - } else { - return ExpectationValueL<1, 4>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 3) { - return ExpectationValueH<6>(qs, matrix, state); - } else if (qs[1] > 3) { - return ExpectationValueL<5, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - return ExpectationValueL<4, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - return ExpectationValueL<3, 3>(qs, matrix, state); - } else { - return ExpectationValueL<2, 4>(qs, matrix, state); - } - break; - default: - // Not implemented. 
- break; - } - - return 0; - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 16; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 ru, iu, rn, in; - __m512 rs[hsize], is[hsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[l], ru, rn); - in = _mm512_fmadd_ps(rs[l], iu, in); - rn = _mm512_fnmadd_ps(is[l], iu, rn); - in = _mm512_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - auto m = GetMasks1(qs); - - unsigned k = 4 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, m.imaskh, m.qmaskh, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m512 rn, in; - __m512 rs[gsize], is[gsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm512_load_ps(p0 + p); - is[k2] = _mm512_load_ps(p0 + p + 16); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); - is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512i idx[1 << L]; - __m512 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks2(qs); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 4 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, idx, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 ru, iu, rn, in; - __m512 rs[hsize], is[hsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[l], ru, rn); - in = _mm512_fmadd_ps(rs[l], iu, in); - rn = _mm512_fnmadd_ps(is[l], iu, rn); - in = _mm512_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - - unsigned k = 4 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, m.imaskh, m.qmaskh, m.cvalsh, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 rn, in; - __m512 rs[hsize], is[hsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512 w[1 << (1 + 2 * H)]; - - auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned r = 4 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - const __m512i* idx, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m512 rn, in; - __m512 rs[gsize], is[gsize]; - - auto p0 = rstate + (_pdep_u64(i, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm512_load_ps(p0 + p); - is[k2] = _mm512_load_ps(p0 + p + 16); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); - is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512i idx[1 << L]; - __m512 w[1 << (1 + 2 * H + L)]; - - if (CH) { - auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 4 + H + cqs.size(); - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); - } else { - auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 4 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, m.imaskh, m.qmaskh, m.cvalsh, idx, state.get()); - } - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, const fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 ru, iu, rn, in; - __m512 rs[hsize], is[hsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[l], ru, rn); - in = _mm512_fmadd_ps(rs[l], iu, in); - rn = _mm512_fnmadd_ps(is[l], iu, rn); - in = _mm512_fmadd_ps(is[l], ru, in); - - j += 2; - } - - __m512 v_re = _mm512_fmadd_ps(is[k], in, _mm512_mul_ps(rs[k], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[k], rn, _mm512_mul_ps(rs[k], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - auto m = 
GetMasks1(qs); - - unsigned k = 4 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return - for_.RunReduce(size, f, Op(), matrix, m.imaskh, m.qmaskh, state.get()); - } - - template - std::complex ExpectationValueL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, - const fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m512 rn, in; - __m512 rs[gsize], is[gsize]; - - auto p0 = rstate + _pdep_u64(i, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm512_load_ps(p0 + p); - is[k2] = _mm512_load_ps(p0 + p + 16); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); - is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); - } - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - unsigned m = lsize * k; - - __m512 v_re = _mm512_fmadd_ps(is[m], in, _mm512_mul_ps(rs[m], rn)); - __m512 v_im = _mm512_fnmadd_ps(is[m], rn, _mm512_mul_ps(rs[m], in)); - - re += detail::HorizontalSumAVX512(v_re); - im += detail::HorizontalSumAVX512(v_im); - } - - return std::complex{re, im}; - }; - - __m512i idx[1 << L]; - __m512 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks2(qs); - 
FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned r = 4 + H; - unsigned n = state.num_qubits() > r ? state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return - for_.RunReduce(size, f, Op(), w, m.imaskh, m.qmaskh, idx, state.get()); - } - - template - static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) { - constexpr unsigned lsize = 1 << L; - - for (unsigned i = 0; i < lsize; ++i) { - unsigned p[16]; - - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - } - - For for_; -}; - -} // namespace qsim - -#endif // SIMULATOR_AVX512_H_ diff --git a/tpls/qsim/simulator_basic.h b/tpls/qsim/simulator_basic.h deleted file mode 100644 index 752eeb5..0000000 --- a/tpls/qsim/simulator_basic.h +++ /dev/null @@ -1,349 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_BASIC_H_ -#define SIMULATOR_BASIC_H_ - -#include -#include -#include -#include - -#include "simulator.h" -#include "statespace_basic.h" - -namespace qsim { - -/** - * Quantum circuit simulator without vectorization. 
- */ -template -class SimulatorBasic final : public SimulatorBase { - public: - using StateSpace = StateSpaceBasic; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - template - explicit SimulatorBasic(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using non-vectorized instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 0: - ApplyGateH<0>(qs, matrix, state); - break; - case 1: - ApplyGateH<1>(qs, matrix, state); - break; - case 2: - ApplyGateH<2>(qs, matrix, state); - break; - case 3: - ApplyGateH<3>(qs, matrix, state); - break; - case 4: - ApplyGateH<4>(qs, matrix, state); - break; - case 5: - ApplyGateH<5>(qs, matrix, state); - break; - case 6: - ApplyGateH<6>(qs, matrix, state); - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using non-vectorized instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 0: - ApplyControlledGateH<0>(qs, cqs, cvals, matrix, state); - break; - case 1: - ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state); - break; - case 2: - ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state); - break; - case 3: - ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state); - break; - case 4: - ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state); - break; - default: - // Not implemented. - break; - } - } - - /** - * Computes the expectation value of an operator using non-vectorized - * instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 1: - return ExpectationValueH<1>(qs, matrix, state); - break; - case 2: - return ExpectationValueH<2>(qs, matrix, state); - break; - case 3: - return ExpectationValueH<3>(qs, matrix, state); - break; - case 4: - return ExpectationValueH<4>(qs, matrix, state); - break; - case 5: - return ExpectationValueH<5>(qs, matrix, state); - break; - case 6: - return ExpectationValueH<6>(qs, matrix, state); - break; - default: - // Not implemented. - break; - } - - return 0; - } - - /** - * @return The size of SIMD register if applicable. 
- */ - static unsigned SIMDRegisterSize() { - return 1; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - fp_type rn, in; - fp_type rs[hsize], is[hsize]; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 1); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn += rs[l] * v[j] - is[l] * v[j + 1]; - in += rs[l] * v[j + 1] + is[l] * v[j]; - - j += 2; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 1) = in; - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, state.get()); - } - - template - void ApplyControlledGateH(const std::vector& qs, - const std::vector& cqs, - uint64_t cvals, const fp_type* matrix, - State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - uint64_t cvalsh, uint64_t cmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - fp_type rn, in; - fp_type rs[hsize], is[hsize]; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) == cvalsh) { - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 1); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn += rs[l] * v[j] - is[l] * v[j + 1]; - in += rs[l] * v[j + 1] + is[l] * v[j]; - - j += 2; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 1) = in; - } - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - - unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - fp_type rn, in; - fp_type rs[hsize], is[hsize]; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 1); - } - - double re = 0; - double im = 0; - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn += rs[l] * v[j] - is[l] * v[j + 1]; - in += rs[l] * v[j + 1] + is[l] * v[j]; - - j += 2; - } - - re += rs[k] * rn + is[k] * in; - im += rs[k] * in - is[k] * rn; - } - - return std::complex{re, im}; - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); - } - - For for_; -}; - -} // namespace qsim - -#endif // SIMULATOR_BASIC_H_ diff --git a/tpls/qsim/simulator_cuda.h b/tpls/qsim/simulator_cuda.h deleted file mode 100644 index 5743bea..0000000 --- a/tpls/qsim/simulator_cuda.h +++ /dev/null @@ -1,923 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_CUDA_H_ -#define SIMULATOR_CUDA_H_ - -#include "simulator_cuda_kernels.h" - -#include -#include -#include -#include -#include - -#include "bits.h" -#include "statespace_cuda.h" - -namespace qsim { - -/** - * Quantum circuit simulator with GPU vectorization. - */ -template -class SimulatorCUDA final { - private: - using idx_type = uint64_t; - using Complex = qsim::Complex; - - // The maximum buffer size for indices and gate matrices. - // The maximum gate matrix size (for 6-qubit gates) is - // 2 * 2^6 * 2^6 * sizeof(FP) = 8192 * sizeof(FP). The maximum index size is - // 128 * sizeof(idx_type) + 96 * sizeof(unsigned). - static constexpr unsigned max_buf_size = 8192 * sizeof(FP) - + 128 * sizeof(idx_type) + 96 * sizeof(unsigned); - - public: - using StateSpace = StateSpaceCUDA; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - SimulatorCUDA() : scratch_(nullptr), scratch_size_(0) { - ErrorCheck(cudaMalloc(&d_ws, max_buf_size)); - } - - ~SimulatorCUDA() { - ErrorCheck(cudaFree(d_ws)); - - if (scratch_ != nullptr) { - ErrorCheck(cudaFree(scratch_)); - } - } - - /** - * Applies a gate using CUDA instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - if (qs.size() == 0) { - ApplyGateH<0>(qs, matrix, state); - } else if (qs[0] > 4) { - switch (qs.size()) { - case 1: - ApplyGateH<1>(qs, matrix, state); - break; - case 2: - ApplyGateH<2>(qs, matrix, state); - break; - case 3: - ApplyGateH<3>(qs, matrix, state); - break; - case 4: - ApplyGateH<4>(qs, matrix, state); - break; - case 5: - ApplyGateH<5>(qs, matrix, state); - break; - case 6: - ApplyGateH<6>(qs, matrix, state); - break; - default: - // Not implemented. - break; - } - } else { - switch (qs.size()) { - case 1: - ApplyGateL<1>(qs, matrix, state); - break; - case 2: - ApplyGateL<2>(qs, matrix, state); - break; - case 3: - ApplyGateL<3>(qs, matrix, state); - break; - case 4: - ApplyGateL<4>(qs, matrix, state); - break; - case 5: - ApplyGateL<5>(qs, matrix, state); - break; - case 6: - ApplyGateL<6>(qs, matrix, state); - break; - default: - // Not implemented. - break; - } - } - } - - /** - * Applies a controlled gate using CUDA instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - if (cqs[0] < 5) { - switch (qs.size()) { - case 0: - ApplyControlledGateL<0>(qs, cqs, cvals, matrix, state); - break; - case 1: - ApplyControlledGateL<1>(qs, cqs, cvals, matrix, state); - break; - case 2: - ApplyControlledGateL<2>(qs, cqs, cvals, matrix, state); - break; - case 3: - ApplyControlledGateL<3>(qs, cqs, cvals, matrix, state); - break; - case 4: - ApplyControlledGateL<4>(qs, cqs, cvals, matrix, state); - break; - default: - // Not implemented. 
- break; - } - } else { - if (qs.size() == 0) { - ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); - } else if (qs[0] > 4) { - switch (qs.size()) { - case 1: - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - break; - case 2: - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - break; - case 3: - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - break; - case 4: - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - break; - default: - // Not implemented. - break; - } - } else { - switch (qs.size()) { - case 1: - ApplyControlledGateLH<1>(qs, cqs, cvals, matrix, state); - break; - case 2: - ApplyControlledGateLH<2>(qs, cqs, cvals, matrix, state); - break; - case 3: - ApplyControlledGateLH<3>(qs, cqs, cvals, matrix, state); - break; - case 4: - ApplyControlledGateLH<4>(qs, cqs, cvals, matrix, state); - break; - default: - // Not implemented. - break; - } - } - } - } - - /** - * Computes the expectation value of an operator using CUDA instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - if (qs[0] > 4) { - switch (qs.size()) { - case 1: - return ExpectationValueH<1>(qs, matrix, state); - case 2: - return ExpectationValueH<2>(qs, matrix, state); - case 3: - return ExpectationValueH<3>(qs, matrix, state); - case 4: - return ExpectationValueH<4>(qs, matrix, state); - case 5: - return ExpectationValueH<5>(qs, matrix, state); - case 6: - return ExpectationValueH<6>(qs, matrix, state); - default: - // Not implemented. 
- break; - } - } else { - switch (qs.size()) { - case 1: - return ExpectationValueL<1>(qs, matrix, state); - case 2: - return ExpectationValueL<2>(qs, matrix, state); - case 3: - return ExpectationValueL<3>(qs, matrix, state); - case 4: - return ExpectationValueL<4>(qs, matrix, state); - case 5: - return ExpectationValueL<5>(qs, matrix, state); - case 6: - return ExpectationValueL<6>(qs, matrix, state); - default: - // Not implemented. - break; - } - } - - return 0; - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 32; - } - - private: - // The following indices are used in kernels. - // xss - indices to access the state vector entries in global memory. - // ms - masks to access the state vector entries in global memory. - // tis - indices to access the state vector entries in shared memory - // in the presence of low gate qubits. - // qis - indices to access the state vector entries in shared memory - // in the presence of low gate qubits. - // cis - additional indices to access the state vector entries in global - // memory in the presence of low control qubits. 
- - template - struct IndicesH { - static constexpr unsigned gsize = 1 << G; - static constexpr unsigned matrix_size = 2 * gsize * gsize * sizeof(fp_type); - static constexpr unsigned xss_size = 32 * sizeof(idx_type) * (1 + (G == 6)); - static constexpr unsigned ms_size = 32 * sizeof(idx_type); - static constexpr unsigned xss_offs = matrix_size; - static constexpr unsigned ms_offs = xss_offs + xss_size; - static constexpr unsigned buf_size = ms_offs + ms_size; - - IndicesH(char* p) - : xss((idx_type*) (p + xss_offs)), ms((idx_type*) (p + ms_offs)) {} - - idx_type* xss; - idx_type* ms; - }; - - template - struct IndicesL : public IndicesH { - using Base = IndicesH; - static constexpr unsigned qis_size = 32 * sizeof(unsigned) * (1 + (G == 6)); - static constexpr unsigned tis_size = 32 * sizeof(unsigned); - static constexpr unsigned qis_offs = Base::buf_size; - static constexpr unsigned tis_offs = qis_offs + qis_size; - static constexpr unsigned buf_size = tis_offs + tis_size; - - IndicesL(char* p) - : Base(p), qis((unsigned*) (p + qis_offs)), - tis((unsigned*) (p + tis_offs)) {} - - unsigned* qis; - unsigned* tis; - }; - - template - struct IndicesLC : public IndicesL { - using Base = IndicesL; - static constexpr unsigned cis_size = 32 * sizeof(idx_type); - static constexpr unsigned cis_offs = Base::buf_size; - static constexpr unsigned buf_size = cis_offs + cis_size; - - IndicesLC(char* p) : Base(p), cis((idx_type*) (p + cis_offs)) {} - - idx_type* cis; - }; - - struct DataC { - idx_type cvalsh; - unsigned num_aqs; - unsigned num_effective_qs; - unsigned remaining_low_cqs; - }; - - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesH h_i(h_ws); - GetIndicesH(num_qubits, qs, qs.size(), h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + G; - unsigned n 
= num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; - unsigned threads = 64U; - unsigned blocks = std::max(1U, size / 2); - - IndicesH d_i(d_ws); - - ApplyGateH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesL h_i(h_ws); - auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + num_effective_qs; - unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; - unsigned threads = 32; - unsigned blocks = size; - - IndicesL d_i(d_ws); - - ApplyGateL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, - 1 << num_effective_qs, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, idx_type cvals, - const fp_type* matrix, State& state) const { - unsigned aqs[64]; - idx_type cmaskh = 0; - unsigned num_qubits = state.num_qubits(); - - IndicesH h_i(h_ws); - - unsigned num_aqs = GetHighQubits(qs, 0, cqs, 0, 0, cmaskh, aqs); - GetMs(num_qubits, aqs, num_aqs, h_i.ms); - GetXss(num_qubits, qs, qs.size(), h_i.xss); - - idx_type cvalsh = bits::ExpandBits(cvals, num_qubits, cmaskh); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + G + cqs.size(); - unsigned n = num_qubits > k ? 
num_qubits - k : 0; - unsigned size = unsigned{1} << n; - unsigned threads = 64U; - unsigned blocks = std::max(1U, size / 2); - - IndicesH d_i(d_ws); - - ApplyControlledGateH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, num_aqs + 1, cvalsh, state.get()); - } - - template - void ApplyControlledGateLH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesL h_i(h_ws); - auto d = GetIndicesLC(num_qubits, qs, cqs, cvals, h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + G + cqs.size(); - unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; - unsigned threads = 32; - unsigned blocks = size; - - IndicesL d_i(d_ws); - - ApplyControlledGateLH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, - d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesLC h_i(h_ws); - auto d = GetIndicesLCL(num_qubits, qs, cqs, cvals, h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + G + cqs.size(); - unsigned n = num_qubits > k ? 
num_qubits - k : 0; - unsigned size = unsigned{1} << n; - unsigned threads = 32; - unsigned blocks = size; - - IndicesLC d_i(d_ws); - - ApplyControlledGateL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, d_i.cis, - d.num_aqs + 1, d.cvalsh, 1 << d.num_effective_qs, - 1 << (5 - d.remaining_low_cqs), state.get()); - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesH h_i(h_ws); - GetIndicesH(num_qubits, qs, qs.size(), h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + G; - unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; - - unsigned s = std::min(n >= 14 ? n - 14 : 0, 4U); - unsigned threads = 64U; - unsigned blocks = std::max(1U, (size / 2) >> s); - unsigned num_iterations_per_block = 1 << s; - - constexpr unsigned m = 16; - - Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); - Complex* d_res2 = d_res1 + blocks; - - IndicesH d_i(d_ws); - - ExpectationValueH_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, num_iterations_per_block, - state.get(), Plus(), d_res1); - - double mul = size == 1 ? 0.5 : 1.0; - - return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); - } - - template - std::complex ExpectationValueL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - unsigned num_qubits = state.num_qubits(); - - IndicesL h_i(h_ws); - auto num_effective_qs = GetIndicesL(num_qubits, qs, h_i); - - std::memcpy((fp_type*) h_ws, matrix, h_i.matrix_size); - ErrorCheck( - cudaMemcpyAsync(d_ws, h_ws, h_i.buf_size, cudaMemcpyHostToDevice)); - - unsigned k = 5 + num_effective_qs; - unsigned n = num_qubits > k ? num_qubits - k : 0; - unsigned size = unsigned{1} << n; - - unsigned s = std::min(n >= 13 ? 
n - 13 : 0, 5U); - unsigned threads = 32; - unsigned blocks = size >> s; - unsigned num_iterations_per_block = 1 << s; - - constexpr unsigned m = 16; - - Complex* d_res1 = (Complex*) AllocScratch((blocks + m) * sizeof(Complex)); - Complex* d_res2 = d_res1 + blocks; - - IndicesL d_i(d_ws); - - ExpectationValueL_Kernel<<>>( - (fp_type*) d_ws, d_i.xss, d_i.ms, d_i.qis, d_i.tis, - num_iterations_per_block, state.get(), Plus(), d_res1); - - double mul = double(1 << (5 + num_effective_qs - G)) / 32; - - return ExpectationValueReduceFinal(blocks, mul, d_res1, d_res2); - } - - template - std::complex ExpectationValueReduceFinal( - unsigned blocks, double mul, - const Complex* d_res1, Complex* d_res2) const { - Complex res2[m]; - - if (blocks <= 16) { - ErrorCheck(cudaMemcpy(res2, d_res1, blocks * sizeof(Complex), - cudaMemcpyDeviceToHost)); - } else { - unsigned threads2 = std::min(1024U, blocks); - unsigned blocks2 = std::min(m, blocks / threads2); - - unsigned dblocks = std::max(1U, blocks / (blocks2 * threads2)); - unsigned bytes = threads2 * sizeof(Complex); - - Reduce2Kernel<<>>( - dblocks, blocks, Plus(), Plus(), d_res1, d_res2); - - ErrorCheck(cudaMemcpy(res2, d_res2, blocks2 * sizeof(Complex), - cudaMemcpyDeviceToHost)); - - blocks = blocks2; - } - - double re = 0; - double im = 0; - - for (unsigned i = 0; i < blocks; ++i) { - re += res2[i].re; - im += res2[i].im; - } - - return {mul * re, mul * im}; - } - - template - unsigned GetHighQubits(const std::vector& qs, unsigned qi, - const std::vector& cqs, unsigned ci, - unsigned ai, idx_type& cmaskh, AQ& aqs) const { - while (1) { - if (qi < qs.size() && (ci == cqs.size() || qs[qi] < cqs[ci])) { - aqs[ai++] = qs[qi++]; - } else if (ci < cqs.size()) { - cmaskh |= idx_type{1} << cqs[ci]; - aqs[ai++] = cqs[ci++]; - } else { - break; - } - } - - return ai; - } - - template - void GetMs(unsigned num_qubits, const QS& qs, unsigned qs_size, - idx_type* ms) const { - if (qs_size == 0) { - ms[0] = idx_type(-1); - } else { - 
idx_type xs = idx_type{1} << (qs[0] + 1); - ms[0] = (idx_type{1} << qs[0]) - 1; - for (unsigned i = 1; i < qs_size; ++i) { - ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs - 1); - xs = idx_type{1} << (qs[i] + 1); - } - ms[qs_size] = ((idx_type{1} << num_qubits) - 1) ^ (xs - 1); - } - } - - template - void GetXss(unsigned num_qubits, const QS& qs, unsigned qs_size, - idx_type* xss) const { - if (qs_size == 0) { - xss[0] = 0; - } else { - unsigned g = qs_size; - unsigned gsize = 1 << qs_size; - - idx_type xs[64]; - - xs[0] = idx_type{1} << (qs[0] + 1); - for (unsigned i = 1; i < g; ++i) { - xs[i] = idx_type{1} << (qs[i] + 1); - } - - for (unsigned i = 0; i < gsize; ++i) { - idx_type a = 0; - for (unsigned k = 0; k < g; ++k) { - a += xs[k] * ((i >> k) & 1); - } - xss[i] = a; - } - } - } - - template - void GetIndicesH(unsigned num_qubits, const qs_type& qs, unsigned qs_size, - IndicesH& indices) const { - if (qs_size == 0) { - indices.ms[0] = idx_type(-1); - indices.xss[0] = 0; - } else { - unsigned g = qs_size; - unsigned gsize = 1 << qs_size; - - idx_type xs[64]; - - xs[0] = idx_type{1} << (qs[0] + 1); - indices.ms[0] = (idx_type{1} << qs[0]) - 1; - for (unsigned i = 1; i < g; ++i) { - xs[i] = idx_type{1} << (qs[i] + 1); - indices.ms[i] = ((idx_type{1} << qs[i]) - 1) ^ (xs[i - 1] - 1); - } - indices.ms[g] = ((idx_type{1} << num_qubits) - 1) ^ (xs[g - 1] - 1); - - for (unsigned i = 0; i < gsize; ++i) { - idx_type a = 0; - for (unsigned k = 0; k < g; ++k) { - a += xs[k] * ((i >> k) & 1); - } - indices.xss[i] = a; - } - } - } - - template - void GetIndicesL(unsigned num_effective_qs, unsigned qmask, - IndicesL& indices) const { - for (unsigned i = num_effective_qs + 1; i < (G + 1); ++i) { - indices.ms[i] = 0; - } - - for (unsigned i = (1 << num_effective_qs); i < indices.gsize; ++i) { - indices.xss[i] = 0; - } - - for (unsigned i = 0; i < indices.gsize; ++i) { - indices.qis[i] = bits::ExpandBits(i, 5 + num_effective_qs, qmask); - } - - unsigned tmask = ((1 << (5 + 
num_effective_qs)) - 1) ^ qmask; - for (unsigned i = 0; i < 32; ++i) { - indices.tis[i] = bits::ExpandBits(i, 5 + num_effective_qs, tmask); - } - } - - template - unsigned GetIndicesL(unsigned num_qubits, const std::vector& qs, - IndicesL& indices) const { - unsigned eqs[32]; - - unsigned qmaskh = 0; - unsigned qmaskl = 0; - - unsigned qi = 0; - - while (qi < qs.size() && qs[qi] < 5) { - qmaskl |= 1 << qs[qi++]; - } - - unsigned nq = std::max(5U, num_qubits); - unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - - unsigned l = 0; - unsigned ei = 0; - unsigned num_low_qs = qi; - - if (qs.size() == num_low_qs) { - while (ei < num_effective_qs && l++ < num_low_qs) { - eqs[ei] = ei + 5; - ++ei; - } - } else { - while (ei < num_effective_qs && l < num_low_qs) { - unsigned ei5 = ei + 5; - eqs[ei] = ei5; - if (qi < qs.size() && qs[qi] == ei5) { - ++qi; - qmaskh |= 1 << ei5; - } else { - ++l; - } - ++ei; - } - - while (ei < num_effective_qs) { - eqs[ei] = qs[qi++]; - qmaskh |= 1 << (ei + 5); - ++ei; - } - } - - GetIndicesH(num_qubits, eqs, num_effective_qs, indices); - GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - - return num_effective_qs; - } - - template - DataC GetIndicesLC(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - IndicesL& indices) const { - unsigned aqs[64]; - unsigned eqs[32]; - - unsigned qmaskh = 0; - unsigned qmaskl = 0; - idx_type cmaskh = 0; - - unsigned qi = 0; - - while (qi < qs.size() && qs[qi] < 5) { - qmaskl |= 1 << qs[qi++]; - } - - unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); - unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - - unsigned l = 0; - unsigned ai = 5; - unsigned ci = 0; - unsigned ei = 0; - unsigned num_low_qs = qi; - - while (ai < num_qubits && l < num_low_qs) { - aqs[ai - 5] = ai; - if (qi < qs.size() && qs[qi] == ai) { - ++qi; - eqs[ei++] = ai; - qmaskh |= 1 << (ai - ci); - } else if (ci < cqs.size() && cqs[ci] == ai) { - ++ci; 
- cmaskh |= idx_type{1} << ai; - } else { - ++l; - eqs[ei++] = ai; - } - ++ai; - } - - unsigned i = ai; - unsigned j = qi; - - while (ei < num_effective_qs) { - eqs[ei++] = qs[j++]; - qmaskh |= 1 << (i++ - ci); - } - - unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); - GetMs(num_qubits, aqs, num_aqs, indices.ms); - GetXss(num_qubits, eqs, num_effective_qs, indices.xss); - GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - - idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); - - return {cvalsh, num_aqs, num_effective_qs}; - } - - template - DataC GetIndicesLCL(unsigned num_qubits, const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - IndicesLC& indices) const { - unsigned aqs[64]; - unsigned eqs[32]; - - unsigned qmaskh = 0; - unsigned qmaskl = 0; - idx_type cmaskh = 0; - idx_type cmaskl = 0; - idx_type cis_mask = 0; - - unsigned qi = 0; - unsigned ci = 0; - - for (unsigned k = 0; k < 5; ++k) { - if (qi < qs.size() && qs[qi] == k) { - qmaskl |= 1 << (k - ci); - ++qi; - } else if (ci < cqs.size() && cqs[ci] == k) { - cmaskl |= idx_type{1} << k; - ++ci; - } - } - - unsigned num_low_qs = qi; - unsigned num_low_cqs = ci; - - unsigned nq = std::max(5U, num_qubits - unsigned(cqs.size())); - unsigned num_effective_qs = std::min(nq - 5, unsigned(qs.size())); - - unsigned l = 0; - unsigned ai = 5; - unsigned ei = 0; - unsigned num_low = num_low_qs + num_low_cqs; - unsigned remaining_low_cqs = num_low_cqs; - unsigned effective_low_qs = num_low_qs; - unsigned highest_cis_bit = 0; - - while (ai < num_qubits && l < num_low) { - aqs[ai - 5] = ai; - if (qi < qs.size() && qs[qi] == ai) { - ++qi; - if ((ai - ci) > 4) { - eqs[ei++] = ai; - qmaskh |= 1 << (ai - ci); - } else { - highest_cis_bit = ai; - cis_mask |= idx_type{1} << ai; - qmaskl |= 1 << (ai - ci); - --remaining_low_cqs; - ++effective_low_qs; - } - } else if (ci < cqs.size() && cqs[ci] == ai) { - ++ci; - cmaskh |= idx_type{1} << ai; - } else { - ++l; - if 
(remaining_low_cqs == 0) { - eqs[ei++] = ai; - } else { - highest_cis_bit = ai; - cis_mask |= idx_type{1} << ai; - --remaining_low_cqs; - } - } - ++ai; - } - - unsigned i = ai; - unsigned j = effective_low_qs; - - while (ei < num_effective_qs) { - eqs[ei++] = qs[j++]; - qmaskh |= 1 << (i++ - ci); - } - - unsigned num_aqs = GetHighQubits(qs, qi, cqs, ci, ai - 5, cmaskh, aqs); - GetMs(num_qubits, aqs, num_aqs, indices.ms); - GetXss(num_qubits, eqs, num_effective_qs, indices.xss); - GetIndicesL(num_effective_qs, qmaskh | qmaskl, indices); - - idx_type cvalsh = bits::ExpandBits(idx_type(cvals), num_qubits, cmaskh); - idx_type cvalsl = bits::ExpandBits(idx_type(cvals), 5, cmaskl); - - cis_mask |= 31 ^ cmaskl; - highest_cis_bit = highest_cis_bit < 5 ? 5 : highest_cis_bit; - for (idx_type i = 0; i < 32; ++i) { - auto c = bits::ExpandBits(i, highest_cis_bit + 1, cis_mask); - indices.cis[i] = 2 * (c & 0xffffffe0) | (c & 0x1f) | cvalsl; - } - - return {cvalsh, num_aqs, num_effective_qs, remaining_low_cqs}; - } - - - void* AllocScratch(uint64_t size) const { - if (size > scratch_size_) { - if (scratch_ != nullptr) { - ErrorCheck(cudaFree(scratch_)); - } - - ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); - - const_cast(scratch_size_) = size; - } - - return scratch_; - } - - char* d_ws; - char h_ws0[max_buf_size]; - char* h_ws = (char*) h_ws0; - - void* scratch_; - uint64_t scratch_size_; -}; - -} // namespace qsim - -#endif // SIMULATOR_CUDA_H_ diff --git a/tpls/qsim/simulator_cuda_kernels.h b/tpls/qsim/simulator_cuda_kernels.h deleted file mode 100644 index e21a9d6..0000000 --- a/tpls/qsim/simulator_cuda_kernels.h +++ /dev/null @@ -1,683 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_CUDA_KERNELS_H_ -#define SIMULATOR_CUDA_KERNELS_H_ - -#ifdef __NVCC__ - #include - #include - - #include "util_cuda.h" -#elif __HIP__ - #include - #include "cuda2hip.h" -#endif - -namespace qsim { - -template -__global__ void ApplyGateH_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, - const idx_type* __restrict__ mss, fp_type* __restrict__ rstate) { - // blockDim.x must be equal to 64. - - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned rows = - G < 4 ? gsize : (sizeof(fp_type) == 4 ? - (G < 6 ? gsize : 32) : (G < 5 ? 
8 : 16)); - - fp_type rs[gsize], is[gsize]; - - __shared__ idx_type xss[64]; - __shared__ fp_type v[2 * gsize * rows]; - - if (threadIdx.x < gsize) { - xss[threadIdx.x] = xss0[threadIdx.x]; - } - - if (G <= 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - __syncthreads(); - - idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j <= G; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - auto p0 = rstate + 2 * ii + threadIdx.x % 32; - - for (unsigned k = 0; k < gsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 32); - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0) { - __syncthreads(); - - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - - __syncthreads(); - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 32) = in; - } - } -} - -template -__global__ void ApplyGateL_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, - const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, - const unsigned* __restrict__ tis, unsigned esize, - fp_type* __restrict__ rstate) { - // blockDim.x must be equal to 32. - - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned - rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? - (G < 5 ? gsize : 8) : (G < 6 ? 
8 : 4)); - - fp_type rs[gsize], is[gsize]; - - __shared__ fp_type v[2 * gsize * rows]; - __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; - - if (G < 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - idx_type i = 32 * idx_type{blockIdx.x}; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j <= G; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - auto p0 = rstate + 2 * ii + threadIdx.x; - - for (unsigned k = 0; k < gsize; ++k) { - rs0[threadIdx.x][k] = *(p0 + xss[k]); - is0[threadIdx.x][k] = *(p0 + xss[k] + 32); - } - - for (unsigned k = 0; k < gsize; ++k) { - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs[k] = rs0[m][n]; - is[k] = is0[m][n]; - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0) { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs0[m][n] = rn; - is0[m][n] = in; - } - } - - for (unsigned k = 0; k < esize; ++k) { - *(p0 + xss[k]) = rs0[threadIdx.x][k]; - *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; - } -} - -template -__global__ void ApplyControlledGateH_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, - const idx_type* __restrict__ mss, unsigned num_mss, idx_type cvalsh, - fp_type* __restrict__ rstate) { - // blockDim.x must be equal to 64. 
- - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned rows = - G < 4 ? gsize : (sizeof(fp_type) == 4 ? - (G < 6 ? gsize : 32) : (G < 5 ? 8 : 16)); - - fp_type rs[gsize], is[gsize]; - - __shared__ idx_type xss[64]; - __shared__ fp_type v[2 * gsize * rows]; - - if (threadIdx.x < gsize) { - xss[threadIdx.x] = xss0[threadIdx.x]; - } - - if (G <= 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - __syncthreads(); - - idx_type i = (64 * idx_type{blockIdx.x} + threadIdx.x) & 0xffffffffffe0; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j < num_mss; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - ii |= cvalsh; - - auto p0 = rstate + 2 * ii + threadIdx.x % 32; - - for (unsigned k = 0; k < gsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 32); - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0) { - __syncthreads(); - - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - - __syncthreads(); - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 32) = in; - } - } -} - -template -__global__ void ApplyControlledGateLH_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, - const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, - const unsigned* __restrict__ tis, unsigned num_mss, idx_type cvalsh, - unsigned esize, fp_type* __restrict__ rstate) { - // blockDim.x must be equal to 32. 
- - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned - rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? - (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4)); - - fp_type rs[gsize], is[gsize]; - - __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; - __shared__ fp_type v[2 * gsize * rows]; - - idx_type i = 32 * idx_type{blockIdx.x}; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j < num_mss; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - ii |= cvalsh; - - auto p0 = rstate + 2 * ii + threadIdx.x; - - for (unsigned k = 0; k < gsize; ++k) { - rs0[threadIdx.x][k] = *(p0 + xss[k]); - is0[threadIdx.x][k] = *(p0 + xss[k] + 32); - } - - if (G < 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - for (unsigned k = 0; k < gsize; ++k) { - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs[k] = rs0[m][n]; - is[k] = is0[m][n]; - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0) { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs0[m][n] = rn; - is0[m][n] = in; - } - } - - for (unsigned k = 0; k < esize; ++k) { - *(p0 + xss[k]) = rs0[threadIdx.x][k]; - *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; - } -} - -template -__global__ void ApplyControlledGateL_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, - 
const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, - const unsigned* __restrict__ tis, const idx_type* __restrict__ cis, - unsigned num_mss, idx_type cvalsh, unsigned esize, unsigned rwthreads, - fp_type* __restrict__ rstate) { - // blockDim.x must be equal to 32. - - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned - rows = G < 4 ? gsize : (sizeof(fp_type) == 4 ? - (G < 5 ? gsize : 8) : (G < 6 ? 8 : 4)); - - fp_type rs[gsize], is[gsize]; - - __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; - __shared__ fp_type v[2 * gsize * rows]; - - idx_type i = 32 * idx_type{blockIdx.x}; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j < num_mss; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - ii |= cvalsh; - - auto p0 = rstate + 2 * ii + cis[threadIdx.x]; - - if (threadIdx.x < rwthreads) { - for (unsigned k = 0; k < gsize; ++k) { - rs0[threadIdx.x][k] = *(p0 + xss[k]); - is0[threadIdx.x][k] = *(p0 + xss[k] + 32); - } - } - - if (G < 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - for (unsigned k = 0; k < gsize; ++k) { - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs[k] = rs0[m][n]; - is[k] = is0[m][n]; - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0) { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - 
unsigned n = i / 32; - - rs0[m][n] = rn; - is0[m][n] = in; - } - } - - if (threadIdx.x < rwthreads) { - for (unsigned k = 0; k < esize; ++k) { - *(p0 + xss[k]) = rs0[threadIdx.x][k]; - *(p0 + xss[k] + 32) = is0[threadIdx.x][k]; - } - } -} - -template -__global__ void ExpectationValueH_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss0, - const idx_type* __restrict__ mss, unsigned num_iterations_per_block, - const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { - // blockDim.x must be equal to 64. - - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned rows = - G < 5 ? gsize : (sizeof(fp_type) == 4 ? (G < 6 ? 4 : 8) : 8); - - fp_type rs[gsize], is[gsize]; - - __shared__ idx_type xss[64]; - __shared__ fp_type v[2 * gsize * rows]; - - if (threadIdx.x < gsize) { - xss[threadIdx.x] = xss0[threadIdx.x]; - } - - if (G <= 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - __syncthreads(); - - double re = 0; - double im = 0; - - for (unsigned iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type b = num_iterations_per_block * idx_type{blockIdx.x} + iter; - - idx_type i = (64 * b + threadIdx.x) & 0xffffffffffe0; - idx_type ii = i & mss[0]; - for (unsigned j = 1; j <= G; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - auto p0 = rstate + 2 * ii + threadIdx.x % 32; - - for (unsigned k = 0; k < gsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 32); - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0 || iter > 0) { - __syncthreads(); - - for (unsigned m = 0; m < 2 * gsize * rows; m += 64) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - - __syncthreads(); - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); 
++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - re += rs[k] * rn; - re += is[k] * in; - im += rs[k] * in; - im -= is[k] * rn; - } - } - } - - __shared__ cfp_type partial1[64]; - __shared__ cfp_type partial2[2]; - - partial1[threadIdx.x].re = re; - partial1[threadIdx.x].im = im; - - auto val = WarpReduce(partial1[threadIdx.x], op); - - if (threadIdx.x % 32 == 0) { - partial2[threadIdx.x / 32] = val; - } - - __syncthreads(); - - if (threadIdx.x == 0) { - result[blockIdx.x].re = partial2[0].re + partial2[1].re; - result[blockIdx.x].im = partial2[0].im + partial2[1].im; - } -} - -template -__global__ void ExpectationValueL_Kernel( - const fp_type* __restrict__ v0, const idx_type* __restrict__ xss, - const idx_type* __restrict__ mss, const unsigned* __restrict__ qis, - const unsigned* __restrict__ tis, unsigned num_iterations_per_block, - const fp_type* __restrict__ rstate, Op op, cfp_type* __restrict__ result) { - // blockDim.x must be equal to 32. - - static_assert(G < 7, "gates acting on more than 6 qubits are not supported."); - - constexpr unsigned gsize = 1 << G; - constexpr unsigned rows = G < 5 ? gsize : (sizeof(fp_type) == 4 ? - (G < 6 ? 4 : 2) : (G < 6 ? 
2 : 1)); - - fp_type rs[gsize], is[gsize]; - - __shared__ fp_type rs0[32][gsize + 1], is0[32][gsize + 1]; - __shared__ fp_type v[2 * gsize * rows]; - - if (G < 2) { - if (threadIdx.x < 2 * gsize * gsize) { - v[threadIdx.x] = v0[threadIdx.x]; - } - } else { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + threadIdx.x]; - } - } - - double re = 0; - double im = 0; - - for (idx_type iter = 0; iter < num_iterations_per_block; ++iter) { - idx_type i = 32 * (num_iterations_per_block * idx_type{blockIdx.x} + iter); - idx_type ii = i & mss[0]; - for (unsigned j = 1; j <= G; ++j) { - i *= 2; - ii |= i & mss[j]; - } - - auto p0 = rstate + 2 * ii + threadIdx.x; - - for (unsigned k = 0; k < gsize; ++k) { - rs0[threadIdx.x][k] = *(p0 + xss[k]); - is0[threadIdx.x][k] = *(p0 + xss[k] + 32); - } - - for (unsigned k = 0; k < gsize; ++k) { - unsigned i = tis[threadIdx.x] | qis[k]; - unsigned m = i & 0x1f; - unsigned n = i / 32; - - rs[k] = rs0[m][n]; - is[k] = is0[m][n]; - } - - for (unsigned s = 0; s < gsize / rows; ++s) { - if (s > 0 || iter > 0) { - for (unsigned m = 0; m < 2 * gsize * rows; m += 32) { - v[m + threadIdx.x] = v0[m + 2 * gsize * rows * s + threadIdx.x]; - } - } - - unsigned j = 0; - - for (unsigned k = rows * s; k < rows * (s + 1); ++k) { - fp_type rn = 0; - fp_type in = 0; - - for (unsigned l = 0; l < gsize; ++l) { - fp_type rm = v[j++]; - fp_type im = v[j++]; - rn += rs[l] * rm; - rn -= is[l] * im; - in += rs[l] * im; - in += is[l] * rm; - } - - re += rs[k] * rn; - re += is[k] * in; - im += rs[k] * in; - im -= is[k] * rn; - } - } - } - - __shared__ cfp_type partial[32]; - - partial[threadIdx.x].re = re; - partial[threadIdx.x].im = im; - - auto val = WarpReduce(partial[threadIdx.x], op); - - if (threadIdx.x == 0) { - result[blockIdx.x].re = val.re; - result[blockIdx.x].im = val.im; - } -} - -} // namespace qsim - -#endif // SIMULATOR_CUDA_KERNELS_H_ diff --git a/tpls/qsim/simulator_custatevec.h b/tpls/qsim/simulator_custatevec.h 
deleted file mode 100644 index 40d1902..0000000 --- a/tpls/qsim/simulator_custatevec.h +++ /dev/null @@ -1,209 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_CUSTATEVEC_H_ -#define SIMULATOR_CUSTATEVEC_H_ - -#include -#include -#include - -#include -#include -#include - -#include "io.h" -#include "statespace_custatevec.h" -#include "util_custatevec.h" - -namespace qsim { - -/** - * Quantum circuit simulator using the NVIDIA cuStateVec library. - */ -template -class SimulatorCuStateVec final { - public: - using StateSpace = StateSpaceCuStateVec; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - static constexpr auto kStateType = StateSpace::kStateType; - static constexpr auto kMatrixType = StateSpace::kMatrixType; - static constexpr auto kExpectType = StateSpace::kExpectType; - static constexpr auto kComputeType = StateSpace::kComputeType; - static constexpr auto kMatrixLayout = StateSpace::kMatrixLayout; - - explicit SimulatorCuStateVec(const cublasHandle_t& cublas_handle, - const custatevecHandle_t& custatevec_handle) - : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle), - workspace_(nullptr), workspace_size_(0) {} - - ~SimulatorCuStateVec() { - ErrorCheck(cudaFree(workspace_)); - } - - /** - * Applies a gate using the NVIDIA cuStateVec library. - * @param qs Indices of the qubits affected by this gate. 
- * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - if (qs.size() == 0) { - uint64_t size = uint64_t{1} << state.num_qubits(); - - if (StateSpace::is_float) { - cuComplex a = {matrix[0], matrix[1]}; - auto p = (cuComplex*) state.get(); - ErrorCheck(cublasCscal(cublas_handle_, size, &a, p, 1)); - } else { - cuDoubleComplex a = {matrix[0], matrix[1]}; - auto p = (cuDoubleComplex*) state.get(); - ErrorCheck(cublasZscal(cublas_handle_, size, &a, p, 1)); - } - } else { - auto workspace_size = ApplyGateWorkSpaceSize( - state.num_qubits(), qs.size(), 0, matrix); - AllocWorkSpace(workspace_size); - - ErrorCheck(custatevecApplyMatrix( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0, - (int32_t*) qs.data(), qs.size(), nullptr, nullptr, 0, - kComputeType, workspace_, workspace_size)); - } - } - - /** - * Applies a controlled gate using the NVIDIA cuStateVec library. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cmask Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. 
- */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cmask, - const fp_type* matrix, State& state) const { - if (qs.size() == 0) { - IO::errorf( - "error: controlled global phase gate is not implemented %s %d\n", - __FILE__, __LINE__); - exit(1); - } else { - std::vector control_bits; - control_bits.reserve(cqs.size()); - - for (std::size_t i = 0; i < cqs.size(); ++i) { - control_bits.push_back((cmask >> i) & 1); - } - - auto workspace_size = ApplyGateWorkSpaceSize( - state.num_qubits(), qs.size(), cqs.size(), matrix); - AllocWorkSpace(workspace_size); - - ErrorCheck(custatevecApplyMatrix( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), matrix, kMatrixType, kMatrixLayout, 0, - (int32_t*) qs.data(), qs.size(), - (int32_t*) cqs.data(), control_bits.data(), cqs.size(), - kComputeType, workspace_, workspace_size)); - } - } - - /** - * Computes the expectation value of an operator using the NVIDIA cuStateVec - * library. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto workspace_size = ExpectationValueWorkSpaceSize( - state.num_qubits(), qs.size(), matrix); - AllocWorkSpace(workspace_size); - - cuDoubleComplex eval; - - ErrorCheck(custatevecComputeExpectation( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), &eval, kExpectType, nullptr, matrix, - kMatrixType, kMatrixLayout, (int32_t*) qs.data(), qs.size(), - kComputeType, workspace_, workspace_size)); - - return {cuCreal(eval), cuCimag(eval)}; - } - - /** - * @return The size of SIMD register if applicable. 
- */ - static unsigned SIMDRegisterSize() { - return 32; - } - - private: - size_t ApplyGateWorkSpaceSize( - unsigned num_qubits, unsigned num_targets, unsigned num_controls, - const fp_type* matrix) const { - size_t size; - - ErrorCheck(custatevecApplyMatrixGetWorkspaceSize( - custatevec_handle_, kStateType, num_qubits, matrix, - kMatrixType, kMatrixLayout, 0, num_targets, num_controls, - kComputeType, &size)); - - return size; - } - - size_t ExpectationValueWorkSpaceSize( - unsigned num_qubits, unsigned num_targets, const fp_type* matrix) const { - size_t size; - - ErrorCheck(custatevecComputeExpectationGetWorkspaceSize( - custatevec_handle_, kStateType, num_qubits, matrix, - kMatrixType, kMatrixLayout, num_targets, kComputeType, - &size)); - - return size; - } - - void* AllocWorkSpace(size_t size) const { - if (size > workspace_size_) { - if (workspace_ != nullptr) { - ErrorCheck(cudaFree(workspace_)); - } - - ErrorCheck(cudaMalloc(const_cast(&workspace_), size)); - - const_cast(workspace_size_) = size; - } - - return workspace_; - } - - const cublasHandle_t cublas_handle_; - const custatevecHandle_t custatevec_handle_; - - void* workspace_; - size_t workspace_size_; -}; - -} // namespace qsim - -#endif // SIMULATOR_CUSTATEVEC_H_ diff --git a/tpls/qsim/simulator_sse.h b/tpls/qsim/simulator_sse.h deleted file mode 100644 index 5256c53..0000000 --- a/tpls/qsim/simulator_sse.h +++ /dev/null @@ -1,864 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SIMULATOR_SSE_H_ -#define SIMULATOR_SSE_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "statespace_sse.h" - -namespace qsim { - -/** - * Quantum circuit simulator with SSE vectorization. - */ -template -class SimulatorSSE final : public SimulatorBase { - public: - using StateSpace = StateSpaceSSE; - using State = typename StateSpace::State; - using fp_type = typename StateSpace::fp_type; - - template - explicit SimulatorSSE(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using SSE instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 0: - ApplyGateH<0>(qs, matrix, state); - break; - case 1: - if (qs[0] > 1) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 1) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 1) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<2, 1>(qs, matrix, state); - } else { - ApplyGateL<1, 2>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 1) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<3, 1>(qs, matrix, state); - } else { - ApplyGateL<2, 2>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 1) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<4, 1>(qs, matrix, state); - } else { - ApplyGateL<3, 2>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 1) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<5, 1>(qs, matrix, state); - } else { - ApplyGateL<4, 2>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using SSE instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - // Assume cqs[0] < cqs[1] < cqs[2] < ... . 
- - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 0: - if (cqs[0] > 1) { - ApplyControlledGateHH<0>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<0>(qs, cqs, cvals, matrix, state); - } - break; - case 1: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 4: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - 
ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Computes the expectation value of an operator using SSE instructions. - * @param qs Indices of the qubits the operator acts on. - * @param matrix The operator matrix. - * @param state The state of the system. - * @return The computed expectation value. - */ - std::complex ExpectationValue(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 1: - if (qs[0] > 1) { - return ExpectationValueH<1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 1) { - return ExpectationValueH<2>(qs, matrix, state); - } else if (qs[1] > 1) { - return ExpectationValueL<1, 1>(qs, matrix, state); - } else { - return ExpectationValueL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 1) { - return ExpectationValueH<3>(qs, matrix, state); - } else if (qs[1] > 1) { - return ExpectationValueL<2, 1>(qs, matrix, state); - } else { - return ExpectationValueL<1, 2>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 1) { - return ExpectationValueH<4>(qs, matrix, state); - } else if (qs[1] > 1) { - return ExpectationValueL<3, 1>(qs, matrix, state); - } else { - return ExpectationValueL<2, 2>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 1) { - return ExpectationValueH<5>(qs, matrix, state); - } else if (qs[1] > 1) { - return ExpectationValueL<4, 1>(qs, matrix, state); - } else { - return ExpectationValueL<3, 2>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 1) { - return ExpectationValueH<6>(qs, matrix, state); 
- } else if (qs[1] > 1) { - return ExpectationValueL<5, 1>(qs, matrix, state); - } else { - return ExpectationValueL<4, 2>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - - return 0; - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 4; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 ru, iu, rn, in; - __m128 rs[hsize], is[hsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, - unsigned q0, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m128 rn, in; - __m128 rs[gsize], is[gsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm_load_ps(p0 + xss[k]); - is[k2] = _mm_load_ps(p0 + xss[k] + 4); - - if (L == 1) { - rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) - : _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(is[k2], is[k2], 177) - : _mm_shuffle_ps(is[k2], is[k2], 78); - } else if (L == 2) { - rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); - is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); - rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); - rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); - is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - 
uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, qs[0], state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 ru, iu, rn, in; - __m128 rs[hsize], is[hsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, matrix, ms, xss, m.cvalsh, m.cmaskh, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 rn, in; - __m128 rs[hsize], is[hsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H)]; - - auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned r = 2 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, unsigned q0, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m128 rn, in; - __m128 rs[gsize], is[gsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - if ((ii & cmaskh) != cvalsh) return; - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm_load_ps(p0 + xss[k]); - is[k2] = _mm_load_ps(p0 + xss[k] + 4); - - if (L == 1) { - rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) - : _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) - : _mm_shuffle_ps(is[k2], is[k2], 78); - } else if (L == 2) { - rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); - is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); - rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); - rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); - is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H + L)]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned r = 2 + H; - unsigned n = state.num_qubits() > r ? 
state.num_qubits() - r : 0; - uint64_t size = uint64_t{1} << n; - - if (CH) { - auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get()); - } else { - auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size, f, w, ms, xss, m.cvalsh, m.cmaskh, qs[0], state.get()); - } - } - - template - std::complex ExpectationValueH(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, - const fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 ru, iu, rn, in; - __m128 rs[hsize], is[hsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); - - j += 2; - } - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[k], rn), _mm_mul_ps(is[k], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[k], in), _mm_mul_ps(is[k], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - 
}; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), matrix, ms, xss, state.get()); - } - - template - std::complex ExpectationValueL(const std::vector& qs, - const fp_type* matrix, - const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, unsigned q0, - const fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m128 rn, in; - __m128 rs[gsize], is[gsize]; - - i *= 4; - - uint64_t ii = i & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - i *= 2; - ii |= i & ms[j]; - } - - auto p0 = rstate + 2 * ii; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm_load_ps(p0 + xss[k]); - is[k2] = _mm_load_ps(p0 + xss[k] + 4); - - if (L == 1) { - rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) - : _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) - : _mm_shuffle_ps(is[k2], is[k2], 78); - } else if (L == 2) { - rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); - is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); - rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); - rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); - is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); - } - } - - double re = 0; - double im = 0; - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - unsigned m = lsize * k; - - __m128 v_re = _mm_add_ps(_mm_mul_ps(rs[m], rn), _mm_mul_ps(is[m], in)); - __m128 v_im = _mm_sub_ps(_mm_mul_ps(rs[m], in), _mm_mul_ps(is[m], rn)); - - re += detail::HorizontalSumSSE(v_re); - im += detail::HorizontalSumSSE(v_im); - } - - return std::complex{re, im}; - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - using Op = std::plus>; - return for_.RunReduce(size, f, Op(), w, ms, xss, qs[0], state.get()); - } - - For for_; -}; - -} // namespace qsim - -#endif // SIMULATOR_SSE_H_ diff --git a/tpls/qsim/statespace.h b/tpls/qsim/statespace.h deleted file mode 100644 index 2b0c9af..0000000 --- a/tpls/qsim/statespace.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_H_ -#define STATESPACE_H_ - -#include -#include -#include -#include - -#include "util.h" - -namespace qsim { - -/** - * Abstract class containing context and routines for general state-vector - * manipulations. "AVX", "AVX512", "Basic", and "SSE" implementations are - * provided. - */ -template class VectorSpace, typename... VSTypeParams> -class StateSpace : public VectorSpace { - private: - using Base = VectorSpace; - - public: - using fp_type = typename Base::fp_type; - using State = typename Base::Vector; - - /** - * The observed state from a Measurement gate. - */ - struct MeasurementResult { - /** - * A bitmask of all qubits measured in this result. In this format, if the - * qubit at index `i` is measured, the `i`th bit of `mask` is a one. - */ - uint64_t mask; - /** - * A bitwise representation of the measured states. In this format, the - * qubit at index `i` is represented by the `i`th bit of `bits`. - * If `valid` is true, `mask` has already been applied to this field - * (i.e. `bits == bits & mask`). - */ - uint64_t bits; - /** - * Observed states of the measured qubits. This vector only includes qubits - * specified by the associated Measurement gate. - */ - std::vector bitstring; - /** - * Validation bit. If this is false, the measurement failed and all other - * fields of the result are invalid. - */ - bool valid; - }; - - template - StateSpace(Args&&... 
args) : Base(args...) {} - - double Norm(const State& state) const { - auto partial_norms = static_cast(*this).PartialNorms(state); - - double norm = partial_norms[0]; - for (std::size_t i = 1; i < partial_norms.size(); ++i) { - norm += partial_norms[i]; - } - - return norm; - } - - template - MeasurementResult Measure(const std::vector& qubits, - RGen& rgen, State& state) const { - auto result = - static_cast(*this).VirtualMeasure(qubits, rgen, state); - - if (result.valid) { - static_cast(*this).Collapse(result, state); - } - - return result; - } - - template - MeasurementResult VirtualMeasure(const std::vector& qubits, - RGen& rgen, const State& state) const { - MeasurementResult result; - - result.valid = true; - result.mask = 0; - - for (auto q : qubits) { - if (q >= state.num_qubits()) { - result.valid = false; - return result; - } - - result.mask |= uint64_t{1} << q; - } - - auto partial_norms = static_cast(*this).PartialNorms(state); - - for (std::size_t i = 1; i < partial_norms.size(); ++i) { - partial_norms[i] += partial_norms[i - 1]; - } - - auto norm = partial_norms.back(); - auto r = RandomValue(rgen, norm); - - unsigned m = 0; - while (r > partial_norms[m]) ++m; - if (m > 0) { - r -= partial_norms[m - 1]; - } - - result.bits = static_cast(*this).FindMeasuredBits( - m, r, result.mask, state); - - result.bitstring.reserve(qubits.size()); - result.bitstring.resize(0); - - for (auto q : qubits) { - result.bitstring.push_back((result.bits >> q) & 1); - } - - return result; - } -}; - -} // namespace qsim - -#endif // STATESPACE_H_ diff --git a/tpls/qsim/statespace_avx.h b/tpls/qsim/statespace_avx.h deleted file mode 100644 index 876058b..0000000 --- a/tpls/qsim/statespace_avx.h +++ /dev/null @@ -1,497 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_AVX_H_ -#define STATESPACE_AVX_H_ - -#include - -#include -#include -#include -#include -#include - -#include "statespace.h" -#include "util.h" -#include "vectorspace.h" - -namespace qsim { - -namespace detail { - -inline __m256i GetZeroMaskAVX(uint64_t i, uint64_t mask, uint64_t bits) { - __m256i s1 = _mm256_setr_epi64x(i + 0, i + 2, i + 4, i + 6); - __m256i s2 = _mm256_setr_epi64x(i + 1, i + 3, i + 5, i + 7); - __m256i ma = _mm256_set1_epi64x(mask); - __m256i bi = _mm256_set1_epi64x(bits); - - s1 = _mm256_and_si256(s1, ma); - s2 = _mm256_and_si256(s2, ma); - - s1 = _mm256_cmpeq_epi64(s1, bi); - s2 = _mm256_cmpeq_epi64(s2, bi); - - return _mm256_blend_epi32(s1, s2, 170); // 10101010 -} - -inline double HorizontalSumAVX(__m256 s) { - __m128 l = _mm256_castps256_ps128(s); - __m128 h = _mm256_extractf128_ps(s, 1); - __m128 s1 = _mm_add_ps(h, l); - __m128 s1s = _mm_movehdup_ps(s1); - __m128 s2 = _mm_add_ps(s1, s1s); - - return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2))); -} - -} // namespace detail - -/** - * Object containing context and routines for AVX state-vector manipulations. - * State is a vectorized sequence of eight real components followed by eight - * imaginary components. Eight single-precison floating numbers can be loaded - * into an AVX register. 
- */ -template -class StateSpaceAVX : - public StateSpace, VectorSpace, For, float> { - private: - using Base = StateSpace, qsim::VectorSpace, For, float>; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - template - explicit StateSpaceAVX(ForArgs&&... args) : Base(args...) {} - - static uint64_t MinSize(unsigned num_qubits) { - return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits)); - }; - - void InternalToNormalOrder(State& state) const { - if (state.num_qubits() == 1) { - fp_type* s = state.get(); - - s[2] = s[1]; - s[1] = s[8]; - s[3] = s[9]; - - for (uint64_t i = 4; i < 16; ++i) { - s[i] = 0; - } - } else if (state.num_qubits() == 2) { - fp_type* s = state.get(); - - s[6] = s[3]; - s[4] = s[2]; - s[2] = s[1]; - s[1] = s[8]; - s[3] = s[9]; - s[5] = s[10]; - s[7] = s[11]; - - for (uint64_t i = 8; i < 16; ++i) { - s[i] = 0; - } - } else { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - fp_type* s = p + 16 * i; - - fp_type re[7]; - fp_type im[7]; - - for (uint64_t i = 0; i < 7; ++i) { - re[i] = s[i + 1]; - im[i] = s[i + 8]; - } - - for (uint64_t i = 0; i < 7; ++i) { - s[2 * i + 1] = im[i]; - s[2 * i + 2] = re[i]; - } - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get()); - } - } - - void NormalToInternalOrder(State& state) const { - if (state.num_qubits() == 1) { - fp_type* s = state.get(); - - s[8] = s[1]; - s[1] = s[2]; - s[9] = s[3]; - - for (uint64_t i = 2; i < 8; ++i) { - s[i] = 0; - s[i + 8] = 0; - } - } else if (state.num_qubits() == 2) { - fp_type* s = state.get(); - - s[8] = s[1]; - s[9] = s[3]; - s[10] = s[5]; - s[11] = s[7]; - s[1] = s[2]; - s[2] = s[4]; - s[3] = s[6]; - - for (uint64_t i = 4; i < 8; ++i) { - s[i] = 0; - s[i + 8] = 0; - } - } else { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - fp_type* s = p + 16 * i; - - fp_type re[7]; - fp_type im[7]; - - for (uint64_t i = 0; i < 7; ++i) { - im[i] = s[2 * i + 1]; - re[i] = s[2 * i + 2]; - } 
- - for (uint64_t i = 0; i < 7; ++i) { - s[i + 1] = re[i]; - s[i + 8] = im[i]; - } - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, state.get()); - } - } - - void SetAllZeros(State& state) const { - __m256 val0 = _mm256_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) { - _mm256_store_ps(p + 16 * i, val); - _mm256_store_ps(p + 16 * i + 8, val); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get()); - } - - // Uniform superposition. - void SetStateUniform(State& state) const { - __m256 val0 = _mm256_setzero_ps(); - __m256 valu; - - fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); - - switch (state.num_qubits()) { - case 1: - valu = _mm256_set_ps(0, 0, 0, 0, 0, 0, v, v); - break; - case 2: - valu = _mm256_set_ps(0, 0, 0, 0, v, v, v, v); - break; - default: - valu = _mm256_set1_ps(v); - break; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - __m256& val0, __m256 valu, fp_type* p) { - _mm256_store_ps(p + 16 * i, valu); - _mm256_store_ps(p + 16 * i + 8, val0); - }; - - Base::for_.Run( - MinSize(state.num_qubits()) / 16, f, val0, valu, state.get()); - } - - // |0> state. - void SetStateZero(State& state) const { - SetAllZeros(state); - state.get()[0] = 1; - } - - static std::complex GetAmpl(const State& state, uint64_t i) { - uint64_t k = (16 * (i / 8)) + (i % 8); - return std::complex(state.get()[k], state.get()[k + 8]); - } - - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - uint64_t k = (16 * (i / 8)) + (i % 8); - state.get()[k] = std::real(ampl); - state.get()[k + 8] = std::imag(ampl); - } - - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - uint64_t k = (16 * (i / 8)) + (i % 8); - state.get()[k] = re; - state.get()[k + 8] = im; - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. 
- void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - __m256 re_reg = _mm256_set1_ps(re); - __m256 im_reg = _mm256_set1_ps(im); - - __m256i exclude_reg = _mm256_setzero_si256(); - if (exclude) { - exclude_reg = _mm256_cmpeq_epi32(exclude_reg, exclude_reg); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, - uint64_t bitsv, __m256 re_n, __m256 im_n, __m256i exclude_n, - fp_type* p) { - __m256 ml = _mm256_castsi256_ps(_mm256_xor_si256( - detail::GetZeroMaskAVX(8 * i, maskv, bitsv), exclude_n)); - - __m256 re = _mm256_load_ps(p + 16 * i); - __m256 im = _mm256_load_ps(p + 16 * i + 8); - - re = _mm256_blendv_ps(re, re_n, ml); - im = _mm256_blendv_ps(im, im_n, ml); - - _mm256_store_ps(p + 16 * i, re); - _mm256_store_ps(p + 16 * i + 8, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, mask, bits, re_reg, - im_reg, exclude_reg, state.get()); - } - - // Does the equivalent of dest += src elementwise. 
- bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, fp_type* p2) { - __m256 re1 = _mm256_load_ps(p1 + 16 * i); - __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); - __m256 re2 = _mm256_load_ps(p2 + 16 * i); - __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); - - _mm256_store_ps(p2 + 16 * i, _mm256_add_ps(re1, re2)); - _mm256_store_ps(p2 + 16 * i + 8, _mm256_add_ps(im1, im2)); - }; - - Base::for_.Run(MinSize(src.num_qubits()) / 16, f, src.get(), dest.get()); - - return true; - } - - // Does the equivalent of state *= a elementwise. - void Multiply(fp_type a, State& state) const { - __m256 r = _mm256_set1_ps(a); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m256 r, fp_type* p) { - __m256 re = _mm256_load_ps(p + 16 * i); - __m256 im = _mm256_load_ps(p + 16 * i + 8); - - re = _mm256_mul_ps(re, r); - im = _mm256_mul_ps(im, r); - - _mm256_store_ps(p + 16 * i, re); - _mm256_store_ps(p + 16 * i + 8, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, r, state.get()); - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> std::complex { - __m256 re1 = _mm256_load_ps(p1 + 16 * i); - __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); - __m256 re2 = _mm256_load_ps(p2 + 16 * i); - __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); - - __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2)); - __m256 ip_im = _mm256_fnmadd_ps(im1, re2, _mm256_mul_ps(re1, im2)); - - double re = detail::HorizontalSumAVX(ip_re); - double im = detail::HorizontalSumAVX(ip_im); - - return std::complex{re, im}; - }; - - using Op = std::plus>; - return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f, - Op(), state1.get(), state2.get()); - } - 
- double RealInnerProduct(const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> double { - __m256 re1 = _mm256_load_ps(p1 + 16 * i); - __m256 im1 = _mm256_load_ps(p1 + 16 * i + 8); - __m256 re2 = _mm256_load_ps(p2 + 16 * i); - __m256 im2 = _mm256_load_ps(p2 + 16 * i + 8); - - __m256 ip_re = _mm256_fmadd_ps(im1, im2, _mm256_mul_ps(re1, re2)); - - return detail::HorizontalSumAVX(ip_re); - }; - - using Op = std::plus; - return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 16, f, - Op(), state1.get(), state2.get()); - } - - template - std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - double norm = 0; - uint64_t size = MinSize(state.num_qubits()) / 16; - const fp_type* p = state.get(); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 8; ++j) { - double re = p[16 * k + j]; - double im = p[16 * k + 8 + j]; - norm += re * re + im * im; - } - } - - auto rs = GenerateRandomValues(num_samples, seed, norm); - - uint64_t m = 0; - double csum = 0; - bitstrings.reserve(num_samples); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 8; ++j) { - double re = p[16 * k + j]; - double im = p[16 * k + 8 + j]; - csum += re * re + im * im; - while (rs[m] < csum && m < num_samples) { - bitstrings.emplace_back(8 * k + j); - ++m; - } - } - } - - for (; m < num_samples; ++m) { - bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); - } - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - void Collapse(const MeasurementResult& mr, State& state) const { - auto f1 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, const fp_type* p) -> double { - __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits); - - __m256 re = 
_mm256_maskload_ps(p + 16 * i, ml); - __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml); - __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re)); - - return detail::HorizontalSumAVX(s1); - }; - - using Op = std::plus; - double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 16, f1, - Op(), mr.mask, mr.bits, state.get()); - - __m256 renorm = _mm256_set1_ps(1.0 / std::sqrt(norm)); - - auto f2 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, __m256 renorm, fp_type* p) { - __m256i ml = detail::GetZeroMaskAVX(8 * i, mask, bits); - - __m256 re = _mm256_maskload_ps(p + 16 * i, ml); - __m256 im = _mm256_maskload_ps(p + 16 * i + 8, ml); - - re = _mm256_mul_ps(re, renorm); - im = _mm256_mul_ps(im, renorm); - - _mm256_store_ps(p + 16 * i, re); - _mm256_store_ps(p + 16 * i + 8, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f2, - mr.mask, mr.bits, renorm, state.get()); - } - - std::vector PartialNorms(const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p) -> double { - __m256 re = _mm256_load_ps(p + 16 * i); - __m256 im = _mm256_load_ps(p + 16 * i + 8); - __m256 s1 = _mm256_fmadd_ps(im, im, _mm256_mul_ps(re, re)); - - return detail::HorizontalSumAVX(s1); - }; - - using Op = std::plus; - return Base::for_.RunReduceP( - MinSize(state.num_qubits()) / 16, f, Op(), state.get()); - } - - uint64_t FindMeasuredBits( - unsigned m, double r, uint64_t mask, const State& state) const { - double csum = 0; - - uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 16, m); - uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 16, m); - - const fp_type* p = state.get(); - - for (uint64_t k = k0; k < k1; ++k) { - for (uint64_t j = 0; j < 8; ++j) { - auto re = p[16 * k + j]; - auto im = p[16 * k + j + 8]; - csum += re * re + im * im; - if (r < csum) { - return (8 * k + j) & mask; - } - } - } - - // Return the last bitstring in the unlikely case of underflow. 
- return (8 * k1 - 1) & mask; - } -}; - -} // namespace qsim - -#endif // STATESPACE_AVX_H_ diff --git a/tpls/qsim/statespace_avx512.h b/tpls/qsim/statespace_avx512.h deleted file mode 100644 index 879fd89..0000000 --- a/tpls/qsim/statespace_avx512.h +++ /dev/null @@ -1,448 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_AVX512_H_ -#define STATESPACE_AVX512_H_ - -#include - -#include -#include -#include -#include -#include - -#include "statespace.h" -#include "util.h" -#include "vectorspace.h" - -namespace qsim { - -namespace detail { - -inline unsigned GetZeroMaskAVX512(uint64_t i, uint64_t mask, uint64_t bits) { - __m512i s1 = _mm512_setr_epi64( - i + 0, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7); - __m512i s2 = _mm512_setr_epi64( - i + 8, i + 9, i + 10, i + 11, i + 12, i + 13, i + 14, i + 15); - __m512i ma = _mm512_set1_epi64(mask); - __m512i bi = _mm512_set1_epi64(bits); - - s1 = _mm512_and_si512(s1, ma); - s2 = _mm512_and_si512(s2, ma); - - unsigned m1 = _mm512_cmpeq_epu64_mask(s1, bi); - unsigned m2 = _mm512_cmpeq_epu64_mask(s2, bi); - - return (m2 << 8) | m1; -} - -inline double HorizontalSumAVX(__m256 s) { - __m128 l = _mm256_castps256_ps128(s); - __m128 h = _mm256_extractf128_ps(s, 1); - __m128 s1 = _mm_add_ps(h, l); - __m128 s1s = _mm_movehdup_ps(s1); - __m128 s2 = _mm_add_ps(s1, s1s); - - return _mm_cvtss_f32(_mm_add_ss(s2, _mm_movehl_ps(s1s, s2))); -} - 
-inline double HorizontalSumAVX512(__m512 s) { - __m256 l = _mm512_castps512_ps256(s); - __m512d sd = _mm512_castps_pd(s); - __m256d hd = _mm512_extractf64x4_pd(sd, 1); - __m256 h = _mm256_castpd_ps(hd); - __m256 p = _mm256_add_ps(h, l); - - return HorizontalSumAVX(p); -} - -} // namespace detail - -/** - * Object containing context and routines for AVX state-vector manipulations. - * State is a vectorized sequence of sixteen real components followed by - * sixteen imaginary components. Sixteen single-precison floating numbers can - * be loaded into an AVX512 register. - */ -template -class StateSpaceAVX512 : - public StateSpace, VectorSpace, For, float> { - private: - using Base = StateSpace, qsim::VectorSpace, For, float>; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - template - explicit StateSpaceAVX512(ForArgs&&... args) : Base(args...) {} - - static uint64_t MinSize(unsigned num_qubits) { - return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits)); - }; - - void InternalToNormalOrder(State& state) const { - __m512i idx1 = _mm512_setr_epi32( - 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - __m512i idx2 = _mm512_setr_epi32( - 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - - auto f = [](unsigned n, unsigned m, uint64_t i, - __m512i idx1, __m512i idx2, fp_type* p) { - __m512 v1 = _mm512_load_ps(p + 32 * i); - __m512 v2 = _mm512_load_ps(p + 32 * i + 16); - - _mm512_store_ps(p + 32 * i, _mm512_permutex2var_ps(v1, idx1, v2)); - _mm512_store_ps(p + 32 * i + 16, _mm512_permutex2var_ps(v1, idx2, v2)); - }; - - Base::for_.Run( - MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get()); - } - - void NormalToInternalOrder(State& state) const { - __m512i idx1 = _mm512_setr_epi32( - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); - __m512i idx2 = _mm512_setr_epi32( - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31); - - auto f = [](unsigned n, unsigned m, uint64_t 
i, - __m512i idx1, __m512i idx2, fp_type* p) { - __m512 re = _mm512_load_ps(p + 32 * i); - __m512 im = _mm512_load_ps(p + 32 * i + 16); - - _mm512_store_ps(p + 32 * i, _mm512_permutex2var_ps(re, idx1, im)); - _mm512_store_ps(p + 32 * i + 16, _mm512_permutex2var_ps(re, idx2, im)); - }; - - Base::for_.Run( - MinSize(state.num_qubits()) / 32, f, idx1, idx2, state.get()); - } - - void SetAllZeros(State& state) const { - __m512 val0 = _mm512_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) { - _mm512_store_ps(p + 32 * i, val0); - _mm512_store_ps(p + 32 * i + 16, val0); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get()); - } - - // Uniform superposition. - void SetStateUniform(State& state) const { - __m512 val0 = _mm512_setzero_ps(); - __m512 valu; - - fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); - - switch (state.num_qubits()) { - case 1: - valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v); - break; - case 2: - valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v); - break; - case 3: - valu = _mm512_set_ps(0, 0, 0, 0, 0, 0, 0, 0, v, v, v, v, v, v, v, v); - break; - default: - valu = _mm512_set1_ps(v); - break; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const __m512& val0, const __m512& valu, fp_type* p) { - _mm512_store_ps(p + 32 * i, valu); - _mm512_store_ps(p + 32 * i + 16, val0); - }; - - Base::for_.Run( - MinSize(state.num_qubits()) / 32, f, val0, valu, state.get()); - } - - // |0> state. 
- void SetStateZero(State& state) const { - SetAllZeros(state); - state.get()[0] = 1; - } - - static std::complex GetAmpl(const State& state, uint64_t i) { - uint64_t p = (32 * (i / 16)) + (i % 16); - return std::complex(state.get()[p], state.get()[p + 16]); - } - - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - uint64_t p = (32 * (i / 16)) + (i % 16); - state.get()[p] = std::real(ampl); - state.get()[p + 16] = std::imag(ampl); - } - - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - uint64_t p = (32 * (i / 16)) + (i % 16); - state.get()[p] = re; - state.get()[p + 16] = im; - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - __m512 re_reg = _mm512_set1_ps(re); - __m512 im_reg = _mm512_set1_ps(im); - - __mmask16 exclude_n = exclude ? 
0xffff : 0; - - auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, - uint64_t bitsv, __m512 re_n, __m512 im_n, __mmask16 exclude_n, - fp_type* p) { - __m512 re = _mm512_load_ps(p + 32 * i); - __m512 im = _mm512_load_ps(p + 32 * i + 16); - - __mmask16 ml = - detail::GetZeroMaskAVX512(16 * i, maskv, bitsv) ^ exclude_n; - - re = _mm512_mask_blend_ps(ml, re, re_n); - im = _mm512_mask_blend_ps(ml, im, im_n); - - _mm512_store_ps(p + 32 * i, re); - _mm512_store_ps(p + 32 * i + 16, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 32, f, mask, bits, - re_reg, im_reg, exclude_n, state.get()); - } - - // Does the equivalent of dest += src elementwise. - bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, fp_type* p2) { - __m512 re1 = _mm512_load_ps(p1 + 32 * i); - __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); - __m512 re2 = _mm512_load_ps(p2 + 32 * i); - __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); - - _mm512_store_ps(p2 + 32 * i, _mm512_add_ps(re1, re2)); - _mm512_store_ps(p2 + 32 * i + 16, _mm512_add_ps(im1, im2)); - }; - - Base::for_.Run(MinSize(src.num_qubits()) / 32, f, src.get(), dest.get()); - - return true; - } - - // Does the equivalent of state *= a elementwise. 
- void Multiply(fp_type a, State& state) const { - __m512 r = _mm512_set1_ps(a); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m512 r, fp_type* p) { - __m512 re = _mm512_load_ps(p + 32 * i); - __m512 im = _mm512_load_ps(p + 32 * i + 16); - - _mm512_store_ps(p + 32 * i, _mm512_mul_ps(re, r)); - _mm512_store_ps(p + 32 * i + 16, _mm512_mul_ps(im, r)); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 32, f, r, state.get()); - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> std::complex { - __m512 re1 = _mm512_load_ps(p1 + 32 * i); - __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); - __m512 re2 = _mm512_load_ps(p2 + 32 * i); - __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); - - __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2)); - __m512 ip_im = _mm512_fnmadd_ps(im1, re2, _mm512_mul_ps(re1, im2)); - - double re = detail::HorizontalSumAVX512(ip_re); - double im = detail::HorizontalSumAVX512(ip_im); - - return std::complex{re, im}; - }; - - using Op = std::plus>; - return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f, - Op(), state1.get(), state2.get()); - } - - double RealInnerProduct(const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> double { - __m512 re1 = _mm512_load_ps(p1 + 32 * i); - __m512 im1 = _mm512_load_ps(p1 + 32 * i + 16); - __m512 re2 = _mm512_load_ps(p2 + 32 * i); - __m512 im2 = _mm512_load_ps(p2 + 32 * i + 16); - - __m512 ip_re = _mm512_fmadd_ps(im1, im2, _mm512_mul_ps(re1, re2)); - - return detail::HorizontalSumAVX512(ip_re); - }; - - using Op = std::plus; - return Base::for_.RunReduce(MinSize(state1.num_qubits()) / 32, f, - Op(), 
state1.get(), state2.get()); - } - - template - std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - double norm = 0; - uint64_t size = MinSize(state.num_qubits()) / 32; - const fp_type* p = state.get(); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 16; ++j) { - double re = p[32 * k + j]; - double im = p[32 * k + 16 + j]; - norm += re * re + im * im; - } - } - - auto rs = GenerateRandomValues(num_samples, seed, norm); - - uint64_t m = 0; - double csum = 0; - bitstrings.reserve(num_samples); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 16; ++j) { - double re = p[32 * k + j]; - double im = p[32 * k + 16 + j]; - csum += re * re + im * im; - while (rs[m] < csum && m < num_samples) { - bitstrings.emplace_back(16 * k + j); - ++m; - } - } - } - - for (; m < num_samples; ++m) { - bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); - } - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - void Collapse(const MeasurementResult& mr, State& state) const { - auto f1 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, const fp_type* p) -> double { - __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits); - - __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i); - __m512 im = _mm512_maskz_load_ps(ml, p + 32 * i + 16); - __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re)); - - return detail::HorizontalSumAVX512(s1); - }; - - using Op = std::plus; - double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 32, f1, - Op(), mr.mask, mr.bits, state.get()); - - __m512 renorm = _mm512_set1_ps(1.0 / std::sqrt(norm)); - - auto f2 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, __m512 renorm, fp_type* p) { - __mmask16 ml = detail::GetZeroMaskAVX512(16 * i, mask, bits); - - __m512 re = _mm512_maskz_load_ps(ml, p + 32 * i); - __m512 im = 
_mm512_maskz_load_ps(ml, p + 32 * i + 16); - - re = _mm512_mul_ps(re, renorm); - im = _mm512_mul_ps(im, renorm); - - _mm512_store_ps(p + 32 * i, re); - _mm512_store_ps(p + 32 * i + 16, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 32, f2, - mr.mask, mr.bits, renorm, state.get()); - } - - std::vector PartialNorms(const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p) -> double { - __m512 re = _mm512_load_ps(p + 32 * i); - __m512 im = _mm512_load_ps(p + 32 * i + 16); - __m512 s1 = _mm512_fmadd_ps(im, im, _mm512_mul_ps(re, re)); - - return detail::HorizontalSumAVX512(s1); - }; - - using Op = std::plus; - return Base::for_.RunReduceP( - MinSize(state.num_qubits()) / 32, f, Op(), state.get()); - } - - uint64_t FindMeasuredBits( - unsigned m, double r, uint64_t mask, const State& state) const { - double csum = 0; - - uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 32, m); - uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 32, m); - - const fp_type* p = state.get(); - - for (uint64_t k = k0; k < k1; ++k) { - for (uint64_t j = 0; j < 16; ++j) { - auto re = p[32 * k + j]; - auto im = p[32 * k + j + 16]; - csum += re * re + im * im; - if (r < csum) { - return (16 * k + j) & mask; - } - } - } - - // Return the last bitstring in the unlikely case of underflow. - return (16 * k1 - 1) & mask; - } -}; - -} // namespace qsim - -#endif // STATESPACE_AVX512_H_ diff --git a/tpls/qsim/statespace_basic.h b/tpls/qsim/statespace_basic.h deleted file mode 100644 index 6468483..0000000 --- a/tpls/qsim/statespace_basic.h +++ /dev/null @@ -1,300 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_BASIC_H_ -#define STATESPACE_BASIC_H_ - -#include -#include -#include -#include - -#include "statespace.h" -#include "util.h" -#include "vectorspace.h" - -namespace qsim { - -/** - * Object containing context and routines for unoptimized state-vector - * manipulations. State is a non-vectorized sequence of one real amplitude - * followed by one imaginary amplitude. - */ -template -class StateSpaceBasic : - public StateSpace, VectorSpace, For, FP> { - private: - using Base = StateSpace, qsim::VectorSpace, For, FP>; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - template - explicit StateSpaceBasic(ForArgs&&... args) : Base(args...) {} - - static uint64_t MinSize(unsigned num_qubits) { - return 2 * (uint64_t{1} << num_qubits); - }; - - void InternalToNormalOrder(State& state) const {} - - void NormalToInternalOrder(State& state) const {} - - void SetAllZeros(State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - p[2 * i] = 0; - p[2 * i + 1] = 0; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get()); - } - - // Uniform superposition. - void SetStateUniform(State& state) const { - fp_type val = fp_type{1} / std::sqrt(uint64_t{1} << state.num_qubits()); - - auto f = [](unsigned n, unsigned m, uint64_t i, - fp_type val, fp_type* p) { - p[2 * i] = val; - p[2 * i + 1] = 0; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f, val, state.get()); - } - - // |0> state. 
- void SetStateZero(State& state) const { - SetAllZeros(state); - state.get()[0] = 1; - } - - static std::complex GetAmpl(const State& state, uint64_t i) { - uint64_t p = 2 * i; - return std::complex(state.get()[p], state.get()[p + 1]); - } - - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - uint64_t p = 2 * i; - state.get()[p] = std::real(ampl); - state.get()[p + 1] = std::imag(ampl); - } - - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - uint64_t p = 2 * i; - state.get()[p] = re; - state.get()[p + 1] = im; - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, - uint64_t bitsv, fp_type re_n, fp_type im_n, bool excludev, - fp_type* p) { - auto s = p + 2 * i; - bool in_mask = (i & maskv) == bitsv; - in_mask ^= excludev; - s[0] = in_mask ? re_n : s[0]; - s[1] = in_mask ? im_n : s[1]; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f, mask, bits, re, im, - exclude, state.get()); - } - - // Does the equivalent of dest += src elementwise. 
- bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, fp_type* p2) { - p2[2 * i] += p1[2 * i]; - p2[2 * i + 1] += p1[2 * i + 1]; - }; - - Base::for_.Run(MinSize(src.num_qubits()) / 2, f, src.get(), dest.get()); - - return true; - } - - // Does the equivalent of state *= a elementwise. - void Multiply(fp_type a, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type a, fp_type* p) { - p[2 * i] *= a; - p[2 * i + 1] *= a; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f, a, state.get()); - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> std::complex { - auto s1 = p1 + 2 * i; - auto s2 = p2 + 2 * i; - - double re = s1[0] * s2[0] + s1[1] * s2[1]; - double im = s1[0] * s2[1] - s1[1] * s2[0]; - - return std::complex{re, im}; - }; - - using Op = std::plus>; - return Base::for_.RunReduce( - MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get()); - } - - double RealInnerProduct(const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> double { - auto s1 = p1 + 2 * i; - auto s2 = p2 + 2 * i; - - return s1[0] * s2[0] + s1[1] * s2[1]; - }; - - using Op = std::plus; - return Base::for_.RunReduce( - MinSize(state1.num_qubits()) / 2, f, Op(), state1.get(), state2.get()); - } - - template - std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - double norm = 0; - uint64_t size = MinSize(state.num_qubits()) / 2; - - const fp_type* p = 
state.get(); - - for (uint64_t k = 0; k < size; ++k) { - double re = p[2 * k]; - double im = p[2 * k + 1]; - norm += re * re + im * im; - } - - auto rs = GenerateRandomValues(num_samples, seed, norm); - - uint64_t m = 0; - double csum = 0; - bitstrings.reserve(num_samples); - - for (uint64_t k = 0; k < size; ++k) { - double re = p[2 * k]; - double im = p[2 * k + 1]; - csum += re * re + im * im; - while (rs[m] < csum && m < num_samples) { - bitstrings.emplace_back(k); - ++m; - } - } - - for (; m < num_samples; ++m) { - bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); - } - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - void Collapse(const MeasurementResult& mr, State& state) const { - auto f1 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, const fp_type* p) -> double { - auto s = p + 2 * i; - return (i & mask) == bits ? s[0] * s[0] + s[1] * s[1] : 0; - }; - - using Op = std::plus; - double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 2, f1, - Op(), mr.mask, mr.bits, state.get()); - - double renorm = 1.0 / std::sqrt(norm); - - auto f2 = [](unsigned n, unsigned m, uint64_t i, - uint64_t mask, uint64_t bits, fp_type renorm, fp_type* p) { - auto s = p + 2 * i; - bool not_zero = (i & mask) == bits; - - s[0] = not_zero ? s[0] * renorm : 0; - s[1] = not_zero ? 
s[1] * renorm : 0; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f2, - mr.mask, mr.bits, renorm, state.get()); - } - - std::vector PartialNorms(const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p) -> double { - auto s = p + 2 * i; - return s[0] * s[0] + s[1] * s[1]; - }; - - using Op = std::plus; - return Base::for_.RunReduceP( - MinSize(state.num_qubits()) / 2, f, Op(), state.get()); - } - - uint64_t FindMeasuredBits( - unsigned m, double r, uint64_t mask, const State& state) const { - double csum = 0; - - uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 2, m); - uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 2, m); - - const fp_type* p = state.get(); - - for (uint64_t k = k0; k < k1; ++k) { - auto re = p[2 * k]; - auto im = p[2 * k + 1]; - csum += re * re + im * im; - if (r < csum) { - return k & mask; - } - } - - // Return the last bitstring in the unlikely case of underflow. - return (k1 - 1) & mask; - } -}; - -} // namespace qsim - -#endif // STATESPACE_BASIC_H_ diff --git a/tpls/qsim/statespace_cuda.h b/tpls/qsim/statespace_cuda.h deleted file mode 100644 index 660db07..0000000 --- a/tpls/qsim/statespace_cuda.h +++ /dev/null @@ -1,470 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef STATESPACE_CUDA_H_ -#define STATESPACE_CUDA_H_ - -#ifdef __NVCC__ - #include -#elif __HIP__ - #include - #include "cuda2hip.h" -#endif - -#include -#include -#include - -#include "statespace.h" -#include "statespace_cuda_kernels.h" -#include "vectorspace_cuda.h" -#include "util_cuda.h" - -namespace qsim { - -/** - * Object containing context and routines for CUDA state-vector manipulations. - * State is a vectorized sequence of 32 real components followed by 32 - * imaginary components. 32 floating numbers can be proccessed in parallel by - * a single warp. It is not recommended to use `GetAmpl` and `SetAmpl`. - */ -template -class StateSpaceCUDA : - public StateSpace, VectorSpaceCUDA, FP> { - private: - using Base = StateSpace, qsim::VectorSpaceCUDA, FP>; - - protected: - struct Grid { - unsigned threads; - unsigned dblocks; - unsigned blocks; - }; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - struct Parameter { - /** - * The number of threads per block. - * Should be 2 to the power of k, where k is in the range [5,10]. - */ - unsigned num_threads = 512; - /** - * The number of data blocks. Each thread processes num_dblocks data - * blocks in reductions (norms, inner products, etc). 
- */ - unsigned num_dblocks = 16; - }; - - explicit StateSpaceCUDA(const Parameter& param) - : param_(param), scratch_(nullptr), scratch_size_(0) {} - - virtual ~StateSpaceCUDA() { - if (scratch_ != nullptr) { - ErrorCheck(cudaFree(scratch_)); - } - } - - static uint64_t MinSize(unsigned num_qubits) { - return std::max(uint64_t{64}, 2 * (uint64_t{1} << num_qubits)); - }; - - void InternalToNormalOrder(State& state) const { - uint64_t size = MinSize(state.num_qubits()) / 2; - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - unsigned bytes = 2 * threads * sizeof(fp_type); - - InternalToNormalOrderKernel<<>>(state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void NormalToInternalOrder(State& state) const { - uint64_t size = MinSize(state.num_qubits()) / 2; - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - unsigned bytes = 2 * threads * sizeof(fp_type); - - NormalToInternalOrderKernel<<>>(state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - void SetAllZeros(State& state) const { - ErrorCheck(cudaMemset(state.get(), 0, - MinSize(state.num_qubits()) * sizeof(fp_type))); - } - - // Uniform superposition. - void SetStateUniform(State& state) const { - uint64_t size = MinSize(state.num_qubits()) / 2; - uint64_t hsize = uint64_t{1} << state.num_qubits(); - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - - fp_type v = double{1} / std::sqrt(hsize); - - SetStateUniformKernel<<>>(v, hsize, state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - // |0> state. 
- void SetStateZero(State& state) const { - SetAllZeros(state); - fp_type one[1] = {1}; - ErrorCheck( - cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // It is not recommended to use this function. - static std::complex GetAmpl(const State& state, uint64_t i) { - fp_type re, im; - auto p = state.get() + 64 * (i / 32) + i % 32; - ErrorCheck(cudaMemcpy(&re, p, sizeof(fp_type), cudaMemcpyDeviceToHost)); - ErrorCheck( - cudaMemcpy(&im, p + 32, sizeof(fp_type), cudaMemcpyDeviceToHost)); - return std::complex(re, im); - } - - // It is not recommended to use this function. - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - fp_type re = std::real(ampl); - fp_type im = std::imag(ampl); - auto p = state.get() + 64 * (i / 32) + i % 32; - ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // It is not recommended to use this function. - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - auto p = state.get() + 64 * (i / 32) + i % 32; - ErrorCheck(cudaMemcpy(p, &re, sizeof(fp_type), cudaMemcpyHostToDevice)); - ErrorCheck( - cudaMemcpy(p + 32, &im, sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val), exclude); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. 
- void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - uint64_t size = MinSize(state.num_qubits()) / 2; - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - - BulkSetAmplKernel<<>>( - mask, bits, re, im, exclude, state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - // Does the equivalent of dest += src elementwise. - bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - uint64_t size = MinSize(src.num_qubits()); - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - - AddKernel<<>>(src.get(), dest.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - return true; - } - - // Does the equivalent of state *= a elementwise. - void Multiply(fp_type a, State& state) const { - uint64_t size = MinSize(state.num_qubits()); - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - - MultiplyKernel<<>>(a, state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - using C = Complex; - auto r = Reduce>(state1, state2); - - return {r.re, r.im}; - } - - double RealInnerProduct(const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - return Reduce>(state1, state2); - } - - double Norm(const State& state) const { - return Reduce>(state, state); - } - - template - std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - Grid g1 = GetGrid1(MinSize(state.num_qubits()) / 
2); - unsigned bytes = g1.threads * sizeof(double); - - unsigned scratch_size = (g1.blocks + 1) * sizeof(double) - + num_samples * (sizeof(uint64_t) + sizeof(DistrRealType)); - - void* scratch = AllocScratch(scratch_size); - - double* d_res2 = (double*) scratch; - double* d_res1 = d_res2 + 1; - uint64_t* d_bitstrings = (uint64_t*) (d_res1 + g1.blocks); - DistrRealType* d_rs = (DistrRealType *) (d_bitstrings + num_samples); - - auto op1 = RealProduct(); - auto op2 = Plus(); - - Reduce1Kernel<<>>( - g1.dblocks, op1, op2, op2, state.get(), state.get(), d_res1); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - double norm; - - if (g1.blocks == 1) { - ErrorCheck( - cudaMemcpy(&norm, d_res1, sizeof(double), cudaMemcpyDeviceToHost)); - } else { - Grid g2 = GetGrid2(g1.blocks); - unsigned bytes = g2.threads * sizeof(double); - - auto op3 = Plus(); - - Reduce2Kernel<<>>( - g2.dblocks, g1.blocks, op3, op3, d_res1, d_res2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&norm, d_res2, sizeof(double), cudaMemcpyDeviceToHost)); - } - - // TODO: generate random values on the device. 
- auto rs = GenerateRandomValues(num_samples, seed, norm); - - ErrorCheck(cudaMemcpy(d_rs, rs.data(), - num_samples * sizeof(DistrRealType), - cudaMemcpyHostToDevice)); - - SampleKernel<<<1, g1.threads>>>(g1.blocks, g1.dblocks, num_samples, - d_rs, d_res1, state.get(), d_bitstrings); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - bitstrings.resize(num_samples, 0); - - ErrorCheck(cudaMemcpy(bitstrings.data(), d_bitstrings, - num_samples * sizeof(uint64_t), - cudaMemcpyDeviceToHost)); - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - void Collapse(const MeasurementResult& mr, State& state) const { - using Op = RealProduct; - double r = Reduce(mr.mask, mr.bits, state, state); - fp_type renorm = 1 / std::sqrt(r); - - uint64_t size = MinSize(state.num_qubits()) / 2; - - unsigned threads = std::min(size, uint64_t{param_.num_threads}); - unsigned blocks = size / threads; - - CollapseKernel<<>>(mr.mask, mr.bits, renorm, state.get()); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - } - - std::vector PartialNorms(const State& state) const { - Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); - - unsigned scratch_size = g.blocks * sizeof(double); - unsigned bytes = g.threads * sizeof(double); - - double* d_res = (double*) AllocScratch(scratch_size); - - auto op1 = RealProduct(); - auto op2 = Plus(); - - Reduce1Kernel<<>>( - g.dblocks, op1, op2, op2, state.get(), state.get(), d_res); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - std::vector norms(g.blocks); - - ErrorCheck( - cudaMemcpy(norms.data(), d_res, scratch_size, cudaMemcpyDeviceToHost)); - - return norms; - } - - uint64_t FindMeasuredBits( - unsigned m, double r, uint64_t mask, const State& state) const { - Grid g = GetGrid1(MinSize(state.num_qubits()) / 2); - - uint64_t res; - uint64_t* d_res = (uint64_t*) AllocScratch(sizeof(uint64_t)); - - FindMeasuredBitsKernel<<<1, 
g.threads>>>( - m, g.dblocks, r, state.get(), d_res); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&res, d_res, sizeof(uint64_t), cudaMemcpyDeviceToHost)); - - return res & mask; - } - - protected: - Parameter param_; - - void* AllocScratch(uint64_t size) const { - if (size > scratch_size_) { - if (scratch_ != nullptr) { - ErrorCheck(cudaFree(scratch_)); - } - - ErrorCheck(cudaMalloc(const_cast(&scratch_), size)); - - const_cast(scratch_size_) = size; - } - - return scratch_; - } - - Grid GetGrid1(uint64_t size) const { - Grid grid; - - grid.threads = std::min(size, uint64_t{param_.num_threads}); - grid.dblocks = std::min(size / grid.threads, uint64_t{param_.num_dblocks}); - grid.blocks = size / (grid.threads * grid.dblocks); - - return grid; - } - - Grid GetGrid2(unsigned size) const { - Grid grid; - - grid.threads = std::min(param_.num_threads, std::max(32U, size)); - grid.dblocks = std::max(1U, size / grid.threads); - grid.blocks = 1; - - return grid; - } - - template - FP2 Reduce(const State& state1, const State& state2) const { - return Reduce(0, 0, state1, state2); - } - - template - FP2 Reduce(uint64_t mask, uint64_t bits, - const State& state1, const State& state2) const { - uint64_t size = MinSize(state1.num_qubits()) / 2; - - Grid g1 = GetGrid1(size); - unsigned bytes = g1.threads * sizeof(FP1); - - FP2* d_res2 = (FP2*) AllocScratch((g1.blocks + 1) * sizeof(FP2)); - FP2* d_res1 = d_res2 + 1; - - auto op1 = Op(); - auto op2 = Plus(); - auto op3 = Plus::type>(); - - if (mask == 0) { - Reduce1Kernel<<>>( - g1.dblocks, op1, op2, op3, state1.get(), state2.get(), d_res1); - } else { - Reduce1MaskedKernel<<>>( - g1.dblocks, mask, bits, op1, op2, op3, state1.get(), state2.get(), - d_res1); - } - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - FP2 result; - - if (g1.blocks == 1) { - ErrorCheck( - cudaMemcpy(&result, d_res1, sizeof(FP2), cudaMemcpyDeviceToHost)); - } else { - 
Grid g2 = GetGrid2(g1.blocks); - unsigned bytes = g2.threads * sizeof(FP2); - - auto op2 = Plus(); - auto op3 = Plus::type>(); - - Reduce2Kernel<<>>( - g2.dblocks, g1.blocks, op2, op3, d_res1, d_res2); - ErrorCheck(cudaPeekAtLastError()); - ErrorCheck(cudaDeviceSynchronize()); - - ErrorCheck( - cudaMemcpy(&result, d_res2, sizeof(FP2), cudaMemcpyDeviceToHost)); - } - - return result; - } - - private: - void* scratch_; - uint64_t scratch_size_; -}; - -} // namespace qsim - -#endif // STATESPACE_CUDA_H_ diff --git a/tpls/qsim/statespace_cuda_kernels.h b/tpls/qsim/statespace_cuda_kernels.h deleted file mode 100644 index b54ebca..0000000 --- a/tpls/qsim/statespace_cuda_kernels.h +++ /dev/null @@ -1,355 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef STATESPACE_CUDA_KERNELS_H_ -#define STATESPACE_CUDA_KERNELS_H_ - -#ifdef __NVCC__ - #include -#elif __HIP__ - #include - #include "cuda2hip.h" -#endif - -#include "util_cuda.h" - -namespace qsim { - -namespace detail { - -template -__device__ __forceinline__ FP1 BlockReduce1( - uint64_t n, Op1 op1, Op2 op2, Op3 op3, const FP2* s1, const FP2* s2) { - extern __shared__ float shared[]; - FP1* partial1 = (FP1*) shared; - - unsigned tid = threadIdx.x; - unsigned warp = threadIdx.x / warp_size; - unsigned lane = threadIdx.x % warp_size; - - uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; - uint64_t k1 = k0 + 2 * n * blockDim.x; - - FP1 r; - - r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]); - while ((k0 += 2 * blockDim.x) < k1) { - r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size])); - } - - partial1[tid] = r; - - __shared__ FP1 partial2[warp_size]; - - if (tid < warp_size) { - partial2[tid] = 0; - } - - __syncthreads(); - - FP1 val = WarpReduce(partial1[tid], op3); - - if (lane == 0) { - partial2[warp] = val; - } - - __syncthreads(); - - FP1 result = 0; - - if (tid < warp_size) { - result = WarpReduce(partial2[tid], op3); - } - - return result; -} - -template -__device__ __forceinline__ FP1 BlockReduce1Masked( - uint64_t n, uint64_t mask, uint64_t bits, Op1 op1, Op2 op2, Op3 op3, - const FP2* s1, const FP2* s2) { - extern __shared__ float shared[]; - FP1* partial1 = (FP1*) shared; - - unsigned tid = threadIdx.x; - unsigned warp = threadIdx.x / warp_size; - unsigned lane = threadIdx.x % warp_size; - - uint64_t k0 = 2 * n * blockIdx.x * blockDim.x + 2 * tid - lane; - uint64_t k1 = k0 + 2 * n * blockDim.x; - - FP1 r = 0; - - if (((k0 + lane) / 2 & mask) == bits) { - r = op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size]); - } - while ((k0 += 2 * blockDim.x) < k1) { - if (((k0 + lane) / 2 & mask) == bits) { - r = op2(r, op1(s1[k0], s1[k0 + warp_size], s2[k0], s2[k0 + warp_size])); - } - } - - 
partial1[tid] = r; - - __shared__ FP1 partial2[warp_size]; - - if (tid < warp_size) { - partial2[tid] = 0; - } - - __syncthreads(); - - FP1 val = WarpReduce(partial1[tid], op3); - - if (lane == 0) { - partial2[warp] = val; - } - - __syncthreads(); - - FP1 result = 0; - - if (tid < warp_size) { - result = WarpReduce(partial2[tid], op3); - } - - return result; -} - -template -__device__ __forceinline__ FP1 BlockReduce2( - uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s) { - extern __shared__ float shared[]; - FP1* partial1 = (FP1*) shared; - - unsigned tid = threadIdx.x; - uint64_t k0 = n * blockIdx.x * blockDim.x + tid; - uint64_t k1 = k0 + n * blockDim.x; - - FP1 r = 0; - - if (tid < size) { - r = s[k0]; - while ((k0 += blockDim.x) < k1) { - r = op2(r, s[k0]); - } - } - - partial1[tid] = r; - - __shared__ FP1 partial2[warp_size]; - - if (tid < warp_size) { - partial2[tid] = 0; - } - - __syncthreads(); - - FP1 val = WarpReduce(partial1[tid], op3); - - if (threadIdx.x % warp_size == 0) { - partial2[threadIdx.x / warp_size] = val; - } - - __syncthreads(); - - FP1 result = 0; - - if (tid < warp_size) { - result = WarpReduce(partial2[tid], op3); - } - - return result; -} - -} // namespace detail - -template -__global__ void Reduce1Kernel(uint64_t n, Op1 op1, Op2 op2, Op3 op3, - const FP2* s1, const FP2* s2, FP3* result) { - FP1 sum = detail::BlockReduce1(n, op1, op2, op3, s1, s2); - - if (threadIdx.x == 0) { - result[blockIdx.x] = sum; - } -} - -template -__global__ void Reduce1MaskedKernel(uint64_t n, uint64_t mask, uint64_t bits, - Op1 op1, Op2 op2, Op3 op3, - const FP2* s1, const FP2* s2, FP3* result) { - FP1 sum = - detail::BlockReduce1Masked(n, mask, bits, op1, op2, op3, s1, s2); - - if (threadIdx.x == 0) { - result[blockIdx.x] = sum; - } -} - -template -__global__ void Reduce2Kernel( - uint64_t n, uint64_t size, Op2 op2, Op3 op3, const FP2* s, FP3* result) { - FP1 sum = detail::BlockReduce2(n, size, op2, op3, s); - - if (threadIdx.x == 0) { - 
result[blockIdx.x] = sum; - } -} - -template -__global__ void InternalToNormalOrderKernel(FP* state) { - unsigned lane = threadIdx.x % warp_size; - unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; - - extern __shared__ float shared[]; - FP* buf = (FP*) shared; - - buf[l] = state[k]; - buf[l + warp_size] = state[k + warp_size]; - - __syncthreads(); - - state[k + lane] = buf[l]; - state[k + lane + 1] = buf[l + warp_size]; -} - -template -__global__ void NormalToInternalOrderKernel(FP* state) { - unsigned lane = threadIdx.x % warp_size; - unsigned l = 2 * threadIdx.x - lane; - uint64_t k = 2 * uint64_t{blockIdx.x} * blockDim.x + l; - - extern __shared__ float shared[]; - FP* buf = (FP*) shared; - - buf[l] = state[k]; - buf[l + warp_size] = state[k + warp_size]; - - __syncthreads(); - - state[k] = buf[l + lane]; - state[k + warp_size] = buf[l + lane + 1]; -} - -template -__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { - unsigned lane = threadIdx.x % warp_size; - uint64_t k = 2 * (uint64_t{blockIdx.x} * blockDim.x + threadIdx.x) - lane; - - state[k] = lane < size ? 
v : 0; - state[k + warp_size] = 0; -} - -template -__global__ void AddKernel(const FP* state1, FP* state2) { - uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; - state2[k] += state1[k]; -} - -template -__global__ void MultiplyKernel(FP a, FP* state) { - uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; - state[k] *= a; -} - -template -__global__ void CollapseKernel(uint64_t mask, uint64_t bits, FP r, FP* state) { - uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; - uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; - - if ((k1 & mask) == bits) { - state[k2] *= r; - state[k2 + warp_size] *= r; - } else { - state[k2] = 0; - state[k2 + warp_size] = 0; - } -} - -template -__global__ void BulkSetAmplKernel( - uint64_t mask, uint64_t bits, FP re, FP im, bool exclude, FP* state) { - uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; - uint64_t k2 = 2 * k1 - threadIdx.x % warp_size; - - bool set = ((k1 & mask) == bits) ^ exclude; - - if (set) { - state[k2] = re; - state[k2 + warp_size] = im; - } -} - -template -__global__ void SampleKernel(unsigned num_blocks, - uint64_t n, uint64_t num_samples, - const FP1* rs, const FP2* ps, const FP3* state, - uint64_t *bitstrings) { - // Use just one thread. This can be somewhat slow. - if (threadIdx.x == 0) { - uint64_t m = 0; - double csum = 0; - - for (unsigned block_id = 0; block_id < num_blocks; ++block_id) { - uint64_t km = n * blockDim.x; - uint64_t k0 = block_id * km; - - for (uint64_t k = 0; k < km; ++k) { - uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32; - FP3 re = state[l]; - FP3 im = state[l + warp_size]; - csum += re * re + im * im; - while (rs[m] < csum && m < num_samples) { - bitstrings[m++] = k0 + k; - } - } - } - } -} - -template -__global__ void FindMeasuredBitsKernel( - uint64_t block_id, uint64_t n, double r, const FP* state, uint64_t* res) { - // Use just one thread. This can be somewhat slow, however, this is - // more or less consistent with CPU implementations. 
- if (threadIdx.x == 0) { - double csum = 0; - uint64_t km = n * blockDim.x; - uint64_t k0 = block_id * km; - - for (uint64_t k = 0; k < km; ++k) { - uint64_t l = 2 * k0 + 64 * (k / 32) + k % 32; - FP re = state[l]; - FP im = state[l + warp_size]; - csum += re * re + im * im; - if (r < csum) { - *res = k0 + k; - return; - } - } - - *res = k0 + n * blockDim.x - 1; - } -} - -} // namespace qsim - -#endif // STATESPACE_CUDA_KERNELS_H_ diff --git a/tpls/qsim/statespace_custatevec.h b/tpls/qsim/statespace_custatevec.h deleted file mode 100644 index f2f5de1..0000000 --- a/tpls/qsim/statespace_custatevec.h +++ /dev/null @@ -1,376 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_CUSTATEVEC_H_ -#define STATESPACE_CUSTATEVEC_H_ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "statespace.h" -#include "util_custatevec.h" -#include "vectorspace_cuda.h" - -namespace qsim { - -namespace detail { - -template -__global__ void SetStateUniformKernel(FP v, uint64_t size, FP* state) { - uint64_t k = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x; - - if (k < size) { - state[2 * k] = v; - state[2 * k + 1] = 0; - } -} - -} // namespace detail - -/** - * Object containing context and routines for cuStateVec state-vector - * manipulations. It is not recommended to use `GetAmpl` and `SetAmpl`. 
- */ -template -class StateSpaceCuStateVec : - public StateSpace, VectorSpaceCUDA, FP> { - private: - using Base = StateSpace, qsim::VectorSpaceCUDA, FP>; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - static constexpr auto is_float = std::is_same::value; - - static constexpr auto kStateType = is_float ? CUDA_C_32F : CUDA_C_64F; - static constexpr auto kMatrixType = kStateType; - static constexpr auto kExpectType = CUDA_C_64F; - static constexpr auto kComputeType = - is_float ? CUSTATEVEC_COMPUTE_32F : CUSTATEVEC_COMPUTE_64F; - static constexpr auto kMatrixLayout = CUSTATEVEC_MATRIX_LAYOUT_ROW; - - explicit StateSpaceCuStateVec(const cublasHandle_t& cublas_handle, - const custatevecHandle_t& custatevec_handle) - : cublas_handle_(cublas_handle), custatevec_handle_(custatevec_handle), - workspace_(nullptr), workspace_size_(0) {} - - virtual ~StateSpaceCuStateVec() { - if (workspace_ != nullptr) { - ErrorCheck(cudaFree(workspace_)); - } - } - - static uint64_t MinSize(unsigned num_qubits) { - return 2 * (uint64_t{1} << num_qubits); - }; - - void InternalToNormalOrder(State& state) const { - } - - void NormalToInternalOrder(State& state) const { - } - - void SetAllZeros(State& state) const { - ErrorCheck(cudaMemset(state.get(), 0, - MinSize(state.num_qubits()) * sizeof(fp_type))); - } - - // Uniform superposition. - void SetStateUniform(State& state) const { - uint64_t size = uint64_t{1} << state.num_qubits(); - - unsigned threads = size < 256 ? size : 256; - unsigned blocks = size / threads; - - fp_type v = double{1} / std::sqrt(size); - - detail::SetStateUniformKernel<<>>(v, size, state.get()); - ErrorCheck(cudaPeekAtLastError()); - } - - // |0> state. - void SetStateZero(State& state) const { - SetAllZeros(state); - fp_type one[1] = {1}; - ErrorCheck( - cudaMemcpy(state.get(), one, sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // It is not recommended to use this function. 
- static std::complex GetAmpl(const State& state, uint64_t i) { - fp_type a[2]; - auto p = state.get() + 2 * i; - ErrorCheck(cudaMemcpy(a, p, 2 * sizeof(fp_type), cudaMemcpyDeviceToHost)); - return std::complex(a[0], a[1]); - } - - // It is not recommended to use this function. - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - fp_type a[2] = {std::real(ampl), std::imag(ampl)}; - auto p = state.get() + 2 * i; - ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // It is not recommended to use this function. - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - fp_type a[2] = {re, im}; - auto p = state.get() + 2 * i; - ErrorCheck(cudaMemcpy(p, a, 2 * sizeof(fp_type), cudaMemcpyHostToDevice)); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - // Not implemented. - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - // Not implemented. - } - - // Does the equivalent of dest += src elementwise. 
- bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - uint64_t size = uint64_t{1} << src.num_qubits(); - - if (is_float) { - cuComplex a = {1.0, 0.0}; - auto p1 = (const cuComplex*) src.get(); - auto p2 = (cuComplex*) dest.get(); - ErrorCheck(cublasCaxpy(cublas_handle_, size, &a, p1, 1, p2, 1)); - } else { - cuDoubleComplex a = {1.0, 0.0}; - auto p1 = (const cuDoubleComplex*) src.get(); - auto p2 = (cuDoubleComplex*) dest.get(); - ErrorCheck(cublasZaxpy(cublas_handle_, size, &a, p1, 1, p2, 1)); - } - - return true; - } - - // Does the equivalent of state *= a elementwise. - void Multiply(fp_type a, State& state) const { - uint64_t size = uint64_t{1} << state.num_qubits(); - - if (is_float) { - float a1 = a; - auto p = (cuComplex*) state.get(); - ErrorCheck(cublasCsscal(cublas_handle_, size, &a1, p, 1)); - } else { - double a1 = a; - auto p = (cuDoubleComplex*) state.get(); - ErrorCheck(cublasZdscal(cublas_handle_, size, &a1, p, 1)); - } - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - uint64_t size = uint64_t{1} << state1.num_qubits(); - - if (is_float) { - cuComplex result; - auto p1 = (const cuComplex*) state1.get(); - auto p2 = (const cuComplex*) state2.get(); - ErrorCheck(cublasCdotc(cublas_handle_, size, p1, 1, p2, 1, &result)); - return {cuCrealf(result), cuCimagf(result)}; - } else { - cuDoubleComplex result; - auto p1 = (const cuDoubleComplex*) state1.get(); - auto p2 = (const cuDoubleComplex*) state2.get(); - ErrorCheck(cublasZdotc(cublas_handle_, size, p1, 1, p2, 1, &result)); - return {cuCreal(result), cuCimag(result)}; - } - } - - double RealInnerProduct(const State& state1, const State& state2) const { - return std::real(InnerProduct(state1, state2)); - } - - double Norm(const State& state) const { - uint64_t size = uint64_t{1} << state.num_qubits(); - - if (is_float) 
{ - float result; - auto p = (const cuComplex*) state.get(); - ErrorCheck(cublasScnrm2(cublas_handle_, size, p, 1, &result)); - return result * result; - } else { - double result; - auto p = (const cuDoubleComplex*) state.get(); - ErrorCheck(cublasDznrm2(cublas_handle_, size, p, 1, &result)); - return result * result; - } - } - - template - std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - auto rs = GenerateRandomValues(num_samples, seed, 1.0); - - size_t workspace_size; - custatevecSamplerDescriptor_t sampler; - - ErrorCheck(custatevecSamplerCreate( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), &sampler, num_samples, - &workspace_size)); - - AllocWorkSpace(workspace_size); - - ErrorCheck(custatevecSamplerPreprocess( - custatevec_handle_, sampler, workspace_, workspace_size)); - - std::vector bitstrings0(num_samples); - std::vector bitordering; - - bitordering.reserve(state.num_qubits()); - for (unsigned i = 0; i < state.num_qubits(); ++i) { - bitordering.push_back(i); - } - - ErrorCheck(custatevecSamplerSample( - custatevec_handle_, sampler, bitstrings0.data(), - bitordering.data(), state.num_qubits(), rs.data(), - num_samples, CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER)); - - bitstrings.reserve(num_samples); - for (unsigned i = 0; i < num_samples; ++i) { - bitstrings.push_back(bitstrings0[i]); - } - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - template - MeasurementResult Measure(const std::vector& qubits, - RGen& rgen, State& state, - bool no_collapse = false) const { - auto r = RandomValue(rgen, 1.0); - - MeasurementResult result; - - result.valid = true; - result.mask = 0; - result.bits = 0; - result.bitstring.resize(qubits.size(), 0); - - for (auto q : qubits) { - if (q >= state.num_qubits()) { - result.valid = false; - return result; - } - - result.mask |= uint64_t{1} << q; - } - - auto collapse = 
no_collapse ? - CUSTATEVEC_COLLAPSE_NONE : CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO; - - ErrorCheck(custatevecBatchMeasure( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), (int*) result.bitstring.data(), - (int*) qubits.data(), qubits.size(), r, collapse)); - - for (std::size_t i = 0; i < result.bitstring.size(); ++i) { - result.bits |= result.bitstring[i] << qubits[i]; - } - - return result; - } - - template - MeasurementResult VirtualMeasure(const std::vector& qubits, - RGen& rgen, const State& state) const { - return Measure(qubits, rgen, const_cast(state), true); - } - - void Collapse(const MeasurementResult& mr, State& state) const { - unsigned count = 0; - - std::vector bitstring; - std::vector bitordering; - - bitstring.reserve(state.num_qubits()); - bitordering.reserve(state.num_qubits()); - - for (unsigned i = 0; i < state.num_qubits(); ++i) { - if (((mr.mask >> i) & 1) != 0) { - bitstring.push_back((mr.bits >> i) & 1); - bitordering.push_back(i); - ++count; - } - } - - ErrorCheck(custatevecCollapseByBitString( - custatevec_handle_, state.get(), kStateType, - state.num_qubits(), bitstring.data(), bitordering.data(), - count, 1.0)); - - // TODO: do we need the following? - double norm = Norm(state); - Multiply(1.0 / std::sqrt(norm), state); - } - - private: - void* AllocWorkSpace(size_t size) const { - if (size > workspace_size_) { - if (workspace_ != nullptr) { - ErrorCheck(cudaFree(workspace_)); - } - - ErrorCheck(cudaMalloc(const_cast(&workspace_), size)); - - const_cast(workspace_size_) = size; - } - - return workspace_; - } - - const cublasHandle_t cublas_handle_; - const custatevecHandle_t custatevec_handle_; - - void* workspace_; - size_t workspace_size_; -}; - -} // namespace qsim - -#endif // STATESPACE_CUSTATEVEC_H_ diff --git a/tpls/qsim/statespace_sse.h b/tpls/qsim/statespace_sse.h deleted file mode 100644 index cf41a09..0000000 --- a/tpls/qsim/statespace_sse.h +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright 2019 Google LLC. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef STATESPACE_SSE_H_ -#define STATESPACE_SSE_H_ - -#include - -#include -#include -#include -#include -#include - -#include "statespace.h" -#include "util.h" -#include "vectorspace.h" - -namespace qsim { - -namespace detail { - -inline __m128i GetZeroMaskSSE(uint64_t i, uint64_t mask, uint64_t bits) { - __m128i s1 = _mm_set_epi64x(i + 2, i + 0); - __m128i s2 = _mm_set_epi64x(i + 3, i + 1); - __m128i ma = _mm_set1_epi64x(mask); - __m128i bi = _mm_set1_epi64x(bits); - - s1 = _mm_and_si128(s1, ma); - s2 = _mm_and_si128(s2, ma); - - s1 = _mm_cmpeq_epi64(s1, bi); - s2 = _mm_cmpeq_epi64(s2, bi); - - return _mm_blend_epi16(s1, s2, 204); // 11001100 -} - -inline double HorizontalSumSSE(__m128 s) { - __m128 ss = _mm_movehdup_ps(s); - __m128 s1 = _mm_add_ps(s, ss); - - return _mm_cvtss_f32(_mm_add_ss(s1, _mm_movehl_ps(ss, s1))); -} - -} // namespace detail - -/** - * Object containing context and routines for SSE state-vector manipulations. - * State is a vectorized sequence of four real components followed by four - * imaginary components. Four single-precison floating numbers can be loaded - * into an SSE register. 
- */ -template -class StateSpaceSSE : - public StateSpace, VectorSpace, For, float> { - private: - using Base = StateSpace, qsim::VectorSpace, For, float>; - - public: - using State = typename Base::State; - using fp_type = typename Base::fp_type; - - template - explicit StateSpaceSSE(ForArgs&&... args) : Base(args...) {} - - static uint64_t MinSize(unsigned num_qubits) { - return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits)); - }; - - void InternalToNormalOrder(State& state) const { - if (state.num_qubits() == 1) { - auto s = state.get(); - - s[2] = s[1]; - s[1] = s[4]; - s[3] = s[5]; - - for (uint64_t i = 4; i < 8; ++i) { - s[i] = 0; - } - } else { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - auto s = p + 8 * i; - - fp_type re[3]; - fp_type im[3]; - - for (uint64_t i = 0; i < 3; ++i) { - re[i] = s[i + 1]; - im[i] = s[i + 4]; - } - - for (uint64_t i = 0; i < 3; ++i) { - s[2 * i + 1] = im[i]; - s[2 * i + 2] = re[i]; - } - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get()); - } - } - - void NormalToInternalOrder(State& state) const { - if (state.num_qubits() == 1) { - auto s = state.get(); - - s[4] = s[1]; - s[1] = s[2]; - s[5] = s[3]; - - s[2] = 0; - s[3] = 0; - s[6] = 0; - s[7] = 0; - } else { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - auto s = p + 8 * i; - - fp_type re[3]; - fp_type im[3]; - - for (uint64_t i = 0; i < 3; ++i) { - im[i] = s[2 * i + 1]; - re[i] = s[2 * i + 2]; - } - - for (uint64_t i = 0; i < 3; ++i) { - s[i + 1] = re[i]; - s[i + 4] = im[i]; - } - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, state.get()); - } - } - - void SetAllZeros(State& state) const { - __m128 val0 = _mm_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) { - _mm_store_ps(p + 8 * i, val0); - _mm_store_ps(p + 8 * i + 4, val0); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get()); - } - - // Uniform superposition. 
- void SetStateUniform(State& state) const { - __m128 val0 = _mm_setzero_ps(); - __m128 valu; - - fp_type v = double{1} / std::sqrt(uint64_t{1} << state.num_qubits()); - - if (state.num_qubits() == 1) { - valu = _mm_set_ps(0, 0, v, v); - } else { - valu = _mm_set1_ps(v); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - __m128 val0, __m128 valu, fp_type* p) { - _mm_store_ps(p + 8 * i, valu); - _mm_store_ps(p + 8 * i + 4, val0); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, valu, state.get()); - } - - // |0> state. - void SetStateZero(State& state) const { - SetAllZeros(state); - state.get()[0] = 1; - } - - static std::complex GetAmpl(const State& state, uint64_t i) { - uint64_t p = (8 * (i / 4)) + (i % 4); - return std::complex(state.get()[p], state.get()[p + 4]); - } - - static void SetAmpl( - State& state, uint64_t i, const std::complex& ampl) { - uint64_t p = (8 * (i / 4)) + (i % 4); - state.get()[p] = std::real(ampl); - state.get()[p + 4] = std::imag(ampl); - } - - static void SetAmpl(State& state, uint64_t i, fp_type re, fp_type im) { - uint64_t p = (8 * (i / 4)) + (i % 4); - state.get()[p] = re; - state.get()[p + 4] = im; - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. - void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, - const std::complex& val, - bool exclude = false) const { - BulkSetAmpl(state, mask, bits, std::real(val), std::imag(val)); - } - - // Sets state[i] = complex(re, im) where (i & mask) == bits. - // if `exclude` is true then the criteria becomes (i & mask) != bits. 
- void BulkSetAmpl(State& state, uint64_t mask, uint64_t bits, fp_type re, - fp_type im, bool exclude = false) const { - __m128 re_reg = _mm_set1_ps(re); - __m128 im_reg = _mm_set1_ps(im); - __m128i exclude_reg = _mm_setzero_si128(); - if (exclude) { - exclude_reg = _mm_cmpeq_epi32(exclude_reg, exclude_reg); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, uint64_t maskv, - uint64_t bitsv, __m128 re_n, __m128 im_n, __m128i exclude_n, - fp_type* p) { - __m128 ml = _mm_castsi128_ps(_mm_xor_si128( - detail::GetZeroMaskSSE(4 * i, maskv, bitsv), exclude_n)); - - __m128 re = _mm_load_ps(p + 8 * i); - __m128 im = _mm_load_ps(p + 8 * i + 4); - - re = _mm_blendv_ps(re, re_n, ml); - im = _mm_blendv_ps(im, im_n, ml); - - _mm_store_ps(p + 8 * i, re); - _mm_store_ps(p + 8 * i + 4, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, mask, bits, re_reg, - im_reg, exclude_reg, state.get()); - } - - // Does the equivalent of dest += src elementwise. - bool Add(const State& src, State& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, fp_type* p2) { - __m128 re1 = _mm_load_ps(p1 + 8 * i); - __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); - __m128 re2 = _mm_load_ps(p2 + 8 * i); - __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); - - _mm_store_ps(p2 + 8 * i, _mm_add_ps(re1, re2)); - _mm_store_ps(p2 + 8 * i + 4, _mm_add_ps(im1, im2)); - }; - - Base::for_.Run(MinSize(src.num_qubits()) / 8, f, src.get(), dest.get()); - - return true; - } - - // Does the equivalent of state *= a elementwise. 
- void Multiply(fp_type a, State& state) const { - __m128 r = _mm_set1_ps(a); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m128 r, fp_type* p) { - __m128 re = _mm_load_ps(p + 8 * i); - __m128 im = _mm_load_ps(p + 8 * i + 4); - - re = _mm_mul_ps(re, r); - im = _mm_mul_ps(im, r); - - _mm_store_ps(p + 8 * i, re); - _mm_store_ps(p + 8 * i + 4, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, r, state.get()); - } - - std::complex InnerProduct( - const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> std::complex { - __m128 re1 = _mm_load_ps(p1 + 8 * i); - __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); - __m128 re2 = _mm_load_ps(p2 + 8 * i); - __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); - - __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2)); - __m128 ip_im = _mm_sub_ps(_mm_mul_ps(re1, im2), _mm_mul_ps(im1, re2)); - - double re = detail::HorizontalSumSSE(ip_re); - double im = detail::HorizontalSumSSE(ip_im); - - return std::complex{re, im}; - }; - - using Op = std::plus>; - return Base::for_.RunReduce( - MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get()); - } - - double RealInnerProduct(const State& state1, const State& state2) const { - if (state1.num_qubits() != state2.num_qubits()) { - return std::nan(""); - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p1, const fp_type* p2) -> double { - __m128 re1 = _mm_load_ps(p1 + 8 * i); - __m128 im1 = _mm_load_ps(p1 + 8 * i + 4); - __m128 re2 = _mm_load_ps(p2 + 8 * i); - __m128 im2 = _mm_load_ps(p2 + 8 * i + 4); - - __m128 ip_re = _mm_add_ps(_mm_mul_ps(re1, re2), _mm_mul_ps(im1, im2)); - - return detail::HorizontalSumSSE(ip_re); - }; - - using Op = std::plus; - return Base::for_.RunReduce( - MinSize(state1.num_qubits()) / 8, f, Op(), state1.get(), state2.get()); - } - - template - 
std::vector Sample( - const State& state, uint64_t num_samples, unsigned seed) const { - std::vector bitstrings; - - if (num_samples > 0) { - double norm = 0; - uint64_t size = MinSize(state.num_qubits()) / 8; - const fp_type* p = state.get(); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 4; ++j) { - double re = p[8 * k + j]; - double im = p[8 * k + 4 + j]; - norm += re * re + im * im; - } - } - - auto rs = GenerateRandomValues(num_samples, seed, norm); - - uint64_t m = 0; - double csum = 0; - bitstrings.reserve(num_samples); - - for (uint64_t k = 0; k < size; ++k) { - for (unsigned j = 0; j < 4; ++j) { - double re = p[8 * k + j]; - double im = p[8 * k + 4 + j]; - csum += re * re + im * im; - while (rs[m] < csum && m < num_samples) { - bitstrings.emplace_back(4 * k + j); - ++m; - } - } - } - - for (; m < num_samples; ++m) { - bitstrings.emplace_back((uint64_t{1} << state.num_qubits()) - 1); - } - } - - return bitstrings; - } - - using MeasurementResult = typename Base::MeasurementResult; - - void Collapse(const MeasurementResult& mr, State& state) const { - __m128 zero = _mm_set1_ps(0); - - auto f1 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask, - uint64_t bits, __m128 zero, const fp_type* p) -> double { - __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits)); - - __m128 re = _mm_load_ps(p + 8 * i); - __m128 im = _mm_load_ps(p + 8 * i + 4); - __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)); - - s1 = _mm_blendv_ps(zero, s1, ml); - - return detail::HorizontalSumSSE(s1); - }; - - using Op = std::plus; - double norm = Base::for_.RunReduce(MinSize(state.num_qubits()) / 8, f1, - Op(), mr.mask, mr.bits, zero, - state.get()); - - __m128 renorm = _mm_set1_ps(1.0 / std::sqrt(norm)); - - auto f2 = [](unsigned n, unsigned m, uint64_t i, uint64_t mask, - uint64_t bits, __m128 renorm, __m128 zero, fp_type* p) { - __m128 ml = _mm_castsi128_ps(detail::GetZeroMaskSSE(4 * i, mask, bits)); - - __m128 re = _mm_load_ps(p + 
8 * i); - __m128 im = _mm_load_ps(p + 8 * i + 4); - - re = _mm_blendv_ps(zero, _mm_mul_ps(re, renorm), ml); - im = _mm_blendv_ps(zero, _mm_mul_ps(im, renorm), ml); - - _mm_store_ps(p + 8 * i, re); - _mm_store_ps(p + 8 * i + 4, im); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f2, - mr.mask, mr.bits, renorm, zero, state.get()); - } - - std::vector PartialNorms(const State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* p) -> double { - __m128 re = _mm_load_ps(p + 8 * i); - __m128 im = _mm_load_ps(p + 8 * i + 4); - __m128 s1 = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)); - - return detail::HorizontalSumSSE(s1); - }; - - using Op = std::plus; - return Base::for_.RunReduceP( - MinSize(state.num_qubits()) / 8, f, Op(), state.get()); - } - - uint64_t FindMeasuredBits( - unsigned m, double r, uint64_t mask, const State& state) const { - double csum = 0; - - uint64_t k0 = Base::for_.GetIndex0(MinSize(state.num_qubits()) / 8, m); - uint64_t k1 = Base::for_.GetIndex1(MinSize(state.num_qubits()) / 8, m); - - const fp_type* p = state.get(); - - for (uint64_t k = k0; k < k1; ++k) { - for (uint64_t j = 0; j < 4; ++j) { - auto re = p[8 * k + j]; - auto im = p[8 * k + 4 + j]; - csum += re * re + im * im; - if (r < csum) { - return (4 * k + j) & mask; - } - } - } - - // Return the last bitstring in the unlikely case of underflow. - return (4 * k1 - 1) & mask; - } -}; - -} // namespace qsim - -#endif // STATESPACE_SSE_H_ diff --git a/tpls/qsim/umux.h b/tpls/qsim/umux.h deleted file mode 100644 index 83b951b..0000000 --- a/tpls/qsim/umux.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UMUX_H_ -#define UMUX_H_ - -#ifdef __AVX512F__ -# include "unitary_calculator_avx512.h" - namespace qsim { - namespace unitary { - template - using UnitaryCalculator = UnitaryCalculatorAVX512; - } - } -#elif __AVX2__ -# include "unitary_calculator_avx.h" - namespace qsim { - namespace unitary { - template - using UnitaryCalculator = UnitaryCalculatorAVX; - } - } -#elif __SSE4_1__ -# include "unitary_calculator_sse.h" - namespace qsim { - namespace unitary { - template - using UnitaryCalculator = UnitaryCalculatorSSE; - } - } -#else -# include "unitary_calculator_basic.h" - namespace qsim { - namespace unitary { - template - using UnitaryCalculator = UnitaryCalculatorBasic; - } - } -#endif - -#endif // UMUX_H_ diff --git a/tpls/qsim/unitary_calculator_avx.h b/tpls/qsim/unitary_calculator_avx.h deleted file mode 100644 index 5e566ca..0000000 --- a/tpls/qsim/unitary_calculator_avx.h +++ /dev/null @@ -1,1028 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UNITARY_CALCULATOR_AVX_H_ -#define UNITARY_CALCULATOR_AVX_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "unitaryspace_avx.h" - -namespace qsim { -namespace unitary { - -/** - * Quantum circuit unitary calculator with AVX vectorization. - */ -template -class UnitaryCalculatorAVX final : public SimulatorBase { - public: - using UnitarySpace = UnitarySpaceAVX; - using Unitary = typename UnitarySpace::Unitary; - using fp_type = typename UnitarySpace::fp_type; - - using StateSpace = UnitarySpace; - using State = Unitary; - - template - explicit UnitaryCalculatorAVX(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using AVX instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 1: - if (qs[0] > 2) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 2) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 2) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<2, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<1, 2>(qs, matrix, state); - } else { - ApplyGateL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 2) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<3, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<2, 2>(qs, matrix, state); - } else { - ApplyGateL<1, 3>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 2) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<4, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<3, 2>(qs, matrix, state); - } else { - ApplyGateL<2, 3>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 2) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 2) { - ApplyGateL<5, 1>(qs, matrix, state); - } else if (qs[2] > 2) { - ApplyGateL<4, 2>(qs, matrix, state); - } else { - ApplyGateL<3, 3>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using AVX instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- // Assume cqs[0] < cqs[1] < cqs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 1: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 4: - if (qs[0] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } 
- } else if (qs[1] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 2) { - if (cqs[0] > 2) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 2) { - ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 8; - } - - private: - -#ifdef __BMI2__ - - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - 
_mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - auto m = GetMasks1(qs); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, m.imaskh, m.qmaskh, size, raw_size, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, const __m256i* idx, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm256_load_ps(p0 + p); - is[k2] = _mm256_load_ps(p0 + p + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + 
L)]; - - auto m = GetMasks2(qs); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - - unsigned k = 3 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm256_load_ps(p0 + p); - is[k] = _mm256_load_ps(p0 + p + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256 w[1 << (1 + 2 * H)]; - - auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - const __m256i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm256_load_ps(p0 + p); - is[k2] = _mm256_load_ps(p0 + p + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm256_store_ps(p0 + p, rn); - _mm256_store_ps(p0 + p + 8, in); - } - }; - - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size 
= UnitarySpace::MinRowSize(state.num_qubits()); - - if (CH) { - auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, - m.cvalsh, idx, size, raw_size, state.get()); - } else { - auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, - m.cvalsh, idx, size, raw_size, state.get()); - } - } - -#else // __BMI2__ - - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = 8 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn 
= _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, const __m256i* idx, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - uint64_t r = 8 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - rs[k2] = _mm256_load_ps(p0 + xss[k]); - is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = 
_mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, idx, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 ru, iu, rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = 8 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_mul_ps(rs[0], ru); - in = _mm256_mul_ps(rs[0], iu); - rn = _mm256_fnmadd_ps(is[0], iu, rn); - in = _mm256_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = 
_mm256_set1_ps(v[j]); - iu = _mm256_set1_ps(v[j + 1]); - rn = _mm256_fmadd_ps(rs[l], ru, rn); - in = _mm256_fmadd_ps(rs[l], iu, in); - rn = _mm256_fnmadd_ps(is[l], iu, rn); - in = _mm256_fmadd_ps(is[l], ru, in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m256 rn, in; - __m256 rs[hsize], is[hsize]; - - uint64_t r = 8 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm256_load_ps(p0 + xss[k]); - is[k] = _mm256_load_ps(p0 + xss[k] + 8); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], 
in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256 w[1 << (1 + 2 * H)]; - - auto m = GetMasks8<3>(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m256* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, const __m256i* idx, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m256 rn, in; - __m256 rs[gsize], is[gsize]; - - uint64_t r = 8 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm256_load_ps(p0 + xss[k]); - is[k2] = _mm256_load_ps(p0 + xss[k] + 8); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm256_permutevar8x32_ps(rs[k2], idx[l - 1]); - is[k2 + l] = _mm256_permutevar8x32_ps(is[k2], idx[l - 1]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = 
_mm256_mul_ps(rs[0], w[j]); - in = _mm256_mul_ps(rs[0], w[j + 1]); - rn = _mm256_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm256_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm256_fmadd_ps(rs[l], w[j], rn); - in = _mm256_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm256_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm256_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - _mm256_store_ps(p0 + xss[k], rn); - _mm256_store_ps(p0 + xss[k] + 8, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m256i idx[1 << L]; - __m256 w[1 << (1 + 2 * H + L)]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 3 + H; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - if (CH) { - auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size * size2, f, w, ms, xss, m.cvalsh, - m.cmaskh, idx, size, raw_size, state.get()); - } else { - auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size * size2, f, w, ms, xss, m.cvalsh, - m.cmaskh, idx, size, raw_size, state.get()); - } - } - -#endif // __BMI2__ - - template - static void FillPermutationIndices(unsigned qmaskl, __m256i* idx) { - constexpr unsigned lsize = 1 << L; - - for (unsigned i = 0; i < lsize - 1; ++i) { - unsigned p[8]; - - for (unsigned j = 0; j < 8; ++j) { - p[j] = MaskedAdd<3>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); - } - - idx[i] = _mm256_set_epi32(p[7], p[6], p[5], p[4], p[3], p[2], p[1], p[0]); - } - } - - For for_; -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARY_CALCULATOR_AVX_H_ diff --git 
a/tpls/qsim/unitary_calculator_avx512.h b/tpls/qsim/unitary_calculator_avx512.h deleted file mode 100644 index 8105367..0000000 --- a/tpls/qsim/unitary_calculator_avx512.h +++ /dev/null @@ -1,644 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARY_CALCULATOR_AVX512_H_ -#define UNITARY_CALCULATOR_AVX512_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "unitaryspace_avx512.h" - -namespace qsim { -namespace unitary { - -/** - * Quantum circuit unitary calculator with AVX512 vectorization. - */ -template -class UnitaryCalculatorAVX512 final : public SimulatorBase { - public: - using UnitarySpace = UnitarySpaceAVX512; - using Unitary = typename UnitarySpace::Unitary; - using fp_type = typename UnitarySpace::fp_type; - - using StateSpace = UnitarySpace; - using State = Unitary; - - template - explicit UnitaryCalculatorAVX512(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using AVX512 instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- - switch (qs.size()) { - case 1: - if (qs[0] > 3) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 3) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 3) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<2, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<1, 2>(qs, matrix, state); - } else { - ApplyGateL<0, 3>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 3) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<3, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<2, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<1, 3>(qs, matrix, state); - } else { - ApplyGateL<0, 4>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 3) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<4, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<3, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<2, 3>(qs, matrix, state); - } else { - ApplyGateL<1, 4>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 3) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 3) { - ApplyGateL<5, 1>(qs, matrix, state); - } else if (qs[2] > 3) { - ApplyGateL<4, 2>(qs, matrix, state); - } else if (qs[3] > 3) { - ApplyGateL<3, 3>(qs, matrix, state); - } else { - ApplyGateL<2, 4>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using AVX512 instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. 
- */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - // Assume cqs[0] < cqs[1] < cqs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 1: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 3, 0>(qs, cqs, cvals, matrix, state); - } - } - 
break; - case 4: - if (qs[0] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[2] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } else if (qs[3] > 3) { - if (cqs[0] > 3) { - ApplyControlledGateL<1, 3, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 3, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 3) { - ApplyControlledGateL<0, 4, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 4, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * @return The size of SIMD register if applicable. 
- */ - static unsigned SIMDRegisterSize() { - return 16; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 ru, iu, rn, in; - __m512 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[l], ru, rn); - in = _mm512_fmadd_ps(rs[l], iu, in); - rn = _mm512_fnmadd_ps(is[l], iu, rn); - in = _mm512_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - auto m = GetMasks1(qs); - - unsigned k = 4 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, m.imaskh, m.qmaskh, size, raw_size, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, const __m512i* idx, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m512 rn, in; - __m512 rs[gsize], is[gsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + _pdep_u64(r, imaskh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm512_load_ps(p0 + p); - is[k2] = _mm512_load_ps(p0 + p + 16); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); - is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512i idx[1 << L]; - __m512 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks2(qs); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 4 + H; - unsigned n = 
state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, m.imaskh, m.qmaskh, idx, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 ru, iu, rn, in; - __m512 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_mul_ps(rs[0], ru); - in = _mm512_mul_ps(rs[0], iu); - rn = _mm512_fnmadd_ps(is[0], iu, rn); - in = _mm512_fmadd_ps(is[0], ru, in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm512_set1_ps(v[j]); - iu = _mm512_set1_ps(v[j + 1]); - rn = _mm512_fmadd_ps(rs[l], ru, rn); - in = _mm512_fmadd_ps(rs[l], iu, in); - rn = _mm512_fnmadd_ps(is[l], iu, rn); - in = _mm512_fmadd_ps(is[l], ru, in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - auto m = GetMasks3(state.num_qubits(), qs, cqs, cvals); - - unsigned k = 4 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m512 rn, in; - __m512 rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k] = _mm512_load_ps(p0 + p); - is[k] = _mm512_load_ps(p0 + p + 16); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512 w[1 << (1 + 2 * H)]; - - auto m = GetMasks4(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned k = 4 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, m.imaskh, m.qmaskh, m.cvalsh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m512* w, - uint64_t imaskh, uint64_t qmaskh, uint64_t cvalsh, - const __m512i* idx, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m512 rn, in; - __m512 rs[gsize], is[gsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - auto p0 = rstate + row_size * s + (_pdep_u64(r, imaskh) | cvalsh); - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - uint64_t p = _pdep_u64(k, qmaskh); - - rs[k2] = _mm512_load_ps(p0 + p); - is[k2] = _mm512_load_ps(p0 + p + 16); - - for (unsigned l = 1; l < lsize; ++l) { - rs[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], rs[k2]); - is[k2 + l] = _mm512_permutexvar_ps(idx[l - 1], is[k2]); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm512_mul_ps(rs[0], w[j]); - in = _mm512_mul_ps(rs[0], w[j + 1]); - rn = _mm512_fnmadd_ps(is[0], w[j + 1], rn); - in = _mm512_fmadd_ps(is[0], w[j], in); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm512_fmadd_ps(rs[l], w[j], rn); - in = _mm512_fmadd_ps(rs[l], w[j + 1], in); - rn = _mm512_fnmadd_ps(is[l], w[j + 1], rn); - in = _mm512_fmadd_ps(is[l], w[j], in); - - j += 2; - } - - uint64_t p = _pdep_u64(k, qmaskh); - - _mm512_store_ps(p0 + p, rn); - _mm512_store_ps(p0 + p + 16, in); - } - }; - - __m512i idx[1 << L]; - __m512 w[1 << (1 + 2 * H + L)]; - - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = 
UnitarySpace::MinRowSize(state.num_qubits()); - - if (CH) { - auto m = GetMasks5(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 4 + H + cqs.size(); - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, - m.cvalsh, idx, size, raw_size, state.get()); - } else { - auto m = GetMasks6(state.num_qubits(), qs, cqs, cvals); - FillPermutationIndices(m.qmaskl, idx); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 4 + H + cqs.size() - m.cl; - unsigned n = state.num_qubits() > k ? state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - - for_.Run(size * size2, f, w, m.imaskh, m.qmaskh, - m.cvalsh, idx, size, raw_size, state.get()); - } - } - - template - static void FillPermutationIndices(unsigned qmaskl, __m512i* idx) { - constexpr unsigned lsize = 1 << L; - - for (unsigned i = 0; i < lsize; ++i) { - unsigned p[16]; - - for (unsigned j = 0; j < 16; ++j) { - p[j] = MaskedAdd<4>(j, i + 1, qmaskl, lsize) | (j & (-1 ^ qmaskl)); - } - - idx[i] = _mm512_set_epi32(p[15], p[14], p[13], p[12], p[11], p[10], - p[9], p[8], p[7], p[6], p[5], p[4], - p[3], p[2], p[1], p[0]); - } - } - - For for_; -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARY_CALCULATOR_AVX512_H_ diff --git a/tpls/qsim/unitary_calculator_basic.h b/tpls/qsim/unitary_calculator_basic.h deleted file mode 100644 index 6b1821a..0000000 --- a/tpls/qsim/unitary_calculator_basic.h +++ /dev/null @@ -1,259 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARY_CALCULATOR_BASIC_H_ -#define UNITARY_CALCULATOR_BASIC_H_ - -#include -#include -#include -#include - -#include "simulator.h" -#include "unitaryspace_basic.h" - -namespace qsim { -namespace unitary { - -/** - * Quantum circuit unitary calculator without vectorization. - */ -template -class UnitaryCalculatorBasic final : public SimulatorBase { - public: - using UnitarySpace = UnitarySpaceBasic; - using Unitary = typename UnitarySpace::Unitary; - using fp_type = typename UnitarySpace::fp_type; - - using StateSpace = UnitarySpace; - using State = Unitary; - - template - explicit UnitaryCalculatorBasic(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using non-vectorized instructions. - * @param qs Indices of the qubits affected by this gate. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 1: - ApplyGateH<1>(qs, matrix, state); - break; - case 2: - ApplyGateH<2>(qs, matrix, state); - break; - case 3: - ApplyGateH<3>(qs, matrix, state); - break; - case 4: - ApplyGateH<4>(qs, matrix, state); - break; - case 5: - ApplyGateH<5>(qs, matrix, state); - break; - case 6: - ApplyGateH<6>(qs, matrix, state); - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using non-vectorized instructions. 
- * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 1: - ApplyControlledGateH<1>(qs, cqs, cvals, matrix, state); - break; - case 2: - ApplyControlledGateH<2>(qs, cqs, cvals, matrix, state); - break; - case 3: - ApplyControlledGateH<3>(qs, cqs, cvals, matrix, state); - break; - case 4: - ApplyControlledGateH<4>(qs, cqs, cvals, matrix, state); - break; - default: - // Not implemented. - break; - } - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 1; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - fp_type rn, in; - fp_type rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 1); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn += rs[l] * v[j] - is[l] * v[j + 1]; - in += rs[l] * v[j + 1] + is[l] * v[j]; 
- - j += 2; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 1) = in; - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned n = state.num_qubits() > H ? state.num_qubits() - H : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); - } - - template - void ApplyControlledGateH(const std::vector& qs, - const std::vector& cqs, - uint64_t cvals, const fp_type* matrix, - State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - fp_type rn, in; - fp_type rs[hsize], is[hsize]; - - uint64_t r = i % size; - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) == cvalsh) { - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = *(p0 + xss[k]); - is[k] = *(p0 + xss[k] + 1); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = rs[0] * v[j] - is[0] * v[j + 1]; - in = rs[0] * v[j + 1] + is[0] * v[j]; - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn += rs[l] * v[j] - is[l] * v[j + 1]; - in += rs[l] * v[j + 1] + is[l] * v[j]; - - j += 2; - } - - *(p0 + xss[k]) = rn; - *(p0 + xss[k] + 1) = in; - } - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - - unsigned n = state.num_qubits() > H ? 
state.num_qubits() - H : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); - } - - For for_; -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARY_CALCULATOR_BASIC_H_ diff --git a/tpls/qsim/unitary_calculator_sse.h b/tpls/qsim/unitary_calculator_sse.h deleted file mode 100644 index a3c3f2e..0000000 --- a/tpls/qsim/unitary_calculator_sse.h +++ /dev/null @@ -1,639 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARY_CALCULATOR_SSE_H_ -#define UNITARY_CALCULATOR_SSE_H_ - -#include - -#include -#include -#include -#include - -#include "simulator.h" -#include "unitaryspace_sse.h" - -namespace qsim { -namespace unitary { - -/** - * Quantum circuit unitary calculator with SSE vectorization. - */ -template -class UnitaryCalculatorSSE final : public SimulatorBase { - public: - using UnitarySpace = UnitarySpaceSSE; - using Unitary = typename UnitarySpace::Unitary; - using fp_type = typename UnitarySpace::fp_type; - - using StateSpace = UnitarySpace; - using State = Unitary; - - template - explicit UnitaryCalculatorSSE(ForArgs&&... args) : for_(args...) {} - - /** - * Applies a gate using SSE instructions. - * @param qs Indices of the qubits affected by this gate. 
- * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyGate(const std::vector& qs, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . - - switch (qs.size()) { - case 1: - if (qs[0] > 1) { - ApplyGateH<1>(qs, matrix, state); - } else { - ApplyGateL<0, 1>(qs, matrix, state); - } - break; - case 2: - if (qs[0] > 1) { - ApplyGateH<2>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<1, 1>(qs, matrix, state); - } else { - ApplyGateL<0, 2>(qs, matrix, state); - } - break; - case 3: - if (qs[0] > 1) { - ApplyGateH<3>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<2, 1>(qs, matrix, state); - } else { - ApplyGateL<1, 2>(qs, matrix, state); - } - break; - case 4: - if (qs[0] > 1) { - ApplyGateH<4>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<3, 1>(qs, matrix, state); - } else { - ApplyGateL<2, 2>(qs, matrix, state); - } - break; - case 5: - if (qs[0] > 1) { - ApplyGateH<5>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<4, 1>(qs, matrix, state); - } else { - ApplyGateL<3, 2>(qs, matrix, state); - } - break; - case 6: - if (qs[0] > 1) { - ApplyGateH<6>(qs, matrix, state); - } else if (qs[1] > 1) { - ApplyGateL<5, 1>(qs, matrix, state); - } else { - ApplyGateL<4, 2>(qs, matrix, state); - } - break; - default: - // Not implemented. - break; - } - } - - /** - * Applies a controlled gate using SSE instructions. - * @param qs Indices of the qubits affected by this gate. - * @param cqs Indices of control qubits. - * @param cvals Bit mask of control qubit values. - * @param matrix Matrix representation of the gate to be applied. - * @param state The state of the system, to be updated by this method. - */ - void ApplyControlledGate(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - // Assume qs[0] < qs[1] < qs[2] < ... . 
- // Assume cqs[0] < cqs[1] < cqs[2] < ... . - - if (cqs.size() == 0) { - ApplyGate(qs, matrix, state); - return; - } - - switch (qs.size()) { - case 1: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<1>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<0, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 1, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 2: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<2>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<2>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateL<1, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<0, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<0, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 3: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<3>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<3>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateL<2, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 1, 0>(qs, cqs, cvals, matrix, state); - } - } else { - if (cqs[0] > 1) { - ApplyControlledGateL<1, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<1, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - case 4: - if (qs[0] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateHH<4>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateHL<4>(qs, cqs, cvals, matrix, state); - } - } else if (qs[1] > 1) { - if (cqs[0] > 1) { - ApplyControlledGateL<3, 1, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<3, 1, 0>(qs, cqs, cvals, matrix, state); - } 
- } else { - if (cqs[0] > 1) { - ApplyControlledGateL<2, 2, 1>(qs, cqs, cvals, matrix, state); - } else { - ApplyControlledGateL<2, 2, 0>(qs, cqs, cvals, matrix, state); - } - } - break; - default: - // Not implemented. - break; - } - } - - /** - * @return The size of SIMD register if applicable. - */ - static unsigned SIMDRegisterSize() { - return 4; - } - - private: - template - void ApplyGateH(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t size, - uint64_t row_size, fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 ru, iu, rn, in; - __m128 rs[hsize], is[hsize]; - - uint64_t r = 4 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, matrix, ms, xss, size, raw_size, state.get()); - } - - template - void ApplyGateL(const std::vector& qs, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, unsigned q0, - uint64_t size, uint64_t row_size, fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m128 rn, in; - __m128 rs[gsize], is[gsize]; - - uint64_t r = 4 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm_load_ps(p0 + xss[k]); - is[k2] = _mm_load_ps(p0 + xss[k] + 4); - - if (L == 1) { - rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) - : _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) - : _mm_shuffle_ps(is[k2], is[k2], 78); - } else if (L == 2) { - rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); - is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); - rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); - rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); - is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H + L)]; - - auto m = GetMasks11(qs); - - FillIndices(state.num_qubits(), qs, ms, xss); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, w, ms, xss, qs[0], size, raw_size, state.get()); - } - - template - void ApplyControlledGateHH(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const fp_type* v, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 ru, iu, rn, in; - __m128 rs[hsize], is[hsize]; - - uint64_t r = 4 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_mul_ps(rs[0], ru); - in = _mm_mul_ps(rs[0], iu); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[0], ru)); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - ru = _mm_set1_ps(v[j]); - iu = _mm_set1_ps(v[j + 1]); - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], ru)); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], iu)); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], iu)); - in = _mm_add_ps(in, _mm_mul_ps(is[l], ru)); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - - auto m = GetMasks7(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - matrix, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateHL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned hsize = 1 << H; - - __m128 rn, in; - __m128 rs[hsize], is[hsize]; - - uint64_t r = 4 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - rs[k] = _mm_load_ps(p0 + xss[k]); - is[k] = _mm_load_ps(p0 + xss[k] + 4); - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < hsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H)]; - - auto m = GetMasks8<2>(state.num_qubits(), qs, cqs, cvals); - FillIndices(state.num_qubits(), qs, ms, xss); - FillControlledMatrixH(m.cvalsl, m.cmaskl, matrix, (fp_type*) w); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - for_.Run(size * size2, f, - w, ms, xss, m.cvalsh, m.cmaskh, size, raw_size, state.get()); - } - - template - void ApplyControlledGateL(const std::vector& qs, - const std::vector& cqs, uint64_t cvals, - const fp_type* matrix, State& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, const __m128* w, - const uint64_t* ms, const uint64_t* xss, uint64_t cvalsh, - uint64_t cmaskh, unsigned q0, uint64_t size, uint64_t row_size, - fp_type* rstate) { - constexpr unsigned gsize = 1 << (H + L); - constexpr unsigned hsize = 1 << H; - constexpr unsigned lsize = 1 << L; - - __m128 rn, in; - __m128 rs[gsize], is[gsize]; - - uint64_t r = 4 * (i % size); - uint64_t s = i / size; - - uint64_t t = r & ms[0]; - for (unsigned j = 1; j <= H; ++j) { - r *= 2; - t |= r & ms[j]; - } - - if ((t & cmaskh) != cvalsh) return; - - auto p0 = rstate + row_size * s + 2 * t; - - for (unsigned k = 0; k < hsize; ++k) { - unsigned k2 = lsize * k; - - rs[k2] = _mm_load_ps(p0 + xss[k]); - is[k2] = _mm_load_ps(p0 + xss[k] + 4); - - if (L == 1) { - rs[k2 + 1] = q0 == 0 ? _mm_shuffle_ps(rs[k2], rs[k2], 177) - : _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 1] = q0 == 0 ? 
_mm_shuffle_ps(is[k2], is[k2], 177) - : _mm_shuffle_ps(is[k2], is[k2], 78); - } else if (L == 2) { - rs[k2 + 1] = _mm_shuffle_ps(rs[k2], rs[k2], 57); - is[k2 + 1] = _mm_shuffle_ps(is[k2], is[k2], 57); - rs[k2 + 2] = _mm_shuffle_ps(rs[k2], rs[k2], 78); - is[k2 + 2] = _mm_shuffle_ps(is[k2], is[k2], 78); - rs[k2 + 3] = _mm_shuffle_ps(rs[k2], rs[k2], 147); - is[k2 + 3] = _mm_shuffle_ps(is[k2], is[k2], 147); - } - } - - uint64_t j = 0; - - for (unsigned k = 0; k < hsize; ++k) { - rn = _mm_mul_ps(rs[0], w[j]); - in = _mm_mul_ps(rs[0], w[j + 1]); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[0], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[0], w[j])); - - j += 2; - - for (unsigned l = 1; l < gsize; ++l) { - rn = _mm_add_ps(rn, _mm_mul_ps(rs[l], w[j])); - in = _mm_add_ps(in, _mm_mul_ps(rs[l], w[j + 1])); - rn = _mm_sub_ps(rn, _mm_mul_ps(is[l], w[j + 1])); - in = _mm_add_ps(in, _mm_mul_ps(is[l], w[j])); - - j += 2; - } - - _mm_store_ps(p0 + xss[k], rn); - _mm_store_ps(p0 + xss[k] + 4, in); - } - }; - - uint64_t ms[H + 1]; - uint64_t xss[1 << H]; - __m128 w[1 << (1 + 2 * H + L)]; - - FillIndices(state.num_qubits(), qs, ms, xss); - - unsigned k = 2 + H; - unsigned n = state.num_qubits() > k ? 
state.num_qubits() - k : 0; - uint64_t size = uint64_t{1} << n; - uint64_t size2 = uint64_t{1} << state.num_qubits(); - uint64_t raw_size = UnitarySpace::MinRowSize(state.num_qubits()); - - if (CH) { - auto m = GetMasks9(state.num_qubits(), qs, cqs, cvals); - FillMatrix(m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size * size2, f, w, ms, xss, - m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get()); - } else { - auto m = GetMasks10(state.num_qubits(), qs, cqs, cvals); - FillControlledMatrixL( - m.cvalsl, m.cmaskl, m.qmaskl, matrix, (fp_type*) w); - - for_.Run(size * size2, f, w, ms, xss, - m.cvalsh, m.cmaskh, qs[0], size, raw_size, state.get()); - } - } - - For for_; -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARY_CALCULATOR_SSE_H_ diff --git a/tpls/qsim/unitaryspace.h b/tpls/qsim/unitaryspace.h deleted file mode 100644 index b5e2691..0000000 --- a/tpls/qsim/unitaryspace.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARYSPACE_H_ -#define UNITARYSPACE_H_ - -#include - -namespace qsim { - -namespace unitary { - -/** - * Abstract class containing routines for general unitary matrix manipulations. - * "AVX", "AVX512", "Basic", and "SSE" implementations are provided. - */ -template class VectorSpace, typename... 
VSTypeParams> -class UnitarySpace : public VectorSpace { - private: - using Base = VectorSpace; - - public: - using fp_type = typename Base::fp_type; - using Unitary = typename Base::Vector; - - template - UnitarySpace(ForArgs&&... args) : Base(args...) {} - - static Unitary CreateUnitary(unsigned num_qubits) { - return Base::Create(num_qubits); - } - - static Unitary CreateUnitary(fp_type* p, unsigned num_qubits) { - return Base::Create(p, num_qubits); - } - - static Unitary NullUnitary() { - return Base::Null(); - } - - static uint64_t Size(unsigned num_qubits) { - return uint64_t{1} << num_qubits; - }; - - void CopyUnitary(const Unitary& src, Unitary& dest) const { - Base::Copy(src, dest); - } -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARYSPACE_H_ diff --git a/tpls/qsim/unitaryspace_avx.h b/tpls/qsim/unitaryspace_avx.h deleted file mode 100644 index c1ec59d..0000000 --- a/tpls/qsim/unitaryspace_avx.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARYSPACE_AVX_H_ -#define UNITARYSPACE_AVX_H_ - -#include - -#include -#include -#include -#include - -#include "unitaryspace.h" -#include "vectorspace.h" - -namespace qsim { - -namespace unitary { - -/** - * Object containing context and routines for unitary manipulations. - * Unitary is a vectorized sequence of eight real components followed by eight - * imaginary components. 
Eight single-precison floating numbers can be loaded - * into an AVX register. - */ -template -struct UnitarySpaceAVX : - public UnitarySpace, VectorSpace, For, float> { - private: - using Base = UnitarySpace, - qsim::VectorSpace, For, float>; - - public: - using Unitary = typename Base::Unitary; - using fp_type = typename Base::fp_type; - - template - explicit UnitarySpaceAVX(ForArgs&&... args) : Base(args...) {} - - static uint64_t MinRowSize(unsigned num_qubits) { - return std::max(uint64_t{16}, 2 * (uint64_t{1} << num_qubits)); - }; - - static uint64_t MinSize(unsigned num_qubits) { - return Base::Size(num_qubits) * MinRowSize(num_qubits); - }; - - void SetAllZeros(Unitary& state) const { - __m256 val0 = _mm256_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m256& val, fp_type* p) { - _mm256_store_ps(p + 16 * i, val); - _mm256_store_ps(p + 16 * i + 8, val); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 16, f, val0, state.get()); - } - - void SetIdentity(Unitary& state) { - SetAllZeros(state); - - auto f = [](unsigned n, unsigned m, uint64_t i, - uint64_t row_size, fp_type* p) { - p[row_size * i + (16 * (i / 8)) + (i % 8)] = 1; - }; - - uint64_t size = Base::Size(state.num_qubits()); - uint64_t row_size = MinRowSize(state.num_qubits()); - Base::for_.Run(size, f, row_size, state.get()); - } - - static std::complex GetEntry(const Unitary& state, - uint64_t i, uint64_t j) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (16 * (j / 8)) + (j % 8); - return std::complex(state.get()[row_size * i + k], - state.get()[row_size * i + k + 8]); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, - const std::complex& ampl) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (16 * (j / 8)) + (j % 8); - state.get()[row_size * i + k] = std::real(ampl); - state.get()[row_size * i + k + 8] = std::imag(ampl); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, - 
fp_type im) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (16 * (j / 8)) + (j % 8); - state.get()[row_size * i + k] = re; - state.get()[row_size * i + k + 8] = im; - } -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARYSPACE_AVX_H_ diff --git a/tpls/qsim/unitaryspace_avx512.h b/tpls/qsim/unitaryspace_avx512.h deleted file mode 100644 index 4c23dc9..0000000 --- a/tpls/qsim/unitaryspace_avx512.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARYSPACE_AVX512_H_ -#define UNITARYSPACE_AVX512_H_ - -#include - -#include -#include -#include -#include - -#include "unitaryspace.h" -#include "vectorspace.h" - -namespace qsim { - -namespace unitary { - -/** - * Object containing context and routines for unitary manipulations. - * State is a vectorized sequence of sixteen real components followed by - * sixteen imaginary components. Sixteen single-precison floating numbers can - * be loaded into an AVX512 register. - */ -template -struct UnitarySpaceAVX512 : - public UnitarySpace, VectorSpace, For, float> { - private: - using Base = UnitarySpace, - qsim::VectorSpace, For, float>; - - public: - using Unitary = typename Base::Unitary; - using fp_type = typename Base::fp_type; - - template - explicit UnitarySpaceAVX512(ForArgs&&... args) : Base(args...) 
{} - - static uint64_t MinRowSize(unsigned num_qubits) { - return std::max(uint64_t{32}, 2 * (uint64_t{1} << num_qubits)); - }; - - static uint64_t MinSize(unsigned num_qubits) { - return Base::Size(num_qubits) * MinRowSize(num_qubits); - }; - - void SetAllZeros(Unitary& state) const { - __m512 val0 = _mm512_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m512 val0, fp_type* p) { - _mm512_store_ps(p + 32 * i, val0); - _mm512_store_ps(p + 32 * i + 16, val0); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 32, f, val0, state.get()); - } - - void SetIdentity(Unitary& state) { - SetAllZeros(state); - - auto f = [](unsigned n, unsigned m, uint64_t i, - uint64_t row_size, fp_type* p) { - p[row_size * i + (32 * (i / 16)) + (i % 16)] = 1; - }; - - uint64_t size = Base::Size(state.num_qubits()); - uint64_t row_size = MinRowSize(state.num_qubits()); - Base::for_.Run(size, f, row_size, state.get()); - } - - static std::complex GetEntry(const Unitary& state, - uint64_t i, uint64_t j) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (32 * (j / 16)) + (j % 16); - return std::complex(state.get()[row_size * i + k], - state.get()[row_size * i + k + 16]); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, - const std::complex& ampl) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (32 * (j / 16)) + (j % 16); - state.get()[row_size * i + k] = std::real(ampl); - state.get()[row_size * i + k + 16] = std::imag(ampl); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, - fp_type im) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (32 * (j / 16)) + (j % 16); - state.get()[row_size * i + k] = re; - state.get()[row_size * i + k + 16] = im; - } -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARYSPACE_AVX512_H_ diff --git a/tpls/qsim/unitaryspace_basic.h b/tpls/qsim/unitaryspace_basic.h deleted file mode 100644 index 2db14b6..0000000 
--- a/tpls/qsim/unitaryspace_basic.h +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARYSPACE_BASIC_H_ -#define UNITARYSPACE_BASIC_H_ - -#include -#include -#include - -#include "unitaryspace.h" -#include "vectorspace.h" - -namespace qsim { - -namespace unitary { - -/** - * Object containing context and routines for unitary manipulations. - * Unitary is a non-vectorized sequence of one real amplitude followed by - * one imaginary amplitude. - */ -template -struct UnitarySpaceBasic - : public UnitarySpace, VectorSpace, For, FP> { - private: - using Base = UnitarySpace, - qsim::VectorSpace, For, FP>; - - public: - using Unitary = typename Base::Unitary; - using fp_type = typename Base::fp_type; - - template - explicit UnitarySpaceBasic(ForArgs&&... args) : Base(args...) 
{} - - static uint64_t MinRowSize(unsigned num_qubits) { - return 2 * (uint64_t{1} << num_qubits); - }; - - static uint64_t MinSize(unsigned num_qubits) { - return Base::Size(num_qubits) * MinRowSize(num_qubits); - }; - - void SetAllZeros(Unitary& state) const { - auto f = [](unsigned n, unsigned m, uint64_t i, fp_type* p) { - p[2 * i + 0] = 0; - p[2 * i + 1] = 0; - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 2, f, state.get()); - } - - void SetIdentity(Unitary& state) { - SetAllZeros(state); - - auto f = [](unsigned n, unsigned m, uint64_t i, - uint64_t row_size, fp_type* p) { - p[row_size * i + 2 * i] = 1; - }; - - uint64_t size = Base::Size(state.num_qubits()); - uint64_t row_size = MinRowSize(state.num_qubits()); - Base::for_.Run(size, f, row_size, state.get()); - } - - static std::complex GetEntry(const Unitary& state, - uint64_t i, uint64_t j) { - uint64_t row_size = MinRowSize(state.num_qubits()); - return std::complex(state.get()[row_size * i + 2 * j], - state.get()[row_size * i + 2 * j + 1]); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, - const std::complex& ampl) { - uint64_t row_size = MinRowSize(state.num_qubits()); - state.get()[row_size * i + 2 * j] = std::real(ampl); - state.get()[row_size * i + 2 * j + 1] = std::imag(ampl); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, - fp_type re, fp_type im) { - uint64_t row_size = MinRowSize(state.num_qubits()); - state.get()[row_size * i + 2 * j] = re; - state.get()[row_size * i + 2 * j + 1] = im; - } -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARYSPACE_BASIC_H_ diff --git a/tpls/qsim/unitaryspace_sse.h b/tpls/qsim/unitaryspace_sse.h deleted file mode 100644 index f3762fb..0000000 --- a/tpls/qsim/unitaryspace_sse.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UNITARYSPACE_SSE_H_ -#define UNITARYSPACE_SSE_H_ - -#include - -#include -#include -#include -#include - -#include "unitaryspace.h" -#include "vectorspace.h" - -namespace qsim { - -namespace unitary { - -/** - * Object containing context and routines for unitary manipulations. - * Unitary is a vectorized sequence of four real components followed by four - * imaginary components. Four single-precison floating numbers can be loaded - * into an SSE register. - */ -template -struct UnitarySpaceSSE : - public UnitarySpace, VectorSpace, For, float> { - private: - using Base = UnitarySpace, - qsim::VectorSpace, For, float>; - - public: - using Unitary = typename Base::Unitary; - using fp_type = typename Base::fp_type; - - template - explicit UnitarySpaceSSE(ForArgs&&... args) : Base(args...) 
{} - - static uint64_t MinRowSize(unsigned num_qubits) { - return std::max(uint64_t{8}, 2 * (uint64_t{1} << num_qubits)); - }; - - static uint64_t MinSize(unsigned num_qubits) { - return Base::Size(num_qubits) * MinRowSize(num_qubits); - }; - - void SetAllZeros(Unitary& state) const { - __m128 val0 = _mm_setzero_ps(); - - auto f = [](unsigned n, unsigned m, uint64_t i, __m128 val0, fp_type* p) { - _mm_store_ps(p + 8 * i, val0); - _mm_store_ps(p + 8 * i + 4, val0); - }; - - Base::for_.Run(MinSize(state.num_qubits()) / 8, f, val0, state.get()); - } - - void SetIdentity(Unitary& state) { - SetAllZeros(state); - - auto f = [](unsigned n, unsigned m, uint64_t i, - uint64_t row_size, fp_type* p) { - p[row_size * i + (8 * (i / 4)) + (i % 4)] = 1; - }; - - uint64_t size = Base::Size(state.num_qubits()); - uint64_t row_size = MinRowSize(state.num_qubits()); - Base::for_.Run(size, f, row_size, state.get()); - } - - static std::complex GetEntry(const Unitary& state, - uint64_t i, uint64_t j) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (8 * (j / 4)) + (j % 4); - return std::complex(state.get()[row_size * i + k], - state.get()[row_size * i + k + 4]); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, - const std::complex& ampl) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (8 * (j / 4)) + (j % 4); - state.get()[row_size * i + k] = std::real(ampl); - state.get()[row_size * i + k + 4] = std::imag(ampl); - } - - static void SetEntry(Unitary& state, uint64_t i, uint64_t j, fp_type re, - fp_type im) { - uint64_t row_size = MinRowSize(state.num_qubits()); - uint64_t k = (8 * (j / 4)) + (j % 4); - state.get()[row_size * i + k] = re; - state.get()[row_size * i + k + 4] = im; - } -}; - -} // namespace unitary -} // namespace qsim - -#endif // UNITARYSPACE_SSE_H_ diff --git a/tpls/qsim/util.h b/tpls/qsim/util.h deleted file mode 100644 index 726a019..0000000 --- a/tpls/qsim/util.h +++ /dev/null @@ -1,89 +0,0 @@ -// 
Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTIL_H_ -#define UTIL_H_ - -#include -#include -#include -#include -#include -#include -#include - -namespace qsim { - -template -inline void SplitString( - const std::string& str, char delim, Container& words) { - words.resize(0); - - std::string word; - std::stringstream ss(str); - - while (std::getline(ss, word, delim)) { - words.push_back(std::move(word)); - } -} - -template -inline void SplitString( - const std::string& str, char delim, Op op, Container& words) { - words.resize(0); - - std::string word; - std::stringstream ss(str); - - while (std::getline(ss, word, delim)) { - words.push_back(op(word)); - } -} - -inline double GetTime() { - using namespace std::chrono; - steady_clock::duration since_epoch = steady_clock::now().time_since_epoch(); - return double(since_epoch.count() * steady_clock::period::num) - / steady_clock::period::den; -} - -template -inline DistrRealType RandomValue(RGen& rgen, DistrRealType max_value) { - std::uniform_real_distribution distr(0.0, max_value); - return distr(rgen); -} - -template -inline std::vector GenerateRandomValues( - uint64_t num_samples, unsigned seed, DistrRealType max_value) { - std::vector rs; - rs.reserve(num_samples + 1); - - std::mt19937 rgen(seed); - std::uniform_real_distribution distr(0.0, max_value); - - for (uint64_t i = 0; i < num_samples; ++i) { - rs.emplace_back(distr(rgen)); - } - - 
std::sort(rs.begin(), rs.end()); - // Populate the final element to prevent sanitizer errors. - rs.emplace_back(max_value); - - return rs; -} - -} // namespace qsim - -#endif // UTIL_H_ diff --git a/tpls/qsim/util_cpu.h b/tpls/qsim/util_cpu.h deleted file mode 100644 index 8e02425..0000000 --- a/tpls/qsim/util_cpu.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTIL_CPU_H_ -#define UTIL_CPU_H_ - -#ifdef __SSE2__ -# include -#endif - -namespace qsim { - -// This function sets flush-to-zero and denormals-are-zeros MXCSR control -// flags. This prevents rare cases of performance slowdown potentially at -// the cost of a tiny precision loss. -inline void SetFlushToZeroAndDenormalsAreZeros() { -#ifdef __SSE2__ - _mm_setcsr(_mm_getcsr() | 0x8040); -#endif -} - -// This function clears flush-to-zero and denormals-are-zeros MXCSR control -// flags. -inline void ClearFlushToZeroAndDenormalsAreZeros() { -#ifdef __SSE2__ - _mm_setcsr(_mm_getcsr() & ~unsigned{0x8040}); -#endif -} - -} // namespace qsim - -#endif // UTIL_CPU_H_ diff --git a/tpls/qsim/util_cuda.h b/tpls/qsim/util_cuda.h deleted file mode 100644 index 5d8cb5d..0000000 --- a/tpls/qsim/util_cuda.h +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef UTIL_CUDA_H_ -#define UTIL_CUDA_H_ - -#ifdef __NVCC__ - #include -#elif __HIP__ - #include -#endif - -#include - -#include "io.h" - -namespace qsim { - -#define ErrorCheck(code) { ErrorAssert((code), __FILE__, __LINE__); } - -inline void ErrorAssert(cudaError_t code, const char* file, unsigned line) { - if (code != cudaSuccess) { - IO::errorf("CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line); - exit(code); - } -} - -template -struct Complex { - __host__ __device__ __forceinline__ Complex() {} - - __host__ __device__ __forceinline__ Complex(const T& re) : re(re), im(0) {} - - __host__ __device__ __forceinline__ Complex(const T& re, const T& im) - : re(re), im(im) {} - - template - __host__ __device__ __forceinline__ Complex& operator=( - const Complex& r) { - re = r.re; - im = r.im; - - return *this; - } - - T re; - T im; -}; - -template -__host__ __device__ __forceinline__ Complex operator+( - const Complex& l, const Complex& r) { - return Complex(l.re + r.re, l.im + r.im); -} - -template -__host__ __device__ __forceinline__ Complex operator+( - const Complex& l, const Complex& r) { - return Complex(l.re + r.re, l.im + r.im); -} - -template -struct Scalar { - using type = T; -}; - -template -struct Scalar> { - using type = T; -}; - -template -struct Plus { - template - __device__ __forceinline__ T operator()(const T& v1, const U& v2) const { - return v1 + v2; - } -}; - -template 
-struct Product { - __device__ __forceinline__ Complex operator()( - const T& re1, const T& im1, const T& re2, const T& im2) const { - return Complex(re1 * re2 + im1 * im2, re1 * im2 - im1 * re2); - } -}; - -template -struct RealProduct { - __device__ __forceinline__ T operator()( - const T& re1, const T& im1, const T& re2, const T& im2) const { - return re1 * re2 + im1 * im2; - } -}; - -template -__device__ __forceinline__ FP1 WarpReduce(FP1 val, Op op) { - for (unsigned i = warp_size / 2; i > 0; i /= 2) { - val = op(val, __shfl_down_sync(0xffffffff, val, i)); - } - - return val; -} - -template -__device__ __forceinline__ Complex WarpReduce(Complex val, Op op) { - for (unsigned i = warp_size / 2; i > 0; i /= 2) { - val.re = op(val.re, __shfl_down_sync(0xffffffff, val.re, i)); - val.im = op(val.im, __shfl_down_sync(0xffffffff, val.im, i)); - } - - return val; -} - -} // namespace qsim - -#endif // UTIL_CUDA_H_ diff --git a/tpls/qsim/util_custatevec.h b/tpls/qsim/util_custatevec.h deleted file mode 100644 index 36f29ef..0000000 --- a/tpls/qsim/util_custatevec.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef UTIL_CUSTATEVEC_H_ -#define UTIL_CUSTATEVEC_H_ - -#include -#include - -#include "io.h" -#include "util_cuda.h" - -namespace qsim { - -inline void ErrorAssert(cublasStatus_t code, const char* file, unsigned line) { - if (code != CUBLAS_STATUS_SUCCESS) { - IO::errorf("cuBLAS error %i: %s %d\n", code, file, line); - exit(code); - } -} - -inline void ErrorAssert( - custatevecStatus_t code, const char* file, unsigned line) { - if (code != CUSTATEVEC_STATUS_SUCCESS) { - IO::errorf("custatevec error: %s %s %d\n", - custatevecGetErrorString(code), file, line); - exit(code); - } -} - -} // namespace qsim - -#endif // UTIL_CUSTATEVEC_H_ diff --git a/tpls/qsim/vectorspace.h b/tpls/qsim/vectorspace.h deleted file mode 100644 index 7b33a53..0000000 --- a/tpls/qsim/vectorspace.h +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef VECTORSPACE_H_ -#define VECTORSPACE_H_ - -#ifdef _WIN32 - #include -#endif - -#include -#include -#include -#include - -namespace qsim { - -namespace detail { - -inline void do_not_free(void*) {} - -inline void free(void* ptr) { -#ifdef _WIN32 - _aligned_free(ptr); -#else - ::free(ptr); -#endif -} - -} // namespace detail - -// Routines for vector manipulations. 
-template -class VectorSpace { - public: - using fp_type = FP; - - private: - using Pointer = std::unique_ptr; - - public: - class Vector { - public: - Vector() = delete; - - Vector(Pointer&& ptr, unsigned num_qubits) - : ptr_(std::move(ptr)), num_qubits_(num_qubits) {} - - fp_type* get() { - return ptr_.get(); - } - - const fp_type* get() const { - return ptr_.get(); - } - - fp_type* release() { - num_qubits_ = 0; - return ptr_.release(); - } - - unsigned num_qubits() const { - return num_qubits_; - } - - bool requires_copy_to_host() const { - return false; - } - - private: - Pointer ptr_; - unsigned num_qubits_; - }; - - template - VectorSpace(ForArgs&&... args) : for_(args...) {} - - static Vector Create(unsigned num_qubits) { - auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); - #ifdef _WIN32 - Pointer ptr{(fp_type*) _aligned_malloc(size, 64), &detail::free}; - return Vector{std::move(ptr), ptr.get() != nullptr ? num_qubits : 0}; - #else - void* p = nullptr; - if (posix_memalign(&p, 64, size) == 0) { - return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; - } else { - return Null(); - } - #endif - } - - // It is the client's responsibility to make sure that p has at least - // Impl::MinSize(num_qubits) elements. 
- static Vector Create(fp_type* p, unsigned num_qubits) { - return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; - } - - static Vector Null() { - return Vector{Pointer{nullptr, &detail::free}, 0}; - } - - static bool IsNull(const Vector& vec) { - return vec.get() == nullptr; - } - - static void Free(fp_type* ptr) { - detail::free(ptr); - } - - bool Copy(const Vector& src, Vector& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* src, fp_type* dest) { - dest[i] = src[i]; - }; - - for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest.get()); - - return true; - } - - // It is the client's responsibility to make sure that dest has at least - // Impl::MinSize(src.num_qubits()) elements. - bool Copy(const Vector& src, fp_type* dest) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* src, fp_type* dest) { - dest[i] = src[i]; - }; - - for_.Run(Impl::MinSize(src.num_qubits()), f, src.get(), dest); - - return true; - } - - // It is the client's responsibility to make sure that src has at least - // Impl::MinSize(dest.num_qubits()) elements. - bool Copy(const fp_type* src, Vector& dest) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* src, fp_type* dest) { - dest[i] = src[i]; - }; - - for_.Run(Impl::MinSize(dest.num_qubits()), f, src, dest.get()); - - return true; - } - - // It is the client's responsibility to make sure that src has at least - // min(size, Impl::MinSize(dest.num_qubits())) elements. 
- bool Copy(const fp_type* src, uint64_t size, Vector& dest) const { - auto f = [](unsigned n, unsigned m, uint64_t i, - const fp_type* src, fp_type* dest) { - dest[i] = src[i]; - }; - - size = std::min(size, Impl::MinSize(dest.num_qubits())); - for_.Run(size, f, src, dest.get()); - - return true; - } - - void DeviceSync() {} - - protected: - For for_; -}; - -} // namespace qsim - -#endif // VECTORSPACE_H_ diff --git a/tpls/qsim/vectorspace_cuda.h b/tpls/qsim/vectorspace_cuda.h deleted file mode 100644 index fd91553..0000000 --- a/tpls/qsim/vectorspace_cuda.h +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright 2019 Google LLC. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef VECTORSPACE_CUDA_H_ -#define VECTORSPACE_CUDA_H_ - -#ifdef __NVCC__ - #include - #include -#elif __HIP__ - #include - #include "cuda2hip.h" -#endif - -#include -#include - -namespace qsim { - -namespace detail { - -inline void do_not_free(void*) {} - -inline void free(void* ptr) { - ErrorCheck(cudaFree(ptr)); -} - -} // namespace detail - -// Routines for vector manipulations. 
-template -class VectorSpaceCUDA { - public: - using fp_type = FP; - - private: - using Pointer = std::unique_ptr; - - public: - class Vector { - public: - Vector() = delete; - - Vector(Pointer&& ptr, unsigned num_qubits) - : ptr_(std::move(ptr)), num_qubits_(num_qubits) {} - - fp_type* get() { - return ptr_.get(); - } - - const fp_type* get() const { - return ptr_.get(); - } - - fp_type* release() { - num_qubits_ = 0; - return ptr_.release(); - } - - unsigned num_qubits() const { - return num_qubits_; - } - - bool requires_copy_to_host() const { - return true; - } - - private: - Pointer ptr_; - unsigned num_qubits_; - }; - - template - VectorSpaceCUDA(Args&&... args) {} - - static Vector Create(unsigned num_qubits) { - fp_type* p; - auto size = sizeof(fp_type) * Impl::MinSize(num_qubits); - auto rc = cudaMalloc(&p, size); - - if (rc == cudaSuccess) { - return Vector{Pointer{(fp_type*) p, &detail::free}, num_qubits}; - } else { - return Null(); - } - } - - // It is the client's responsibility to make sure that p has at least - // Impl::MinSize(num_qubits) elements. - static Vector Create(fp_type* p, unsigned num_qubits) { - return Vector{Pointer{p, &detail::do_not_free}, num_qubits}; - } - - static Vector Null() { - return Vector{Pointer{nullptr, &detail::free}, 0}; - } - - static bool IsNull(const Vector& vector) { - return vector.get() == nullptr; - } - - static void Free(fp_type* ptr) { - detail::free(ptr); - } - - bool Copy(const Vector& src, Vector& dest) const { - if (src.num_qubits() != dest.num_qubits()) { - return false; - } - - ErrorCheck( - cudaMemcpy(dest.get(), src.get(), - sizeof(fp_type) * Impl::MinSize(src.num_qubits()), - cudaMemcpyDeviceToDevice)); - - return true; - } - - // It is the client's responsibility to make sure that dest has at least - // Impl::MinSize(src.num_qubits()) elements. 
- bool Copy(const Vector& src, fp_type* dest) const { - ErrorCheck( - cudaMemcpy(dest, src.get(), - sizeof(fp_type) * Impl::MinSize(src.num_qubits()), - cudaMemcpyDeviceToHost)); - - return true; - } - - // It is the client's responsibility to make sure that src has at least - // Impl::MinSize(dest.num_qubits()) elements. - bool Copy(const fp_type* src, Vector& dest) const { - ErrorCheck( - cudaMemcpy(dest.get(), src, - sizeof(fp_type) * Impl::MinSize(dest.num_qubits()), - cudaMemcpyHostToDevice)); - - return true; - } - - // It is the client's responsibility to make sure that src has at least - // min(size, Impl::MinSize(dest.num_qubits())) elements. - bool Copy(const fp_type* src, uint64_t size, Vector& dest) const { - size = std::min(size, Impl::MinSize(dest.num_qubits())); - ErrorCheck( - cudaMemcpy(dest.get(), src, - sizeof(fp_type) * size, - cudaMemcpyHostToDevice)); - return true; - } - - void DeviceSync() { - ErrorCheck(cudaDeviceSynchronize()); - } - - protected: -}; - -} // namespace qsim - -#endif // VECTORSPACE_CUDA_H_ From 63874bc7ddf8c2eafd31691cc0a1cf92c2830a41 Mon Sep 17 00:00:00 2001 From: Seth R Johnson Date: Fri, 22 Nov 2024 14:15:50 -0500 Subject: [PATCH 06/64] Update goldfinger configuration --- CMakeLists.txt | 5 +++-- scripts/cmake-presets/goldfinger.json | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bd57739..79e8162 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -117,8 +117,9 @@ set(QIREE_RUNTIME_OUTPUT_DIRECTORY enable_language(C) # Needed for LLVM find_package(LLVM REQUIRED) if((LLVM_VERSION VERSION_LESS 14) - OR (LLVM_VERSION VERSION_GREATER_EQUAL 19)) - message(WARNING "QIR-EE is only tested with LLVM 14-18: found version ${LLVM_VERSION}") + OR (LLVM_VERSION VERSION_GREATER_EQUAL 20)) + message(WARNING "QIR-EE is only tested with LLVM 14-19: found version ${LLVM_VERSION}") +endif() endif() if(QIREE_USE_XACC) diff --git a/scripts/cmake-presets/goldfinger.json 
b/scripts/cmake-presets/goldfinger.json index 0e21a45..a056420 100644 --- a/scripts/cmake-presets/goldfinger.json +++ b/scripts/cmake-presets/goldfinger.json @@ -9,7 +9,7 @@ "cacheVariables": { "CMAKE_BUILD_TYPE": {"type": "STRING", "value": "Debug"}, "CMAKE_EXPORT_COMPILE_COMMANDS": {"type": "BOOL", "value": "ON"}, - "CMAKE_OSX_DEPLOYMENT_TARGET": {"type": "STRING", "value": "14"}, + "CMAKE_OSX_DEPLOYMENT_TARGET": {"type": "STRING", "value": "15"}, "CMAKE_CXX_STANDARD": {"type": "STRING", "value": "17"}, "CMAKE_CXX_EXTENSIONS": {"type": "BOOL", "value": "OFF"}, "CMAKE_FIND_FRAMEWORK": {"type": "STRING", "value": "LAST"}, @@ -17,7 +17,7 @@ "CMAKE_CXX_FLAGS": "-Wall -Wextra -Werror -Wno-error=deprecated -pedantic -fdiagnostics-color=always" }, "environment": { - "CMAKE_PREFIX_PATH": "/opt/homebrew/Cellar/llvm/18.1.8:/opt/spack/var/spack/environments/xacc/.spack-env/view:$env{HOME}/Code/xacc/install" + "CMAKE_PREFIX_PATH": "/opt/homebrew/opt/llvm:/opt/spack/var/spack/environments/xacc/.spack-env/view:$env{HOME}/Code/xacc/install" } }, { From 91753d642b8668ee1305e2876b7b2e8662636daf Mon Sep 17 00:00:00 2001 From: Seth R Johnson Date: Fri, 22 Nov 2024 14:29:42 -0500 Subject: [PATCH 07/64] Add qsim --- CMakeLists.txt | 35 +++++++++++++++++++++++++---------- app/CMakeLists.txt | 19 ++++++++++++++++++- src/qiree_config.h.in | 2 ++ src/qirqsim/CMakeLists.txt | 1 + src/qirqsim/qsimQuantum.cc | 28 ++++++++++++++-------------- src/qirqsim/qsimQuantum.hh | 33 ++++++++++++++++----------------- 6 files changed, 76 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 79e8162..864093d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #----------------------------------------------------------------------------# -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.18) # Set QIREE_VERSION using git tags using the following format set(CGV_TAG_REGEX 
"v([0-9.]+)(-dev|-rc.[0-9]+)?") @@ -12,12 +12,13 @@ include("${CMAKE_CURRENT_LIST_DIR}/cmake/CgvFindVersion.cmake") cgv_find_version(QIREE) project(QIREE VERSION "${QIREE_VERSION}" LANGUAGES CXX) -cmake_policy(VERSION 3.12...3.22) +cmake_policy(VERSION 3.18...3.30) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") include(GNUInstallDirs) include(CMakePackageConfigHelpers) +include(FetchContent) include("${CMAKE_CURRENT_LIST_DIR}/cmake/QIREEUtils.cmake") macro(qiree_set_default name value) @@ -34,14 +35,10 @@ endmacro() # Components option(QIREE_BUILD_DOCS "Build QIR-EE documentation" OFF) -option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" OFF) +option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" ON) option(QIREE_BUILD_EXAMPLES "Build QIR-EE examples" OFF) -option(QIREE_USE_XACC "Build XACC interface" OFF) -option(QIREE_USE_QSIM "Build qsim interface" OFF) -qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS}) - -# Assertion handling -option(QIREE_DEBUG "Enable runtime assertions" ON) +option(QIREE_USE_QSIM "Download and build Google qsim backend" OFF) +option(QIREE_USE_XACC "Build XACC interface" ON) qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS}) @@ -120,6 +117,24 @@ if((LLVM_VERSION VERSION_LESS 14) OR (LLVM_VERSION VERSION_GREATER_EQUAL 20)) message(WARNING "QIR-EE is only tested with LLVM 14-19: found version ${LLVM_VERSION}") endif() + +if(QIREE_USE_QSIM) + # Declare and download qsim: it's header-only and the code is in "lib", + # so download it into "external/qsim" directory and include "external" + FetchContent_Declare( + qsim_content + QUIET + GIT_REPOSITORY https://github.com/quantumlib/qsim.git + GIT_TAG e5817518b16858e0732269b56525f72bcdb30764 # v0.21.0 + SOURCE_SUBDIR "lib" # Don't load top-level cmake file + SOURCE_DIR "external/qsim" + ) + FetchContent_MakeAvailable(qsim_content) + add_library(qiree_qsim INTERFACE) + add_library(QIREE::qsim ALIAS qiree_qsim) + target_include_directories(qiree_qsim SYSTEM INTERFACE + 
"${CMAKE_CURRENT_BINARY_DIR}/external" + ) endif() if(QIREE_USE_XACC) @@ -192,7 +207,7 @@ add_subdirectory(app) #----------------------------------------------------------------------------# if(QIREE_BUILD_EXAMPLES) - add_subdirectory(examples) + add_subdirectory(examples) endif() #----------------------------------------------------------------------------# diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index ea7589a..4bf7330 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -4,7 +4,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #-----------------------------------------------------------------------------# -include(FetchContent) FetchContent_Declare( # Command Line Parser for C++ programs cli11_proj @@ -15,6 +14,24 @@ FetchContent_Declare( FetchContent_MakeAvailable(cli11_proj) +#-----------------------------------------------------------------------------# +# QSIM FRONT END +#-----------------------------------------------------------------------------# + +if(QIREE_USE_QSIM) + qiree_add_executable(qir-qsim + qir-qsim.cc + ) + target_link_libraries(qir-qsim + PUBLIC QIREE::qiree QIREE::qirqsim + PRIVATE CLI11::CLI11 + ) +endif() + +#-----------------------------------------------------------------------------# +# XACC FRONT END +#-----------------------------------------------------------------------------# + if(QIREE_USE_XACC) qiree_add_executable(qir-xacc qir-xacc.cc diff --git a/src/qiree_config.h.in b/src/qiree_config.h.in index 475c792..d46b752 100644 --- a/src/qiree_config.h.in +++ b/src/qiree_config.h.in @@ -10,5 +10,7 @@ #define qiree_config_h #cmakedefine01 QIREE_DEBUG +#cmakedefine01 QIREE_USE_QSIM +#cmakedefine01 QIREE_USE_XACC #endif /* qiree_config_h */ diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt index 09a0511..380bbac 100644 --- a/src/qirqsim/CMakeLists.txt +++ b/src/qirqsim/CMakeLists.txt @@ -15,6 +15,7 @@ qiree_add_library(qirqsim #Link the qsim library to qiree and any other relevant 
libraries target_link_libraries(qirqsim PUBLIC QIREE::qiree # Link to qiree + PRIVATE QIREE::qsim ) #----------------------------------------------------------------------------# diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/qsimQuantum.cc index 74d510d..0ee6746 100644 --- a/src/qirqsim/qsimQuantum.cc +++ b/src/qirqsim/qsimQuantum.cc @@ -19,20 +19,20 @@ #include "qiree/Assert.hh" // Qsim -#include "../../tpls/qsim/simulator_basic.h" -#include "../../tpls/qsim/statespace_basic.h" -#include "../../tpls/qsim/gates_qsim.h" -#include "../../tpls/qsim/circuit.h" -#include "../../tpls/qsim/run_qsim.h" -#include "../../tpls/qsim/io.h" -#include "../../tpls/qsim/fuser.h" -#include "../../tpls/qsim/circuit_qsim_parser.h" -#include "../../tpls/qsim/fuser_mqubit.h" -#include "../../tpls/qsim/io_file.h" -#include "../../tpls/qsim/simmux.h" -#include "../../tpls/qsim/util_cpu.h" -#include "../../tpls/qsim/formux.h" -#include "../../tpls/qsim/gate.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // namespace qiree{ diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/qsimQuantum.hh index cfdfc4b..024209f 100644 --- a/src/qirqsim/qsimQuantum.hh +++ b/src/qirqsim/qsimQuantum.hh @@ -7,33 +7,32 @@ //---------------------------------------------------------------------------// #pragma once +#include #include #include #include #include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "BufferManager.hh" #include "qiree/Macros.hh" #include "qiree/QuantumNotImpl.hh" #include "qiree/RuntimeInterface.hh" #include "qiree/Types.hh" -#include "BufferManager.hh" - -#include "../../tpls/qsim/simulator_basic.h" -#include "../../tpls/qsim/statespace_basic.h" -#include "../../tpls/qsim/gates_qsim.h" -#include "../../tpls/qsim/circuit.h" -#include "../../tpls/qsim/run_qsim.h" 
-#include "../../tpls/qsim/io.h" -#include "../../tpls/qsim/fuser.h" -#include "../../tpls/qsim/circuit_qsim_parser.h" -#include "../../tpls/qsim/fuser_mqubit.h" -#include "../../tpls/qsim/io_file.h" -#include "../../tpls/qsim/simmux.h" -#include "../../tpls/qsim/util_cpu.h" -#include "../../tpls/qsim/formux.h" -#include "../../tpls/qsim/gate.h" struct Factory { // Factory class for creating simulators in qsim Factory(unsigned num_threads) : num_threads(num_threads) {} From e289381d75d9084f551026677abd2b15b021f22a Mon Sep 17 00:00:00 2001 From: Seth R Johnson Date: Fri, 22 Nov 2024 14:33:57 -0500 Subject: [PATCH 08/64] Update version to get include directories working --- CMakeLists.txt | 7 ++++--- src/qirqsim/CMakeLists.txt | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 864093d..3eb2675 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,15 +125,16 @@ if(QIREE_USE_QSIM) qsim_content QUIET GIT_REPOSITORY https://github.com/quantumlib/qsim.git - GIT_TAG e5817518b16858e0732269b56525f72bcdb30764 # v0.21.0 + GIT_TAG 55b4d0e7ea8f085a1709c2c06ff1e28b3aa93357 # 'main' on 22 Nov 2024 SOURCE_SUBDIR "lib" # Don't load top-level cmake file SOURCE_DIR "external/qsim" ) FetchContent_MakeAvailable(qsim_content) - add_library(qiree_qsim INTERFACE) + qiree_add_library(qiree_qsim INTERFACE) add_library(QIREE::qsim ALIAS qiree_qsim) target_include_directories(qiree_qsim SYSTEM INTERFACE - "${CMAKE_CURRENT_BINARY_DIR}/external" + "$" + "$" ) endif() diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt index 380bbac..b0cf690 100644 --- a/src/qirqsim/CMakeLists.txt +++ b/src/qirqsim/CMakeLists.txt @@ -15,7 +15,7 @@ qiree_add_library(qirqsim #Link the qsim library to qiree and any other relevant libraries target_link_libraries(qirqsim PUBLIC QIREE::qiree # Link to qiree - PRIVATE QIREE::qsim + PUBLIC QIREE::qsim #FIXME: make private ) 
#----------------------------------------------------------------------------# From ac7c24402879e5404f20385c7635d879ed8708a4 Mon Sep 17 00:00:00 2001 From: Seth R Johnson Date: Fri, 22 Nov 2024 14:49:06 -0500 Subject: [PATCH 09/64] Run clang format --- src/qirqsim/BufferManager.cc | 31 +++-- src/qirqsim/BufferManager.hh | 40 +++--- src/qirqsim/qsimDefaultRuntime.cc | 24 ++-- src/qirqsim/qsimDefaultRuntime.hh | 8 +- src/qirqsim/qsimQuantum.cc | 222 +++++++++++++++++++----------- src/qirqsim/qsimQuantum.hh | 95 ++++++------- src/qirqsim/qsimTupleRuntime.cc | 10 +- 7 files changed, 255 insertions(+), 175 deletions(-) diff --git a/src/qirqsim/BufferManager.cc b/src/qirqsim/BufferManager.cc index 46931d9..b340604 100644 --- a/src/qirqsim/BufferManager.cc +++ b/src/qirqsim/BufferManager.cc @@ -7,39 +7,50 @@ //---------------------------------------------------------------------------// #include "BufferManager.hh" -#include -#include + #include +#include +#include -void BufferManager::updateBuffer(const std::string& qubit, const std::string& state, const int& value) { +void BufferManager::updateBuffer(std::string const& qubit, + std::string const& state, + int const& value) +{ // Insert or update the key-value pair in the buffer std::pair searchKey = {qubit, state}; int current_frequency = 0; auto it = buffer.find(searchKey); - if (it != buffer.end()){ - current_frequency = it -> second; + if (it != buffer.end()) + { + current_frequency = it->second; } // Accumulate counts with every shot buffer[{qubit, state}] = value + current_frequency; } -void BufferManager::updateBuffer(const std::string& key, const int& value) { +void BufferManager::updateBuffer(std::string const& key, int const& value) +{ // Insert or update the key-value pair in the buffer simple_buffer[key] = value; } -std::optional BufferManager::getBufferValue(const std::string& qubit, const std::string& state) const { +std::optional BufferManager::getBufferValue(std::string const& qubit, + std::string const& 
state) const +{ std::pair searchKey = {qubit, state}; auto it = buffer.find(searchKey); - if (it != buffer.end()) { + if (it != buffer.end()) + { return it->second; // Key found } return std::nullopt; // Key not found } -std::optional BufferManager::getBufferValue(const std::string& key) const { +std::optional BufferManager::getBufferValue(std::string const& key) const +{ auto it = simple_buffer.find(key); - if (it != simple_buffer.end()) { + if (it != simple_buffer.end()) + { return it->second; // Key found } return std::nullopt; // Key not found diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh index efb3800..9bac1b5 100644 --- a/src/qirqsim/BufferManager.hh +++ b/src/qirqsim/BufferManager.hh @@ -9,17 +9,19 @@ #ifndef BUFFER_MANAGER_H #define BUFFER_MANAGER_H -#include -#include -#include #include +#include +#include +#include #include // Define a hash function for std::pair -struct pair_hash { - template - std::size_t operator()(const std::pair& pair) const { +struct pair_hash +{ + template + std::size_t operator()(std::pair const& pair) const + { auto hash1 = std::hash{}(pair.first); auto hash2 = std::hash{}(pair.second); // Combine the two hash values @@ -27,22 +29,24 @@ struct pair_hash { } }; -class BufferManager { -public: - +class BufferManager +{ + public: // Method to update the buffer with a key-value pair - void updateBuffer(const std::string& qubit, const std::string& state, const int& value); - void updateBuffer(const std::string& key, const int& value); - + void updateBuffer(std::string const& qubit, + std::string const& state, + int const& value); + void updateBuffer(std::string const& key, int const& value); + // Retrieve buffer value for storage or evaluation - std::optional getBufferValue(const std::string& qubit, const std::string& state) const; - std::optional getBufferValue(const std::string& key) const; - -private: - + std::optional + getBufferValue(std::string const& qubit, std::string const& state) const; + 
std::optional getBufferValue(std::string const& key) const; + + private: // Dictionary to store key-value pairs std::unordered_map, int, pair_hash> buffer; std::unordered_map simple_buffer; }; -#endif // BUFFER_MANAGER_H +#endif // BUFFER_MANAGER_H diff --git a/src/qirqsim/qsimDefaultRuntime.cc b/src/qirqsim/qsimDefaultRuntime.cc index 339703a..924e1e6 100644 --- a/src/qirqsim/qsimDefaultRuntime.cc +++ b/src/qirqsim/qsimDefaultRuntime.cc @@ -6,9 +6,11 @@ //! \file qirqsim/qsimDefaultRuntime.cc //---------------------------------------------------------------------------// #include "qsimDefaultRuntime.hh" + #include + #include "qiree/Assert.hh" - + namespace qiree { //---------------------------------------------------------------------------// @@ -32,9 +34,9 @@ void qsimDefaultRuntime::initialize(OptionalCString env) void qsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag) { - //this->execute_if_needed(); - //output_ << "array " << (tag ? tag : "") << " length " << s - // << std::endl; + // this->execute_if_needed(); + // output_ << "array " << (tag ? tag : "") << " length " << s + // << std::endl; } //---------------------------------------------------------------------------// @@ -45,9 +47,9 @@ void qsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag) void qsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag) { - //this->execute_if_needed(); - //output_ << "tuple " << (tag ? tag : "") << " length " << s - // << std::endl; + // this->execute_if_needed(); + // output_ << "tuple " << (tag ? 
tag : "") << " length " << s + // << std::endl; } //---------------------------------------------------------------------------// @@ -60,10 +62,12 @@ void qsimDefaultRuntime::result_record_output(Result r, OptionalCString tag) // This prints results every time result_record_output is called // Can comment out if only want to see final results - if (auto value = sim_.manager.getBufferValue("q"+std::to_string(r.value)); value.has_value()) { - std::cout << "q" << std::to_string(r.value) << " : " << value.value() << "\n"; + if (auto value = sim_.manager.getBufferValue("q" + std::to_string(r.value)); + value.has_value()) + { + std::cout << "q" << std::to_string(r.value) << " : " << value.value() + << "\n"; } - } } // namespace qiree diff --git a/src/qirqsim/qsimDefaultRuntime.hh b/src/qirqsim/qsimDefaultRuntime.hh index 26f06ab..e76308e 100644 --- a/src/qirqsim/qsimDefaultRuntime.hh +++ b/src/qirqsim/qsimDefaultRuntime.hh @@ -13,7 +13,7 @@ namespace qiree { /*! - * Print per-qubit measurement statistics. + * Print per-qubit measurement statistics. * * Example for three qubits: * \code @@ -24,7 +24,7 @@ namespace qiree * q0 {0: 542, 1: 482} * q1 {0: 521, 1: 503} * q2 {0: 0, 1: 1024} - * + * * \endcode */ @@ -34,9 +34,7 @@ class qsimDefaultRuntime final : virtual public RuntimeInterface /*! * Construct \c qsimDefaultRuntime. 
*/ - qsimDefaultRuntime(std::ostream& output, - qsimQuantum& sim - ) + qsimDefaultRuntime(std::ostream& output, qsimQuantum& sim) : output_(output), sim_(sim) { } diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/qsimQuantum.cc index 0ee6746..0fe9a98 100644 --- a/src/qirqsim/qsimQuantum.cc +++ b/src/qirqsim/qsimQuantum.cc @@ -9,12 +9,12 @@ #include "qsimQuantum.hh" #include +#include #include -#include +#include #include #include -#include -#include +#include #include "qiree/Assert.hh" @@ -35,50 +35,64 @@ #include // -namespace qiree{ +namespace qiree +{ //---------------------------------------------------------------------------// /* Initialize the qsim simulator */ -qsimQuantum::State qsimQuantum::init_state_space() { //check if StateSpace is the proper type for the output, problably it is just State from the Fatory struct. - std::srand(static_cast(std::time(nullptr))); // Seed the random number generator - qsimParam.seed = std::rand(); // Set the seed for qsim parameters - numThreads = std::max(1, static_cast(std::thread::hardware_concurrency())); // Get the number of threads - qsimParam.max_fused_size = 2; // Set the maximum size of fused gates - qsimParam.verbosity = 0; // see verbosity in run_qsim.h +qsimQuantum::State qsimQuantum::init_state_space() +{ // check if StateSpace is the proper type for the output, problably it is + // just State from the Fatory struct. 
+ std::srand(static_cast(std::time(nullptr))); // Seed the + // random + // number + // generator + qsimParam.seed = std::rand(); // Set the seed for qsim parameters + numThreads = std::max( + 1, static_cast(std::thread::hardware_concurrency())); // Get the + // number + // of + // threads + qsimParam.max_fused_size = 2; // Set the maximum size of fused gates + qsimParam.verbosity = 0; // see verbosity in run_qsim.h // Initialize the qsim simulator - qsimQuantum::StateSpace state_space = Factory(numThreads).CreateStateSpace(); // Create the state space - State state = state_space.Create(this->num_qubits()); // Create the state + qsimQuantum::StateSpace state_space + = Factory(numThreads).CreateStateSpace(); // Create the state space + State state = state_space.Create(this->num_qubits()); // Create the state // Check if the state is null - if (state_space.IsNull(state)) { - qsim::IO::errorf("not enough memory: is the number of qubits too large?\n"); + if (state_space.IsNull(state)) + { + qsim::IO::errorf( + "not enough memory: is the number of qubits too large?\n"); } - state_space.SetStateZero(state); // Set the state to zero, TODO: the initial state is not necessarily zero - return state; - } - - qsimQuantum::qsimQuantum(std::ostream& os, - size_type shots) - : output_(os) - { - } + state_space.SetStateZero(state); // Set the state to zero, TODO: the + // initial state is not necessarily zero + return state; +} + +qsimQuantum::qsimQuantum(std::ostream& os, size_type shots) : output_(os) {} //---------------------------------------------------------------------------// /* Prepare to build a quantum circuit for an entry point */ -void qsimQuantum::set_up(EntryPointAttrs const& attrs) { +void qsimQuantum::set_up(EntryPointAttrs const& attrs) +{ QIREE_VALIDATE(attrs.required_num_qubits > 0, << "input is not a quantum program"); - // Resize the result_to_qubit_ vector, based on the required number of results... 
- // the idea is to have as many classical registers as qubits (probably not true in general) + // Resize the result_to_qubit_ vector, based on the required number of + // results... the idea is to have as many classical registers as qubits + // (probably not true in general) result_to_qubit_.resize(attrs.required_num_results); - num_qubits_ = attrs.required_num_qubits; // Set the number of qubits - state_ = std::make_shared(init_state_space()); // Set the state space? Maybe. - q_circuit.num_qubits = num_qubits_; // Allocate the number of qubits in the circuit - execution_time = 0; // Initialize execution time + num_qubits_ = attrs.required_num_qubits; // Set the number of qubits + state_ = std::make_shared(init_state_space()); // Set the state + // space? Maybe. + q_circuit.num_qubits = num_qubits_; // Allocate the number of qubits in + // the circuit + execution_time = 0; // Initialize execution time static unsigned int rep = 0; rep++; this->repCount(rep); @@ -89,11 +103,13 @@ void qsimQuantum::set_up(EntryPointAttrs const& attrs) { Complete an execution */ -void qsimQuantum::repCount(int rep) { +void qsimQuantum::repCount(int rep) +{ repetition = rep; } -void qsimQuantum::tear_down() { +void qsimQuantum::tear_down() +{ q_circuit = {}; q_circuit.num_qubits = num_qubits_; state_ = std::make_shared(init_state_space()); @@ -104,12 +120,13 @@ void qsimQuantum::tear_down() { Reset the qubit */ -void qsimQuantum::reset(Qubit q) { - q.value=0; +void qsimQuantum::reset(Qubit q) +{ + q.value = 0; } //----------------------------------------------------------------------------// -/* +/* Read the value of a result. This utilizes the new BufferManager. 
*/ @@ -117,18 +134,24 @@ QState qsimQuantum::read_result(Result r) { std::string q_index_string = std::to_string(r.value); auto meas_results = execute_if_needed(); - if (meas_results.size() == 1 && meas_results[0].bitstring.size() == 1) { - const auto bitResult = meas_results[0].bitstring[0]; + if (meas_results.size() == 1 && meas_results[0].bitstring.size() == 1) + { + auto const bitResult = meas_results[0].bitstring[0]; assert(bitResult == 0 || bitResult == 1); std::string stringResult = std::to_string(bitResult); - if (stringResult == "1"){ - manager.updateBuffer("q"+q_index_string, "1", 1); - manager.updateBuffer("q"+q_index_string, 1); - } else{ - manager.updateBuffer("q"+q_index_string, "0", 1); - manager.updateBuffer("q"+q_index_string, 0); + if (stringResult == "1") + { + manager.updateBuffer("q" + q_index_string, "1", 1); + manager.updateBuffer("q" + q_index_string, 1); + } + else + { + manager.updateBuffer("q" + q_index_string, "0", 1); + manager.updateBuffer("q" + q_index_string, 0); } - } else { + } + else + { qsim::IO::errorf("Unexpected measurement results encountered."); } return static_cast(meas_results[0].bitstring[0]); @@ -136,12 +159,18 @@ QState qsimQuantum::read_result(Result r) //---------------------------------------------------------------------------// /* -Map a qubit to a result index +Map a qubit to a result index (TODO: find how to link the classical register to the quantum register in qsim) */ -void qsimQuantum::mz(Qubit q, Result r) { //we don't classical register yet. - QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in the set of qubits, e.g., what happens if q=5 and qubits are {2,3,4,5}, q is less than num_qubits but not it is in the set of qubits. +void qsimQuantum::mz(Qubit q, Result r) +{ // we don't classical register yet. 
+ QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in the set + // of qubits, e.g., what + // happens if q=5 and qubits + // are {2,3,4,5}, q is less + // than num_qubits but not it + // is in the set of qubits. // Add measurement instruction this->q_circuit.gates.push_back( qsim::gate::Measurement>::Create( @@ -154,79 +183,106 @@ Quantum Instruction Mapping */ // 1. Entangling gates -void qsimQuantum::cx(Qubit q1, Qubit q2) { - q_circuit.gates.push_back( - qsim::GateCNot::Create(execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); +void qsimQuantum::cx(Qubit q1, Qubit q2) +{ + q_circuit.gates.push_back(qsim::GateCNot::Create( + execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); } -void qsimQuantum::cnot(Qubit q1, Qubit q2) { - q_circuit.gates.push_back( - qsim::GateCNot::Create(execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); +void qsimQuantum::cnot(Qubit q1, Qubit q2) +{ + q_circuit.gates.push_back(qsim::GateCNot::Create( + execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); } -void qsimQuantum::cz(Qubit q1, Qubit q2) { - q_circuit.gates.push_back( - qsim::GateCZ::Create(execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); +void qsimQuantum::cz(Qubit q1, Qubit q2) +{ + q_circuit.gates.push_back(qsim::GateCZ::Create( + execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); } // 2. 
Local gates -void qsimQuantum::h(Qubit q) { +void qsimQuantum::h(Qubit q) +{ q_circuit.gates.push_back( qsim::GateHd::Create(execution_time++, this->getQubitIndex(q))); } -void qsimQuantum::s(Qubit q) { +void qsimQuantum::s(Qubit q) +{ q_circuit.gates.push_back( qsim::GateS::Create(execution_time++, this->getQubitIndex(q))); } -void qsimQuantum::t(Qubit q) { +void qsimQuantum::t(Qubit q) +{ q_circuit.gates.push_back( qsim::GateT::Create(execution_time++, this->getQubitIndex(q))); } // 2.1 Pauli gates -void qsimQuantum::x(Qubit q) { +void qsimQuantum::x(Qubit q) +{ q_circuit.gates.push_back( qsim::GateX::Create(execution_time++, this->getQubitIndex(q))); } -void qsimQuantum::y(Qubit q) { +void qsimQuantum::y(Qubit q) +{ q_circuit.gates.push_back( qsim::GateY::Create(execution_time++, this->getQubitIndex(q))); } -void qsimQuantum::z(Qubit q) { +void qsimQuantum::z(Qubit q) +{ q_circuit.gates.push_back( qsim::GateZ::Create(execution_time++, this->getQubitIndex(q))); } // 2.2 rotation gates -void qsimQuantum::rx(double theta, Qubit q) { - q_circuit.gates.push_back( - qsim::GateRX::Create(execution_time++, this->getQubitIndex(q), theta)); +void qsimQuantum::rx(double theta, Qubit q) +{ + q_circuit.gates.push_back(qsim::GateRX::Create( + execution_time++, this->getQubitIndex(q), theta)); } -void qsimQuantum::ry(double theta, Qubit q) { - q_circuit.gates.push_back( - qsim::GateRY::Create(execution_time++, this->getQubitIndex(q), theta)); +void qsimQuantum::ry(double theta, Qubit q) +{ + q_circuit.gates.push_back(qsim::GateRY::Create( + execution_time++, this->getQubitIndex(q), theta)); } -void qsimQuantum::rz(double theta, Qubit q) { - q_circuit.gates.push_back( - qsim::GateRZ::Create(execution_time++, this->getQubitIndex(q), theta)); +void qsimQuantum::rz(double theta, Qubit q) +{ + q_circuit.gates.push_back(qsim::GateRZ::Create( + execution_time++, this->getQubitIndex(q), theta)); } -Qubit qsimQuantum::result_to_qubit(Result r) { - // TODO: This function is not working. 
Giving 0 every time. Maybe not needed. +Qubit qsimQuantum::result_to_qubit(Result r) +{ + // TODO: This function is not working. Giving 0 every time. Maybe not + // needed. QIREE_EXPECT(r.value < this->num_results()); - return result_to_qubit_[r.value]; // just copied this from the qirxacc, I have no idea if we need to do something else here + return result_to_qubit_[r.value]; // just copied this from the qirxacc, I + // have no idea if we need to do + // something else here } -void qsimQuantum::print_accelbuf() { - // TODO: to be implemented, we can create a buffer class to store the results +void qsimQuantum::print_accelbuf() +{ + // TODO: to be implemented, we can create a buffer class to store the + // results } -qsimQuantum::VecMeas qsimQuantum::execute_if_needed() { - std::vector meas_results; // Vector to hold measurement results, this must be empty before running +qsimQuantum::VecMeas qsimQuantum::execute_if_needed() +{ + std::vector meas_results; // Vector to hold + // measurement + // results, this + // must be empty + // before running std::string stringResult; static unsigned long int seed = 0; qsimParam.seed = seed++; - const bool run_success = Runner::Run(qsimParam, Factory(numThreads), q_circuit, *state_, meas_results); // Run the simulation - assert(run_success); // Ensure the run was successful - // reset circuit here - q_circuit = {}; + bool const run_success = Runner::Run(qsimParam, + Factory(numThreads), + q_circuit, + *state_, + meas_results); // Run the simulation + assert(run_success); // Ensure the run was successful + // reset circuit here + q_circuit = {}; q_circuit.num_qubits = num_qubits_; return meas_results; } -} // namespace qiree +} // namespace qiree diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/qsimQuantum.hh index 024209f..745da51 100644 --- a/src/qirqsim/qsimQuantum.hh +++ b/src/qirqsim/qsimQuantum.hh @@ -34,24 +34,24 @@ #include "qiree/RuntimeInterface.hh" #include "qiree/Types.hh" -struct Factory { // Factory class for 
creating simulators in qsim +struct Factory +{ // Factory class for creating simulators in qsim Factory(unsigned num_threads) : num_threads(num_threads) {} using Simulator = qsim::Simulator; using StateSpace = Simulator::StateSpace; - StateSpace CreateStateSpace() const { return StateSpace(num_threads); } + StateSpace CreateStateSpace() const { return StateSpace(num_threads); } Simulator CreateSimulator() const { return Simulator(num_threads); } unsigned num_threads; }; namespace qiree { - class qsimQuantum final : virtual public QuantumNotImpl - { - - public: - +class qsimQuantum final : virtual public QuantumNotImpl +{ + public: // Define constructors and destructors - qsimQuantum(std::ostream& os, size_type shots); // Construct with number of shots + qsimQuantum(std::ostream& os, size_type shots); // Construct with number + // of shots // Define types using Simulator = qsim::Simulator; @@ -63,15 +63,17 @@ namespace qiree State init_state_space(); - QIREE_DELETE_COPY_MOVE(qsimQuantum); // Delete copy and move constructors + QIREE_DELETE_COPY_MOVE(qsimQuantum); // Delete copy and move constructors //!@{ //! \name Accessors size_type num_results() const { return result_to_qubit_.size(); } size_type num_qubits() const { return num_qubits_; } - - unsigned getQubitIndex(Qubit q) { - return static_cast(q.value); // Return the value of the qubit + + unsigned getQubitIndex(Qubit q) + { + return static_cast(q.value); // Return the value of the + // qubit } //!@} @@ -96,8 +98,8 @@ namespace qiree Qubit result_to_qubit(Result); // Wrapper for qsim - //std::map - //get_marginal_counts(std::vector const& qubits); + // std::map + // get_marginal_counts(std::vector const& qubits); // Run the circuit on the accelerator if we have not already. Returns true // if the circuit was executed. 
@@ -131,7 +133,10 @@ namespace qiree //!@} // Get the quantum circuit - qsim::Circuit> get_circuit() const { return q_circuit; } + qsim::Circuit> get_circuit() const + { + return q_circuit; + } // Get the state space State const& get_state() const { return *state_; } // Update the buffer @@ -139,39 +144,37 @@ namespace qiree // Number of repetitions int repetition; void repCount(int rep); - - private: - //// TYPES //// - enum class Endianness - { - little, - big - }; - unsigned numThreads; // Number of threads to use - unsigned max_fused_size; // Maximum size of fused gates - qsim::Circuit> q_circuit; // Quantum circuit object - - Runner::Parameter qsimParam; // Parameters for qsim - size_t execution_time; // when the quantum operation will be executed - - bool executed; - size_type num_qubits_{}; - std::vector result_to_qubit_; - Endianness endian_; - - std::ostream& output_; - std::shared_ptr simulator_; - std::shared_ptr statespace_; - std::shared_ptr state_; + private: + //// TYPES //// + enum class Endianness + { + little, + big }; + unsigned numThreads; // Number of threads to use + unsigned max_fused_size; // Maximum size of fused gates + qsim::Circuit> q_circuit; // Quantum circuit object + + Runner::Parameter qsimParam; // Parameters for qsim + size_t execution_time; // when the quantum operation will be executed + + bool executed; + size_type num_qubits_{}; + std::vector result_to_qubit_; + Endianness endian_; + + std::ostream& output_; + std::shared_ptr simulator_; + std::shared_ptr statespace_; + std::shared_ptr state_; +}; - class buffer { - public: - buffer(size_t size) : size(size) {} - size_t size; - }; +class buffer +{ + public: + buffer(size_t size) : size(size) {} + size_t size; +}; } // namespace qiree - - diff --git a/src/qirqsim/qsimTupleRuntime.cc b/src/qirqsim/qsimTupleRuntime.cc index 5366b79..bf88e6d 100644 --- a/src/qirqsim/qsimTupleRuntime.cc +++ b/src/qirqsim/qsimTupleRuntime.cc @@ -76,7 +76,7 @@ void 
qsimTupleRuntime::start_tracking(GroupingType type, { QIREE_EXPECT(!valid_); valid_ = true; - type_ = type; + type_ = type; tag_ = tag; num_results_ = num_results; qubits_.clear(); @@ -109,8 +109,12 @@ void qsimTupleRuntime::print_header(size_type num_distinct) void qsimTupleRuntime::finish_tuple() { - //auto counts = sim_.get_marginal_counts(qubits_); - std::map counts = {{"0", 0}, {"1", 0}}; // Placeholder for actual counts, TODO: replace with actual counts + // auto counts = sim_.get_marginal_counts(qubits_); + std::map counts = {{"0", 0}, {"1", 0}}; // Placeholder + // for actual + // counts, TODO: + // replace with + // actual counts print_header(counts.size()); auto name = get_name(); for (auto& [bits, count] : counts) From ece521f5fd02a20aa867a19eb9c80f9bd778aaa6 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Mon, 25 Nov 2024 10:00:29 -0500 Subject: [PATCH 10/64] Add qsim dynamicbv test Remark: Compares only empty output for now until we find a way to properly store the result and not just print as we go. Temporarily using lowercase qsim. Will modify class names later with the others at the same time. 
--- test/CMakeLists.txt | 10 ++++ test/qirqsim/qsimQuantum.test.cc | 98 ++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 test/qirqsim/qsimQuantum.test.cc diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c4191cc..716bcbc 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -55,3 +55,13 @@ if(QIREE_USE_XACC) endif() #---------------------------------------------------------------------------## + +#---------------------------------------------------------------------------## +# QIRQSIM TESTS +#---------------------------------------------------------------------------## + +if(QIREE_USE_QSIM) + qiree_add_test(qirqsim qsimQuantum) +endif() + +#---------------------------------------------------------------------------## diff --git a/test/qirqsim/qsimQuantum.test.cc b/test/qirqsim/qsimQuantum.test.cc new file mode 100644 index 0000000..3d29034 --- /dev/null +++ b/test/qirqsim/qsimQuantum.test.cc @@ -0,0 +1,98 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! 
\file qirxacc/XaccQuantum.test.cc +//---------------------------------------------------------------------------// +#include "qirqsim/qsimQuantum.hh" + +#include + +#include "qiree/Types.hh" +#include "qiree_test.hh" +#include "qirqsim/qsimDefaultRuntime.hh" + +namespace qiree +{ +namespace test +{ +//---------------------------------------------------------------------------// + +class qsimQuantumTest : public ::qiree::test::Test +{ + protected: + void SetUp() override {} + + static std::string clean_output(std::string&& s) + { + std::string result = std::move(s); + static std::regex const subs_ptr("0x[0-9a-f]+"); + result = std::regex_replace(result, subs_ptr, "0x0"); + return result; + } +}; + + +TEST_F(qsimQuantumTest, sim_dynamicbv) +{ + using Q = Qubit; + using R = Result; + + std::ostringstream os; + os << '\n'; + + // Create a simulator that will write to the string stream + qsimQuantum qsim_sim{os, 1}; + qsimDefaultRuntime qsim_rt{os, qsim_sim}; + + // Call functions in the same sequence that dynamicbv.ll would + qsim_sim.set_up([] { + EntryPointAttrs attrs; + attrs.required_num_qubits = 2; + attrs.required_num_results = 2; + return attrs; + }()); + qsim_sim.h(Q{0}); + qsim_sim.x(Q{1}); + qsim_sim.h(Q{1}); + qsim_sim.cnot(Q{0},Q{1}); + qsim_sim.h(Q{0}); + qsim_sim.mz(Q{0}, R{0}); + qsim_sim.read_result(R{0}); + qsim_sim.mz(Q{1}, R{1}); + qsim_sim.read_result(R{1}); + qsim_rt.array_record_output(2,""); + qsim_rt.result_record_output(R{0},""); + qsim_rt.result_record_output(R{1},""); + qsim_sim.h(Q{0}); + qsim_sim.x(Q{1}); + qsim_sim.h(Q{1}); + qsim_sim.mz(Q{0}, R{0}); + qsim_sim.read_result(R{0}); + qsim_sim.mz(Q{1}, R{1}); + qsim_sim.read_result(R{1}); + qsim_rt.array_record_output(2,""); + qsim_rt.result_record_output(R{0},""); + qsim_rt.result_record_output(R{1},""); + qsim_sim.h(Q{0}); + qsim_sim.x(Q{1}); + qsim_sim.h(Q{1}); + qsim_sim.cnot(Q{0},Q{1}); + qsim_sim.h(Q{0}); + qsim_sim.mz(Q{0}, R{0}); + qsim_sim.read_result(R{0}); + qsim_sim.mz(Q{1}, 
R{1}); + qsim_sim.read_result(R{1}); + qsim_rt.array_record_output(2,""); + qsim_rt.result_record_output(R{0},""); + qsim_rt.result_record_output(R{1},""); + qsim_sim.tear_down(); + auto result = clean_output(os.str()); + EXPECT_EQ(R"( +)", result) << result; // TODO: Modify qsimDefaultRuntime.cc so that it stores a result to be compared here (currently just prints as it goes...) +} + +//---------------------------------------------------------------------------// +} // namespace test +} // namespace qiree From a34f190ed977727662c2838bf41ae8e957dfc4ce Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Mon, 25 Nov 2024 20:25:08 -0500 Subject: [PATCH 11/64] Capitalize class names q -> Q This commit will fail tests since I am documenting the file name change separately. --- app/qir-qsim.cc | 12 +++---- src/qirqsim/CMakeLists.txt | 6 ++-- src/qirqsim/qsimDefaultRuntime.cc | 12 +++---- src/qirqsim/qsimDefaultRuntime.hh | 12 +++---- src/qirqsim/qsimQuantum.cc | 52 +++++++++++++++---------------- src/qirqsim/qsimQuantum.hh | 8 ++--- src/qirqsim/qsimTupleRuntime.cc | 22 ++++++------- src/qirqsim/qsimTupleRuntime.hh | 16 +++++----- test/CMakeLists.txt | 2 +- test/qirqsim/qsimQuantum.test.cc | 16 +++++----- 10 files changed, 79 insertions(+), 79 deletions(-) diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc index 75f1612..e5d3e72 100644 --- a/app/qir-qsim.cc +++ b/app/qir-qsim.cc @@ -20,9 +20,9 @@ #include "qiree/Module.hh" #include "qiree/QuantumNotImpl.hh" -#include "qirqsim/qsimDefaultRuntime.hh" -#include "qirqsim/qsimQuantum.hh" -#include "qirqsim/qsimTupleRuntime.hh" +#include "qirqsim/QsimDefaultRuntime.hh" +#include "qirqsim/QsimQuantum.hh" +#include "qirqsim/QsimTupleRuntime.hh" using namespace std::string_view_literals; @@ -38,15 +38,15 @@ void run(std::string const& filename, Executor execute{Module{filename}}; // Set up qsim - qsimQuantum sim(std::cout, num_shots); + QsimQuantum sim(std::cout, num_shots); // Collect the statistics 
std::unique_ptr rt; //if (group_tuples){ - // rt = std::make_unique( + // rt = std::make_unique( // std::cout, sim); //} else { - rt = std::make_unique( + rt = std::make_unique( std::cout, sim); //} diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt index b0cf690..b11018d 100644 --- a/src/qirqsim/CMakeLists.txt +++ b/src/qirqsim/CMakeLists.txt @@ -6,9 +6,9 @@ # Adding qsim as a library to qiree qiree_add_library(qirqsim - qsimQuantum.cc - qsimDefaultRuntime.cc - qsimTupleRuntime.cc + QsimQuantum.cc + QsimDefaultRuntime.cc + QsimTupleRuntime.cc BufferManager.cc ) diff --git a/src/qirqsim/qsimDefaultRuntime.cc b/src/qirqsim/qsimDefaultRuntime.cc index 924e1e6..4ece7c1 100644 --- a/src/qirqsim/qsimDefaultRuntime.cc +++ b/src/qirqsim/qsimDefaultRuntime.cc @@ -3,9 +3,9 @@ // See the top-level COPYRIGHT file for details. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception //---------------------------------------------------------------------------// -//! \file qirqsim/qsimDefaultRuntime.cc +//! \file qirqsim/QsimDefaultRuntime.cc //---------------------------------------------------------------------------// -#include "qsimDefaultRuntime.hh" +#include "QsimDefaultRuntime.hh" #include @@ -18,7 +18,7 @@ namespace qiree * Initialize the execution environment, resetting qubits. */ -void qsimDefaultRuntime::initialize(OptionalCString env) +void QsimDefaultRuntime::initialize(OptionalCString env) { if (env) { @@ -32,7 +32,7 @@ void qsimDefaultRuntime::initialize(OptionalCString env) * named tag */ -void qsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag) +void QsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag) { // this->execute_if_needed(); // output_ << "array " << (tag ? 
tag : "") << " length " << s @@ -45,7 +45,7 @@ void qsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag) * named tag */ -void qsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag) +void QsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag) { // this->execute_if_needed(); // output_ << "tuple " << (tag ? tag : "") << " length " << s @@ -56,7 +56,7 @@ void qsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag) /*! * Execute circuit and report a single measurement result */ -void qsimDefaultRuntime::result_record_output(Result r, OptionalCString tag) +void QsimDefaultRuntime::result_record_output(Result r, OptionalCString tag) { // Access values through the getter // This prints results every time result_record_output is called diff --git a/src/qirqsim/qsimDefaultRuntime.hh b/src/qirqsim/qsimDefaultRuntime.hh index e76308e..fb0b7a3 100644 --- a/src/qirqsim/qsimDefaultRuntime.hh +++ b/src/qirqsim/qsimDefaultRuntime.hh @@ -3,11 +3,11 @@ // See the top-level COPYRIGHT file for details. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception //---------------------------------------------------------------------------// -//! \file qirqsim/qsimDefaultRuntime.hh +//! \file qirqsim/QsimDefaultRuntime.hh //---------------------------------------------------------------------------// #pragma once -#include "qsimQuantum.hh" +#include "QsimQuantum.hh" namespace qiree { @@ -28,13 +28,13 @@ namespace qiree * \endcode */ -class qsimDefaultRuntime final : virtual public RuntimeInterface +class QsimDefaultRuntime final : virtual public RuntimeInterface { public: /*! - * Construct \c qsimDefaultRuntime. + * Construct \c QsimDefaultRuntime. 
*/ - qsimDefaultRuntime(std::ostream& output, qsimQuantum& sim) + QsimDefaultRuntime(std::ostream& output, QsimQuantum& sim) : output_(output), sim_(sim) { } @@ -56,7 +56,7 @@ class qsimDefaultRuntime final : virtual public RuntimeInterface private: std::ostream& output_; - qsimQuantum& sim_; + QsimQuantum& sim_; }; } // namespace qiree diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/qsimQuantum.cc index 0fe9a98..5ae1e9f 100644 --- a/src/qirqsim/qsimQuantum.cc +++ b/src/qirqsim/qsimQuantum.cc @@ -3,10 +3,10 @@ // See the top-level COPYRIGHT file for details. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception //---------------------------------------------------------------------------// -//! \file qirxacc/qsimQuantum.cc +//! \file qirxacc/QsimQuantum.cc //---------------------------------------------------------------------------// -#include "qsimQuantum.hh" +#include "QsimQuantum.hh" #include #include @@ -42,7 +42,7 @@ namespace qiree Initialize the qsim simulator */ -qsimQuantum::State qsimQuantum::init_state_space() +QsimQuantum::State QsimQuantum::init_state_space() { // check if StateSpace is the proper type for the output, problably it is // just State from the Fatory struct. 
std::srand(static_cast(std::time(nullptr))); // Seed the @@ -58,7 +58,7 @@ qsimQuantum::State qsimQuantum::init_state_space() qsimParam.max_fused_size = 2; // Set the maximum size of fused gates qsimParam.verbosity = 0; // see verbosity in run_qsim.h // Initialize the qsim simulator - qsimQuantum::StateSpace state_space + QsimQuantum::StateSpace state_space = Factory(numThreads).CreateStateSpace(); // Create the state space State state = state_space.Create(this->num_qubits()); // Create the state // Check if the state is null @@ -72,14 +72,14 @@ qsimQuantum::State qsimQuantum::init_state_space() return state; } -qsimQuantum::qsimQuantum(std::ostream& os, size_type shots) : output_(os) {} +QsimQuantum::QsimQuantum(std::ostream& os, size_type shots) : output_(os) {} //---------------------------------------------------------------------------// /* Prepare to build a quantum circuit for an entry point */ -void qsimQuantum::set_up(EntryPointAttrs const& attrs) +void QsimQuantum::set_up(EntryPointAttrs const& attrs) { QIREE_VALIDATE(attrs.required_num_qubits > 0, << "input is not a quantum program"); @@ -103,12 +103,12 @@ void qsimQuantum::set_up(EntryPointAttrs const& attrs) Complete an execution */ -void qsimQuantum::repCount(int rep) +void QsimQuantum::repCount(int rep) { repetition = rep; } -void qsimQuantum::tear_down() +void QsimQuantum::tear_down() { q_circuit = {}; q_circuit.num_qubits = num_qubits_; @@ -120,7 +120,7 @@ void qsimQuantum::tear_down() Reset the qubit */ -void qsimQuantum::reset(Qubit q) +void QsimQuantum::reset(Qubit q) { q.value = 0; } @@ -130,7 +130,7 @@ void qsimQuantum::reset(Qubit q) Read the value of a result. This utilizes the new BufferManager. 
*/ -QState qsimQuantum::read_result(Result r) +QState QsimQuantum::read_result(Result r) { std::string q_index_string = std::to_string(r.value); auto meas_results = execute_if_needed(); @@ -163,7 +163,7 @@ Map a qubit to a result index (TODO: find how to link the classical register to the quantum register in qsim) */ -void qsimQuantum::mz(Qubit q, Result r) +void QsimQuantum::mz(Qubit q, Result r) { // we don't classical register yet. QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in the set // of qubits, e.g., what @@ -183,71 +183,71 @@ Quantum Instruction Mapping */ // 1. Entangling gates -void qsimQuantum::cx(Qubit q1, Qubit q2) +void QsimQuantum::cx(Qubit q1, Qubit q2) { q_circuit.gates.push_back(qsim::GateCNot::Create( execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); } -void qsimQuantum::cnot(Qubit q1, Qubit q2) +void QsimQuantum::cnot(Qubit q1, Qubit q2) { q_circuit.gates.push_back(qsim::GateCNot::Create( execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); } -void qsimQuantum::cz(Qubit q1, Qubit q2) +void QsimQuantum::cz(Qubit q1, Qubit q2) { q_circuit.gates.push_back(qsim::GateCZ::Create( execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); } // 2. 
Local gates -void qsimQuantum::h(Qubit q) +void QsimQuantum::h(Qubit q) { q_circuit.gates.push_back( qsim::GateHd::Create(execution_time++, this->getQubitIndex(q))); } -void qsimQuantum::s(Qubit q) +void QsimQuantum::s(Qubit q) { q_circuit.gates.push_back( qsim::GateS::Create(execution_time++, this->getQubitIndex(q))); } -void qsimQuantum::t(Qubit q) +void QsimQuantum::t(Qubit q) { q_circuit.gates.push_back( qsim::GateT::Create(execution_time++, this->getQubitIndex(q))); } // 2.1 Pauli gates -void qsimQuantum::x(Qubit q) +void QsimQuantum::x(Qubit q) { q_circuit.gates.push_back( qsim::GateX::Create(execution_time++, this->getQubitIndex(q))); } -void qsimQuantum::y(Qubit q) +void QsimQuantum::y(Qubit q) { q_circuit.gates.push_back( qsim::GateY::Create(execution_time++, this->getQubitIndex(q))); } -void qsimQuantum::z(Qubit q) +void QsimQuantum::z(Qubit q) { q_circuit.gates.push_back( qsim::GateZ::Create(execution_time++, this->getQubitIndex(q))); } // 2.2 rotation gates -void qsimQuantum::rx(double theta, Qubit q) +void QsimQuantum::rx(double theta, Qubit q) { q_circuit.gates.push_back(qsim::GateRX::Create( execution_time++, this->getQubitIndex(q), theta)); } -void qsimQuantum::ry(double theta, Qubit q) +void QsimQuantum::ry(double theta, Qubit q) { q_circuit.gates.push_back(qsim::GateRY::Create( execution_time++, this->getQubitIndex(q), theta)); } -void qsimQuantum::rz(double theta, Qubit q) +void QsimQuantum::rz(double theta, Qubit q) { q_circuit.gates.push_back(qsim::GateRZ::Create( execution_time++, this->getQubitIndex(q), theta)); } -Qubit qsimQuantum::result_to_qubit(Result r) +Qubit QsimQuantum::result_to_qubit(Result r) { // TODO: This function is not working. Giving 0 every time. Maybe not // needed. 
@@ -257,13 +257,13 @@ Qubit qsimQuantum::result_to_qubit(Result r) // something else here } -void qsimQuantum::print_accelbuf() +void QsimQuantum::print_accelbuf() { // TODO: to be implemented, we can create a buffer class to store the // results } -qsimQuantum::VecMeas qsimQuantum::execute_if_needed() +QsimQuantum::VecMeas QsimQuantum::execute_if_needed() { std::vector meas_results; // Vector to hold // measurement diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/qsimQuantum.hh index 745da51..db88dd4 100644 --- a/src/qirqsim/qsimQuantum.hh +++ b/src/qirqsim/qsimQuantum.hh @@ -3,7 +3,7 @@ // See the top-level COPYRIGHT file for details. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception //---------------------------------------------------------------------------// -//! \file qirqsim/qsimQuantum.hh +//! \file qirqsim/QsimQuantum.hh //---------------------------------------------------------------------------// #pragma once @@ -46,11 +46,11 @@ struct Factory namespace qiree { -class qsimQuantum final : virtual public QuantumNotImpl +class QsimQuantum final : virtual public QuantumNotImpl { public: // Define constructors and destructors - qsimQuantum(std::ostream& os, size_type shots); // Construct with number + QsimQuantum(std::ostream& os, size_type shots); // Construct with number // of shots // Define types @@ -63,7 +63,7 @@ class qsimQuantum final : virtual public QuantumNotImpl State init_state_space(); - QIREE_DELETE_COPY_MOVE(qsimQuantum); // Delete copy and move constructors + QIREE_DELETE_COPY_MOVE(QsimQuantum); // Delete copy and move constructors //!@{ //! \name Accessors diff --git a/src/qirqsim/qsimTupleRuntime.cc b/src/qirqsim/qsimTupleRuntime.cc index bf88e6d..34a7440 100644 --- a/src/qirqsim/qsimTupleRuntime.cc +++ b/src/qirqsim/qsimTupleRuntime.cc @@ -3,9 +3,9 @@ // See the top-level COPYRIGHT file for details. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception //---------------------------------------------------------------------------// -//! \file qirqsim/qsimTupleRuntime.cc +//! \file qirqsim/QsimTupleRuntime.cc //---------------------------------------------------------------------------// -#include "qsimTupleRuntime.hh" +#include "QsimTupleRuntime.hh" #include "qiree/Assert.hh" @@ -15,7 +15,7 @@ namespace qiree /*! * Initialize the execution environment, resetting qubits. */ -void qsimTupleRuntime::initialize(OptionalCString env) +void QsimTupleRuntime::initialize(OptionalCString env) { if (env) { @@ -28,7 +28,7 @@ void qsimTupleRuntime::initialize(OptionalCString env) * Execute circuit and mark the following N results as being part of an array * named tag */ -void qsimTupleRuntime::array_record_output(size_type s, OptionalCString tag) +void QsimTupleRuntime::array_record_output(size_type s, OptionalCString tag) { execute_if_needed(); start_tracking(GroupingType::array, tag, s); @@ -39,7 +39,7 @@ void qsimTupleRuntime::array_record_output(size_type s, OptionalCString tag) * Execute circuit and mark the following N results as being part of a tuple * named tag */ -void qsimTupleRuntime::tuple_record_output(size_type s, OptionalCString tag) +void QsimTupleRuntime::tuple_record_output(size_type s, OptionalCString tag) { execute_if_needed(); start_tracking(GroupingType::tuple, tag, s); @@ -49,7 +49,7 @@ void qsimTupleRuntime::tuple_record_output(size_type s, OptionalCString tag) /*! 
* Execute circuit and report a single measurement result */ -void qsimTupleRuntime::result_record_output(Result r, OptionalCString tag) +void QsimTupleRuntime::result_record_output(Result r, OptionalCString tag) { execute_if_needed(); Qubit q = sim_.result_to_qubit(r); @@ -60,7 +60,7 @@ void qsimTupleRuntime::result_record_output(Result r, OptionalCString tag) // PRIVATE FUNCTIONS //---------------------------------------------------------------------------// -void qsimTupleRuntime::execute_if_needed() +void QsimTupleRuntime::execute_if_needed() { /* if (sim_.execute_if_needed() && print_accelbuf_) @@ -70,7 +70,7 @@ void qsimTupleRuntime::execute_if_needed() */ } -void qsimTupleRuntime::start_tracking(GroupingType type, +void QsimTupleRuntime::start_tracking(GroupingType type, std::string tag, size_type num_results) { @@ -89,7 +89,7 @@ void qsimTupleRuntime::start_tracking(GroupingType type, } } -void qsimTupleRuntime::push_result(Qubit q) +void QsimTupleRuntime::push_result(Qubit q) { QIREE_EXPECT(valid_); QIREE_EXPECT(qubits_.size() < num_results_); @@ -100,14 +100,14 @@ void qsimTupleRuntime::push_result(Qubit q) } } -void qsimTupleRuntime::print_header(size_type num_distinct) +void QsimTupleRuntime::print_header(size_type num_distinct) { auto name = get_name(); output_ << name << " " << tag_ << " length " << qubits_.size() << " distinct results " << num_distinct << std::endl; } -void qsimTupleRuntime::finish_tuple() +void QsimTupleRuntime::finish_tuple() { // auto counts = sim_.get_marginal_counts(qubits_); std::map counts = {{"0", 0}, {"1", 0}}; // Placeholder diff --git a/src/qirqsim/qsimTupleRuntime.hh b/src/qirqsim/qsimTupleRuntime.hh index fa153f4..d6cafbe 100644 --- a/src/qirqsim/qsimTupleRuntime.hh +++ b/src/qirqsim/qsimTupleRuntime.hh @@ -3,18 +3,18 @@ // See the top-level COPYRIGHT file for details. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception //---------------------------------------------------------------------------// -//! 
\file qirqsim/qsimTupleRuntime.hh +//! \file qirqsim/QsimTupleRuntime.hh //---------------------------------------------------------------------------// #pragma once -#include "qsimQuantum.hh" +#include "QsimQuantum.hh" namespace qiree { /*! * Print per-tuple (or per-array) measurement statistics. (Compare with \ref - * qsimDefaultRuntime.) + * QsimDefaultRuntime.) * * Example: * \code @@ -24,16 +24,16 @@ namespace qiree * \endcode */ -class qsimTupleRuntime final : virtual public RuntimeInterface +class QsimTupleRuntime final : virtual public RuntimeInterface { public: /*! - * Construct an \c qsimTupleRuntime. + * Construct an \c QsimTupleRuntime. * The \c print_accelbuf argument determines whether the qsim \c * AcceleratorBuffer is dumped after execution. */ - qsimTupleRuntime(std::ostream& output, - qsimQuantum& sim, + QsimTupleRuntime(std::ostream& output, + QsimQuantum& sim, bool print_accelbuf = true) : output_(output) , sim_(sim) @@ -67,7 +67,7 @@ class qsimTupleRuntime final : virtual public RuntimeInterface }; std::ostream& output_; - qsimQuantum& sim_; + QsimQuantum& sim_; bool const print_accelbuf_; bool valid_; GroupingType type_; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 716bcbc..af87510 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -61,7 +61,7 @@ endif() #---------------------------------------------------------------------------## if(QIREE_USE_QSIM) - qiree_add_test(qirqsim qsimQuantum) + qiree_add_test(qirqsim QsimQuantum) endif() #---------------------------------------------------------------------------## diff --git a/test/qirqsim/qsimQuantum.test.cc b/test/qirqsim/qsimQuantum.test.cc index 3d29034..066f9bf 100644 --- a/test/qirqsim/qsimQuantum.test.cc +++ b/test/qirqsim/qsimQuantum.test.cc @@ -3,15 +3,15 @@ // See the top-level COPYRIGHT file for details. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception //---------------------------------------------------------------------------// -//! 
\file qirxacc/XaccQuantum.test.cc +//! \file qirxacc/QsimQuantum.test.cc //---------------------------------------------------------------------------// -#include "qirqsim/qsimQuantum.hh" +#include "qirqsim/QsimQuantum.hh" #include #include "qiree/Types.hh" #include "qiree_test.hh" -#include "qirqsim/qsimDefaultRuntime.hh" +#include "qirqsim/QsimDefaultRuntime.hh" namespace qiree { @@ -19,7 +19,7 @@ namespace test { //---------------------------------------------------------------------------// -class qsimQuantumTest : public ::qiree::test::Test +class QsimQuantumTest : public ::qiree::test::Test { protected: void SetUp() override {} @@ -34,7 +34,7 @@ class qsimQuantumTest : public ::qiree::test::Test }; -TEST_F(qsimQuantumTest, sim_dynamicbv) +TEST_F(QsimQuantumTest, sim_dynamicbv) { using Q = Qubit; using R = Result; @@ -43,8 +43,8 @@ TEST_F(qsimQuantumTest, sim_dynamicbv) os << '\n'; // Create a simulator that will write to the string stream - qsimQuantum qsim_sim{os, 1}; - qsimDefaultRuntime qsim_rt{os, qsim_sim}; + QsimQuantum qsim_sim{os, 1}; + QsimDefaultRuntime qsim_rt{os, qsim_sim}; // Call functions in the same sequence that dynamicbv.ll would qsim_sim.set_up([] { @@ -90,7 +90,7 @@ TEST_F(qsimQuantumTest, sim_dynamicbv) qsim_sim.tear_down(); auto result = clean_output(os.str()); EXPECT_EQ(R"( -)", result) << result; // TODO: Modify qsimDefaultRuntime.cc so that it stores a result to be compared here (currently just prints as it goes...) +)", result) << result; // TODO: Modify QsimDefaultRuntime.cc so that it stores a result to be compared here (currently just prints as it goes...) 
} //---------------------------------------------------------------------------// From 8fb40f3f22c7abba35764824b91cb4b48ec07126 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Mon, 25 Nov 2024 20:28:08 -0500 Subject: [PATCH 12/64] Update qsim file names --- src/qirqsim/{qsimDefaultRuntime.cc => QsimDefaultRuntime.cc} | 0 src/qirqsim/{qsimDefaultRuntime.hh => QsimDefaultRuntime.hh} | 0 src/qirqsim/{qsimQuantum.cc => QsimQuantum.cc} | 0 src/qirqsim/{qsimQuantum.hh => QsimQuantum.hh} | 0 src/qirqsim/{qsimTupleRuntime.cc => QsimTupleRuntime.cc} | 0 src/qirqsim/{qsimTupleRuntime.hh => QsimTupleRuntime.hh} | 0 test/qirqsim/{qsimQuantum.test.cc => QsimQuantum.test.cc} | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename src/qirqsim/{qsimDefaultRuntime.cc => QsimDefaultRuntime.cc} (100%) rename src/qirqsim/{qsimDefaultRuntime.hh => QsimDefaultRuntime.hh} (100%) rename src/qirqsim/{qsimQuantum.cc => QsimQuantum.cc} (100%) rename src/qirqsim/{qsimQuantum.hh => QsimQuantum.hh} (100%) rename src/qirqsim/{qsimTupleRuntime.cc => QsimTupleRuntime.cc} (100%) rename src/qirqsim/{qsimTupleRuntime.hh => QsimTupleRuntime.hh} (100%) rename test/qirqsim/{qsimQuantum.test.cc => QsimQuantum.test.cc} (100%) diff --git a/src/qirqsim/qsimDefaultRuntime.cc b/src/qirqsim/QsimDefaultRuntime.cc similarity index 100% rename from src/qirqsim/qsimDefaultRuntime.cc rename to src/qirqsim/QsimDefaultRuntime.cc diff --git a/src/qirqsim/qsimDefaultRuntime.hh b/src/qirqsim/QsimDefaultRuntime.hh similarity index 100% rename from src/qirqsim/qsimDefaultRuntime.hh rename to src/qirqsim/QsimDefaultRuntime.hh diff --git a/src/qirqsim/qsimQuantum.cc b/src/qirqsim/QsimQuantum.cc similarity index 100% rename from src/qirqsim/qsimQuantum.cc rename to src/qirqsim/QsimQuantum.cc diff --git a/src/qirqsim/qsimQuantum.hh b/src/qirqsim/QsimQuantum.hh similarity index 100% rename from src/qirqsim/qsimQuantum.hh rename to src/qirqsim/QsimQuantum.hh diff --git 
a/src/qirqsim/qsimTupleRuntime.cc b/src/qirqsim/QsimTupleRuntime.cc similarity index 100% rename from src/qirqsim/qsimTupleRuntime.cc rename to src/qirqsim/QsimTupleRuntime.cc diff --git a/src/qirqsim/qsimTupleRuntime.hh b/src/qirqsim/QsimTupleRuntime.hh similarity index 100% rename from src/qirqsim/qsimTupleRuntime.hh rename to src/qirqsim/QsimTupleRuntime.hh diff --git a/test/qirqsim/qsimQuantum.test.cc b/test/qirqsim/QsimQuantum.test.cc similarity index 100% rename from test/qirqsim/qsimQuantum.test.cc rename to test/qirqsim/QsimQuantum.test.cc From 25d94333271ec8d9ca3009832ec8408e061726ae Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Mon, 25 Nov 2024 20:28:52 -0500 Subject: [PATCH 13/64] Add examples bell_ccx.ll to examples folder and dynamicbv.ll to test data folder --- examples/bell_ccx.ll | 43 ++++++++++++++++++ test/data/dynamicbv.ll | 101 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 examples/bell_ccx.ll create mode 100644 test/data/dynamicbv.ll diff --git a/examples/bell_ccx.ll b/examples/bell_ccx.ll new file mode 100644 index 0000000..e5b2ea7 --- /dev/null +++ b/examples/bell_ccx.ll @@ -0,0 +1,43 @@ +; ModuleID = 'Bell_ccx' +source_filename = "Bell_ccx" + +%Qubit = type opaque +%Result = type opaque + +define void @main() #0 { +entry: + call void @__quantum__qis__h__body(%Qubit* null) + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) + call void @__quantum__qis__ccx__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*), %Qubit* inttoptr (i64 2 to %Qubit*)) + call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) + call void @__quantum__rt__array_record_output(i64 3, i8* null) + call void 
@__quantum__rt__result_record_output(%Result* null, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 2 to %Result*), i8* null) + ret void +} + +declare void @__quantum__qis__h__body(%Qubit*) + +declare void @__quantum__qis__x__body(%Qubit*) + +declare void @__quantum__qis__ccx__body(%Qubit*, %Qubit*, %Qubit*) + +declare void @__quantum__qis__mz__body(%Qubit*, %Result* writeonly) #1 + +declare void @__quantum__rt__array_record_output(i64, i8*) + +declare void @__quantum__rt__result_record_output(%Result*, i8*) + +attributes #0 = { "entry_point" "num_required_qubits"="3" "num_required_results"="3" "output_labeling_schema" "qir_profiles"="custom" } +attributes #1 = { "irreversible" } + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = !{i32 1, !"qir_major_version", i32 1} +!1 = !{i32 7, !"qir_minor_version", i32 0} +!2 = !{i32 1, !"dynamic_qubit_management", i1 false} +!3 = !{i32 1, !"dynamic_result_management", i1 false} + diff --git a/test/data/dynamicbv.ll b/test/data/dynamicbv.ll new file mode 100644 index 0000000..6d48157 --- /dev/null +++ b/test/data/dynamicbv.ll @@ -0,0 +1,101 @@ +; ModuleID = 'dynamicbv' +source_filename = "dynamicbv" + +; ModuleID = 'BernsteinVazirani' +source_filename = "bv_algorithm" + +%Qubit = type opaque +%Result = type opaque + +define void @main() #0 { +entry: + ; Initialize qubits + call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; set ancillary qubit + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; + + + ; Apply CNOT for bit '1' + call void @__quantum__qis__cnot__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*)) ; kickback phase on q0 + call void @__quantum__qis__h__body(%Qubit* null) ; correcting eigenvalue + + ; Mid-circuit measurement + call void 
@__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the first bit + call i1 @__quantum__qis__read_result__body(%Result* null) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; just to reset ancillary qubit + call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*)) + + ; Output the results + call void @__quantum__rt__array_record_output(i64 2, i8* null) + call void @__quantum__rt__result_record_output(%Result* null, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) + + ; Initialize qubits + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; set ancillary qubit + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; + call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit + + ; Apply Identiry for bit '0' + ; Nothing + + ; Mid-circuit measurement + call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the first bit + call i1 @__quantum__qis__read_result__body(%Result* null) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; just to reset ancillary qubit + call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*)) + + ; Output the results + call void @__quantum__rt__array_record_output(i64 2, i8* null) + call void @__quantum__rt__result_record_output(%Result* null, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) + + ; Initialize qubits + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; set ancillary qubit + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) ; + call void @__quantum__qis__h__body(%Qubit* null) ; apply Hadamard to query qubit + + ; Apply CNOT for bit '1' + call void 
@__quantum__qis__cnot__body(%Qubit* null, %Qubit* inttoptr (i64 1 to %Qubit*)) ; kickback phase on q0 + call void @__quantum__qis__h__body(%Qubit* null) ; correcting eigenvalue + + ; Mid-circuit measurement + call void @__quantum__qis__mz__body(%Qubit* null, %Result* null) ; from this we get the first bit + call i1 @__quantum__qis__read_result__body(%Result* null) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) ; just to reset ancillary qubit + call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*)) + + ; Output the results + call void @__quantum__rt__array_record_output(i64 2, i8* null) + call void @__quantum__rt__result_record_output(%Result* null, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) + + ret void +} + +; Declaration of quantum operations +declare void @__quantum__qis__h__body(%Qubit*) +declare void @__quantum__qis__x__body(%Qubit*) +declare void @__quantum__qis__cnot__body(%Qubit*, %Qubit*) +declare void @__quantum__qis__mz__body(%Qubit*, %Result*) +declare i1 @__quantum__qis__read_result__body(%Result*) + +; Quantum runtime functions for managing qubits and results +declare %Qubit* @__quantum__rt__qubit_allocate() +declare %Result* @__quantum__rt__result_allocate() +declare void @__quantum__rt__qubit_release(%Qubit*) +declare void @__quantum__rt__result_release(%Result*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) +declare void @__quantum__rt__array_record_output(i64, i8*) + + + +attributes #0 = { "entry_point" "num_required_qubits"="2" "num_required_results"="2" "output_labeling_schema" "qir_profiles"="custom" } +attributes #1 = { "irreversible" } + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = !{i32 1, !"qir_major_version", i32 1} +!1 = !{i32 7, !"qir_minor_version", i32 0} +!2 = !{i32 1, !"dynamic_qubit_management", i1 false} +!3 = !{i32 1, !"dynamic_result_management", i1 
false} + From 22d0ac9880b07fb331beb296b9b7a2b923d45f65 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Mon, 25 Nov 2024 20:44:20 -0500 Subject: [PATCH 14/64] Minor formatting --- src/qirqsim/QsimQuantum.cc | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc index 5ae1e9f..d6f1a98 100644 --- a/src/qirqsim/QsimQuantum.cc +++ b/src/qirqsim/QsimQuantum.cc @@ -62,11 +62,8 @@ QsimQuantum::State QsimQuantum::init_state_space() = Factory(numThreads).CreateStateSpace(); // Create the state space State state = state_space.Create(this->num_qubits()); // Create the state // Check if the state is null - if (state_space.IsNull(state)) - { - qsim::IO::errorf( - "not enough memory: is the number of qubits too large?\n"); - } + QIREE_VALIDATE(!state_space.IsNull(state), + << "not enough memory: is the number of qubits too large?"; state_space.SetStateZero(state); // Set the state to zero, TODO: the // initial state is not necessarily zero return state; @@ -76,7 +73,7 @@ QsimQuantum::QsimQuantum(std::ostream& os, size_type shots) : output_(os) {} //---------------------------------------------------------------------------// /* -Prepare to build a quantum circuit for an entry point +* Prepare to build a quantum circuit for an entry point */ void QsimQuantum::set_up(EntryPointAttrs const& attrs) @@ -100,7 +97,7 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs) //---------------------------------------------------------------------------// /* -Complete an execution +* Complete an execution */ void QsimQuantum::repCount(int rep) @@ -117,7 +114,7 @@ void QsimQuantum::tear_down() //---------------------------------------------------------------------------// /* -Reset the qubit +* Reset the qubit */ void QsimQuantum::reset(Qubit q) @@ -127,7 +124,7 @@ void QsimQuantum::reset(Qubit q) 
//----------------------------------------------------------------------------// /* -Read the value of a result. This utilizes the new BufferManager. +* Read the value of a result. This utilizes the new BufferManager. */ QState QsimQuantum::read_result(Result r) @@ -159,8 +156,8 @@ QState QsimQuantum::read_result(Result r) //---------------------------------------------------------------------------// /* -Map a qubit to a result index -(TODO: find how to link the classical register to the quantum register in qsim) +* Map a qubit to a result index +* (TODO: find how to link the classical register to the quantum register in qsim) */ void QsimQuantum::mz(Qubit q, Result r) @@ -179,7 +176,7 @@ void QsimQuantum::mz(Qubit q, Result r) //---------------------------------------------------------------------------// /* -Quantum Instruction Mapping +* Quantum Instruction Mapping */ // 1. Entangling gates From 29be7aef6803a02d4a4f3efb3dfebaa46c02efa8 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Wed, 27 Nov 2024 01:10:26 -0500 Subject: [PATCH 15/64] Resolve seed issue --- app/qir-qsim.cc | 2 +- src/qirqsim/QsimQuantum.cc | 26 +++++++++++--------------- src/qirqsim/QsimQuantum.hh | 5 +++-- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc index e5d3e72..5493f26 100644 --- a/app/qir-qsim.cc +++ b/app/qir-qsim.cc @@ -38,7 +38,7 @@ void run(std::string const& filename, Executor execute{Module{filename}}; // Set up qsim - QsimQuantum sim(std::cout, num_shots); + QsimQuantum sim(std::cout, 0); // Collect the statistics std::unique_ptr rt; diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc index d6f1a98..938fb99 100644 --- a/src/qirqsim/QsimQuantum.cc +++ b/src/qirqsim/QsimQuantum.cc @@ -39,22 +39,18 @@ namespace qiree { //---------------------------------------------------------------------------// /* -Initialize the qsim simulator +* Initialize the qsim simulator */ 
QsimQuantum::State QsimQuantum::init_state_space() -{ // check if StateSpace is the proper type for the output, problably it is - // just State from the Fatory struct. - std::srand(static_cast(std::time(nullptr))); // Seed the - // random - // number - // generator - qsimParam.seed = std::rand(); // Set the seed for qsim parameters - numThreads = std::max( - 1, static_cast(std::thread::hardware_concurrency())); // Get the - // number - // of - // threads +{ + // check if StateSpace is the proper type for the output, problably it is + // just State from the Factory struct. + qsimParam.seed = seed_; + seed_++; + // Get the number of threads + numThreads + = std::max(1, static_cast(std::thread::hardware_concurrency())); qsimParam.max_fused_size = 2; // Set the maximum size of fused gates qsimParam.verbosity = 0; // see verbosity in run_qsim.h // Initialize the qsim simulator @@ -63,13 +59,13 @@ QsimQuantum::State QsimQuantum::init_state_space() State state = state_space.Create(this->num_qubits()); // Create the state // Check if the state is null QIREE_VALIDATE(!state_space.IsNull(state), - << "not enough memory: is the number of qubits too large?"; + << "not enough memory: is the number of qubits too large?"); state_space.SetStateZero(state); // Set the state to zero, TODO: the // initial state is not necessarily zero return state; } -QsimQuantum::QsimQuantum(std::ostream& os, size_type shots) : output_(os) {} +QsimQuantum::QsimQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed) {} //---------------------------------------------------------------------------// /* diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh index db88dd4..fa13a3d 100644 --- a/src/qirqsim/QsimQuantum.hh +++ b/src/qirqsim/QsimQuantum.hh @@ -50,8 +50,8 @@ class QsimQuantum final : virtual public QuantumNotImpl { public: // Define constructors and destructors - QsimQuantum(std::ostream& os, size_type shots); // Construct with number - // of shots + // 
Construct with number of shots + QsimQuantum(std::ostream& os, size_type shots); // Define types using Simulator = qsim::Simulator; @@ -157,6 +157,7 @@ class QsimQuantum final : virtual public QuantumNotImpl qsim::Circuit> q_circuit; // Quantum circuit object Runner::Parameter qsimParam; // Parameters for qsim + unsigned long int seed_; size_t execution_time; // when the quantum operation will be executed bool executed; From 19867f36bb8bfccbdabe3e795b9a51d915c87ed6 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Wed, 27 Nov 2024 01:11:05 -0500 Subject: [PATCH 16/64] Minor formatting --- CMakePresets.json | 92 +++++++++++++++++++++++------------- src/qirqsim/BufferManager.hh | 5 +- 2 files changed, 62 insertions(+), 35 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 287e268..4d9e63b 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -1,33 +1,61 @@ { - "version": 3, - "cmakeMinimumRequired": {"major": 3, "minor": 21, "patch": 0}, - "configurePresets": [ - { - "name": "default", - "displayName": "Automatic options (debug with tests)", - "description": "Dependencies are enabled based on environment probing", - "binaryDir": "${sourceDir}/build-${presetName}", - "generator": "Ninja", - "cacheVariables": { - "BUILD_SHARED_LIBS": {"type": "BOOL", "value": "ON"}, - "CMAKE_BUILD_TYPE": {"type": "STRING", "value": "Debug"}, - "CMAKE_INSTALL_PREFIX": "${sourceDir}/install-${presetName}" - } - } - ], - "buildPresets": [ - { - "name": "default", - "jobs": 0, - "configurePreset": "default" - } - ], - "testPresets": [ - { - "name": "default", - "configurePreset": "default", - "output": {"outputOnFailure": true}, - "execution": {"noTestsAction": "error", "stopOnFailure": false, "jobs": 8} - } - ] -} +"version": 3, + "cmakeMinimumRequired": { + "major": 3, + "minor": 21, + "patch": 0 + }, + "configurePresets": [ + { + "name": "default", + "displayName": "Automatic options (debug with tests)", + "description": 
"Dependencies are enabled based on environment probing", + "binaryDir": "${sourceDir}/build-${presetName}", + "generator": "Ninja", + "cacheVariables": { + "BUILD_SHARED_LIBS": { + "type": "BOOL", + "value": "ON" + }, + "CMAKE_BUILD_TYPE": { + "type": "STRING", + "value": "Debug" + }, + "CMAKE_INSTALL_PREFIX": "${sourceDir}/install-${presetName}" + } + }, + { + "name": "default", + "displayName": "Clang 16.0.6 x86_64-pc-linux-gnu", + "description": "Using compilers: C = /usr/bin/clang-16, CXX = /usr/bin/clang++-16", + "binaryDir": "${sourceDir}/out/build/${presetName}", + "cacheVariables": { + "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", + "CMAKE_C_COMPILER": "/usr/bin/clang-16", + "CMAKE_CXX_COMPILER": "/usr/bin/clang++-16", + "CMAKE_BUILD_TYPE": "Debug" + } + } + ], + "buildPresets": [ + { + "name": "default", + "jobs": 0, + "configurePreset": "default" + } + ], + "testPresets": [ + { + "name": "default", + "configurePreset": "default", + "output": { + "outputOnFailure": true + }, + "execution": { + "noTestsAction": "error", + "stopOnFailure": false, + "jobs": 8 + } + } + ] +} \ No newline at end of file diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh index 9bac1b5..01035f8 100644 --- a/src/qirqsim/BufferManager.hh +++ b/src/qirqsim/BufferManager.hh @@ -6,8 +6,7 @@ //! 
\file qirqsim/BufferManager.hh //---------------------------------------------------------------------------// -#ifndef BUFFER_MANAGER_H -#define BUFFER_MANAGER_H +#pragma once #include #include @@ -49,4 +48,4 @@ class BufferManager std::unordered_map simple_buffer; }; -#endif // BUFFER_MANAGER_H +// BUFFER_MANAGER_H From ede6e4acc00a2e08405d1db5b0d4c8cba390f0c4 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Wed, 27 Nov 2024 01:11:31 -0500 Subject: [PATCH 17/64] Update qsim unit test --- test/qirqsim/QsimQuantum.test.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/qirqsim/QsimQuantum.test.cc b/test/qirqsim/QsimQuantum.test.cc index 066f9bf..e01dca7 100644 --- a/test/qirqsim/QsimQuantum.test.cc +++ b/test/qirqsim/QsimQuantum.test.cc @@ -88,9 +88,12 @@ TEST_F(QsimQuantumTest, sim_dynamicbv) qsim_rt.result_record_output(R{0},""); qsim_rt.result_record_output(R{1},""); qsim_sim.tear_down(); - auto result = clean_output(os.str()); - EXPECT_EQ(R"( -)", result) << result; // TODO: Modify QsimDefaultRuntime.cc so that it stores a result to be compared here (currently just prints as it goes...) 
+ + ASSERT_EQ(2, qsim_sim.num_qubits()); + EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q0", "0").value()); + EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q0", "1").value()); + EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q1", "0").value()); + EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q1", "1").value()); } //---------------------------------------------------------------------------// From 6fe34744c7ec4d7777390dcbc92cc6ab92f1ba52 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:19:09 -0500 Subject: [PATCH 18/64] Refactor to move public to private types --- src/qirqsim/BufferManager.hh | 1 + src/qirqsim/CMakeLists.txt | 2 +- src/qirqsim/QsimQuantum.cc | 148 ++++++++++++++++++++--------------- src/qirqsim/QsimQuantum.hh | 58 ++------------ 4 files changed, 93 insertions(+), 116 deletions(-) diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh index 01035f8..deac635 100644 --- a/src/qirqsim/BufferManager.hh +++ b/src/qirqsim/BufferManager.hh @@ -32,6 +32,7 @@ class BufferManager { public: // Method to update the buffer with a key-value pair + // TODO: Don't use strings here void updateBuffer(std::string const& qubit, std::string const& state, int const& value); diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt index b11018d..c75b054 100644 --- a/src/qirqsim/CMakeLists.txt +++ b/src/qirqsim/CMakeLists.txt @@ -15,7 +15,7 @@ qiree_add_library(qirqsim #Link the qsim library to qiree and any other relevant libraries target_link_libraries(qirqsim PUBLIC QIREE::qiree # Link to qiree - PUBLIC QIREE::qsim #FIXME: make private + PRIVATE QIREE::qsim ) #----------------------------------------------------------------------------# diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc index 938fb99..d40f27c 100644 --- a/src/qirqsim/QsimQuantum.cc +++ b/src/qirqsim/QsimQuantum.cc @@ -37,40 +37,31 @@ namespace qiree { + +struct Factory +{ // Factory class for 
creating simulators in qsim + Factory(unsigned num_threads) : num_threads(num_threads) {} + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + StateSpace CreateStateSpace() const { return StateSpace(num_threads); } + Simulator CreateSimulator() const { return Simulator(num_threads); } + unsigned num_threads; +}; + //---------------------------------------------------------------------------// /* -* Initialize the qsim simulator -*/ + * Initialize the qsim simulator + */ -QsimQuantum::State QsimQuantum::init_state_space() -{ - // check if StateSpace is the proper type for the output, problably it is - // just State from the Factory struct. - qsimParam.seed = seed_; - seed_++; - // Get the number of threads - numThreads - = std::max(1, static_cast(std::thread::hardware_concurrency())); - qsimParam.max_fused_size = 2; // Set the maximum size of fused gates - qsimParam.verbosity = 0; // see verbosity in run_qsim.h - // Initialize the qsim simulator - QsimQuantum::StateSpace state_space - = Factory(numThreads).CreateStateSpace(); // Create the state space - State state = state_space.Create(this->num_qubits()); // Create the state - // Check if the state is null - QIREE_VALIDATE(!state_space.IsNull(state), - << "not enough memory: is the number of qubits too large?"); - state_space.SetStateZero(state); // Set the state to zero, TODO: the - // initial state is not necessarily zero - return state; +QsimQuantum::QsimQuantum(std::ostream& os, unsigned long int seed) + : output_(os), seed_(seed) +{ } -QsimQuantum::QsimQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed) {} - //---------------------------------------------------------------------------// /* -* Prepare to build a quantum circuit for an entry point -*/ + * Prepare to build a quantum circuit for an entry point + */ void QsimQuantum::set_up(EntryPointAttrs const& attrs) { @@ -81,10 +72,28 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs) // (probably 
not true in general) result_to_qubit_.resize(attrs.required_num_results); num_qubits_ = attrs.required_num_qubits; // Set the number of qubits - state_ = std::make_shared(init_state_space()); // Set the state - // space? Maybe. - q_circuit.num_qubits = num_qubits_; // Allocate the number of qubits in - // the circuit + + // Get the number of threads + numThreads + = std::max(1, static_cast(std::thread::hardware_concurrency())); + + // Initialize the qsim simulator + QsimQuantum::StateSpace state_space + = Factory(numThreads).CreateStateSpace(); // Create the state space + + // Create the state + State state = state_space.Create(this->num_qubits()); + // Check if the state is null + QIREE_VALIDATE(!state_space.IsNull(state), + << "not enough memory: is the number of qubits too large?"); + + state_space.SetStateZero(state); // Set the state to zero, TODO: the + // initial state is not necessarily zero + + state_ = std::make_shared(std::move(state)); + + // Allocate the number of qubits in the circuit + q_circuit.num_qubits = num_qubits_; execution_time = 0; // Initialize execution time static unsigned int rep = 0; rep++; @@ -93,8 +102,8 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs) //---------------------------------------------------------------------------// /* -* Complete an execution -*/ + * Complete an execution + */ void QsimQuantum::repCount(int rep) { @@ -104,14 +113,12 @@ void QsimQuantum::repCount(int rep) void QsimQuantum::tear_down() { q_circuit = {}; - q_circuit.num_qubits = num_qubits_; - state_ = std::make_shared(init_state_space()); } //---------------------------------------------------------------------------// /* -* Reset the qubit -*/ + * Reset the qubit + */ void QsimQuantum::reset(Qubit q) { @@ -120,18 +127,43 @@ void QsimQuantum::reset(Qubit q) //----------------------------------------------------------------------------// /* -* Read the value of a result. This utilizes the new BufferManager. -*/ + * Read the value of a result. 
This utilizes the new BufferManager. + */ QState QsimQuantum::read_result(Result r) { - std::string q_index_string = std::to_string(r.value); - auto meas_results = execute_if_needed(); + using Fuser = qsim::MultiQubitGateFuser>; + using Runner = qsim::QSimRunner; + using VecMeas = std::vector; + + // Vector to hold measurement results, this must be empty before running + std::vector meas_results; + std::string stringResult; + + Runner::Parameter qsimParam; // Parameters for qsim + qsimParam.seed = seed_; + seed_++; + qsimParam.max_fused_size = 2; // Set the maximum size of fused gates + qsimParam.verbosity = 0; // see verbosity in run_qsim.h + + // Run the simulation + bool const run_success = Runner::Run(qsimParam, + Factory(numThreads), + q_circuit, + *state_, + meas_results); + + assert(run_success); // Ensure the run was successful + // reset circuit here + q_circuit = {}; + q_circuit.num_qubits = num_qubits_; + if (meas_results.size() == 1 && meas_results[0].bitstring.size() == 1) { auto const bitResult = meas_results[0].bitstring[0]; assert(bitResult == 0 || bitResult == 1); std::string stringResult = std::to_string(bitResult); + std::string q_index_string = std::to_string(r.value); if (stringResult == "1") { manager.updateBuffer("q" + q_index_string, "1", 1); @@ -152,9 +184,10 @@ QState QsimQuantum::read_result(Result r) //---------------------------------------------------------------------------// /* -* Map a qubit to a result index -* (TODO: find how to link the classical register to the quantum register in qsim) -*/ + * Map a qubit to a result index + * (TODO: find how to link the classical register to the quantum register in + * qsim) + */ void QsimQuantum::mz(Qubit q, Result r) { // we don't classical register yet. @@ -164,6 +197,8 @@ void QsimQuantum::mz(Qubit q, Result r) // are {2,3,4,5}, q is less // than num_qubits but not it // is in the set of qubits. 
+ // TODO: maybe not what we want long term + QIREE_EXPECT(q.value == r.value); // Add measurement instruction this->q_circuit.gates.push_back( qsim::gate::Measurement>::Create( @@ -172,8 +207,8 @@ void QsimQuantum::mz(Qubit q, Result r) //---------------------------------------------------------------------------// /* -* Quantum Instruction Mapping -*/ + * Quantum Instruction Mapping + */ // 1. Entangling gates void QsimQuantum::cx(Qubit q1, Qubit q2) @@ -256,26 +291,9 @@ void QsimQuantum::print_accelbuf() // results } -QsimQuantum::VecMeas QsimQuantum::execute_if_needed() +void QsimQuantum::execute_if_needed() { - std::vector meas_results; // Vector to hold - // measurement - // results, this - // must be empty - // before running - std::string stringResult; - static unsigned long int seed = 0; - qsimParam.seed = seed++; - bool const run_success = Runner::Run(qsimParam, - Factory(numThreads), - q_circuit, - *state_, - meas_results); // Run the simulation - assert(run_success); // Ensure the run was successful - // reset circuit here - q_circuit = {}; - q_circuit.num_qubits = num_qubits_; - return meas_results; + QIREE_EXPECT(false); } } // namespace qiree diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh index fa13a3d..f2d10c6 100644 --- a/src/qirqsim/QsimQuantum.hh +++ b/src/qirqsim/QsimQuantum.hh @@ -13,20 +13,6 @@ #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include "BufferManager.hh" #include "qiree/Macros.hh" @@ -34,16 +20,6 @@ #include "qiree/RuntimeInterface.hh" #include "qiree/Types.hh" -struct Factory -{ // Factory class for creating simulators in qsim - Factory(unsigned num_threads) : num_threads(num_threads) {} - using Simulator = qsim::Simulator; - using StateSpace = Simulator::StateSpace; - StateSpace CreateStateSpace() const { return StateSpace(num_threads); } - Simulator CreateSimulator() const { return 
Simulator(num_threads); } - unsigned num_threads; -}; - namespace qiree { class QsimQuantum final : virtual public QuantumNotImpl @@ -53,16 +29,6 @@ class QsimQuantum final : virtual public QuantumNotImpl // Construct with number of shots QsimQuantum(std::ostream& os, size_type shots); - // Define types - using Simulator = qsim::Simulator; - using StateSpace = Simulator::StateSpace; - using State = StateSpace::State; - using Fuser = qsim::MultiQubitGateFuser>; - using Runner = qsim::QSimRunner; - using VecMeas = std::vector; - - State init_state_space(); - QIREE_DELETE_COPY_MOVE(QsimQuantum); // Delete copy and move constructors //!@{ @@ -103,7 +69,7 @@ class QsimQuantum final : virtual public QuantumNotImpl // Run the circuit on the accelerator if we have not already. Returns true // if the circuit was executed. - VecMeas execute_if_needed(); + void execute_if_needed(); void print_accelbuf(); //!@} @@ -132,13 +98,6 @@ class QsimQuantum final : virtual public QuantumNotImpl void z(Qubit) final; //!@} - // Get the quantum circuit - qsim::Circuit> get_circuit() const - { - return q_circuit; - } - // Get the state space - State const& get_state() const { return *state_; } // Update the buffer BufferManager manager; // Number of repetitions @@ -146,17 +105,23 @@ class QsimQuantum final : virtual public QuantumNotImpl void repCount(int rep); private: + //// TYPES //// + + using Simulator = qsim::Simulator; + using StateSpace = Simulator::StateSpace; + using State = StateSpace::State; + enum class Endianness { little, big }; + unsigned numThreads; // Number of threads to use unsigned max_fused_size; // Maximum size of fused gates qsim::Circuit> q_circuit; // Quantum circuit object - Runner::Parameter qsimParam; // Parameters for qsim unsigned long int seed_; size_t execution_time; // when the quantum operation will be executed @@ -171,11 +136,4 @@ class QsimQuantum final : virtual public QuantumNotImpl std::shared_ptr state_; }; -class buffer -{ - public: - buffer(size_t 
size) : size(size) {} - size_t size; -}; - } // namespace qiree From 24a06bb89aac2fe4b3bf79c11e248357f953c438 Mon Sep 17 00:00:00 2001 From: Seth R Johnson Date: Wed, 27 Nov 2024 11:25:12 -0500 Subject: [PATCH 19/64] Unused variables and naming --- src/qirqsim/QsimQuantum.cc | 90 ++++++++++++++++---------------------- src/qirqsim/QsimQuantum.hh | 33 +++----------- 2 files changed, 44 insertions(+), 79 deletions(-) diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc index d40f27c..ce0834a 100644 --- a/src/qirqsim/QsimQuantum.cc +++ b/src/qirqsim/QsimQuantum.cc @@ -49,20 +49,18 @@ struct Factory }; //---------------------------------------------------------------------------// -/* +/*! * Initialize the qsim simulator */ - QsimQuantum::QsimQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed) { } //---------------------------------------------------------------------------// -/* +/*! * Prepare to build a quantum circuit for an entry point */ - void QsimQuantum::set_up(EntryPointAttrs const& attrs) { QIREE_VALIDATE(attrs.required_num_qubits > 0, @@ -74,62 +72,51 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs) num_qubits_ = attrs.required_num_qubits; // Set the number of qubits // Get the number of threads - numThreads + num_threads_ = std::max(1, static_cast(std::thread::hardware_concurrency())); // Initialize the qsim simulator QsimQuantum::StateSpace state_space - = Factory(numThreads).CreateStateSpace(); // Create the state space - + = Factory(num_threads_).CreateStateSpace(); // Create the state space + // Create the state State state = state_space.Create(this->num_qubits()); // Check if the state is null QIREE_VALIDATE(!state_space.IsNull(state), << "not enough memory: is the number of qubits too large?"); - + state_space.SetStateZero(state); // Set the state to zero, TODO: the // initial state is not necessarily zero - state_ = std::make_shared(std::move(state)); + state_ = std::make_shared(std::move(state)); // 
Allocate the number of qubits in the circuit - q_circuit.num_qubits = num_qubits_; - execution_time = 0; // Initialize execution time - static unsigned int rep = 0; - rep++; - this->repCount(rep); + q_circuit.num_qubits = num_qubits_; + gate_index_ = 0; // Initialize execution time } //---------------------------------------------------------------------------// -/* +/*! * Complete an execution */ - -void QsimQuantum::repCount(int rep) -{ - repetition = rep; -} - void QsimQuantum::tear_down() { q_circuit = {}; } //---------------------------------------------------------------------------// -/* +/*! * Reset the qubit */ - void QsimQuantum::reset(Qubit q) { q.value = 0; } //----------------------------------------------------------------------------// -/* +/*! * Read the value of a result. This utilizes the new BufferManager. */ - QState QsimQuantum::read_result(Result r) { using Fuser = qsim::MultiQubitGateFuser>; @@ -147,11 +134,8 @@ QState QsimQuantum::read_result(Result r) qsimParam.verbosity = 0; // see verbosity in run_qsim.h // Run the simulation - bool const run_success = Runner::Run(qsimParam, - Factory(numThreads), - q_circuit, - *state_, - meas_results); + bool const run_success = Runner::Run( + qsimParam, Factory(num_threads_), q_circuit, *state_, meas_results); assert(run_success); // Ensure the run was successful // reset circuit here @@ -183,12 +167,12 @@ QState QsimQuantum::read_result(Result r) } //---------------------------------------------------------------------------// -/* - * Map a qubit to a result index +/*! + * Map a qubit to a result index. + * * (TODO: find how to link the classical register to the quantum register in * qsim) */ - void QsimQuantum::mz(Qubit q, Result r) { // we don't classical register yet. 
QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in the set @@ -201,8 +185,8 @@ void QsimQuantum::mz(Qubit q, Result r) QIREE_EXPECT(q.value == r.value); // Add measurement instruction this->q_circuit.gates.push_back( - qsim::gate::Measurement>::Create( - execution_time++, {this->getQubitIndex(q)})); + qsim::gate::Measurement>::Create(gate_index_++, + {q.value})); } //---------------------------------------------------------------------------// @@ -213,66 +197,66 @@ void QsimQuantum::mz(Qubit q, Result r) // 1. Entangling gates void QsimQuantum::cx(Qubit q1, Qubit q2) { - q_circuit.gates.push_back(qsim::GateCNot::Create( - execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); + q_circuit.gates.push_back( + qsim::GateCNot::Create(gate_index_++, q1.value, q2.value)); } void QsimQuantum::cnot(Qubit q1, Qubit q2) { - q_circuit.gates.push_back(qsim::GateCNot::Create( - execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); + q_circuit.gates.push_back( + qsim::GateCNot::Create(gate_index_++, q1.value, q2.value)); } void QsimQuantum::cz(Qubit q1, Qubit q2) { - q_circuit.gates.push_back(qsim::GateCZ::Create( - execution_time++, this->getQubitIndex(q1), this->getQubitIndex(q2))); + q_circuit.gates.push_back( + qsim::GateCZ::Create(gate_index_++, q1.value, q2.value)); } // 2. 
Local gates void QsimQuantum::h(Qubit q) { q_circuit.gates.push_back( - qsim::GateHd::Create(execution_time++, this->getQubitIndex(q))); + qsim::GateHd::Create(gate_index_++, q.value)); } void QsimQuantum::s(Qubit q) { q_circuit.gates.push_back( - qsim::GateS::Create(execution_time++, this->getQubitIndex(q))); + qsim::GateS::Create(gate_index_++, q.value)); } void QsimQuantum::t(Qubit q) { q_circuit.gates.push_back( - qsim::GateT::Create(execution_time++, this->getQubitIndex(q))); + qsim::GateT::Create(gate_index_++, q.value)); } // 2.1 Pauli gates void QsimQuantum::x(Qubit q) { q_circuit.gates.push_back( - qsim::GateX::Create(execution_time++, this->getQubitIndex(q))); + qsim::GateX::Create(gate_index_++, q.value)); } void QsimQuantum::y(Qubit q) { q_circuit.gates.push_back( - qsim::GateY::Create(execution_time++, this->getQubitIndex(q))); + qsim::GateY::Create(gate_index_++, q.value)); } void QsimQuantum::z(Qubit q) { q_circuit.gates.push_back( - qsim::GateZ::Create(execution_time++, this->getQubitIndex(q))); + qsim::GateZ::Create(gate_index_++, q.value)); } // 2.2 rotation gates void QsimQuantum::rx(double theta, Qubit q) { - q_circuit.gates.push_back(qsim::GateRX::Create( - execution_time++, this->getQubitIndex(q), theta)); + q_circuit.gates.push_back( + qsim::GateRX::Create(gate_index_++, q.value, theta)); } void QsimQuantum::ry(double theta, Qubit q) { - q_circuit.gates.push_back(qsim::GateRY::Create( - execution_time++, this->getQubitIndex(q), theta)); + q_circuit.gates.push_back( + qsim::GateRY::Create(gate_index_++, q.value, theta)); } void QsimQuantum::rz(double theta, Qubit q) { - q_circuit.gates.push_back(qsim::GateRZ::Create( - execution_time++, this->getQubitIndex(q), theta)); + q_circuit.gates.push_back( + qsim::GateRZ::Create(gate_index_++, q.value, theta)); } Qubit QsimQuantum::result_to_qubit(Result r) diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh index f2d10c6..e42507f 100644 --- a/src/qirqsim/QsimQuantum.hh +++ 
b/src/qirqsim/QsimQuantum.hh @@ -22,12 +22,15 @@ namespace qiree { +//---------------------------------------------------------------------------// +/*! + * Create and execute quantum circuits using google Qsim. + */ class QsimQuantum final : virtual public QuantumNotImpl { public: - // Define constructors and destructors // Construct with number of shots - QsimQuantum(std::ostream& os, size_type shots); + QsimQuantum(std::ostream& os, size_type shots); QIREE_DELETE_COPY_MOVE(QsimQuantum); // Delete copy and move constructors @@ -35,12 +38,6 @@ class QsimQuantum final : virtual public QuantumNotImpl //! \name Accessors size_type num_results() const { return result_to_qubit_.size(); } size_type num_qubits() const { return num_qubits_; } - - unsigned getQubitIndex(Qubit q) - { - return static_cast(q.value); // Return the value of the - // qubit - } //!@} //!@{ @@ -63,10 +60,6 @@ class QsimQuantum final : virtual public QuantumNotImpl // Get runtime qubit corresponding to a runtime result Qubit result_to_qubit(Result); - // Wrapper for qsim - // std::map - // get_marginal_counts(std::vector const& qubits); - // Run the circuit on the accelerator if we have not already. Returns true // if the circuit was executed. 
void execute_if_needed(); @@ -100,9 +93,6 @@ class QsimQuantum final : virtual public QuantumNotImpl // Update the buffer BufferManager manager; - // Number of repetitions - int repetition; - void repCount(int rep); private: @@ -112,23 +102,14 @@ class QsimQuantum final : virtual public QuantumNotImpl using StateSpace = Simulator::StateSpace; using State = StateSpace::State; - enum class Endianness - { - little, - big - }; - - unsigned numThreads; // Number of threads to use - unsigned max_fused_size; // Maximum size of fused gates + unsigned num_threads_; // Number of threads to use qsim::Circuit> q_circuit; // Quantum circuit object unsigned long int seed_; - size_t execution_time; // when the quantum operation will be executed + size_t gate_index_; // when the quantum operation will be executed - bool executed; size_type num_qubits_{}; std::vector result_to_qubit_; - Endianness endian_; std::ostream& output_; std::shared_ptr simulator_; From aadbb8a4e586bbbab04b000c81c0a3691caa0d2f Mon Sep 17 00:00:00 2001 From: Seth R Johnson Date: Wed, 27 Nov 2024 11:33:31 -0500 Subject: [PATCH 20/64] Use PIMPL --- src/qirqsim/QsimQuantum.cc | 81 ++++++++++++++++++++------------- src/qirqsim/QsimQuantum.hh | 18 ++++---- src/qirqsim/QsimTupleRuntime.cc | 1 + 3 files changed, 59 insertions(+), 41 deletions(-) diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc index ce0834a..c1e2770 100644 --- a/src/qirqsim/QsimQuantum.cc +++ b/src/qirqsim/QsimQuantum.cc @@ -9,7 +9,6 @@ #include "QsimQuantum.hh" #include -#include #include #include #include @@ -37,26 +36,44 @@ namespace qiree { - -struct Factory -{ // Factory class for creating simulators in qsim +//---------------------------------------------------------------------------// +/*! + * Factory class for creating simulators in qsim. 
+ */ +struct QsimQuantum::Factory +{ Factory(unsigned num_threads) : num_threads(num_threads) {} using Simulator = qsim::Simulator; using StateSpace = Simulator::StateSpace; + StateSpace CreateStateSpace() const { return StateSpace(num_threads); } Simulator CreateSimulator() const { return Simulator(num_threads); } unsigned num_threads; }; +//---------------------------------------------------------------------------// +/*! + * Quantum state and circuit. + */ +struct QsimQuantum::State +{ + qsim::Circuit> circuit; + Factory::StateSpace::State state; +}; + //---------------------------------------------------------------------------// /*! * Initialize the qsim simulator */ QsimQuantum::QsimQuantum(std::ostream& os, unsigned long int seed) - : output_(os), seed_(seed) + : output_(os), seed_(seed), state_{std::make_unique()} { } +//---------------------------------------------------------------------------// +//! Default destructor +QsimQuantum::~QsimQuantum() = default; + //---------------------------------------------------------------------------// /*! 
* Prepare to build a quantum circuit for an entry point @@ -76,11 +93,12 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs) = std::max(1, static_cast(std::thread::hardware_concurrency())); // Initialize the qsim simulator - QsimQuantum::StateSpace state_space - = Factory(num_threads_).CreateStateSpace(); // Create the state space + auto state_space = Factory(num_threads_).CreateStateSpace(); // Create the + // state + // space // Create the state - State state = state_space.Create(this->num_qubits()); + state_->state = state_space.Create(this->num_qubits()); // Check if the state is null QIREE_VALIDATE(!state_space.IsNull(state), << "not enough memory: is the number of qubits too large?"); @@ -88,10 +106,8 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs) state_space.SetStateZero(state); // Set the state to zero, TODO: the // initial state is not necessarily zero - state_ = std::make_shared(std::move(state)); - // Allocate the number of qubits in the circuit - q_circuit.num_qubits = num_qubits_; + state_->circuit.num_qubits = num_qubits_; gate_index_ = 0; // Initialize execution time } @@ -101,7 +117,7 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs) */ void QsimQuantum::tear_down() { - q_circuit = {}; + state_->circuit = {}; } //---------------------------------------------------------------------------// @@ -134,18 +150,21 @@ QState QsimQuantum::read_result(Result r) qsimParam.verbosity = 0; // see verbosity in run_qsim.h // Run the simulation - bool const run_success = Runner::Run( - qsimParam, Factory(num_threads_), q_circuit, *state_, meas_results); + bool const run_success = Runner::Run(qsimParam, + Factory(num_threads_), + state_->circuit, + state_->state, + meas_results); - assert(run_success); // Ensure the run was successful + QIREE_ASSERT(run_success); // Ensure the run was successful // reset circuit here - q_circuit = {}; - q_circuit.num_qubits = num_qubits_; + state_->circuit = {}; + state_->circuit.num_qubits = num_qubits_; if 
(meas_results.size() == 1 && meas_results[0].bitstring.size() == 1) { auto const bitResult = meas_results[0].bitstring[0]; - assert(bitResult == 0 || bitResult == 1); + QIREE_ASSERT(bitResult == 0 || bitResult == 1); std::string stringResult = std::to_string(bitResult); std::string q_index_string = std::to_string(r.value); if (stringResult == "1") @@ -184,7 +203,7 @@ void QsimQuantum::mz(Qubit q, Result r) // TODO: maybe not what we want long term QIREE_EXPECT(q.value == r.value); // Add measurement instruction - this->q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::gate::Measurement>::Create(gate_index_++, {q.value})); } @@ -197,65 +216,65 @@ void QsimQuantum::mz(Qubit q, Result r) // 1. Entangling gates void QsimQuantum::cx(Qubit q1, Qubit q2) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateCNot::Create(gate_index_++, q1.value, q2.value)); } void QsimQuantum::cnot(Qubit q1, Qubit q2) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateCNot::Create(gate_index_++, q1.value, q2.value)); } void QsimQuantum::cz(Qubit q1, Qubit q2) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateCZ::Create(gate_index_++, q1.value, q2.value)); } // 2. 
Local gates void QsimQuantum::h(Qubit q) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateHd::Create(gate_index_++, q.value)); } void QsimQuantum::s(Qubit q) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateS::Create(gate_index_++, q.value)); } void QsimQuantum::t(Qubit q) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateT::Create(gate_index_++, q.value)); } // 2.1 Pauli gates void QsimQuantum::x(Qubit q) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateX::Create(gate_index_++, q.value)); } void QsimQuantum::y(Qubit q) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateY::Create(gate_index_++, q.value)); } void QsimQuantum::z(Qubit q) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateZ::Create(gate_index_++, q.value)); } // 2.2 rotation gates void QsimQuantum::rx(double theta, Qubit q) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateRX::Create(gate_index_++, q.value, theta)); } void QsimQuantum::ry(double theta, Qubit q) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateRY::Create(gate_index_++, q.value, theta)); } void QsimQuantum::rz(double theta, Qubit q) { - q_circuit.gates.push_back( + state_->circuit.gates.push_back( qsim::GateRZ::Create(gate_index_++, q.value, theta)); } diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh index e42507f..1292e8b 100644 --- a/src/qirqsim/QsimQuantum.hh +++ b/src/qirqsim/QsimQuantum.hh @@ -7,9 +7,6 @@ //---------------------------------------------------------------------------// #pragma once -#include -#include -#include #include #include #include @@ -31,6 +28,7 @@ class QsimQuantum final : virtual public QuantumNotImpl public: // Construct with number of shots QsimQuantum(std::ostream& os, size_type shots); + ~QsimQuantum(); QIREE_DELETE_COPY_MOVE(QsimQuantum); // Delete copy and move constructors @@ 
-98,12 +96,12 @@ class QsimQuantum final : virtual public QuantumNotImpl //// TYPES //// - using Simulator = qsim::Simulator; - using StateSpace = Simulator::StateSpace; - using State = StateSpace::State; + struct Factory; + struct State; + + //// DATA //// unsigned num_threads_; // Number of threads to use - qsim::Circuit> q_circuit; // Quantum circuit object unsigned long int seed_; size_t gate_index_; // when the quantum operation will be executed @@ -112,9 +110,9 @@ class QsimQuantum final : virtual public QuantumNotImpl std::vector result_to_qubit_; std::ostream& output_; - std::shared_ptr simulator_; - std::shared_ptr statespace_; - std::shared_ptr state_; + + // Quantum circuit, simulator, and measured results + std::unique_ptr state_; }; } // namespace qiree diff --git a/src/qirqsim/QsimTupleRuntime.cc b/src/qirqsim/QsimTupleRuntime.cc index 34a7440..aa06798 100644 --- a/src/qirqsim/QsimTupleRuntime.cc +++ b/src/qirqsim/QsimTupleRuntime.cc @@ -7,6 +7,7 @@ //---------------------------------------------------------------------------// #include "QsimTupleRuntime.hh" +#include #include "qiree/Assert.hh" namespace qiree From 6f13b04356c134acd0c1e1f0ba2e01b75cac3745 Mon Sep 17 00:00:00 2001 From: Seth R Johnson Date: Wed, 27 Nov 2024 11:38:32 -0500 Subject: [PATCH 21/64] Delete tuple runtime, fix errors --- app/qir-qsim.cc | 32 +++----- src/qirqsim/CMakeLists.txt | 1 - src/qirqsim/QsimQuantum.cc | 16 ++-- src/qirqsim/QsimQuantum.hh | 14 ++-- src/qirqsim/QsimTupleRuntime.cc | 128 -------------------------------- src/qirqsim/QsimTupleRuntime.hh | 93 ----------------------- 6 files changed, 24 insertions(+), 260 deletions(-) delete mode 100644 src/qirqsim/QsimTupleRuntime.cc delete mode 100644 src/qirqsim/QsimTupleRuntime.hh diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc index 5493f26..92f4669 100644 --- a/app/qir-qsim.cc +++ b/app/qir-qsim.cc @@ -12,17 +12,12 @@ #include #include "qiree_version.h" -#include "qiree/Executor.hh" -#include "qiree/Module.hh" -#include 
"qiree/QuantumNotImpl.hh" #include "qiree/Executor.hh" #include "qiree/Module.hh" #include "qiree/QuantumNotImpl.hh" - #include "qirqsim/QsimDefaultRuntime.hh" #include "qirqsim/QsimQuantum.hh" -#include "qirqsim/QsimTupleRuntime.hh" using namespace std::string_view_literals; @@ -36,22 +31,17 @@ void run(std::string const& filename, { // Load the input Executor execute{Module{filename}}; - + // Set up qsim QsimQuantum sim(std::cout, 0); - - // Collect the statistics + + // Collect the statistics std::unique_ptr rt; - //if (group_tuples){ - // rt = std::make_unique( - // std::cout, sim); - //} else { - rt = std::make_unique( - std::cout, sim); - //} + rt = std::make_unique(std::cout, sim); // Run several time = shots (default 1) - for (int i = 0; i < num_shots; i++){ + for (int i = 0; i < num_shots; i++) + { execute(sim, *rt); } @@ -60,7 +50,7 @@ void run(std::string const& filename, std::cout << "-------------------" << std::endl; std::cout << "Number of shots: " << num_shots << std::endl; std::cout << "Number of qubits: " << sim.num_qubits() << std::endl; - + for(int q_index = 0; q_index < sim.num_qubits(); q_index++){ int value_0 = 0; int value_1 = 0; @@ -83,7 +73,7 @@ int main(int argc, char* argv[]) int num_shots{1}; std::string filename; //bool group_tuples{false}; - + CLI::App app; auto* filename_opt @@ -93,16 +83,16 @@ int main(int argc, char* argv[]) auto* nshot_opt = app.add_option("-s,--shots", num_shots, "Number of shots"); nshot_opt->capture_default_str(); - + //app.add_flag("--group-tuples,!--no-group-tuples", // group_tuples, // "Print per-tuple measurement statistics rather than " // "per-qubit"); - + CLI11_PARSE(app, argc, argv); //qiree::app::run(filename, num_shots, group_tuples); qiree::app::run(filename, num_shots); - + return EXIT_SUCCESS; } diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt index c75b054..f0c34d6 100644 --- a/src/qirqsim/CMakeLists.txt +++ b/src/qirqsim/CMakeLists.txt @@ -8,7 +8,6 @@ qiree_add_library(qirqsim 
QsimQuantum.cc QsimDefaultRuntime.cc - QsimTupleRuntime.cc BufferManager.cc ) diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc index c1e2770..b71ccf4 100644 --- a/src/qirqsim/QsimQuantum.cc +++ b/src/qirqsim/QsimQuantum.cc @@ -58,7 +58,7 @@ struct QsimQuantum::Factory struct QsimQuantum::State { qsim::Circuit> circuit; - Factory::StateSpace::State state; + std::optional state; }; //---------------------------------------------------------------------------// @@ -100,11 +100,11 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs) // Create the state state_->state = state_space.Create(this->num_qubits()); // Check if the state is null - QIREE_VALIDATE(!state_space.IsNull(state), + QIREE_VALIDATE(!state_space.IsNull(*state_->state), << "not enough memory: is the number of qubits too large?"); - state_space.SetStateZero(state); // Set the state to zero, TODO: the - // initial state is not necessarily zero + // TODO: initial states shouldn't necessarily be zero + state_space.SetStateZero(*state_->state); // Allocate the number of qubits in the circuit state_->circuit.num_qubits = num_qubits_; @@ -137,7 +137,7 @@ QState QsimQuantum::read_result(Result r) { using Fuser = qsim::MultiQubitGateFuser>; using Runner = qsim::QSimRunner; - using VecMeas = std::vector; + using StateSpace = Factory::StateSpace; // Vector to hold measurement results, this must be empty before running std::vector meas_results; @@ -153,7 +153,7 @@ QState QsimQuantum::read_result(Result r) bool const run_success = Runner::Run(qsimParam, Factory(num_threads_), state_->circuit, - state_->state, + *state_->state, meas_results); QIREE_ASSERT(run_success); // Ensure the run was successful @@ -204,8 +204,8 @@ void QsimQuantum::mz(Qubit q, Result r) QIREE_EXPECT(q.value == r.value); // Add measurement instruction state_->circuit.gates.push_back( - qsim::gate::Measurement>::Create(gate_index_++, - {q.value})); + qsim::gate::Measurement>::Create( + gate_index_++, 
{static_cast(q.value)})); } //---------------------------------------------------------------------------// diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh index 1292e8b..94813e6 100644 --- a/src/qirqsim/QsimQuantum.hh +++ b/src/qirqsim/QsimQuantum.hh @@ -27,7 +27,7 @@ class QsimQuantum final : virtual public QuantumNotImpl { public: // Construct with number of shots - QsimQuantum(std::ostream& os, size_type shots); + QsimQuantum(std::ostream& os, unsigned long int shots); ~QsimQuantum(); QIREE_DELETE_COPY_MOVE(QsimQuantum); // Delete copy and move constructors @@ -101,18 +101,14 @@ class QsimQuantum final : virtual public QuantumNotImpl //// DATA //// - unsigned num_threads_; // Number of threads to use + std::ostream& output_; + unsigned long int seed_{}; + std::unique_ptr state_; - unsigned long int seed_; + unsigned num_threads_{}; // Number of threads to use size_t gate_index_; // when the quantum operation will be executed - size_type num_qubits_{}; std::vector result_to_qubit_; - - std::ostream& output_; - - // Quantum circuit, simulator, and measured results - std::unique_ptr state_; }; } // namespace qiree diff --git a/src/qirqsim/QsimTupleRuntime.cc b/src/qirqsim/QsimTupleRuntime.cc deleted file mode 100644 index aa06798..0000000 --- a/src/qirqsim/QsimTupleRuntime.cc +++ /dev/null @@ -1,128 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. -// See the top-level COPYRIGHT file for details. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -//---------------------------------------------------------------------------// -//! \file qirqsim/QsimTupleRuntime.cc -//---------------------------------------------------------------------------// -#include "QsimTupleRuntime.hh" - -#include -#include "qiree/Assert.hh" - -namespace qiree -{ -//---------------------------------------------------------------------------// -/*! 
- * Initialize the execution environment, resetting qubits. - */ -void QsimTupleRuntime::initialize(OptionalCString env) -{ - if (env) - { - output_ << "Argument to initialize: " << env << std::endl; - } -} - -//---------------------------------------------------------------------------// -/*! - * Execute circuit and mark the following N results as being part of an array - * named tag - */ -void QsimTupleRuntime::array_record_output(size_type s, OptionalCString tag) -{ - execute_if_needed(); - start_tracking(GroupingType::array, tag, s); -} - -//---------------------------------------------------------------------------// -/*! - * Execute circuit and mark the following N results as being part of a tuple - * named tag - */ -void QsimTupleRuntime::tuple_record_output(size_type s, OptionalCString tag) -{ - execute_if_needed(); - start_tracking(GroupingType::tuple, tag, s); -} - -//---------------------------------------------------------------------------// -/*! - * Execute circuit and report a single measurement result - */ -void QsimTupleRuntime::result_record_output(Result r, OptionalCString tag) -{ - execute_if_needed(); - Qubit q = sim_.result_to_qubit(r); - push_result(q); -} - -//---------------------------------------------------------------------------// -// PRIVATE FUNCTIONS -//---------------------------------------------------------------------------// - -void QsimTupleRuntime::execute_if_needed() -{ - /* - if (sim_.execute_if_needed() && print_accelbuf_) - { - sim_.print_accelbuf(); - } - */ -} - -void QsimTupleRuntime::start_tracking(GroupingType type, - std::string tag, - size_type num_results) -{ - QIREE_EXPECT(!valid_); - valid_ = true; - type_ = type; - tag_ = tag; - num_results_ = num_results; - qubits_.clear(); - - if (!num_results_) - { - // Edge case - print_header(0); - valid_ = false; - } -} - -void QsimTupleRuntime::push_result(Qubit q) -{ - QIREE_EXPECT(valid_); - QIREE_EXPECT(qubits_.size() < num_results_); - qubits_.push_back(q); - if 
(qubits_.size() == num_results_) - { - finish_tuple(); - } -} - -void QsimTupleRuntime::print_header(size_type num_distinct) -{ - auto name = get_name(); - output_ << name << " " << tag_ << " length " << qubits_.size() - << " distinct results " << num_distinct << std::endl; -} - -void QsimTupleRuntime::finish_tuple() -{ - // auto counts = sim_.get_marginal_counts(qubits_); - std::map counts = {{"0", 0}, {"1", 0}}; // Placeholder - // for actual - // counts, TODO: - // replace with - // actual counts - print_header(counts.size()); - auto name = get_name(); - for (auto& [bits, count] : counts) - { - output_ << name << " " << tag_ << " result " << bits << " count " - << count << std::endl; - } - valid_ = false; -} -} // namespace qiree diff --git a/src/qirqsim/QsimTupleRuntime.hh b/src/qirqsim/QsimTupleRuntime.hh deleted file mode 100644 index d6cafbe..0000000 --- a/src/qirqsim/QsimTupleRuntime.hh +++ /dev/null @@ -1,93 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. -// See the top-level COPYRIGHT file for details. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -//---------------------------------------------------------------------------// -//! \file qirqsim/QsimTupleRuntime.hh -//---------------------------------------------------------------------------// -#pragma once - -#include "QsimQuantum.hh" - -namespace qiree -{ - -/*! - * Print per-tuple (or per-array) measurement statistics. (Compare with \ref - * QsimDefaultRuntime.) - * - * Example: - * \code - * tuple ret length 2 distinct results 2 - * tuple ret result 00 count 512 - * tuple ret result 11 count 512 - * \endcode - */ - -class QsimTupleRuntime final : virtual public RuntimeInterface -{ - public: - /*! - * Construct an \c QsimTupleRuntime. - * The \c print_accelbuf argument determines whether the qsim \c - * AcceleratorBuffer is dumped after execution. 
- */ - QsimTupleRuntime(std::ostream& output, - QsimQuantum& sim, - bool print_accelbuf = true) - : output_(output) - , sim_(sim) - , print_accelbuf_(print_accelbuf) - , valid_(false) - { - } - - //!@{ - //! \name Runtime interface - // Initialize the execution environment, resetting qubits - void initialize(OptionalCString env) override; - - // Execute circuit and mark the following N results as being part of an - // array named tag - void array_record_output(size_type, OptionalCString tag) final; - - // Execute circuit and mark the following N results as being part of a - // tuple named tag - void tuple_record_output(size_type, OptionalCString) final; - - // Execute circuit and report a single measurement result - void result_record_output(Result result, OptionalCString tag) final; - //!@} - - private: - enum class GroupingType - { - tuple, - array, - }; - - std::ostream& output_; - QsimQuantum& sim_; - bool const print_accelbuf_; - bool valid_; - GroupingType type_; - std::string tag_; - size_type num_results_; - std::vector qubits_; - - void execute_if_needed(); - void - start_tracking(GroupingType type, std::string tag, size_type num_results); - void push_result(Qubit q); - void print_header(size_type num_distinct); - void finish_tuple(); - - inline std::string get_name() - { - return type_ == GroupingType::tuple ? "tuple" - : type_ == GroupingType::array ? 
"array" - : "grouping"; - } -}; - -} // namespace qiree From 35778add90605e1624898aae4e135c48acc7dcb8 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:44:03 -0500 Subject: [PATCH 22/64] Fix qsim test --- test/qirqsim/QsimQuantum.test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/qirqsim/QsimQuantum.test.cc b/test/qirqsim/QsimQuantum.test.cc index e01dca7..21395cc 100644 --- a/test/qirqsim/QsimQuantum.test.cc +++ b/test/qirqsim/QsimQuantum.test.cc @@ -43,7 +43,7 @@ TEST_F(QsimQuantumTest, sim_dynamicbv) os << '\n'; // Create a simulator that will write to the string stream - QsimQuantum qsim_sim{os, 1}; + QsimQuantum qsim_sim{os, 0}; QsimDefaultRuntime qsim_rt{os, qsim_sim}; // Call functions in the same sequence that dynamicbv.ll would From bdd35c413bb5bb21588adf5dd7201b77d92ba34b Mon Sep 17 00:00:00 2001 From: Vicente Date: Mon, 23 Dec 2024 13:18:40 -0500 Subject: [PATCH 23/64] including OutputDistribution into the libs list --- src/qiree/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/qiree/CMakeLists.txt b/src/qiree/CMakeLists.txt index 1c2be37..b19ff14 100644 --- a/src/qiree/CMakeLists.txt +++ b/src/qiree/CMakeLists.txt @@ -19,6 +19,7 @@ qiree_add_library(qiree Module.cc Executor.cc QuantumNotImpl.cc + OutputDistribution.cc ) target_compile_features(qiree PUBLIC cxx_std_17) target_link_libraries(qiree From cb373bad6331a45c13c3fe1abbc2464785fdffbf Mon Sep 17 00:00:00 2001 From: Vicente Date: Mon, 23 Dec 2024 13:19:14 -0500 Subject: [PATCH 24/64] old BufferManager now in qiree namespace --- src/qiree/OutputDistribution.cc | 64 ++++++++++++++++++++++++++++++++ src/qiree/OutputDistribution.hh | 65 +++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) create mode 100644 src/qiree/OutputDistribution.cc create mode 100644 src/qiree/OutputDistribution.hh diff --git a/src/qiree/OutputDistribution.cc b/src/qiree/OutputDistribution.cc new file mode 
100644 index 0000000..0d0297f --- /dev/null +++ b/src/qiree/OutputDistribution.cc @@ -0,0 +1,64 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qiree/Buffer.hh +//---------------------------------------------------------------------------// + +#include "OutputDistribution.hh" + +#include +#include +#include + +namespace qiree +{ + + + +void Buffer::updateBuffer(std::string const& qubit, + std::string const& state, + int const& value) +{ + // Insert or update the key-value pair in the buffer + std::pair searchKey = {qubit, state}; + int current_frequency = 0; + auto it = buffer.find(searchKey); + if (it != buffer.end()) + { + current_frequency = it->second; + } + // Accumulate counts with every shot + buffer[{qubit, state}] = value + current_frequency; +} + +void Buffer::updateBuffer(std::string const& key, int const& value) +{ + // Insert or update the key-value pair in the buffer + simple_buffer[key] = value; +} + +std::optional Buffer::getBufferValue(std::string const& qubit, + std::string const& state) const +{ + std::pair searchKey = {qubit, state}; + auto it = buffer.find(searchKey); + if (it != buffer.end()) + { + return it->second; // Key found + } + return std::nullopt; // Key not found +} + +std::optional Buffer::getBufferValue(std::string const& key) const +{ + auto it = simple_buffer.find(key); + if (it != simple_buffer.end()) + { + return it->second; // Key found + } + return std::nullopt; // Key not found +} + +} // namespace qiree \ No newline at end of file diff --git a/src/qiree/OutputDistribution.hh b/src/qiree/OutputDistribution.hh new file mode 100644 index 0000000..398c88e --- /dev/null +++ b/src/qiree/OutputDistribution.hh @@ -0,0 +1,65 @@ 
+//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qiree/Buffer.hh +//---------------------------------------------------------------------------// + +#pragma once + +#include +#include +#include +#include +#include + +namespace qiree +{ + +// Define a hash function for std::pair + +struct pair_hash +{ + template + std::size_t operator()(std::pair const& pair) const + { + auto hash1 = std::hash{}(pair.first); + auto hash2 = std::hash{}(pair.second); + // Combine the two hash values + return hash1 ^ (hash2 << 1); // Shift and XOR + } +}; + +class Buffer +{ + public: + // Method to update the buffer with a key-value pair + // TODO: Don't use strings here + void updateBuffer(std::string const& qubit, + std::string const& state, + int const& value); + void updateBuffer(std::string const& key, int const& value); + + // Retrieve buffer value for storage or evaluation + std::optional + getBufferValue(std::string const& qubit, std::string const& state) const; + std::optional getBufferValue(std::string const& key) const; + + private: + // Dictionary to store key-value pairs + std::unordered_map, int, pair_hash> buffer; + std::unordered_map simple_buffer; +}; + +// BUFFER_H + +} // namespace qiree + + + + + + + + From 181737d6c7850a6fa388493036289d93044c8b28 Mon Sep 17 00:00:00 2001 From: Vicente Date: Mon, 23 Dec 2024 13:19:47 -0500 Subject: [PATCH 25/64] deleting the old BufferManager --- src/qirqsim/BufferManager.cc | 57 ------------------------------------ src/qirqsim/BufferManager.hh | 52 -------------------------------- 2 files changed, 109 deletions(-) delete mode 100644 src/qirqsim/BufferManager.cc delete mode 100644 src/qirqsim/BufferManager.hh diff --git 
a/src/qirqsim/BufferManager.cc b/src/qirqsim/BufferManager.cc deleted file mode 100644 index b340604..0000000 --- a/src/qirqsim/BufferManager.cc +++ /dev/null @@ -1,57 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. -// See the top-level COPYRIGHT file for details. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -//---------------------------------------------------------------------------// -//! \file qirqsim/BufferManager.hh -//---------------------------------------------------------------------------// - -#include "BufferManager.hh" - -#include -#include -#include - -void BufferManager::updateBuffer(std::string const& qubit, - std::string const& state, - int const& value) -{ - // Insert or update the key-value pair in the buffer - std::pair searchKey = {qubit, state}; - int current_frequency = 0; - auto it = buffer.find(searchKey); - if (it != buffer.end()) - { - current_frequency = it->second; - } - // Accumulate counts with every shot - buffer[{qubit, state}] = value + current_frequency; -} - -void BufferManager::updateBuffer(std::string const& key, int const& value) -{ - // Insert or update the key-value pair in the buffer - simple_buffer[key] = value; -} - -std::optional BufferManager::getBufferValue(std::string const& qubit, - std::string const& state) const -{ - std::pair searchKey = {qubit, state}; - auto it = buffer.find(searchKey); - if (it != buffer.end()) - { - return it->second; // Key found - } - return std::nullopt; // Key not found -} - -std::optional BufferManager::getBufferValue(std::string const& key) const -{ - auto it = simple_buffer.find(key); - if (it != simple_buffer.end()) - { - return it->second; // Key found - } - return std::nullopt; // Key not found -} diff --git a/src/qirqsim/BufferManager.hh b/src/qirqsim/BufferManager.hh deleted file mode 100644 index deac635..0000000 --- a/src/qirqsim/BufferManager.hh +++ 
/dev/null @@ -1,52 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. -// See the top-level COPYRIGHT file for details. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -//---------------------------------------------------------------------------// -//! \file qirqsim/BufferManager.hh -//---------------------------------------------------------------------------// - -#pragma once - -#include -#include -#include -#include -#include - -// Define a hash function for std::pair - -struct pair_hash -{ - template - std::size_t operator()(std::pair const& pair) const - { - auto hash1 = std::hash{}(pair.first); - auto hash2 = std::hash{}(pair.second); - // Combine the two hash values - return hash1 ^ (hash2 << 1); // Shift and XOR - } -}; - -class BufferManager -{ - public: - // Method to update the buffer with a key-value pair - // TODO: Don't use strings here - void updateBuffer(std::string const& qubit, - std::string const& state, - int const& value); - void updateBuffer(std::string const& key, int const& value); - - // Retrieve buffer value for storage or evaluation - std::optional - getBufferValue(std::string const& qubit, std::string const& state) const; - std::optional getBufferValue(std::string const& key) const; - - private: - // Dictionary to store key-value pairs - std::unordered_map, int, pair_hash> buffer; - std::unordered_map simple_buffer; -}; - -// BUFFER_MANAGER_H From a92aacfb7df5acdaa0ab0010edce3a877f6c4c28 Mon Sep 17 00:00:00 2001 From: Vicente Date: Mon, 23 Dec 2024 13:20:43 -0500 Subject: [PATCH 26/64] updating libs list --- src/qirqsim/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/qirqsim/CMakeLists.txt b/src/qirqsim/CMakeLists.txt index f0c34d6..0d81dec 100644 --- a/src/qirqsim/CMakeLists.txt +++ b/src/qirqsim/CMakeLists.txt @@ -8,7 +8,6 @@ qiree_add_library(qirqsim QsimQuantum.cc QsimDefaultRuntime.cc - 
BufferManager.cc ) #Link the qsim library to qiree and any other relevant libraries From 040b834b079a6c3a4079d058127a63f2dd23004b Mon Sep 17 00:00:00 2001 From: Vicente Date: Mon, 23 Dec 2024 13:22:04 -0500 Subject: [PATCH 27/64] updating manager (Buffer) --- src/qirqsim/QsimQuantum.hh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh index 94813e6..cde10a8 100644 --- a/src/qirqsim/QsimQuantum.hh +++ b/src/qirqsim/QsimQuantum.hh @@ -11,11 +11,11 @@ #include #include -#include "BufferManager.hh" #include "qiree/Macros.hh" #include "qiree/QuantumNotImpl.hh" #include "qiree/RuntimeInterface.hh" #include "qiree/Types.hh" +#include "qiree/OutputDistribution.hh" namespace qiree { @@ -90,7 +90,7 @@ class QsimQuantum final : virtual public QuantumNotImpl //!@} // Update the buffer - BufferManager manager; + Buffer manager; private: From c5d2f04da4baa7d7050d638ac7d3a05556b38af5 Mon Sep 17 00:00:00 2001 From: wongey <25296194+wongey@users.noreply.github.com> Date: Tue, 14 Jan 2025 14:13:41 -0500 Subject: [PATCH 28/64] REVERT ME Delete OutputDistribution and qsim app temporarily --- app/CMakeLists.txt | 14 ----- app/qir-qsim.cc | 98 --------------------------------- src/qiree/CMakeLists.txt | 1 - src/qiree/OutputDistribution.cc | 64 --------------------- src/qiree/OutputDistribution.hh | 65 ---------------------- 5 files changed, 242 deletions(-) delete mode 100644 app/qir-qsim.cc delete mode 100644 src/qiree/OutputDistribution.cc delete mode 100644 src/qiree/OutputDistribution.hh diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index 4bf7330..d0640b4 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -14,20 +14,6 @@ FetchContent_Declare( FetchContent_MakeAvailable(cli11_proj) -#-----------------------------------------------------------------------------# -# QSIM FRONT END -#-----------------------------------------------------------------------------# - -if(QIREE_USE_QSIM) - 
qiree_add_executable(qir-qsim - qir-qsim.cc - ) - target_link_libraries(qir-qsim - PUBLIC QIREE::qiree QIREE::qirqsim - PRIVATE CLI11::CLI11 - ) -endif() - #-----------------------------------------------------------------------------# # XACC FRONT END #-----------------------------------------------------------------------------# diff --git a/app/qir-qsim.cc b/app/qir-qsim.cc deleted file mode 100644 index 92f4669..0000000 --- a/app/qir-qsim.cc +++ /dev/null @@ -1,98 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. -// See the top-level COPYRIGHT file for details. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -//---------------------------------------------------------------------------// -//! \file qir-xacc/qir-xacc.cc -//---------------------------------------------------------------------------// -#include -#include -#include -#include -#include - -#include "qiree_version.h" - -#include "qiree/Executor.hh" -#include "qiree/Module.hh" -#include "qiree/QuantumNotImpl.hh" -#include "qirqsim/QsimDefaultRuntime.hh" -#include "qirqsim/QsimQuantum.hh" - -using namespace std::string_view_literals; - -namespace qiree -{ -namespace app -{ -void run(std::string const& filename, - int num_shots) - // bool group_tuples = false) -{ - // Load the input - Executor execute{Module{filename}}; - - // Set up qsim - QsimQuantum sim(std::cout, 0); - - // Collect the statistics - std::unique_ptr rt; - rt = std::make_unique(std::cout, sim); - - // Run several time = shots (default 1) - for (int i = 0; i < num_shots; i++) - { - execute(sim, *rt); - } - - std::cout << std::endl; - std::cout << "Measurement output:" << std::endl; - std::cout << "-------------------" << std::endl; - std::cout << "Number of shots: " << num_shots << std::endl; - std::cout << "Number of qubits: " << sim.num_qubits() << std::endl; - - for(int q_index = 0; q_index < sim.num_qubits(); 
q_index++){ - int value_0 = 0; - int value_1 = 0; - if (auto value = sim.manager.getBufferValue("q"+std::to_string(q_index), "0"); value.has_value()){ value_0 = value.value();} - if (auto value = sim.manager.getBufferValue("q"+std::to_string(q_index), "1"); value.has_value()){ value_1 = value.value();} - std::cout << "q" << q_index << " {0: " << value_0 << "," << " 1: " << value_1 << "}\n"; - } -} - -//---------------------------------------------------------------------------// -} // namespace app -} // namespace qiree - -//---------------------------------------------------------------------------// -/*! - * Execute and run. - */ -int main(int argc, char* argv[]) -{ - int num_shots{1}; - std::string filename; - //bool group_tuples{false}; - - CLI::App app; - - auto* filename_opt - = app.add_option("--input,-i,input", filename, "QIR input file"); - filename_opt->required(); - - auto* nshot_opt - = app.add_option("-s,--shots", num_shots, "Number of shots"); - nshot_opt->capture_default_str(); - - //app.add_flag("--group-tuples,!--no-group-tuples", - // group_tuples, - // "Print per-tuple measurement statistics rather than " - // "per-qubit"); - - CLI11_PARSE(app, argc, argv); - - //qiree::app::run(filename, num_shots, group_tuples); - qiree::app::run(filename, num_shots); - - return EXIT_SUCCESS; -} diff --git a/src/qiree/CMakeLists.txt b/src/qiree/CMakeLists.txt index b19ff14..1c2be37 100644 --- a/src/qiree/CMakeLists.txt +++ b/src/qiree/CMakeLists.txt @@ -19,7 +19,6 @@ qiree_add_library(qiree Module.cc Executor.cc QuantumNotImpl.cc - OutputDistribution.cc ) target_compile_features(qiree PUBLIC cxx_std_17) target_link_libraries(qiree diff --git a/src/qiree/OutputDistribution.cc b/src/qiree/OutputDistribution.cc deleted file mode 100644 index 0d0297f..0000000 --- a/src/qiree/OutputDistribution.cc +++ /dev/null @@ -1,64 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -// Copyright 2024 UT-Battelle, LLC, and other QIR-EE 
developers. -// See the top-level COPYRIGHT file for details. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -//---------------------------------------------------------------------------// -//! \file qiree/Buffer.hh -//---------------------------------------------------------------------------// - -#include "OutputDistribution.hh" - -#include -#include -#include - -namespace qiree -{ - - - -void Buffer::updateBuffer(std::string const& qubit, - std::string const& state, - int const& value) -{ - // Insert or update the key-value pair in the buffer - std::pair searchKey = {qubit, state}; - int current_frequency = 0; - auto it = buffer.find(searchKey); - if (it != buffer.end()) - { - current_frequency = it->second; - } - // Accumulate counts with every shot - buffer[{qubit, state}] = value + current_frequency; -} - -void Buffer::updateBuffer(std::string const& key, int const& value) -{ - // Insert or update the key-value pair in the buffer - simple_buffer[key] = value; -} - -std::optional Buffer::getBufferValue(std::string const& qubit, - std::string const& state) const -{ - std::pair searchKey = {qubit, state}; - auto it = buffer.find(searchKey); - if (it != buffer.end()) - { - return it->second; // Key found - } - return std::nullopt; // Key not found -} - -std::optional Buffer::getBufferValue(std::string const& key) const -{ - auto it = simple_buffer.find(key); - if (it != simple_buffer.end()) - { - return it->second; // Key found - } - return std::nullopt; // Key not found -} - -} // namespace qiree \ No newline at end of file diff --git a/src/qiree/OutputDistribution.hh b/src/qiree/OutputDistribution.hh deleted file mode 100644 index 398c88e..0000000 --- a/src/qiree/OutputDistribution.hh +++ /dev/null @@ -1,65 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. -// See the top-level COPYRIGHT file for details. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -//---------------------------------------------------------------------------// -//! \file qiree/Buffer.hh -//---------------------------------------------------------------------------// - -#pragma once - -#include -#include -#include -#include -#include - -namespace qiree -{ - -// Define a hash function for std::pair - -struct pair_hash -{ - template - std::size_t operator()(std::pair const& pair) const - { - auto hash1 = std::hash{}(pair.first); - auto hash2 = std::hash{}(pair.second); - // Combine the two hash values - return hash1 ^ (hash2 << 1); // Shift and XOR - } -}; - -class Buffer -{ - public: - // Method to update the buffer with a key-value pair - // TODO: Don't use strings here - void updateBuffer(std::string const& qubit, - std::string const& state, - int const& value); - void updateBuffer(std::string const& key, int const& value); - - // Retrieve buffer value for storage or evaluation - std::optional - getBufferValue(std::string const& qubit, std::string const& state) const; - std::optional getBufferValue(std::string const& key) const; - - private: - // Dictionary to store key-value pairs - std::unordered_map, int, pair_hash> buffer; - std::unordered_map simple_buffer; -}; - -// BUFFER_H - -} // namespace qiree - - - - - - - - From 10ed1ae013abdd1aa102a46e4885c71ed867e71d Mon Sep 17 00:00:00 2001 From: Seth R Johnson Date: Tue, 14 Jan 2025 14:18:49 -0500 Subject: [PATCH 29/64] Fix build (some to be reverted) --- CMakePresets.json | 14 +------------- src/qirqsim/QsimDefaultRuntime.cc | 9 ++++++--- src/qirqsim/QsimQuantum.cc | 4 +++- src/qirqsim/QsimQuantum.hh | 4 ---- test/qirqsim/QsimQuantum.test.cc | 2 ++ 5 files changed, 12 insertions(+), 21 deletions(-) diff --git a/CMakePresets.json b/CMakePresets.json index 4d9e63b..8de704f 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -23,18 +23,6 @@ }, "CMAKE_INSTALL_PREFIX": "${sourceDir}/install-${presetName}" } - }, - { - "name": 
"default", - "displayName": "Clang 16.0.6 x86_64-pc-linux-gnu", - "description": "Using compilers: C = /usr/bin/clang-16, CXX = /usr/bin/clang++-16", - "binaryDir": "${sourceDir}/out/build/${presetName}", - "cacheVariables": { - "CMAKE_INSTALL_PREFIX": "${sourceDir}/out/install/${presetName}", - "CMAKE_C_COMPILER": "/usr/bin/clang-16", - "CMAKE_CXX_COMPILER": "/usr/bin/clang++-16", - "CMAKE_BUILD_TYPE": "Debug" - } } ], "buildPresets": [ @@ -58,4 +46,4 @@ } } ] -} \ No newline at end of file +} diff --git a/src/qirqsim/QsimDefaultRuntime.cc b/src/qirqsim/QsimDefaultRuntime.cc index 4ece7c1..d9571f8 100644 --- a/src/qirqsim/QsimDefaultRuntime.cc +++ b/src/qirqsim/QsimDefaultRuntime.cc @@ -32,7 +32,7 @@ void QsimDefaultRuntime::initialize(OptionalCString env) * named tag */ -void QsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag) +void QsimDefaultRuntime::array_record_output(size_type, OptionalCString) { // this->execute_if_needed(); // output_ << "array " << (tag ? tag : "") << " length " << s @@ -45,7 +45,7 @@ void QsimDefaultRuntime::array_record_output(size_type s, OptionalCString tag) * named tag */ -void QsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag) +void QsimDefaultRuntime::tuple_record_output(size_type, OptionalCString) { // this->execute_if_needed(); // output_ << "tuple " << (tag ? tag : "") << " length " << s @@ -56,18 +56,21 @@ void QsimDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag) /*! 
* Execute circuit and report a single measurement result */ -void QsimDefaultRuntime::result_record_output(Result r, OptionalCString tag) +void QsimDefaultRuntime::result_record_output(Result, OptionalCString) { // Access values through the getter // This prints results every time result_record_output is called // Can comment out if only want to see final results +#if 0 if (auto value = sim_.manager.getBufferValue("q" + std::to_string(r.value)); value.has_value()) { std::cout << "q" << std::to_string(r.value) << " : " << value.value() << "\n"; } +#endif + (void)sizeof(sim_); } } // namespace qiree diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc index b71ccf4..c25c297 100644 --- a/src/qirqsim/QsimQuantum.cc +++ b/src/qirqsim/QsimQuantum.cc @@ -133,7 +133,7 @@ void QsimQuantum::reset(Qubit q) /*! * Read the value of a result. This utilizes the new BufferManager. */ -QState QsimQuantum::read_result(Result r) +QState QsimQuantum::read_result(Result) { using Fuser = qsim::MultiQubitGateFuser>; using Runner = qsim::QSimRunner; @@ -161,6 +161,7 @@ QState QsimQuantum::read_result(Result r) state_->circuit = {}; state_->circuit.num_qubits = num_qubits_; +#if 0 if (meas_results.size() == 1 && meas_results[0].bitstring.size() == 1) { auto const bitResult = meas_results[0].bitstring[0]; @@ -182,6 +183,7 @@ QState QsimQuantum::read_result(Result r) { qsim::IO::errorf("Unexpected measurement results encountered."); } +#endif return static_cast(meas_results[0].bitstring[0]); } diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh index cde10a8..83533da 100644 --- a/src/qirqsim/QsimQuantum.hh +++ b/src/qirqsim/QsimQuantum.hh @@ -15,7 +15,6 @@ #include "qiree/QuantumNotImpl.hh" #include "qiree/RuntimeInterface.hh" #include "qiree/Types.hh" -#include "qiree/OutputDistribution.hh" namespace qiree { @@ -89,9 +88,6 @@ class QsimQuantum final : virtual public QuantumNotImpl void z(Qubit) final; //!@} - // Update the buffer - Buffer manager; - private: 
//// TYPES //// diff --git a/test/qirqsim/QsimQuantum.test.cc b/test/qirqsim/QsimQuantum.test.cc index 21395cc..b9ef992 100644 --- a/test/qirqsim/QsimQuantum.test.cc +++ b/test/qirqsim/QsimQuantum.test.cc @@ -90,10 +90,12 @@ TEST_F(QsimQuantumTest, sim_dynamicbv) qsim_sim.tear_down(); ASSERT_EQ(2, qsim_sim.num_qubits()); +#if 0 EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q0", "0").value()); EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q0", "1").value()); EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q1", "0").value()); EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q1", "1").value()); +#endif } //---------------------------------------------------------------------------// From f5f54cc8038539fc78276a332c123cfe3a721b5e Mon Sep 17 00:00:00 2001 From: Seth R Johnson Date: Tue, 14 Jan 2025 14:36:26 -0500 Subject: [PATCH 30/64] Store results in a buffer for post-entrypoint retrieval --- src/qirqsim/QsimQuantum.cc | 89 ++++++++++++++------------------ src/qirqsim/QsimQuantum.hh | 24 ++++++++- test/qirqsim/QsimQuantum.test.cc | 20 ++++--- 3 files changed, 74 insertions(+), 59 deletions(-) diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc index c25c297..3e4011f 100644 --- a/src/qirqsim/QsimQuantum.cc +++ b/src/qirqsim/QsimQuantum.cc @@ -82,10 +82,11 @@ void QsimQuantum::set_up(EntryPointAttrs const& attrs) { QIREE_VALIDATE(attrs.required_num_qubits > 0, << "input is not a quantum program"); + // Resize the result_to_qubit_ vector, based on the required number of // results... 
the idea is to have as many classical registers as qubits // (probably not true in general) - result_to_qubit_.resize(attrs.required_num_results); + results_.resize(attrs.required_num_results); num_qubits_ = attrs.required_num_qubits; // Set the number of qubits // Get the number of threads @@ -129,12 +130,25 @@ void QsimQuantum::reset(Qubit q) q.value = 0; } -//----------------------------------------------------------------------------// +//---------------------------------------------------------------------------// /*! - * Read the value of a result. This utilizes the new BufferManager. + * Map a qubit to a result index. + * + * (TODO: find how to link the classical register to the quantum register in + * qsim) */ -QState QsimQuantum::read_result(Result) +void QsimQuantum::mz(Qubit q, Result r) { + QIREE_EXPECT(q.value < this->num_qubits()); + QIREE_EXPECT(r.value < this->num_results()); + + // Add measurement instruction + state_->circuit.gates.push_back( + qsim::gate::Measurement>::Create( + gate_index_++, {static_cast(q.value)})); + + //// EXECUTE CIRCUIT //// + using Fuser = qsim::MultiQubitGateFuser>; using Runner = qsim::QSimRunner; using StateSpace = Factory::StateSpace; @@ -149,72 +163,47 @@ QState QsimQuantum::read_result(Result) qsimParam.max_fused_size = 2; // Set the maximum size of fused gates qsimParam.verbosity = 0; // see verbosity in run_qsim.h - // Run the simulation + // Run the simulation and check that it passed bool const run_success = Runner::Run(qsimParam, Factory(num_threads_), state_->circuit, *state_->state, meas_results); + QIREE_ASSERT(run_success); + QIREE_VALIDATE( + meas_results.size() == 1 && meas_results[0].bitstring.size() == 1, + << "inconsistent measured results size (" << meas_results.size() + << "), bitstring size"); + + //// RESET CIRCUIT //// - QIREE_ASSERT(run_success); // Ensure the run was successful - // reset circuit here state_->circuit = {}; state_->circuit.num_qubits = num_qubits_; -#if 0 - if 
(meas_results.size() == 1 && meas_results[0].bitstring.size() == 1) - { - auto const bitResult = meas_results[0].bitstring[0]; - QIREE_ASSERT(bitResult == 0 || bitResult == 1); - std::string stringResult = std::to_string(bitResult); - std::string q_index_string = std::to_string(r.value); - if (stringResult == "1") - { - manager.updateBuffer("q" + q_index_string, "1", 1); - manager.updateBuffer("q" + q_index_string, 1); - } - else - { - manager.updateBuffer("q" + q_index_string, "0", 1); - manager.updateBuffer("q" + q_index_string, 0); - } - } - else - { - qsim::IO::errorf("Unexpected measurement results encountered."); - } -#endif - return static_cast(meas_results[0].bitstring[0]); + //// STORE RESULT //// + + auto result = meas_results[0].bitstring[0]; + QIREE_ASSERT(result == 0 || result == 1); + + results_[r.value] = result; } -//---------------------------------------------------------------------------// +//----------------------------------------------------------------------------// /*! - * Map a qubit to a result index. + * Read the value of a result. * - * (TODO: find how to link the classical register to the quantum register in - * qsim) + * \todo We could add assertions to check that we actually measured into the + * given result. */ -void QsimQuantum::mz(Qubit q, Result r) -{ // we don't classical register yet. - QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in the set - // of qubits, e.g., what - // happens if q=5 and qubits - // are {2,3,4,5}, q is less - // than num_qubits but not it - // is in the set of qubits. - // TODO: maybe not what we want long term - QIREE_EXPECT(q.value == r.value); - // Add measurement instruction - state_->circuit.gates.push_back( - qsim::gate::Measurement>::Create( - gate_index_++, {static_cast(q.value)})); +QState QsimQuantum::read_result(Result r) +{ + return this->get_result(r); } //---------------------------------------------------------------------------// /* * Quantum Instruction Mapping */ - // 1. 
Entangling gates void QsimQuantum::cx(Qubit q1, Qubit q2) { diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh index 83533da..8ca79c8 100644 --- a/src/qirqsim/QsimQuantum.hh +++ b/src/qirqsim/QsimQuantum.hh @@ -11,6 +11,7 @@ #include #include +#include "qiree/Assert.hh" #include "qiree/Macros.hh" #include "qiree/QuantumNotImpl.hh" #include "qiree/RuntimeInterface.hh" @@ -33,8 +34,15 @@ class QsimQuantum final : virtual public QuantumNotImpl //!@{ //! \name Accessors - size_type num_results() const { return result_to_qubit_.size(); } + + //! Number of qubits in the circuit size_type num_qubits() const { return num_qubits_; } + + //! Number of classical result registers + size_type num_results() const { return results_.size(); } + + // Get the result from a classical register + inline QState get_result(Result r) const; //!@} //!@{ @@ -88,6 +96,8 @@ class QsimQuantum final : virtual public QuantumNotImpl void z(Qubit) final; //!@} + // + private: //// TYPES //// @@ -100,6 +110,7 @@ class QsimQuantum final : virtual public QuantumNotImpl std::ostream& output_; unsigned long int seed_{}; std::unique_ptr state_; + std::vector results_; unsigned num_threads_{}; // Number of threads to use size_t gate_index_; // when the quantum operation will be executed @@ -107,4 +118,15 @@ class QsimQuantum final : virtual public QuantumNotImpl std::vector result_to_qubit_; }; +//---------------------------------------------------------------------------// +/*! + * Get the result from a classical register. 
+ */ +QState QsimQuantum::get_result(Result r) const +{ + QIREE_EXPECT(r.value < results_.size()); + auto result_bool = static_cast(results_[r.value]); + return static_cast(result_bool); +} + } // namespace qiree diff --git a/test/qirqsim/QsimQuantum.test.cc b/test/qirqsim/QsimQuantum.test.cc index b9ef992..711c2a0 100644 --- a/test/qirqsim/QsimQuantum.test.cc +++ b/test/qirqsim/QsimQuantum.test.cc @@ -53,6 +53,9 @@ TEST_F(QsimQuantumTest, sim_dynamicbv) attrs.required_num_results = 2; return attrs; }()); + ASSERT_EQ(2, qsim_sim.num_qubits()); + ASSERT_EQ(2, qsim_sim.num_results()); + qsim_sim.h(Q{0}); qsim_sim.x(Q{1}); qsim_sim.h(Q{1}); @@ -65,6 +68,9 @@ TEST_F(QsimQuantumTest, sim_dynamicbv) qsim_rt.array_record_output(2,""); qsim_rt.result_record_output(R{0},""); qsim_rt.result_record_output(R{1},""); + EXPECT_EQ(QState::one, qsim_sim.get_result(R{0})); + EXPECT_EQ(QState::one, qsim_sim.get_result(R{1})); + qsim_sim.h(Q{0}); qsim_sim.x(Q{1}); qsim_sim.h(Q{1}); @@ -75,6 +81,9 @@ TEST_F(QsimQuantumTest, sim_dynamicbv) qsim_rt.array_record_output(2,""); qsim_rt.result_record_output(R{0},""); qsim_rt.result_record_output(R{1},""); + EXPECT_EQ(QState::zero, qsim_sim.get_result(R{0})); + EXPECT_EQ(QState::zero, qsim_sim.get_result(R{1})); + qsim_sim.h(Q{0}); qsim_sim.x(Q{1}); qsim_sim.h(Q{1}); @@ -87,15 +96,10 @@ TEST_F(QsimQuantumTest, sim_dynamicbv) qsim_rt.array_record_output(2,""); qsim_rt.result_record_output(R{0},""); qsim_rt.result_record_output(R{1},""); - qsim_sim.tear_down(); + EXPECT_EQ(QState::one, qsim_sim.get_result(R{0})); + EXPECT_EQ(QState::zero, qsim_sim.get_result(R{1})); - ASSERT_EQ(2, qsim_sim.num_qubits()); -#if 0 - EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q0", "0").value()); - EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q0", "1").value()); - EXPECT_EQ(2, qsim_sim.manager.getBufferValue("q1", "0").value()); - EXPECT_EQ(1, qsim_sim.manager.getBufferValue("q1", "1").value()); -#endif + qsim_sim.tear_down(); } 
//---------------------------------------------------------------------------// From 43c5b79b593157a53c76a03ce66d1e1f041997ef Mon Sep 17 00:00:00 2001 From: Seth R Johnson Date: Tue, 14 Jan 2025 14:46:23 -0500 Subject: [PATCH 31/64] Add helper function --- src/qirqsim/QsimQuantum.cc | 43 +++++++++++++++++--------------------- src/qirqsim/QsimQuantum.hh | 5 +++++ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/qirqsim/QsimQuantum.cc b/src/qirqsim/QsimQuantum.cc index 3e4011f..9a15ef0 100644 --- a/src/qirqsim/QsimQuantum.cc +++ b/src/qirqsim/QsimQuantum.cc @@ -207,66 +207,54 @@ QState QsimQuantum::read_result(Result r) // 1. Entangling gates void QsimQuantum::cx(Qubit q1, Qubit q2) { - state_->circuit.gates.push_back( - qsim::GateCNot::Create(gate_index_++, q1.value, q2.value)); + this->add_gate(q1.value, q2.value); } void QsimQuantum::cnot(Qubit q1, Qubit q2) { - state_->circuit.gates.push_back( - qsim::GateCNot::Create(gate_index_++, q1.value, q2.value)); + this->add_gate(q1.value, q2.value); } void QsimQuantum::cz(Qubit q1, Qubit q2) { - state_->circuit.gates.push_back( - qsim::GateCZ::Create(gate_index_++, q1.value, q2.value)); + this->add_gate(q1.value, q2.value); } // 2. 
Local gates void QsimQuantum::h(Qubit q) { - state_->circuit.gates.push_back( - qsim::GateHd::Create(gate_index_++, q.value)); + this->add_gate(q.value); } void QsimQuantum::s(Qubit q) { - state_->circuit.gates.push_back( - qsim::GateS::Create(gate_index_++, q.value)); + this->add_gate(q.value); } void QsimQuantum::t(Qubit q) { - state_->circuit.gates.push_back( - qsim::GateT::Create(gate_index_++, q.value)); + this->add_gate(q.value); } // 2.1 Pauli gates void QsimQuantum::x(Qubit q) { - state_->circuit.gates.push_back( - qsim::GateX::Create(gate_index_++, q.value)); + this->add_gate(q.value); } void QsimQuantum::y(Qubit q) { - state_->circuit.gates.push_back( - qsim::GateY::Create(gate_index_++, q.value)); + this->add_gate(q.value); } void QsimQuantum::z(Qubit q) { - state_->circuit.gates.push_back( - qsim::GateZ::Create(gate_index_++, q.value)); + this->add_gate(q.value); } // 2.2 rotation gates void QsimQuantum::rx(double theta, Qubit q) { - state_->circuit.gates.push_back( - qsim::GateRX::Create(gate_index_++, q.value, theta)); + this->add_gate(q.value, theta); } void QsimQuantum::ry(double theta, Qubit q) { - state_->circuit.gates.push_back( - qsim::GateRY::Create(gate_index_++, q.value, theta)); + this->add_gate(q.value, theta); } void QsimQuantum::rz(double theta, Qubit q) { - state_->circuit.gates.push_back( - qsim::GateRZ::Create(gate_index_++, q.value, theta)); + this->add_gate(q.value, theta); } Qubit QsimQuantum::result_to_qubit(Result r) @@ -290,4 +278,11 @@ void QsimQuantum::execute_if_needed() QIREE_EXPECT(false); } +template class Gate, class... Ts> +void QsimQuantum::add_gate(Ts&&... 
args) +{ + state_->circuit.gates.push_back( + Gate::Create(gate_index_++, std::forward(args)...)); +} + } // namespace qiree diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh index 8ca79c8..0ad6942 100644 --- a/src/qirqsim/QsimQuantum.hh +++ b/src/qirqsim/QsimQuantum.hh @@ -116,6 +116,11 @@ class QsimQuantum final : virtual public QuantumNotImpl size_t gate_index_; // when the quantum operation will be executed size_type num_qubits_{}; std::vector result_to_qubit_; + + //// HELPER FUNCTIONS //// + + template class Gate, class... Ts> + void add_gate(Ts&&... args); }; //---------------------------------------------------------------------------// From 5e32f3e3942993640612f3fa26b2e6bc115db4c6 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Tue, 14 Jan 2025 22:16:17 +0000 Subject: [PATCH 32/64] add qirlightning runtime --- src/qirlightning/CMakeLists.txt | 28 + src/qirlightning/LightningDefaultRuntime.cc | 73 ++ src/qirlightning/LightningDefaultRuntime.hh | 62 + src/qirlightning/LightningQuantum.cc | 195 ++++ src/qirlightning/LightningQuantum.hh | 114 ++ src/qirlightning/catalyst_runtime/.clang-tidy | 232 ++++ src/qirlightning/catalyst_runtime/.gitignore | 3 + .../catalyst_runtime/CMakeLists.txt | 133 +++ src/qirlightning/catalyst_runtime/Makefile | 121 ++ src/qirlightning/catalyst_runtime/README.rst | 118 ++ .../catalyst_runtime/include/DataView.hpp | 148 +++ .../include/DynamicLibraryLoader.hpp | 79 ++ .../catalyst_runtime/include/Exception.hpp | 87 ++ .../include/QuantumDevice.hpp | 364 ++++++ .../catalyst_runtime/include/RuntimeCAPI.h | 112 ++ .../catalyst_runtime/include/Types.h | 165 +++ .../catalyst_runtime/lib/CMakeLists.txt | 3 + .../lib/backend/CMakeLists.txt | 7 + .../lib/backend/common/CacheManager.hpp | 199 ++++ .../lib/backend/common/QubitManager.hpp | 146 +++ .../lib/backend/common/Utils.hpp | 304 +++++ .../catalyst_runtime/lib/capi/CMakeLists.txt | 57 + .../lib/capi/ExecutionContext.hpp | 367 ++++++ 
.../catalyst_runtime/lib/capi/MemRefUtils.hpp | 48 + .../catalyst_runtime/lib/capi/RuntimeCAPI.cpp | 1012 +++++++++++++++++ .../lib/registry/CMakeLists.txt | 33 + .../lib/registry/Registry.cpp | 179 +++ 27 files changed, 4389 insertions(+) create mode 100644 src/qirlightning/CMakeLists.txt create mode 100644 src/qirlightning/LightningDefaultRuntime.cc create mode 100644 src/qirlightning/LightningDefaultRuntime.hh create mode 100644 src/qirlightning/LightningQuantum.cc create mode 100644 src/qirlightning/LightningQuantum.hh create mode 100644 src/qirlightning/catalyst_runtime/.clang-tidy create mode 100644 src/qirlightning/catalyst_runtime/.gitignore create mode 100644 src/qirlightning/catalyst_runtime/CMakeLists.txt create mode 100644 src/qirlightning/catalyst_runtime/Makefile create mode 100644 src/qirlightning/catalyst_runtime/README.rst create mode 100644 src/qirlightning/catalyst_runtime/include/DataView.hpp create mode 100644 src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp create mode 100644 src/qirlightning/catalyst_runtime/include/Exception.hpp create mode 100644 src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp create mode 100644 src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h create mode 100644 src/qirlightning/catalyst_runtime/include/Types.h create mode 100644 src/qirlightning/catalyst_runtime/lib/CMakeLists.txt create mode 100644 src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt create mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp create mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp create mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp create mode 100644 src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt create mode 100644 src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp create mode 100644 src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp create mode 100644 
src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp create mode 100644 src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt create mode 100644 src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt new file mode 100644 index 0000000..0d81dec --- /dev/null +++ b/src/qirlightning/CMakeLists.txt @@ -0,0 +1,28 @@ +#---------------------------------*-CMake-*----------------------------------# +# Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +# See the top-level COPYRIGHT file for details. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#----------------------------------------------------------------------------# + +# Adding qsim as a library to qiree +qiree_add_library(qirqsim + QsimQuantum.cc + QsimDefaultRuntime.cc +) + +#Link the qsim library to qiree and any other relevant libraries +target_link_libraries(qirqsim + PUBLIC QIREE::qiree # Link to qiree + PRIVATE QIREE::qsim +) + +#----------------------------------------------------------------------------# +# HEADERS +#----------------------------------------------------------------------------# + +# Install headers, matching the relevant .hh files for qsim integration +install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/qirqsim" + COMPONENT development + FILES_MATCHING REGEX ".*\\.hh?$" +) diff --git a/src/qirlightning/LightningDefaultRuntime.cc b/src/qirlightning/LightningDefaultRuntime.cc new file mode 100644 index 0000000..2440ee0 --- /dev/null +++ b/src/qirlightning/LightningDefaultRuntime.cc @@ -0,0 +1,73 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! 
\file qirlightning/LightningDefaultRuntime.cc +//---------------------------------------------------------------------------// +#include "LightningDefaultRuntime.hh" + +#include + +#include "qiree/Assert.hh" + +namespace qiree +{ +//---------------------------------------------------------------------------// +/*! + * Initialize the execution environment, resetting qubits. + */ + +void LightningDefaultRuntime::initialize(OptionalCString env) +{ + if (env) + { + output_ << "Argument to initialize: " << env << std::endl; + } +} + +//---------------------------------------------------------------------------// +/*! + * Execute circuit and mark the following N results as being part of an array + * named tag + */ + +void LightningDefaultRuntime::array_record_output(size_type s, OptionalCString tag) +{ + // this->execute_if_needed(); + // output_ << "array " << (tag ? tag : "") << " length " << s + // << std::endl; +} + +//---------------------------------------------------------------------------// +/*! + * Execute circuit and mark the following N results as being part of a tuple + * named tag + */ + +void LightningDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag) +{ + // this->execute_if_needed(); + // output_ << "tuple " << (tag ? tag : "") << " length " << s + // << std::endl; +} + +//---------------------------------------------------------------------------// +/*! 
+ * Execute circuit and report a single measurement result + */ +void LightningDefaultRuntime::result_record_output(Result r, OptionalCString tag) +{ + // Access values through the getter + // This prints results every time result_record_output is called + // Can comment out if only want to see final results + + if (auto value = sim_.manager.getBufferValue("q" + std::to_string(r.value)); + value.has_value()) + { + std::cout << "q" << std::to_string(r.value) << " : " << value.value() + << "\n"; + } +} + +} // namespace qiree diff --git a/src/qirlightning/LightningDefaultRuntime.hh b/src/qirlightning/LightningDefaultRuntime.hh new file mode 100644 index 0000000..cac9c1e --- /dev/null +++ b/src/qirlightning/LightningDefaultRuntime.hh @@ -0,0 +1,62 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qirlightning/LightningDefaultRuntime.hh +//---------------------------------------------------------------------------// +#pragma once + +#include "LightningQuantum.hh" + +namespace qiree +{ + +/*! + * Print per-qubit measurement statistics. + * + * Example for three qubits: + * \code + * Measurement output: + * ------------------- + * Number of shots: 1024 + * Number of qubits: 3 + * q0 {0: 542, 1: 482} + * q1 {0: 521, 1: 503} + * q2 {0: 0, 1: 1024} + * + * \endcode + */ + +class LightningDefaultRuntime final : virtual public RuntimeInterface +{ + public: + /*! + * Construct \c LightningDefaultRuntime. + */ + LightningDefaultRuntime(std::ostream& output, LightningQuantum& sim) + : output_(output), sim_(sim) + { + } + + //!@{ + //! \name Runtime interface + // Initialize the execution environment, resetting qubits + void initialize(OptionalCString env) override; + + //! 
Mark the following N results as being part of an array named tag + void array_record_output(size_type, OptionalCString tag) final; + + //! Mark the following N results as being part of a tuple named tag + void tuple_record_output(size_type, OptionalCString) final; + + // Save one result + void result_record_output(Result result, OptionalCString tag) final; + //!@} + + private: + std::ostream& output_; + LightningQuantum& sim_; +}; + +} // namespace qiree diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc new file mode 100644 index 0000000..3f39825 --- /dev/null +++ b/src/qirlightning/LightningQuantum.cc @@ -0,0 +1,195 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qirlightning/LightningQuantum.cc +//---------------------------------------------------------------------------// + +#include "LightningQuantum.hh" + +#include +#include +#include +#include +#include +#include + +#include "qiree/Assert.hh" + +// Lightning +#include "catalyst_runtime/lib/capi/ExecutionContext.hpp" + +namespace qiree +{ +using namespace Catalyst::Runtime; + +static inline std::shared_ptr loadRTDevice(const std::string &rtd_lib, + const std::string &rtd_name = {}, + const std::string &rtd_kwargs = {}) +{ + ExecutionContext context; + return context.getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs); +} + +//---------------------------------------------------------------------------// +/*! 
+ * Initialize the Lightning simulator + */ +LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) +{ + auto RTDevice = loadDevice("/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning/liblightning_gpu_catalyst.so", "LightningGPUSimulator", ""); + +} + +//---------------------------------------------------------------------------// +//! Default destructor +LightningQuantum::~LightningQuantum() = default; + +//---------------------------------------------------------------------------// +/*! + * Prepare to build a quantum circuit for an entry point + */ +void LightningQuantum::set_up(EntryPointAttrs const& attrs) +{ + QIREE_VALIDATE(attrs.required_num_qubits > 0, + << "input is not a quantum program"); + + num_qubits_ = attrs.required_num_qubits; // Set the number of qubits + + RTDevice->getQuantumDevicePtr()->AllocateQubits(num_qubits_); + +} + +//---------------------------------------------------------------------------// +/*! + * Complete an execution + */ +void LightningQuantum::tear_down() +{ + context->deactivateDevice(RTDevice); + RTDevice = nullptr; +} + +//---------------------------------------------------------------------------// +/*! + * Reset the qubit + */ +void LightningQuantum::reset(Qubit q) +{ + q.value = 0; +} + +//----------------------------------------------------------------------------// +/*! + * Read the value of a result. This utilizes the new BufferManager. + */ +QState LightningQuantum::read_result(Result r) +{ + + return static_cast(meas_results[0].bitstring[0]); +} + +//---------------------------------------------------------------------------// +/*! + * Map a qubit to a result index. + * + * (TODO: find how to link the classical register to the quantum register in + * qsim) + */ +void LightningQuantum::mz(Qubit q, Result r) +{ // we don't classical register yet. 
+ /* QIREE_EXPECT(q.value < this->num_qubits()); */ // TODO: q must be in the set + // of qubits, e.g., what + // happens if q=5 and qubits + // are {2,3,4,5}, q is less + // than num_qubits but not it + // is in the set of qubits. + // TODO: maybe not what we want long term + QIREE_EXPECT(q.value == r.value); + // Add measurement instruction + Measure(q.value, std::nullopt); + // RETURN MEASURE RESULT?? + +} + +//---------------------------------------------------------------------------// +/* + * Quantum Instruction Mapping + */ + +// 1. Entangling gates +void LightningQuantum::cx(Qubit q1, Qubit q2) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("CNOT", {}, {q1.value, q2.value}); +} +void LightningQuantum::cnot(Qubit q1, Qubit q2) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("CNOT", {}, {q1.value, q2.value}); +} +void LightningQuantum::cz(Qubit q1, Qubit q2) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("CZ", {}, {q1.value, q2.value}); +} +// 2. Local gates +void LightningQuantum::h(Qubit q) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {q.value}); +} +void LightningQuantum::s(Qubit q) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("S", {}, {q.value}); +} +void LightningQuantum::t(Qubit q) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("T", {}, {q.value}); +} +// 2.1 Pauli gates +void LightningQuantum::x(Qubit q) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("PauliX", {}, {q.value}); +} +void LightningQuantum::y(Qubit q) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("PauliY", {}, {q.value}); +} +void LightningQuantum::z(Qubit q) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("PauliZ", {}, {q.value}); +} +// 2.2 rotation gates +void LightningQuantum::rx(double theta, Qubit q) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("RX", {theta}, {q.value}); +} +void LightningQuantum::ry(double theta, Qubit q) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("RY", {theta}, {q.value}); +} 
+void LightningQuantum::rz(double theta, Qubit q) +{ + RTDevice->getQuantumDevicePtr()->NamedOperation("RZ", {theta}, {q.value}); +} + +Qubit LightningQuantum::result_to_qubit(Result r) +{ + // TODO: This function is not working. Giving 0 every time. Maybe not + // needed. + /* QIREE_EXPECT(r.value < this->num_results()); */ + return result_to_qubit_[r.value]; // just copied this from the qirxacc, I + // have no idea if we need to do + // something else here +} + +void LightningQuantum::print_accelbuf() +{ + // TODO: to be implemented, we can create a buffer class to store the + // results +} + +void LightningQuantum::execute_if_needed() +{ + /* QIREE_EXPECT(false); */ +} + +} // namespace qiree diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh new file mode 100644 index 0000000..e9b8bb2 --- /dev/null +++ b/src/qirlightning/LightningQuantum.hh @@ -0,0 +1,114 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qirlightning/LightningQuantum.hh +//---------------------------------------------------------------------------// +#pragma once + +#include +#include +#include + +#include "qiree/Macros.hh" +#include "qiree/QuantumNotImpl.hh" +#include "qiree/RuntimeInterface.hh" +#include "qiree/Types.hh" +#include "qiree/OutputDistribution.hh" + +namespace qiree +{ +//---------------------------------------------------------------------------// +/*! + * Create and execute quantum circuits using Pennylane Lightning. 
+ */ +class LightningQuantum final : virtual public QuantumNotImpl +{ + public: + // Construct with number of shots + LightningQuantum(std::ostream& os, unsigned long int shots); + ~LightningQuantum(); + + QIREE_DELETE_COPY_MOVE(LightningQuantum); // Delete copy and move constructors + + //!@{ + //! \name Accessors + size_type num_results() const { return result_to_qubit_.size(); } + size_type num_qubits() const { return num_qubits_; } + //!@} + + //!@{ + //! \name Quantum interface + // Prepare to build a quantum circuit for an entry point + void set_up(EntryPointAttrs const&) override; + + // Complete an execution + void tear_down() override; + + // Map a qubit to a result index + void mz(Qubit, Result) final; + + // Read the value of a result. + QState read_result(Result) final; + //!@} + + //!@{ + //! \name Utilities for runtime + // Get runtime qubit corresponding to a runtime result + Qubit result_to_qubit(Result); + + // Run the circuit on the accelerator if we have not already. Returns true + // if the circuit was executed. + void execute_if_needed(); + + void print_accelbuf(); + //!@} + + //!@{ + //! 
\name Circuit construction + // void ccx(Qubit, Qubit) final; + void ccnot(Qubit, Qubit, Qubit); // TODO: not in examples or qir runner + void cnot(Qubit, Qubit) final; + void cx(Qubit, Qubit) final; + // void cy(Qubit, Qubit) final; + void cz(Qubit, Qubit) final; + void h(Qubit) final; + void reset(Qubit) final; + void rx(double, Qubit) final; + void ry(double, Qubit) final; + void rz(double, Qubit) final; + // void rzz(double, Qubit, Qubit) final; + void s(Qubit) final; + // void s_adj(Qubit) final; + // void swap(Qubit, Qubit) final; + void t(Qubit) final; + // void t_adj(Qubit) final; + void x(Qubit) final; + void y(Qubit) final; + void z(Qubit) final; + //!@} + + // Update the buffer + Buffer manager; + + private: + + //// TYPES //// + + struct Factory; + struct State; + + //// DATA //// + + std::ostream& output_; + unsigned long int seed_{}; + std::unique_ptr state_; + + unsigned num_threads_{}; // Number of threads to use + size_t gate_index_; // when the quantum operation will be executed + size_type num_qubits_{}; + std::vector result_to_qubit_; +}; + +} // namespace qiree diff --git a/src/qirlightning/catalyst_runtime/.clang-tidy b/src/qirlightning/catalyst_runtime/.clang-tidy new file mode 100644 index 0000000..e7ca11f --- /dev/null +++ b/src/qirlightning/catalyst_runtime/.clang-tidy @@ -0,0 +1,232 @@ +--- +Checks: '-*,clang-diagnostic-*,clang-analyzer-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-modernize-avoid-c-arrays,-readability-magic-numbers,hicpp-*,-hicpp-no-array-decay,-hicpp-avoid-c-arrays,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions,-readability-identifier-length' +WarningsAsErrors: '*' +HeaderFilterRegex: '.*' +AnalyzeTemporaryDtors: false +FormatStyle: none +InheritParentConfig: true +User: mlxd +CheckOptions: + - key: modernize-replace-auto-ptr.IncludeStyle + value: llvm + - key: 
performance-move-const-arg.CheckTriviallyCopyableMove + value: 'true' + - key: modernize-use-auto.MinTypeNameLength + value: '5' + - key: readability-static-accessed-through-instance.NameSpecifierNestingThreshold + value: '3' + - key: readability-function-size.VariableThreshold + value: '4294967295' + - key: cert-dcl16-c.NewSuffixes + value: 'L;LL;LU;LLU' + - key: readability-identifier-naming.GetConfigPerFile + value: 'true' + - key: readability-inconsistent-declaration-parameter-name.Strict + value: 'false' + - key: readability-magic-numbers.IgnoredIntegerValues + value: '1;2;3;4;' + - key: modernize-use-default-member-init.UseAssignment + value: 'false' + - key: readability-function-size.NestingThreshold + value: '4294967295' + - key: modernize-use-override.AllowOverrideAndFinal + value: 'false' + - key: readability-function-size.ParameterThreshold + value: '4294967295' + - key: openmp-exception-escape.IgnoredExceptions + value: '' + - key: modernize-pass-by-value.ValuesOnly + value: 'false' + - key: modernize-loop-convert.IncludeStyle + value: llvm + - key: cert-str34-c.DiagnoseSignedUnsignedCharComparisons + value: '0' + - key: readability-identifier-naming.AggressiveDependentMemberLookup + value: 'false' + - key: readability-redundant-smartptr-get.IgnoreMacros + value: 'true' + - key: modernize-use-emplace.TupleTypes + value: '::std::pair;::std::tuple' + - key: modernize-use-emplace.TupleMakeFunctions + value: '::std::make_pair;::std::make_tuple' + - key: modernize-use-nodiscard.ReplacementString + value: '[[nodiscard]]' + - key: modernize-loop-convert.MakeReverseRangeHeader + value: '' + - key: modernize-replace-random-shuffle.IncludeStyle + value: llvm + - key: modernize-use-bool-literals.IgnoreMacros + value: 'true' + - key: google-readability-namespace-comments.ShortNamespaceLines + value: '10' + - key: modernize-avoid-bind.PermissiveParameterList + value: 'false' + - key: modernize-use-override.FinalSpelling + value: final + - key: 
performance-move-constructor-init.IncludeStyle + value: llvm + - key: modernize-loop-convert.UseCxx20ReverseRanges + value: 'true' + - key: modernize-use-noexcept.ReplacementString + value: '' + - key: modernize-use-using.IgnoreMacros + value: 'true' + - key: performance-type-promotion-in-math-fn.IncludeStyle + value: llvm + - key: modernize-loop-convert.NamingStyle + value: CamelCase + - key: modernize-loop-convert.MakeReverseRangeFunction + value: '' + - key: readability-inconsistent-declaration-parameter-name.IgnoreMacros + value: 'true' + - key: performance-no-automatic-move.AllowedTypes + value: '' + - key: performance-for-range-copy.WarnOnAllAutoCopies + value: 'false' + - key: readability-identifier-naming.IgnoreFailedSplit + value: 'false' + - key: modernize-pass-by-value.IncludeStyle + value: llvm + - key: readability-qualified-auto.AddConstToQualified + value: 'true' + - key: readability-simplify-boolean-expr.ChainedConditionalReturn + value: 'false' + - key: readability-else-after-return.WarnOnConditionVariables + value: 'true' + - key: readability-uppercase-literal-suffix.IgnoreMacros + value: 'true' + - key: modernize-use-nullptr.NullMacros + value: 'NULL' + - key: modernize-make-shared.IgnoreMacros + value: 'true' + - key: performance-unnecessary-copy-initialization.AllowedTypes + value: '' + - key: modernize-use-transparent-functors.SafeMode + value: 'false' + - key: modernize-make-shared.IgnoreDefaultInitialization + value: 'true' + - key: modernize-make-shared.IncludeStyle + value: llvm + - key: readability-simplify-boolean-expr.ChainedConditionalAssignment + value: 'false' + - key: cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField + value: '0' + - key: readability-function-size.LineThreshold + value: '4294967295' + - key: performance-inefficient-vector-operation.EnableProto + value: 'false' + - key: modernize-use-override.IgnoreDestructors + value: 'false' + - key: modernize-loop-convert.MaxCopySize + value: '16' + - key: 
modernize-make-shared.MakeSmartPtrFunction + value: 'std::make_shared' + - key: portability-simd-intrinsics.Suggest + value: 'false' + - key: cppcoreguidelines-explicit-virtual-functions.IgnoreDestructors + value: '1' + - key: modernize-make-unique.IgnoreMacros + value: 'true' + - key: modernize-make-shared.MakeSmartPtrFunctionHeader + value: '' + - key: performance-for-range-copy.AllowedTypes + value: '' + - key: readability-redundant-string-init.StringNames + value: '::std::basic_string_view;::std::basic_string' + - key: modernize-make-unique.IgnoreDefaultInitialization + value: 'true' + - key: modernize-use-emplace.ContainersWithPushBack + value: '::std::vector;::std::list;::std::deque' + - key: readability-magic-numbers.IgnoreBitFieldsWidths + value: 'true' + - key: modernize-make-unique.IncludeStyle + value: llvm + - key: readability-braces-around-statements.ShortStatementLines + value: '0' + - key: modernize-use-override.OverrideSpelling + value: override + - key: readability-magic-numbers.IgnoredFloatingPointValues + value: '1.0;100.0;' + - key: performance-inefficient-string-concatenation.StrictMode + value: 'false' + - key: readability-implicit-bool-conversion.AllowPointerConditions + value: 'false' + - key: readability-redundant-declaration.IgnoreMacros + value: 'true' + - key: google-readability-braces-around-statements.ShortStatementLines + value: '1' + - key: modernize-make-unique.MakeSmartPtrFunction + value: 'std::make_unique' + - key: portability-restrict-system-includes.Includes + value: '*' + - key: readability-else-after-return.WarnOnUnfixable + value: 'true' + - key: modernize-use-emplace.IgnoreImplicitConstructors + value: 'false' + - key: modernize-make-unique.MakeSmartPtrFunctionHeader + value: '' + - key: modernize-use-equals-delete.IgnoreMacros + value: 'true' + - key: readability-magic-numbers.IgnoreAllFloatingPointValues + value: 'false' + - key: readability-uppercase-literal-suffix.NewSuffixes + value: '' + - key: 
modernize-loop-convert.MinConfidence + value: reasonable + - key: performance-unnecessary-value-param.AllowedTypes + value: '' + - key: modernize-use-noexcept.UseNoexceptFalse + value: 'true' + - key: google-readability-namespace-comments.SpacesBeforeComments + value: '2' + - key: readability-function-cognitive-complexity.Threshold + value: '100' + - key: readability-function-cognitive-complexity.IgnoreMacros + value: 'true' + - key: cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic + value: '1' + - key: performance-faster-string-find.StringLikeClasses + value: '::std::basic_string;::std::basic_string_view' + - key: readability-function-size.BranchThreshold + value: '4294967295' + - key: readability-implicit-bool-conversion.AllowIntegerConditions + value: 'false' + - key: readability-function-size.StatementThreshold + value: '800' + - key: modernize-use-default-member-init.IgnoreMacros + value: 'true' + - key: llvm-qualified-auto.AddConstToQualified + value: '0' + - key: readability-identifier-naming.IgnoreMainLikeFunctions + value: 'false' + - key: google-readability-function-size.StatementThreshold + value: '800' + - key: llvm-else-after-return.WarnOnConditionVariables + value: '0' + - key: modernize-raw-string-literal.DelimiterStem + value: lit + - key: modernize-use-equals-default.IgnoreMacros + value: 'true' + - key: modernize-raw-string-literal.ReplaceShorterLiterals + value: 'false' + - key: modernize-use-emplace.SmartPointers + value: '::std::shared_ptr;::std::unique_ptr;::std::auto_ptr;::std::weak_ptr' + - key: performance-inefficient-vector-operation.VectorLikeClasses + value: '::std::vector' + - key: modernize-use-auto.RemoveStars + value: 'false' + - key: readability-magic-numbers.IgnorePowersOf2IntegerValues + value: 'true' + - key: portability-simd-intrinsics.Std + value: '' + - key: readability-redundant-member-init.IgnoreBaseInCopyConstructors + value: 'false' + - key: 
performance-unnecessary-value-param.IncludeStyle + value: llvm + - key: modernize-replace-disallow-copy-and-assign-macro.MacroName + value: DISALLOW_COPY_AND_ASSIGN + - key: llvm-else-after-return.WarnOnUnfixable + value: '0' + - key: readability-simplify-subscript-expr.Types + value: '::std::basic_string;::std::basic_string_view;::std::vector;::std::array' +... diff --git a/src/qirlightning/catalyst_runtime/.gitignore b/src/qirlightning/catalyst_runtime/.gitignore new file mode 100644 index 0000000..4258b32 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/.gitignore @@ -0,0 +1,3 @@ +build +build_cov +bin/__pycache__/ diff --git a/src/qirlightning/catalyst_runtime/CMakeLists.txt b/src/qirlightning/catalyst_runtime/CMakeLists.txt new file mode 100644 index 0000000..1651851 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/CMakeLists.txt @@ -0,0 +1,133 @@ +cmake_minimum_required(VERSION 3.26) + +project(catalyst_runtime) +include(FetchContent) +include(ExternalProject) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +# Compiler options +option(ENABLE_CODE_COVERAGE "Enable code coverage" OFF) +option(ENABLE_ADDRESS_SANITIZER "Enable address sanitizer" OFF) +option(RUNTIME_CLANG_TIDY "Enable Clang Tidy" OFF) + +option(ENABLE_OPENQASM "Build OpenQasm backend device" OFF) + +set(CMAKE_VERBOSE_MAKEFILE ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +set(runtime_includes "${PROJECT_SOURCE_DIR}/include") +set(capi_utils_includes "${PROJECT_SOURCE_DIR}/lib/capi") +set(backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/common") + + +# Get LLVM hash to target from source tree. 
+file(READ ../.dep-versions DEPENDENCY_VERSIONS) +string(REGEX MATCH "llvm=([0-9a-f]+)" _ ${DEPENDENCY_VERSIONS}) +set(LLVM_HASH ${CMAKE_MATCH_1}) +message(STATUS "Detected LLVM version - ${LLVM_HASH}") + +FetchContent_Declare( + MLIRRunnerUtils + URL https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/RunnerUtils.h + DOWNLOAD_NO_EXTRACT True + SOURCE_DIR mlir/ExecutionEngine +) + +FetchContent_Declare( + MLIRCRunnerUtils + URL https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h + DOWNLOAD_NO_EXTRACT True + SOURCE_DIR mlir/ExecutionEngine +) + +FetchContent_Declare( + MLIRFloat16Bits + URL https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/Float16bits.h + DOWNLOAD_NO_EXTRACT True + SOURCE_DIR mlir/ExecutionEngine +) + +# Note on pybind11 vs python discovery order: +# If Python is looked for first, then we have to look for all the components needed by pybind11. +# In particular, if pybind11::embed is used, then we need to find both headers (Development.Module) +# and the shared library (Development.Embed) before pybind11 is discovered. +# With the other order PyBind will discover everything it needs. +# Note on flags: +# - PYTHON_EXECUTABLE is a pybind11 specific flag used by its own (legacy) Python discovery process, +# it will not affect find_package(Python) calls. +# - Python_EXECUTABLE is a cmake flag used in find_package(Python) to guide the discovery. +# Note that pybind11 can be made to use find_python (instead of its legacy discovery), and thus +# respect Python_EXECUTABLE), via the PYBIND11_FINDPYTHON flag. + +# Here, we look for the desired Python version early to avoid any problems with mismatched packages. +# The desired Python environment should be specified ahead of time via -DPython_EXECUTABLE=... 
+# The optional component is only used for the C++ test suite (to spin up its own interpreter), +# and requires libpython.so to be available on the system. +find_package(Python REQUIRED + COMPONENTS Interpreter Development.Module + OPTIONAL_COMPONENTS Development.Embed Development.SABIModule +) + +if(RUNTIME_ENABLE_WARNINGS) + message(STATUS "Building with compiler warnings as errors enabled.") + add_compile_options(-Werror -Wall) +endif() + +message(STATUS "ENABLE_OPENQASM is ${ENABLE_OPENQASM}.") + +set(devices_list) +list(APPEND devices_list rtd_null_qubit) +list(APPEND backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/null_qubit") + +if(ENABLE_OPENQASM) + list(APPEND backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/openqasm") + list(APPEND devices_list rtd_openqasm) +endif() + +add_library(catalyst_qir_runtime INTERFACE) + +target_link_libraries(catalyst_qir_runtime INTERFACE ${devices_list} rt_capi) + +target_include_directories(catalyst_qir_runtime INTERFACE + ${runtime_includes} + ${backend_includes} +) + +if(ENABLE_CODE_COVERAGE) + message(STATUS "ENABLE_CODE_COVERAGE is ON.") + if(APPLE) + target_compile_options(catalyst_qir_runtime INTERFACE -fprofile-instr-generate -fcoverage-mapping) + target_link_options(catalyst_qir_runtime INTERFACE -fprofile-instr-generate -fcoverage-mapping) + else() + target_compile_options(catalyst_qir_runtime INTERFACE -fprofile-arcs -ftest-coverage) + target_link_libraries(catalyst_qir_runtime INTERFACE gcov) + endif() +endif() + + +if(ENABLE_ADDRESS_SANITIZER) + message(STATUS "ENABLE_ADDRESS_SANITIZER is ON.") + add_compile_options(-fsanitize=address) + add_link_options(-fsanitize=address) +endif() + +add_subdirectory(lib) +add_subdirectory(tests) + +if(APPLE AND (${CMAKE_SYSTEM_PROCESSOR} STREQUAL arm64)) +# Don't rerun external project everytime we configure the runtime build. 
+if(NOT EXISTS ${CMAKE_BINARY_DIR}/lib/liblapacke.3.dylib) + ExternalProject_Add(lapacke-accelerate + GIT_REPOSITORY https://github.com/lepus2589/accelerate-lapacke.git + GIT_TAG master + PREFIX _lapacke-accelerate + CMAKE_ARGS "--preset accelerate-lapacke32" + "-DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/_lapacke-accelerate/install" + INSTALL_COMMAND ${CMAKE_COMMAND} --build . --target install + COMMAND cp ${CMAKE_BINARY_DIR}/_lapacke-accelerate/install/lib/liblapacke.3.dylib ${CMAKE_BINARY_DIR}/lib + ) + add_dependencies(rt_capi lapacke-accelerate) # automatically build with the runtime +endif() +endif() diff --git a/src/qirlightning/catalyst_runtime/Makefile b/src/qirlightning/catalyst_runtime/Makefile new file mode 100644 index 0000000..55733a4 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/Makefile @@ -0,0 +1,121 @@ +PYTHON?=$(shell which python3) +PYTHON_PREFIX:=$(shell $(PYTHON) -c "import sys; print(sys.prefix)") +PYTHON_VERSION:=$(shell $(PYTHON) -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") +C_COMPILER?=$(shell which clang) +CXX_COMPILER?=$(shell which clang++) +COMPILER_LAUNCHER?=$(shell which ccache) +NPROC?=$(shell python3 -c "import os; print(os.cpu_count())") + +MK_ABSPATH := $(abspath $(lastword $(MAKEFILE_LIST))) +MK_DIR := $(dir $(MK_ABSPATH)) +RT_BUILD_DIR?=$(MK_DIR)/build +CODE_COVERAGE?=OFF +BUILD_TYPE?=RelWithDebInfo +ENABLE_OPENQASM?=ON +ENABLE_ASAN?=OFF + +BUILD_TARGETS := rt_capi rtd_null_qubit +TEST_TARGETS := runner_tests_qir_runtime + +PLATFORM := $(shell uname -s) + +ifeq ($(ENABLE_OPENQASM), ON) + BUILD_TARGETS += rtd_openqasm + TEST_TARGETS += runner_tests_openqasm +endif + +.PHONY: help +help: + @echo "Please use \`make ' where is one of" + @echo " all to build Catalyst Runtime" + @echo " coverage to generate a coverage report using lcov" + @echo " clean to delete all temporary, cache, and build files" + @echo " test to run the Catalyst runtime test suite" + @echo " format [check=1] to apply C++ 
formatter; use with 'check=1' to check instead of modify (requires clang-format)" + @echo " format [version=?] to apply C++ formatter; use with 'version={version}' to run clang-format-{version} instead of clang-format" + @echo " check-tidy to build Catalyst Runtime with RUNTIME_CLANG_TIDY=ON (requires clang-tidy)" + +.PHONY: configure +configure: + @echo "Configure Catalyst Runtime" + + cmake -G Ninja -B $(RT_BUILD_DIR) . \ + -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ + -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=$(RT_BUILD_DIR)/lib \ + -DCMAKE_C_COMPILER=$(C_COMPILER) \ + -DCMAKE_CXX_COMPILER=$(CXX_COMPILER) \ + -DCMAKE_C_COMPILER_LAUNCHER=$(COMPILER_LAUNCHER) \ + -DCMAKE_CXX_COMPILER_LAUNCHER=$(COMPILER_LAUNCHER) \ + -DENABLE_OPENQASM=$(ENABLE_OPENQASM) \ + -DENABLE_CODE_COVERAGE=$(CODE_COVERAGE) \ + -DPython_EXECUTABLE=$(PYTHON) \ + -DENABLE_ADDRESS_SANITIZER=$(ENABLE_ASAN) + +.PHONY: runtime +runtime: configure + cmake --build $(RT_BUILD_DIR) --target $(BUILD_TARGETS) -j$(NPROC) --verbose + +.PHONY: test_runner +test_runner: configure + cmake --build $(RT_BUILD_DIR) --target $(TEST_TARGETS) -j$(NPROC) --verbose + +.PHONY: test +test: CODE_COVERAGE=OFF +test: BUILD_TYPE?=RelWithDebInfo +test: test_runner + @echo "Catalyst runtime test suite - NullQubit" + $(ASAN_COMMAND) $(RT_BUILD_DIR)/tests/runner_tests_qir_runtime +ifeq ($(ENABLE_OPENQASM), ON) + # Test the OpenQasm devices C++ tests + $(ASAN_COMMAND) $(RT_BUILD_DIR)/tests/runner_tests_openqasm +endif + +.PHONY: coverage +coverage: RT_BUILD_DIR := $(RT_BUILD_DIR)_cov +coverage: CODE_COVERAGE=ON +coverage: BUILD_TYPE=Debug +coverage: C_COMPILER=$(shell which gcc) +coverage: CXX_COMPILER=$(shell which g++) +coverage: export LLVM_PROFILE_FILE := $(RT_BUILD_DIR)/tests/%m.profraw +coverage: test_runner + @echo "check C++ code coverage" + $(RT_BUILD_DIR)/tests/runner_tests_qir_runtime +ifeq ($(ENABLE_OPENQASM), ON) + $(RT_BUILD_DIR)/tests/runner_tests_openqasm +endif +ifeq ($(PLATFORM),Linux) + lcov --directory $(RT_BUILD_DIR) -b 
$(MK_DIR)/lib --capture --output-file $(RT_BUILD_DIR)/coverage.info + lcov --remove $(RT_BUILD_DIR)/coverage.info '/usr/*' '*/_deps/*' '*/envs/*' '*/mlir/*' --output-file $(RT_BUILD_DIR)/coverage.info + genhtml $(RT_BUILD_DIR)/coverage.info --output-directory $(RT_BUILD_DIR)/cov -t "Catalyst Runtime C++ Coverage" --num-spaces 4 +else + xcrun llvm-profdata merge $(RT_BUILD_DIR)/tests/*.profraw -o $(RT_BUILD_DIR)/tests/rt_test_coverage.profdata + xcrun llvm-cov show -instr-profile $(RT_BUILD_DIR)/tests/rt_test_coverage.profdata \ + -object $(RT_BUILD_DIR)/tests/runner_tests_openqasm \ + $(RT_BUILD_DIR)/tests/runner_tests_qir_runtime \ + -format=html -output-dir=$(RT_BUILD_DIR)/coverage_html \ + $(MK_DIR)/include $(MK_DIR)/lib $(MK_DIR)/tests +endif + +.PHONY: clean +clean: + @echo "clean build files" + rm -rf $(RT_BUILD_DIR) $(RT_BUILD_DIR)_cov cov coverage.info $(MK_DIR)/BuildTidy + +.PHONY: format +format: +ifdef check + $(PYTHON) ../bin/format.py --check $(if $(version:-=),--cfversion $(version)) . +else + $(PYTHON) ../bin/format.py $(if $(version:-=),--cfversion $(version)) . +endif + +.PHONY: check-tidy +check-tidy: + @echo "build Catalyst Runtime with RUNTIME_CLANG_TIDY=ON" + cmake -G Ninja -B $(MK_DIR)/BuildTidy . \ + -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ + -DCMAKE_C_COMPILER=$(C_COMPILER) \ + -DCMAKE_CXX_COMPILER=$(CXX_COMPILER) \ + -DRUNTIME_CLANG_TIDY=ON + + cmake --build $(MK_DIR)/BuildTidy --target rt_capi -j$(NPROC) diff --git a/src/qirlightning/catalyst_runtime/README.rst b/src/qirlightning/catalyst_runtime/README.rst new file mode 100644 index 0000000..8a881e5 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/README.rst @@ -0,0 +1,118 @@ +.. 
runtime-start-inclusion-marker-do-not-remove + +Catalyst Quantum Runtime +######################## + +The Catalyst Runtime is a C++ QIR runtime that enables the execution of Catalyst-compiled +quantum programs, and is currently backed by `PennyLane-Lightning `_ +state-vector simulators, and `Amazon Braket `__ +devices. Additional hardware support, including QPUs, to come. + +The runtime employs the `QuantumDevice `_ +public interface to support an extensible list of backend devices. This interface comprises two collections of abstract methods: + +- The Qubit management, device shot noise, and quantum tape recording methods are utilized for the implementation of Quantum Runtime (QR) instructions. + +- The quantum operations, observables, measurements, and gradient methods are used to implement Quantum Instruction Set (QIS) instructions. + +A complete list of instructions supported by the runtime can be found in +`RuntimeCAPI.h `_. + +Contents +======== + +The directory is structured as follows: + +- `include `_: + This contains the public header files of the runtime including the ``QuantumDevice`` API + for backend quantum devices and the runtime CAPI. + +- `lib `_: + The core modules of the runtime are structured into ``lib/capi`` and ``lib/backend``. + `lib/capi `_ implements the semantics for + QIR instructions lowered to our custom runtime. `lib/backend `_ + contains implementations of the ``QuantumDevice`` API for backend simulators. + +- `tests `_: + A collection of C++ tests for modules and methods in the runtime. + +Backend Devices +=============== + +New device backends for the runtime can be realized by implementing the quantum device interface. +The following table shows the available devices along with supported features: + +.. 
list-table:: + :widths: 25 25 25 25 + :header-rows: 0 + + * - **Features** + - **PennyLane-Lightning-Qubit** + - **PennyLane-Lightning-Kokkos** and **PennyLane-Lightning-GPU** + - **Amazon-Braket-OpenQasm** + * - Qubit Management + - Dynamic allocation/deallocation + - Static allocation/deallocation + - Static allocation/deallocation + * - Gate Operations + - `Lightning operations `_ + - `Lightning operations `_ without controlled gates support + - `Braket operations `_ + * - Quantum Observables + - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables + - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables + - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, and Tensor Product of Observables + * - Expectation Value + - All observables; Finite-shots supported + - All observables; Finite-shots supported + - All observables; Finite-shots supported + * - Variance + - All observables; Finite-shots supported + - All observables; Finite-shots supported + - All observables; Finite-shots supported + * - Probability + - Only for the computational basis on the supplied qubits; Finite-shots supported + - Only for the computational basis on the supplied qubits; Finite-shots supported + - The computational basis on all active qubits; Finite-shots supported + * - Sampling + - Only for the computational basis on the supplied qubits + - Only for the computational basis on the supplied qubits + - The computational basis on all active qubits; Finite-shots supported + * - Mid-Circuit Measurement + - Only for the computational basis on the supplied qubit + - Only for the computational basis on the supplied qubit + - Not supported + * - Gradient + - The Adjoint-Jacobian method for expectation values on all observables + - The Adjoint-Jacobian method for expectation values on all observables + - Not 
supported + +Requirements +============ + +To build the runtime from source, it is required to have an up to date version of a C/C++ compiler such as gcc or clang +with support for the C++20 standard library. + +Installation +============ + +By default, the runtime builds all supported backend devices. +You can build the runtime with custom devices from the list of Backend Devices. + +You can use ``ENABLE_OPENQASM=OFF`` to disable building the runtime with `Amazon-Braket-OpenQasm `_: + +.. code-block:: console + + make runtime ENABLE_OPENQASM=OFF + +This device currently offers generators for the `OpenQasm3 `_ specification and +`Amazon Braket `__ assembly extension. +Moreover, the generated assembly can be executed on Amazon Braket devices leveraging `amazon-braket-sdk-python `_. + +To check the runtime test suite from the root directory: + +.. code-block:: console + + make test-runtime + +.. runtime-end-inclusion-marker-do-not-remove diff --git a/src/qirlightning/catalyst_runtime/include/DataView.hpp b/src/qirlightning/catalyst_runtime/include/DataView.hpp new file mode 100644 index 0000000..6cf50f2 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/include/DataView.hpp @@ -0,0 +1,148 @@ +// Copyright 2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +/** + * A multi-dimensional view for MemRef-like and std::vector types. 
+ * + * @tparam T The underlying data type + * @tparam R The Rank (R > 0) + * + * @note A forward iterator is implemented in this view for traversing over the entire + * elements of MemRef types rank-by-rank starting from the last dimension (R-1). For example, + * The DataView iterator for MemRef starts from index (0, 0) and traverses elements + * in the following order: + * (0, 0), ..., (0, sizes[1]-1), (1, 0), ..., (1, sizes[1]-1), ... (sizes[0]-1, sizes[1]-1). + */ +template class DataView { + private: + T *data_aligned; + size_t offset; + size_t sizes[R] = {0}; + size_t strides[R] = {0}; + + public: + class iterator { + private: + const DataView &view; + + int64_t loc; // physical index + size_t indices[R] = {0}; + + public: + using iterator_category = std::forward_iterator_tag; // LCOV_EXCL_LINE + using value_type = T; // LCOV_EXCL_LINE + using difference_type = std::ptrdiff_t; // LCOV_EXCL_LINE + using pointer = T *; // LCOV_EXCL_LINE + using reference = T &; // LCOV_EXCL_LINE + + iterator(const DataView &_view, int64_t begin_idx) : view(_view), loc(begin_idx) {} + pointer operator->() const { return &view.data_aligned[loc]; } + reference operator*() const { return view.data_aligned[loc]; } + iterator &operator++() + { + int64_t next_axis = -1; + int64_t idx; + for (int64_t i = R; i > 0; --i) { + idx = i - 1; + if (indices[idx]++ < view.sizes[idx] - 1) { + next_axis = idx; + break; + } + indices[idx] = 0; + loc -= (view.sizes[idx] - 1) * view.strides[idx]; + } + + loc = next_axis == -1 ? -1 : loc + view.strides[next_axis]; + return *this; + } + iterator operator++(int) + { + auto tmp = *this; + int64_t next_axis = -1; + int64_t idx; + for (int64_t i = R; i > 0; --i) { + idx = i - 1; + if (indices[idx]++ < view.sizes[idx] - 1) { + next_axis = idx; + break; + } + indices[idx] = 0; + loc -= (view.sizes[idx] - 1) * view.strides[idx]; + } + + loc = next_axis == -1 ? 
-1 : loc + view.strides[next_axis]; + return tmp; + } + bool operator==(const iterator &other) const + { + return (loc == other.loc && view.data_aligned == other.view.data_aligned); + } + bool operator!=(const iterator &other) const { return !(*this == other); } + }; + + explicit DataView(std::vector &buffer) : data_aligned(buffer.data()), offset(0) + { + static_assert(R == 1, "[Class: DataView] Assertion: R == 1"); + sizes[0] = buffer.size(); + strides[0] = 1; + } + + explicit DataView(T *_data_aligned, size_t _offset, const size_t *_sizes, + const size_t *_strides) + : data_aligned(_data_aligned), offset(_offset) + { + static_assert(R > 0, "[Class: DataView] Assertion: R > 0"); + if (_sizes != nullptr && _strides != nullptr) { + for (size_t i = 0; i < R; i++) { + sizes[i] = _sizes[i]; + strides[i] = _strides[i]; + } + } // else sizes = {0}, strides = {0} + } + + [[nodiscard]] auto size() const -> size_t + { + if (!data_aligned) { + return 0; + } + + size_t tsize = 1; + for (size_t i = 0; i < R; i++) { + tsize *= sizes[i]; + } + return tsize; + } + + template T &operator()(I... idxs) const + { + static_assert(sizeof...(idxs) == R, + "[Class: DataView] Error in Catalyst Runtime: Wrong number of indices"); + size_t indices[] = {static_cast(idxs)...}; + + size_t loc = offset; + for (size_t axis = 0; axis < R; axis++) { + RT_ASSERT(indices[axis] < sizes[axis]); + loc += indices[axis] * strides[axis]; + } + return data_aligned[loc]; + } + + iterator begin() { return iterator{*this, static_cast(offset)}; } + + iterator end() { return iterator{*this, -1}; } +}; diff --git a/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp b/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp new file mode 100644 index 0000000..1c25ab8 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp @@ -0,0 +1,79 @@ +// Copyright 2024 Xanadu Quantum Technologies Inc. 
+ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "Exception.hpp" + +/** + * @brief A utility struct to handle opening, closing and retrieving symbols + * from dynamic shared objects. + */ +struct DynamicLibraryLoader { + void *handle; + + DynamicLibraryLoader(std::string_view library_name, int mode = RTLD_LAZY | RTLD_NODELETE) + { + // Load the shared library + handle = dlopen(library_name.data(), mode); + if (!handle) { + const char *err_msg = dlerror(); + RT_FAIL(err_msg); + } + } + + ~DynamicLibraryLoader() + { + if (handle) { + // TODO: This is non-sensical. + // We are using RTLD_NODELETE, why would calling dlclose have a side-effect? + // Worst of all, the side-effect is not in our code. + // When we have dlclose, everything works well the first time. + // However, when trying to compile a second time, we will find that jaxlib will now + // raise a StopIteration exception. This doesn't really make any sense. + // My guess is that somehow dlclosing here will unload a the StopIteration symbol (?) + // rebind it with another equivalent (but with different id?) + // and then the MLIR python bindings are unable to catch it and stop the iteration and + // it gets propagated upwards. + // + // Is not calling dlclose bad? + // A little bit, although dlclose implies intent and does not create any requirements + // upon the implementation. 
See here: + // https://pubs.opengroup.org/onlinepubs/000095399/functions/dlclose.html + // https://github.com/pybind/pybind11/blob/75e48c5f959b4f0a49d8c664e059b6fb4b497102/include/pybind11/detail/internals.h#L108-L113 + // +#ifndef __APPLE__ + dlclose(handle); +#endif + } + } + + // Get symbol from library + template T getSymbol(std::string_view symbol_name) + { + // Clear any existing errors + dlerror(); + + // Retrieve symbol + T symbol = reinterpret_cast(dlsym(handle, symbol_name.data())); + const char *err_msg = dlerror(); + if (err_msg != nullptr) { + RT_FAIL(err_msg); + } + return symbol; + } +}; diff --git a/src/qirlightning/catalyst_runtime/include/Exception.hpp b/src/qirlightning/catalyst_runtime/include/Exception.hpp new file mode 100644 index 0000000..a76da14 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/include/Exception.hpp @@ -0,0 +1,87 @@ +// Copyright 2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include +#include +#include +#include + +/** + * @brief Macro that throws `RuntimeException` with given message. + */ +#define RT_FAIL(message) Catalyst::Runtime::_abort((message), __FILE__, __LINE__, __func__) + +/** + * @brief Macro that throws `RuntimeException` if expression evaluates + * to true. 
+ */ +#define RT_FAIL_IF(expression, message) \ + if ((expression)) { \ + RT_FAIL(message); \ + } + +/** + * @brief Macro that throws `RuntimeException` with the given expression + * and source location if expression evaluates to false. + */ +#define RT_ASSERT(expression) RT_FAIL_IF(!(expression), "Assertion: " #expression) + +namespace Catalyst::Runtime { + +/** + * @brief This is the general exception thrown by Catalyst for runtime errors + * that is derived from `std::exception`. + */ +class RuntimeException : public std::exception { + private: + const std::string err_msg; + + public: + explicit RuntimeException(std::string msg) noexcept + : err_msg{std::move(msg)} {} // LCOV_EXCL_LINE + ~RuntimeException() override = default; // LCOV_EXCL_LINE + + RuntimeException(const RuntimeException &) = default; + RuntimeException(RuntimeException &&) noexcept = default; + + RuntimeException &operator=(const RuntimeException &) = delete; + RuntimeException &operator=(RuntimeException &&) = delete; + + [[nodiscard]] auto what() const noexcept -> const char * override + { + return err_msg.c_str(); + } // LCOV_EXCL_LINE +}; + +/** + * @brief Throws a `RuntimeException` with the given error message. + * + * @note This is not supposed to be called directly. + */ +[[noreturn]] inline void _abort(const char *message, const char *file_name, size_t line, + const char *function_name) +{ + std::stringstream sstream; + sstream << "[" << file_name << "][Line:" << line << "][Function:" << function_name + << "] Error in Catalyst Runtime: " << message; + + throw RuntimeException(sstream.str()); +} // LCOV_EXCL_LINE + +} // namespace Catalyst::Runtime diff --git a/src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp b/src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp new file mode 100644 index 0000000..ccdb606 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp @@ -0,0 +1,364 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. 
+ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "DataView.hpp" +#include "Types.h" + +// A helper template macro to generate the Factory method by +// calling (kwargs). Check the Custom Devices guideline for details: +// https://docs.pennylane.ai/projects/catalyst/en/stable/dev/custom_devices.html +#define GENERATE_DEVICE_FACTORY(IDENTIFIER, CONSTRUCTOR) \ + extern "C" Catalyst::Runtime::QuantumDevice *IDENTIFIER##Factory(const char *kwargs) \ + { \ + return new CONSTRUCTOR(std::string(kwargs)); \ + } + +namespace Catalyst::Runtime { + +/** + * @brief struct API for backend quantum devices. + * + * This device API contains, + * - a set of methods to manage qubit allocations and deallocations, device shot + * noise, and quantum tape recording as well as reference values for the result + * data-type; these are used to implement Quantum Runtime (QR) instructions. + * + * - a set of methods for quantum operations, observables, measurements, and gradient + * of the device; these are used to implement Quantum Instruction Set (QIS) instructions. 
+ * + */ +struct QuantumDevice { + QuantumDevice() = default; // LCOV_EXCL_LINE + virtual ~QuantumDevice() = default; // LCOV_EXCL_LINE + + QuantumDevice &operator=(const QuantumDevice &) = delete; + QuantumDevice(const QuantumDevice &) = delete; + QuantumDevice(QuantumDevice &&) = delete; + QuantumDevice &operator=(QuantumDevice &&) = delete; + + /** + * @brief Allocate a qubit. + * + * @return `QubitIdType` + */ + virtual auto AllocateQubit() -> QubitIdType = 0; + + /** + * @brief Allocate a vector of qubits. + * + * @param num_qubits The number of qubits to allocate. + * + * @return `std::vector` + */ + virtual auto AllocateQubits(size_t num_qubits) -> std::vector = 0; + + /** + * @brief Release a qubit. + * + * @param qubit The id of the qubit + */ + virtual void ReleaseQubit(QubitIdType qubit) = 0; + + /** + * @brief Release all qubits. + */ + virtual void ReleaseAllQubits() = 0; + + /** + * @brief Get the number of allocated qubits. + * + * @return `size_t` + */ + [[nodiscard]] virtual auto GetNumQubits() const -> size_t = 0; + + /** + * @brief Set the number of device shots. + * + * @param shots The number of noise shots + */ + virtual void SetDeviceShots(size_t shots) = 0; + + /** + * @brief Get the number of device shots. + * + * @return `size_t` + */ + [[nodiscard]] virtual auto GetDeviceShots() const -> size_t = 0; + + /** + * @brief Set the PRNG of the device. + * + * The Catalyst runtime enables seeded program execution on non-hardware devices. + * A random number generator instance is managed by the runtime to predictably + * generate results for non-deterministic programs, such as those involving `Measure` + * calls. + * Devices implementing support for this feature do not need to use the provided + * PRNG instance as their sole source of random numbers, but it is expected that + * the same instance state will predictably and reproducibly generate the same + * program results. 
It is also expected that the provided PRNG state is evolved + * sufficiently so that two device executions sharing the same instance do not produce + * identical results. + * The provided PRNG instance is not thread-locked, and devices wishing to share it + * across threads will need to provide their own thread-safety. + * + * @param gen The std::mt19937 PRNG object. + */ + virtual void SetDevicePRNG([[maybe_unused]] std::mt19937 *gen){}; + + /** + * @brief Start recording a quantum tape if provided. + * + * @note This is backed by the `Catalyst::Runtime::CacheManager` property in + * the device implementation. + */ + virtual void StartTapeRecording() = 0; + + /** + * @brief Stop recording a quantum tape if provided. + * + * @note This is backed by the `Catalyst::Runtime::CacheManager` property in + * the device implementation. + */ + virtual void StopTapeRecording() = 0; + + /** + * @brief Result value for "Zero" used in the measurement process. + * + * @return `Result` + */ + [[nodiscard]] virtual auto Zero() const -> Result = 0; + + /** + * @brief Result value for "One" used in the measurement process. + * + * @return `Result` + */ + [[nodiscard]] virtual auto One() const -> Result = 0; + + /** + * @brief A helper method to print the state vector of a device. + */ + virtual void PrintState() = 0; + + /** + * @brief Prepare subsystems using the given ket vector in the computational basis. + * + * @param state A state vector of size 2**len(wires) + * @param wires The wire(s) the operation acts on + */ + virtual void SetState([[maybe_unused]] DataView, 1> &state, + [[maybe_unused]] std::vector &wires) + { + RT_FAIL("Unsupported functionality"); + } + + /** + * @brief Prepares a single computational basis state. 
+ * + * @param n Prepares the basis state |n>, where n is an array of integers from the set {0, 1} + * @param wires The wire(s) the operation acts on + */ + virtual void SetBasisState([[maybe_unused]] DataView &n, + [[maybe_unused]] std::vector &wires) + { + RT_FAIL("Unsupported functionality"); + } + + /** + * @brief Apply a single gate to the state vector of a device with its name if this is + * supported. + * + * @param name The name of the gate to apply + * @param params Optional parameter list for parametric gates + * @param wires Wires to apply gate to + * @param inverse Indicates whether to use inverse of gate + * @param controlled_wires Optional controlled wires applied to the operation + * @param controlled_values Optional controlled values applied to the operation + */ + virtual void + NamedOperation(const std::string &name, const std::vector ¶ms, + const std::vector &wires, [[maybe_unused]] bool inverse = false, + [[maybe_unused]] const std::vector &controlled_wires = {}, + [[maybe_unused]] const std::vector &controlled_values = {}) = 0; + + /** + * @brief Apply a given matrix directly to the state vector of a device. + * + * @param matrix The matrix of data in row-major format + * @param wires Wires to apply gate to + * @param inverse Indicates whether to use inverse of gate + * @param controlled_wires Controlled wires applied to the operation + * @param controlled_values Controlled values applied to the operation + */ + virtual void + MatrixOperation(const std::vector> &matrix, + const std::vector &wires, [[maybe_unused]] bool inverse = false, + [[maybe_unused]] const std::vector &controlled_wires = {}, + [[maybe_unused]] const std::vector &controlled_values = {}) = 0; + + /** + * @brief Construct a named (Identity, PauliX, PauliY, PauliZ, and Hadamard) + * or Hermitian observable. 
+ * + * @param id The type of the observable + * @param matrix The matrix of data to construct a hermitian observable + * @param wires Wires to apply observable to + * + * @return `ObsIdType` Index of the constructed observable + */ + virtual auto Observable(ObsId id, const std::vector> &matrix, + const std::vector &wires) -> ObsIdType = 0; + + /** + * @brief Construct a tensor product of observables. + * + * @param obs The vector of observables indices of type ObsIdType + * + * @return `ObsIdType` Index of the constructed observable + */ + virtual auto TensorObservable(const std::vector &obs) -> ObsIdType = 0; + + /** + * @brief Construct a Hamiltonian observable. + * + * @param coeffs The vector of coefficients + * @param obs The vector of observables indices of size `coeffs` + * + * @return `ObsIdType` Index of the constructed observable + */ + virtual auto HamiltonianObservable(const std::vector &coeffs, + const std::vector &obs) -> ObsIdType = 0; + + /** + * @brief Compute the expected value of an observable. + * + * @param obsKey The index of the constructed observable + * + * @return `double` The expected value + */ + virtual auto Expval(ObsIdType obsKey) -> double = 0; + + /** + * @brief Compute the variance of an observable. + * + * @param obsKey The index of the constructed observable + * + * @return `double` The variance + */ + virtual auto Var(ObsIdType obsKey) -> double = 0; + + /** + * @brief Get the state-vector of a device. + * + * @param state The pre-allocated `DataView, 1>` + */ + virtual void State(DataView, 1> &state) = 0; + + /** + * @brief Compute the probabilities of each computational basis state. + + * @param probs The pre-allocated `DataView` + */ + virtual void Probs(DataView &probs) = 0; + + /** + * @brief Compute the probabilities for a subset of the full system. 
+ * + * @param probs The pre-allocated `DataView` + * @param wires Wires will restrict probabilities to a subset of the full system + */ + virtual void PartialProbs(DataView &probs, + const std::vector &wires) = 0; + + /** + * @brief Compute samples with the number of shots on all wires, + * returning raw samples. + * + * @param samples The pre-allocated `DataView` representing a matrix of + * shape `shots * numQubits`. The built-in iterator in `DataView` + * iterates over all elements of `samples` row-wise. + * @param shots The number of shots + */ + virtual void Sample(DataView &samples, size_t shots) = 0; + + /** + * @brief Compute partial samples with the number of shots on `wires`, + * returning raw samples. + * + * @param samples The pre-allocated `DataView` representing a matrix of + * shape `shots * numWires`. The built-in iterator in `DataView` + * iterates over all elements of `samples` row-wise. + * @param wires Wires to compute samples on + * @param shots The number of shots + */ + virtual void PartialSample(DataView &samples, const std::vector &wires, + size_t shots) = 0; + + /** + * @brief Sample with the number of shots on all wires, returning the + * number of counts for each sample. + * + * @param eigvals The pre-allocated `DataView` + * @param counts The pre-allocated `DataView` + * @param shots The number of shots + */ + virtual void Counts(DataView &eigvals, DataView &counts, + size_t shots) = 0; + + /** + * @brief Partial sample with the number of shots on `wires`, returning the + * number of counts for each sample. + * + * @param eigvals The pre-allocated `DataView` + * @param counts The pre-allocated `DataView` + * @param wires Wires to compute samples on + * @param shots The number of shots + */ + virtual void PartialCounts(DataView &eigvals, DataView &counts, + const std::vector &wires, size_t shots) = 0; + + /** + * @brief A general measurement method that acts on a single wire.
+ * + * @param wire The wire to compute Measure on + * @param postselect Which basis state to postselect after a mid-circuit measurement (-1 denotes + no post-selection) + + * @return `Result` The measurement result + */ + virtual auto Measure(QubitIdType wire, std::optional postselect) -> Result = 0; + + /** + * @brief Compute the gradient of a quantum tape, that is cached using + * `Catalyst::Runtime::Simulator::CacheManager`, for a specific set of trainable + * parameters. + * + * @param gradients The vector of pre-allocated `DataView*` + * to store gradient results for the list of cached observables. + * @param trainParams The vector of trainable parameters; if none, all parameters + * would be assumed trainable + * + */ + virtual void Gradient(std::vector> &gradients, + const std::vector &trainParams) = 0; +}; +} // namespace Catalyst::Runtime diff --git a/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h b/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h new file mode 100644 index 0000000..b0f63ca --- /dev/null +++ b/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h @@ -0,0 +1,112 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#pragma once +#ifndef RUNTIMECAPI_H +#define RUNTIMECAPI_H + +#include "Types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Quantum Runtime Instructions (`__catalyst__rt__*`) +void __catalyst__rt__fail_cstr(const char *); +void __catalyst__rt__initialize(uint32_t *seed); +void __catalyst__rt__device_init(int8_t *, int8_t *, int8_t *, int64_t shots); +void __catalyst__rt__device_release(); +void __catalyst__rt__finalize(); +void __catalyst__rt__toggle_recorder(bool); +void __catalyst__rt__print_state(); +void __catalyst__rt__print_tensor(OpaqueMemRefT *, bool); +void __catalyst__rt__print_string(char *); +void __catalyst__rt__assert_bool(bool, char *); +int64_t __catalyst__rt__array_get_size_1d(QirArray *); +int8_t *__catalyst__rt__array_get_element_ptr_1d(QirArray *, int64_t); + +QUBIT *__catalyst__rt__qubit_allocate(); +QirArray *__catalyst__rt__qubit_allocate_array(int64_t); +void __catalyst__rt__qubit_release(QUBIT *); +void __catalyst__rt__qubit_release_array(QirArray *); + +int64_t __catalyst__rt__num_qubits(); + +bool __catalyst__rt__result_equal(RESULT *, RESULT *); +RESULT *__catalyst__rt__result_get_one(); +RESULT *__catalyst__rt__result_get_zero(); + +// Quantum Gate Set Instructions (`__catalyst__qis__*`); fixed-arity gates take their +// parameters first, then the target qubit(s) (if any), then a `Modifiers` pointer. +void __catalyst__qis__SetState(MemRefT_CplxT_double_1d *, uint64_t, ...); +void __catalyst__qis__SetBasisState(MemRefT_int8_1d *, uint64_t, ...); +void __catalyst__qis__Identity(QUBIT *, const Modifiers *); +void __catalyst__qis__PauliX(QUBIT *, const Modifiers *); +void __catalyst__qis__PauliY(QUBIT *, const Modifiers *); +void __catalyst__qis__PauliZ(QUBIT *, const Modifiers *); +void __catalyst__qis__Hadamard(QUBIT *, const Modifiers *); +void __catalyst__qis__S(QUBIT *, const Modifiers *); +void __catalyst__qis__T(QUBIT *, const Modifiers *); +void __catalyst__qis__PhaseShift(double, QUBIT *, const Modifiers *); +void __catalyst__qis__RX(double, QUBIT *, const Modifiers *); +void __catalyst__qis__RY(double, QUBIT *, const Modifiers *); +void __catalyst__qis__RZ(double, QUBIT *,
const Modifiers *); +void __catalyst__qis__Rot(double, double, double, QUBIT *, const Modifiers *); +void __catalyst__qis__CNOT(QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__CY(QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__CZ(QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__SWAP(QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__IsingXX(double, QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__IsingYY(double, QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__IsingXY(double, QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__IsingZZ(double, QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__ControlledPhaseShift(double, QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__CRX(double, QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__CRY(double, QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__CRZ(double, QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__CRot(double, double, double, QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__CSWAP(QUBIT *, QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__Toffoli(QUBIT *, QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__MultiRZ(double, const Modifiers *, int64_t, /*qubits*/...); +void __catalyst__qis__GlobalPhase(double, const Modifiers *); +void __catalyst__qis__ISWAP(QUBIT *, QUBIT *, const Modifiers *); +void __catalyst__qis__PSWAP(double, QUBIT *, QUBIT *, const Modifiers *); + +// Struct pointer arguments for these instructions represent real arguments, +// as passing structs by value is too unreliable / compiler dependent.
+void __catalyst__qis__QubitUnitary(MemRefT_CplxT_double_2d *, const Modifiers *, int64_t, + /*qubits*/...); + +ObsIdType __catalyst__qis__NamedObs(int64_t, QUBIT *); +ObsIdType __catalyst__qis__HermitianObs(MemRefT_CplxT_double_2d *, int64_t, /*qubits*/...); +ObsIdType __catalyst__qis__TensorObs(int64_t, /*obsKeys*/...); +ObsIdType __catalyst__qis__HamiltonianObs(MemRefT_double_1d *, int64_t, /*obsKeys*/...); + +// Struct pointer arguments here represent return values. +RESULT *__catalyst__qis__Measure(QUBIT *, int32_t); +double __catalyst__qis__Expval(ObsIdType); +double __catalyst__qis__Variance(ObsIdType); +void __catalyst__qis__Probs(MemRefT_double_1d *, int64_t, /*qubits*/...); +void __catalyst__qis__Sample(MemRefT_double_2d *, int64_t, /*qubits*/...); +void __catalyst__qis__Counts(PairT_MemRefT_double_int64_1d *, int64_t, /*qubits*/...); +void __catalyst__qis__State(MemRefT_CplxT_double_1d *, int64_t, /*qubits*/...); +void __catalyst__qis__Gradient(int64_t, /*results*/...); +void __catalyst__qis__Gradient_params(MemRefT_int64_1d *, int64_t, /*results*/...); + +void __catalyst__host__rt__unrecoverable_error(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/src/qirlightning/catalyst_runtime/include/Types.h b/src/qirlightning/catalyst_runtime/include/Types.h new file mode 100644 index 0000000..a30a1c2 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/include/Types.h @@ -0,0 +1,165 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifndef TYPES_H +#define TYPES_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Qubit, Result and Observable types +struct QUBIT; +using QubitIdType = intptr_t; + +using RESULT = bool; +using Result = RESULT *; +using QirArray = void *; + +using ObsIdType = intptr_t; + +enum ObsId : int8_t { + Identity = 0, + PauliX, + PauliY, + PauliZ, + Hadamard, + Hermitian, +}; + +enum ObsType : int8_t { + Basic = 0, + TensorProd, + Hamiltonian, +}; + +// single-precision complex type (real, imag) +struct CplxT_float { + float real; + float imag; +}; + +// double-precision complex type (real, imag) +struct CplxT_double { + double real; + double imag; +}; + +enum NumericType : int8_t { + idx = 0, + i1, + i8, + i16, + i32, + i64, + f32, + f64, + c64, + c128, +}; + +// Type-erased MemRefT: rank, opaque descriptor pointer and element type tag +struct OpaqueMemRefT { + int64_t rank; + void *descriptor; + NumericType datatype; +}; + +// MemRefT of CplxT_double, dimension=1 type +struct MemRefT_CplxT_double_1d { + CplxT_double *data_allocated; + CplxT_double *data_aligned; + size_t offset; + size_t sizes[1]; + size_t strides[1]; +}; + +// MemRefT of CplxT_double, dimension=2 type +struct MemRefT_CplxT_double_2d { + CplxT_double *data_allocated; + CplxT_double *data_aligned; + size_t offset; + size_t sizes[2]; + size_t strides[2]; +}; + +// MemRefT of double, dimension=1 type +struct MemRefT_double_1d { + double *data_allocated; + double *data_aligned; + size_t offset; + size_t sizes[1]; + size_t strides[1]; +}; + +// MemRefT of double, dimension=2 type +struct MemRefT_double_2d { + double *data_allocated; + double *data_aligned; + size_t offset; + size_t sizes[2]; + size_t strides[2]; +}; + +// MemRefT of int64_t, dimension=1 type +struct MemRefT_int64_1d { + int64_t *data_allocated; + int64_t *data_aligned; + size_t offset; + size_t sizes[1]; + size_t strides[1]; +}; + +// MemRefT of int8_t, dimension=1 type +struct MemRefT_int8_1d { + int8_t *data_allocated; + int8_t *data_aligned; + size_t offset; + size_t sizes[1]; + size_t strides[1]; +}; + +// PairT of (MemRefT of double, MemRefT of int64_t), dimension=1
type +struct PairT_MemRefT_double_int64_1d { + struct MemRefT_double_1d first; + struct MemRefT_int64_1d second; +}; + +// Quantum operation modifiers: adjoint flag, number of control wires, control wires and control values +struct Modifiers { + bool adjoint; + size_t num_controlled; + QUBIT *controlled_wires; + bool *controlled_values; +}; + +using CplxT_double = struct CplxT_double; +using MemRefT_CplxT_double_1d = struct MemRefT_CplxT_double_1d; +using MemRefT_CplxT_double_2d = struct MemRefT_CplxT_double_2d; +using MemRefT_double_1d = struct MemRefT_double_1d; +using MemRefT_double_2d = struct MemRefT_double_2d; +using MemRefT_int64_1d = struct MemRefT_int64_1d; +using PairT_MemRefT_double_int64_1d = struct PairT_MemRefT_double_int64_1d; +using Modifiers = struct Modifiers; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt new file mode 100644 index 0000000..50fd0b0 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(capi) +add_subdirectory(backend) +add_subdirectory(registry) diff --git a/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt new file mode 100644 index 0000000..45b7ad7 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt @@ -0,0 +1,7 @@ +add_subdirectory(null_qubit) +configure_file(null_qubit/null_qubit.toml null_qubit.toml) +if(ENABLE_OPENQASM) +add_subdirectory(openqasm) +configure_file(openqasm/braket_local_qubit.toml braket_local_qubit.toml) +configure_file(openqasm/braket_aws_qubit.toml braket_aws_qubit.toml) +endif() diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp new file mode 100644 index 0000000..0141f33 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp @@ -0,0 +1,199 @@ +// Copyright
2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "Types.h" +#include "Utils.hpp" + +namespace Catalyst::Runtime { +/** + * @brief The CacheManager caches all operations and observables of + * a program at runtime. + * + * One direct use case of this functionality is computing the gradient + * of a circuit by taking advantage of the gradient methods provided by + * simulators.
+ */ +template > class CacheManager { + protected: + // Operations Data (parallel vectors: `addOperation` appends one entry to each) + std::vector ops_names_{}; + std::vector> ops_params_{}; + std::vector> ops_wires_{}; + std::vector ops_inverses_{}; + std::vector> ops_matrixs_{}; + std::vector> ops_controlled_wires_{}; + std::vector> ops_controlled_values_{}; + + // Observables Data (parallel vectors: `addObservable` appends one entry to each) + std::vector obs_keys_{}; + std::vector obs_callees_{}; + + // Total number of parameters over all cached operations + size_t num_params_{0}; + + public: + CacheManager() = default; + ~CacheManager() = default; + + CacheManager(const CacheManager &) = delete; + CacheManager &operator=(const CacheManager &) = delete; + CacheManager(CacheManager &&) = delete; + CacheManager &operator=(CacheManager &&) = delete; + + /** + * @brief Clear all cached operations and observables and reset the parameter count to zero. + */ + void Reset() + { + ops_names_.clear(); + ops_params_.clear(); + ops_wires_.clear(); + ops_inverses_.clear(); + ops_matrixs_.clear(); + ops_controlled_wires_.clear(); + ops_controlled_values_.clear(); + + obs_keys_.clear(); + obs_callees_.clear(); + + num_params_ = 0; + } + + /** + * @brief Add a new operation to the list of cached gates.
+ * + * @param name Name of the given gate + * @param params Parameters of the gate (their count is added to the total parameter count) + * @param wires Wires the gate acts on + * @param inverse If true, inverse of the gate is applied + * @param matrix Unitary matrix for the 'MatrixOp' operations + * @param controlled_wires Control wires + * @param controlled_values Control values + */ + void addOperation(const std::string &name, const std::vector ¶ms, + const std::vector &wires, bool inverse, + const std::vector &matrix = {}, + const std::vector &controlled_wires = {}, + const std::vector &controlled_values = {}) + { + ops_names_.push_back(name); + ops_params_.push_back(params); + ops_wires_.push_back(wires); + ops_inverses_.push_back(inverse); + ops_matrixs_.push_back(matrix); + ops_controlled_wires_.push_back(controlled_wires); + ops_controlled_values_.push_back(controlled_values); + + num_params_ += params.size(); + } + + /** + * @brief Add a new observable to the list of cached observables. + * + * @param id The observable key created by LObsManager() + * @param callee The measurement operation (defaults to `MeasurementsT::None`) + */ + void addObservable(const ObsIdType id, const MeasurementsT &callee = MeasurementsT::None) + { + obs_keys_.push_back(id); + obs_callees_.push_back(callee); + } + + /** + * @brief Get a const reference to the cached observable keys. + */ + auto getObservablesKeys() -> const std::vector & { return obs_keys_; } + + /** + * @brief Get a const reference to the cached observable callees. + */ + auto getObservablesCallees() -> const std::vector & { return obs_callees_; } + + /** + * @brief Get a const reference to the cached operation names. + */ + auto getOperationsNames() -> const std::vector & { return ops_names_; } + + /** + * @brief Get a const reference to the cached operation parameters. + */ + auto getOperationsParameters() -> const std::vector> & + { + return ops_params_; + } + + /** + * @brief Get a const reference to the cached operation wires. + */ + auto getOperationsWires() -> const std::vector> & { return ops_wires_; } + + /** + * @brief Get a const reference to the cached operation controlled wires.
+ */ + auto getOperationsControlledWires() -> const std::vector> & + { + return this->ops_controlled_wires_; + } + + /** + * @brief Get a const reference to the cached operation controlled values. + */ + auto getOperationsControlledValues() -> const std::vector> & + { + return this->ops_controlled_values_; + } + + /** + * @brief Get a const reference to the cached operation inverse flags. + */ + auto getOperationsInverses() -> const std::vector & { return ops_inverses_; } + + /** + * @brief Get a const reference to the cached operation matrices. + */ + auto getOperationsMatrices() -> const std::vector> & + { + return ops_matrixs_; + } + + /** + * @brief Get total number of cached entries: operations plus observables. + */ + [[nodiscard]] auto getNumGates() const -> size_t + { + return ops_names_.size() + obs_keys_.size(); + } + + /** + * @brief Get number of operations. + */ + [[nodiscard]] auto getNumOperations() const -> size_t { return ops_names_.size(); } + + /** + * @brief Get number of observables. + */ + [[nodiscard]] auto getNumObservables() const -> size_t { return obs_keys_.size(); } + + /** + * @brief Get total number of parameters over all cached operations. + */ + [[nodiscard]] auto getNumParams() const -> size_t { return num_params_; } +}; +} // namespace Catalyst::Runtime diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp new file mode 100644 index 0000000..05dc377 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp @@ -0,0 +1,146 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "Exception.hpp" +#include "Types.h" +#include "Utils.hpp" + +namespace Catalyst::Runtime { + +/** + * Qubit Manager + * + * @brief Maintains the mapping of qubit IDs between the runtime and the device + * (e.g., Lightning-Dynamic). When the user allocates a qubit, the + * `QubitManager` adds the qubit as an active qubit that operations + * can act on. When the user releases a qubit, the `QubitManager` removes + * that qubit from the list of active wires. + */ +template +class QubitManager { + private: + using LQMapT = std::map; + + SimQubitIdType next_idx{0}; // next simulator id handed out by Allocate/AllocateRange + LQMapT qubits_map{}; // simulator id -> device id + + // Erase the entry for `s_idx` (fails if it is absent) and return the iterator that `erase` yields. + template + [[nodiscard]] inline OIter _remove_simulator_qubit_id(SimQubitIdType s_idx) + { + const auto &&s_idx_iter = this->qubits_map.find(s_idx); + RT_FAIL_IF(s_idx_iter == this->qubits_map.end(), "Invalid simulator qubit index"); + + return this->qubits_map.erase(s_idx_iter); + } + + // Decrement the device id of every entry from `s_idx_iter` to the end of the map. + template + inline void _update_qubits_mapfrom(IIter s_idx_iter) + { + for (; s_idx_iter != this->qubits_map.end(); s_idx_iter++) { + s_idx_iter->second--; + } + } + + public: + QubitManager() = default; + ~QubitManager() = default; + + QubitManager(const QubitManager &) = delete; + QubitManager &operator=(const QubitManager &) = delete; + QubitManager(QubitManager &&) = delete; + QubitManager &operator=(QubitManager &&) = delete; + + // True if `s_idx` is currently a key of the map (i.e. an active simulator id). + [[nodiscard]] auto isValidQubitId(SimQubitIdType s_idx) -> bool + { + return this->qubits_map.contains(s_idx); + } + + // True if every id in `ss_idx` is an active simulator id (also true for an empty vector). + [[nodiscard]] auto isValidQubitId(const std::vector &ss_idx) -> bool + { + return
std::all_of(ss_idx.begin(), ss_idx.end(), + [this](SimQubitIdType s) { return isValidQubitId(s); }); + } + + // Return the keys (simulator ids) of all entries, in map iteration order. + [[nodiscard]] auto getAllQubitIds() -> std::vector + { + std::vector ids; + ids.reserve(this->qubits_map.size()); + for (const auto &it : this->qubits_map) { + ids.push_back(it.first); + } + + return ids; + } + + // Translate a simulator id into its device id; fails if `s_idx` is not active. + // NOTE(review): the failure message says "device" although the id checked is a simulator id. + [[nodiscard]] auto getDeviceId(SimQubitIdType s_idx) -> DevQubitIdType + { + RT_FAIL_IF(!isValidQubitId(s_idx), "Invalid device qubit index"); + + return this->qubits_map[s_idx]; + } + + // Element-wise `getDeviceId`; fails on the first inactive id. + auto getDeviceIds(const std::vector &ss_idx) -> std::vector + { + std::vector dd_idx; + dd_idx.reserve(ss_idx.size()); + for (const auto &s : ss_idx) { + dd_idx.push_back(getDeviceId(s)); + } + return dd_idx; + } + + // Reverse lookup: linear search for the first entry mapped to `d_idx`; fails if there is none. + // NOTE(review): the failure message says "simulator" although the id not found is a device id. + [[nodiscard]] auto getSimulatorId(DevQubitIdType d_idx) -> SimQubitIdType + { + auto s_idx = std::find_if(this->qubits_map.begin(), this->qubits_map.end(), + [&d_idx](auto &&p) { return p.second == d_idx; }); + + RT_FAIL_IF(s_idx == this->qubits_map.end(), "Invalid simulator qubit index"); + + return s_idx->first; + } + + // Map a fresh simulator id to `d_next_idx` and return that simulator id. + [[nodiscard]] auto Allocate(DevQubitIdType d_next_idx) -> SimQubitIdType + { + this->qubits_map[this->next_idx++] = d_next_idx; + return this->next_idx - 1; + } + + // Map `size` fresh simulator ids to the device ids start_idx, start_idx + 1, ...; returns the new simulator ids. + auto AllocateRange(DevQubitIdType start_idx, size_t size) -> std::vector + { + std::vector ids; + ids.reserve(size); + for (DevQubitIdType i = start_idx; i < start_idx + size; i++) { + ids.push_back(this->next_idx); + this->qubits_map[this->next_idx++] = i; + } + return ids; + } + + // Remove `s_idx` (fails if inactive) and decrement the device id of every entry after it in the map. + void Release(SimQubitIdType s_idx) + { + _update_qubits_mapfrom(_remove_simulator_qubit_id(s_idx)); + } + + void ReleaseAll() + { + // Release all qubits by clearing the map (`next_idx` is left unchanged).
+ this->qubits_map.clear(); + } +}; +} // namespace Catalyst::Runtime diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp new file mode 100644 index 0000000..0527ac4 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp @@ -0,0 +1,304 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Exception.hpp" +#include "Types.h" + +// Deletes the copy/move constructors and copy/move assignment operators of CLASSNAME. +#define QUANTUM_DEVICE_DEL_DECLARATIONS(CLASSNAME) \ + CLASSNAME(const CLASSNAME &) = delete; \ + CLASSNAME &operator=(const CLASSNAME &) = delete; \ + CLASSNAME(CLASSNAME &&) = delete; \ + CLASSNAME &operator=(CLASSNAME &&) = delete; + +// Declares `override`s for qubit allocation/release, tape recording, shots, state printing and Zero/One. +#define QUANTUM_DEVICE_RT_DECLARATIONS \ + auto AllocateQubit()->QubitIdType override; \ + auto AllocateQubits(size_t num_qubits)->std::vector override; \ + void ReleaseQubit(QubitIdType q) override; \ + void ReleaseAllQubits() override; \ + [[nodiscard]] auto GetNumQubits() const->size_t override; \ + void StartTapeRecording() override; \ + void StopTapeRecording() override; \ + void SetDeviceShots(size_t shots) override; \ + [[nodiscard]] auto GetDeviceShots() const->size_t override; \ + void PrintState() override; \ + [[nodiscard]] auto Zero() const->Result override; \ + [[nodiscard]] auto One() const->Result
override; + +// Declares `override`s for the operation, observable, measurement and gradient methods of +// `Catalyst::Runtime::QuantumDevice`. +#define QUANTUM_DEVICE_QIS_DECLARATIONS \ + void NamedOperation( \ + const std::string &name, const std::vector ¶ms, \ + const std::vector &wires, [[maybe_unused]] bool inverse = false, \ + [[maybe_unused]] const std::vector &controlled_wires = {}, \ + [[maybe_unused]] const std::vector &controlled_values = {}) override; \ + using Catalyst::Runtime::QuantumDevice::MatrixOperation; \ + void MatrixOperation( \ + const std::vector> &matrix, const std::vector &wires, \ + [[maybe_unused]] bool inverse = false, \ + [[maybe_unused]] const std::vector &controlled_wires = {}, \ + [[maybe_unused]] const std::vector &controlled_values = {}) override; \ + auto Observable(ObsId id, const std::vector> &matrix, \ + const std::vector &wires) \ + ->ObsIdType override; \ + auto TensorObservable(const std::vector &obs)->ObsIdType override; \ + auto HamiltonianObservable(const std::vector &coeffs, \ + const std::vector &obs) \ + ->ObsIdType override; \ + auto Expval(ObsIdType obsKey)->double override; \ + auto Var(ObsIdType obsKey)->double override; \ + void State(DataView, 1> &state) override; \ + void Probs(DataView &probs) override; \ + void PartialProbs(DataView &probs, const std::vector &wires) override; \ + void Sample(DataView &samples, size_t shots) override; \ + void PartialSample(DataView &samples, const std::vector &wires, \ + size_t shots) override; \ + void Counts(DataView &eigvals, DataView &counts, size_t shots) \ + override; \ + void PartialCounts(DataView &eigvals, DataView &counts, \ + const std::vector &wires, size_t shots) override; \ + auto Measure(QubitIdType wire, std::optional postselect = std::nullopt) \ + ->Result override; \ + void Gradient(std::vector> &gradients, \ + const std::vector &trainParams) override; + +namespace Catalyst::Runtime { +/** + * @brief Parse a Python-dict-like string of `key: value` pairs into a map. + * + * If the key 's3_destination_folder' (in single quotes) is present, its value is taken verbatim + * as the text from the next '(' through the following ')'. The characters '{', '}', space and + * single quote are stripped from the text before that key, and everything from that key onwards + * is discarded. The remaining text is split on ',' into tokens and each token on its first ':' + * into key and value. + * + * @param kwargs The string to parse (taken by value and modified locally) + * @return The key/value map; empty if `kwargs` is empty + */ +static inline auto parse_kwargs(std::string kwargs) -> std::unordered_map +{ + // empty input: nothing to parse + if (kwargs.empty()) { + return {}; + } + + std::unordered_map map; + size_t s3_pos =
kwargs.find("\'s3_destination_folder\'"); + if (s3_pos != std::string::npos) { + auto opening_pos = kwargs.find('(', s3_pos); + RT_ASSERT(opening_pos != std::string::npos); + auto closing_pos = kwargs.find(')', opening_pos); + RT_ASSERT(closing_pos != std::string::npos); + map["s3_destination_folder"] = kwargs.substr(opening_pos, closing_pos - opening_pos + 1); + } + + auto kwargs_end_iter = (s3_pos == std::string::npos) ? kwargs.end() : kwargs.begin() + s3_pos; + + // cleaning kwargs: strip braces, spaces and quotes before the s3 key; erasing up to + // kwargs.end() also drops the text from the s3 key onwards (already captured above) + kwargs.erase(std::remove_if(kwargs.begin(), kwargs_end_iter, + [](char c) { + switch (c) { + case '{': + case '}': + case ' ': + case '\'': + return true; + default: + return false; + } + }), + kwargs.end()); + + // constructing map: tokens are separated by ',', key and value by the first ':' + std::istringstream iss(kwargs); + std::string token; + while (std::getline(iss, token, ',')) { + std::istringstream issp(token); + std::string pair[2]; + std::getline(issp, pair[0], ':'); + std::getline(issp, pair[1]); + map[pair[0]] = pair[1]; + } + + return map; +} + +enum class MeasurementsT : uint8_t { + None, // = 0 + Expval, + Var, + Probs, + State, +}; + +} // namespace Catalyst::Runtime + +namespace Catalyst::Runtime::Simulator::Lightning { +enum class SimulatorGate : uint8_t { + // 1-qubit + Identity, // = 0 + PauliX, + PauliY, + PauliZ, + Hadamard, + S, + T, + PhaseShift, + RX, + RY, + RZ, + Rot, + // 2-qubit + CNOT, + CY, + CZ, + SWAP, + ISWAP, + PSWAP, + IsingXX, + IsingYY, + IsingXY, + IsingZZ, + ControlledPhaseShift, + CRX, + CRY, + CRZ, + CRot, + // 3-qubit + CSWAP, + Toffoli, + // n-qubit + MultiRZ, +}; + +constexpr std::array simulator_observable_support = { + // ObsId, ObsName, SimulatorSupport + std::tuple{ObsId::Identity, "Identity", true}, + std::tuple{ObsId::PauliX, "PauliX", true}, + std::tuple{ObsId::PauliY, "PauliY", true}, + std::tuple{ObsId::PauliZ, "PauliZ", true}, + std::tuple{ObsId::Hadamard, "Hadamard", true}, +}; + +// (SimulatorGate, gate name, number of wires, number of parameters) +using GateInfoTupleT = std::tuple; + +constexpr std::array simulator_gate_info = { + // 1-qubit +
GateInfoTupleT{SimulatorGate::Identity, "Identity", 1, 0}, + GateInfoTupleT{SimulatorGate::PauliX, "PauliX", 1, 0}, + GateInfoTupleT{SimulatorGate::PauliY, "PauliY", 1, 0}, + GateInfoTupleT{SimulatorGate::PauliZ, "PauliZ", 1, 0}, + GateInfoTupleT{SimulatorGate::Hadamard, "Hadamard", 1, 0}, + GateInfoTupleT{SimulatorGate::S, "S", 1, 0}, + GateInfoTupleT{SimulatorGate::T, "T", 1, 0}, + GateInfoTupleT{SimulatorGate::PhaseShift, "PhaseShift", 1, 1}, + GateInfoTupleT{SimulatorGate::RX, "RX", 1, 1}, + GateInfoTupleT{SimulatorGate::RY, "RY", 1, 1}, + GateInfoTupleT{SimulatorGate::RZ, "RZ", 1, 1}, + GateInfoTupleT{SimulatorGate::Rot, "Rot", 1, 3}, + // 2-qubit + GateInfoTupleT{SimulatorGate::CNOT, "CNOT", 2, 0}, + GateInfoTupleT{SimulatorGate::CY, "CY", 2, 0}, + GateInfoTupleT{SimulatorGate::CZ, "CZ", 2, 0}, + GateInfoTupleT{SimulatorGate::SWAP, "SWAP", 2, 0}, + GateInfoTupleT{SimulatorGate::ISWAP, "ISWAP", 2, 0}, + GateInfoTupleT{SimulatorGate::PSWAP, "PSWAP", 2, 1}, + GateInfoTupleT{SimulatorGate::IsingXX, "IsingXX", 2, 1}, + GateInfoTupleT{SimulatorGate::IsingYY, "IsingYY", 2, 1}, + GateInfoTupleT{SimulatorGate::IsingXY, "IsingXY", 2, 1}, + GateInfoTupleT{SimulatorGate::IsingZZ, "IsingZZ", 2, 1}, + GateInfoTupleT{SimulatorGate::ControlledPhaseShift, "ControlledPhaseShift", 2, 1}, + GateInfoTupleT{SimulatorGate::CRX, "CRX", 2, 1}, + GateInfoTupleT{SimulatorGate::CRY, "CRY", 2, 1}, + GateInfoTupleT{SimulatorGate::CRZ, "CRZ", 2, 1}, + GateInfoTupleT{SimulatorGate::CRot, "CRot", 2, 3}, + // 3-qubit + GateInfoTupleT{SimulatorGate::CSWAP, "CSWAP", 3, 0}, + GateInfoTupleT{SimulatorGate::Toffoli, "Toffoli", 3, 0}, + // n-qubit (wire count listed as 0; presumably meaning "variable" - TODO confirm) + GateInfoTupleT{SimulatorGate::MultiRZ, "MultiRZ", 0, 1}, +}; + +constexpr size_t simulator_gate_info_size = simulator_gate_info.size(); +constexpr size_t simulator_observable_support_size = simulator_observable_support.size(); + +template +using SimulatorGateInfoDataT = std::array; + +/** + * @brief Return the name of observable `key` if `arr` lists it and flags it as supported. + * + * @throws std::range_error if no supported entry with that id exists + */ +template +constexpr auto lookup_obs(const std::array, size> &arr, +
const ObsId key) -> std::string_view +{ + for (size_t idx = 0; idx < size; idx++) { + auto &&[op_id, op_str, op_support] = arr[idx]; + if (op_id == key && op_support) { + return op_str; + } + } + throw std::range_error("The given observable is not supported by the simulator"); +} + +/** + * @brief Look up gate `key` by name in `arr` and return its (number of wires, number of parameters). + * + * @throws std::range_error if no entry has that name + */ +template +constexpr auto lookup_gates(const SimulatorGateInfoDataT &arr, const std::string &key) + -> std::pair +{ + for (size_t idx = 0; idx < size; idx++) { + auto &&[op, op_str, op_num_wires, op_num_params] = arr[idx]; + if (op_str == key) { + return std::make_pair(op_num_wires, op_num_params); + } + } + throw std::range_error("The given operation is not supported by the simulator"); +} + +/** + * @brief Return true if `arr` contains an entry whose name equals `key`. + */ +template +constexpr auto has_gate(const SimulatorGateInfoDataT &arr, const std::string &key) -> bool +{ + for (size_t idx = 0; idx < size; idx++) { + if (std::get<1>(arr[idx]) == key) { + return true; + } + } + return false; +} + +/** + * @brief Draw a binary outcome from a two-outcome probability vector. + * + * With `postselect` set, the value is validated (it must be 0 or 1 and `probs[postselect]` must + * be non-zero) and returned as the outcome without drawing. Otherwise a number is drawn from a + * uniform distribution over 0 to 1 and the outcome is true when it exceeds `probs[0]`. + * + * @param probs Probabilities indexed by outcome; only indices 0 and 1 are read here + * @param postselect Optional outcome to force + * @param gen Optional generator; when null, a `std::mt19937` seeded from + * `std::random_device` is created for this call + * @return The drawn (or post-selected) outcome + */ +static inline auto +simulateDraw(const std::vector &probs, std::optional postselect, + std::mt19937 *gen = nullptr) // NOLINT(readability-non-const-parameter) + -> bool +{ + if (postselect) { + auto postselect_value = postselect.value(); + RT_FAIL_IF(postselect_value < 0 || postselect_value > 1, "Invalid postselect value"); + RT_FAIL_IF(probs[postselect_value] == 0, "Probability of postselect value is 0"); + return static_cast(postselect_value == 1); + } + + // Normal flow, no post-selection + // Draw a number according to the given distribution + std::uniform_real_distribution<> dis(0., 1.); + + float draw; // NOTE(review): `dis` yields its default result type (double); narrowed to float here + if (gen != nullptr) { + draw = dis(*gen); + (*gen)(); // advances the caller's generator one extra step after the draw + } + else { + std::random_device rd; + std::mt19937 gen_no_seed(rd()); + draw = dis(gen_no_seed); + } + + return draw > probs[0]; +} + +} // namespace Catalyst::Runtime::Simulator::Lightning diff --git a/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt new file mode 100644 index 0000000..e05e9bf --- /dev/null +++
b/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt @@ -0,0 +1,57 @@ +################################## +# Object Lib catalyst_qir_qis_obj +################################## + +add_library(catalyst_qir_qis_obj OBJECT RuntimeCAPI.cpp) + +# include external MLIR runner utils +FetchContent_MakeAvailable(MLIRRunnerUtils) +FetchContent_MakeAvailable(MLIRCRunnerUtils) +FetchContent_MakeAvailable(MLIRFloat16Bits) + +# link to rt_backend +target_link_libraries(catalyst_qir_qis_obj ${CMAKE_DL_LIBS}) + +target_link_libraries(catalyst_qir_qis_obj + pthread + dl +) + +target_include_directories(catalyst_qir_qis_obj PUBLIC . + ${CMAKE_CURRENT_SOURCE_DIR} + ${runtime_includes} + ${mlirrunnerutils_SOURCE_DIR}/../.. # includes are relative to mlir/ExecutionEngine + ${PROJECT_SOURCE_DIR}/../mlir/lib/Driver # Timer.hpp +) + +# The MLIR Runner Utils raises this warning so we need to disable it for our -Werror builds. +if(RUNTIME_ENABLE_WARNINGS) + target_compile_options(catalyst_qir_qis_obj PRIVATE "-Wno-unused-parameter") +endif() + +set_property(TARGET catalyst_qir_qis_obj PROPERTY POSITION_INDEPENDENT_CODE ON) + +##################### +# Shared Lib rt_capi +##################### + +add_library(rt_capi SHARED) + +target_link_libraries(rt_capi ${CMAKE_DL_LIBS} catalyst_qir_qis_obj) +add_dependencies(rt_capi catalyst_callback_registry) + + +target_include_directories(rt_capi PUBLIC . 
+ ${CMAKE_CURRENT_SOURCE_DIR} + ${runtime_includes} + ${capi_utils_includes} +) + +set_property(TARGET rt_capi PROPERTY POSITION_INDEPENDENT_CODE ON) +set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH "$") + +if(NOT APPLE) + set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH $ORIGIN) +else() + set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH @loader_path) +endif() diff --git a/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp b/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp new file mode 100644 index 0000000..9abe8cb --- /dev/null +++ b/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp @@ -0,0 +1,367 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Exception.hpp" +#include "QuantumDevice.hpp" +#include "Types.h" + +extern void callbackCall(int64_t, int64_t, int64_t, va_list); + +namespace Catalyst::Runtime { + +extern "C" void __catalyst_inactive_callback(int64_t identifier, int64_t argc, int64_t retc, ...); + +class MemoryManager // NOLINT(cppcoreguidelines-special-member-functions, + // hicpp-special-member-functions) + final { + private: + std::unordered_set _impl; + std::mutex mu; // To guard the memory manager + + public: + explicit MemoryManager() { _impl.reserve(1024); }; + + ~MemoryManager() + { + // Lock the mutex to protect _impl free + std::lock_guard lock(mu); + for (auto *allocation : _impl) { + free(allocation); // NOLINT(cppcoreguidelines-no-malloc, hicpp-no-malloc) + } + } + + void insert(void *ptr) + { + // Lock the mutex to protect _impl update + std::lock_guard lock(mu); + _impl.insert(ptr); + } + void erase(void *ptr) + { + // Lock the mutex to protect _impl update + std::lock_guard lock(mu); + _impl.erase(ptr); + } + bool contains(void *ptr) + { + // Lock the mutex to protect _impl update + std::lock_guard lock(mu); + return _impl.contains(ptr); + } +}; + +class SharedLibraryManager final { + private: + void *_handler{nullptr}; + + public: + SharedLibraryManager() = delete; + explicit SharedLibraryManager(const std::string &filename) + { +#ifdef __APPLE__ + auto rtld_flags = RTLD_LAZY; +#else + // Closing the dynamic library of Lightning simulators with dlclose() where OpenMP + // directives (in Lightning simulators) are in use would raise memory segfaults. + // Note that we use RTLD_NODELETE as a workaround to fix the issue. 
+ auto rtld_flags = RTLD_LAZY | RTLD_NODELETE; +#endif + + _handler = dlopen(filename.c_str(), rtld_flags); + RT_FAIL_IF(!_handler, dlerror()); + } + + ~SharedLibraryManager() + { + // dlopen and dlclose increment and decrement reference counters. + // Since we have a guaranteed _handler in a valid SharedLibraryManager instance + // then we don't really need to worry about dlclose. + // In other words, there is an one to one correspondence between an instance + // of SharedLibraryManager and an increase in the reference count for the dynamic library. + // dlclose returns non-zero on error. + // + // Errors in dlclose are implementation dependent. + // There are two possible errors during dlclose in glibc: "shared object not open" + // and "cannot create scope list". Look for _dl_signal_error in: + // + // https://codebrowser.dev/glibc/glibc/elf/dl-close.c.html + // + // This means that at the very least, one could trigger an error in the following line by + // doing the following: dlopen the same library and closing it multiple times in a different + // location. + // + // This would mean that the reference count would be less than the number of instances + // of SharedLibraryManager. + // + // There really is no way to protect against this error, except to always use + // SharedLibraryManager to manage shared libraries. + // + // Exercise for the reader, how could one trigger the "cannot create scope list" error? 
+ dlclose(_handler); + } + + SharedLibraryManager(const SharedLibraryManager &other) = delete; + SharedLibraryManager &operator=(const SharedLibraryManager &other) = delete; + SharedLibraryManager(SharedLibraryManager &&other) = delete; + SharedLibraryManager &operator=(SharedLibraryManager &&other) = delete; + + void *getSymbol(const std::string &symbol) + { + void *sym = dlsym(_handler, symbol.c_str()); + RT_FAIL_IF(!sym, dlerror()); + return sym; + } +}; + +/** + * This indicates the various stages a device can be in: + * - `Active` : The device is added to the device pool and the `ExecutionContext` device pointer + * (`RTD_PTR`) points to this device instance. The CAPI routines have only access to + * one single active device per thread via `RTD_PTR`. + * - `Inactive` : The device is deactivated meaning `RTD_PTR` does not point to this device. + * The device is not removed from the pool, allowing the `ExecutionContext` manager + * to reuse this device in a multi-qnode workflow when another device with identical + * specifications is requested. + */ +enum class RTDeviceStatus : uint8_t { + Active = 0, + Inactive, +}; + +extern "C" Catalyst::Runtime::QuantumDevice *GenericDeviceFactory(const char *kwargs); + +/** + * Runtime Device data-class. + * + * This class introduces an interface for constructed devices by the `ExecutionContext` + * manager. This includes the device name, library, kwargs, and a shared pointer to the + * `QuantumDevice` entry point. 
+ */ +class RTDevice { + private: + std::string rtd_lib; + std::string rtd_name; + std::string rtd_kwargs; + + std::unique_ptr rtd_dylib{nullptr}; + std::unique_ptr rtd_qdevice{nullptr}; + + RTDeviceStatus status{RTDeviceStatus::Inactive}; + + static void _complete_dylib_os_extension(std::string &rtd_lib, const std::string &name) noexcept + { +#ifdef __linux__ + rtd_lib = "librtd_" + name + ".so"; +#elif defined(__APPLE__) + rtd_lib = "librtd_" + name + ".dylib"; +#endif + } + + static void _pl2runtime_device_info(std::string &rtd_lib, std::string &rtd_name) noexcept + { + // The following if-elif is required for C++ tests where these backend devices + // are linked in the interface library of the runtime. (check runtime/CMakeLists.txt) + // Besides, this provides support for runtime device (RTD) libraries added to the system + // path. This maintains backward compatibility for specifying a device using its name. + // TODO: This support may need to be removed after updating the C++ unit tests. 
+ if (rtd_lib == "null.qubit") { + rtd_name = "NullQubit"; + _complete_dylib_os_extension(rtd_lib, "null_qubit"); + } + else if (rtd_lib == "lightning.qubit") { + rtd_name = "LightningSimulator"; + _complete_dylib_os_extension(rtd_lib, "lightning"); + } + else if (rtd_lib == "braket.aws.qubit" || rtd_lib == "braket.local.qubit") { + rtd_name = "OpenQasmDevice"; + _complete_dylib_os_extension(rtd_lib, "openqasm"); + } + } + + public: + explicit RTDevice(std::string _rtd_lib, std::string _rtd_name = {}, + std::string _rtd_kwargs = {}) + : rtd_lib(std::move(_rtd_lib)), rtd_name(std::move(_rtd_name)), + rtd_kwargs(std::move(_rtd_kwargs)) + { + _pl2runtime_device_info(rtd_lib, rtd_name); + } + + explicit RTDevice(std::string_view _rtd_lib, std::string_view _rtd_name, + std::string_view _rtd_kwargs) + : rtd_lib(_rtd_lib), rtd_name(_rtd_name), rtd_kwargs(_rtd_kwargs) + { + _pl2runtime_device_info(rtd_lib, rtd_name); + } + + ~RTDevice() = default; + RTDevice(const RTDevice &other) = delete; + RTDevice &operator=(const RTDevice &other) = delete; + RTDevice(RTDevice &&other) = delete; + RTDevice &operator=(RTDevice &&other) = delete; + + auto operator==(const RTDevice &other) const -> bool + { + return (this->rtd_lib == other.rtd_lib && this->rtd_name == other.rtd_name) && + this->rtd_kwargs == other.rtd_kwargs; + } + + [[nodiscard]] auto getQuantumDevicePtr() -> const std::unique_ptr & + { + if (rtd_qdevice) { + return rtd_qdevice; + } + + rtd_dylib = std::make_unique(rtd_lib); + std::string factory_name{rtd_name + "Factory"}; + void *f_ptr = rtd_dylib->getSymbol(factory_name); + rtd_qdevice = std::unique_ptr( + (f_ptr != nullptr) + ? 
reinterpret_cast(f_ptr)(rtd_kwargs.c_str()) + : nullptr); + return rtd_qdevice; + } + + [[nodiscard]] auto getDeviceInfo() const -> std::tuple + { + return {rtd_lib, rtd_name, rtd_kwargs}; + } + + [[nodiscard]] auto getDeviceName() const -> const std::string & { return rtd_name; } + + void setDeviceStatus(RTDeviceStatus new_status) noexcept { status = new_status; } + + [[nodiscard]] auto getDeviceStatus() const -> RTDeviceStatus { return status; } + + friend std::ostream &operator<<(std::ostream &os, const RTDevice &device) + { + os << "RTD, name: " << device.rtd_name << " lib: " << device.rtd_lib + << " kwargs: " << device.rtd_kwargs; + return os; + } +}; + +class ExecutionContext final { + private: + // Device pool + std::vector> device_pool; + std::mutex pool_mu; // To protect device_pool + + bool initial_tape_recorder_status{false}; + + // ExecutionContext pointers + std::unique_ptr memory_man_ptr{nullptr}; + + // PRNG + uint32_t *seed; + std::mt19937 gen; + + public: + explicit ExecutionContext(uint32_t *seed = nullptr) : seed(seed) + { + memory_man_ptr = std::make_unique(); + + if (this->seed != nullptr) { + this->gen = std::mt19937(*seed); + } + } + + ~ExecutionContext() = default; + ExecutionContext(const ExecutionContext &other) = delete; + ExecutionContext &operator=(const ExecutionContext &other) = delete; + ExecutionContext(ExecutionContext &&other) = delete; + ExecutionContext &operator=(ExecutionContext &&other) = delete; + + void setDeviceRecorderStatus(bool status) noexcept { initial_tape_recorder_status = status; } + + [[nodiscard]] auto getDeviceRecorderStatus() const -> bool + { + return initial_tape_recorder_status; + } + + [[nodiscard]] auto getMemoryManager() const -> const std::unique_ptr & + { + return memory_man_ptr; + } + + [[nodiscard]] auto getOrCreateDevice(std::string_view rtd_lib, std::string_view rtd_name, + std::string_view rtd_kwargs) + -> const std::shared_ptr & + { + std::lock_guard lock(pool_mu); + + auto device = 
std::make_shared(rtd_lib, rtd_name, rtd_kwargs); + + const size_t key = device_pool.size(); + for (size_t i = 0; i < key; i++) { + if (device_pool[i]->getDeviceStatus() == RTDeviceStatus::Inactive && + *device_pool[i] == *device) { + device_pool[i]->setDeviceStatus(RTDeviceStatus::Active); + return device_pool[i]; + } + } + + RT_ASSERT(device->getQuantumDevicePtr()); + + // Add a new device + device->setDeviceStatus(RTDeviceStatus::Active); + if (this->seed != nullptr) { + device->getQuantumDevicePtr()->SetDevicePRNG(&(this->gen)); + } + else { + device->getQuantumDevicePtr()->SetDevicePRNG(nullptr); + } + device_pool.push_back(device); + + return device_pool[key]; + } + + [[nodiscard]] auto getOrCreateDevice(const std::string &rtd_lib, + const std::string &rtd_name = {}, + const std::string &rtd_kwargs = {}) + -> const std::shared_ptr & + { + return getOrCreateDevice(std::string_view{rtd_lib}, std::string_view{rtd_name}, + std::string_view{rtd_kwargs}); + } + + [[nodiscard]] auto getDevice(size_t device_key) -> const std::shared_ptr & + { + std::lock_guard lock(pool_mu); + RT_FAIL_IF(device_key >= device_pool.size(), "Invalid device_key"); + return device_pool[device_key]; + } + + void deactivateDevice(RTDevice *RTD_PTR) + { + std::lock_guard lock(pool_mu); + RTD_PTR->setDeviceStatus(RTDeviceStatus::Inactive); + } +}; +} // namespace Catalyst::Runtime diff --git a/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp b/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp new file mode 100644 index 0000000..481da78 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp @@ -0,0 +1,48 @@ +// Copyright 2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "mlir/ExecutionEngine/RunnerUtils.h" + +extern "C" { +void *_mlir_memref_to_llvm_alloc(size_t size); +void *_mlir_memref_to_llvm_aligned_alloc(size_t alignment, size_t size); +bool _mlir_memory_transfer(void *); +void _mlir_memref_to_llvm_free(void *ptr); +} + +// MemRef type definition +template struct MemRefT { + T *data_allocated; + T *data_aligned; + size_t offset; + size_t sizes[R]; + size_t strides[R]; +}; + +template +inline void printMemref(const UnrankedMemRefType &memref, bool printDescriptor = false) +{ + auto m = DynamicMemRefType(memref); + if (printDescriptor) { + std::cout << "MemRef: "; + printMemRefMetaData(std::cout, m); + std::cout << " data =" << std::endl; + } + impl::MemRefDataPrinter::print(std::cout, m.data, m.rank, m.rank, m.offset, m.sizes, + m.strides); +} diff --git a/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp b/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp new file mode 100644 index 0000000..8c1e019 --- /dev/null +++ b/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp @@ -0,0 +1,1012 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "mlir/ExecutionEngine/CRunnerUtils.h" + +#include "Exception.hpp" +#include "QuantumDevice.hpp" + +#include "ExecutionContext.hpp" +#include "MemRefUtils.hpp" +#include "Timer.hpp" + +#include "RuntimeCAPI.h" + +namespace Catalyst::Runtime { + +/** + * @brief Global quantum device unique pointer. + */ +static std::unique_ptr CTX = nullptr; + +/** + * @brief Thread local device pointer with internal linkage. + */ +thread_local static RTDevice *RTD_PTR = nullptr; + +bool getModifiersAdjoint(const Modifiers *modifiers) +{ + return !modifiers ? false : modifiers->adjoint; +} + +std::vector getModifiersControlledWires(const Modifiers *modifiers) +{ + return !modifiers ? std::vector() + : std::vector( + reinterpret_cast(modifiers->controlled_wires), + reinterpret_cast(modifiers->controlled_wires) + + modifiers->num_controlled); +} + +std::vector getModifiersControlledValues(const Modifiers *modifiers) +{ + return !modifiers ? std::vector() + : std::vector(modifiers->controlled_values, + modifiers->controlled_values + modifiers->num_controlled); +} + +#define MODIFIERS_ARGS(mod) \ + getModifiersAdjoint(mod), getModifiersControlledWires(mod), getModifiersControlledValues(mod) + +/** + * @brief Initialize the device instance and update the value of RTD_PTR + * to the new initialized device pointer. 
+ */ +[[nodiscard]] bool initRTDevicePtr(std::string_view rtd_lib, std::string_view rtd_name, + std::string_view rtd_kwargs) +{ + auto &&device = CTX->getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs); + if (device) { + RTD_PTR = device.get(); + return RTD_PTR ? true : false; + } + return false; +} + +/** + * @brief get the active device. + */ +auto getQuantumDevicePtr() -> const std::unique_ptr & +{ + return RTD_PTR->getQuantumDevicePtr(); +} + +/** + * @brief Inactivate the active device instance. + */ +void deactivateDevice() +{ + CTX->deactivateDevice(RTD_PTR); + RTD_PTR = nullptr; +} +} // namespace Catalyst::Runtime + +extern "C" { + +using namespace Catalyst::Runtime; +using timer = catalyst::utils::Timer; + +void __catalyst_inactive_callback(int64_t identifier, int64_t argc, int64_t retc, ...) +{ + // LIBREGISTRY is a compile time macro. It is defined based on the output + // name of the callback library. And since it is stored in the same location + // as this library, it shares the ORIGIN variable. Do a `git grep LIBREGISTRY` + // to find its definition in the CMakeFiles. + // It is the name of the library that contains the callbackCall implementation. + // The reason why this is using dlopen is because we have historically wanted + // to avoid a dependency of python in the runtime. + // With dlopen, we leave the possibility of linking against the runtime without + // linking with LIBREGISTRY which is implemented as a pybind11 module. + // + // The only restriction is that there should be no calls to pyregsitry. + // + // This function cannot be tested from the runtime tests because there would be no valid python + // function to callback... 
+ void *handle = dlopen(LIBREGISTRY, RTLD_LAZY); + if (!handle) { + char *err_msg = dlerror(); + RT_FAIL(err_msg); + } + + void (*callbackCall)(int64_t, int64_t, int64_t, va_list); + typedef void (*func_ptr_t)(int64_t, int64_t, int64_t, va_list); + callbackCall = (func_ptr_t)dlsym(handle, "callbackCall"); + if (!callbackCall) { + char *err_msg = dlerror(); + RT_FAIL(err_msg); + } + + va_list args; + va_start(args, retc); + callbackCall(identifier, argc, retc, args); + va_end(args); + dlclose(handle); +} + +void __catalyst__host__rt__unrecoverable_error() +{ + RT_FAIL("Unrecoverable error from asynchronous execution of multiple quantum programs."); +} + +void *_mlir_memref_to_llvm_alloc(size_t size) +{ + void *ptr = malloc(size); + CTX->getMemoryManager()->insert(ptr); + return ptr; +} + +void *_mlir_memref_to_llvm_aligned_alloc(size_t alignment, size_t size) +{ + void *ptr = aligned_alloc(alignment, size); + CTX->getMemoryManager()->insert(ptr); + return ptr; +} + +bool _mlir_memory_transfer(void *ptr) +{ + if (!CTX->getMemoryManager()->contains(ptr)) { + return false; + } + CTX->getMemoryManager()->erase(ptr); + return true; +} + +void _mlir_memref_to_llvm_free(void *ptr) +{ + CTX->getMemoryManager()->erase(ptr); + free(ptr); +} + +void __catalyst__rt__print_string(char *string) +{ + if (!string) { + std::cout << "None" << std::endl; + return; + } + std::cout << string << std::endl; +} + +void __catalyst__rt__assert_bool(bool p, char *s) { RT_FAIL_IF(!p, s); } + +void __catalyst__rt__print_tensor(OpaqueMemRefT *c_memref, bool printDescriptor) +{ + if (c_memref->datatype == NumericType::idx) { + printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); + } + else if (c_memref->datatype == NumericType::i1) { + printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); + } + else if (c_memref->datatype == NumericType::i8) { + printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); + } + else if (c_memref->datatype == 
NumericType::i16) { + printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); + } + else if (c_memref->datatype == NumericType::i32) { + printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); + } + else if (c_memref->datatype == NumericType::i64) { + printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); + } + else if (c_memref->datatype == NumericType::f32) { + printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); + } + else if (c_memref->datatype == NumericType::f64) { + printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); + } + else if (c_memref->datatype == NumericType::c64) { + printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); + } + else if (c_memref->datatype == NumericType::c128) { + printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); + } + else { + RT_FAIL("Unkown numeric type encoding for array printing."); + } + + std::cout << std::endl; +} + +void __catalyst__rt__fail_cstr(const char *cstr) { RT_FAIL(cstr); } + +void __catalyst__rt__initialize(uint32_t *seed) { CTX = std::make_unique(seed); } + +void __catalyst__rt__finalize() +{ + RTD_PTR = nullptr; + CTX.reset(nullptr); +} + +static int __catalyst__rt__device_init__impl(int8_t *rtd_lib, int8_t *rtd_name, int8_t *rtd_kwargs, + int64_t shots) +{ + // Device library cannot be a nullptr + RT_FAIL_IF(!rtd_lib, "Invalid device library"); + RT_FAIL_IF(!CTX, "Invalid use of the global driver before initialization"); + RT_FAIL_IF(RTD_PTR, "Cannot re-initialize an ACTIVE device: Consider using " + "__catalyst__rt__device_release before __catalyst__rt__device_init"); + + const std::vector args{ + reinterpret_cast(rtd_lib), (rtd_name ? reinterpret_cast(rtd_name) : ""), + (rtd_kwargs ? 
reinterpret_cast(rtd_kwargs) : "")}; + RT_FAIL_IF(!initRTDevicePtr(args[0], args[1], args[2]), + "Failed initialization of the backend device"); + getQuantumDevicePtr()->SetDeviceShots(shots); + if (CTX->getDeviceRecorderStatus()) { + getQuantumDevicePtr()->StartTapeRecording(); + } + return 0; +} + +void __catalyst__rt__device_init(int8_t *rtd_lib, int8_t *rtd_name, int8_t *rtd_kwargs, + int64_t shots) +{ + timer::timer(__catalyst__rt__device_init__impl, "device_init", /* add_endl */ true, rtd_lib, + rtd_name, rtd_kwargs, shots); +} + +static int __catalyst__rt__device_release__impl() +{ + RT_FAIL_IF(!CTX, "Cannot release an ACTIVE device out of scope of the global driver"); + // TODO: This will be used for the async support + deactivateDevice(); + return 0; +} + +void __catalyst__rt__device_release() +{ + timer::timer(__catalyst__rt__device_release__impl, "device_release", /* add_endl */ true); +} + +void __catalyst__rt__print_state() { getQuantumDevicePtr()->PrintState(); } + +void __catalyst__rt__toggle_recorder(bool status) +{ + CTX->setDeviceRecorderStatus(status); + if (!RTD_PTR) { + return; + } + + if (status) { + getQuantumDevicePtr()->StartTapeRecording(); + } + else { + getQuantumDevicePtr()->StopTapeRecording(); + } +} + +static QUBIT *__catalyst__rt__qubit_allocate__impl() +{ + RT_ASSERT(getQuantumDevicePtr() != nullptr); + RT_ASSERT(CTX->getMemoryManager() != nullptr); + + return reinterpret_cast(getQuantumDevicePtr()->AllocateQubit()); +} + +QUBIT *__catalyst__rt__qubit_allocate() +{ + return timer::timer(__catalyst__rt__qubit_allocate__impl, "qubit_allocate", + /* add_endl */ true); +} + +static QirArray *__catalyst__rt__qubit_allocate_array__impl(int64_t num_qubits) +{ + RT_ASSERT(getQuantumDevicePtr() != nullptr); + RT_ASSERT(CTX->getMemoryManager() != nullptr); + RT_ASSERT(num_qubits >= 0); + + // For first prototype, we just want to make this work. + // But ideally, I think the device should determine the representation. 
+ // Essentially just forward this to the device library. + // And the device library can choose how to handle everything. + std::vector qubit_vector = getQuantumDevicePtr()->AllocateQubits(num_qubits); + + // I don't like this copying. + std::vector *qubit_vector_ptr = + new std::vector(qubit_vector.begin(), qubit_vector.end()); + + // Because this function is interfacing with C + // I think we should return a trivial-type + // https://en.cppreference.com/w/cpp/named_req/TrivialType + // Why should we return a trivial type? + // + // Paraphrasing from stackoverflow: https://stackoverflow.com/a/72409589 + // extern "C" will avoid name mangling from happening. + // It doesn't prevent a function from returning or accepting a C++ type. + // But the calling language needs to understand the data-layout for the + // type being returned. + // For non-trivial types, this will be difficult to impossible. + return (QirArray *)qubit_vector_ptr; +} + +QirArray *__catalyst__rt__qubit_allocate_array(int64_t num_qubits) +{ + return timer::timer(__catalyst__rt__qubit_allocate_array__impl, "qubit_allocate_array", + /* add_endl */ true, num_qubits); +} + +static int __catalyst__rt__qubit_release__impl(QUBIT *qubit) +{ + getQuantumDevicePtr()->ReleaseQubit(reinterpret_cast(qubit)); + return 0; +} + +void __catalyst__rt__qubit_release(QUBIT *qubit) +{ + timer::timer(__catalyst__rt__qubit_release__impl, "qubit_release", + /* add_endl */ true, qubit); +} + +static int __catalyst__rt__qubit_release_array__impl(QirArray *qubit_array) +{ + getQuantumDevicePtr()->ReleaseAllQubits(); + std::vector *qubit_array_ptr = + reinterpret_cast *>(qubit_array); + delete qubit_array_ptr; + return 0; +} + +void __catalyst__rt__qubit_release_array(QirArray *qubit_array) +{ + timer::timer(__catalyst__rt__qubit_release_array__impl, "qubit_release_array", + /* add_endl */ true, qubit_array); +} + +int64_t __catalyst__rt__num_qubits() +{ + return static_cast(getQuantumDevicePtr()->GetNumQubits()); +} + +bool 
__catalyst__rt__result_equal(RESULT *r0, RESULT *r1) { return (r0 == r1) || (*r0 == *r1); } + +RESULT *__catalyst__rt__result_get_one() { return getQuantumDevicePtr()->One(); } + +RESULT *__catalyst__rt__result_get_zero() { return getQuantumDevicePtr()->Zero(); } + +void __catalyst__qis__Gradient(int64_t numResults, /* results = */...) +{ + RT_ASSERT(numResults >= 0); + using ResultType = MemRefT; + + std::vector mem_ptrs; + mem_ptrs.reserve(numResults); + va_list args; + va_start(args, numResults); + for (int64_t i = 0; i < numResults; i++) { + mem_ptrs.push_back(va_arg(args, ResultType *)); + } + va_end(args); + + std::vector> mem_views; + mem_views.reserve(numResults); + for (auto *mr : mem_ptrs) { + mem_views.emplace_back(mr->data_aligned, mr->offset, mr->sizes, mr->strides); + } + + // num_observables * num_train_params + getQuantumDevicePtr()->Gradient(mem_views, {}); +} + +void __catalyst__qis__Gradient_params(MemRefT_int64_1d *params, int64_t numResults, + /* results = */...) +{ + RT_ASSERT(numResults >= 0); + using ResultType = MemRefT; + + if (params == nullptr || !params->sizes[0]) { + RT_FAIL("Invalid number of trainable parameters"); + } + + const size_t tp_size = params->sizes[0]; + + // create a vector of custom trainable parameters + std::vector train_params; + auto *params_data = params->data_aligned; + train_params.reserve(tp_size); + for (size_t i = 0; i < tp_size; i++) { + auto p = params_data[i]; + RT_FAIL_IF(p < 0, "trainable parameter cannot be a negative integer"); + train_params.push_back(p); + } + + std::vector mem_ptrs; + mem_ptrs.reserve(numResults); + va_list args; + va_start(args, numResults); + for (int64_t i = 0; i < numResults; i++) { + mem_ptrs.push_back(va_arg(args, ResultType *)); + } + va_end(args); + + std::vector> mem_views; + mem_views.reserve(numResults); + for (auto *mr : mem_ptrs) { + mem_views.emplace_back(mr->data_aligned, mr->offset, mr->sizes, mr->strides); + } + + // num_observables * num_train_params + 
getQuantumDevicePtr()->Gradient(mem_views, train_params); +} + +void __catalyst__qis__GlobalPhase(double phi, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("GlobalPhase", {phi}, {}, MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__SetState(MemRefT_CplxT_double_1d *data, uint64_t numQubits, ...) +{ + RT_ASSERT(numQubits > 0); + + va_list args; + va_start(args, numQubits); + std::vector wires(numQubits); + for (uint64_t i = 0; i < numQubits; i++) { + wires[i] = va_arg(args, QubitIdType); + } + va_end(args); + + MemRefT, 1> *data_p = (MemRefT, 1> *)data; + DataView, 1> data_view(data_p->data_aligned, data_p->offset, data_p->sizes, + data_p->strides); + getQuantumDevicePtr()->SetState(data_view, wires); +} + +void __catalyst__qis__SetBasisState(MemRefT_int8_1d *data, uint64_t numQubits, ...) +{ + RT_ASSERT(numQubits > 0); + + DataView data_view(data->data_aligned, data->offset, data->sizes, data->strides); + + va_list args; + va_start(args, numQubits); + std::vector wires(numQubits); + for (uint64_t i = 0; i < numQubits; i++) { + wires[i] = va_arg(args, QubitIdType); + } + va_end(args); + std::unordered_set wire_set(wires.begin(), wires.end()); + RT_FAIL_IF(wire_set.size() != numQubits, "Wires must be unique"); + RT_FAIL_IF(data->sizes[0] != numQubits, + "BasisState parameter and wires must be of equal length."); + + getQuantumDevicePtr()->SetBasisState(data_view, wires); +} + +void __catalyst__qis__Identity(QUBIT *qubit, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("Identity", {}, {reinterpret_cast(qubit)}, + MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__PauliX(QUBIT *qubit, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("PauliX", {}, {reinterpret_cast(qubit)}, + MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__PauliY(QUBIT *qubit, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("PauliY", {}, {reinterpret_cast(qubit)}, + MODIFIERS_ARGS(modifiers)); 
+} + +void __catalyst__qis__PauliZ(QUBIT *qubit, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("PauliZ", {}, {reinterpret_cast(qubit)}, + MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__Hadamard(QUBIT *qubit, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {reinterpret_cast(qubit)}, + MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__S(QUBIT *qubit, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("S", {}, {reinterpret_cast(qubit)}, + MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__T(QUBIT *qubit, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("T", {}, {reinterpret_cast(qubit)}, + MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__PhaseShift(double theta, QUBIT *qubit, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation( + "PhaseShift", {theta}, {reinterpret_cast(qubit)}, MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__RX(double theta, QUBIT *qubit, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("RX", {theta}, {reinterpret_cast(qubit)}, + MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__RY(double theta, QUBIT *qubit, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("RY", {theta}, {reinterpret_cast(qubit)}, + MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__RZ(double theta, QUBIT *qubit, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("RZ", {theta}, {reinterpret_cast(qubit)}, + MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__Rot(double phi, double theta, double omega, QUBIT *qubit, + const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("Rot", {phi, theta, omega}, + {reinterpret_cast(qubit)}, + MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__CNOT(QUBIT *control, QUBIT *target, const Modifiers *modifiers) +{ + RT_FAIL_IF(control == target, + "Invalid input for CNOT gate. 
Control and target qubit operands must be distinct."); + getQuantumDevicePtr()->NamedOperation("CNOT", {}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__CY(QUBIT *control, QUBIT *target, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("CY", {}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__CZ(QUBIT *control, QUBIT *target, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("CZ", {}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__SWAP(QUBIT *control, QUBIT *target, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("SWAP", {}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__IsingXX(double theta, QUBIT *control, QUBIT *target, + const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("IsingXX", {theta}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__IsingYY(double theta, QUBIT *control, QUBIT *target, + const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("IsingYY", {theta}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__IsingXY(double theta, QUBIT *control, QUBIT *target, + const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("IsingXY", {theta}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ 
MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__IsingZZ(double theta, QUBIT *control, QUBIT *target, + const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("IsingZZ", {theta}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__ControlledPhaseShift(double theta, QUBIT *control, QUBIT *target, + const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("ControlledPhaseShift", {theta}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__CRX(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("CRX", {theta}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__CRY(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("CRY", {theta}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__CRZ(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("CRZ", {theta}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__CRot(double phi, double theta, double omega, QUBIT *control, QUBIT *target, + const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("CRot", {phi, theta, omega}, + {/* control = */ reinterpret_cast(control), + /* target = */ reinterpret_cast(target)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__CSWAP(QUBIT *control, QUBIT *aswap, QUBIT *bswap, const 
Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("CSWAP", {}, + {reinterpret_cast(control), + reinterpret_cast(aswap), + reinterpret_cast(bswap)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__Toffoli(QUBIT *wire0, QUBIT *wire1, QUBIT *wire2, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation("Toffoli", {}, + {reinterpret_cast(wire0), + reinterpret_cast(wire1), + reinterpret_cast(wire2)}, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__MultiRZ(double theta, const Modifiers *modifiers, int64_t numQubits, ...) +{ + RT_ASSERT(numQubits >= 0); + + va_list args; + va_start(args, numQubits); + std::vector wires(numQubits); + for (int64_t i = 0; i < numQubits; i++) { + wires[i] = va_arg(args, QubitIdType); + } + va_end(args); + + getQuantumDevicePtr()->NamedOperation("MultiRZ", {theta}, wires, + /* modifiers */ MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__ISWAP(QUBIT *wire0, QUBIT *wire1, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation( + "ISWAP", {}, {reinterpret_cast(wire0), reinterpret_cast(wire1)}, + MODIFIERS_ARGS(modifiers)); +} + +void __catalyst__qis__PSWAP(double phi, QUBIT *wire0, QUBIT *wire1, const Modifiers *modifiers) +{ + getQuantumDevicePtr()->NamedOperation( + "PSWAP", {phi}, + {reinterpret_cast(wire0), reinterpret_cast(wire1)}, + MODIFIERS_ARGS(modifiers)); +} + +static void _qubitUnitary_impl(MemRefT_CplxT_double_2d *matrix, int64_t numQubits, + std::vector> &coeffs, + std::vector &wires, va_list *args) +{ + const size_t num_rows = matrix->sizes[0]; + const size_t num_col = matrix->sizes[1]; + const size_t expected_size = std::pow(2, numQubits); + + if (num_rows != expected_size || num_col != expected_size) { + RT_FAIL("Invalid given QubitUnitary matrix; " + "The size of the matrix must be pow(2, numWires) * pow(2, numWires)."); + } + + wires.reserve(numQubits); + for (int64_t i = 0; i < numQubits; i++) { + 
wires.push_back(va_arg(*args, QubitIdType)); + } + + const size_t matrix_size = num_rows * num_col; + coeffs.reserve(matrix_size); + for (size_t i = 0; i < matrix_size; i++) { + coeffs.emplace_back(matrix->data_aligned[i].real, matrix->data_aligned[i].imag); + } +} + +void __catalyst__qis__QubitUnitary(MemRefT_CplxT_double_2d *matrix, const Modifiers *modifiers, + int64_t numQubits, /*qubits*/...) +{ + RT_ASSERT(numQubits >= 0); + + if (matrix == nullptr) { + RT_FAIL("The QubitUnitary matrix must be initialized"); + } + + if (numQubits > __catalyst__rt__num_qubits()) { + RT_FAIL("Invalid number of wires"); + } + + va_list args; + std::vector> coeffs; + std::vector wires; + va_start(args, numQubits); + _qubitUnitary_impl(matrix, numQubits, coeffs, wires, &args); + va_end(args); + return getQuantumDevicePtr()->MatrixOperation(coeffs, wires, MODIFIERS_ARGS(modifiers)); +} + +ObsIdType __catalyst__qis__NamedObs(int64_t obsId, QUBIT *wire) +{ + return getQuantumDevicePtr()->Observable(static_cast(obsId), {}, + {reinterpret_cast(wire)}); +} + +ObsIdType __catalyst__qis__HermitianObs(MemRefT_CplxT_double_2d *matrix, int64_t numQubits, ...) 
+{ + RT_ASSERT(numQubits >= 0); + + if (matrix == nullptr) { + RT_FAIL("The Hermitian matrix must be initialized"); + } + + const size_t num_rows = matrix->sizes[0]; + const size_t num_col = matrix->sizes[1]; + const size_t expected_size = std::pow(2, numQubits); + + if (num_rows != expected_size || num_col != expected_size) { + RT_FAIL("Invalid given Hermitian matrix; " + "The size of the matrix must be pow(2, numWires) * pow(2, numWires)."); + } + + va_list args; + va_start(args, numQubits); + std::vector wires(numQubits); + for (int64_t i = 0; i < numQubits; i++) { + wires[i] = va_arg(args, QubitIdType); + } + va_end(args); + + if (numQubits > __catalyst__rt__num_qubits()) { + RT_FAIL("Invalid number of wires"); + } + + const size_t matrix_size = num_rows * num_col; + std::vector> coeffs; + coeffs.reserve(matrix_size); + for (size_t i = 0; i < matrix_size; i++) { + coeffs.emplace_back(matrix->data_aligned[i].real, matrix->data_aligned[i].imag); + } + + return getQuantumDevicePtr()->Observable(ObsId::Hermitian, coeffs, wires); +} + +ObsIdType __catalyst__qis__TensorObs(int64_t numObs, /*obsKeys*/...) +{ + if (numObs < 1) { + RT_FAIL("Invalid number of observables to create TensorProdObs"); + } + + va_list args; + va_start(args, numObs); + std::vector obsKeys; + obsKeys.reserve(numObs); + for (int64_t i = 0; i < numObs; i++) { + obsKeys.push_back(va_arg(args, ObsIdType)); + } + va_end(args); + + return getQuantumDevicePtr()->TensorObservable(obsKeys); +} + +ObsIdType __catalyst__qis__HamiltonianObs(MemRefT_double_1d *coeffs, int64_t numObs, + /*obsKeys*/...) 
+{ + RT_ASSERT(numObs >= 0); + + if (coeffs == nullptr) { + RT_FAIL("Invalid coefficients for computing Hamiltonian; " + "The coefficients list must be initialized."); + } + + const size_t coeffs_size = coeffs->sizes[0]; + + if (static_cast(numObs) != coeffs_size) { + RT_FAIL("Invalid coefficients for computing Hamiltonian; " + "The number of coefficients and observables must be equal."); + } + + va_list args; + va_start(args, numObs); + std::vector obsKeys; + obsKeys.reserve(numObs); + for (int64_t i = 0; i < numObs; i++) { + obsKeys.push_back(va_arg(args, ObsIdType)); + } + va_end(args); + + std::vector coeffs_vec(coeffs->data_aligned, coeffs->data_aligned + coeffs_size); + return getQuantumDevicePtr()->HamiltonianObservable(coeffs_vec, obsKeys); +} + +RESULT *__catalyst__qis__Measure(QUBIT *wire, int32_t postselect) +{ + std::optional postselectOpt{postselect}; + + // Any value different to 0 or 1 denotes absence of postselect, and it is hence turned into + // std::nullopt at the C++ interface + if (postselect != 0 && postselect != 1) { + postselectOpt = std::nullopt; + } + + return getQuantumDevicePtr()->Measure(reinterpret_cast(wire), postselectOpt); +} + +double __catalyst__qis__Expval(ObsIdType obsKey) { return getQuantumDevicePtr()->Expval(obsKey); } + +double __catalyst__qis__Variance(ObsIdType obsKey) { return getQuantumDevicePtr()->Var(obsKey); } + +void __catalyst__qis__State(MemRefT_CplxT_double_1d *result, int64_t numQubits, ...) 
+{ + RT_ASSERT(numQubits >= 0); + MemRefT, 1> *result_p = (MemRefT, 1> *)result; + + va_list args; + va_start(args, numQubits); + std::vector wires(numQubits); + for (int64_t i = 0; i < numQubits; i++) { + wires[i] = va_arg(args, QubitIdType); + } + va_end(args); + + DataView, 1> view(result_p->data_aligned, result_p->offset, + result_p->sizes, result_p->strides); + + if (wires.empty()) { + getQuantumDevicePtr()->State(view); + } + else { + RT_FAIL("Partial State-Vector not supported yet"); + // getQuantumDevicePtr()->PartialState(stateVec, + // numElements, wires); + } +} + +void __catalyst__qis__Probs(MemRefT_double_1d *result, int64_t numQubits, ...) +{ + RT_ASSERT(numQubits >= 0); + MemRefT *result_p = (MemRefT *)result; + + va_list args; + va_start(args, numQubits); + std::vector wires(numQubits); + for (int64_t i = 0; i < numQubits; i++) { + wires[i] = va_arg(args, QubitIdType); + } + va_end(args); + + DataView view(result_p->data_aligned, result_p->offset, result_p->sizes, + result_p->strides); + + if (wires.empty()) { + getQuantumDevicePtr()->Probs(view); + } + else { + getQuantumDevicePtr()->PartialProbs(view, wires); + } +} + +void __catalyst__qis__Sample(MemRefT_double_2d *result, int64_t numQubits, ...) +{ + int64_t shots = getQuantumDevicePtr()->GetDeviceShots(); + RT_ASSERT(shots >= 0); + RT_ASSERT(numQubits >= 0); + MemRefT *result_p = (MemRefT *)result; + + va_list args; + va_start(args, numQubits); + std::vector wires(numQubits); + for (int64_t i = 0; i < numQubits; i++) { + wires[i] = va_arg(args, QubitIdType); + } + va_end(args); + + DataView view(result_p->data_aligned, result_p->offset, result_p->sizes, + result_p->strides); + + if (wires.empty()) { + getQuantumDevicePtr()->Sample(view, shots); + } + else { + getQuantumDevicePtr()->PartialSample(view, wires, shots); + } +} + +void __catalyst__qis__Counts(PairT_MemRefT_double_int64_1d *result, int64_t numQubits, ...) 
+{ + int64_t shots = getQuantumDevicePtr()->GetDeviceShots(); + RT_ASSERT(shots >= 0); + RT_ASSERT(numQubits >= 0); + MemRefT *result_eigvals_p = (MemRefT *)&result->first; + MemRefT *result_counts_p = (MemRefT *)&result->second; + + va_list args; + va_start(args, numQubits); + std::vector wires(numQubits); + for (int64_t i = 0; i < numQubits; i++) { + wires[i] = va_arg(args, QubitIdType); + } + va_end(args); + + DataView eigvals_view(result_eigvals_p->data_aligned, result_eigvals_p->offset, + result_eigvals_p->sizes, result_eigvals_p->strides); + DataView counts_view(result_counts_p->data_aligned, result_counts_p->offset, + result_counts_p->sizes, result_counts_p->strides); + + if (wires.empty()) { + getQuantumDevicePtr()->Counts(eigvals_view, counts_view, shots); + } + else { + getQuantumDevicePtr()->PartialCounts(eigvals_view, counts_view, wires, shots); + } +} + +int64_t __catalyst__rt__array_get_size_1d(QirArray *ptr) +{ + std::vector *qubit_vector_ptr = reinterpret_cast *>(ptr); + return qubit_vector_ptr->size(); +} + +int8_t *__catalyst__rt__array_get_element_ptr_1d(QirArray *ptr, int64_t idx) +{ + std::vector *qubit_vector_ptr = reinterpret_cast *>(ptr); + + RT_ASSERT(idx >= 0); + std::string error_msg = "The qubit register does not contain the requested wire: "; + error_msg += std::to_string(idx); + RT_FAIL_IF(static_cast(idx) >= qubit_vector_ptr->size(), error_msg.c_str()); + + QubitIdType *data = qubit_vector_ptr->data(); + return (int8_t *)&data[idx]; +} +} diff --git a/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt new file mode 100644 index 0000000..2c19e4a --- /dev/null +++ b/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt @@ -0,0 +1,33 @@ +# nanobind suggests including these lines to configure CMake to perform an optimized release build +# by default unless another build type is specified. 
Without this addition, binding code may run +# slowly and produce large binaries. +# See https://nanobind.readthedocs.io/en/latest/building.html#preliminaries +if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) + set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") +endif() + +# Locate nanobind +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import nanobind; print(nanobind.cmake_dir())" + OUTPUT_VARIABLE nanobind_DIR OUTPUT_STRIP_TRAILING_WHITESPACE +) +find_package(nanobind CONFIG REQUIRED) + +# Source file list for `wrapper` module +set(REGISTRY_SRC_FILES + Registry.cpp +) + +# Create the Python `catalyst_callback_registry` module +# Target the stable ABI for Python 3.12+, which reduces the number of binary wheels that must be +# built (`STABLE_ABI` does nothing on older Python versions). +nanobind_add_module(catalyst_callback_registry STABLE_ABI ${REGISTRY_SRC_FILES}) + +# Use a consistant suffix ".so" rather than, e.g. ".abi3.so" (when using the Stable ABI) or +# ".cpython-3xx-darwin.so". Doing so simplifies the process to locate it when calling +# `dlopen(LIBREGISTRY)` in runtime/lib/capi/RuntimeCAPI.cpp. +set_target_properties(catalyst_callback_registry PROPERTIES SUFFIX ".so") + +target_include_directories(catalyst_callback_registry PUBLIC ${runtime_includes}) +target_compile_definitions(catalyst_qir_qis_obj PUBLIC -DLIBREGISTRY=\"$\") diff --git a/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp b/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp new file mode 100644 index 0000000..fd4715d --- /dev/null +++ b/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp @@ -0,0 +1,179 @@ +// Copyright 2024 Xanadu Quantum Technologies Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include + +namespace nb = nanobind; + +// From PyBind11's documentation: +// +// Do you have any global variables that are pybind11 objects or invoke pybind11 functions in +// either their constructor or destructor? You are generally not allowed to invoke any Python +// function in a global static context. We recommend using lazy initialization and then +// intentionally leaking at the end of the program. +// +// https://pybind11.readthedocs.io/en/stable/advanced/misc.html#common-sources-of-global-interpreter-lock-errors +std::unordered_map *references; + +std::string libmlirpath; + +struct UnrankedMemrefType { + int64_t rank; + void *descriptor; +}; + +class LibraryManager { + void *_handle; + + public: + LibraryManager(std::string path) + { + this->_handle = dlopen(path.c_str(), RTLD_LAZY); + if (!this->_handle) { + throw nb::value_error(dlerror()); + } + } + + ~LibraryManager() + { + if (this->_handle) { + dlclose(this->_handle); + } + } + + void operator()(long elementSize, UnrankedMemrefType *src, UnrankedMemrefType *dst) + { + void *f_ptr = dlsym(this->_handle, "memrefCopy"); + if (!f_ptr) { + throw nb::value_error(dlerror()); + } + typedef void (*memrefCopy_t)(int64_t, void *, void *); + void (*memrefCopy)(int64_t, void *, void *); + memrefCopy = (memrefCopy_t)(f_ptr); + return memrefCopy(elementSize, src, dst); + } +}; + +inline const char *ext() +{ +#ifdef __APPLE__ + return ".dylib"; +#elif __linux__ + return ".so"; +#else +#error "Only apple and linux are currently 
supported"; +#endif +} + +std::string library_name(std::string name) { return name + ext(); } + +void convertResult(nb::handle tuple) +{ + nb::object unrankedMemrefPtrSizeTuple = tuple.attr("__getitem__")(0); + + nb::object unranked_memref = unrankedMemrefPtrSizeTuple.attr("__getitem__")(0); + nb::object element_size = unrankedMemrefPtrSizeTuple.attr("__getitem__")(1); + nb::object unranked_memref_ptr_int = unranked_memref.attr("value"); + + void *unranked_memref_ptr = reinterpret_cast(nb::cast(unranked_memref_ptr_int)); + long e_size = nb::cast(element_size); + + nb::object dest = tuple.attr("__getitem__")(1); + + long destAsLong = nb::cast(dest); + void *destAsPtr = (void *)(destAsLong); + + UnrankedMemrefType *src = (UnrankedMemrefType *)unranked_memref_ptr; + UnrankedMemrefType destMemref = {src->rank, destAsPtr}; + + std::string libpath = libmlirpath + library_name("/libmlir_c_runner_utils"); + LibraryManager memrefCopy(libpath); + memrefCopy(e_size, src, &destMemref); +} + +void convertResults(nb::list results, nb::list allocated) +{ + auto builtins = nb::module_::import_("builtins"); + auto zip = builtins.attr("zip"); + for (nb::handle obj : zip(results, allocated)) { + convertResult(obj); + } +} + +extern "C" { +[[gnu::visibility("default")]] void callbackCall(int64_t identifier, int64_t count, int64_t retc, + va_list args) +{ + nb::gil_scoped_acquire lock; + auto it = references->find(identifier); + if (it == references->end()) { + throw std::invalid_argument("Callback called with invalid identifier"); + } + auto lambda = it->second; + + nb::list flat_args; + for (int i = 0; i < count; i++) { + int64_t ptr = va_arg(args, int64_t); + flat_args.append(ptr); + } + + nb::list flat_results = nb::list(lambda(flat_args)); + + // We have a flat list of return values. + // These returns **may** be array views to + // the very same memrefs that we passed as inputs. + // As a first prototype, let's copy these values. 
+ // I think it is best to always copy them because + // of aliasing. Let's just copy them to guarantee + // no aliasing issues. We can revisit this as an optimization + // and allowing these to alias. + nb::list flat_returns_allocated_compiler; + for (int i = 0; i < retc; i++) { + int64_t ptr = va_arg(args, int64_t); + flat_returns_allocated_compiler.append(ptr); + } + convertResults(flat_results, flat_returns_allocated_compiler); +} +} + +void setMLIRLibPath(std::string path) { libmlirpath = path; } + +auto registerImpl(nb::callable f) +{ + // Do we need to see if it is already present or can we just override it? Just override is fine. + // Does python reuse id's? Yes. + // But only after they have been garbaged collected. + // So as long as we maintain a reference to it, then they won't be garbage collected. + // Inserting the function into the unordered map increases the reference by one. + int64_t id = reinterpret_cast(f.ptr()); + references->insert({id, f}); + return id; +} + +NB_MODULE(catalyst_callback_registry, m) +{ + if (references == nullptr) { + references = new std::unordered_map(); + } + m.doc() = "Callbacks"; + m.def("register", ®isterImpl, "Call a python function registered in a map."); + m.def("set_mlir_lib_path", &setMLIRLibPath, "Set location of mlir's libraries."); +} From 40c8fb73579e76765cc5f0baac22ab5bbd918e5f Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Tue, 14 Jan 2025 22:52:33 +0000 Subject: [PATCH 33/64] add simple demo --- src/qirlightning/simple_demo/README.md | 61 +++++++++++++++++++ .../simple_demo/test_rt_device.cpp | 40 ++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 src/qirlightning/simple_demo/README.md create mode 100644 src/qirlightning/simple_demo/test_rt_device.cpp diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md new file mode 100644 index 0000000..7a25fe4 --- /dev/null +++ b/src/qirlightning/simple_demo/README.md @@ -0,0 +1,61 @@ +# Simple Demo for 
Catalyst/Lightning runtime + +This is a super simple demo for using Catalyst runtime to drive Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present). + +The new files required are in `../catalyst_runtime`, which contains a subset of files from the [Catalyst Runtime](https://github.com/PennyLaneAI/catalyst/tree/main/runtime). + +## Installing a lightning simulator + +When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. + +Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. + +Example: +``` +$ pip install pennylane-lightning-kokkos + +$ pip show pennylane-lightning-kokkos +Name: PennyLane_Lightning_Kokkos +Version: 0.39.0 +Summary: PennyLane-Lightning plugin +Home-page: https://github.com/PennyLaneAI/pennylane-lightning +Author: +Author-email: +License: Apache License 2.0 +Location: /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages +Requires: pennylane, pennylane-lightning + +$ ls /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning +... liblightning_kokkos_catalyst.so ... +``` + +You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.gpu simulators. 
+ +## Compilation + +To compile: + +``` +$ clang++ --std=c++20 test_rt_device.cpp -I/home/joseph/work/qiree/catalyst/runtime/include -I/home/joseph/work/qiree/catalyst/runtime/lib/capi -I/home/joseph/work/qiree/catalyst/runtime/lib/backend/common -o test_rt_device.out +``` + +To run: + +``` +$ ./test_rt_device.out +Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set + In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads + For best performance with OpenMP 3.1 set OMP_PROC_BIND=true + For unit testing set OMP_PROC_BIND=false + +Num Qubits = 3 +State = +*** State-Vector of Size 8 *** +[(0.707107,0), (0,0), (0,0), (0,0), (0.707107,0), (0,0), (0,0), (0,0)] +Measure on wire 0 = 0 +``` + +To run on other devices, e.g. lightning.gpu, you need to change: +- `pip install custatevec-cu12 pennylane-lightning-gpu` (custatevec is a dependency) +- replace `RTDLIB` and `RTDNAME` from `kokkos` to `GPU` +- include `cuquantum` libraries when running, e.g. `LD_LIBRARY_PATH=/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/cuquantum/lib/:$LD_LIBRARY_PATH ./test_rt_device.out` diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp new file mode 100644 index 0000000..f70410a --- /dev/null +++ b/src/qirlightning/simple_demo/test_rt_device.cpp @@ -0,0 +1,40 @@ +#include "ExecutionContext.hpp" + +// Runtime libraries (kokkos/GPU/qubit etc.) 
+#define RTDLIB "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning/liblightning_kokkos_catalyst.so" // change to liblightning_gpu_catalyst.so +#define RTDNAME "LightningKokkosSimulator" // change to LightningGPUSimulator + +using namespace Catalyst::Runtime; + +static inline std::shared_ptr loadRTDevice(const std::string &rtd_lib, + const std::string &rtd_name = {}, + const std::string &rtd_kwargs = {}) +{ + ExecutionContext context; + return context.getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs); +} + +int main() { + auto RTDevice = loadRTDevice(RTDLIB, RTDNAME, ""); + + // Allocate Qubits + RTDevice->getQuantumDevicePtr()->AllocateQubits(3); + + // Get Num Qubits + std::cout << "Num Qubits = " << RTDevice->getQuantumDevicePtr()->GetNumQubits() << std::endl; + + // Apply Gate + RTDevice->getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {0}); + + // Print State + std::cout << "State = " << std::endl; + RTDevice->getQuantumDevicePtr()->PrintState(); + + // Measure + QubitIdType wire{0}; + Result result = RTDevice->getQuantumDevicePtr()->Measure(wire, std::nullopt); + std::cout << "Measure on wire 0 = " << *result << std::endl; + + + return 0; +} From a76563c4344f28d32e959444f74c7b070fce43ed Mon Sep 17 00:00:00 2001 From: Joseph Lee <40768758+josephleekl@users.noreply.github.com> Date: Wed, 15 Jan 2025 09:58:29 -0500 Subject: [PATCH 34/64] Update README.md --- src/qirlightning/simple_demo/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md index 7a25fe4..f514e92 100644 --- a/src/qirlightning/simple_demo/README.md +++ b/src/qirlightning/simple_demo/README.md @@ -36,7 +36,7 @@ You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for ligh To compile: ``` -$ clang++ --std=c++20 test_rt_device.cpp -I/home/joseph/work/qiree/catalyst/runtime/include -I/home/joseph/work/qiree/catalyst/runtime/lib/capi 
-I/home/joseph/work/qiree/catalyst/runtime/lib/backend/common -o test_rt_device.out +$ clang++ --std=c++20 test_rt_device.cpp -I../catalyst_runtime/lib/capi -I../catalyst_runtime/include -o test_rt_device.out ``` To run: From 611fa66e1ca434fa6728ae6f9c4693454bf665a2 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 15 Jan 2025 19:07:51 +0000 Subject: [PATCH 35/64] remove catalyst runtime deps and update demo --- src/qirlightning/catalyst_runtime/.clang-tidy | 232 ---- src/qirlightning/catalyst_runtime/.gitignore | 3 - .../catalyst_runtime/CMakeLists.txt | 133 --- src/qirlightning/catalyst_runtime/Makefile | 121 -- .../catalyst_runtime/lib/CMakeLists.txt | 3 - .../lib/backend/CMakeLists.txt | 7 - .../lib/backend/common/CacheManager.hpp | 199 ---- .../lib/backend/common/QubitManager.hpp | 146 --- .../lib/backend/common/Utils.hpp | 304 ----- .../catalyst_runtime/lib/capi/CMakeLists.txt | 57 - .../lib/capi/ExecutionContext.hpp | 367 ------ .../catalyst_runtime/lib/capi/MemRefUtils.hpp | 48 - .../catalyst_runtime/lib/capi/RuntimeCAPI.cpp | 1012 ----------------- .../lib/registry/CMakeLists.txt | 33 - .../lib/registry/Registry.cpp | 179 --- src/qirlightning/simple_demo/README.md | 11 +- .../simple_demo/test_rt_device.cpp | 83 +- 17 files changed, 64 insertions(+), 2874 deletions(-) delete mode 100644 src/qirlightning/catalyst_runtime/.clang-tidy delete mode 100644 src/qirlightning/catalyst_runtime/.gitignore delete mode 100644 src/qirlightning/catalyst_runtime/CMakeLists.txt delete mode 100644 src/qirlightning/catalyst_runtime/Makefile delete mode 100644 src/qirlightning/catalyst_runtime/lib/CMakeLists.txt delete mode 100644 src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt delete mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp delete mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp delete mode 100644 src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp delete mode 100644 
src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt delete mode 100644 src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp delete mode 100644 src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp delete mode 100644 src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp delete mode 100644 src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt delete mode 100644 src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp diff --git a/src/qirlightning/catalyst_runtime/.clang-tidy b/src/qirlightning/catalyst_runtime/.clang-tidy deleted file mode 100644 index e7ca11f..0000000 --- a/src/qirlightning/catalyst_runtime/.clang-tidy +++ /dev/null @@ -1,232 +0,0 @@ ---- -Checks: '-*,clang-diagnostic-*,clang-analyzer-*,-llvmlibc-*,modernize-*,-modernize-use-trailing-return-type,clang-analyzer-cplusplus*,openmp-*,performance-*,portability-*,readability-*,-modernize-avoid-c-arrays,-readability-magic-numbers,hicpp-*,-hicpp-no-array-decay,-hicpp-avoid-c-arrays,bugprone-suspicious-*,llvm-namespace-comment,cppcoreguidelines-slicing,cppcoreguidelines-special-member-functions,-readability-identifier-length' -WarningsAsErrors: '*' -HeaderFilterRegex: '.*' -AnalyzeTemporaryDtors: false -FormatStyle: none -InheritParentConfig: true -User: mlxd -CheckOptions: - - key: modernize-replace-auto-ptr.IncludeStyle - value: llvm - - key: performance-move-const-arg.CheckTriviallyCopyableMove - value: 'true' - - key: modernize-use-auto.MinTypeNameLength - value: '5' - - key: readability-static-accessed-through-instance.NameSpecifierNestingThreshold - value: '3' - - key: readability-function-size.VariableThreshold - value: '4294967295' - - key: cert-dcl16-c.NewSuffixes - value: 'L;LL;LU;LLU' - - key: readability-identifier-naming.GetConfigPerFile - value: 'true' - - key: readability-inconsistent-declaration-parameter-name.Strict - value: 'false' - - key: readability-magic-numbers.IgnoredIntegerValues - value: '1;2;3;4;' - - key: 
modernize-use-default-member-init.UseAssignment - value: 'false' - - key: readability-function-size.NestingThreshold - value: '4294967295' - - key: modernize-use-override.AllowOverrideAndFinal - value: 'false' - - key: readability-function-size.ParameterThreshold - value: '4294967295' - - key: openmp-exception-escape.IgnoredExceptions - value: '' - - key: modernize-pass-by-value.ValuesOnly - value: 'false' - - key: modernize-loop-convert.IncludeStyle - value: llvm - - key: cert-str34-c.DiagnoseSignedUnsignedCharComparisons - value: '0' - - key: readability-identifier-naming.AggressiveDependentMemberLookup - value: 'false' - - key: readability-redundant-smartptr-get.IgnoreMacros - value: 'true' - - key: modernize-use-emplace.TupleTypes - value: '::std::pair;::std::tuple' - - key: modernize-use-emplace.TupleMakeFunctions - value: '::std::make_pair;::std::make_tuple' - - key: modernize-use-nodiscard.ReplacementString - value: '[[nodiscard]]' - - key: modernize-loop-convert.MakeReverseRangeHeader - value: '' - - key: modernize-replace-random-shuffle.IncludeStyle - value: llvm - - key: modernize-use-bool-literals.IgnoreMacros - value: 'true' - - key: google-readability-namespace-comments.ShortNamespaceLines - value: '10' - - key: modernize-avoid-bind.PermissiveParameterList - value: 'false' - - key: modernize-use-override.FinalSpelling - value: final - - key: performance-move-constructor-init.IncludeStyle - value: llvm - - key: modernize-loop-convert.UseCxx20ReverseRanges - value: 'true' - - key: modernize-use-noexcept.ReplacementString - value: '' - - key: modernize-use-using.IgnoreMacros - value: 'true' - - key: performance-type-promotion-in-math-fn.IncludeStyle - value: llvm - - key: modernize-loop-convert.NamingStyle - value: CamelCase - - key: modernize-loop-convert.MakeReverseRangeFunction - value: '' - - key: readability-inconsistent-declaration-parameter-name.IgnoreMacros - value: 'true' - - key: performance-no-automatic-move.AllowedTypes - value: '' - - key: 
performance-for-range-copy.WarnOnAllAutoCopies - value: 'false' - - key: readability-identifier-naming.IgnoreFailedSplit - value: 'false' - - key: modernize-pass-by-value.IncludeStyle - value: llvm - - key: readability-qualified-auto.AddConstToQualified - value: 'true' - - key: readability-simplify-boolean-expr.ChainedConditionalReturn - value: 'false' - - key: readability-else-after-return.WarnOnConditionVariables - value: 'true' - - key: readability-uppercase-literal-suffix.IgnoreMacros - value: 'true' - - key: modernize-use-nullptr.NullMacros - value: 'NULL' - - key: modernize-make-shared.IgnoreMacros - value: 'true' - - key: performance-unnecessary-copy-initialization.AllowedTypes - value: '' - - key: modernize-use-transparent-functors.SafeMode - value: 'false' - - key: modernize-make-shared.IgnoreDefaultInitialization - value: 'true' - - key: modernize-make-shared.IncludeStyle - value: llvm - - key: readability-simplify-boolean-expr.ChainedConditionalAssignment - value: 'false' - - key: cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField - value: '0' - - key: readability-function-size.LineThreshold - value: '4294967295' - - key: performance-inefficient-vector-operation.EnableProto - value: 'false' - - key: modernize-use-override.IgnoreDestructors - value: 'false' - - key: modernize-loop-convert.MaxCopySize - value: '16' - - key: modernize-make-shared.MakeSmartPtrFunction - value: 'std::make_shared' - - key: portability-simd-intrinsics.Suggest - value: 'false' - - key: cppcoreguidelines-explicit-virtual-functions.IgnoreDestructors - value: '1' - - key: modernize-make-unique.IgnoreMacros - value: 'true' - - key: modernize-make-shared.MakeSmartPtrFunctionHeader - value: '' - - key: performance-for-range-copy.AllowedTypes - value: '' - - key: readability-redundant-string-init.StringNames - value: '::std::basic_string_view;::std::basic_string' - - key: modernize-make-unique.IgnoreDefaultInitialization - value: 'true' - - key: 
modernize-use-emplace.ContainersWithPushBack - value: '::std::vector;::std::list;::std::deque' - - key: readability-magic-numbers.IgnoreBitFieldsWidths - value: 'true' - - key: modernize-make-unique.IncludeStyle - value: llvm - - key: readability-braces-around-statements.ShortStatementLines - value: '0' - - key: modernize-use-override.OverrideSpelling - value: override - - key: readability-magic-numbers.IgnoredFloatingPointValues - value: '1.0;100.0;' - - key: performance-inefficient-string-concatenation.StrictMode - value: 'false' - - key: readability-implicit-bool-conversion.AllowPointerConditions - value: 'false' - - key: readability-redundant-declaration.IgnoreMacros - value: 'true' - - key: google-readability-braces-around-statements.ShortStatementLines - value: '1' - - key: modernize-make-unique.MakeSmartPtrFunction - value: 'std::make_unique' - - key: portability-restrict-system-includes.Includes - value: '*' - - key: readability-else-after-return.WarnOnUnfixable - value: 'true' - - key: modernize-use-emplace.IgnoreImplicitConstructors - value: 'false' - - key: modernize-make-unique.MakeSmartPtrFunctionHeader - value: '' - - key: modernize-use-equals-delete.IgnoreMacros - value: 'true' - - key: readability-magic-numbers.IgnoreAllFloatingPointValues - value: 'false' - - key: readability-uppercase-literal-suffix.NewSuffixes - value: '' - - key: modernize-loop-convert.MinConfidence - value: reasonable - - key: performance-unnecessary-value-param.AllowedTypes - value: '' - - key: modernize-use-noexcept.UseNoexceptFalse - value: 'true' - - key: google-readability-namespace-comments.SpacesBeforeComments - value: '2' - - key: readability-function-cognitive-complexity.Threshold - value: '100' - - key: readability-function-cognitive-complexity.IgnoreMacros - value: 'true' - - key: cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic - value: '1' - - key: performance-faster-string-find.StringLikeClasses - value: 
'::std::basic_string;::std::basic_string_view' - - key: readability-function-size.BranchThreshold - value: '4294967295' - - key: readability-implicit-bool-conversion.AllowIntegerConditions - value: 'false' - - key: readability-function-size.StatementThreshold - value: '800' - - key: modernize-use-default-member-init.IgnoreMacros - value: 'true' - - key: llvm-qualified-auto.AddConstToQualified - value: '0' - - key: readability-identifier-naming.IgnoreMainLikeFunctions - value: 'false' - - key: google-readability-function-size.StatementThreshold - value: '800' - - key: llvm-else-after-return.WarnOnConditionVariables - value: '0' - - key: modernize-raw-string-literal.DelimiterStem - value: lit - - key: modernize-use-equals-default.IgnoreMacros - value: 'true' - - key: modernize-raw-string-literal.ReplaceShorterLiterals - value: 'false' - - key: modernize-use-emplace.SmartPointers - value: '::std::shared_ptr;::std::unique_ptr;::std::auto_ptr;::std::weak_ptr' - - key: performance-inefficient-vector-operation.VectorLikeClasses - value: '::std::vector' - - key: modernize-use-auto.RemoveStars - value: 'false' - - key: readability-magic-numbers.IgnorePowersOf2IntegerValues - value: 'true' - - key: portability-simd-intrinsics.Std - value: '' - - key: readability-redundant-member-init.IgnoreBaseInCopyConstructors - value: 'false' - - key: performance-unnecessary-value-param.IncludeStyle - value: llvm - - key: modernize-replace-disallow-copy-and-assign-macro.MacroName - value: DISALLOW_COPY_AND_ASSIGN - - key: llvm-else-after-return.WarnOnUnfixable - value: '0' - - key: readability-simplify-subscript-expr.Types - value: '::std::basic_string;::std::basic_string_view;::std::vector;::std::array' -... 
diff --git a/src/qirlightning/catalyst_runtime/.gitignore b/src/qirlightning/catalyst_runtime/.gitignore deleted file mode 100644 index 4258b32..0000000 --- a/src/qirlightning/catalyst_runtime/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -build -build_cov -bin/__pycache__/ diff --git a/src/qirlightning/catalyst_runtime/CMakeLists.txt b/src/qirlightning/catalyst_runtime/CMakeLists.txt deleted file mode 100644 index 1651851..0000000 --- a/src/qirlightning/catalyst_runtime/CMakeLists.txt +++ /dev/null @@ -1,133 +0,0 @@ -cmake_minimum_required(VERSION 3.26) - -project(catalyst_runtime) -include(FetchContent) -include(ExternalProject) - -set(CMAKE_CXX_STANDARD 20) -set(CMAKE_CXX_STANDARD_REQUIRED ON) - -# Compiler options -option(ENABLE_CODE_COVERAGE "Enable code coverage" OFF) -option(ENABLE_ADDRESS_SANITIZER "Enable address sanitizer" OFF) -option(RUNTIME_CLANG_TIDY "Enable Clang Tidy" OFF) - -option(ENABLE_OPENQASM "Build OpenQasm backend device" OFF) - -set(CMAKE_VERBOSE_MAKEFILE ON) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -set(runtime_includes "${PROJECT_SOURCE_DIR}/include") -set(capi_utils_includes "${PROJECT_SOURCE_DIR}/lib/capi") -set(backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/common") - - -# Get LLVM hash to target from source tree. 
-file(READ ../.dep-versions DEPENDENCY_VERSIONS) -string(REGEX MATCH "llvm=([0-9a-f]+)" _ ${DEPENDENCY_VERSIONS}) -set(LLVM_HASH ${CMAKE_MATCH_1}) -message(STATUS "Detected LLVM version - ${LLVM_HASH}") - -FetchContent_Declare( - MLIRRunnerUtils - URL https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/RunnerUtils.h - DOWNLOAD_NO_EXTRACT True - SOURCE_DIR mlir/ExecutionEngine -) - -FetchContent_Declare( - MLIRCRunnerUtils - URL https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h - DOWNLOAD_NO_EXTRACT True - SOURCE_DIR mlir/ExecutionEngine -) - -FetchContent_Declare( - MLIRFloat16Bits - URL https://raw.githubusercontent.com/llvm/llvm-project/${LLVM_HASH}/mlir/include/mlir/ExecutionEngine/Float16bits.h - DOWNLOAD_NO_EXTRACT True - SOURCE_DIR mlir/ExecutionEngine -) - -# Note on pybind11 vs python discovery order: -# If Python is looked for first, then we have to look for all the components needed by pybind11. -# In particular, if pybind11::embed is used, then we need to find both headers (Development.Module) -# and the shared library (Development.Embed) before pybind11 is discovered. -# With the other order PyBind will discover everything it needs. -# Note on flags: -# - PYTHON_EXECUTABLE is a pybind11 specific flag used by its own (legacy) Python discovery process, -# it will not affect find_package(Python) calls. -# - Python_EXECUTABLE is a cmake flag used in find_package(Python) to guide the discovery. -# Note that pybind11 can be made to use find_python (instead of its legacy discovery), and thus -# respect Python_EXECUTABLE), via the PYBIND11_FINDPYTHON flag. - -# Here, we look for the desired Python version early to avoid any problems with mismatched packages. -# The desired Python environment should be specified ahead of time via -DPython_EXECUTABLE=... 
-# The optional component is only used for the C++ test suite (to spin up its own interpreter), -# and requires libpython.so to be available on the system. -find_package(Python REQUIRED - COMPONENTS Interpreter Development.Module - OPTIONAL_COMPONENTS Development.Embed Development.SABIModule -) - -if(RUNTIME_ENABLE_WARNINGS) - message(STATUS "Building with compiler warnings as errors enabled.") - add_compile_options(-Werror -Wall) -endif() - -message(STATUS "ENABLE_OPENQASM is ${ENABLE_OPENQASM}.") - -set(devices_list) -list(APPEND devices_list rtd_null_qubit) -list(APPEND backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/null_qubit") - -if(ENABLE_OPENQASM) - list(APPEND backend_includes "${PROJECT_SOURCE_DIR}/lib/backend/openqasm") - list(APPEND devices_list rtd_openqasm) -endif() - -add_library(catalyst_qir_runtime INTERFACE) - -target_link_libraries(catalyst_qir_runtime INTERFACE ${devices_list} rt_capi) - -target_include_directories(catalyst_qir_runtime INTERFACE - ${runtime_includes} - ${backend_includes} -) - -if(ENABLE_CODE_COVERAGE) - message(STATUS "ENABLE_CODE_COVERAGE is ON.") - if(APPLE) - target_compile_options(catalyst_qir_runtime INTERFACE -fprofile-instr-generate -fcoverage-mapping) - target_link_options(catalyst_qir_runtime INTERFACE -fprofile-instr-generate -fcoverage-mapping) - else() - target_compile_options(catalyst_qir_runtime INTERFACE -fprofile-arcs -ftest-coverage) - target_link_libraries(catalyst_qir_runtime INTERFACE gcov) - endif() -endif() - - -if(ENABLE_ADDRESS_SANITIZER) - message(STATUS "ENABLE_ADDRESS_SANITIZER is ON.") - add_compile_options(-fsanitize=address) - add_link_options(-fsanitize=address) -endif() - -add_subdirectory(lib) -add_subdirectory(tests) - -if(APPLE AND (${CMAKE_SYSTEM_PROCESSOR} STREQUAL arm64)) -# Don't rerun external project everytime we configure the runtime build. 
-if(NOT EXISTS ${CMAKE_BINARY_DIR}/lib/liblapacke.3.dylib) - ExternalProject_Add(lapacke-accelerate - GIT_REPOSITORY https://github.com/lepus2589/accelerate-lapacke.git - GIT_TAG master - PREFIX _lapacke-accelerate - CMAKE_ARGS "--preset accelerate-lapacke32" - "-DCMAKE_INSTALL_PREFIX=${CMAKE_BINARY_DIR}/_lapacke-accelerate/install" - INSTALL_COMMAND ${CMAKE_COMMAND} --build . --target install - COMMAND cp ${CMAKE_BINARY_DIR}/_lapacke-accelerate/install/lib/liblapacke.3.dylib ${CMAKE_BINARY_DIR}/lib - ) - add_dependencies(rt_capi lapacke-accelerate) # automatically build with the runtime -endif() -endif() diff --git a/src/qirlightning/catalyst_runtime/Makefile b/src/qirlightning/catalyst_runtime/Makefile deleted file mode 100644 index 55733a4..0000000 --- a/src/qirlightning/catalyst_runtime/Makefile +++ /dev/null @@ -1,121 +0,0 @@ -PYTHON?=$(shell which python3) -PYTHON_PREFIX:=$(shell $(PYTHON) -c "import sys; print(sys.prefix)") -PYTHON_VERSION:=$(shell $(PYTHON) -c "import sys; print(f'{sys.version_info.major}.{sys.version_info.minor}')") -C_COMPILER?=$(shell which clang) -CXX_COMPILER?=$(shell which clang++) -COMPILER_LAUNCHER?=$(shell which ccache) -NPROC?=$(shell python3 -c "import os; print(os.cpu_count())") - -MK_ABSPATH := $(abspath $(lastword $(MAKEFILE_LIST))) -MK_DIR := $(dir $(MK_ABSPATH)) -RT_BUILD_DIR?=$(MK_DIR)/build -CODE_COVERAGE?=OFF -BUILD_TYPE?=RelWithDebInfo -ENABLE_OPENQASM?=ON -ENABLE_ASAN?=OFF - -BUILD_TARGETS := rt_capi rtd_null_qubit -TEST_TARGETS := runner_tests_qir_runtime - -PLATFORM := $(shell uname -s) - -ifeq ($(ENABLE_OPENQASM), ON) - BUILD_TARGETS += rtd_openqasm - TEST_TARGETS += runner_tests_openqasm -endif - -.PHONY: help -help: - @echo "Please use \`make ' where is one of" - @echo " all to build Catalyst Runtime" - @echo " coverage to generate a coverage report using lcov" - @echo " clean to delete all temporary, cache, and build files" - @echo " test to run the Catalyst runtime test suite" - @echo " format [check=1] to apply 
C++ formatter; use with 'check=1' to check instead of modify (requires clang-format)" - @echo " format [version=?] to apply C++ formatter; use with 'version={version}' to run clang-format-{version} instead of clang-format" - @echo " check-tidy to build Catalyst Runtime with RUNTIME_CLANG_TIDY=ON (requires clang-tidy)" - -.PHONY: configure -configure: - @echo "Configure Catalyst Runtime" - - cmake -G Ninja -B $(RT_BUILD_DIR) . \ - -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ - -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=$(RT_BUILD_DIR)/lib \ - -DCMAKE_C_COMPILER=$(C_COMPILER) \ - -DCMAKE_CXX_COMPILER=$(CXX_COMPILER) \ - -DCMAKE_C_COMPILER_LAUNCHER=$(COMPILER_LAUNCHER) \ - -DCMAKE_CXX_COMPILER_LAUNCHER=$(COMPILER_LAUNCHER) \ - -DENABLE_OPENQASM=$(ENABLE_OPENQASM) \ - -DENABLE_CODE_COVERAGE=$(CODE_COVERAGE) \ - -DPython_EXECUTABLE=$(PYTHON) \ - -DENABLE_ADDRESS_SANITIZER=$(ENABLE_ASAN) - -.PHONY: runtime -runtime: configure - cmake --build $(RT_BUILD_DIR) --target $(BUILD_TARGETS) -j$(NPROC) --verbose - -.PHONY: test_runner -test_runner: configure - cmake --build $(RT_BUILD_DIR) --target $(TEST_TARGETS) -j$(NPROC) --verbose - -.PHONY: test -test: CODE_COVERAGE=OFF -test: BUILD_TYPE?=RelWithDebInfo -test: test_runner - @echo "Catalyst runtime test suite - NullQubit" - $(ASAN_COMMAND) $(RT_BUILD_DIR)/tests/runner_tests_qir_runtime -ifeq ($(ENABLE_OPENQASM), ON) - # Test the OpenQasm devices C++ tests - $(ASAN_COMMAND) $(RT_BUILD_DIR)/tests/runner_tests_openqasm -endif - -.PHONY: coverage -coverage: RT_BUILD_DIR := $(RT_BUILD_DIR)_cov -coverage: CODE_COVERAGE=ON -coverage: BUILD_TYPE=Debug -coverage: C_COMPILER=$(shell which gcc) -coverage: CXX_COMPILER=$(shell which g++) -coverage: export LLVM_PROFILE_FILE := $(RT_BUILD_DIR)/tests/%m.profraw -coverage: test_runner - @echo "check C++ code coverage" - $(RT_BUILD_DIR)/tests/runner_tests_qir_runtime -ifeq ($(ENABLE_OPENQASM), ON) - $(RT_BUILD_DIR)/tests/runner_tests_openqasm -endif -ifeq ($(PLATFORM),Linux) - lcov --directory $(RT_BUILD_DIR) -b 
$(MK_DIR)/lib --capture --output-file $(RT_BUILD_DIR)/coverage.info - lcov --remove $(RT_BUILD_DIR)/coverage.info '/usr/*' '*/_deps/*' '*/envs/*' '*/mlir/*' --output-file $(RT_BUILD_DIR)/coverage.info - genhtml $(RT_BUILD_DIR)/coverage.info --output-directory $(RT_BUILD_DIR)/cov -t "Catalyst Runtime C++ Coverage" --num-spaces 4 -else - xcrun llvm-profdata merge $(RT_BUILD_DIR)/tests/*.profraw -o $(RT_BUILD_DIR)/tests/rt_test_coverage.profdata - xcrun llvm-cov show -instr-profile $(RT_BUILD_DIR)/tests/rt_test_coverage.profdata \ - -object $(RT_BUILD_DIR)/tests/runner_tests_openqasm \ - $(RT_BUILD_DIR)/tests/runner_tests_qir_runtime \ - -format=html -output-dir=$(RT_BUILD_DIR)/coverage_html \ - $(MK_DIR)/include $(MK_DIR)/lib $(MK_DIR)/tests -endif - -.PHONY: clean -clean: - @echo "clean build files" - rm -rf $(RT_BUILD_DIR) $(RT_BUILD_DIR)_cov cov coverage.info $(MK_DIR)/BuildTidy - -.PHONY: format -format: -ifdef check - $(PYTHON) ../bin/format.py --check $(if $(version:-=),--cfversion $(version)) . -else - $(PYTHON) ../bin/format.py $(if $(version:-=),--cfversion $(version)) . -endif - -.PHONY: check-tidy -check-tidy: - @echo "build Catalyst Runtime with RUNTIME_CLANG_TIDY=ON" - cmake -G Ninja -B $(MK_DIR)/BuildTidy . 
\ - -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ - -DCMAKE_C_COMPILER=$(C_COMPILER) \ - -DCMAKE_CXX_COMPILER=$(CXX_COMPILER) \ - -DRUNTIME_CLANG_TIDY=ON - - cmake --build $(MK_DIR)/BuildTidy --target rt_capi -j$(NPROC) diff --git a/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt deleted file mode 100644 index 50fd0b0..0000000 --- a/src/qirlightning/catalyst_runtime/lib/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -add_subdirectory(capi) -add_subdirectory(backend) -add_subdirectory(registry) diff --git a/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt deleted file mode 100644 index 45b7ad7..0000000 --- a/src/qirlightning/catalyst_runtime/lib/backend/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_subdirectory(null_qubit) -configure_file(null_qubit/null_qubit.toml null_qubit.toml) -if(ENABLE_OPENQASM) -add_subdirectory(openqasm) -configure_file(openqasm/braket_local_qubit.toml braket_local_qubit.toml) -configure_file(openqasm/braket_aws_qubit.toml braket_aws_qubit.toml) -endif() diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp deleted file mode 100644 index 0141f33..0000000 --- a/src/qirlightning/catalyst_runtime/lib/backend/common/CacheManager.hpp +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright 2022-2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "Types.h" -#include "Utils.hpp" - -namespace Catalyst::Runtime { -/** - * @brief The CacheManager caches the entire operations and observables of - * a program at runtime. - * - * One direct use case of this functionality is explored to compute gradient - * of a circuit with taking advantage of gradient methods provided by - * simulators. - */ -template > class CacheManager { - protected: - // Operations Data - std::vector ops_names_{}; - std::vector> ops_params_{}; - std::vector> ops_wires_{}; - std::vector ops_inverses_{}; - std::vector> ops_matrixs_{}; - std::vector> ops_controlled_wires_{}; - std::vector> ops_controlled_values_{}; - - // Observables Data - std::vector obs_keys_{}; - std::vector obs_callees_{}; - - // Number of parameters - size_t num_params_{0}; - - public: - CacheManager() = default; - ~CacheManager() = default; - - CacheManager(const CacheManager &) = delete; - CacheManager &operator=(const CacheManager &) = delete; - CacheManager(CacheManager &&) = delete; - CacheManager &operator=(CacheManager &&) = delete; - - /** - * Reset cached gates - */ - void Reset() - { - ops_names_.clear(); - ops_params_.clear(); - ops_wires_.clear(); - ops_inverses_.clear(); - ops_matrixs_.clear(); - ops_controlled_wires_.clear(); - ops_controlled_values_.clear(); - - obs_keys_.clear(); - obs_callees_.clear(); - - num_params_ = 0; - } - - /** - * @brief Add a new operation to the list of cached gates. 
- * - * @param name Name of the given gate - * @param params Parameters of the gate - * @param wires Wires the gate acts on - * @param inverse If true, inverse of the gate is applied - * @param matrix Unitary matrix for the 'MatrixOp' operations - * @param controlled_wires Control wires - * @param controlled_values Control values - */ - void addOperation(const std::string &name, const std::vector ¶ms, - const std::vector &wires, bool inverse, - const std::vector &matrix = {}, - const std::vector &controlled_wires = {}, - const std::vector &controlled_values = {}) - { - ops_names_.push_back(name); - ops_params_.push_back(params); - ops_wires_.push_back(wires); - ops_inverses_.push_back(inverse); - ops_matrixs_.push_back(matrix); - ops_controlled_wires_.push_back(controlled_wires); - ops_controlled_values_.push_back(controlled_values); - - num_params_ += params.size(); - } - - /** - * @brief Add a new observable to the list of cached gates. - * - * @param id The observable key created by LObsManager() - * @param callee The measurement operation - */ - void addObservable(const ObsIdType id, const MeasurementsT &callee = MeasurementsT::None) - { - obs_keys_.push_back(id); - obs_callees_.push_back(callee); - } - - /** - * @brief Get a reference to observables keys. - */ - auto getObservablesKeys() -> const std::vector & { return obs_keys_; } - - /** - * @brief Get a reference to observables callees. - */ - auto getObservablesCallees() -> const std::vector & { return obs_callees_; } - - /** - * @brief Get a reference to operations names. - */ - auto getOperationsNames() -> const std::vector & { return ops_names_; } - - /** - * @brief Get a reference to operations parameters. - */ - auto getOperationsParameters() -> const std::vector> & - { - return ops_params_; - } - - /** - * @brief Get a reference to operations wires. - */ - auto getOperationsWires() -> const std::vector> & { return ops_wires_; } - - /** - * @brief Get a reference to operation controlled wires. 
- */ - auto getOperationsControlledWires() -> const std::vector> & - { - return this->ops_controlled_wires_; - } - - /** - * @brief Get a reference to operation controlled values. - */ - auto getOperationsControlledValues() -> const std::vector> & - { - return this->ops_controlled_values_; - } - - /** - * @brief Get a reference to operations inverses. - */ - auto getOperationsInverses() -> const std::vector & { return ops_inverses_; } - - /** - * @brief Get a reference to operations matrices. - */ - auto getOperationsMatrices() -> const std::vector> & - { - return ops_matrixs_; - } - - /** - * @brief Get total number of cached gates. - */ - [[nodiscard]] auto getNumGates() const -> size_t - { - return ops_names_.size() + obs_keys_.size(); - } - - /** - * @brief Get number of operations. - */ - [[nodiscard]] auto getNumOperations() const -> size_t { return ops_names_.size(); } - - /** - * @brief Get number of observables. - */ - [[nodiscard]] auto getNumObservables() const -> size_t { return obs_keys_.size(); } - - /** - * @brief Get total number of cached gates. - */ - [[nodiscard]] auto getNumParams() const -> size_t { return num_params_; } -}; -} // namespace Catalyst::Runtime diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp deleted file mode 100644 index 05dc377..0000000 --- a/src/qirlightning/catalyst_runtime/lib/backend/common/QubitManager.hpp +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright 2022-2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "Exception.hpp" -#include "Types.h" -#include "Utils.hpp" - -namespace Catalyst::Runtime { - -/** - * Qubit Manager - * - * @brief That maintains mapping of qubit IDs between runtime and device - * ids (e.g., Lightning-Dynamic). When user allocates a qubit, the - * `QubitManager` adds the qubit as an active qubit that operations - * can act on. When user releases a qubit, the `QubitManager` removes - * that qubit from the list of active wires. - */ -template -class QubitManager { - private: - using LQMapT = std::map; - - SimQubitIdType next_idx{0}; - LQMapT qubits_map{}; - - template - [[nodiscard]] inline OIter _remove_simulator_qubit_id(SimQubitIdType s_idx) - { - const auto &&s_idx_iter = this->qubits_map.find(s_idx); - RT_FAIL_IF(s_idx_iter == this->qubits_map.end(), "Invalid simulator qubit index"); - - return this->qubits_map.erase(s_idx_iter); - } - - template - inline void _update_qubits_mapfrom(IIter s_idx_iter) - { - for (; s_idx_iter != this->qubits_map.end(); s_idx_iter++) { - s_idx_iter->second--; - } - } - - public: - QubitManager() = default; - ~QubitManager() = default; - - QubitManager(const QubitManager &) = delete; - QubitManager &operator=(const QubitManager &) = delete; - QubitManager(QubitManager &&) = delete; - QubitManager &operator=(QubitManager &&) = delete; - - [[nodiscard]] auto isValidQubitId(SimQubitIdType s_idx) -> bool - { - return this->qubits_map.contains(s_idx); - } - - [[nodiscard]] auto isValidQubitId(const std::vector &ss_idx) -> bool - { - return 
std::all_of(ss_idx.begin(), ss_idx.end(), - [this](SimQubitIdType s) { return isValidQubitId(s); }); - } - - [[nodiscard]] auto getAllQubitIds() -> std::vector - { - std::vector ids; - ids.reserve(this->qubits_map.size()); - for (const auto &it : this->qubits_map) { - ids.push_back(it.first); - } - - return ids; - } - - [[nodiscard]] auto getDeviceId(SimQubitIdType s_idx) -> DevQubitIdType - { - RT_FAIL_IF(!isValidQubitId(s_idx), "Invalid device qubit index"); - - return this->qubits_map[s_idx]; - } - - auto getDeviceIds(const std::vector &ss_idx) -> std::vector - { - std::vector dd_idx; - dd_idx.reserve(ss_idx.size()); - for (const auto &s : ss_idx) { - dd_idx.push_back(getDeviceId(s)); - } - return dd_idx; - } - - [[nodiscard]] auto getSimulatorId(DevQubitIdType d_idx) -> SimQubitIdType - { - auto s_idx = std::find_if(this->qubits_map.begin(), this->qubits_map.end(), - [&d_idx](auto &&p) { return p.second == d_idx; }); - - RT_FAIL_IF(s_idx == this->qubits_map.end(), "Invalid simulator qubit index"); - - return s_idx->first; - } - - [[nodiscard]] auto Allocate(DevQubitIdType d_next_idx) -> SimQubitIdType - { - this->qubits_map[this->next_idx++] = d_next_idx; - return this->next_idx - 1; - } - - auto AllocateRange(DevQubitIdType start_idx, size_t size) -> std::vector - { - std::vector ids; - ids.reserve(size); - for (DevQubitIdType i = start_idx; i < start_idx + size; i++) { - ids.push_back(this->next_idx); - this->qubits_map[this->next_idx++] = i; - } - return ids; - } - - void Release(SimQubitIdType s_idx) - { - _update_qubits_mapfrom(_remove_simulator_qubit_id(s_idx)); - } - - void ReleaseAll() - { - // Release all qubits by clearing the map. 
- this->qubits_map.clear(); - } -}; -} // namespace Catalyst::Runtime diff --git a/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp b/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp deleted file mode 100644 index 0527ac4..0000000 --- a/src/qirlightning/catalyst_runtime/lib/backend/common/Utils.hpp +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright 2022-2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Exception.hpp" -#include "Types.h" - -#define QUANTUM_DEVICE_DEL_DECLARATIONS(CLASSNAME) \ - CLASSNAME(const CLASSNAME &) = delete; \ - CLASSNAME &operator=(const CLASSNAME &) = delete; \ - CLASSNAME(CLASSNAME &&) = delete; \ - CLASSNAME &operator=(CLASSNAME &&) = delete; - -#define QUANTUM_DEVICE_RT_DECLARATIONS \ - auto AllocateQubit()->QubitIdType override; \ - auto AllocateQubits(size_t num_qubits)->std::vector override; \ - void ReleaseQubit(QubitIdType q) override; \ - void ReleaseAllQubits() override; \ - [[nodiscard]] auto GetNumQubits() const->size_t override; \ - void StartTapeRecording() override; \ - void StopTapeRecording() override; \ - void SetDeviceShots(size_t shots) override; \ - [[nodiscard]] auto GetDeviceShots() const->size_t override; \ - void PrintState() override; \ - [[nodiscard]] auto Zero() const->Result override; \ - [[nodiscard]] auto One() 
const->Result override; - -#define QUANTUM_DEVICE_QIS_DECLARATIONS \ - void NamedOperation( \ - const std::string &name, const std::vector ¶ms, \ - const std::vector &wires, [[maybe_unused]] bool inverse = false, \ - [[maybe_unused]] const std::vector &controlled_wires = {}, \ - [[maybe_unused]] const std::vector &controlled_values = {}) override; \ - using Catalyst::Runtime::QuantumDevice::MatrixOperation; \ - void MatrixOperation( \ - const std::vector> &matrix, const std::vector &wires, \ - [[maybe_unused]] bool inverse = false, \ - [[maybe_unused]] const std::vector &controlled_wires = {}, \ - [[maybe_unused]] const std::vector &controlled_values = {}) override; \ - auto Observable(ObsId id, const std::vector> &matrix, \ - const std::vector &wires) \ - ->ObsIdType override; \ - auto TensorObservable(const std::vector &obs)->ObsIdType override; \ - auto HamiltonianObservable(const std::vector &coeffs, \ - const std::vector &obs) \ - ->ObsIdType override; \ - auto Expval(ObsIdType obsKey)->double override; \ - auto Var(ObsIdType obsKey)->double override; \ - void State(DataView, 1> &state) override; \ - void Probs(DataView &probs) override; \ - void PartialProbs(DataView &probs, const std::vector &wires) override; \ - void Sample(DataView &samples, size_t shots) override; \ - void PartialSample(DataView &samples, const std::vector &wires, \ - size_t shots) override; \ - void Counts(DataView &eigvals, DataView &counts, size_t shots) \ - override; \ - void PartialCounts(DataView &eigvals, DataView &counts, \ - const std::vector &wires, size_t shots) override; \ - auto Measure(QubitIdType wire, std::optional postselect = std::nullopt) \ - ->Result override; \ - void Gradient(std::vector> &gradients, \ - const std::vector &trainParams) override; - -namespace Catalyst::Runtime { -static inline auto parse_kwargs(std::string kwargs) -> std::unordered_map -{ - // cleaning kwargs - if (kwargs.empty()) { - return {}; - } - - std::unordered_map map; - size_t s3_pos = 
kwargs.find("\'s3_destination_folder\'"); - if (s3_pos != std::string::npos) { - auto opening_pos = kwargs.find('(', s3_pos); - RT_ASSERT(opening_pos != std::string::npos); - auto closing_pos = kwargs.find(')', opening_pos); - RT_ASSERT(closing_pos != std::string::npos); - map["s3_destination_folder"] = kwargs.substr(opening_pos, closing_pos - opening_pos + 1); - } - - auto kwargs_end_iter = (s3_pos == std::string::npos) ? kwargs.end() : kwargs.begin() + s3_pos; - - kwargs.erase(std::remove_if(kwargs.begin(), kwargs_end_iter, - [](char c) { - switch (c) { - case '{': - case '}': - case ' ': - case '\'': - return true; - default: - return false; - } - }), - kwargs.end()); - - // constructing map - std::istringstream iss(kwargs); - std::string token; - while (std::getline(iss, token, ',')) { - std::istringstream issp(token); - std::string pair[2]; - std::getline(issp, pair[0], ':'); - std::getline(issp, pair[1]); - map[pair[0]] = pair[1]; - } - - return map; -} - -enum class MeasurementsT : uint8_t { - None, // = 0 - Expval, - Var, - Probs, - State, -}; - -} // namespace Catalyst::Runtime - -namespace Catalyst::Runtime::Simulator::Lightning { -enum class SimulatorGate : uint8_t { - // 1-qubit - Identity, // = 0 - PauliX, - PauliY, - PauliZ, - Hadamard, - S, - T, - PhaseShift, - RX, - RY, - RZ, - Rot, - // 2-qubit - CNOT, - CY, - CZ, - SWAP, - ISWAP, - PSWAP, - IsingXX, - IsingYY, - IsingXY, - IsingZZ, - ControlledPhaseShift, - CRX, - CRY, - CRZ, - CRot, - // 3-qubit - CSWAP, - Toffoli, - // n-qubit - MultiRZ, -}; - -constexpr std::array simulator_observable_support = { - // ObsId, ObsName, SimulatorSupport - std::tuple{ObsId::Identity, "Identity", true}, - std::tuple{ObsId::PauliX, "PauliX", true}, - std::tuple{ObsId::PauliY, "PauliY", true}, - std::tuple{ObsId::PauliZ, "PauliZ", true}, - std::tuple{ObsId::Hadamard, "Hadamard", true}, -}; - -using GateInfoTupleT = std::tuple; - -constexpr std::array simulator_gate_info = { - // 1-qubit - 
GateInfoTupleT{SimulatorGate::Identity, "Identity", 1, 0}, - GateInfoTupleT{SimulatorGate::PauliX, "PauliX", 1, 0}, - GateInfoTupleT{SimulatorGate::PauliY, "PauliY", 1, 0}, - GateInfoTupleT{SimulatorGate::PauliZ, "PauliZ", 1, 0}, - GateInfoTupleT{SimulatorGate::Hadamard, "Hadamard", 1, 0}, - GateInfoTupleT{SimulatorGate::S, "S", 1, 0}, - GateInfoTupleT{SimulatorGate::T, "T", 1, 0}, - GateInfoTupleT{SimulatorGate::PhaseShift, "PhaseShift", 1, 1}, - GateInfoTupleT{SimulatorGate::RX, "RX", 1, 1}, - GateInfoTupleT{SimulatorGate::RY, "RY", 1, 1}, - GateInfoTupleT{SimulatorGate::RZ, "RZ", 1, 1}, - GateInfoTupleT{SimulatorGate::Rot, "Rot", 1, 3}, - // 2-qubit - GateInfoTupleT{SimulatorGate::CNOT, "CNOT", 2, 0}, - GateInfoTupleT{SimulatorGate::CY, "CY", 2, 0}, - GateInfoTupleT{SimulatorGate::CZ, "CZ", 2, 0}, - GateInfoTupleT{SimulatorGate::SWAP, "SWAP", 2, 0}, - GateInfoTupleT{SimulatorGate::ISWAP, "ISWAP", 2, 0}, - GateInfoTupleT{SimulatorGate::PSWAP, "PSWAP", 2, 1}, - GateInfoTupleT{SimulatorGate::IsingXX, "IsingXX", 2, 1}, - GateInfoTupleT{SimulatorGate::IsingYY, "IsingYY", 2, 1}, - GateInfoTupleT{SimulatorGate::IsingXY, "IsingXY", 2, 1}, - GateInfoTupleT{SimulatorGate::IsingZZ, "IsingZZ", 2, 1}, - GateInfoTupleT{SimulatorGate::ControlledPhaseShift, "ControlledPhaseShift", 2, 1}, - GateInfoTupleT{SimulatorGate::CRX, "CRX", 2, 1}, - GateInfoTupleT{SimulatorGate::CRY, "CRY", 2, 1}, - GateInfoTupleT{SimulatorGate::CRZ, "CRZ", 2, 1}, - GateInfoTupleT{SimulatorGate::CRot, "CRot", 2, 3}, - // 3-qubit - GateInfoTupleT{SimulatorGate::CSWAP, "CSWAP", 3, 0}, - GateInfoTupleT{SimulatorGate::Toffoli, "Toffoli", 3, 0}, - // n-qubit - GateInfoTupleT{SimulatorGate::MultiRZ, "MultiRZ", 0, 1}, -}; - -constexpr size_t simulator_gate_info_size = simulator_gate_info.size(); -constexpr size_t simulator_observable_support_size = simulator_observable_support.size(); - -template -using SimulatorGateInfoDataT = std::array; - -template -constexpr auto lookup_obs(const std::array, size> &arr, - 
const ObsId key) -> std::string_view -{ - for (size_t idx = 0; idx < size; idx++) { - auto &&[op_id, op_str, op_support] = arr[idx]; - if (op_id == key && op_support) { - return op_str; - } - } - throw std::range_error("The given observable is not supported by the simulator"); -} - -template -constexpr auto lookup_gates(const SimulatorGateInfoDataT &arr, const std::string &key) - -> std::pair -{ - for (size_t idx = 0; idx < size; idx++) { - auto &&[op, op_str, op_num_wires, op_num_params] = arr[idx]; - if (op_str == key) { - return std::make_pair(op_num_wires, op_num_params); - } - } - throw std::range_error("The given operation is not supported by the simulator"); -} - -template -constexpr auto has_gate(const SimulatorGateInfoDataT &arr, const std::string &key) -> bool -{ - for (size_t idx = 0; idx < size; idx++) { - if (std::get<1>(arr[idx]) == key) { - return true; - } - } - return false; -} - -static inline auto -simulateDraw(const std::vector &probs, std::optional postselect, - std::mt19937 *gen = nullptr) // NOLINT(readability-non-const-parameter) - -> bool -{ - if (postselect) { - auto postselect_value = postselect.value(); - RT_FAIL_IF(postselect_value < 0 || postselect_value > 1, "Invalid postselect value"); - RT_FAIL_IF(probs[postselect_value] == 0, "Probability of postselect value is 0"); - return static_cast(postselect_value == 1); - } - - // Normal flow, no post-selection - // Draw a number according to the given distribution - std::uniform_real_distribution<> dis(0., 1.); - - float draw; - if (gen != nullptr) { - draw = dis(*gen); - (*gen)(); - } - else { - std::random_device rd; - std::mt19937 gen_no_seed(rd()); - draw = dis(gen_no_seed); - } - - return draw > probs[0]; -} - -} // namespace Catalyst::Runtime::Simulator::Lightning diff --git a/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt deleted file mode 100644 index e05e9bf..0000000 --- 
a/src/qirlightning/catalyst_runtime/lib/capi/CMakeLists.txt +++ /dev/null @@ -1,57 +0,0 @@ -################################## -# Object Lib catalyst_qir_qis_obj -################################## - -add_library(catalyst_qir_qis_obj OBJECT RuntimeCAPI.cpp) - -# include external MLIR runner utils -FetchContent_MakeAvailable(MLIRRunnerUtils) -FetchContent_MakeAvailable(MLIRCRunnerUtils) -FetchContent_MakeAvailable(MLIRFloat16Bits) - -# link to rt_backend -target_link_libraries(catalyst_qir_qis_obj ${CMAKE_DL_LIBS}) - -target_link_libraries(catalyst_qir_qis_obj - pthread - dl -) - -target_include_directories(catalyst_qir_qis_obj PUBLIC . - ${CMAKE_CURRENT_SOURCE_DIR} - ${runtime_includes} - ${mlirrunnerutils_SOURCE_DIR}/../.. # includes are relative to mlir/ExecutionEngine - ${PROJECT_SOURCE_DIR}/../mlir/lib/Driver # Timer.hpp -) - -# The MLIR Runner Utils raises this warning so we need to disable it for our -Werror builds. -if(RUNTIME_ENABLE_WARNINGS) - target_compile_options(catalyst_qir_qis_obj PRIVATE "-Wno-unused-parameter") -endif() - -set_property(TARGET catalyst_qir_qis_obj PROPERTY POSITION_INDEPENDENT_CODE ON) - -##################### -# Shared Lib rt_capi -##################### - -add_library(rt_capi SHARED) - -target_link_libraries(rt_capi ${CMAKE_DL_LIBS} catalyst_qir_qis_obj) -add_dependencies(rt_capi catalyst_callback_registry) - - -target_include_directories(rt_capi PUBLIC . 
- ${CMAKE_CURRENT_SOURCE_DIR} - ${runtime_includes} - ${capi_utils_includes} -) - -set_property(TARGET rt_capi PROPERTY POSITION_INDEPENDENT_CODE ON) -set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH "$") - -if(NOT APPLE) - set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH $ORIGIN) -else() - set_property(TARGET rt_capi APPEND PROPERTY BUILD_RPATH @loader_path) -endif() diff --git a/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp b/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp deleted file mode 100644 index 9abe8cb..0000000 --- a/src/qirlightning/catalyst_runtime/lib/capi/ExecutionContext.hpp +++ /dev/null @@ -1,367 +0,0 @@ -// Copyright 2022-2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Exception.hpp" -#include "QuantumDevice.hpp" -#include "Types.h" - -extern void callbackCall(int64_t, int64_t, int64_t, va_list); - -namespace Catalyst::Runtime { - -extern "C" void __catalyst_inactive_callback(int64_t identifier, int64_t argc, int64_t retc, ...); - -class MemoryManager // NOLINT(cppcoreguidelines-special-member-functions, - // hicpp-special-member-functions) - final { - private: - std::unordered_set _impl; - std::mutex mu; // To guard the memory manager - - public: - explicit MemoryManager() { _impl.reserve(1024); }; - - ~MemoryManager() - { - // Lock the mutex to protect _impl free - std::lock_guard lock(mu); - for (auto *allocation : _impl) { - free(allocation); // NOLINT(cppcoreguidelines-no-malloc, hicpp-no-malloc) - } - } - - void insert(void *ptr) - { - // Lock the mutex to protect _impl update - std::lock_guard lock(mu); - _impl.insert(ptr); - } - void erase(void *ptr) - { - // Lock the mutex to protect _impl update - std::lock_guard lock(mu); - _impl.erase(ptr); - } - bool contains(void *ptr) - { - // Lock the mutex to protect _impl update - std::lock_guard lock(mu); - return _impl.contains(ptr); - } -}; - -class SharedLibraryManager final { - private: - void *_handler{nullptr}; - - public: - SharedLibraryManager() = delete; - explicit SharedLibraryManager(const std::string &filename) - { -#ifdef __APPLE__ - auto rtld_flags = RTLD_LAZY; -#else - // Closing the dynamic library of Lightning simulators with dlclose() where OpenMP - // directives (in Lightning simulators) are in use would raise memory segfaults. - // Note that we use RTLD_NODELETE as a workaround to fix the issue. 
- auto rtld_flags = RTLD_LAZY | RTLD_NODELETE; -#endif - - _handler = dlopen(filename.c_str(), rtld_flags); - RT_FAIL_IF(!_handler, dlerror()); - } - - ~SharedLibraryManager() - { - // dlopen and dlclose increment and decrement reference counters. - // Since we have a guaranteed _handler in a valid SharedLibraryManager instance - // then we don't really need to worry about dlclose. - // In other words, there is an one to one correspondence between an instance - // of SharedLibraryManager and an increase in the reference count for the dynamic library. - // dlclose returns non-zero on error. - // - // Errors in dlclose are implementation dependent. - // There are two possible errors during dlclose in glibc: "shared object not open" - // and "cannot create scope list". Look for _dl_signal_error in: - // - // https://codebrowser.dev/glibc/glibc/elf/dl-close.c.html - // - // This means that at the very least, one could trigger an error in the following line by - // doing the following: dlopen the same library and closing it multiple times in a different - // location. - // - // This would mean that the reference count would be less than the number of instances - // of SharedLibraryManager. - // - // There really is no way to protect against this error, except to always use - // SharedLibraryManager to manage shared libraries. - // - // Exercise for the reader, how could one trigger the "cannot create scope list" error? 
- dlclose(_handler); - } - - SharedLibraryManager(const SharedLibraryManager &other) = delete; - SharedLibraryManager &operator=(const SharedLibraryManager &other) = delete; - SharedLibraryManager(SharedLibraryManager &&other) = delete; - SharedLibraryManager &operator=(SharedLibraryManager &&other) = delete; - - void *getSymbol(const std::string &symbol) - { - void *sym = dlsym(_handler, symbol.c_str()); - RT_FAIL_IF(!sym, dlerror()); - return sym; - } -}; - -/** - * This indicates the various stages a device can be in: - * - `Active` : The device is added to the device pool and the `ExecutionContext` device pointer - * (`RTD_PTR`) points to this device instance. The CAPI routines have only access to - * one single active device per thread via `RTD_PTR`. - * - `Inactive` : The device is deactivated meaning `RTD_PTR` does not point to this device. - * The device is not removed from the pool, allowing the `ExecutionContext` manager - * to reuse this device in a multi-qnode workflow when another device with identical - * specifications is requested. - */ -enum class RTDeviceStatus : uint8_t { - Active = 0, - Inactive, -}; - -extern "C" Catalyst::Runtime::QuantumDevice *GenericDeviceFactory(const char *kwargs); - -/** - * Runtime Device data-class. - * - * This class introduces an interface for constructed devices by the `ExecutionContext` - * manager. This includes the device name, library, kwargs, and a shared pointer to the - * `QuantumDevice` entry point. 
- */ -class RTDevice { - private: - std::string rtd_lib; - std::string rtd_name; - std::string rtd_kwargs; - - std::unique_ptr rtd_dylib{nullptr}; - std::unique_ptr rtd_qdevice{nullptr}; - - RTDeviceStatus status{RTDeviceStatus::Inactive}; - - static void _complete_dylib_os_extension(std::string &rtd_lib, const std::string &name) noexcept - { -#ifdef __linux__ - rtd_lib = "librtd_" + name + ".so"; -#elif defined(__APPLE__) - rtd_lib = "librtd_" + name + ".dylib"; -#endif - } - - static void _pl2runtime_device_info(std::string &rtd_lib, std::string &rtd_name) noexcept - { - // The following if-elif is required for C++ tests where these backend devices - // are linked in the interface library of the runtime. (check runtime/CMakeLists.txt) - // Besides, this provides support for runtime device (RTD) libraries added to the system - // path. This maintains backward compatibility for specifying a device using its name. - // TODO: This support may need to be removed after updating the C++ unit tests. 
- if (rtd_lib == "null.qubit") { - rtd_name = "NullQubit"; - _complete_dylib_os_extension(rtd_lib, "null_qubit"); - } - else if (rtd_lib == "lightning.qubit") { - rtd_name = "LightningSimulator"; - _complete_dylib_os_extension(rtd_lib, "lightning"); - } - else if (rtd_lib == "braket.aws.qubit" || rtd_lib == "braket.local.qubit") { - rtd_name = "OpenQasmDevice"; - _complete_dylib_os_extension(rtd_lib, "openqasm"); - } - } - - public: - explicit RTDevice(std::string _rtd_lib, std::string _rtd_name = {}, - std::string _rtd_kwargs = {}) - : rtd_lib(std::move(_rtd_lib)), rtd_name(std::move(_rtd_name)), - rtd_kwargs(std::move(_rtd_kwargs)) - { - _pl2runtime_device_info(rtd_lib, rtd_name); - } - - explicit RTDevice(std::string_view _rtd_lib, std::string_view _rtd_name, - std::string_view _rtd_kwargs) - : rtd_lib(_rtd_lib), rtd_name(_rtd_name), rtd_kwargs(_rtd_kwargs) - { - _pl2runtime_device_info(rtd_lib, rtd_name); - } - - ~RTDevice() = default; - RTDevice(const RTDevice &other) = delete; - RTDevice &operator=(const RTDevice &other) = delete; - RTDevice(RTDevice &&other) = delete; - RTDevice &operator=(RTDevice &&other) = delete; - - auto operator==(const RTDevice &other) const -> bool - { - return (this->rtd_lib == other.rtd_lib && this->rtd_name == other.rtd_name) && - this->rtd_kwargs == other.rtd_kwargs; - } - - [[nodiscard]] auto getQuantumDevicePtr() -> const std::unique_ptr & - { - if (rtd_qdevice) { - return rtd_qdevice; - } - - rtd_dylib = std::make_unique(rtd_lib); - std::string factory_name{rtd_name + "Factory"}; - void *f_ptr = rtd_dylib->getSymbol(factory_name); - rtd_qdevice = std::unique_ptr( - (f_ptr != nullptr) - ? 
reinterpret_cast(f_ptr)(rtd_kwargs.c_str()) - : nullptr); - return rtd_qdevice; - } - - [[nodiscard]] auto getDeviceInfo() const -> std::tuple - { - return {rtd_lib, rtd_name, rtd_kwargs}; - } - - [[nodiscard]] auto getDeviceName() const -> const std::string & { return rtd_name; } - - void setDeviceStatus(RTDeviceStatus new_status) noexcept { status = new_status; } - - [[nodiscard]] auto getDeviceStatus() const -> RTDeviceStatus { return status; } - - friend std::ostream &operator<<(std::ostream &os, const RTDevice &device) - { - os << "RTD, name: " << device.rtd_name << " lib: " << device.rtd_lib - << " kwargs: " << device.rtd_kwargs; - return os; - } -}; - -class ExecutionContext final { - private: - // Device pool - std::vector> device_pool; - std::mutex pool_mu; // To protect device_pool - - bool initial_tape_recorder_status{false}; - - // ExecutionContext pointers - std::unique_ptr memory_man_ptr{nullptr}; - - // PRNG - uint32_t *seed; - std::mt19937 gen; - - public: - explicit ExecutionContext(uint32_t *seed = nullptr) : seed(seed) - { - memory_man_ptr = std::make_unique(); - - if (this->seed != nullptr) { - this->gen = std::mt19937(*seed); - } - } - - ~ExecutionContext() = default; - ExecutionContext(const ExecutionContext &other) = delete; - ExecutionContext &operator=(const ExecutionContext &other) = delete; - ExecutionContext(ExecutionContext &&other) = delete; - ExecutionContext &operator=(ExecutionContext &&other) = delete; - - void setDeviceRecorderStatus(bool status) noexcept { initial_tape_recorder_status = status; } - - [[nodiscard]] auto getDeviceRecorderStatus() const -> bool - { - return initial_tape_recorder_status; - } - - [[nodiscard]] auto getMemoryManager() const -> const std::unique_ptr & - { - return memory_man_ptr; - } - - [[nodiscard]] auto getOrCreateDevice(std::string_view rtd_lib, std::string_view rtd_name, - std::string_view rtd_kwargs) - -> const std::shared_ptr & - { - std::lock_guard lock(pool_mu); - - auto device = 
std::make_shared(rtd_lib, rtd_name, rtd_kwargs); - - const size_t key = device_pool.size(); - for (size_t i = 0; i < key; i++) { - if (device_pool[i]->getDeviceStatus() == RTDeviceStatus::Inactive && - *device_pool[i] == *device) { - device_pool[i]->setDeviceStatus(RTDeviceStatus::Active); - return device_pool[i]; - } - } - - RT_ASSERT(device->getQuantumDevicePtr()); - - // Add a new device - device->setDeviceStatus(RTDeviceStatus::Active); - if (this->seed != nullptr) { - device->getQuantumDevicePtr()->SetDevicePRNG(&(this->gen)); - } - else { - device->getQuantumDevicePtr()->SetDevicePRNG(nullptr); - } - device_pool.push_back(device); - - return device_pool[key]; - } - - [[nodiscard]] auto getOrCreateDevice(const std::string &rtd_lib, - const std::string &rtd_name = {}, - const std::string &rtd_kwargs = {}) - -> const std::shared_ptr & - { - return getOrCreateDevice(std::string_view{rtd_lib}, std::string_view{rtd_name}, - std::string_view{rtd_kwargs}); - } - - [[nodiscard]] auto getDevice(size_t device_key) -> const std::shared_ptr & - { - std::lock_guard lock(pool_mu); - RT_FAIL_IF(device_key >= device_pool.size(), "Invalid device_key"); - return device_pool[device_key]; - } - - void deactivateDevice(RTDevice *RTD_PTR) - { - std::lock_guard lock(pool_mu); - RTD_PTR->setDeviceStatus(RTDeviceStatus::Inactive); - } -}; -} // namespace Catalyst::Runtime diff --git a/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp b/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp deleted file mode 100644 index 481da78..0000000 --- a/src/qirlightning/catalyst_runtime/lib/capi/MemRefUtils.hpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright 2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "mlir/ExecutionEngine/RunnerUtils.h" - -extern "C" { -void *_mlir_memref_to_llvm_alloc(size_t size); -void *_mlir_memref_to_llvm_aligned_alloc(size_t alignment, size_t size); -bool _mlir_memory_transfer(void *); -void _mlir_memref_to_llvm_free(void *ptr); -} - -// MemRef type definition -template struct MemRefT { - T *data_allocated; - T *data_aligned; - size_t offset; - size_t sizes[R]; - size_t strides[R]; -}; - -template -inline void printMemref(const UnrankedMemRefType &memref, bool printDescriptor = false) -{ - auto m = DynamicMemRefType(memref); - if (printDescriptor) { - std::cout << "MemRef: "; - printMemRefMetaData(std::cout, m); - std::cout << " data =" << std::endl; - } - impl::MemRefDataPrinter::print(std::cout, m.data, m.rank, m.rank, m.offset, m.sizes, - m.strides); -} diff --git a/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp b/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp deleted file mode 100644 index 8c1e019..0000000 --- a/src/qirlightning/catalyst_runtime/lib/capi/RuntimeCAPI.cpp +++ /dev/null @@ -1,1012 +0,0 @@ -// Copyright 2022-2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include - -#include -#include - -#include -#include -#include - -#include "mlir/ExecutionEngine/CRunnerUtils.h" - -#include "Exception.hpp" -#include "QuantumDevice.hpp" - -#include "ExecutionContext.hpp" -#include "MemRefUtils.hpp" -#include "Timer.hpp" - -#include "RuntimeCAPI.h" - -namespace Catalyst::Runtime { - -/** - * @brief Global quantum device unique pointer. - */ -static std::unique_ptr CTX = nullptr; - -/** - * @brief Thread local device pointer with internal linkage. - */ -thread_local static RTDevice *RTD_PTR = nullptr; - -bool getModifiersAdjoint(const Modifiers *modifiers) -{ - return !modifiers ? false : modifiers->adjoint; -} - -std::vector getModifiersControlledWires(const Modifiers *modifiers) -{ - return !modifiers ? std::vector() - : std::vector( - reinterpret_cast(modifiers->controlled_wires), - reinterpret_cast(modifiers->controlled_wires) + - modifiers->num_controlled); -} - -std::vector getModifiersControlledValues(const Modifiers *modifiers) -{ - return !modifiers ? std::vector() - : std::vector(modifiers->controlled_values, - modifiers->controlled_values + modifiers->num_controlled); -} - -#define MODIFIERS_ARGS(mod) \ - getModifiersAdjoint(mod), getModifiersControlledWires(mod), getModifiersControlledValues(mod) - -/** - * @brief Initialize the device instance and update the value of RTD_PTR - * to the new initialized device pointer. 
- */ -[[nodiscard]] bool initRTDevicePtr(std::string_view rtd_lib, std::string_view rtd_name, - std::string_view rtd_kwargs) -{ - auto &&device = CTX->getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs); - if (device) { - RTD_PTR = device.get(); - return RTD_PTR ? true : false; - } - return false; -} - -/** - * @brief get the active device. - */ -auto getQuantumDevicePtr() -> const std::unique_ptr & -{ - return RTD_PTR->getQuantumDevicePtr(); -} - -/** - * @brief Inactivate the active device instance. - */ -void deactivateDevice() -{ - CTX->deactivateDevice(RTD_PTR); - RTD_PTR = nullptr; -} -} // namespace Catalyst::Runtime - -extern "C" { - -using namespace Catalyst::Runtime; -using timer = catalyst::utils::Timer; - -void __catalyst_inactive_callback(int64_t identifier, int64_t argc, int64_t retc, ...) -{ - // LIBREGISTRY is a compile time macro. It is defined based on the output - // name of the callback library. And since it is stored in the same location - // as this library, it shares the ORIGIN variable. Do a `git grep LIBREGISTRY` - // to find its definition in the CMakeFiles. - // It is the name of the library that contains the callbackCall implementation. - // The reason why this is using dlopen is because we have historically wanted - // to avoid a dependency of python in the runtime. - // With dlopen, we leave the possibility of linking against the runtime without - // linking with LIBREGISTRY which is implemented as a pybind11 module. - // - // The only restriction is that there should be no calls to pyregsitry. - // - // This function cannot be tested from the runtime tests because there would be no valid python - // function to callback... 
- void *handle = dlopen(LIBREGISTRY, RTLD_LAZY); - if (!handle) { - char *err_msg = dlerror(); - RT_FAIL(err_msg); - } - - void (*callbackCall)(int64_t, int64_t, int64_t, va_list); - typedef void (*func_ptr_t)(int64_t, int64_t, int64_t, va_list); - callbackCall = (func_ptr_t)dlsym(handle, "callbackCall"); - if (!callbackCall) { - char *err_msg = dlerror(); - RT_FAIL(err_msg); - } - - va_list args; - va_start(args, retc); - callbackCall(identifier, argc, retc, args); - va_end(args); - dlclose(handle); -} - -void __catalyst__host__rt__unrecoverable_error() -{ - RT_FAIL("Unrecoverable error from asynchronous execution of multiple quantum programs."); -} - -void *_mlir_memref_to_llvm_alloc(size_t size) -{ - void *ptr = malloc(size); - CTX->getMemoryManager()->insert(ptr); - return ptr; -} - -void *_mlir_memref_to_llvm_aligned_alloc(size_t alignment, size_t size) -{ - void *ptr = aligned_alloc(alignment, size); - CTX->getMemoryManager()->insert(ptr); - return ptr; -} - -bool _mlir_memory_transfer(void *ptr) -{ - if (!CTX->getMemoryManager()->contains(ptr)) { - return false; - } - CTX->getMemoryManager()->erase(ptr); - return true; -} - -void _mlir_memref_to_llvm_free(void *ptr) -{ - CTX->getMemoryManager()->erase(ptr); - free(ptr); -} - -void __catalyst__rt__print_string(char *string) -{ - if (!string) { - std::cout << "None" << std::endl; - return; - } - std::cout << string << std::endl; -} - -void __catalyst__rt__assert_bool(bool p, char *s) { RT_FAIL_IF(!p, s); } - -void __catalyst__rt__print_tensor(OpaqueMemRefT *c_memref, bool printDescriptor) -{ - if (c_memref->datatype == NumericType::idx) { - printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); - } - else if (c_memref->datatype == NumericType::i1) { - printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); - } - else if (c_memref->datatype == NumericType::i8) { - printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); - } - else if (c_memref->datatype == 
NumericType::i16) { - printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); - } - else if (c_memref->datatype == NumericType::i32) { - printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); - } - else if (c_memref->datatype == NumericType::i64) { - printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); - } - else if (c_memref->datatype == NumericType::f32) { - printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); - } - else if (c_memref->datatype == NumericType::f64) { - printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); - } - else if (c_memref->datatype == NumericType::c64) { - printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); - } - else if (c_memref->datatype == NumericType::c128) { - printMemref({c_memref->rank, c_memref->descriptor}, printDescriptor); - } - else { - RT_FAIL("Unkown numeric type encoding for array printing."); - } - - std::cout << std::endl; -} - -void __catalyst__rt__fail_cstr(const char *cstr) { RT_FAIL(cstr); } - -void __catalyst__rt__initialize(uint32_t *seed) { CTX = std::make_unique(seed); } - -void __catalyst__rt__finalize() -{ - RTD_PTR = nullptr; - CTX.reset(nullptr); -} - -static int __catalyst__rt__device_init__impl(int8_t *rtd_lib, int8_t *rtd_name, int8_t *rtd_kwargs, - int64_t shots) -{ - // Device library cannot be a nullptr - RT_FAIL_IF(!rtd_lib, "Invalid device library"); - RT_FAIL_IF(!CTX, "Invalid use of the global driver before initialization"); - RT_FAIL_IF(RTD_PTR, "Cannot re-initialize an ACTIVE device: Consider using " - "__catalyst__rt__device_release before __catalyst__rt__device_init"); - - const std::vector args{ - reinterpret_cast(rtd_lib), (rtd_name ? reinterpret_cast(rtd_name) : ""), - (rtd_kwargs ? 
reinterpret_cast(rtd_kwargs) : "")}; - RT_FAIL_IF(!initRTDevicePtr(args[0], args[1], args[2]), - "Failed initialization of the backend device"); - getQuantumDevicePtr()->SetDeviceShots(shots); - if (CTX->getDeviceRecorderStatus()) { - getQuantumDevicePtr()->StartTapeRecording(); - } - return 0; -} - -void __catalyst__rt__device_init(int8_t *rtd_lib, int8_t *rtd_name, int8_t *rtd_kwargs, - int64_t shots) -{ - timer::timer(__catalyst__rt__device_init__impl, "device_init", /* add_endl */ true, rtd_lib, - rtd_name, rtd_kwargs, shots); -} - -static int __catalyst__rt__device_release__impl() -{ - RT_FAIL_IF(!CTX, "Cannot release an ACTIVE device out of scope of the global driver"); - // TODO: This will be used for the async support - deactivateDevice(); - return 0; -} - -void __catalyst__rt__device_release() -{ - timer::timer(__catalyst__rt__device_release__impl, "device_release", /* add_endl */ true); -} - -void __catalyst__rt__print_state() { getQuantumDevicePtr()->PrintState(); } - -void __catalyst__rt__toggle_recorder(bool status) -{ - CTX->setDeviceRecorderStatus(status); - if (!RTD_PTR) { - return; - } - - if (status) { - getQuantumDevicePtr()->StartTapeRecording(); - } - else { - getQuantumDevicePtr()->StopTapeRecording(); - } -} - -static QUBIT *__catalyst__rt__qubit_allocate__impl() -{ - RT_ASSERT(getQuantumDevicePtr() != nullptr); - RT_ASSERT(CTX->getMemoryManager() != nullptr); - - return reinterpret_cast(getQuantumDevicePtr()->AllocateQubit()); -} - -QUBIT *__catalyst__rt__qubit_allocate() -{ - return timer::timer(__catalyst__rt__qubit_allocate__impl, "qubit_allocate", - /* add_endl */ true); -} - -static QirArray *__catalyst__rt__qubit_allocate_array__impl(int64_t num_qubits) -{ - RT_ASSERT(getQuantumDevicePtr() != nullptr); - RT_ASSERT(CTX->getMemoryManager() != nullptr); - RT_ASSERT(num_qubits >= 0); - - // For first prototype, we just want to make this work. - // But ideally, I think the device should determine the representation. 
- // Essentially just forward this to the device library. - // And the device library can choose how to handle everything. - std::vector qubit_vector = getQuantumDevicePtr()->AllocateQubits(num_qubits); - - // I don't like this copying. - std::vector *qubit_vector_ptr = - new std::vector(qubit_vector.begin(), qubit_vector.end()); - - // Because this function is interfacing with C - // I think we should return a trivial-type - // https://en.cppreference.com/w/cpp/named_req/TrivialType - // Why should we return a trivial type? - // - // Paraphrasing from stackoverflow: https://stackoverflow.com/a/72409589 - // extern "C" will avoid name mangling from happening. - // It doesn't prevent a function from returning or accepting a C++ type. - // But the calling language needs to understand the data-layout for the - // type being returned. - // For non-trivial types, this will be difficult to impossible. - return (QirArray *)qubit_vector_ptr; -} - -QirArray *__catalyst__rt__qubit_allocate_array(int64_t num_qubits) -{ - return timer::timer(__catalyst__rt__qubit_allocate_array__impl, "qubit_allocate_array", - /* add_endl */ true, num_qubits); -} - -static int __catalyst__rt__qubit_release__impl(QUBIT *qubit) -{ - getQuantumDevicePtr()->ReleaseQubit(reinterpret_cast(qubit)); - return 0; -} - -void __catalyst__rt__qubit_release(QUBIT *qubit) -{ - timer::timer(__catalyst__rt__qubit_release__impl, "qubit_release", - /* add_endl */ true, qubit); -} - -static int __catalyst__rt__qubit_release_array__impl(QirArray *qubit_array) -{ - getQuantumDevicePtr()->ReleaseAllQubits(); - std::vector *qubit_array_ptr = - reinterpret_cast *>(qubit_array); - delete qubit_array_ptr; - return 0; -} - -void __catalyst__rt__qubit_release_array(QirArray *qubit_array) -{ - timer::timer(__catalyst__rt__qubit_release_array__impl, "qubit_release_array", - /* add_endl */ true, qubit_array); -} - -int64_t __catalyst__rt__num_qubits() -{ - return static_cast(getQuantumDevicePtr()->GetNumQubits()); -} - -bool 
__catalyst__rt__result_equal(RESULT *r0, RESULT *r1) { return (r0 == r1) || (*r0 == *r1); } - -RESULT *__catalyst__rt__result_get_one() { return getQuantumDevicePtr()->One(); } - -RESULT *__catalyst__rt__result_get_zero() { return getQuantumDevicePtr()->Zero(); } - -void __catalyst__qis__Gradient(int64_t numResults, /* results = */...) -{ - RT_ASSERT(numResults >= 0); - using ResultType = MemRefT; - - std::vector mem_ptrs; - mem_ptrs.reserve(numResults); - va_list args; - va_start(args, numResults); - for (int64_t i = 0; i < numResults; i++) { - mem_ptrs.push_back(va_arg(args, ResultType *)); - } - va_end(args); - - std::vector> mem_views; - mem_views.reserve(numResults); - for (auto *mr : mem_ptrs) { - mem_views.emplace_back(mr->data_aligned, mr->offset, mr->sizes, mr->strides); - } - - // num_observables * num_train_params - getQuantumDevicePtr()->Gradient(mem_views, {}); -} - -void __catalyst__qis__Gradient_params(MemRefT_int64_1d *params, int64_t numResults, - /* results = */...) -{ - RT_ASSERT(numResults >= 0); - using ResultType = MemRefT; - - if (params == nullptr || !params->sizes[0]) { - RT_FAIL("Invalid number of trainable parameters"); - } - - const size_t tp_size = params->sizes[0]; - - // create a vector of custom trainable parameters - std::vector train_params; - auto *params_data = params->data_aligned; - train_params.reserve(tp_size); - for (size_t i = 0; i < tp_size; i++) { - auto p = params_data[i]; - RT_FAIL_IF(p < 0, "trainable parameter cannot be a negative integer"); - train_params.push_back(p); - } - - std::vector mem_ptrs; - mem_ptrs.reserve(numResults); - va_list args; - va_start(args, numResults); - for (int64_t i = 0; i < numResults; i++) { - mem_ptrs.push_back(va_arg(args, ResultType *)); - } - va_end(args); - - std::vector> mem_views; - mem_views.reserve(numResults); - for (auto *mr : mem_ptrs) { - mem_views.emplace_back(mr->data_aligned, mr->offset, mr->sizes, mr->strides); - } - - // num_observables * num_train_params - 
getQuantumDevicePtr()->Gradient(mem_views, train_params); -} - -void __catalyst__qis__GlobalPhase(double phi, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("GlobalPhase", {phi}, {}, MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__SetState(MemRefT_CplxT_double_1d *data, uint64_t numQubits, ...) -{ - RT_ASSERT(numQubits > 0); - - va_list args; - va_start(args, numQubits); - std::vector wires(numQubits); - for (uint64_t i = 0; i < numQubits; i++) { - wires[i] = va_arg(args, QubitIdType); - } - va_end(args); - - MemRefT, 1> *data_p = (MemRefT, 1> *)data; - DataView, 1> data_view(data_p->data_aligned, data_p->offset, data_p->sizes, - data_p->strides); - getQuantumDevicePtr()->SetState(data_view, wires); -} - -void __catalyst__qis__SetBasisState(MemRefT_int8_1d *data, uint64_t numQubits, ...) -{ - RT_ASSERT(numQubits > 0); - - DataView data_view(data->data_aligned, data->offset, data->sizes, data->strides); - - va_list args; - va_start(args, numQubits); - std::vector wires(numQubits); - for (uint64_t i = 0; i < numQubits; i++) { - wires[i] = va_arg(args, QubitIdType); - } - va_end(args); - std::unordered_set wire_set(wires.begin(), wires.end()); - RT_FAIL_IF(wire_set.size() != numQubits, "Wires must be unique"); - RT_FAIL_IF(data->sizes[0] != numQubits, - "BasisState parameter and wires must be of equal length."); - - getQuantumDevicePtr()->SetBasisState(data_view, wires); -} - -void __catalyst__qis__Identity(QUBIT *qubit, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("Identity", {}, {reinterpret_cast(qubit)}, - MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__PauliX(QUBIT *qubit, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("PauliX", {}, {reinterpret_cast(qubit)}, - MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__PauliY(QUBIT *qubit, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("PauliY", {}, {reinterpret_cast(qubit)}, - MODIFIERS_ARGS(modifiers)); 
-} - -void __catalyst__qis__PauliZ(QUBIT *qubit, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("PauliZ", {}, {reinterpret_cast(qubit)}, - MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__Hadamard(QUBIT *qubit, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {reinterpret_cast(qubit)}, - MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__S(QUBIT *qubit, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("S", {}, {reinterpret_cast(qubit)}, - MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__T(QUBIT *qubit, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("T", {}, {reinterpret_cast(qubit)}, - MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__PhaseShift(double theta, QUBIT *qubit, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation( - "PhaseShift", {theta}, {reinterpret_cast(qubit)}, MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__RX(double theta, QUBIT *qubit, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("RX", {theta}, {reinterpret_cast(qubit)}, - MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__RY(double theta, QUBIT *qubit, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("RY", {theta}, {reinterpret_cast(qubit)}, - MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__RZ(double theta, QUBIT *qubit, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("RZ", {theta}, {reinterpret_cast(qubit)}, - MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__Rot(double phi, double theta, double omega, QUBIT *qubit, - const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("Rot", {phi, theta, omega}, - {reinterpret_cast(qubit)}, - MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__CNOT(QUBIT *control, QUBIT *target, const Modifiers *modifiers) -{ - RT_FAIL_IF(control == target, - "Invalid input for CNOT gate. 
Control and target qubit operands must be distinct."); - getQuantumDevicePtr()->NamedOperation("CNOT", {}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__CY(QUBIT *control, QUBIT *target, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("CY", {}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__CZ(QUBIT *control, QUBIT *target, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("CZ", {}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__SWAP(QUBIT *control, QUBIT *target, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("SWAP", {}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__IsingXX(double theta, QUBIT *control, QUBIT *target, - const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("IsingXX", {theta}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__IsingYY(double theta, QUBIT *control, QUBIT *target, - const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("IsingYY", {theta}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__IsingXY(double theta, QUBIT *control, QUBIT *target, - const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("IsingXY", {theta}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ 
MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__IsingZZ(double theta, QUBIT *control, QUBIT *target, - const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("IsingZZ", {theta}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__ControlledPhaseShift(double theta, QUBIT *control, QUBIT *target, - const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("ControlledPhaseShift", {theta}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__CRX(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("CRX", {theta}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__CRY(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("CRY", {theta}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__CRZ(double theta, QUBIT *control, QUBIT *target, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("CRZ", {theta}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__CRot(double phi, double theta, double omega, QUBIT *control, QUBIT *target, - const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("CRot", {phi, theta, omega}, - {/* control = */ reinterpret_cast(control), - /* target = */ reinterpret_cast(target)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__CSWAP(QUBIT *control, QUBIT *aswap, QUBIT *bswap, const 
Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("CSWAP", {}, - {reinterpret_cast(control), - reinterpret_cast(aswap), - reinterpret_cast(bswap)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__Toffoli(QUBIT *wire0, QUBIT *wire1, QUBIT *wire2, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation("Toffoli", {}, - {reinterpret_cast(wire0), - reinterpret_cast(wire1), - reinterpret_cast(wire2)}, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__MultiRZ(double theta, const Modifiers *modifiers, int64_t numQubits, ...) -{ - RT_ASSERT(numQubits >= 0); - - va_list args; - va_start(args, numQubits); - std::vector wires(numQubits); - for (int64_t i = 0; i < numQubits; i++) { - wires[i] = va_arg(args, QubitIdType); - } - va_end(args); - - getQuantumDevicePtr()->NamedOperation("MultiRZ", {theta}, wires, - /* modifiers */ MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__ISWAP(QUBIT *wire0, QUBIT *wire1, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation( - "ISWAP", {}, {reinterpret_cast(wire0), reinterpret_cast(wire1)}, - MODIFIERS_ARGS(modifiers)); -} - -void __catalyst__qis__PSWAP(double phi, QUBIT *wire0, QUBIT *wire1, const Modifiers *modifiers) -{ - getQuantumDevicePtr()->NamedOperation( - "PSWAP", {phi}, - {reinterpret_cast(wire0), reinterpret_cast(wire1)}, - MODIFIERS_ARGS(modifiers)); -} - -static void _qubitUnitary_impl(MemRefT_CplxT_double_2d *matrix, int64_t numQubits, - std::vector> &coeffs, - std::vector &wires, va_list *args) -{ - const size_t num_rows = matrix->sizes[0]; - const size_t num_col = matrix->sizes[1]; - const size_t expected_size = std::pow(2, numQubits); - - if (num_rows != expected_size || num_col != expected_size) { - RT_FAIL("Invalid given QubitUnitary matrix; " - "The size of the matrix must be pow(2, numWires) * pow(2, numWires)."); - } - - wires.reserve(numQubits); - for (int64_t i = 0; i < numQubits; i++) { - 
wires.push_back(va_arg(*args, QubitIdType)); - } - - const size_t matrix_size = num_rows * num_col; - coeffs.reserve(matrix_size); - for (size_t i = 0; i < matrix_size; i++) { - coeffs.emplace_back(matrix->data_aligned[i].real, matrix->data_aligned[i].imag); - } -} - -void __catalyst__qis__QubitUnitary(MemRefT_CplxT_double_2d *matrix, const Modifiers *modifiers, - int64_t numQubits, /*qubits*/...) -{ - RT_ASSERT(numQubits >= 0); - - if (matrix == nullptr) { - RT_FAIL("The QubitUnitary matrix must be initialized"); - } - - if (numQubits > __catalyst__rt__num_qubits()) { - RT_FAIL("Invalid number of wires"); - } - - va_list args; - std::vector> coeffs; - std::vector wires; - va_start(args, numQubits); - _qubitUnitary_impl(matrix, numQubits, coeffs, wires, &args); - va_end(args); - return getQuantumDevicePtr()->MatrixOperation(coeffs, wires, MODIFIERS_ARGS(modifiers)); -} - -ObsIdType __catalyst__qis__NamedObs(int64_t obsId, QUBIT *wire) -{ - return getQuantumDevicePtr()->Observable(static_cast(obsId), {}, - {reinterpret_cast(wire)}); -} - -ObsIdType __catalyst__qis__HermitianObs(MemRefT_CplxT_double_2d *matrix, int64_t numQubits, ...) 
-{ - RT_ASSERT(numQubits >= 0); - - if (matrix == nullptr) { - RT_FAIL("The Hermitian matrix must be initialized"); - } - - const size_t num_rows = matrix->sizes[0]; - const size_t num_col = matrix->sizes[1]; - const size_t expected_size = std::pow(2, numQubits); - - if (num_rows != expected_size || num_col != expected_size) { - RT_FAIL("Invalid given Hermitian matrix; " - "The size of the matrix must be pow(2, numWires) * pow(2, numWires)."); - } - - va_list args; - va_start(args, numQubits); - std::vector wires(numQubits); - for (int64_t i = 0; i < numQubits; i++) { - wires[i] = va_arg(args, QubitIdType); - } - va_end(args); - - if (numQubits > __catalyst__rt__num_qubits()) { - RT_FAIL("Invalid number of wires"); - } - - const size_t matrix_size = num_rows * num_col; - std::vector> coeffs; - coeffs.reserve(matrix_size); - for (size_t i = 0; i < matrix_size; i++) { - coeffs.emplace_back(matrix->data_aligned[i].real, matrix->data_aligned[i].imag); - } - - return getQuantumDevicePtr()->Observable(ObsId::Hermitian, coeffs, wires); -} - -ObsIdType __catalyst__qis__TensorObs(int64_t numObs, /*obsKeys*/...) -{ - if (numObs < 1) { - RT_FAIL("Invalid number of observables to create TensorProdObs"); - } - - va_list args; - va_start(args, numObs); - std::vector obsKeys; - obsKeys.reserve(numObs); - for (int64_t i = 0; i < numObs; i++) { - obsKeys.push_back(va_arg(args, ObsIdType)); - } - va_end(args); - - return getQuantumDevicePtr()->TensorObservable(obsKeys); -} - -ObsIdType __catalyst__qis__HamiltonianObs(MemRefT_double_1d *coeffs, int64_t numObs, - /*obsKeys*/...) 
-{ - RT_ASSERT(numObs >= 0); - - if (coeffs == nullptr) { - RT_FAIL("Invalid coefficients for computing Hamiltonian; " - "The coefficients list must be initialized."); - } - - const size_t coeffs_size = coeffs->sizes[0]; - - if (static_cast(numObs) != coeffs_size) { - RT_FAIL("Invalid coefficients for computing Hamiltonian; " - "The number of coefficients and observables must be equal."); - } - - va_list args; - va_start(args, numObs); - std::vector obsKeys; - obsKeys.reserve(numObs); - for (int64_t i = 0; i < numObs; i++) { - obsKeys.push_back(va_arg(args, ObsIdType)); - } - va_end(args); - - std::vector coeffs_vec(coeffs->data_aligned, coeffs->data_aligned + coeffs_size); - return getQuantumDevicePtr()->HamiltonianObservable(coeffs_vec, obsKeys); -} - -RESULT *__catalyst__qis__Measure(QUBIT *wire, int32_t postselect) -{ - std::optional postselectOpt{postselect}; - - // Any value different to 0 or 1 denotes absence of postselect, and it is hence turned into - // std::nullopt at the C++ interface - if (postselect != 0 && postselect != 1) { - postselectOpt = std::nullopt; - } - - return getQuantumDevicePtr()->Measure(reinterpret_cast(wire), postselectOpt); -} - -double __catalyst__qis__Expval(ObsIdType obsKey) { return getQuantumDevicePtr()->Expval(obsKey); } - -double __catalyst__qis__Variance(ObsIdType obsKey) { return getQuantumDevicePtr()->Var(obsKey); } - -void __catalyst__qis__State(MemRefT_CplxT_double_1d *result, int64_t numQubits, ...) 
-{ - RT_ASSERT(numQubits >= 0); - MemRefT, 1> *result_p = (MemRefT, 1> *)result; - - va_list args; - va_start(args, numQubits); - std::vector wires(numQubits); - for (int64_t i = 0; i < numQubits; i++) { - wires[i] = va_arg(args, QubitIdType); - } - va_end(args); - - DataView, 1> view(result_p->data_aligned, result_p->offset, - result_p->sizes, result_p->strides); - - if (wires.empty()) { - getQuantumDevicePtr()->State(view); - } - else { - RT_FAIL("Partial State-Vector not supported yet"); - // getQuantumDevicePtr()->PartialState(stateVec, - // numElements, wires); - } -} - -void __catalyst__qis__Probs(MemRefT_double_1d *result, int64_t numQubits, ...) -{ - RT_ASSERT(numQubits >= 0); - MemRefT *result_p = (MemRefT *)result; - - va_list args; - va_start(args, numQubits); - std::vector wires(numQubits); - for (int64_t i = 0; i < numQubits; i++) { - wires[i] = va_arg(args, QubitIdType); - } - va_end(args); - - DataView view(result_p->data_aligned, result_p->offset, result_p->sizes, - result_p->strides); - - if (wires.empty()) { - getQuantumDevicePtr()->Probs(view); - } - else { - getQuantumDevicePtr()->PartialProbs(view, wires); - } -} - -void __catalyst__qis__Sample(MemRefT_double_2d *result, int64_t numQubits, ...) -{ - int64_t shots = getQuantumDevicePtr()->GetDeviceShots(); - RT_ASSERT(shots >= 0); - RT_ASSERT(numQubits >= 0); - MemRefT *result_p = (MemRefT *)result; - - va_list args; - va_start(args, numQubits); - std::vector wires(numQubits); - for (int64_t i = 0; i < numQubits; i++) { - wires[i] = va_arg(args, QubitIdType); - } - va_end(args); - - DataView view(result_p->data_aligned, result_p->offset, result_p->sizes, - result_p->strides); - - if (wires.empty()) { - getQuantumDevicePtr()->Sample(view, shots); - } - else { - getQuantumDevicePtr()->PartialSample(view, wires, shots); - } -} - -void __catalyst__qis__Counts(PairT_MemRefT_double_int64_1d *result, int64_t numQubits, ...) 
-{ - int64_t shots = getQuantumDevicePtr()->GetDeviceShots(); - RT_ASSERT(shots >= 0); - RT_ASSERT(numQubits >= 0); - MemRefT *result_eigvals_p = (MemRefT *)&result->first; - MemRefT *result_counts_p = (MemRefT *)&result->second; - - va_list args; - va_start(args, numQubits); - std::vector wires(numQubits); - for (int64_t i = 0; i < numQubits; i++) { - wires[i] = va_arg(args, QubitIdType); - } - va_end(args); - - DataView eigvals_view(result_eigvals_p->data_aligned, result_eigvals_p->offset, - result_eigvals_p->sizes, result_eigvals_p->strides); - DataView counts_view(result_counts_p->data_aligned, result_counts_p->offset, - result_counts_p->sizes, result_counts_p->strides); - - if (wires.empty()) { - getQuantumDevicePtr()->Counts(eigvals_view, counts_view, shots); - } - else { - getQuantumDevicePtr()->PartialCounts(eigvals_view, counts_view, wires, shots); - } -} - -int64_t __catalyst__rt__array_get_size_1d(QirArray *ptr) -{ - std::vector *qubit_vector_ptr = reinterpret_cast *>(ptr); - return qubit_vector_ptr->size(); -} - -int8_t *__catalyst__rt__array_get_element_ptr_1d(QirArray *ptr, int64_t idx) -{ - std::vector *qubit_vector_ptr = reinterpret_cast *>(ptr); - - RT_ASSERT(idx >= 0); - std::string error_msg = "The qubit register does not contain the requested wire: "; - error_msg += std::to_string(idx); - RT_FAIL_IF(static_cast(idx) >= qubit_vector_ptr->size(), error_msg.c_str()); - - QubitIdType *data = qubit_vector_ptr->data(); - return (int8_t *)&data[idx]; -} -} diff --git a/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt b/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt deleted file mode 100644 index 2c19e4a..0000000 --- a/src/qirlightning/catalyst_runtime/lib/registry/CMakeLists.txt +++ /dev/null @@ -1,33 +0,0 @@ -# nanobind suggests including these lines to configure CMake to perform an optimized release build -# by default unless another build type is specified. 
Without this addition, binding code may run -# slowly and produce large binaries. -# See https://nanobind.readthedocs.io/en/latest/building.html#preliminaries -if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") -endif() - -# Locate nanobind -execute_process( - COMMAND "${Python_EXECUTABLE}" -c "import nanobind; print(nanobind.cmake_dir())" - OUTPUT_VARIABLE nanobind_DIR OUTPUT_STRIP_TRAILING_WHITESPACE -) -find_package(nanobind CONFIG REQUIRED) - -# Source file list for `wrapper` module -set(REGISTRY_SRC_FILES - Registry.cpp -) - -# Create the Python `catalyst_callback_registry` module -# Target the stable ABI for Python 3.12+, which reduces the number of binary wheels that must be -# built (`STABLE_ABI` does nothing on older Python versions). -nanobind_add_module(catalyst_callback_registry STABLE_ABI ${REGISTRY_SRC_FILES}) - -# Use a consistant suffix ".so" rather than, e.g. ".abi3.so" (when using the Stable ABI) or -# ".cpython-3xx-darwin.so". Doing so simplifies the process to locate it when calling -# `dlopen(LIBREGISTRY)` in runtime/lib/capi/RuntimeCAPI.cpp. -set_target_properties(catalyst_callback_registry PROPERTIES SUFFIX ".so") - -target_include_directories(catalyst_callback_registry PUBLIC ${runtime_includes}) -target_compile_definitions(catalyst_qir_qis_obj PUBLIC -DLIBREGISTRY=\"$\") diff --git a/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp b/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp deleted file mode 100644 index fd4715d..0000000 --- a/src/qirlightning/catalyst_runtime/lib/registry/Registry.cpp +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright 2024 Xanadu Quantum Technologies Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include - -#include -#include - -namespace nb = nanobind; - -// From PyBind11's documentation: -// -// Do you have any global variables that are pybind11 objects or invoke pybind11 functions in -// either their constructor or destructor? You are generally not allowed to invoke any Python -// function in a global static context. We recommend using lazy initialization and then -// intentionally leaking at the end of the program. -// -// https://pybind11.readthedocs.io/en/stable/advanced/misc.html#common-sources-of-global-interpreter-lock-errors -std::unordered_map *references; - -std::string libmlirpath; - -struct UnrankedMemrefType { - int64_t rank; - void *descriptor; -}; - -class LibraryManager { - void *_handle; - - public: - LibraryManager(std::string path) - { - this->_handle = dlopen(path.c_str(), RTLD_LAZY); - if (!this->_handle) { - throw nb::value_error(dlerror()); - } - } - - ~LibraryManager() - { - if (this->_handle) { - dlclose(this->_handle); - } - } - - void operator()(long elementSize, UnrankedMemrefType *src, UnrankedMemrefType *dst) - { - void *f_ptr = dlsym(this->_handle, "memrefCopy"); - if (!f_ptr) { - throw nb::value_error(dlerror()); - } - typedef void (*memrefCopy_t)(int64_t, void *, void *); - void (*memrefCopy)(int64_t, void *, void *); - memrefCopy = (memrefCopy_t)(f_ptr); - return memrefCopy(elementSize, src, dst); - } -}; - -inline const char *ext() -{ -#ifdef __APPLE__ - return ".dylib"; -#elif __linux__ - return ".so"; -#else -#error "Only apple and linux are currently 
supported"; -#endif -} - -std::string library_name(std::string name) { return name + ext(); } - -void convertResult(nb::handle tuple) -{ - nb::object unrankedMemrefPtrSizeTuple = tuple.attr("__getitem__")(0); - - nb::object unranked_memref = unrankedMemrefPtrSizeTuple.attr("__getitem__")(0); - nb::object element_size = unrankedMemrefPtrSizeTuple.attr("__getitem__")(1); - nb::object unranked_memref_ptr_int = unranked_memref.attr("value"); - - void *unranked_memref_ptr = reinterpret_cast(nb::cast(unranked_memref_ptr_int)); - long e_size = nb::cast(element_size); - - nb::object dest = tuple.attr("__getitem__")(1); - - long destAsLong = nb::cast(dest); - void *destAsPtr = (void *)(destAsLong); - - UnrankedMemrefType *src = (UnrankedMemrefType *)unranked_memref_ptr; - UnrankedMemrefType destMemref = {src->rank, destAsPtr}; - - std::string libpath = libmlirpath + library_name("/libmlir_c_runner_utils"); - LibraryManager memrefCopy(libpath); - memrefCopy(e_size, src, &destMemref); -} - -void convertResults(nb::list results, nb::list allocated) -{ - auto builtins = nb::module_::import_("builtins"); - auto zip = builtins.attr("zip"); - for (nb::handle obj : zip(results, allocated)) { - convertResult(obj); - } -} - -extern "C" { -[[gnu::visibility("default")]] void callbackCall(int64_t identifier, int64_t count, int64_t retc, - va_list args) -{ - nb::gil_scoped_acquire lock; - auto it = references->find(identifier); - if (it == references->end()) { - throw std::invalid_argument("Callback called with invalid identifier"); - } - auto lambda = it->second; - - nb::list flat_args; - for (int i = 0; i < count; i++) { - int64_t ptr = va_arg(args, int64_t); - flat_args.append(ptr); - } - - nb::list flat_results = nb::list(lambda(flat_args)); - - // We have a flat list of return values. - // These returns **may** be array views to - // the very same memrefs that we passed as inputs. - // As a first prototype, let's copy these values. 
- // I think it is best to always copy them because - // of aliasing. Let's just copy them to guarantee - // no aliasing issues. We can revisit this as an optimization - // and allowing these to alias. - nb::list flat_returns_allocated_compiler; - for (int i = 0; i < retc; i++) { - int64_t ptr = va_arg(args, int64_t); - flat_returns_allocated_compiler.append(ptr); - } - convertResults(flat_results, flat_returns_allocated_compiler); -} -} - -void setMLIRLibPath(std::string path) { libmlirpath = path; } - -auto registerImpl(nb::callable f) -{ - // Do we need to see if it is already present or can we just override it? Just override is fine. - // Does python reuse id's? Yes. - // But only after they have been garbaged collected. - // So as long as we maintain a reference to it, then they won't be garbage collected. - // Inserting the function into the unordered map increases the reference by one. - int64_t id = reinterpret_cast(f.ptr()); - references->insert({id, f}); - return id; -} - -NB_MODULE(catalyst_callback_registry, m) -{ - if (references == nullptr) { - references = new std::unordered_map(); - } - m.doc() = "Callbacks"; - m.def("register", ®isterImpl, "Call a python function registered in a map."); - m.def("set_mlir_lib_path", &setMLIRLibPath, "Set location of mlir's libraries."); -} diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md index 7a25fe4..02c6f3c 100644 --- a/src/qirlightning/simple_demo/README.md +++ b/src/qirlightning/simple_demo/README.md @@ -1,8 +1,8 @@ # Simple Demo for Catalyst/Lightning runtime -This is a super simple demo for using Catalyst runtime to drive Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present). +This is a super simple demo for driving Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. 
lightning.gpu (if an Nvidia GPU is present). -The new files required are in `../catalyst_runtime`, which contains a subset of files from the [Catalyst Runtime](https://github.com/PennyLaneAI/catalyst/tree/main/runtime). +The only extra header files required are the `../catalyst_runtime/include`, which contains the include files from the [Catalyst Runtime ](https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include) (for the QuantumDevice interface). ## Installing a lightning simulator @@ -36,7 +36,7 @@ You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for ligh To compile: ``` -$ clang++ --std=c++20 test_rt_device.cpp -I/home/joseph/work/qiree/catalyst/runtime/include -I/home/joseph/work/qiree/catalyst/runtime/lib/capi -I/home/joseph/work/qiree/catalyst/runtime/lib/backend/common -o test_rt_device.out +$ clang++ --std=c++20 test_rt_device.cpp -I../catalyst_runtime/include -o test_rt_device.out ``` To run: @@ -57,5 +57,6 @@ Measure on wire 0 = 0 To run on other devices, e.g. lightning.gpu, you need to change: - `pip install custatevec-cu12 pennylane-lightning-gpu` (custatevec is a dependency) -- replace `RTDLIB` and `RTDNAME` from `kokkos` to `GPU` -- include `cuquantum` libraries when running, e.g. `LD_LIBRARY_PATH=/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/cuquantum/lib/:$LD_LIBRARY_PATH ./test_rt_device.out` +- replace `RTDLIB` from `kokkos` to `gpu` +- replace `RTDDEVICE` from `Kokkos` to `GPU` +- install `cuquantum` via `pip install custatevec-cu12`, then include `cuquantum` libraries when running, e.g. 
`LD_LIBRARY_PATH=/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/cuquantum/lib/:$LD_LIBRARY_PATH ./test_rt_device.out` diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp index f70410a..721ad1d 100644 --- a/src/qirlightning/simple_demo/test_rt_device.cpp +++ b/src/qirlightning/simple_demo/test_rt_device.cpp @@ -1,40 +1,73 @@ -#include "ExecutionContext.hpp" +#include + +#include "QuantumDevice.hpp" // Runtime libraries (kokkos/GPU/qubit etc.) -#define RTDLIB "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning/liblightning_kokkos_catalyst.so" // change to liblightning_gpu_catalyst.so -#define RTDNAME "LightningKokkosSimulator" // change to LightningGPUSimulator +#define RTDLIB \ + "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \ + "pennylane_lightning/liblightning_kokkos_catalyst.so"; +#define RTDDEVICE "LightningKokkosSimulator"; + +extern "C" Catalyst::Runtime::QuantumDevice* +GenericDeviceFactory(char const* kwargs); using namespace Catalyst::Runtime; -static inline std::shared_ptr loadRTDevice(const std::string &rtd_lib, - const std::string &rtd_name = {}, - const std::string &rtd_kwargs = {}) +int main() { - ExecutionContext context; - return context.getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs); -} + try + { + // Load lightning simulation library + std::string rtd_lib = RTDLIB; + std::string rtd_device = RTDDEVICE; + std::string kwargs = {}; + auto rtld_flags = RTLD_LAZY | RTLD_NODELETE; + auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags); + + if (!rtd_dylib_handler) + { + throw std::runtime_error("Failed to load library: " + rtd_lib); + } -int main() { - auto RTDevice = loadRTDevice(RTDLIB, RTDNAME, ""); + // Find device factory + std::string factory_name = rtd_device + "Factory"; + void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str()); - // Allocate Qubits - RTDevice->getQuantumDevicePtr()->AllocateQubits(3); + 
if (!f_ptr) + { + dlclose(rtd_dylib_handler); + throw std::runtime_error("Failed to find factory function: " + + factory_name); + } + std::string rtd_kwargs = {}; + auto rtd_qdevice = std::unique_ptr( + reinterpret_cast(f_ptr)( + rtd_kwargs.c_str())); - // Get Num Qubits - std::cout << "Num Qubits = " << RTDevice->getQuantumDevicePtr()->GetNumQubits() << std::endl; + // Allocate Qubits + rtd_qdevice->AllocateQubits(3); - // Apply Gate - RTDevice->getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {0}); + // Get Num Qubits + std::cout << "Num Qubits = " << rtd_qdevice->GetNumQubits() + << std::endl; - // Print State - std::cout << "State = " << std::endl; - RTDevice->getQuantumDevicePtr()->PrintState(); + // Apply Gate + rtd_qdevice->NamedOperation("Hadamard", {}, {0}); - // Measure - QubitIdType wire{0}; - Result result = RTDevice->getQuantumDevicePtr()->Measure(wire, std::nullopt); - std::cout << "Measure on wire 0 = " << *result << std::endl; + // Print State + std::cout << "State = " << std::endl; + rtd_qdevice->PrintState(); + // Measure + QubitIdType wire{0}; + Result result = rtd_qdevice->Measure(wire, std::nullopt); + std::cout << "Measure on wire 0 = " << *result << std::endl; + } + catch (std::exception const& e) + { + std::cerr << "Error: " << e.what() << std::endl; + return EXIT_FAILURE; + } - return 0; + return EXIT_SUCCESS; } From 6c729079aacfba68f5a1605b078031c4fa1296d7 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 15 Jan 2025 19:35:11 +0000 Subject: [PATCH 36/64] remove redundant file --- src/qirlightning/simple_demo/LightningDevice.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/qirlightning/simple_demo/LightningDevice.cpp diff --git a/src/qirlightning/simple_demo/LightningDevice.cpp b/src/qirlightning/simple_demo/LightningDevice.cpp deleted file mode 100644 index e69de29..0000000 From 6da98d30b251314a8309b93b47c9e82139db8f68 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 15 Jan 2025 20:07:23 +0000 
Subject: [PATCH 37/64] update lightningquantum --- src/qirlightning/LightningQuantum.cc | 127 ++++++++++++++------------- src/qirlightning/LightningQuantum.hh | 18 +--- 2 files changed, 71 insertions(+), 74 deletions(-) diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc index 3f39825..fe8b07e 100644 --- a/src/qirlightning/LightningQuantum.cc +++ b/src/qirlightning/LightningQuantum.cc @@ -18,28 +18,48 @@ #include "qiree/Assert.hh" // Lightning -#include "catalyst_runtime/lib/capi/ExecutionContext.hpp" +#include "QuantumDevice.hpp" + +#define RTDLIB \ + "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \ + "pennylane_lightning/liblightning_kokkos_catalyst.so"; +#define RTDDEVICE "LightningKokkosSimulator"; namespace qiree { using namespace Catalyst::Runtime; -static inline std::shared_ptr loadRTDevice(const std::string &rtd_lib, - const std::string &rtd_name = {}, - const std::string &rtd_kwargs = {}) -{ - ExecutionContext context; - return context.getOrCreateDevice(rtd_lib, rtd_name, rtd_kwargs); -} - //---------------------------------------------------------------------------// /*! 
* Initialize the Lightning simulator */ -LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) -{ - auto RTDevice = loadDevice("/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning/liblightning_gpu_catalyst.so", "LightningGPUSimulator", ""); - +LightningQuantum::LightningQuantum(std::ostream& os) : output_(os) +{ + std::string rtd_lib = RTDLIB; + std::string rtd_device = RTDDEVICE; + std::string kwargs = {}; + auto rtld_flags = RTLD_LAZY | RTLD_NODELETE; + auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags); + + if (!rtd_dylib_handler) + { + throw std::runtime_error("Failed to load library: " + rtd_lib); + } + + // Find device factory + std::string factory_name = rtd_device + "Factory"; + void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str()); + + if (!f_ptr) + { + dlclose(rtd_dylib_handler); + throw std::runtime_error("Failed to find factory function: " + + factory_name); + } + std::string rtd_kwargs = {}; + rtd_qdevice = std::unique_ptr( + reinterpret_cast(f_ptr)( + rtd_kwargs.c_str())); } //---------------------------------------------------------------------------// @@ -54,11 +74,10 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs) { QIREE_VALIDATE(attrs.required_num_qubits > 0, << "input is not a quantum program"); - - num_qubits_ = attrs.required_num_qubits; // Set the number of qubits - RTDevice->getQuantumDevicePtr()->AllocateQubits(num_qubits_); + num_qubits_ = attrs.required_num_qubits; // Set the number of qubits + rtd_qdevice->AllocateQubits(num_qubits_); } //---------------------------------------------------------------------------// @@ -67,8 +86,10 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs) */ void LightningQuantum::tear_down() { - context->deactivateDevice(RTDevice); - RTDevice = nullptr; + if (rtd_dylib_handler) + { + dlclose(rtd_dylib_handler); + } } //---------------------------------------------------------------------------// @@ -77,7 +98,7 @@ 
void LightningQuantum::tear_down() */ void LightningQuantum::reset(Qubit q) { - q.value = 0; + rtd_qdevice->SetState({{0, 0}}, {q.value}); } //----------------------------------------------------------------------------// @@ -86,8 +107,7 @@ void LightningQuantum::reset(Qubit q) */ QState LightningQuantum::read_result(Result r) { - - return static_cast(meas_results[0].bitstring[0]); + return results_[r.value] } //---------------------------------------------------------------------------// @@ -98,19 +118,19 @@ QState LightningQuantum::read_result(Result r) * qsim) */ void LightningQuantum::mz(Qubit q, Result r) -{ // we don't classical register yet. - /* QIREE_EXPECT(q.value < this->num_qubits()); */ // TODO: q must be in the set - // of qubits, e.g., what - // happens if q=5 and qubits - // are {2,3,4,5}, q is less - // than num_qubits but not it - // is in the set of qubits. +{ + QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in + // the set of qubits, + // e.g., what happens if + // q=5 and qubits are + // {2,3,4,5}, q is less + // than num_qubits but + // not it is in the set + // of qubits. // TODO: maybe not what we want long term QIREE_EXPECT(q.value == r.value); // Add measurement instruction - Measure(q.value, std::nullopt); - // RETURN MEASURE RESULT?? - + results_[r.value] = rtd_qdevice->Measure(q.value, std::nullopt); } //---------------------------------------------------------------------------// @@ -121,75 +141,62 @@ void LightningQuantum::mz(Qubit q, Result r) // 1. 
Entangling gates void LightningQuantum::cx(Qubit q1, Qubit q2) { - RTDevice->getQuantumDevicePtr()->NamedOperation("CNOT", {}, {q1.value, q2.value}); + rtd_qdevice->NamedOperation( + "CNOT", {}, {q1.value, q2.value}); } void LightningQuantum::cnot(Qubit q1, Qubit q2) { - RTDevice->getQuantumDevicePtr()->NamedOperation("CNOT", {}, {q1.value, q2.value}); + rtd_qdevice->NamedOperation( + "CNOT", {}, {q1.value, q2.value}); } void LightningQuantum::cz(Qubit q1, Qubit q2) { - RTDevice->getQuantumDevicePtr()->NamedOperation("CZ", {}, {q1.value, q2.value}); + rtd_qdevice->NamedOperation( + "CZ", {}, {q1.value, q2.value}); } // 2. Local gates void LightningQuantum::h(Qubit q) { - RTDevice->getQuantumDevicePtr()->NamedOperation("Hadamard", {}, {q.value}); + rtd_qdevice->NamedOperation("Hadamard", {}, {q.value}); } void LightningQuantum::s(Qubit q) { - RTDevice->getQuantumDevicePtr()->NamedOperation("S", {}, {q.value}); + rtd_qdevice->NamedOperation("S", {}, {q.value}); } void LightningQuantum::t(Qubit q) { - RTDevice->getQuantumDevicePtr()->NamedOperation("T", {}, {q.value}); + rtd_qdevice->NamedOperation("T", {}, {q.value}); } // 2.1 Pauli gates void LightningQuantum::x(Qubit q) { - RTDevice->getQuantumDevicePtr()->NamedOperation("PauliX", {}, {q.value}); + rtd_qdevice->NamedOperation("PauliX", {}, {q.value}); } void LightningQuantum::y(Qubit q) { - RTDevice->getQuantumDevicePtr()->NamedOperation("PauliY", {}, {q.value}); + rtd_qdevice->NamedOperation("PauliY", {}, {q.value}); } void LightningQuantum::z(Qubit q) { - RTDevice->getQuantumDevicePtr()->NamedOperation("PauliZ", {}, {q.value}); + rtd_qdevice->NamedOperation("PauliZ", {}, {q.value}); } // 2.2 rotation gates void LightningQuantum::rx(double theta, Qubit q) { - RTDevice->getQuantumDevicePtr()->NamedOperation("RX", {theta}, {q.value}); + rtd_qdevice->NamedOperation("RX", {theta}, {q.value}); } void LightningQuantum::ry(double theta, Qubit q) { - RTDevice->getQuantumDevicePtr()->NamedOperation("RY", {theta}, 
{q.value}); + rtd_qdevice->NamedOperation("RY", {theta}, {q.value}); } void LightningQuantum::rz(double theta, Qubit q) { - RTDevice->getQuantumDevicePtr()->NamedOperation("RZ", {theta}, {q.value}); + rtd_qdevice->NamedOperation("RZ", {theta}, {q.value}); } Qubit LightningQuantum::result_to_qubit(Result r) { - // TODO: This function is not working. Giving 0 every time. Maybe not - // needed. - /* QIREE_EXPECT(r.value < this->num_results()); */ - return result_to_qubit_[r.value]; // just copied this from the qirxacc, I - // have no idea if we need to do - // something else here -} - -void LightningQuantum::print_accelbuf() -{ - // TODO: to be implemented, we can create a buffer class to store the - // results -} - -void LightningQuantum::execute_if_needed() -{ - /* QIREE_EXPECT(false); */ + return result_to_qubit_[r.value]; } } // namespace qiree diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh index e9b8bb2..36e35e8 100644 --- a/src/qirlightning/LightningQuantum.hh +++ b/src/qirlightning/LightningQuantum.hh @@ -27,7 +27,7 @@ class LightningQuantum final : virtual public QuantumNotImpl { public: // Construct with number of shots - LightningQuantum(std::ostream& os, unsigned long int shots); + LightningQuantum(std::ostream& os); ~LightningQuantum(); QIREE_DELETE_COPY_MOVE(LightningQuantum); // Delete copy and move constructors @@ -89,24 +89,14 @@ class LightningQuantum final : virtual public QuantumNotImpl void z(Qubit) final; //!@} - // Update the buffer - Buffer manager; - private: - - //// TYPES //// - - struct Factory; - struct State; - + //// DATA //// std::ostream& output_; - unsigned long int seed_{}; - std::unique_ptr state_; + std::unique_ptr rtd_qdevice; + std::vector results_; - unsigned num_threads_{}; // Number of threads to use - size_t gate_index_; // when the quantum operation will be executed size_type num_qubits_{}; std::vector result_to_qubit_; }; From 7713cca513ab94a8f2ae3da760352e7d872a7f6f Mon Sep 17 
00:00:00 2001 From: Joseph Lee Date: Mon, 10 Mar 2025 17:55:12 +0000 Subject: [PATCH 38/64] update --- src/qirlightning/simple_demo/README.md | 10 +++++++--- src/qirlightning/simple_demo/test_rt_device.cpp | 4 ++-- src/qirqsim/QsimDefaultRuntime.hh | 1 + src/qirqsim/QsimQuantum.hh | 4 ++++ 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md index 02c6f3c..7aecc9d 100644 --- a/src/qirlightning/simple_demo/README.md +++ b/src/qirlightning/simple_demo/README.md @@ -16,7 +16,7 @@ $ pip install pennylane-lightning-kokkos $ pip show pennylane-lightning-kokkos Name: PennyLane_Lightning_Kokkos -Version: 0.39.0 +Version: 0.40.0 Summary: PennyLane-Lightning plugin Home-page: https://github.com/PennyLaneAI/pennylane-lightning Author: @@ -39,6 +39,8 @@ To compile: $ clang++ --std=c++20 test_rt_device.cpp -I../catalyst_runtime/include -o test_rt_device.out ``` +## Running the example + To run: ``` @@ -55,8 +57,10 @@ State = Measure on wire 0 = 0 ``` +## Running on other devices + To run on other devices, e.g. lightning.gpu, you need to change: -- `pip install custatevec-cu12 pennylane-lightning-gpu` (custatevec is a dependency) +- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu` - replace `RTDLIB` from `kokkos` to `gpu` - replace `RTDDEVICE` from `Kokkos` to `GPU` -- install `cuquantum` via `pip install custatevec-cu12`, then include `cuquantum` libraries when running, e.g. `LD_LIBRARY_PATH=/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/cuquantum/lib/:$LD_LIBRARY_PATH ./test_rt_device.out` +- Include `cuquantum` libraries when running (which was installed as a dependency), i.e. 
`LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp index 721ad1d..4d2736f 100644 --- a/src/qirlightning/simple_demo/test_rt_device.cpp +++ b/src/qirlightning/simple_demo/test_rt_device.cpp @@ -5,8 +5,8 @@ // Runtime libraries (kokkos/GPU/qubit etc.) #define RTDLIB \ "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \ - "pennylane_lightning/liblightning_kokkos_catalyst.so"; -#define RTDDEVICE "LightningKokkosSimulator"; + "pennylane_lightning/liblightning_gpu_catalyst.so"; +#define RTDDEVICE "LightningGPUSimulator"; extern "C" Catalyst::Runtime::QuantumDevice* GenericDeviceFactory(char const* kwargs); diff --git a/src/qirqsim/QsimDefaultRuntime.hh b/src/qirqsim/QsimDefaultRuntime.hh index daff34a..a271d84 100644 --- a/src/qirqsim/QsimDefaultRuntime.hh +++ b/src/qirqsim/QsimDefaultRuntime.hh @@ -42,6 +42,7 @@ class QsimDefaultRuntime final : virtual public RuntimeInterface //!@{ //! 
\name Runtime interface + // Initialize the execution environment, resetting qubits void initialize(OptionalCString env) override; diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh index 1b04bf0..ddeea67 100644 --- a/src/qirqsim/QsimQuantum.hh +++ b/src/qirqsim/QsimQuantum.hh @@ -98,6 +98,10 @@ class QsimQuantum final : virtual public QuantumNotImpl unsigned long int seed_{}; std::unique_ptr state_; std::vector results_; + + unsigned num_threads_{}; // Number of threads to use + size_t gate_index_; // when the quantum operation will be executed + size_type num_qubits_{}; std::vector result_to_qubit_; //// HELPER FUNCTIONS //// From e777b9cc32dbb3a7eac53a8b414a269dbccfd7e0 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Mon, 10 Mar 2025 17:56:31 +0000 Subject: [PATCH 39/64] format --- src/qirqsim/QsimDefaultRuntime.hh | 2 +- src/qirqsim/QsimQuantum.hh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/qirqsim/QsimDefaultRuntime.hh b/src/qirqsim/QsimDefaultRuntime.hh index a271d84..f4e8e4d 100644 --- a/src/qirqsim/QsimDefaultRuntime.hh +++ b/src/qirqsim/QsimDefaultRuntime.hh @@ -42,7 +42,7 @@ class QsimDefaultRuntime final : virtual public RuntimeInterface //!@{ //! 
\name Runtime interface - + // Initialize the execution environment, resetting qubits void initialize(OptionalCString env) override; diff --git a/src/qirqsim/QsimQuantum.hh b/src/qirqsim/QsimQuantum.hh index ddeea67..7f2b2bb 100644 --- a/src/qirqsim/QsimQuantum.hh +++ b/src/qirqsim/QsimQuantum.hh @@ -98,7 +98,7 @@ class QsimQuantum final : virtual public QuantumNotImpl unsigned long int seed_{}; std::unique_ptr state_; std::vector results_; - + unsigned num_threads_{}; // Number of threads to use size_t gate_index_; // when the quantum operation will be executed size_type num_qubits_{}; From 6caa66768fe2a365a6ff2c4a30b9e714d11e634f Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Mon, 10 Mar 2025 20:03:58 +0000 Subject: [PATCH 40/64] support lightning --- CMakeLists.txt | 13 +++- app/CMakeLists.txt | 17 +++++ app/qir-lightning.cc | 74 +++++++++++++++++++++ src/CMakeLists.txt | 4 ++ src/qirlightning/CMakeLists.txt | 19 +++--- src/qirlightning/LightningDefaultRuntime.cc | 44 ------------ src/qirlightning/LightningDefaultRuntime.hh | 23 +++++-- src/qirlightning/LightningQuantum.cc | 60 +++++++---------- src/qirlightning/LightningQuantum.hh | 46 ++++++++----- 9 files changed, 192 insertions(+), 108 deletions(-) create mode 100644 app/qir-lightning.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 3eb2675..b0a21a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,7 +38,8 @@ option(QIREE_BUILD_DOCS "Build QIR-EE documentation" OFF) option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" ON) option(QIREE_BUILD_EXAMPLES "Build QIR-EE examples" OFF) option(QIREE_USE_QSIM "Download and build Google qsim backend" OFF) -option(QIREE_USE_XACC "Build XACC interface" ON) +option(QIREE_USE_XACC "Build XACC interface" OFF) +option(QIREE_USE_LIGHTNING "Build Pennylane Lightning backend" ON) qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS}) @@ -138,6 +139,16 @@ if(QIREE_USE_QSIM) ) endif() +if(QIREE_USE_LIGHTNING) +qiree_add_library(qiree_lightning INTERFACE) 
+add_library(QIREE::lightning ALIAS qiree_lightning) +target_include_directories(qiree_lightning SYSTEM INTERFACE + "$" + "$" +) +endif() + + if(QIREE_USE_XACC) find_package(XACC REQUIRED) endif() diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index 4bf7330..58e63f2 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -28,6 +28,23 @@ if(QIREE_USE_QSIM) ) endif() +#-----------------------------------------------------------------------------# +# LIGHTNING FRONT END +#-----------------------------------------------------------------------------# + +if(QIREE_USE_LIGHTNING) + + # Include directories for catalyst_runtime + include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../src/qirlightning/catalyst_runtime/include") + qiree_add_executable(qir-lightning + qir-lightning.cc + ) + target_link_libraries(qir-lightning + PUBLIC QIREE::qiree QIREE::qirlightning + PRIVATE CLI11::CLI11 + ) +endif() + #-----------------------------------------------------------------------------# # XACC FRONT END #-----------------------------------------------------------------------------# diff --git a/app/qir-lightning.cc b/app/qir-lightning.cc new file mode 100644 index 0000000..2244c97 --- /dev/null +++ b/app/qir-lightning.cc @@ -0,0 +1,74 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! 
\file app/qir-lightning.cc +//---------------------------------------------------------------------------// +#include +#include +#include +#include + +#include "qiree/Executor.hh" +#include "qiree/Module.hh" +#include "qiree/ResultDistribution.hh" +#include "qirlightning/LightningDefaultRuntime.hh" +#include "qirlightning/LightningQuantum.hh" + +using namespace std::string_view_literals; + +namespace qiree +{ +namespace app +{ +//---------------------------------------------------------------------------// +void run(std::string const& filename, int num_shots) +{ + // Load the input + Executor execute{Module{filename}}; + + // Set up qsim + LightningQuantum sim(std::cout, 0); + LightningDefaultRuntime rt(std::cout, sim); + ResultDistribution distribution; + + // Run several time = shots (default 1) + for (int i = 0; i < num_shots; i++) + { + execute(sim, rt); + distribution.accumulate(rt.result()); + } + + std::cout << distribution.to_json() << std::endl; +} + +//---------------------------------------------------------------------------// +} // namespace app +} // namespace qiree + +//---------------------------------------------------------------------------// +/*! + * Execute and run. 
+ */ +int main(int argc, char* argv[]) +{ + int num_shots{1}; + std::string filename; + + CLI::App app; + + auto* filename_opt + = app.add_option("--input,-i,input", filename, "QIR input file"); + filename_opt->required(); + + auto* nshot_opt + = app.add_option("-s,--shots", num_shots, "Number of shots"); + nshot_opt->capture_default_str(); + + CLI11_PARSE(app, argc, argv); + + qiree::app::run(filename, num_shots); + + return EXIT_SUCCESS; +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b01bf2f..b3d81fd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,4 +21,8 @@ if(QIREE_USE_QSIM) add_subdirectory(qirqsim) endif() +if(QIREE_USE_LIGHTNING) + add_subdirectory(qirlightning) +endif() + #---------------------------------------------------------------------------## diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt index 0d81dec..0a3eab5 100644 --- a/src/qirlightning/CMakeLists.txt +++ b/src/qirlightning/CMakeLists.txt @@ -4,16 +4,19 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #----------------------------------------------------------------------------# -# Adding qsim as a library to qiree -qiree_add_library(qirqsim - QsimQuantum.cc - QsimDefaultRuntime.cc +# Include directories for catalyst_runtime +include_directories("${CMAKE_CURRENT_SOURCE_DIR}/catalyst_runtime/include") + +# Adding lightning as a library to qiree +qiree_add_library(qirlightning + LightningQuantum.cc + LightningDefaultRuntime.cc ) -#Link the qsim library to qiree and any other relevant libraries -target_link_libraries(qirqsim +#Link the lightning library to qiree and any other relevant libraries +target_link_libraries(qirlightning PUBLIC QIREE::qiree # Link to qiree - PRIVATE QIREE::qsim + PRIVATE QIREE::lightning ) #----------------------------------------------------------------------------# @@ -22,7 +25,7 @@ target_link_libraries(qirqsim # Install headers, matching the relevant .hh files for qsim integration 
install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/qirqsim" + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/qirlightning" COMPONENT development FILES_MATCHING REGEX ".*\\.hh?$" ) diff --git a/src/qirlightning/LightningDefaultRuntime.cc b/src/qirlightning/LightningDefaultRuntime.cc index 2440ee0..7e5da3e 100644 --- a/src/qirlightning/LightningDefaultRuntime.cc +++ b/src/qirlightning/LightningDefaultRuntime.cc @@ -26,48 +26,4 @@ void LightningDefaultRuntime::initialize(OptionalCString env) } } -//---------------------------------------------------------------------------// -/*! - * Execute circuit and mark the following N results as being part of an array - * named tag - */ - -void LightningDefaultRuntime::array_record_output(size_type s, OptionalCString tag) -{ - // this->execute_if_needed(); - // output_ << "array " << (tag ? tag : "") << " length " << s - // << std::endl; -} - -//---------------------------------------------------------------------------// -/*! - * Execute circuit and mark the following N results as being part of a tuple - * named tag - */ - -void LightningDefaultRuntime::tuple_record_output(size_type s, OptionalCString tag) -{ - // this->execute_if_needed(); - // output_ << "tuple " << (tag ? tag : "") << " length " << s - // << std::endl; -} - -//---------------------------------------------------------------------------// -/*! 
- * Execute circuit and report a single measurement result - */ -void LightningDefaultRuntime::result_record_output(Result r, OptionalCString tag) -{ - // Access values through the getter - // This prints results every time result_record_output is called - // Can comment out if only want to see final results - - if (auto value = sim_.manager.getBufferValue("q" + std::to_string(r.value)); - value.has_value()) - { - std::cout << "q" << std::to_string(r.value) << " : " << value.value() - << "\n"; - } -} - } // namespace qiree diff --git a/src/qirlightning/LightningDefaultRuntime.hh b/src/qirlightning/LightningDefaultRuntime.hh index cac9c1e..2d3e9fa 100644 --- a/src/qirlightning/LightningDefaultRuntime.hh +++ b/src/qirlightning/LightningDefaultRuntime.hh @@ -8,6 +8,7 @@ #pragma once #include "LightningQuantum.hh" +#include "qiree/RecordedResult.hh" namespace qiree { @@ -41,22 +42,36 @@ class LightningDefaultRuntime final : virtual public RuntimeInterface //!@{ //! \name Runtime interface + // Initialize the execution environment, resetting qubits void initialize(OptionalCString env) override; //! Mark the following N results as being part of an array named tag - void array_record_output(size_type, OptionalCString tag) final; + void array_record_output(size_type size, OptionalCString tag) final + { + result_ = RecordedResult(size, tag); + } //! Mark the following N results as being part of a tuple named tag - void tuple_record_output(size_type, OptionalCString) final; + void tuple_record_output(size_type size, OptionalCString tag) final + { + result_ = RecordedResult(size, tag); + } - // Save one result - void result_record_output(Result result, OptionalCString tag) final; + //! 
Save one result + void result_record_output(Result result, OptionalCString tag) final + { + result_.push_back(sim_.get_result(result), tag); + } //!@} + RecordedResult const& result() const { return result_; } + + private: std::ostream& output_; LightningQuantum& sim_; + RecordedResult result_; }; } // namespace qiree diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc index fe8b07e..4ec6abd 100644 --- a/src/qirlightning/LightningQuantum.cc +++ b/src/qirlightning/LightningQuantum.cc @@ -14,17 +14,16 @@ #include #include #include +#include #include "qiree/Assert.hh" -// Lightning -#include "QuantumDevice.hpp" - #define RTDLIB \ "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \ "pennylane_lightning/liblightning_kokkos_catalyst.so"; #define RTDDEVICE "LightningKokkosSimulator"; - +extern "C" Catalyst::Runtime::QuantumDevice* +GenericDeviceFactory(char const* kwargs); namespace qiree { using namespace Catalyst::Runtime; @@ -33,13 +32,13 @@ using namespace Catalyst::Runtime; /*! 
* Initialize the Lightning simulator */ -LightningQuantum::LightningQuantum(std::ostream& os) : output_(os) +LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os) { std::string rtd_lib = RTDLIB; std::string rtd_device = RTDDEVICE; std::string kwargs = {}; auto rtld_flags = RTLD_LAZY | RTLD_NODELETE; - auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags); + rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags); if (!rtd_dylib_handler) { @@ -76,6 +75,7 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs) << "input is not a quantum program"); num_qubits_ = attrs.required_num_qubits; // Set the number of qubits + results_.resize(attrs.required_num_results); rtd_qdevice->AllocateQubits(num_qubits_); } @@ -98,7 +98,9 @@ void LightningQuantum::tear_down() */ void LightningQuantum::reset(Qubit q) { - rtd_qdevice->SetState({{0, 0}}, {q.value}); + std::vector data = {0}; + DataView state(data); + std::vector wires = {static_cast(q.value)}; rtd_qdevice->SetBasisState(state, wires); } //----------------------------------------------------------------------------// @@ -107,7 +109,7 @@ void LightningQuantum::reset(Qubit q) */ QState LightningQuantum::read_result(Result r) { - return results_[r.value] + return this->get_result(r); } //---------------------------------------------------------------------------// @@ -119,17 +121,8 @@ QState LightningQuantum::read_result(Result r) */ void LightningQuantum::mz(Qubit q, Result r) { - QIREE_EXPECT(q.value < this->num_qubits()); // TODO: q must be in - // the set of qubits, - // e.g., what happens if - // q=5 and qubits are - // {2,3,4,5}, q is less - // than num_qubits but - // not it is in the set - // of qubits. 
- // TODO: maybe not what we want long term - QIREE_EXPECT(q.value == r.value); - // Add measurement instruction + QIREE_EXPECT(q.value < this->num_qubits()); + QIREE_EXPECT(r.value < this->num_results()); results_[r.value] = rtd_qdevice->Measure(q.value, std::nullopt); } @@ -142,61 +135,56 @@ void LightningQuantum::mz(Qubit q, Result r) void LightningQuantum::cx(Qubit q1, Qubit q2) { rtd_qdevice->NamedOperation( - "CNOT", {}, {q1.value, q2.value}); + "CNOT", {}, {static_cast(q1.value), static_cast(q2.value)}); } void LightningQuantum::cnot(Qubit q1, Qubit q2) { rtd_qdevice->NamedOperation( - "CNOT", {}, {q1.value, q2.value}); + "CNOT", {}, {static_cast(q1.value), static_cast(q2.value)}); } void LightningQuantum::cz(Qubit q1, Qubit q2) { rtd_qdevice->NamedOperation( - "CZ", {}, {q1.value, q2.value}); + "CZ", {}, {static_cast(q1.value), static_cast(q2.value)}); } // 2. Local gates void LightningQuantum::h(Qubit q) { - rtd_qdevice->NamedOperation("Hadamard", {}, {q.value}); + rtd_qdevice->NamedOperation("Hadamard", {}, {static_cast(q.value)}); } void LightningQuantum::s(Qubit q) { - rtd_qdevice->NamedOperation("S", {}, {q.value}); + rtd_qdevice->NamedOperation("S", {}, {static_cast(q.value)}); } void LightningQuantum::t(Qubit q) { - rtd_qdevice->NamedOperation("T", {}, {q.value}); + rtd_qdevice->NamedOperation("T", {}, {static_cast(q.value)}); } // 2.1 Pauli gates void LightningQuantum::x(Qubit q) { - rtd_qdevice->NamedOperation("PauliX", {}, {q.value}); + rtd_qdevice->NamedOperation("PauliX", {}, {static_cast(q.value)}); } void LightningQuantum::y(Qubit q) { - rtd_qdevice->NamedOperation("PauliY", {}, {q.value}); + rtd_qdevice->NamedOperation("PauliY", {}, {static_cast(q.value)}); } void LightningQuantum::z(Qubit q) { - rtd_qdevice->NamedOperation("PauliZ", {}, {q.value}); + rtd_qdevice->NamedOperation("PauliZ", {}, {static_cast(q.value)}); } // 2.2 rotation gates void LightningQuantum::rx(double theta, Qubit q) { - rtd_qdevice->NamedOperation("RX", {theta}, 
{q.value}); + rtd_qdevice->NamedOperation("RX", {theta}, {static_cast(q.value)}); } void LightningQuantum::ry(double theta, Qubit q) { - rtd_qdevice->NamedOperation("RY", {theta}, {q.value}); + rtd_qdevice->NamedOperation("RY", {theta}, {static_cast(q.value)}); } void LightningQuantum::rz(double theta, Qubit q) { - rtd_qdevice->NamedOperation("RZ", {theta}, {q.value}); -} - -Qubit LightningQuantum::result_to_qubit(Result r) -{ - return result_to_qubit_[r.value]; + rtd_qdevice->NamedOperation("RZ", {theta}, {static_cast(q.value)}); } } // namespace qiree diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh index 36e35e8..b3856b0 100644 --- a/src/qirlightning/LightningQuantum.hh +++ b/src/qirlightning/LightningQuantum.hh @@ -11,11 +11,14 @@ #include #include +#include "qiree/Assert.hh" #include "qiree/Macros.hh" #include "qiree/QuantumNotImpl.hh" #include "qiree/RuntimeInterface.hh" #include "qiree/Types.hh" -#include "qiree/OutputDistribution.hh" + +// Lightning +#include "QuantumDevice.hpp" namespace qiree { @@ -27,15 +30,23 @@ class LightningQuantum final : virtual public QuantumNotImpl { public: // Construct with number of shots - LightningQuantum(std::ostream& os); + LightningQuantum(std::ostream& os, unsigned long int shots); ~LightningQuantum(); QIREE_DELETE_COPY_MOVE(LightningQuantum); // Delete copy and move constructors + //!@{ //! \name Accessors - size_type num_results() const { return result_to_qubit_.size(); } + + //! Number of qubits in the circuit size_type num_qubits() const { return num_qubits_; } + + //! Number of classical result registers + size_type num_results() const { return results_.size(); } + + // Get the result from a classical register + inline QState get_result(Result r) const; //!@} //!@{ @@ -53,17 +64,6 @@ class LightningQuantum final : virtual public QuantumNotImpl QState read_result(Result) final; //!@} - //!@{ - //! 
\name Utilities for runtime - // Get runtime qubit corresponding to a runtime result - Qubit result_to_qubit(Result); - - // Run the circuit on the accelerator if we have not already. Returns true - // if the circuit was executed. - void execute_if_needed(); - - void print_accelbuf(); - //!@} //!@{ //! \name Circuit construction @@ -90,15 +90,31 @@ class LightningQuantum final : virtual public QuantumNotImpl //!@} private: + //// TYPES //// + + struct Factory; + struct State; //// DATA //// std::ostream& output_; - std::unique_ptr rtd_qdevice; + void* rtd_dylib_handler; + std::unique_ptr rtd_qdevice; std::vector results_; size_type num_qubits_{}; std::vector result_to_qubit_; }; +//---------------------------------------------------------------------------// +/*! + * Get the result from a classical register. + */ +QState LightningQuantum::get_result(Result r) const +{ + QIREE_EXPECT(r.value < results_.size()); + auto result_bool = static_cast(results_[r.value]); + return static_cast(result_bool); +} + } // namespace qiree From 422b3e76340cd475e11a1fc2cc25ea76e38236c3 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Tue, 11 Mar 2025 18:52:54 +0000 Subject: [PATCH 41/64] add seeding --- src/qirlightning/LightningQuantum.cc | 6 +++++- src/qirlightning/LightningQuantum.hh | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc index 4ec6abd..f99d117 100644 --- a/src/qirlightning/LightningQuantum.cc +++ b/src/qirlightning/LightningQuantum.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include "qiree/Assert.hh" @@ -32,7 +33,7 @@ using namespace Catalyst::Runtime; /*! 
* Initialize the Lightning simulator */ -LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os) +LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed) { std::string rtd_lib = RTDLIB; std::string rtd_device = RTDDEVICE; @@ -123,6 +124,9 @@ void LightningQuantum::mz(Qubit q, Result r) { QIREE_EXPECT(q.value < this->num_qubits()); QIREE_EXPECT(r.value < this->num_results()); + std::mt19937 gen(seed_); + seed_++; + rtd_qdevice->SetDevicePRNG(&gen); results_[r.value] = rtd_qdevice->Measure(q.value, std::nullopt); } diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh index b3856b0..24ccd2f 100644 --- a/src/qirlightning/LightningQuantum.hh +++ b/src/qirlightning/LightningQuantum.hh @@ -98,6 +98,7 @@ class LightningQuantum final : virtual public QuantumNotImpl //// DATA //// std::ostream& output_; + unsigned long int seed_{}; void* rtd_dylib_handler; std::unique_ptr rtd_qdevice; std::vector results_; From 7f6598a8b76f32741c09e22e77f5844851ba8b01 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Tue, 11 Mar 2025 18:57:32 +0000 Subject: [PATCH 42/64] update --- src/qirlightning/LightningQuantum.cc | 3 -- src/qirlightning/README.md | 63 ++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 3 deletions(-) create mode 100644 src/qirlightning/README.md diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc index f99d117..ee3691f 100644 --- a/src/qirlightning/LightningQuantum.cc +++ b/src/qirlightning/LightningQuantum.cc @@ -116,9 +116,6 @@ QState LightningQuantum::read_result(Result r) //---------------------------------------------------------------------------// /*! * Map a qubit to a result index. 
- * - * (TODO: find how to link the classical register to the quantum register in - * qsim) */ void LightningQuantum::mz(Qubit q, Result r) { diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md new file mode 100644 index 0000000..6b42581 --- /dev/null +++ b/src/qirlightning/README.md @@ -0,0 +1,63 @@ +# Lightning backend + +## Installing a lightning simulator + +When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulators installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. + +Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. + +Example: +``` +$ pip install pennylane-lightning-kokkos + +$ pip show pennylane-lightning-kokkos +Name: PennyLane_Lightning_Kokkos +Version: 0.40.0 +Summary: PennyLane-Lightning plugin +Home-page: https://github.com/PennyLaneAI/pennylane-lightning +Author: +Author-email: +License: Apache License 2.0 +Location: /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages +Requires: pennylane, pennylane-lightning + +$ ls /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning +... liblightning_kokkos_catalyst.so ... +``` + +You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.qubit simulators. + +## Compilation + +Turn on `QIREE_USE_LIGHTNING` in CMakeLists.txt + +To compile: + +``` +mkdir build; cd build +cmake .. +make + +``` + +## Running the example + +To run: + +``` +$ ./bin/qir-lightning ../examples/bell.ll -s 1 +(Extra debug output: +NamedOperation: Hadamard +NamedOperation: CNOT +Measure +Measure) +{"11":1} +``` + +## Running on other devices + +To run on other devices, e.g.
lightning.gpu, you need to change: +- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu` +- replace `RTDLIB` from `kokkos` to `gpu` +- replace `RTDDEVICE` from `Kokkos` to `GPU` +- Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` From 62460106e75c33f5821fd3ccdf19b9651d3d27af Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 12 Mar 2025 21:51:05 +0000 Subject: [PATCH 43/64] update --- CMakeLists.txt | 2 +- src/qirlightning/CMakeLists.txt | 8 ++ src/qirlightning/LightningQuantum.cc | 51 ++++---- src/qirlightning/LightningQuantum.hh | 2 +- src/qirlightning/README.md | 36 ++++-- .../include/DynamicLibraryLoader.hpp | 79 ------------ .../catalyst_runtime/include/RuntimeCAPI.h | 112 ------------------ .../simple_demo/test_rt_device.cpp | 1 + 8 files changed, 62 insertions(+), 229 deletions(-) delete mode 100644 src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp delete mode 100644 src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h diff --git a/CMakeLists.txt b/CMakeLists.txt index b0a21a9..5ecc4a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ endmacro() option(QIREE_BUILD_DOCS "Build QIR-EE documentation" OFF) option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" ON) option(QIREE_BUILD_EXAMPLES "Build QIR-EE examples" OFF) -option(QIREE_USE_QSIM "Download and build Google qsim backend" OFF) +option(QIREE_USE_QSIM "Download and build Google qsim backend" ON) option(QIREE_USE_XACC "Build XACC interface" OFF) option(QIREE_USE_LIGHTNING "Build Pennylane Lightning backend" ON) diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt index 0a3eab5..31cfd8c 100644 --- a/src/qirlightning/CMakeLists.txt +++ b/src/qirlightning/CMakeLists.txt @@ -7,12 +7,20 @@ # Include directories for catalyst_runtime 
include_directories("${CMAKE_CURRENT_SOURCE_DIR}/catalyst_runtime/include") +set(RTDLIB_PATH "/home/joseph/work/qiree/pennylane-lightning/build_lightning_qubit/liblightning_qubit_catalyst.so") +set(RTDDEVICE_NAME "LightningSimulator") + # Adding lightning as a library to qiree qiree_add_library(qirlightning LightningQuantum.cc LightningDefaultRuntime.cc ) +target_compile_definitions(qirlightning PRIVATE + RTDLIB="${RTDLIB_PATH}" + RTDDEVICE="${RTDDEVICE_NAME}" +) + #Link the lightning library to qiree and any other relevant libraries target_link_libraries(qirlightning PUBLIC QIREE::qiree # Link to qiree diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc index ee3691f..b8bc4ad 100644 --- a/src/qirlightning/LightningQuantum.cc +++ b/src/qirlightning/LightningQuantum.cc @@ -19,10 +19,6 @@ #include "qiree/Assert.hh" -#define RTDLIB \ - "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \ - "pennylane_lightning/liblightning_kokkos_catalyst.so"; -#define RTDDEVICE "LightningKokkosSimulator"; extern "C" Catalyst::Runtime::QuantumDevice* GenericDeviceFactory(char const* kwargs); namespace qiree @@ -35,6 +31,27 @@ using namespace Catalyst::Runtime; */ LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed) { +} + +//---------------------------------------------------------------------------// +//! Default destructor +LightningQuantum::~LightningQuantum() = default; + +//---------------------------------------------------------------------------// +/*! 
+ * Prepare to build a quantum circuit for an entry point + */ +void LightningQuantum::set_up(EntryPointAttrs const& attrs) +{ + QIREE_VALIDATE(attrs.required_num_qubits > 0, + << "input is not a quantum program"); + num_qubits_ = attrs.required_num_qubits; // Set the number of qubits + results_.resize(attrs.required_num_results); + + + // We load the library every time because we currently have an issue + // with releasing qubits in Catalyst. + // Once that is fixed, this can go to the constructor to execute once std::string rtd_lib = RTDLIB; std::string rtd_device = RTDDEVICE; std::string kwargs = {}; @@ -60,23 +77,6 @@ LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : o rtd_qdevice = std::unique_ptr( reinterpret_cast(f_ptr)( rtd_kwargs.c_str())); -} - -//---------------------------------------------------------------------------// -//! Default destructor -LightningQuantum::~LightningQuantum() = default; - -//---------------------------------------------------------------------------// -/*! - * Prepare to build a quantum circuit for an entry point - */ -void LightningQuantum::set_up(EntryPointAttrs const& attrs) -{ - QIREE_VALIDATE(attrs.required_num_qubits > 0, - << "input is not a quantum program"); - - num_qubits_ = attrs.required_num_qubits; // Set the number of qubits - results_.resize(attrs.required_num_results); rtd_qdevice->AllocateQubits(num_qubits_); } @@ -87,10 +87,12 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs) */ void LightningQuantum::tear_down() { + // This should go to the destructor once we fix the issue with releasing qubits if (rtd_dylib_handler) { dlclose(rtd_dylib_handler); - } + }; + } //---------------------------------------------------------------------------// @@ -106,7 +108,7 @@ void LightningQuantum::reset(Qubit q) //----------------------------------------------------------------------------// /*! - * Read the value of a result. This utilizes the new BufferManager. 
+ * Read the value of a result. */ QState LightningQuantum::read_result(Result r) { @@ -124,7 +126,8 @@ void LightningQuantum::mz(Qubit q, Result r) std::mt19937 gen(seed_); seed_++; rtd_qdevice->SetDevicePRNG(&gen); - results_[r.value] = rtd_qdevice->Measure(q.value, std::nullopt); + auto result = rtd_qdevice->Measure(static_cast(q.value), std::nullopt); + results_[r.value] = *result; } //---------------------------------------------------------------------------// diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh index 24ccd2f..2c2728b 100644 --- a/src/qirlightning/LightningQuantum.hh +++ b/src/qirlightning/LightningQuantum.hh @@ -68,7 +68,7 @@ class LightningQuantum final : virtual public QuantumNotImpl //!@{ //! \name Circuit construction // void ccx(Qubit, Qubit) final; - void ccnot(Qubit, Qubit, Qubit); // TODO: not in examples or qir runner + void ccnot(Qubit, Qubit, Qubit); void cnot(Qubit, Qubit) final; void cx(Qubit, Qubit) final; // void cy(Qubit, Qubit) final; diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md index 6b42581..e3498ff 100644 --- a/src/qirlightning/README.md +++ b/src/qirlightning/README.md @@ -2,11 +2,11 @@ ## Installing a lightning simulator -When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. +More information on installing Pennylane Lightning simulators can be found in [lightning repository](https://github.com/PennyLaneAI/pennylane-lightning). -Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. 
+### Quick start +The easiest way to get started is to install a Lightning simulator from PyPI via pip: -Example: ``` $ pip install pennylane-lightning-kokkos @@ -20,20 +20,37 @@ Author-email: License: Apache License 2.0 Location: /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages Requires: pennylane, pennylane-lightning +``` +Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. + +When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulators installed. These are named `liblightning_qubit_catalyst.so`/`liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` respectively. +Example: +``` $ ls /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning ... liblightning_kokkos_catalyst.so ... ``` You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.gpu simulators. +### Compiling Lightning from Source + +The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightning) contains information on how to install Lightning simulators from source. This will be necessary for e.g. Kokkos with HIP backend. + ## Compilation -Turn on `QIREE_USE_LIGHTNING` in CMakeLists.txt +- Set `QIREE_USE_LIGHTNING` to `ON` in `qiree/CMakeLists.txt` +- Specify the simulator path and name in `qiree/src/qirlightning/CMakeLists.txt`, set: + - `RTDLIB_PATH` to the path of the simulator `.so` + - `RTDDEVICE_NAME` to `LightningSimulator`/`LightningKokkosSimulator`/`LightningGPUSimulator` +These could also be set in cmake using the variables `-D...` + +Note: when running on GPU, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e.
`LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` To compile: ``` +cd qiree/ mkdir build; cd build cmake .. make @@ -45,16 +62,11 @@ make To run: ``` -$ ./bin/qir-lightning ../examples/bell.ll -s 1 -(Extra debug output: -NamedOperation: Hadamard -NamedOperation: CNOT -Measure -Measure) -{"11":1} +$ ./bin/qir-lightning ../examples/bell.ll -s 100 +{"00":43,"11":57} ``` -## Running on other devices +## Running on GPU To run on other devices, e.g. lightning.gpu, you need to change: - Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu` diff --git a/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp b/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp deleted file mode 100644 index 1c25ab8..0000000 --- a/src/qirlightning/catalyst_runtime/include/DynamicLibraryLoader.hpp +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright 2024 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "Exception.hpp" - -/** - * @brief A utility struct to handle opening, closing and retrieving symbols - * from dynamic shared objects. 
- */ -struct DynamicLibraryLoader { - void *handle; - - DynamicLibraryLoader(std::string_view library_name, int mode = RTLD_LAZY | RTLD_NODELETE) - { - // Load the shared library - handle = dlopen(library_name.data(), mode); - if (!handle) { - const char *err_msg = dlerror(); - RT_FAIL(err_msg); - } - } - - ~DynamicLibraryLoader() - { - if (handle) { - // TODO: This is non-sensical. - // We are using RTLD_NODELETE, why would calling dlclose have a side-effect? - // Worst of all, the side-effect is not in our code. - // When we have dlclose, everything works well the first time. - // However, when trying to compile a second time, we will find that jaxlib will now - // raise a StopIteration exception. This doesn't really make any sense. - // My guess is that somehow dlclosing here will unload a the StopIteration symbol (?) - // rebind it with another equivalent (but with different id?) - // and then the MLIR python bindings are unable to catch it and stop the iteration and - // it gets propagated upwards. - // - // Is not calling dlclose bad? - // A little bit, although dlclose implies intent and does not create any requirements - // upon the implementation. 
See here: - // https://pubs.opengroup.org/onlinepubs/000095399/functions/dlclose.html - // https://github.com/pybind/pybind11/blob/75e48c5f959b4f0a49d8c664e059b6fb4b497102/include/pybind11/detail/internals.h#L108-L113 - // -#ifndef __APPLE__ - dlclose(handle); -#endif - } - } - - // Get symbol from library - template T getSymbol(std::string_view symbol_name) - { - // Clear any existing errors - dlerror(); - - // Retrieve symbol - T symbol = reinterpret_cast(dlsym(handle, symbol_name.data())); - const char *err_msg = dlerror(); - if (err_msg != nullptr) { - RT_FAIL(err_msg); - } - return symbol; - } -}; diff --git a/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h b/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h deleted file mode 100644 index b0f63ca..0000000 --- a/src/qirlightning/catalyst_runtime/include/RuntimeCAPI.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2022-2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#ifndef RUNTIMECAPI_H -#define RUNTIMECAPI_H - -#include "Types.h" - -#ifdef __cplusplus -extern "C" { -#endif - -// Quantum Runtime Instructions -void __catalyst__rt__fail_cstr(const char *); -void __catalyst__rt__initialize(uint32_t *seed); -void __catalyst__rt__device_init(int8_t *, int8_t *, int8_t *, int64_t shots); -void __catalyst__rt__device_release(); -void __catalyst__rt__finalize(); -void __catalyst__rt__toggle_recorder(bool); -void __catalyst__rt__print_state(); -void __catalyst__rt__print_tensor(OpaqueMemRefT *, bool); -void __catalyst__rt__print_string(char *); -void __catalyst__rt__assert_bool(bool, char *); -int64_t __catalyst__rt__array_get_size_1d(QirArray *); -int8_t *__catalyst__rt__array_get_element_ptr_1d(QirArray *, int64_t); - -QUBIT *__catalyst__rt__qubit_allocate(); -QirArray *__catalyst__rt__qubit_allocate_array(int64_t); -void __catalyst__rt__qubit_release(QUBIT *); -void __catalyst__rt__qubit_release_array(QirArray *); - -int64_t __catalyst__rt__num_qubits(); - -bool __catalyst__rt__result_equal(RESULT *, RESULT *); -RESULT *__catalyst__rt__result_get_one(); -RESULT *__catalyst__rt__result_get_zero(); - -// Quantum Gate Set Instructions -void __catalyst__qis__SetState(MemRefT_CplxT_double_1d *, uint64_t, ...); -void __catalyst__qis__SetBasisState(MemRefT_int8_1d *, uint64_t, ...); -void __catalyst__qis__Identity(QUBIT *, const Modifiers *); -void __catalyst__qis__PauliX(QUBIT *, const Modifiers *); -void __catalyst__qis__PauliY(QUBIT *, const Modifiers *); -void __catalyst__qis__PauliZ(QUBIT *, const Modifiers *); -void __catalyst__qis__Hadamard(QUBIT *, const Modifiers *); -void __catalyst__qis__S(QUBIT *, const Modifiers *); -void __catalyst__qis__T(QUBIT *, const Modifiers *); -void __catalyst__qis__PhaseShift(double, QUBIT *, const Modifiers *); -void __catalyst__qis__RX(double, QUBIT *, const Modifiers *); -void __catalyst__qis__RY(double, QUBIT *, const Modifiers *); -void __catalyst__qis__RZ(double, QUBIT *, 
const Modifiers *); -void __catalyst__qis__Rot(double, double, double, QUBIT *, const Modifiers *); -void __catalyst__qis__CNOT(QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__CY(QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__CZ(QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__SWAP(QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__IsingXX(double, QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__IsingYY(double, QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__IsingXY(double, QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__IsingZZ(double, QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__ControlledPhaseShift(double, QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__CRX(double, QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__CRY(double, QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__CRZ(double, QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__CRot(double, double, double, QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__CSWAP(QUBIT *, QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__Toffoli(QUBIT *, QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__MultiRZ(double, const Modifiers *, int64_t, /*qubits*/...); -void __catalyst__qis__GlobalPhase(double, const Modifiers *); -void __catalyst__qis__ISWAP(QUBIT *, QUBIT *, const Modifiers *); -void __catalyst__qis__PSWAP(double, QUBIT *, QUBIT *, const Modifiers *); - -// Struct pointer arguments for these instructions represent real arguments, -// as passing structs by value is too unreliable / compiler dependant. 
-void __catalyst__qis__QubitUnitary(MemRefT_CplxT_double_2d *, const Modifiers *, int64_t, - /*qubits*/...); - -ObsIdType __catalyst__qis__NamedObs(int64_t, QUBIT *); -ObsIdType __catalyst__qis__HermitianObs(MemRefT_CplxT_double_2d *, int64_t, /*qubits*/...); -ObsIdType __catalyst__qis__TensorObs(int64_t, /*obsKeys*/...); -ObsIdType __catalyst__qis__HamiltonianObs(MemRefT_double_1d *, int64_t, /*obsKeys*/...); - -// Struct pointers arguments here represent return values. -RESULT *__catalyst__qis__Measure(QUBIT *, int32_t); -double __catalyst__qis__Expval(ObsIdType); -double __catalyst__qis__Variance(ObsIdType); -void __catalyst__qis__Probs(MemRefT_double_1d *, int64_t, /*qubits*/...); -void __catalyst__qis__Sample(MemRefT_double_2d *, int64_t, /*qubits*/...); -void __catalyst__qis__Counts(PairT_MemRefT_double_int64_1d *, int64_t, /*qubits*/...); -void __catalyst__qis__State(MemRefT_CplxT_double_1d *, int64_t, /*qubits*/...); -void __catalyst__qis__Gradient(int64_t, /*results*/...); -void __catalyst__qis__Gradient_params(MemRefT_int64_1d *, int64_t, /*results*/...); - -void __catalyst__host__rt__unrecoverable_error(); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp index 4d2736f..80ee38c 100644 --- a/src/qirlightning/simple_demo/test_rt_device.cpp +++ b/src/qirlightning/simple_demo/test_rt_device.cpp @@ -3,6 +3,7 @@ #include "QuantumDevice.hpp" // Runtime libraries (kokkos/GPU/qubit etc.) 
+// Update these paths to point to the correct library #define RTDLIB \ "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \ "pennylane_lightning/liblightning_gpu_catalyst.so"; From 88dbe30a3e22bcec0af471d690d3608442a45e25 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 12 Mar 2025 21:52:28 +0000 Subject: [PATCH 44/64] update readme --- src/qirlightning/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md index e3498ff..5b5bb39 100644 --- a/src/qirlightning/README.md +++ b/src/qirlightning/README.md @@ -1,4 +1,4 @@ -# Lightning backend +# QIR-EE with Lightning simulator backend ## Installing a lightning simulator @@ -37,7 +37,7 @@ You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for ligh The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightning) contains information on how to install Lightning simulators from source. This will necessary for e.g. Kokkos with HIP backend. -## Compilation +## Compile QIR-EE with Lightning backend - Set `QIREE_USE_LIGHTNING` to `ON` in `qiree/CMakeLists.txt` - Specify the simulator path and name in `qiree/src/qirlightning/CMakeLists`, set: From 99800fa6c769e13c016cab5b80537a5a715ef5a5 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 12 Mar 2025 21:58:14 +0000 Subject: [PATCH 45/64] update readme --- src/qirlightning/README.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md index 5b5bb39..12b4644 100644 --- a/src/qirlightning/README.md +++ b/src/qirlightning/README.md @@ -66,10 +66,3 @@ $ ./bin/qir-lightning ../examples/bell.ll -s 100 {"00":43,"11":57} ``` -## Running on GPU - -To run on other devices, e.g. 
lightning.gpu, you need to change: -- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu` -- replace `RTDLIB` from `kokkos` to `gpu` -- replace `RTDDEVICE` from `Kokkos` to `GPU` -- Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` From 3eb300210562dba6fe921e2c59a6c54b3156812f Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Thu, 13 Mar 2025 19:06:22 +0000 Subject: [PATCH 46/64] update --- app/CMakeLists.txt | 3 - src/qirlightning/CMakeLists.txt | 43 +++++++++-- src/qirlightning/LightningQuantum.cc | 58 +++++++-------- src/qirlightning/LightningQuantum.hh | 1 + src/qirlightning/README.md | 11 +-- src/qirlightning/simple_demo/README.md | 7 +- .../snapshot_catalyst_runtime}/README.rst | 0 .../include/DataView.hpp | 0 .../include/Exception.hpp | 0 .../include/QuantumDevice.hpp | 0 .../include/Types.h | 0 .../simple_demo/test_rt_device.cpp | 4 +- src/qirlightning/support_catalyst.cmake | 74 +++++++++++++++++++ 13 files changed, 152 insertions(+), 49 deletions(-) rename src/qirlightning/{catalyst_runtime => simple_demo/snapshot_catalyst_runtime}/README.rst (100%) rename src/qirlightning/{catalyst_runtime => simple_demo/snapshot_catalyst_runtime}/include/DataView.hpp (100%) rename src/qirlightning/{catalyst_runtime => simple_demo/snapshot_catalyst_runtime}/include/Exception.hpp (100%) rename src/qirlightning/{catalyst_runtime => simple_demo/snapshot_catalyst_runtime}/include/QuantumDevice.hpp (100%) rename src/qirlightning/{catalyst_runtime => simple_demo/snapshot_catalyst_runtime}/include/Types.h (100%) create mode 100644 src/qirlightning/support_catalyst.cmake diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index 32bef70..3529ba3 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -40,9 +40,6 @@ endif() 
#-----------------------------------------------------------------------------# if(QIREE_USE_LIGHTNING) - - # Include directories for catalyst_runtime - include_directories("${CMAKE_CURRENT_SOURCE_DIR}/../src/qirlightning/catalyst_runtime/include") qiree_add_executable(qir-lightning qir-lightning.cc ) diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt index 31cfd8c..925f33c 100644 --- a/src/qirlightning/CMakeLists.txt +++ b/src/qirlightning/CMakeLists.txt @@ -4,16 +4,47 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #----------------------------------------------------------------------------# -# Include directories for catalyst_runtime -include_directories("${CMAKE_CURRENT_SOURCE_DIR}/catalyst_runtime/include") +# Fetch Catalyst runtime include files +include(FetchContent) + +include("${CMAKE_CURRENT_SOURCE_DIR}/support_catalyst.cmake") +FindCatalyst(qirlightning) + +# Set the path to the lightning simulator shared library +if(DEFINED ENV{LIGHTNING_SIM_PATH}) + set(RTDLIB_PATH "$ENV{LIGHTNING_SIM_PATH}") + message(STATUS "RTDLIB_PATH set from environment variable LIGHTNING_SIM_PATH: ${RTDLIB_PATH}") +else() + # Update hard coded path is not found in environment + set(RTDLIB_PATH "/home/joseph/work/qiree/pennylane-lightning/build_lightning_kokkos/liblightning_kokkos_catalyst.so") + message(STATUS "RTDLIB_PATH set to default value: ${RTDLIB_PATH}") +endif() + +# Set the device name for the lightning simulator +execute_process( + COMMAND nm -DC "${RTDLIB_PATH}" | grep " Factory" + OUTPUT_VARIABLE GREP_OUTPUT + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + +if(GREP_OUTPUT) + string(REGEX MATCH "T (.*)Factory" SYMBOL_MATCH "${GREP_OUTPUT}") + if(SYMBOL_MATCH) + string(REGEX REPLACE "T (.*)Factory" "\\1" RTDDEVICE_NAME "${SYMBOL_MATCH}") + message(STATUS "Found Lightning Simulator. 
Extracted RTDDEVICE_NAME: ${RTDDEVICE_NAME}") + else() + message(WARNING "Symbol 'Factory' found, but regex failed to extract.") + endif() +else() + message(WARNING "Symbol 'Factory' not found in ${RTDLIB_PATH}") +endif() -set(RTDLIB_PATH "/home/joseph/work/qiree/pennylane-lightning/build_lightning_qubit/liblightning_qubit_catalyst.so") -set(RTDDEVICE_NAME "LightningSimulator") # Adding lightning as a library to qiree qiree_add_library(qirlightning - LightningQuantum.cc - LightningDefaultRuntime.cc +LightningQuantum.cc +LightningDefaultRuntime.cc ) target_compile_definitions(qirlightning PRIVATE diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc index b8bc4ad..b4597b7 100644 --- a/src/qirlightning/LightningQuantum.cc +++ b/src/qirlightning/LightningQuantum.cc @@ -31,27 +31,6 @@ using namespace Catalyst::Runtime; */ LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed) { -} - -//---------------------------------------------------------------------------// -//! Default destructor -LightningQuantum::~LightningQuantum() = default; - -//---------------------------------------------------------------------------// -/*! - * Prepare to build a quantum circuit for an entry point - */ -void LightningQuantum::set_up(EntryPointAttrs const& attrs) -{ - QIREE_VALIDATE(attrs.required_num_qubits > 0, - << "input is not a quantum program"); - num_qubits_ = attrs.required_num_qubits; // Set the number of qubits - results_.resize(attrs.required_num_results); - - - // We load the library every time because we currently have an issue - // with releasing qubits in Catalyst. 
- // Once that is fixed, this can go to the constructor to execute once std::string rtd_lib = RTDLIB; std::string rtd_device = RTDDEVICE; std::string kwargs = {}; @@ -65,17 +44,40 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs) // Find device factory std::string factory_name = rtd_device + "Factory"; - void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str()); + factory_f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str()); - if (!f_ptr) + if (!factory_f_ptr) { dlclose(rtd_dylib_handler); throw std::runtime_error("Failed to find factory function: " + factory_name); } +} + +//---------------------------------------------------------------------------// +//! Default destructor +LightningQuantum::~LightningQuantum() { + + if (rtd_dylib_handler) + { + dlclose(rtd_dylib_handler); + }; +}; + +//---------------------------------------------------------------------------// +/*! + * Prepare to build a quantum circuit for an entry point + */ +void LightningQuantum::set_up(EntryPointAttrs const& attrs) +{ + QIREE_VALIDATE(attrs.required_num_qubits > 0, + << "input is not a quantum program"); + num_qubits_ = attrs.required_num_qubits; // Set the number of qubits + results_.resize(attrs.required_num_results); + std::string rtd_kwargs = {}; rtd_qdevice = std::unique_ptr( - reinterpret_cast(f_ptr)( + reinterpret_cast(factory_f_ptr)( rtd_kwargs.c_str())); rtd_qdevice->AllocateQubits(num_qubits_); @@ -87,11 +89,6 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs) */ void LightningQuantum::tear_down() { - // This should go to the destructor once we fix the issue with releasing qubits - if (rtd_dylib_handler) - { - dlclose(rtd_dylib_handler); - }; } @@ -103,7 +100,8 @@ void LightningQuantum::reset(Qubit q) { std::vector data = {0}; DataView state(data); - std::vector wires = {static_cast(q.value)}; rtd_qdevice->SetBasisState(state, wires); + std::vector wires = {static_cast(q.value)}; + rtd_qdevice->SetBasisState(state, wires); } 
//----------------------------------------------------------------------------// diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh index 2c2728b..96d49d5 100644 --- a/src/qirlightning/LightningQuantum.hh +++ b/src/qirlightning/LightningQuantum.hh @@ -100,6 +100,7 @@ class LightningQuantum final : virtual public QuantumNotImpl std::ostream& output_; unsigned long int seed_{}; void* rtd_dylib_handler; + void* factory_f_ptr; std::unique_ptr rtd_qdevice; std::vector results_; diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md index 12b4644..633aeea 100644 --- a/src/qirlightning/README.md +++ b/src/qirlightning/README.md @@ -40,10 +40,11 @@ The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightni ## Compile QIR-EE with Lightning backend - Set `QIREE_USE_LIGHTNING` to `ON` in `qiree/CMakeLists.txt` -- Specify the simulator path and name in `qiree/src/qirlightning/CMakeLists`, set: - - `RTDLIB_PATH` to the path of the simulator `.so` - - `RTDDEVICE_NAME` to `LightningSimulator`/`LightningKokkosSimulator`/`LightningGPUSimulator` -These could also be set in cmake using the variables `-D...` +- Set the environment variable `LIGHTNING_SIM_PATH` to the shared object of the Lightning Simulator, e.g. + +``` +export LIGHTNING_SIM_PATH=/home/joseph/work/qiree/pennylane-lightning/build_lightning_qubit/liblightning_qubit_catalyst.so +``` Note: when running on GPU, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. 
`LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` @@ -59,7 +60,7 @@ make ## Running the example -To run: +To run (in the `build` directory): ``` $ ./bin/qir-lightning ../examples/bell.ll -s 100 diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md index 7aecc9d..2e328b1 100644 --- a/src/qirlightning/simple_demo/README.md +++ b/src/qirlightning/simple_demo/README.md @@ -2,13 +2,13 @@ This is a super simple demo for driving Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present). -The only extra header files required are the `../catalyst_runtime/include`, which contains the include files from the [Catalyst Runtime ](https://github.com/PennyLaneAI/catalyst/tree/main/runtime/include) (for the QuantumDevice interface). +Some Catalyst include files are copied here for convenience - they are in `./snapshot_catalyst_runtime/include`. These are required for the QuantumDevice interface. For the qiree source, these files are fetched automatically during CMake, and these are not used. ## Installing a lightning simulator When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. -Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. 
+To get started, run `pip install pennylane` or `pip install pennylane-lightning` - this will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. Example: ``` @@ -36,7 +36,7 @@ You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for ligh To compile: ``` -$ clang++ --std=c++20 test_rt_device.cpp -I../catalyst_runtime/include -o test_rt_device.out +$ clang++ --std=c++20 test_rt_device.cpp -I./snapshot_catalyst_runtime/include -o test_rt_device.out ``` ## Running the example @@ -61,6 +61,7 @@ Measure on wire 0 = 0 To run on other devices, e.g. lightning.gpu, you need to change: - Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu` +In the c++ file: - replace `RTDLIB` from `kokkos` to `gpu` - replace `RTDDEVICE` from `Kokkos` to `GPU` - Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` diff --git a/src/qirlightning/catalyst_runtime/README.rst b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst similarity index 100% rename from src/qirlightning/catalyst_runtime/README.rst rename to src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst diff --git a/src/qirlightning/catalyst_runtime/include/DataView.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp similarity index 100% rename from src/qirlightning/catalyst_runtime/include/DataView.hpp rename to src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp diff --git a/src/qirlightning/catalyst_runtime/include/Exception.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp similarity index 100% rename from src/qirlightning/catalyst_runtime/include/Exception.hpp rename to 
src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp diff --git a/src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp similarity index 100% rename from src/qirlightning/catalyst_runtime/include/QuantumDevice.hpp rename to src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp diff --git a/src/qirlightning/catalyst_runtime/include/Types.h b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h similarity index 100% rename from src/qirlightning/catalyst_runtime/include/Types.h rename to src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp index 80ee38c..091b461 100644 --- a/src/qirlightning/simple_demo/test_rt_device.cpp +++ b/src/qirlightning/simple_demo/test_rt_device.cpp @@ -6,8 +6,8 @@ // Update these paths to point to the correct library #define RTDLIB \ "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \ - "pennylane_lightning/liblightning_gpu_catalyst.so"; -#define RTDDEVICE "LightningGPUSimulator"; + "pennylane_lightning/liblightning_kokkos_catalyst.so"; +#define RTDDEVICE "LightningKokkosSimulator"; extern "C" Catalyst::Runtime::QuantumDevice* GenericDeviceFactory(char const* kwargs); diff --git a/src/qirlightning/support_catalyst.cmake b/src/qirlightning/support_catalyst.cmake new file mode 100644 index 0000000..5932f06 --- /dev/null +++ b/src/qirlightning/support_catalyst.cmake @@ -0,0 +1,74 @@ +############################################################################################### +# This file provides macros to process Catalyst. 
+############################################################################################### + +# Include this only once +include_guard() + +macro(FindCatalyst target_name) + if(LIGHTNING_CATALYST_SRC_PATH) + if(NOT IS_ABSOLUTE ${LIGHTNING_CATALYST_SRC_PATH}) + message(FATAL_ERROR " LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH} must be set to an absolute path") + endif() + if(CATALYST_GIT_TAG) + message(WARN " Setting `LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH}` overrides `CATALYST_GIT_TAG=${CATALYST_GIT_TAG}`") + endif() + + # Acquire local git hash and use for CATALYST_GIT_TAG + execute_process(COMMAND git rev-parse --short HEAD + WORKING_DIRECTORY ${LIGHTNING_CATALYST_SRC_PATH} + OUTPUT_VARIABLE CATALYST_GIT_TAG + ) + message(INFO " Building against local Catalyst - path: ${LIGHTNING_CATALYST_SRC_PATH} - GIT TAG: ${CATALYST_GIT_TAG}") + + target_include_directories(${target_name} PUBLIC ${LIGHTNING_CATALYST_SRC_PATH}/runtime/lib/backend/common) + target_include_directories(${target_name} PUBLIC ${LIGHTNING_CATALYST_SRC_PATH}/runtime/include) + + else() + if(NOT CATALYST_GIT_TAG) + set(CATALYST_GIT_TAG "main" CACHE STRING "GIT_TAG value to build Catalyst") + endif() + message(INFO " Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}") + + # Fetching /lib/backend/common hpp headers + set(LIB_BACKEND_COMMON_HEADERS CacheManager.hpp + QubitManager.hpp + Utils.hpp + ) + + foreach(HEADER ${LIB_BACKEND_COMMON_HEADERS}) + string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME ${HEADER}) + FetchContent_Declare( + ${HEADER_NAME} + URL https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/lib/backend/common/${HEADER} + DOWNLOAD_NO_EXTRACT True + SOURCE_DIR ../../include + ) + + FetchContent_MakeAvailable(${HEADER_NAME}) + endforeach() + + # Fetching include hpp headers + set(INCLUDE_HEADERS DataView.hpp + Exception.hpp + QuantumDevice.hpp + RuntimeCAPI.h + Types.h + ) + + foreach(HEADER ${INCLUDE_HEADERS}) + 
string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME ${HEADER}) + FetchContent_Declare( + ${HEADER_NAME} + URL https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/include/${HEADER} + DOWNLOAD_NO_EXTRACT True + SOURCE_DIR ../../include + ) + + FetchContent_MakeAvailable(${HEADER_NAME}) + endforeach() + + #target_include_directories(${target_name} PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/../../include) + + endif() +endmacro() From b8ec0711c0e1dee26e4b62d4f7b11b31c6a1c7e6 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Thu, 13 Mar 2025 19:12:05 +0000 Subject: [PATCH 47/64] remove paths --- src/qirlightning/README.md | 6 +++--- src/qirlightning/simple_demo/README.md | 4 ++-- src/qirlightning/simple_demo/test_rt_device.cpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md index 633aeea..45d47ae 100644 --- a/src/qirlightning/README.md +++ b/src/qirlightning/README.md @@ -18,7 +18,7 @@ Home-page: https://github.com/PennyLaneAI/pennylane-lightning Author: Author-email: License: Apache License 2.0 -Location: /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages +Location: Requires: pennylane, pennylane-lightning ``` Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. @@ -27,7 +27,7 @@ When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-l Example: ``` -$ ls /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning +$ ls ... liblightning_kokkos_catalyst.so ... ``` @@ -43,7 +43,7 @@ The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightni - Set the environment variable `LIGHTNING_SIM_PATH` to the shared object of the Lightning Simulator, e.g. 
``` -export LIGHTNING_SIM_PATH=/home/joseph/work/qiree/pennylane-lightning/build_lightning_qubit/liblightning_qubit_catalyst.so +export LIGHTNING_SIM_PATH=/pennylane_lightning/liblightning_qubit_catalyst.so ``` Note: when running on GPU, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md index 2e328b1..1d3e41f 100644 --- a/src/qirlightning/simple_demo/README.md +++ b/src/qirlightning/simple_demo/README.md @@ -22,10 +22,10 @@ Home-page: https://github.com/PennyLaneAI/pennylane-lightning Author: Author-email: License: Apache License 2.0 -Location: /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages +Location: Requires: pennylane, pennylane-lightning -$ ls /home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/pennylane_lightning +$ ls /pennylane_lightning ... liblightning_kokkos_catalyst.so ... ``` diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp index 091b461..4ba8f75 100644 --- a/src/qirlightning/simple_demo/test_rt_device.cpp +++ b/src/qirlightning/simple_demo/test_rt_device.cpp @@ -5,7 +5,7 @@ // Runtime libraries (kokkos/GPU/qubit etc.) 
// Update these paths to point to the correct library #define RTDLIB \ - "/home/joseph/work/qiree/venv-qiree/lib/python3.10/site-packages/" \ + "" \ "pennylane_lightning/liblightning_kokkos_catalyst.so"; #define RTDDEVICE "LightningKokkosSimulator"; From ce5d7642f76983fc38cbc004af261c2cc1d77f4b Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Thu, 13 Mar 2025 19:17:50 +0000 Subject: [PATCH 48/64] update --- src/qirlightning/simple_demo/README.md | 2 ++ src/qirlightning/simple_demo/test_rt_device.cpp | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md index 1d3e41f..bbd7d27 100644 --- a/src/qirlightning/simple_demo/README.md +++ b/src/qirlightning/simple_demo/README.md @@ -33,6 +33,8 @@ You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for ligh ## Compilation +First update the `RTDLIB` in `test_rt_device.cpp` to the local path where lightning is installed (i.e. `` from above). + To compile: ``` diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp index 4ba8f75..c50ab92 100644 --- a/src/qirlightning/simple_demo/test_rt_device.cpp +++ b/src/qirlightning/simple_demo/test_rt_device.cpp @@ -5,7 +5,7 @@ // Runtime libraries (kokkos/GPU/qubit etc.) 
// Update these paths to point to the correct library #define RTDLIB \ - "" \ + "/" \ "pennylane_lightning/liblightning_kokkos_catalyst.so"; #define RTDDEVICE "LightningKokkosSimulator"; From b7b6aeeb6d453fc463db9583de7607d9da548eb7 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Mon, 5 May 2025 15:39:10 +0000 Subject: [PATCH 49/64] update installation instructions --- src/qirlightning/CMakeLists.txt | 9 ++++----- src/qirlightning/README.md | 6 ++++-- src/qirlightning/support_catalyst.cmake | 4 +++- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt index 925f33c..2edbdf9 100644 --- a/src/qirlightning/CMakeLists.txt +++ b/src/qirlightning/CMakeLists.txt @@ -15,9 +15,8 @@ if(DEFINED ENV{LIGHTNING_SIM_PATH}) set(RTDLIB_PATH "$ENV{LIGHTNING_SIM_PATH}") message(STATUS "RTDLIB_PATH set from environment variable LIGHTNING_SIM_PATH: ${RTDLIB_PATH}") else() - # Update hard coded path is not found in environment - set(RTDLIB_PATH "/home/joseph/work/qiree/pennylane-lightning/build_lightning_kokkos/liblightning_kokkos_catalyst.so") - message(STATUS "RTDLIB_PATH set to default value: ${RTDLIB_PATH}") + # Throw an error if the environment variable is not defined + message(FATAL_ERROR "Environment variable LIGHTNING_SIM_PATH is not defined. Please set it to the path of the Lightning simulator shared library.") endif() # Set the device name for the lightning simulator @@ -34,10 +33,10 @@ if(GREP_OUTPUT) string(REGEX REPLACE "T (.*)Factory" "\\1" RTDDEVICE_NAME "${SYMBOL_MATCH}") message(STATUS "Found Lightning Simulator. Extracted RTDDEVICE_NAME: ${RTDDEVICE_NAME}") else() - message(WARNING "Symbol 'Factory' found, but regex failed to extract.") + message(FATAL_ERROR "Symbol 'Factory' found, but regex failed to extract.") endif() else() - message(WARNING "Symbol 'Factory' not found in ${RTDLIB_PATH}") + message(FATAL_ERROR "Symbol 'Factory' not found in ${RTDLIB_PATH}. 
Please ensure LIGHTNING_SIM_PATH is set correctly.") endif() diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md index 45d47ae..7719b7e 100644 --- a/src/qirlightning/README.md +++ b/src/qirlightning/README.md @@ -43,10 +43,12 @@ The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightni - Set the environment variable `LIGHTNING_SIM_PATH` to the shared object of the Lightning Simulator, e.g. ``` -export LIGHTNING_SIM_PATH=/pennylane_lightning/liblightning_qubit_catalyst.so +export LIGHTNING_SIM_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')")/liblightning_kokkos_catalyst.so ``` -Note: when running on GPU, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` +Note: +- replace `liblightning_kokkos_catalyst.so` with `liblightning_qubit_catalyst.so` or `liblightning_GPU_catalyst.so` if required. +- when running on `GPU`, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e.
`LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` To compile: diff --git a/src/qirlightning/support_catalyst.cmake b/src/qirlightning/support_catalyst.cmake index 5932f06..f46363c 100644 --- a/src/qirlightning/support_catalyst.cmake +++ b/src/qirlightning/support_catalyst.cmake @@ -26,7 +26,9 @@ macro(FindCatalyst target_name) else() if(NOT CATALYST_GIT_TAG) - set(CATALYST_GIT_TAG "main" CACHE STRING "GIT_TAG value to build Catalyst") + # v0.41 of Lightning requires v0.11.0 of Catalyst + # If using latest Lightning, use main branch of Catalyst + set(CATALYST_GIT_TAG "v0.11.0" CACHE STRING "GIT_TAG value to build Catalyst") endif() message(INFO " Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}") From d15106b7002d2d9537d9cacf46a0c2a893940da7 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Fri, 30 May 2025 18:48:11 +0000 Subject: [PATCH 50/64] update tests and GH workflow --- .github/workflows/build-lightning.yml | 98 +++++++++++++++++ CMakeLists.txt | 2 +- test/CMakeLists.txt | 10 ++ test/qirlightning/LightningQuantum.test.cc | 118 +++++++++++++++++++++ 4 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/build-lightning.yml create mode 100644 test/qirlightning/LightningQuantum.test.cc diff --git a/.github/workflows/build-lightning.yml b/.github/workflows/build-lightning.yml new file mode 100644 index 0000000..052830c --- /dev/null +++ b/.github/workflows/build-lightning.yml @@ -0,0 +1,98 @@ +# Build directly on the GitHub runner with caching +name: build-lightning +on: + workflow_dispatch: + workflow_call: + +concurrency: + group: build-lightning-${{github.ref}}-${{github.event.pull_request.number || github.run_number}}-${{github.workflow}} + cancel-in-progress: true + +jobs: + linux: + name: ${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-llvm${{matrix.llvm}} + strategy: + matrix: + include: + - runner: jammy + compiler: gcc + 
version: 12 + llvm: 14 + - runner: jammy + compiler: clang + version: 15 + llvm: 15 + runs-on: >- + ${{ matrix.runner == 'focal' && 'ubuntu-20.04' + || matrix.runner == 'jammy' && 'ubuntu-22.04' + || null + }} + env: + CCACHE_DIR: "${{github.workspace}}/.ccache" + CCACHE_MAXSIZE: "10G" + CC: ${{matrix.compiler}}-${{matrix.version}} + CXX: ${{matrix.compiler == 'gcc' && 'g++' || 'clang++'}}-${{matrix.version}} + steps: + - uses: actions/setup-python@v5 + name: Install Python + with: + python-version: '3.10' + - name: Install dependencies + run: | + sudo apt-get -q -y update + sudo apt-get -q -y install \ + ccache cmake ninja-build libgtest-dev \ + llvm-${{matrix.llvm}}-dev \ + ${{matrix.compiler}}-${{matrix.version}} \ + ${{matrix.compiler == 'gcc' && format('g++-{0}', matrix.version) || ''}} + echo "Installed toolchain:" + ld --version | head -1 + $CC --version | head -1 + $CXX --version | head -1 + llvm-config-${{matrix.llvm}} --version | head -1 + python -m pip install pennylane-lightning + - name: Check out + uses: actions/checkout@v4 + - name: Set up ccache + uses: actions/cache@v4 + with: + path: ${{env.CCACHE_DIR}} + key: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-${{github.run_id}} + restore-keys: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}} + - name: Zero ccache stats + run: | + ccache -z + - name: Configure + run: | + export LIGHTNING_SIM_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')")/liblightning_qubit_catalyst.so + mkdir build && cd build + cmake -GNinja \ + -DQIREE_GIT_DESCRIBE="${{github.event.pull_request + && format(';-pr.{0};', github.event.pull_request.number) + || format(';-{0};', github.ref_name)}}" \ + -DQIREE_BUILD_TESTS:BOOL=ON \ + -DQIREE_DEBUG:BOOL=ON \ + -DQIREE_USE_XACC:BOOL=OFF \ + -DQIREE_USE_LIGHTNING:BOOL=ON \ + -DCMAKE_BUILD_TYPE="Release" \ + -DCMAKE_INSTALL_PREFIX="${{github.workspace}}/install" \ + -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ + 
-DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic" \ + .. + - name: Build all + working-directory: build + run: | + ninja + - name: Run tests + working-directory: build + run: | + ctest --parallel 2 --timeout 15 --output-on-failure + - name: Install + working-directory: build + run: | + ninja install + - name: Show ccache stats + run: | + ccache -s + +# vim: set nowrap tw=100: diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ecc4a8..07254bd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ option(QIREE_BUILD_TESTS "Build QIR-EE unit tests" ON) option(QIREE_BUILD_EXAMPLES "Build QIR-EE examples" OFF) option(QIREE_USE_QSIM "Download and build Google qsim backend" ON) option(QIREE_USE_XACC "Build XACC interface" OFF) -option(QIREE_USE_LIGHTNING "Build Pennylane Lightning backend" ON) +option(QIREE_USE_LIGHTNING "Build Pennylane Lightning backend" OFF) qiree_set_default(BUILD_TESTING ${QIREE_BUILD_TESTS}) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b224fc8..73d0b81 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -66,3 +66,13 @@ if(QIREE_USE_QSIM) endif() #---------------------------------------------------------------------------## + +#---------------------------------------------------------------------------## +# QIRLIGHTNING TESTS +#---------------------------------------------------------------------------## + +if(QIREE_USE_LIGHTNING) + qiree_add_test(qirlightning LightningQuantum) +endif() + +#---------------------------------------------------------------------------## diff --git a/test/qirlightning/LightningQuantum.test.cc b/test/qirlightning/LightningQuantum.test.cc new file mode 100644 index 0000000..1d74b29 --- /dev/null +++ b/test/qirlightning/LightningQuantum.test.cc @@ -0,0 +1,118 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qirlightning/LightningQuantum.test.cc +//---------------------------------------------------------------------------// +#include "qirlightning/LightningQuantum.hh" + +#include + +#include "qiree/Types.hh" +#include "qiree_test.hh" +#include "qirlightning/LightningDefaultRuntime.hh" + +namespace qiree +{ +namespace test +{ +//---------------------------------------------------------------------------// + +class LightningQuantumTest : public ::qiree::test::Test +{ + protected: + void SetUp() override {} + + static std::string clean_output(std::string&& s) + { + std::string result = std::move(s); + static std::regex const subs_ptr("0x[0-9a-f]+"); + result = std::regex_replace(result, subs_ptr, "0x0"); + return result; + } +}; + +TEST_F(LightningQuantumTest, sim_dynamicbv) +{ + using Q = Qubit; + using R = Result; + + std::ostringstream os; + os << '\n'; + + // Create a simulator that will write to the string stream + LightningQuantum lightning_sim{os, 0}; + LightningDefaultRuntime lightning_rt{os, lightning_sim}; + // Call functions in the same sequence that dynamicbv.ll would + lightning_sim.set_up([] { + EntryPointAttrs attrs; + attrs.required_num_qubits = 2; + attrs.required_num_results = 2; + return attrs; + }()); + ASSERT_EQ(2, lightning_sim.num_qubits()); + ASSERT_EQ(2, lightning_sim.num_results()); + + lightning_sim.h(Q{0}); + lightning_sim.x(Q{1}); + lightning_sim.h(Q{1}); + lightning_sim.cnot(Q{0}, Q{1}); + lightning_sim.h(Q{0}); + lightning_sim.mz(Q{0}, R{0}); + lightning_sim.read_result(R{0}); + lightning_sim.mz(Q{1}, R{1}); + lightning_sim.read_result(R{1}); + lightning_rt.array_record_output(2, ""); + lightning_rt.result_record_output(R{0}, ""); + lightning_rt.result_record_output(R{1}, ""); + //EXPECT_EQ(QState::one, lightning_sim.get_result(R{0})); + //EXPECT_EQ(QState::one, 
lightning_sim.get_result(R{1})); + + + lightning_sim.tear_down(); +} + +TEST_F(LightningQuantumTest, result_order) +{ + using Q = Qubit; + using R = Result; + + std::ostringstream os; + os << '\n'; + + // Create a simulator that will write to the string stream + LightningQuantum qis{os, 0}; + LightningDefaultRuntime rt{os, qis}; + + // Call functions in the same sequence that dynamicbv.ll would + qis.set_up([] { + EntryPointAttrs attrs; + attrs.required_num_qubits = 4; + attrs.required_num_results = 3; + return attrs; + }()); + qis.mz(Q{0}, R{2}); + qis.mz(Q{1}, R{1}); + qis.mz(Q{2}, R{0}); + std::vector expected; + expected.push_back(static_cast(qis.get_result(R{2}))); + expected.push_back(static_cast(qis.get_result(R{0}))); + expected.push_back(static_cast(qis.get_result(R{1}))); + // So the internal result "buffer" is now {true, false, true} + rt.array_record_output(3, "array"); + rt.result_record_output(R{2}, "foo"); // pushes true + rt.result_record_output(R{0}, "bar"); // pushes true + rt.result_record_output(R{1}, "baz"); // pushes false + + auto const& result = rt.result(); + EXPECT_EQ("array", result.container_label()); + EXPECT_EQ(expected, result.bits()); + EXPECT_EQ((std::vector{"foo", "bar", "baz"}), + result.entry_labels()); + + qis.tear_down(); +} +//---------------------------------------------------------------------------// +} // namespace test +} // namespace qiree From 227f5c66c463ad6ff2125e28097814dba55264ed Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Tue, 17 Jun 2025 21:20:21 +0000 Subject: [PATCH 51/64] update reset --- src/qirlightning/LightningQuantum.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc index b4597b7..a774543 100644 --- a/src/qirlightning/LightningQuantum.cc +++ b/src/qirlightning/LightningQuantum.cc @@ -98,10 +98,7 @@ void LightningQuantum::tear_down() */ void LightningQuantum::reset(Qubit q) { - std::vector data = {0}; - 
DataView state(data); - std::vector wires = {static_cast(q.value)}; - rtd_qdevice->SetBasisState(state, wires); + q.value = 0; } //----------------------------------------------------------------------------// From cf66bdb0e0e1ff482d48363ef28d134ba5d27e1f Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Tue, 24 Jun 2025 13:50:45 +0000 Subject: [PATCH 52/64] single-result update --- app/qir-lightning.cc | 4 +- src/qirlightning/CMakeLists.txt | 2 +- src/qirlightning/LightningDefaultRuntime.hh | 77 ------------------- src/qirlightning/LightningQuantum.cc | 6 +- src/qirlightning/LightningQuantum.hh | 15 +--- ...gDefaultRuntime.cc => LightningRuntime.cc} | 16 +++- src/qirlightning/LightningRuntime.hh | 37 +++++++++ test/qirlightning/LightningQuantum.test.cc | 16 ++-- 8 files changed, 65 insertions(+), 108 deletions(-) delete mode 100644 src/qirlightning/LightningDefaultRuntime.hh rename src/qirlightning/{LightningDefaultRuntime.cc => LightningRuntime.cc} (62%) create mode 100644 src/qirlightning/LightningRuntime.hh diff --git a/app/qir-lightning.cc b/app/qir-lightning.cc index 2244c97..cff96f2 100644 --- a/app/qir-lightning.cc +++ b/app/qir-lightning.cc @@ -13,8 +13,8 @@ #include "qiree/Executor.hh" #include "qiree/Module.hh" #include "qiree/ResultDistribution.hh" -#include "qirlightning/LightningDefaultRuntime.hh" #include "qirlightning/LightningQuantum.hh" +#include "qirlightning/LightningRuntime.hh" using namespace std::string_view_literals; @@ -30,7 +30,7 @@ void run(std::string const& filename, int num_shots) // Set up qsim LightningQuantum sim(std::cout, 0); - LightningDefaultRuntime rt(std::cout, sim); + LightningRuntime rt(std::cout, sim); ResultDistribution distribution; // Run several time = shots (default 1) diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt index 2edbdf9..b02e572 100644 --- a/src/qirlightning/CMakeLists.txt +++ b/src/qirlightning/CMakeLists.txt @@ -43,7 +43,7 @@ endif() # Adding lightning as a library to qiree 
qiree_add_library(qirlightning LightningQuantum.cc -LightningDefaultRuntime.cc +LightningRuntime.cc ) target_compile_definitions(qirlightning PRIVATE diff --git a/src/qirlightning/LightningDefaultRuntime.hh b/src/qirlightning/LightningDefaultRuntime.hh deleted file mode 100644 index 2d3e9fa..0000000 --- a/src/qirlightning/LightningDefaultRuntime.hh +++ /dev/null @@ -1,77 +0,0 @@ -//----------------------------------*-C++-*----------------------------------// -// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. -// See the top-level COPYRIGHT file for details. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -//---------------------------------------------------------------------------// -//! \file qirlightning/LightningDefaultRuntime.hh -//---------------------------------------------------------------------------// -#pragma once - -#include "LightningQuantum.hh" -#include "qiree/RecordedResult.hh" - -namespace qiree -{ - -/*! - * Print per-qubit measurement statistics. - * - * Example for three qubits: - * \code - * Measurement output: - * ------------------- - * Number of shots: 1024 - * Number of qubits: 3 - * q0 {0: 542, 1: 482} - * q1 {0: 521, 1: 503} - * q2 {0: 0, 1: 1024} - * - * \endcode - */ - -class LightningDefaultRuntime final : virtual public RuntimeInterface -{ - public: - /*! - * Construct \c LightningDefaultRuntime. - */ - LightningDefaultRuntime(std::ostream& output, LightningQuantum& sim) - : output_(output), sim_(sim) - { - } - - //!@{ - //! \name Runtime interface - - // Initialize the execution environment, resetting qubits - void initialize(OptionalCString env) override; - - //! Mark the following N results as being part of an array named tag - void array_record_output(size_type size, OptionalCString tag) final - { - result_ = RecordedResult(size, tag); - } - - //! 
Mark the following N results as being part of a tuple named tag - void tuple_record_output(size_type size, OptionalCString tag) final - { - result_ = RecordedResult(size, tag); - } - - //! Save one result - void result_record_output(Result result, OptionalCString tag) final - { - result_.push_back(sim_.get_result(result), tag); - } - //!@} - - RecordedResult const& result() const { return result_; } - - - private: - std::ostream& output_; - LightningQuantum& sim_; - RecordedResult result_; -}; - -} // namespace qiree diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc index a774543..e584a3b 100644 --- a/src/qirlightning/LightningQuantum.cc +++ b/src/qirlightning/LightningQuantum.cc @@ -105,9 +105,11 @@ void LightningQuantum::reset(Qubit q) /*! * Read the value of a result. */ -QState LightningQuantum::read_result(Result r) +QState LightningQuantum::read_result(Result r) const { - return this->get_result(r); + QIREE_EXPECT(r.value < results_.size()); + auto result_bool = static_cast(results_[r.value]); + return static_cast(result_bool); } //---------------------------------------------------------------------------// diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh index 96d49d5..125d3eb 100644 --- a/src/qirlightning/LightningQuantum.hh +++ b/src/qirlightning/LightningQuantum.hh @@ -45,8 +45,6 @@ class LightningQuantum final : virtual public QuantumNotImpl //! Number of classical result registers size_type num_results() const { return results_.size(); } - // Get the result from a classical register - inline QState get_result(Result r) const; //!@} //!@{ @@ -61,7 +59,7 @@ class LightningQuantum final : virtual public QuantumNotImpl void mz(Qubit, Result) final; // Read the value of a result. 
- QState read_result(Result) final; + QState read_result(Result) const final; //!@} @@ -108,15 +106,4 @@ class LightningQuantum final : virtual public QuantumNotImpl std::vector result_to_qubit_; }; -//---------------------------------------------------------------------------// -/*! - * Get the result from a classical register. - */ -QState LightningQuantum::get_result(Result r) const -{ - QIREE_EXPECT(r.value < results_.size()); - auto result_bool = static_cast(results_[r.value]); - return static_cast(result_bool); -} - } // namespace qiree diff --git a/src/qirlightning/LightningDefaultRuntime.cc b/src/qirlightning/LightningRuntime.cc similarity index 62% rename from src/qirlightning/LightningDefaultRuntime.cc rename to src/qirlightning/LightningRuntime.cc index 7e5da3e..89bce1a 100644 --- a/src/qirlightning/LightningDefaultRuntime.cc +++ b/src/qirlightning/LightningRuntime.cc @@ -3,22 +3,32 @@ // See the top-level COPYRIGHT file for details. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception //---------------------------------------------------------------------------// -//! \file qirlightning/LightningDefaultRuntime.cc +//! \file qirlightning/LightningRuntime.cc //---------------------------------------------------------------------------// -#include "LightningDefaultRuntime.hh" +#include "LightningRuntime.hh" #include +#include "LightningQuantum.hh" #include "qiree/Assert.hh" namespace qiree { +//---------------------------------------------------------------------------// +/*! + * Construct with quantum reference to access classical registers. + */ +LightningRuntime::LightningRuntime(std::ostream& output, LightningQuantum const& sim) + : SingleResultRuntime{sim}, output_(output) +{ +} + //---------------------------------------------------------------------------// /*! * Initialize the execution environment, resetting qubits. 
*/ -void LightningDefaultRuntime::initialize(OptionalCString env) +void LightningRuntime::initialize(OptionalCString env) { if (env) { diff --git a/src/qirlightning/LightningRuntime.hh b/src/qirlightning/LightningRuntime.hh new file mode 100644 index 0000000..0623d16 --- /dev/null +++ b/src/qirlightning/LightningRuntime.hh @@ -0,0 +1,37 @@ +//----------------------------------*-C++-*----------------------------------// +// Copyright 2024 UT-Battelle, LLC, and other QIR-EE developers. +// See the top-level COPYRIGHT file for details. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//---------------------------------------------------------------------------// +//! \file qirlightning/LightningRuntime.hh +//---------------------------------------------------------------------------// +#pragma once + +#include "qiree/SingleResultRuntime.hh" + +namespace qiree +{ +//---------------------------------------------------------------------------// +class LightningQuantum; + +//---------------------------------------------------------------------------// + +class LightningRuntime final : virtual public SingleResultRuntime +{ + public: + // Construct with quantum reference to access classical registers + LightningRuntime(std::ostream& output, LightningQuantum const& sim); + + //!@{ + //! 
\name Runtime interface + + // Initialize the execution environment, resetting qubits + void initialize(OptionalCString env) override; + + //!@} + + private: + std::ostream& output_; +}; + +} // namespace qiree diff --git a/test/qirlightning/LightningQuantum.test.cc b/test/qirlightning/LightningQuantum.test.cc index 1d74b29..237792b 100644 --- a/test/qirlightning/LightningQuantum.test.cc +++ b/test/qirlightning/LightningQuantum.test.cc @@ -11,7 +11,7 @@ #include "qiree/Types.hh" #include "qiree_test.hh" -#include "qirlightning/LightningDefaultRuntime.hh" +#include "qirlightning/LightningRuntime.hh" namespace qiree { @@ -43,7 +43,7 @@ TEST_F(LightningQuantumTest, sim_dynamicbv) // Create a simulator that will write to the string stream LightningQuantum lightning_sim{os, 0}; - LightningDefaultRuntime lightning_rt{os, lightning_sim}; + LightningRuntime lightning_rt{os, lightning_sim}; // Call functions in the same sequence that dynamicbv.ll would lightning_sim.set_up([] { EntryPointAttrs attrs; @@ -66,9 +66,7 @@ TEST_F(LightningQuantumTest, sim_dynamicbv) lightning_rt.array_record_output(2, ""); lightning_rt.result_record_output(R{0}, ""); lightning_rt.result_record_output(R{1}, ""); - //EXPECT_EQ(QState::one, lightning_sim.get_result(R{0})); - //EXPECT_EQ(QState::one, lightning_sim.get_result(R{1})); - + EXPECT_EQ(QState::one, lightning_sim.read_result(R{0})); lightning_sim.tear_down(); } @@ -83,7 +81,7 @@ TEST_F(LightningQuantumTest, result_order) // Create a simulator that will write to the string stream LightningQuantum qis{os, 0}; - LightningDefaultRuntime rt{os, qis}; + LightningRuntime rt{os, qis}; // Call functions in the same sequence that dynamicbv.ll would qis.set_up([] { @@ -96,9 +94,9 @@ TEST_F(LightningQuantumTest, result_order) qis.mz(Q{1}, R{1}); qis.mz(Q{2}, R{0}); std::vector expected; - expected.push_back(static_cast(qis.get_result(R{2}))); - expected.push_back(static_cast(qis.get_result(R{0}))); - 
expected.push_back(static_cast(qis.get_result(R{1}))); + expected.push_back(static_cast(qis.read_result(R{2}))); + expected.push_back(static_cast(qis.read_result(R{0}))); + expected.push_back(static_cast(qis.read_result(R{1}))); // So the internal result "buffer" is now {true, false, true} rt.array_record_output(3, "array"); rt.result_record_output(R{2}, "foo"); // pushes true From f9b76aece71cfc70fdfadb6101526e5a1fd728e3 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Fri, 18 Jul 2025 15:53:46 +0000 Subject: [PATCH 53/64] update lightnign installation instructions --- src/qirlightning/README.md | 2 +- src/qirlightning/support_catalyst.cmake | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md index 7719b7e..2777869 100644 --- a/src/qirlightning/README.md +++ b/src/qirlightning/README.md @@ -55,7 +55,7 @@ To compile: ``` cd qiree/ mkdir build; cd build -cmake .. +cmake -DQIREE_USE_LIGHTNING=ON .. make ``` diff --git a/src/qirlightning/support_catalyst.cmake b/src/qirlightning/support_catalyst.cmake index f46363c..5932f06 100644 --- a/src/qirlightning/support_catalyst.cmake +++ b/src/qirlightning/support_catalyst.cmake @@ -26,9 +26,7 @@ macro(FindCatalyst target_name) else() if(NOT CATALYST_GIT_TAG) - # v0.41 of Lightning requires v0.11.0 of Catalyst - # If using latest Lightning, use main branch of Catalyst - set(CATALYST_GIT_TAG "v0.11.0" CACHE STRING "GIT_TAG value to build Catalyst") + set(CATALYST_GIT_TAG "main" CACHE STRING "GIT_TAG value to build Catalyst") endif() message(INFO " Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}") From 5574e46f368b30e7b319bb35e53a7c43667d27f9 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Sat, 19 Jul 2025 00:24:56 +0000 Subject: [PATCH 54/64] update shots to seed --- src/qirlightning/LightningQuantum.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh index 
125d3eb..e68bf66 100644 --- a/src/qirlightning/LightningQuantum.hh +++ b/src/qirlightning/LightningQuantum.hh @@ -30,7 +30,7 @@ class LightningQuantum final : virtual public QuantumNotImpl { public: // Construct with number of shots - LightningQuantum(std::ostream& os, unsigned long int shots); + LightningQuantum(std::ostream& os, unsigned long int seed); ~LightningQuantum(); QIREE_DELETE_COPY_MOVE(LightningQuantum); // Delete copy and move constructors From 00080eb1d7df5fa9cdc8f8a2d834bddd73fcb39d Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Tue, 22 Jul 2025 02:47:00 +0000 Subject: [PATCH 55/64] update github workflow to run build-lightning --- .github/workflows/pr.yml | 3 +++ .github/workflows/push.yml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 52f8404..df9d110 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -17,11 +17,14 @@ concurrency: jobs: build-fast: uses: ./.github/workflows/build-fast.yml + build-lightning: + uses: ./.github/workflows/build-lightning.yml # Specifying a dependent job allows us to select a single "requires" check in the project GitHub settings all: if: ${{ always() }} needs: - build-fast + - build-lightning runs-on: ubuntu-latest steps: - name: Decide whether the needed jobs succeeded or failed diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 4601abc..5e79f80 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -12,9 +12,12 @@ concurrency: jobs: build-fast: uses: ./.github/workflows/build-fast.yml + build-lightning: + uses: ./.github/workflows/build-lightning.yml all: needs: - build-fast + - build-lightning runs-on: ubuntu-latest steps: - name: Success From aa254204a6e0cda078719055ba4235f8092ec4fc Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 30 Jul 2025 13:02:56 +0000 Subject: [PATCH 56/64] remove demo and pin catalyst --- src/qirlightning/simple_demo/README.md | 69 ---- 
.../snapshot_catalyst_runtime/README.rst | 118 ------ .../include/DataView.hpp | 148 ------- .../include/Exception.hpp | 87 ----- .../include/QuantumDevice.hpp | 364 ------------------ .../snapshot_catalyst_runtime/include/Types.h | 165 -------- .../simple_demo/test_rt_device.cpp | 74 ---- src/qirlightning/support_catalyst.cmake | 2 +- 8 files changed, 1 insertion(+), 1026 deletions(-) delete mode 100644 src/qirlightning/simple_demo/README.md delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h delete mode 100644 src/qirlightning/simple_demo/test_rt_device.cpp diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md deleted file mode 100644 index bbd7d27..0000000 --- a/src/qirlightning/simple_demo/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# Simple Demo for Catalyst/Lightning runtime - -This is a super simple demo for driving Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present). - -Some Catalyst include files are copied here for convenience - they are in `./snapshot_catalyst_runtime/include`. These are required for the QuantumDevice interface. For the qiree source, these files are fetched automatically during CMake, and these are not used. - -## Installing a lightning simulator - -When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. 
These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. - -To get started, run `pip install pennylane` or `pip install pennylane-lightning` - this will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. - -Example: -``` -$ pip install pennylane-lightning-kokkos - -$ pip show pennylane-lightning-kokkos -Name: PennyLane_Lightning_Kokkos -Version: 0.40.0 -Summary: PennyLane-Lightning plugin -Home-page: https://github.com/PennyLaneAI/pennylane-lightning -Author: -Author-email: -License: Apache License 2.0 -Location: -Requires: pennylane, pennylane-lightning - -$ ls /pennylane_lightning -... liblightning_kokkos_catalyst.so ... -``` - -You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.gpu simulators. - -## Compilation - -First update the `RTDLIB` in `test_rt_device.cpp` to the local path where lightning is installed (i.e. `` from above). - -To compile: - -``` -$ clang++ --std=c++20 test_rt_device.cpp -I./snapshot_catalyst_runtime/include -o test_rt_device.out -``` - -## Running the example - -To run: - -``` -$ ./test_rt_device.out -Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set - In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads - For best performance with OpenMP 3.1 set OMP_PROC_BIND=true - For unit testing set OMP_PROC_BIND=false - -Num Qubits = 3 -State = -*** State-Vector of Size 8 *** -[(0.707107,0), (0,0), (0,0), (0,0), (0.707107,0), (0,0), (0,0), (0,0)] -Measure on wire 0 = 0 -``` - -## Running on other devices - -To run on other devices, e.g. 
lightning.gpu, you need to change: -- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu` -In the c++ file: -- replace `RTDLIB` from `kokkos` to `gpu` -- replace `RTDDEVICE` from `Kokkos` to `GPU` -- Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst deleted file mode 100644 index 8a881e5..0000000 --- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst +++ /dev/null @@ -1,118 +0,0 @@ -.. runtime-start-inclusion-marker-do-not-remove - -Catalyst Quantum Runtime -######################## - -The Catalyst Runtime is a C++ QIR runtime that enables the execution of Catalyst-compiled -quantum programs, and is currently backed by `PennyLane-Lightning `_ -state-vector simulators, and `Amazon Braket `__ -devices. Additional hardware support, including QPUs, to come. - -The runtime employs the `QuantumDevice `_ -public interface to support an extensible list of backend devices. This interface comprises two collections of abstract methods: - -- The Qubit management, device shot noise, and quantum tape recording methods are utilized for the implementation of Quantum Runtime (QR) instructions. - -- The quantum operations, observables, measurements, and gradient methods are used to implement Quantum Instruction Set (QIS) instructions. - -A complete list of instructions supported by the runtime can be found in -`RuntimeCAPI.h `_. - -Contents -======== - -The directory is structured as follows: - -- `include `_: - This contains the public header files of the runtime including the ``QuantumDevice`` API - for backend quantum devices and the runtime CAPI. 
- -- `lib `_: - The core modules of the runtime are structured into ``lib/capi`` and ``lib/backend``. - `lib/capi `_ implements the semantics for - QIR instructions lowered to our custom runtime. `lib/backend `_ - contains implementations of the ``QuantumDevice`` API for backend simulators. - -- `tests `_: - A collection of C++ tests for modules and methods in the runtime. - -Backend Devices -=============== - -New device backends for the runtime can be realized by implementing the quantum device interface. -The following table shows the available devices along with supported features: - -.. list-table:: - :widths: 25 25 25 25 - :header-rows: 0 - - * - **Features** - - **PennyLane-Lightning-Qubit** - - **PennyLane-Lightning-Kokkos** and **PennyLane-Lightning-GPU** - - **Amazon-Braket-OpenQasm** - * - Qubit Management - - Dynamic allocation/deallocation - - Static allocation/deallocation - - Static allocation/deallocation - * - Gate Operations - - `Lightning operations `_ - - `Lightning operations `_ without controlled gates support - - `Braket operations `_ - * - Quantum Observables - - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables - - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables - - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, and Tensor Product of Observables - * - Expectation Value - - All observables; Finite-shots supported - - All observables; Finite-shots supported - - All observables; Finite-shots supported - * - Variance - - All observables; Finite-shots supported - - All observables; Finite-shots supported - - All observables; Finite-shots supported - * - Probability - - Only for the computational basis on the supplied qubits; Finite-shots supported - - Only for the computational basis on the supplied qubits; Finite-shots supported - - The computational 
basis on all active qubits; Finite-shots supported - * - Sampling - - Only for the computational basis on the supplied qubits - - Only for the computational basis on the supplied qubits - - The computational basis on all active qubits; Finite-shots supported - * - Mid-Circuit Measurement - - Only for the computational basis on the supplied qubit - - Only for the computational basis on the supplied qubit - - Not supported - * - Gradient - - The Adjoint-Jacobian method for expectation values on all observables - - The Adjoint-Jacobian method for expectation values on all observables - - Not supported - -Requirements -============ - -To build the runtime from source, it is required to have an up to date version of a C/C++ compiler such as gcc or clang -with support for the C++20 standard library. - -Installation -============ - -By default, the runtime builds all supported backend devices. -You can build the runtime with custom devices from the list of Backend Devices. - -You can use ``ENABLE_OPENQASM=OFF`` to disable building the runtime with `Amazon-Braket-OpenQasm `_: - -.. code-block:: console - - make runtime ENABLE_OPENQASM=OFF - -This device currently offers generators for the `OpenQasm3 `_ specification and -`Amazon Braket `__ assembly extension. -Moreover, the generated assembly can be executed on Amazon Braket devices leveraging `amazon-braket-sdk-python `_. - -To check the runtime test suite from the root directory: - -.. code-block:: console - - make test-runtime - -.. runtime-end-inclusion-marker-do-not-remove diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp deleted file mode 100644 index 6cf50f2..0000000 --- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp +++ /dev/null @@ -1,148 +0,0 @@ -// Copyright 2023 Xanadu Quantum Technologies Inc. 
- -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -/** - * A multi-dimensional view for MemRef-like and std::vector types. - * - * @tparam T The underlying data type - * @tparam R The Rank (R > 0) - * - * @note A forward iterator is implemented in this view for traversing over the entire - * elements of MemRef types rank-by-rank starting from the last dimension (R-1). For example, - * The DataView iterator for MemRef starts from index (0, 0) and traverses elements - * in the following order: - * (0, 0), ..., (0, sizes[1]-1), (1, 0), ..., (1, sizes[1]-1), ... (sizes[0]-1, sizes[1]-1). 
- */ -template class DataView { - private: - T *data_aligned; - size_t offset; - size_t sizes[R] = {0}; - size_t strides[R] = {0}; - - public: - class iterator { - private: - const DataView &view; - - int64_t loc; // physical index - size_t indices[R] = {0}; - - public: - using iterator_category = std::forward_iterator_tag; // LCOV_EXCL_LINE - using value_type = T; // LCOV_EXCL_LINE - using difference_type = std::ptrdiff_t; // LCOV_EXCL_LINE - using pointer = T *; // LCOV_EXCL_LINE - using reference = T &; // LCOV_EXCL_LINE - - iterator(const DataView &_view, int64_t begin_idx) : view(_view), loc(begin_idx) {} - pointer operator->() const { return &view.data_aligned[loc]; } - reference operator*() const { return view.data_aligned[loc]; } - iterator &operator++() - { - int64_t next_axis = -1; - int64_t idx; - for (int64_t i = R; i > 0; --i) { - idx = i - 1; - if (indices[idx]++ < view.sizes[idx] - 1) { - next_axis = idx; - break; - } - indices[idx] = 0; - loc -= (view.sizes[idx] - 1) * view.strides[idx]; - } - - loc = next_axis == -1 ? -1 : loc + view.strides[next_axis]; - return *this; - } - iterator operator++(int) - { - auto tmp = *this; - int64_t next_axis = -1; - int64_t idx; - for (int64_t i = R; i > 0; --i) { - idx = i - 1; - if (indices[idx]++ < view.sizes[idx] - 1) { - next_axis = idx; - break; - } - indices[idx] = 0; - loc -= (view.sizes[idx] - 1) * view.strides[idx]; - } - - loc = next_axis == -1 ? 
-1 : loc + view.strides[next_axis]; - return tmp; - } - bool operator==(const iterator &other) const - { - return (loc == other.loc && view.data_aligned == other.view.data_aligned); - } - bool operator!=(const iterator &other) const { return !(*this == other); } - }; - - explicit DataView(std::vector &buffer) : data_aligned(buffer.data()), offset(0) - { - static_assert(R == 1, "[Class: DataView] Assertion: R == 1"); - sizes[0] = buffer.size(); - strides[0] = 1; - } - - explicit DataView(T *_data_aligned, size_t _offset, const size_t *_sizes, - const size_t *_strides) - : data_aligned(_data_aligned), offset(_offset) - { - static_assert(R > 0, "[Class: DataView] Assertion: R > 0"); - if (_sizes != nullptr && _strides != nullptr) { - for (size_t i = 0; i < R; i++) { - sizes[i] = _sizes[i]; - strides[i] = _strides[i]; - } - } // else sizes = {0}, strides = {0} - } - - [[nodiscard]] auto size() const -> size_t - { - if (!data_aligned) { - return 0; - } - - size_t tsize = 1; - for (size_t i = 0; i < R; i++) { - tsize *= sizes[i]; - } - return tsize; - } - - template T &operator()(I... idxs) const - { - static_assert(sizeof...(idxs) == R, - "[Class: DataView] Error in Catalyst Runtime: Wrong number of indices"); - size_t indices[] = {static_cast(idxs)...}; - - size_t loc = offset; - for (size_t axis = 0; axis < R; axis++) { - RT_ASSERT(indices[axis] < sizes[axis]); - loc += indices[axis] * strides[axis]; - } - return data_aligned[loc]; - } - - iterator begin() { return iterator{*this, static_cast(offset)}; } - - iterator end() { return iterator{*this, -1}; } -}; diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp deleted file mode 100644 index a76da14..0000000 --- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright 2023 Xanadu Quantum Technologies Inc. 
- -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include -#include -#include -#include - -/** - * @brief Macro that throws `RuntimeException` with given message. - */ -#define RT_FAIL(message) Catalyst::Runtime::_abort((message), __FILE__, __LINE__, __func__) - -/** - * @brief Macro that throws `RuntimeException` if expression evaluates - * to true. - */ -#define RT_FAIL_IF(expression, message) \ - if ((expression)) { \ - RT_FAIL(message); \ - } - -/** - * @brief Macro that throws `RuntimeException` with the given expression - * and source location if expression evaluates to false. - */ -#define RT_ASSERT(expression) RT_FAIL_IF(!(expression), "Assertion: " #expression) - -namespace Catalyst::Runtime { - -/** - * @brief This is the general exception thrown by Catalyst for runtime errors - * that is derived from `std::exception`. 
- */ -class RuntimeException : public std::exception { - private: - const std::string err_msg; - - public: - explicit RuntimeException(std::string msg) noexcept - : err_msg{std::move(msg)} {} // LCOV_EXCL_LINE - ~RuntimeException() override = default; // LCOV_EXCL_LINE - - RuntimeException(const RuntimeException &) = default; - RuntimeException(RuntimeException &&) noexcept = default; - - RuntimeException &operator=(const RuntimeException &) = delete; - RuntimeException &operator=(RuntimeException &&) = delete; - - [[nodiscard]] auto what() const noexcept -> const char * override - { - return err_msg.c_str(); - } // LCOV_EXCL_LINE -}; - -/** - * @brief Throws a `RuntimeException` with the given error message. - * - * @note This is not supposed to be called directly. - */ -[[noreturn]] inline void _abort(const char *message, const char *file_name, size_t line, - const char *function_name) -{ - std::stringstream sstream; - sstream << "[" << file_name << "][Line:" << line << "][Function:" << function_name - << "] Error in Catalyst Runtime: " << message; - - throw RuntimeException(sstream.str()); -} // LCOV_EXCL_LINE - -} // namespace Catalyst::Runtime diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp deleted file mode 100644 index ccdb606..0000000 --- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp +++ /dev/null @@ -1,364 +0,0 @@ -// Copyright 2022-2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include "DataView.hpp" -#include "Types.h" - -// A helper template macro to generate the Factory method by -// calling (kwargs). Check the Custom Devices guideline for details: -// https://docs.pennylane.ai/projects/catalyst/en/stable/dev/custom_devices.html -#define GENERATE_DEVICE_FACTORY(IDENTIFIER, CONSTRUCTOR) \ - extern "C" Catalyst::Runtime::QuantumDevice *IDENTIFIER##Factory(const char *kwargs) \ - { \ - return new CONSTRUCTOR(std::string(kwargs)); \ - } - -namespace Catalyst::Runtime { - -/** - * @brief struct API for backend quantum devices. - * - * This device API contains, - * - a set of methods to manage qubit allocations and deallocations, device shot - * noise, and quantum tape recording as well as reference values for the result - * data-type; these are used to implement Quantum Runtime (QR) instructions. - * - * - a set of methods for quantum operations, observables, measurements, and gradient - * of the device; these are used to implement Quantum Instruction Set (QIS) instructions. - * - */ -struct QuantumDevice { - QuantumDevice() = default; // LCOV_EXCL_LINE - virtual ~QuantumDevice() = default; // LCOV_EXCL_LINE - - QuantumDevice &operator=(const QuantumDevice &) = delete; - QuantumDevice(const QuantumDevice &) = delete; - QuantumDevice(QuantumDevice &&) = delete; - QuantumDevice &operator=(QuantumDevice &&) = delete; - - /** - * @brief Allocate a qubit. 
- * - * @return `QubitIdType` - */ - virtual auto AllocateQubit() -> QubitIdType = 0; - - /** - * @brief Allocate a vector of qubits. - * - * @param num_qubits The number of qubits to allocate. - * - * @return `std::vector` - */ - virtual auto AllocateQubits(size_t num_qubits) -> std::vector = 0; - - /** - * @brief Release a qubit. - * - * @param qubit The id of the qubit - */ - virtual void ReleaseQubit(QubitIdType qubit) = 0; - - /** - * @brief Release all qubits. - */ - virtual void ReleaseAllQubits() = 0; - - /** - * @brief Get the number of allocated qubits. - * - * @return `size_t` - */ - [[nodiscard]] virtual auto GetNumQubits() const -> size_t = 0; - - /** - * @brief Set the number of device shots. - * - * @param shots The number of noise shots - */ - virtual void SetDeviceShots(size_t shots) = 0; - - /** - * @brief Get the number of device shots. - * - * @return `size_t` - */ - [[nodiscard]] virtual auto GetDeviceShots() const -> size_t = 0; - - /** - * @brief Set the PRNG of the device. - * - * The Catalyst runtime enables seeded program execution on non-hardware devices. - * A random number generator instance is managed by the runtime to predictably - * generate results for non-deterministic programs, such as those involving `Measure` - * calls. - * Devices implementing support for this feature do not need to use the provided - * PRNG instance as their sole source of random numbers, but it is expected that the - * the same instance state will predictable and reproducibly generate the same - * program results. It is also expected that the provided PRNG state is evolved - * sufficiently so that two device executions sharing the same instance do not produce - * identical results. - * The provided PRNG instance is not thread-locked, and devices wishing to share it - * across threads will need to provide their own thread-safety. - * - * @param gen The std::mt19937 PRNG object. 
- */ - virtual void SetDevicePRNG([[maybe_unused]] std::mt19937 *gen){}; - - /** - * @brief Start recording a quantum tape if provided. - * - * @note This is backed by the `Catalyst::Runtime::CacheManager` property in - * the device implementation. - */ - virtual void StartTapeRecording() = 0; - - /** - * @brief Stop recording a quantum tape if provided. - * - * @note This is backed by the `Catalyst::Runtime::CacheManager` property in - * the device implementation. - */ - virtual void StopTapeRecording() = 0; - - /** - * @brief Result value for "Zero" used in the measurement process. - * - * @return `Result` - */ - [[nodiscard]] virtual auto Zero() const -> Result = 0; - - /** - * @brief Result value for "One" used in the measurement process. - * - * @return `Result` - */ - [[nodiscard]] virtual auto One() const -> Result = 0; - - /** - * @brief A helper method to print the state vector of a device. - */ - virtual void PrintState() = 0; - - /** - * @brief Prepare subsystems using the given ket vector in the computational basis. - * - * @param state A state vector of size 2**len(wires) - * @param wires The wire(s) the operation acts on - */ - virtual void SetState([[maybe_unused]] DataView, 1> &state, - [[maybe_unused]] std::vector &wires) - { - RT_FAIL("Unsupported functionality"); - } - - /** - * @brief Prepares a single computational basis state. - * - * @param n Prepares the basis state |n>, where n is an array of integers from the set {0, 1} - * @param wires The wire(s) the operation acts on - */ - virtual void SetBasisState([[maybe_unused]] DataView &n, - [[maybe_unused]] std::vector &wires) - { - RT_FAIL("Unsupported functionality"); - } - - /** - * @brief Apply a single gate to the state vector of a device with its name if this is - * supported. 
- * - * @param name The name of the gate to apply - * @param params Optional parameter list for parametric gates - * @param wires Wires to apply gate to - * @param inverse Indicates whether to use inverse of gate - * @param controlled_wires Optional controlled wires applied to the operation - * @param controlled_values Optional controlled values applied to the operation - */ - virtual void - NamedOperation(const std::string &name, const std::vector ¶ms, - const std::vector &wires, [[maybe_unused]] bool inverse = false, - [[maybe_unused]] const std::vector &controlled_wires = {}, - [[maybe_unused]] const std::vector &controlled_values = {}) = 0; - - /** - * @brief Apply a given matrix directly to the state vector of a device. - * - * @param matrix The matrix of data in row-major format - * @param wires Wires to apply gate to - * @param inverse Indicates whether to use inverse of gate - * @param controlled_wires Controlled wires applied to the operation - * @param controlled_values Controlled values applied to the operation - */ - virtual void - MatrixOperation(const std::vector> &matrix, - const std::vector &wires, [[maybe_unused]] bool inverse = false, - [[maybe_unused]] const std::vector &controlled_wires = {}, - [[maybe_unused]] const std::vector &controlled_values = {}) = 0; - - /** - * @brief Construct a named (Identity, PauliX, PauliY, PauliZ, and Hadamard) - * or Hermitian observable. - * - * @param id The type of the observable - * @param matrix The matrix of data to construct a hermitian observable - * @param wires Wires to apply observable to - * - * @return `ObsIdType` Index of the constructed observable - */ - virtual auto Observable(ObsId id, const std::vector> &matrix, - const std::vector &wires) -> ObsIdType = 0; - - /** - * @brief Construct a tensor product of observables. 
- * - * @param obs The vector of observables indices of type ObsIdType - * - * @return `ObsIdType` Index of the constructed observable - */ - virtual auto TensorObservable(const std::vector &obs) -> ObsIdType = 0; - - /** - * @brief Construct a Hamiltonian observable. - * - * @param coeffs The vector of coefficients - * @param obs The vector of observables indices of size `coeffs` - * - * @return `ObsIdType` Index of the constructed observable - */ - virtual auto HamiltonianObservable(const std::vector &coeffs, - const std::vector &obs) -> ObsIdType = 0; - - /** - * @brief Compute the expected value of an observable. - * - * @param obsKey The index of the constructed observable - * - * @return `double` The expected value - */ - virtual auto Expval(ObsIdType obsKey) -> double = 0; - - /** - * @brief Compute the variance of an observable. - * - * @param obsKey The index of the constructed observable - * - * @return `double` The variance - */ - virtual auto Var(ObsIdType obsKey) -> double = 0; - - /** - * @brief Get the state-vector of a device. - * - * @param state The pre-allocated `DataView, 1>` - */ - virtual void State(DataView, 1> &state) = 0; - - /** - * @brief Compute the probabilities of each computational basis state. - - * @param probs The pre-allocated `DataView` - */ - virtual void Probs(DataView &probs) = 0; - - /** - * @brief Compute the probabilities for a subset of the full system. - * - * @param probs The pre-allocated `DataView` - * @param wires Wires will restrict probabilities to a subset of the full system - */ - virtual void PartialProbs(DataView &probs, - const std::vector &wires) = 0; - - /** - * @brief Compute samples with the number of shots on the entire wires, - * returing raw samples. - * - * @param samples The pre-allocated `DataView`representing a matrix of - * shape `shots * numQubits`. The built-in iterator in `DataView` - * iterates over all elements of `samples` row-wise. 
- * @param shots The number of shots - */ - virtual void Sample(DataView &samples, size_t shots) = 0; - - /** - * @brief Compute partial samples with the number of shots on `wires`, - * returing raw samples. - * - * @param samples The pre-allocated `DataView`representing a matrix of - * shape `shots * numWires`. The built-in iterator in `DataView` - * iterates over all elements of `samples` row-wise. - * @param wires Wires to compute samples on - * @param shots The number of shots - */ - virtual void PartialSample(DataView &samples, const std::vector &wires, - size_t shots) = 0; - - /** - * @brief Sample with the number of shots on the entire wires, returning the - * number of counts for each sample. - * - * @param eigvals The pre-allocated `DataView` - * @param counts The pre-allocated `DataView` - * @param shots The number of shots - */ - virtual void Counts(DataView &eigvals, DataView &counts, - size_t shots) = 0; - - /** - * @brief Partial sample with the number of shots on `wires`, returning the - * number of counts for each sample. - * - * @param eigvals The pre-allocated `DataView` - * @param counts The pre-allocated `DataView` - * @param wires Wires to compute samples on - * @param shots The number of shots - */ - virtual void PartialCounts(DataView &eigvals, DataView &counts, - const std::vector &wires, size_t shots) = 0; - - /** - * @brief A general measurement method that acts on a single wire. - * - * @param wire The wire to compute Measure on - * @param postselect Which basis state to postselect after a mid-circuit measurement (-1 denotes - no post-selection) - - * @return `Result` The measurement result - */ - virtual auto Measure(QubitIdType wire, std::optional postselect) -> Result = 0; - - /** - * @brief Compute the gradient of a quantum tape, that is cached using - * `Catalyst::Runtime::Simulator::CacheManager`, for a specific set of trainable - * parameters. 
- * - * @param gradients The vector of pre-allocated `DataView*` - * to store gradients resutls for the list of cached observables. - * @param trainParams The vector of trainable parameters; if none, all parameters - * would be assumed trainable - * - */ - virtual void Gradient(std::vector> &gradients, - const std::vector &trainParams) = 0; -}; -} // namespace Catalyst::Runtime diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h deleted file mode 100644 index a30a1c2..0000000 --- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright 2022-2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#ifndef TYPES_H -#define TYPES_H - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// Qubit, Result and Observable types -struct QUBIT; -using QubitIdType = intptr_t; - -using RESULT = bool; -using Result = RESULT *; -using QirArray = void *; - -using ObsIdType = intptr_t; - -enum ObsId : int8_t { - Identity = 0, - PauliX, - PauliY, - PauliZ, - Hadamard, - Hermitian, -}; - -enum ObsType : int8_t { - Basic = 0, - TensorProd, - Hamiltonian, -}; - -// complex type -struct CplxT_float { - float real; - float imag; -}; - -// complex type -struct CplxT_double { - double real; - double imag; -}; - -enum NumericType : int8_t { - idx = 0, - i1, - i8, - i16, - i32, - i64, - f32, - f64, - c64, - c128, -}; - -// MemRefT type -struct OpaqueMemRefT { - int64_t rank; - void *descriptor; - NumericType datatype; -}; - -// MemRefT, dimension=1> type -struct MemRefT_CplxT_double_1d { - CplxT_double *data_allocated; - CplxT_double *data_aligned; - size_t offset; - size_t sizes[1]; - size_t strides[1]; -}; - -// MemRefT, dimension=2> type -struct MemRefT_CplxT_double_2d { - CplxT_double *data_allocated; - CplxT_double *data_aligned; - size_t offset; - size_t sizes[2]; - size_t strides[2]; -}; - -// MemRefT type -struct MemRefT_double_1d { - double *data_allocated; - double *data_aligned; - size_t offset; - size_t sizes[1]; - size_t strides[1]; -}; - -// MemRefT type -struct MemRefT_double_2d { - double *data_allocated; - double *data_aligned; - size_t offset; - size_t sizes[2]; - size_t strides[2]; -}; - -// MemRefT type -struct MemRefT_int64_1d { - int64_t *data_allocated; - int64_t *data_aligned; - size_t offset; - size_t sizes[1]; - size_t strides[1]; -}; - -// MemRefT type -struct MemRefT_int8_1d { - int8_t *data_allocated; - int8_t *data_aligned; - size_t offset; - size_t sizes[1]; - size_t strides[1]; -}; - -// PairT, MemRefT> type -struct PairT_MemRefT_double_int64_1d { - struct MemRefT_double_1d first; - struct MemRefT_int64_1d 
second; -}; - -// Quantum operation modifiers -struct Modifiers { - bool adjoint; - size_t num_controlled; - QUBIT *controlled_wires; - bool *controlled_values; -}; - -using CplxT_double = struct CplxT_double; -using MemRefT_CplxT_double_1d = struct MemRefT_CplxT_double_1d; -using MemRefT_CplxT_double_2d = struct MemRefT_CplxT_double_2d; -using MemRefT_double_1d = struct MemRefT_double_1d; -using MemRefT_double_2d = struct MemRefT_double_2d; -using MemRefT_int64_1d = struct MemRefT_int64_1d; -using PairT_MemRefT_double_int64_1d = struct PairT_MemRefT_double_int64_1d; -using Modifiers = struct Modifiers; - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp deleted file mode 100644 index c50ab92..0000000 --- a/src/qirlightning/simple_demo/test_rt_device.cpp +++ /dev/null @@ -1,74 +0,0 @@ -#include - -#include "QuantumDevice.hpp" - -// Runtime libraries (kokkos/GPU/qubit etc.) -// Update these paths to point to the correct library -#define RTDLIB \ - "/" \ - "pennylane_lightning/liblightning_kokkos_catalyst.so"; -#define RTDDEVICE "LightningKokkosSimulator"; - -extern "C" Catalyst::Runtime::QuantumDevice* -GenericDeviceFactory(char const* kwargs); - -using namespace Catalyst::Runtime; - -int main() -{ - try - { - // Load lightning simulation library - std::string rtd_lib = RTDLIB; - std::string rtd_device = RTDDEVICE; - std::string kwargs = {}; - auto rtld_flags = RTLD_LAZY | RTLD_NODELETE; - auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags); - - if (!rtd_dylib_handler) - { - throw std::runtime_error("Failed to load library: " + rtd_lib); - } - - // Find device factory - std::string factory_name = rtd_device + "Factory"; - void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str()); - - if (!f_ptr) - { - dlclose(rtd_dylib_handler); - throw std::runtime_error("Failed to find factory function: " - + factory_name); - } - std::string rtd_kwargs = 
{}; - auto rtd_qdevice = std::unique_ptr( - reinterpret_cast(f_ptr)( - rtd_kwargs.c_str())); - - // Allocate Qubits - rtd_qdevice->AllocateQubits(3); - - // Get Num Qubits - std::cout << "Num Qubits = " << rtd_qdevice->GetNumQubits() - << std::endl; - - // Apply Gate - rtd_qdevice->NamedOperation("Hadamard", {}, {0}); - - // Print State - std::cout << "State = " << std::endl; - rtd_qdevice->PrintState(); - - // Measure - QubitIdType wire{0}; - Result result = rtd_qdevice->Measure(wire, std::nullopt); - std::cout << "Measure on wire 0 = " << *result << std::endl; - } - catch (std::exception const& e) - { - std::cerr << "Error: " << e.what() << std::endl; - return EXIT_FAILURE; - } - - return EXIT_SUCCESS; -} diff --git a/src/qirlightning/support_catalyst.cmake b/src/qirlightning/support_catalyst.cmake index 5932f06..ca7df76 100644 --- a/src/qirlightning/support_catalyst.cmake +++ b/src/qirlightning/support_catalyst.cmake @@ -26,7 +26,7 @@ macro(FindCatalyst target_name) else() if(NOT CATALYST_GIT_TAG) - set(CATALYST_GIT_TAG "main" CACHE STRING "GIT_TAG value to build Catalyst") + set(CATALYST_GIT_TAG "v0.12.0" CACHE STRING "GIT_TAG value to build Catalyst") endif() message(INFO " Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}") From a93414ef23792b385520f78db0b5d5246b6f2d7d Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 8 Oct 2025 18:43:01 +0000 Subject: [PATCH 57/64] update --- src/qirlightning/LightningQuantum.cc | 47 +++++++++++++--------- src/qirlightning/LightningQuantum.hh | 9 ++--- src/qirlightning/LightningRuntime.cc | 3 +- src/qirlightning/README.md | 59 +++++++++++++++------------- 4 files changed, 66 insertions(+), 52 deletions(-) diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc index e584a3b..e540799 100644 --- a/src/qirlightning/LightningQuantum.cc +++ b/src/qirlightning/LightningQuantum.cc @@ -11,11 +11,11 @@ #include #include #include +#include #include #include #include #include -#include #include 
"qiree/Assert.hh" @@ -29,7 +29,8 @@ using namespace Catalyst::Runtime; /*! * Initialize the Lightning simulator */ -LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed) +LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) + : output_(os), seed_(seed) { std::string rtd_lib = RTDLIB; std::string rtd_device = RTDDEVICE; @@ -56,8 +57,8 @@ LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : o //---------------------------------------------------------------------------// //! Default destructor -LightningQuantum::~LightningQuantum() { - +LightningQuantum::~LightningQuantum() +{ if (rtd_dylib_handler) { dlclose(rtd_dylib_handler); @@ -87,10 +88,7 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs) /*! * Complete an execution */ -void LightningQuantum::tear_down() -{ - -} +void LightningQuantum::tear_down() {} //---------------------------------------------------------------------------// /*! @@ -103,7 +101,7 @@ void LightningQuantum::reset(Qubit q) //----------------------------------------------------------------------------// /*! - * Read the value of a result. + * Read the value of a result. */ QState LightningQuantum::read_result(Result r) const { @@ -117,13 +115,14 @@ QState LightningQuantum::read_result(Result r) const * Map a qubit to a result index. 
*/ void LightningQuantum::mz(Qubit q, Result r) -{ - QIREE_EXPECT(q.value < this->num_qubits()); +{ + QIREE_EXPECT(q.value < this->num_qubits()); QIREE_EXPECT(r.value < this->num_results()); std::mt19937 gen(seed_); seed_++; rtd_qdevice->SetDevicePRNG(&gen); - auto result = rtd_qdevice->Measure(static_cast(q.value), std::nullopt); + auto result + = rtd_qdevice->Measure(static_cast(q.value), std::nullopt); results_[r.value] = *result; } @@ -136,22 +135,29 @@ void LightningQuantum::mz(Qubit q, Result r) void LightningQuantum::cx(Qubit q1, Qubit q2) { rtd_qdevice->NamedOperation( - "CNOT", {}, {static_cast(q1.value), static_cast(q2.value)}); + "CNOT", + {}, + {static_cast(q1.value), static_cast(q2.value)}); } void LightningQuantum::cnot(Qubit q1, Qubit q2) { rtd_qdevice->NamedOperation( - "CNOT", {}, {static_cast(q1.value), static_cast(q2.value)}); + "CNOT", + {}, + {static_cast(q1.value), static_cast(q2.value)}); } void LightningQuantum::cz(Qubit q1, Qubit q2) { rtd_qdevice->NamedOperation( - "CZ", {}, {static_cast(q1.value), static_cast(q2.value)}); + "CZ", + {}, + {static_cast(q1.value), static_cast(q2.value)}); } // 2. 
Local gates void LightningQuantum::h(Qubit q) { - rtd_qdevice->NamedOperation("Hadamard", {}, {static_cast(q.value)}); + rtd_qdevice->NamedOperation( + "Hadamard", {}, {static_cast(q.value)}); } void LightningQuantum::s(Qubit q) { @@ -177,15 +183,18 @@ void LightningQuantum::z(Qubit q) // 2.2 rotation gates void LightningQuantum::rx(double theta, Qubit q) { - rtd_qdevice->NamedOperation("RX", {theta}, {static_cast(q.value)}); + rtd_qdevice->NamedOperation( + "RX", {theta}, {static_cast(q.value)}); } void LightningQuantum::ry(double theta, Qubit q) { - rtd_qdevice->NamedOperation("RY", {theta}, {static_cast(q.value)}); + rtd_qdevice->NamedOperation( + "RY", {theta}, {static_cast(q.value)}); } void LightningQuantum::rz(double theta, Qubit q) { - rtd_qdevice->NamedOperation("RZ", {theta}, {static_cast(q.value)}); + rtd_qdevice->NamedOperation( + "RZ", {theta}, {static_cast(q.value)}); } } // namespace qiree diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh index e68bf66..f62a692 100644 --- a/src/qirlightning/LightningQuantum.hh +++ b/src/qirlightning/LightningQuantum.hh @@ -33,8 +33,8 @@ class LightningQuantum final : virtual public QuantumNotImpl LightningQuantum(std::ostream& os, unsigned long int seed); ~LightningQuantum(); - QIREE_DELETE_COPY_MOVE(LightningQuantum); // Delete copy and move constructors - + QIREE_DELETE_COPY_MOVE(LightningQuantum); // Delete copy and move + // constructors //!@{ //! \name Accessors @@ -62,11 +62,10 @@ class LightningQuantum final : virtual public QuantumNotImpl QState read_result(Result) const final; //!@} - //!@{ //! 
\name Circuit construction // void ccx(Qubit, Qubit) final; - void ccnot(Qubit, Qubit, Qubit); + void ccnot(Qubit, Qubit, Qubit); void cnot(Qubit, Qubit) final; void cx(Qubit, Qubit) final; // void cy(Qubit, Qubit) final; @@ -92,7 +91,7 @@ class LightningQuantum final : virtual public QuantumNotImpl struct Factory; struct State; - + //// DATA //// std::ostream& output_; diff --git a/src/qirlightning/LightningRuntime.cc b/src/qirlightning/LightningRuntime.cc index 89bce1a..fb91b2a 100644 --- a/src/qirlightning/LightningRuntime.cc +++ b/src/qirlightning/LightningRuntime.cc @@ -18,7 +18,8 @@ namespace qiree /*! * Construct with quantum reference to access classical registers. */ -LightningRuntime::LightningRuntime(std::ostream& output, LightningQuantum const& sim) +LightningRuntime::LightningRuntime(std::ostream& output, + LightningQuantum const& sim) : SingleResultRuntime{sim}, output_(output) { } diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md index 2777869..dbbc46f 100644 --- a/src/qirlightning/README.md +++ b/src/qirlightning/README.md @@ -1,58 +1,55 @@ # QIR-EE with Lightning simulator backend -## Installing a lightning simulator +The [PennyLane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) plugins are high-performance quantum simulators, which are part of the [PennyLane](https://github.com/PennyLaneAI/pennylane) ecosystem. The simulators include the following backends (which can be used with QIREE): +- `lightning.qubit`: a fast state-vector simulator with optional OpenMP additions and parallelized gate-level SIMD kernels. +- `lightning.gpu`: a state-vector simulator based on the NVIDIA cuQuantum SDK. +- `lightning.kokkos`: a state-vector simulator written with Kokkos. It can exploit the inherent parallelism of modern processing units supporting the OpenMP, CUDA or HIP programming models. 
-More information on installing Pennylane Lightning simulators can be found in [lightning repository](https://github.com/PennyLaneAI/pennylane-lightning). +## Installing a Lightning simulator + +More information on installing Pennylane Lightning simulators from source, please visit the [Lightning installation page](https://docs.pennylane.ai/projects/lightning/en/latest/dev/installation.html). Note: QIREE is tested to work with PennyLane Lightning simulators v0.42. ### Quick start -The easiest way to get started is install a Lightning simulator from PyPI via pip: + +The easiest way to get started is to install a Lightning simulator (`pennylane-lightning`/`pennylane-lightning-gpu`/`pennylane-lightning-kokkos`) from PyPI via pip: ``` -$ pip install pennylane-lightning-kokkos +$ pip install pennylane-lightning-kokkos==0.42.0 $ pip show pennylane-lightning-kokkos Name: PennyLane_Lightning_Kokkos -Version: 0.40.0 +Version: 0.42.0 Summary: PennyLane-Lightning plugin Home-page: https://github.com/PennyLaneAI/pennylane-lightning -Author: -Author-email: +Author: +Author-email: License: Apache License 2.0 Location: Requires: pennylane, pennylane-lightning ``` -Running `pip install pennylane` or `pip install pennylane-lightning` will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. +Running `pip install pennylane` or `pip install pennylane-lightning` will automatically install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. Note: by default, the pre-built `lightning.kokkos` wheels from pip are built with Kokkos OpenMP enabled for CPU. To build Kokkos for other devices (e.g. CUDA or HIP GPUs), please install from source. Instruction can be found [here](https://docs.pennylane.ai/projects/lightning/en/latest/lightning_kokkos/installation.html). 
-When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. These are named `liblightning_qubit_catalyst.so`/`liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` respectively. +When installing Pennylane-Lightning from pip or from source, you will have the shared libraries for each of the simulator installed. These are named `liblightning_qubit_catalyst.so`/`liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` respectively. -Example: +To obtain the path to the library: ``` -$ ls -... liblightning_kokkos_catalyst.so ... +$ export PL_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')") + +$ ls $PL_PATH +... liblightning_qubit_catalyst.so liblightning_kokkos_catalyst.so ... ``` You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.gpu simulators. -### Compiling Lightning from Source - -The [lightning repository page](https://github.com/PennyLaneAI/pennylane-lightning) contains information on how to install Lightning simulators from source. This will necessary for e.g. Kokkos with HIP backend. - ## Compile QIR-EE with Lightning backend -- Set `QIREE_USE_LIGHTNING` to `ON` in `qiree/CMakeLists.txt` -- Set the environment variable `LIGHTNING_SIM_PATH` to the shared object of the Lightning Simulator, e.g. +To compile QIR-EE with lightning backend: ``` +# Set the path for the lightning simulator shared library export LIGHTNING_SIM_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')")/liblightning_kokkos_catalyst.so -``` - -Note: -- replace `libligghtning_kokkos_catalyst.so` with `liblightning_qubit_catalyst.so` or `liblightning_GPU_catalyst.so` if required. 
-- when running on `GPU`, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` - -To compile: -``` +# Proceed with usual build instructions, but with `-DQIREE_USE_LIGHTNING=ON` cmake flag cd qiree/ mkdir build; cd build cmake -DQIREE_USE_LIGHTNING=ON .. @@ -60,6 +57,15 @@ make ``` +Note: +- replace `libligghtning_kokkos_catalyst.so` with `liblightning_qubit_catalyst.so` or `liblightning_GPU_catalyst.so` if required. +- when running with `lightning.gpu` simulator for Nvidia GPUs, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. + +``` +LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out +``` + + ## Running the example To run (in the `build` directory): @@ -68,4 +74,3 @@ To run (in the `build` directory): $ ./bin/qir-lightning ../examples/bell.ll -s 100 {"00":43,"11":57} ``` - From f8e43c5e049cac2b702f786b3c9a13a09fb7bce6 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 8 Oct 2025 18:49:55 +0000 Subject: [PATCH 58/64] update docs --- src/qirlightning/README.md | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md index dbbc46f..4058002 100644 --- a/src/qirlightning/README.md +++ b/src/qirlightning/README.md @@ -7,7 +7,7 @@ The [PennyLane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) pl ## Installing a Lightning simulator -More information on installing Pennylane Lightning simulators from source, please visit the [Lightning installation page](https://docs.pennylane.ai/projects/lightning/en/latest/dev/installation.html). Note: QIREE is tested to work with PennyLane Lightning simulators v0.42. 
+For more information on installing Pennylane Lightning simulators from source, please visit the [Lightning installation page](https://docs.pennylane.ai/projects/lightning/en/latest/dev/installation.html). Note: QIREE is tested to work with PennyLane Lightning simulators v0.42. ### Quick start @@ -27,7 +27,12 @@ License: Apache License 2.0 Location: Requires: pennylane, pennylane-lightning ``` -Running `pip install pennylane` or `pip install pennylane-lightning` will automatically install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. Note: by default, the pre-built `lightning.kokkos` wheels from pip are built with Kokkos OpenMP enabled for CPU. To build Kokkos for other devices (e.g. CUDA or HIP GPUs), please install from source. Instruction can be found [here](https://docs.pennylane.ai/projects/lightning/en/latest/lightning_kokkos/installation.html). + +**Note:** PennyLane and PennyLane lightning supports Python 3.11-3.13. + +Running `pip install pennylane` or `pip install pennylane-lightning` will automatically install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. + +**Note:** By default, the pre-built `lightning.kokkos` wheels from pip are built with Kokkos OpenMP enabled for CPU. To build Kokkos for other devices (e.g. CUDA or HIP GPUs), please install from source. Instruction can be found [here](https://docs.pennylane.ai/projects/lightning/en/latest/lightning_kokkos/installation.html). When installing Pennylane-Lightning from pip or from source, you will have the shared libraries for each of the simulator installed. These are named `liblightning_qubit_catalyst.so`/`liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` respectively. @@ -39,8 +44,6 @@ $ ls $PL_PATH ... liblightning_qubit_catalyst.so liblightning_kokkos_catalyst.so ... 
``` -You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.gpu simulators. - ## Compile QIR-EE with Lightning backend To compile QIR-EE with lightning backend: @@ -57,12 +60,12 @@ make ``` -Note: +**Note:** - replace `libligghtning_kokkos_catalyst.so` with `liblightning_qubit_catalyst.so` or `liblightning_GPU_catalyst.so` if required. - when running with `lightning.gpu` simulator for Nvidia GPUs, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. ``` -LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out +export LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ``` From 2602033d9bb41181e260d222e715b384ba5972de Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Mon, 20 Oct 2025 22:21:26 +0000 Subject: [PATCH 59/64] update private member names --- src/qirlightning/LightningQuantum.cc | 67 +++++++++++++--------------- src/qirlightning/LightningQuantum.hh | 6 +-- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc index e540799..d3538a2 100644 --- a/src/qirlightning/LightningQuantum.cc +++ b/src/qirlightning/LightningQuantum.cc @@ -32,37 +32,31 @@ using namespace Catalyst::Runtime; LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed) { - std::string rtd_lib = RTDLIB; - std::string rtd_device = RTDDEVICE; - std::string kwargs = {}; auto rtld_flags = RTLD_LAZY | RTLD_NODELETE; - rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags); + rtd_dylib_handler_ = dlopen(RTDLIB, rtld_flags); - if (!rtd_dylib_handler) - { - throw std::runtime_error("Failed to load library: " + rtd_lib); - } + QIREE_VALIDATE(rtd_dylib_handler_, + << "failed to load Lightning runtime 
library '" << RTDLIB + << "'"); // Find device factory + std::string rtd_device = RTDDEVICE; std::string factory_name = rtd_device + "Factory"; - factory_f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str()); + factory_f_ptr_ = dlsym(rtd_dylib_handler_, factory_name.c_str()); - if (!factory_f_ptr) - { - dlclose(rtd_dylib_handler); - throw std::runtime_error("Failed to find factory function: " - + factory_name); - } + QIREE_VALIDATE(factory_f_ptr_, + << "failed to find device factory function '" + << factory_name << "'"); } //---------------------------------------------------------------------------// //! Default destructor LightningQuantum::~LightningQuantum() { - if (rtd_dylib_handler) + if (rtd_dylib_handler_) { - dlclose(rtd_dylib_handler); - }; + dlclose(rtd_dylib_handler_); + } }; //---------------------------------------------------------------------------// @@ -77,11 +71,11 @@ void LightningQuantum::set_up(EntryPointAttrs const& attrs) results_.resize(attrs.required_num_results); std::string rtd_kwargs = {}; - rtd_qdevice = std::unique_ptr( - reinterpret_cast(factory_f_ptr)( + rtd_qdevice_ = std::unique_ptr( + reinterpret_cast(factory_f_ptr_)( rtd_kwargs.c_str())); - rtd_qdevice->AllocateQubits(num_qubits_); + rtd_qdevice_->AllocateQubits(num_qubits_); } //---------------------------------------------------------------------------// @@ -120,9 +114,9 @@ void LightningQuantum::mz(Qubit q, Result r) QIREE_EXPECT(r.value < this->num_results()); std::mt19937 gen(seed_); seed_++; - rtd_qdevice->SetDevicePRNG(&gen); + rtd_qdevice_->SetDevicePRNG(&gen); auto result - = rtd_qdevice->Measure(static_cast(q.value), std::nullopt); + = rtd_qdevice_->Measure(static_cast(q.value), std::nullopt); results_[r.value] = *result; } @@ -134,21 +128,21 @@ void LightningQuantum::mz(Qubit q, Result r) // 1. 
Entangling gates void LightningQuantum::cx(Qubit q1, Qubit q2) { - rtd_qdevice->NamedOperation( + rtd_qdevice_->NamedOperation( "CNOT", {}, {static_cast(q1.value), static_cast(q2.value)}); } void LightningQuantum::cnot(Qubit q1, Qubit q2) { - rtd_qdevice->NamedOperation( + rtd_qdevice_->NamedOperation( "CNOT", {}, {static_cast(q1.value), static_cast(q2.value)}); } void LightningQuantum::cz(Qubit q1, Qubit q2) { - rtd_qdevice->NamedOperation( + rtd_qdevice_->NamedOperation( "CZ", {}, {static_cast(q1.value), static_cast(q2.value)}); @@ -156,44 +150,47 @@ void LightningQuantum::cz(Qubit q1, Qubit q2) // 2. Local gates void LightningQuantum::h(Qubit q) { - rtd_qdevice->NamedOperation( + rtd_qdevice_->NamedOperation( "Hadamard", {}, {static_cast(q.value)}); } void LightningQuantum::s(Qubit q) { - rtd_qdevice->NamedOperation("S", {}, {static_cast(q.value)}); + rtd_qdevice_->NamedOperation("S", {}, {static_cast(q.value)}); } void LightningQuantum::t(Qubit q) { - rtd_qdevice->NamedOperation("T", {}, {static_cast(q.value)}); + rtd_qdevice_->NamedOperation("T", {}, {static_cast(q.value)}); } // 2.1 Pauli gates void LightningQuantum::x(Qubit q) { - rtd_qdevice->NamedOperation("PauliX", {}, {static_cast(q.value)}); + rtd_qdevice_->NamedOperation( + "PauliX", {}, {static_cast(q.value)}); } void LightningQuantum::y(Qubit q) { - rtd_qdevice->NamedOperation("PauliY", {}, {static_cast(q.value)}); + rtd_qdevice_->NamedOperation( + "PauliY", {}, {static_cast(q.value)}); } void LightningQuantum::z(Qubit q) { - rtd_qdevice->NamedOperation("PauliZ", {}, {static_cast(q.value)}); + rtd_qdevice_->NamedOperation( + "PauliZ", {}, {static_cast(q.value)}); } // 2.2 rotation gates void LightningQuantum::rx(double theta, Qubit q) { - rtd_qdevice->NamedOperation( + rtd_qdevice_->NamedOperation( "RX", {theta}, {static_cast(q.value)}); } void LightningQuantum::ry(double theta, Qubit q) { - rtd_qdevice->NamedOperation( + rtd_qdevice_->NamedOperation( "RY", {theta}, {static_cast(q.value)}); } void 
LightningQuantum::rz(double theta, Qubit q) { - rtd_qdevice->NamedOperation( + rtd_qdevice_->NamedOperation( "RZ", {theta}, {static_cast(q.value)}); } diff --git a/src/qirlightning/LightningQuantum.hh b/src/qirlightning/LightningQuantum.hh index f62a692..ebadd71 100644 --- a/src/qirlightning/LightningQuantum.hh +++ b/src/qirlightning/LightningQuantum.hh @@ -96,9 +96,9 @@ class LightningQuantum final : virtual public QuantumNotImpl std::ostream& output_; unsigned long int seed_{}; - void* rtd_dylib_handler; - void* factory_f_ptr; - std::unique_ptr rtd_qdevice; + void* rtd_dylib_handler_; + void* factory_f_ptr_; + std::unique_ptr rtd_qdevice_; std::vector results_; size_type num_qubits_{}; From c8c69176175f670857328fef10dcdd65181eadd9 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 22 Oct 2025 21:35:32 +0000 Subject: [PATCH 60/64] seth comments --- .github/workflows/build-fast.yml | 53 ++- .github/workflows/build-lightning.yml | 98 ----- CMakeLists.txt | 23 +- cmake/support_catalyst.cmake | 80 ++++ scripts/lightning-path.sh | 61 +++ src/qirlightning/CMakeLists.txt | 47 +-- src/qirlightning/LightningQuantum.cc | 26 +- src/qirlightning/README.md | 27 +- src/qirlightning/simple_demo/README.md | 69 +++ .../snapshot_catalyst_runtime/README.rst | 118 ++++++ .../include/DataView.hpp | 173 ++++++++ .../include/Exception.hpp | 96 +++++ .../include/QuantumDevice.hpp | 399 ++++++++++++++++++ .../snapshot_catalyst_runtime/include/Types.h | 179 ++++++++ .../simple_demo/test_rt_device.cpp | 74 ++++ src/qirlightning/support_catalyst.cmake | 74 ---- 16 files changed, 1345 insertions(+), 252 deletions(-) delete mode 100644 .github/workflows/build-lightning.yml create mode 100644 cmake/support_catalyst.cmake create mode 100755 scripts/lightning-path.sh create mode 100644 src/qirlightning/simple_demo/README.md create mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst create mode 100644 
src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp create mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp create mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp create mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h create mode 100644 src/qirlightning/simple_demo/test_rt_device.cpp delete mode 100644 src/qirlightning/support_catalyst.cmake diff --git a/.github/workflows/build-fast.yml b/.github/workflows/build-fast.yml index d8c1fa7..e824947 100644 --- a/.github/workflows/build-fast.yml +++ b/.github/workflows/build-fast.yml @@ -1,4 +1,3 @@ -# Build directly on the GitHub runner with caching name: build-fast on: workflow_dispatch: @@ -10,10 +9,10 @@ concurrency: jobs: linux: - name: ${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-llvm${{matrix.llvm}} + name: ${{matrix.build-config.runner}}-${{matrix.build-config.compiler}}-${{matrix.build-config.version}}-llvm${{matrix.build-config.llvm}}${{ matrix.use-lightning == true && '-lightning' || '' }} strategy: matrix: - include: + build-config: - runner: jammy compiler: gcc version: 12 @@ -22,43 +21,65 @@ jobs: compiler: clang version: 15 llvm: 15 + use-lightning: [false, true] runs-on: >- - ${{ matrix.runner == 'focal' && 'ubuntu-20.04' - || matrix.runner == 'jammy' && 'ubuntu-22.04' + ${{ matrix.build-config.runner == 'focal' && 'ubuntu-20.04' + || matrix.build-config.runner == 'jammy' && 'ubuntu-22.04' || null }} env: CCACHE_DIR: "${{github.workspace}}/.ccache" CCACHE_MAXSIZE: "10G" - CC: ${{matrix.compiler}}-${{matrix.version}} - CXX: ${{matrix.compiler == 'gcc' && 'g++' || 'clang++'}}-${{matrix.version}} + CC: ${{matrix.build-config.compiler}}-${{matrix.build-config.version}} + CXX: ${{matrix.build-config.compiler == 'gcc' && 'g++' || 'clang++'}}-${{matrix.build-config.version}} steps: + - name: Install Python (if building for Lightning) + if: 
matrix.use-lightning == true + uses: actions/setup-python@v5 + with: + python-version: '3.11' + - name: Install dependencies run: | sudo apt-get -q -y update sudo apt-get -q -y install \ ccache cmake ninja-build libgtest-dev \ - llvm-${{matrix.llvm}}-dev \ - ${{matrix.compiler}}-${{matrix.version}} \ - ${{matrix.compiler == 'gcc' && format('g++-{0}', matrix.version) || ''}} + llvm-${{matrix.build-config.llvm}}-dev \ + ${{matrix.build-config.compiler}}-${{matrix.build-config.version}} \ + ${{matrix.build-config.compiler == 'gcc' && format('g++-{0}', matrix.build-config.version) || ''}} + + if [[ "${{ matrix.use-lightning }}" == "true" ]]; then + echo "Installing Lightning Python dependencies..." + python -m pip install pennylane-lightning==0.43.0 + fi + echo "Installed toolchain:" ld --version | head -1 $CC --version | head -1 $CXX --version | head -1 - llvm-config-${{matrix.llvm}} --version | head -1 + llvm-config-${{matrix.build-config.llvm}} --version | head -1 + - name: Check out uses: actions/checkout@v4 + - name: Set up ccache uses: actions/cache@v4 with: path: ${{env.CCACHE_DIR}} - key: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-${{github.run_id}} - restore-keys: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}} + key: ccache-${{matrix.build-config.runner}}-${{matrix.build-config.compiler}}-${{matrix.build-config.version}}-${{github.run_id}} + restore-keys: ccache-${{matrix.build-config.runner}}-${{matrix.build-config.compiler}}-${{matrix.build-config.version}} + - name: Zero ccache stats run: | ccache -z + - name: Configure run: | + if [[ "${{ matrix.use-lightning }}" == "true" ]]; then + export LIGHTNING_PATH=$(bash ./scripts/lightning-path.sh qubit) + else + export LIGHTNING_PATH="" + fi mkdir build && cd build cmake -GNinja \ -DQIREE_GIT_DESCRIBE="${{github.event.pull_request @@ -67,23 +88,29 @@ jobs: -DQIREE_BUILD_TESTS:BOOL=ON \ -DQIREE_DEBUG:BOOL=ON \ -DQIREE_USE_XACC:BOOL=OFF \ + -DQIREE_USE_LIGHTNING:BOOL=${{ 
matrix.use-lightning == true && 'ON' || 'OFF' }} \ + -DQIREE_LIGHTNING_SIM_PATH="$LIGHTNING_PATH" \ -DCMAKE_BUILD_TYPE="Release" \ -DCMAKE_INSTALL_PREFIX="${{github.workspace}}/install" \ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic" \ .. + - name: Build all working-directory: build run: | ninja + - name: Run tests working-directory: build run: | ctest --parallel 2 --timeout 15 --output-on-failure + - name: Install working-directory: build run: | ninja install + - name: Show ccache stats run: | ccache -s diff --git a/.github/workflows/build-lightning.yml b/.github/workflows/build-lightning.yml deleted file mode 100644 index 052830c..0000000 --- a/.github/workflows/build-lightning.yml +++ /dev/null @@ -1,98 +0,0 @@ -# Build directly on the GitHub runner with caching -name: build-lightning -on: - workflow_dispatch: - workflow_call: - -concurrency: - group: build-lightning-${{github.ref}}-${{github.event.pull_request.number || github.run_number}}-${{github.workflow}} - cancel-in-progress: true - -jobs: - linux: - name: ${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-llvm${{matrix.llvm}} - strategy: - matrix: - include: - - runner: jammy - compiler: gcc - version: 12 - llvm: 14 - - runner: jammy - compiler: clang - version: 15 - llvm: 15 - runs-on: >- - ${{ matrix.runner == 'focal' && 'ubuntu-20.04' - || matrix.runner == 'jammy' && 'ubuntu-22.04' - || null - }} - env: - CCACHE_DIR: "${{github.workspace}}/.ccache" - CCACHE_MAXSIZE: "10G" - CC: ${{matrix.compiler}}-${{matrix.version}} - CXX: ${{matrix.compiler == 'gcc' && 'g++' || 'clang++'}}-${{matrix.version}} - steps: - - uses: actions/setup-python@v5 - name: Install Python - with: - python-version: '3.10' - - name: Install dependencies - run: | - sudo apt-get -q -y update - sudo apt-get -q -y install \ - ccache cmake ninja-build libgtest-dev \ - llvm-${{matrix.llvm}}-dev \ - ${{matrix.compiler}}-${{matrix.version}} \ - ${{matrix.compiler == 'gcc' && format('g++-{0}', 
matrix.version) || ''}} - echo "Installed toolchain:" - ld --version | head -1 - $CC --version | head -1 - $CXX --version | head -1 - llvm-config-${{matrix.llvm}} --version | head -1 - python -m pip install pennylane-lightning - - name: Check out - uses: actions/checkout@v4 - - name: Set up ccache - uses: actions/cache@v4 - with: - path: ${{env.CCACHE_DIR}} - key: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}}-${{github.run_id}} - restore-keys: ccache-${{matrix.runner}}-${{matrix.compiler}}-${{matrix.version}} - - name: Zero ccache stats - run: | - ccache -z - - name: Configure - run: | - export LIGHTNING_SIM_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')")/liblightning_qubit_catalyst.so - mkdir build && cd build - cmake -GNinja \ - -DQIREE_GIT_DESCRIBE="${{github.event.pull_request - && format(';-pr.{0};', github.event.pull_request.number) - || format(';-{0};', github.ref_name)}}" \ - -DQIREE_BUILD_TESTS:BOOL=ON \ - -DQIREE_DEBUG:BOOL=ON \ - -DQIREE_USE_XACC:BOOL=OFF \ - -DQIREE_USE_LIGHTNING:BOOL=ON \ - -DCMAKE_BUILD_TYPE="Release" \ - -DCMAKE_INSTALL_PREFIX="${{github.workspace}}/install" \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \ - -DCMAKE_CXX_FLAGS="-Wall -Wextra -pedantic" \ - .. 
- - name: Build all - working-directory: build - run: | - ninja - - name: Run tests - working-directory: build - run: | - ctest --parallel 2 --timeout 15 --output-on-failure - - name: Install - working-directory: build - run: | - ninja install - - name: Show ccache stats - run: | - ccache -s - -# vim: set nowrap tw=100: diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ba1193..a4704ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,12 +139,23 @@ if(QIREE_USE_QSIM) endif() if(QIREE_USE_LIGHTNING) -qiree_add_library(qiree_lightning INTERFACE) -add_library(QIREE::lightning ALIAS qiree_lightning) -target_include_directories(qiree_lightning SYSTEM INTERFACE - "$" - "$" -) + qiree_add_library(qiree_lightning INTERFACE) + add_library(QIREE::lightning ALIAS qiree_lightning) + # Fetch Catalyst runtime include files + include("${CMAKE_CURRENT_LIST_DIR}/cmake/support_catalyst.cmake") + FindCatalyst(qiree_lightning) + target_include_directories(qiree_lightning SYSTEM INTERFACE + "$" + "$" + ) + set(QIREE_LIGHTNING_SIM_PATH "" CACHE FILEPATH "Path to the Lightning simulator shared library") + if(NOT QIREE_LIGHTNING_SIM_PATH) + message(FATAL_ERROR "QIREE_LIGHTNING_SIM_PATH is not set. Please specify the path using: -DQIREE_LIGHTNING_SIM_PATH=/path/to/lib") + endif() + message(STATUS "Using Lightning simulator shared library: ${QIREE_LIGHTNING_SIM_PATH}") + target_compile_definitions(qiree_lightning INTERFACE + QIREE_LIGHTNING_RTDLIB="${QIREE_LIGHTNING_SIM_PATH}" + ) endif() diff --git a/cmake/support_catalyst.cmake b/cmake/support_catalyst.cmake new file mode 100644 index 0000000..95c7f73 --- /dev/null +++ b/cmake/support_catalyst.cmake @@ -0,0 +1,80 @@ +############################################################################################### +# This file provides macros to process Catalyst. 
+############################################################################################### + +# Include this only once +include_guard() + +macro(FindCatalyst target_name) + if(LIGHTNING_CATALYST_SRC_PATH) + if(NOT IS_ABSOLUTE ${LIGHTNING_CATALYST_SRC_PATH}) + message(FATAL_ERROR " LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH} must be set to an absolute path") + endif() + if(CATALYST_GIT_TAG) + message(WARN " Setting `LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH}` overrides `CATALYST_GIT_TAG=${CATALYST_GIT_TAG}`") + endif() + + # Acquire local git hash and use for CATALYST_GIT_TAG + execute_process(COMMAND git rev-parse --short HEAD + WORKING_DIRECTORY ${LIGHTNING_CATALYST_SRC_PATH} + OUTPUT_VARIABLE CATALYST_GIT_TAG + ) + message(INFO " Building against local Catalyst - path: ${LIGHTNING_CATALYST_SRC_PATH} - GIT TAG: ${CATALYST_GIT_TAG}") + + target_include_directories(${target_name} INTERFACE + $ + $ + ) + + else() + if(NOT CATALYST_GIT_TAG) + set(CATALYST_GIT_TAG "v0.13.0" CACHE STRING "GIT_TAG value to build Catalyst") + endif() + message(INFO " Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}") + + set(CATALYST_DOWNLOAD_INCLUDE_DIR "${PROJECT_BINARY_DIR}/catalyst-headers") + + # Fetching /lib/backend/common hpp headers + set(LIB_BACKEND_COMMON_HEADERS CacheManager.hpp + QubitManager.hpp + Utils.hpp + ) + + foreach(HEADER ${LIB_BACKEND_COMMON_HEADERS}) + string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME ${HEADER}) + FetchContent_Declare( + ${HEADER_NAME} + URL https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/lib/backend/common/${HEADER} + DOWNLOAD_NO_EXTRACT True + SOURCE_DIR ${CATALYST_DOWNLOAD_INCLUDE_DIR} + ) + + FetchContent_MakeAvailable(${HEADER_NAME}) + endforeach() + + # Fetching include hpp headers + set(INCLUDE_HEADERS DataView.hpp + Exception.hpp + QuantumDevice.hpp + RuntimeCAPI.h + Types.h + ) + + foreach(HEADER ${INCLUDE_HEADERS}) + string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME 
${HEADER}) + FetchContent_Declare( + ${HEADER_NAME} + URL https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/include/${HEADER} + DOWNLOAD_NO_EXTRACT True + SOURCE_DIR ${CATALYST_DOWNLOAD_INCLUDE_DIR} + ) + + FetchContent_MakeAvailable(${HEADER_NAME}) + endforeach() + + target_include_directories(${target_name} INTERFACE + $ + ) + + endif() +endmacro() diff --git a/scripts/lightning-path.sh b/scripts/lightning-path.sh new file mode 100755 index 0000000..b459b5f --- /dev/null +++ b/scripts/lightning-path.sh @@ -0,0 +1,61 @@ +#!/bin/bash -e +# +# This script determines the absolute path to a PennyLane-Lightning simulator +# library to be used with QIREE's Lightning backend. +# +# Usage: +# ./scripts/lightning-path.sh +# +# Example: +# ./scripts/lightning-path.sh qubit + +if [ -z "$1" ]; then + echo "Error: Missing argument. Usage: $0 " >&2 + echo "Example: $0 qubit" >&2 + exit 1 +fi + +# Validate simulator type +case "$1" in + qubit|gpu|kokkos) + ;; + *) + echo "Error: Invalid simulator type '$1'. Must be one of 'qubit', 'gpu', or 'kokkos'." >&2 + exit 1 + ;; +esac + +SIM_TYPE="$1" + +# Determine OS-specific library suffix +UNAME_S=$(uname -s) +case "$UNAME_S" in + Linux*) LIB_SUFFIX=".so";; + Darwin*) LIB_SUFFIX=".dylib";; + *) + echo "Error: Unsupported platform '$UNAME_S'. QIREE with PennyLane-Lightning only supports Linux and macOS." >&2 + exit 1 + ;; +esac + +# Find the base PennyLane-Lightning installation directory using Python +BASE_PATH=$(python -c "import site; print(f'{site.getsitepackages()[0]}/pennylane_lightning')") + +if [ -z "$BASE_PATH" ]; then + echo "Error: Could not determine pennylane_lightning path via Python." >&2 + echo "Is pennylane-lightning installed in your current Python environment?" >&2 + exit 1 +fi + +# Construct the full library path +LIB_NAME="liblightning_${SIM_TYPE}_catalyst${LIB_SUFFIX}" +FULL_PATH="${BASE_PATH}/${LIB_NAME}" + +# Check if the file actually exists +if [ ! 
-f "$FULL_PATH" ]; then + echo "Error: Simulator library not found at: $FULL_PATH" >&2 + echo "Ensure you have installed the correct simulator type ('$SIM_TYPE')" >&2 + exit 1 +fi + +echo "$FULL_PATH" diff --git a/src/qirlightning/CMakeLists.txt b/src/qirlightning/CMakeLists.txt index b02e572..086e2e8 100644 --- a/src/qirlightning/CMakeLists.txt +++ b/src/qirlightning/CMakeLists.txt @@ -4,57 +4,16 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #----------------------------------------------------------------------------# -# Fetch Catalyst runtime include files -include(FetchContent) - -include("${CMAKE_CURRENT_SOURCE_DIR}/support_catalyst.cmake") -FindCatalyst(qirlightning) - -# Set the path to the lightning simulator shared library -if(DEFINED ENV{LIGHTNING_SIM_PATH}) - set(RTDLIB_PATH "$ENV{LIGHTNING_SIM_PATH}") - message(STATUS "RTDLIB_PATH set from environment variable LIGHTNING_SIM_PATH: ${RTDLIB_PATH}") -else() - # Throw an error if the environment variable is not defined - message(FATAL_ERROR "Environment variable LIGHTNING_SIM_PATH is not defined. Please set it to the path of the Lightning simulator shared library.") -endif() - -# Set the device name for the lightning simulator -execute_process( - COMMAND nm -DC "${RTDLIB_PATH}" | grep " Factory" - OUTPUT_VARIABLE GREP_OUTPUT - ERROR_QUIET - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - -if(GREP_OUTPUT) - string(REGEX MATCH "T (.*)Factory" SYMBOL_MATCH "${GREP_OUTPUT}") - if(SYMBOL_MATCH) - string(REGEX REPLACE "T (.*)Factory" "\\1" RTDDEVICE_NAME "${SYMBOL_MATCH}") - message(STATUS "Found Lightning Simulator. Extracted RTDDEVICE_NAME: ${RTDDEVICE_NAME}") - else() - message(FATAL_ERROR "Symbol 'Factory' found, but regex failed to extract.") - endif() -else() - message(FATAL_ERROR "Symbol 'Factory' not found in ${RTDLIB_PATH}. 
Please ensure LIGHTNING_SIM_PATH is set correctly.") -endif() - - # Adding lightning as a library to qiree qiree_add_library(qirlightning -LightningQuantum.cc -LightningRuntime.cc -) - -target_compile_definitions(qirlightning PRIVATE - RTDLIB="${RTDLIB_PATH}" - RTDDEVICE="${RTDDEVICE_NAME}" + LightningQuantum.cc + LightningRuntime.cc ) #Link the lightning library to qiree and any other relevant libraries target_link_libraries(qirlightning PUBLIC QIREE::qiree # Link to qiree - PRIVATE QIREE::lightning + PUBLIC QIREE::lightning ) #----------------------------------------------------------------------------# diff --git a/src/qirlightning/LightningQuantum.cc b/src/qirlightning/LightningQuantum.cc index d3538a2..47c4545 100644 --- a/src/qirlightning/LightningQuantum.cc +++ b/src/qirlightning/LightningQuantum.cc @@ -33,20 +33,30 @@ LightningQuantum::LightningQuantum(std::ostream& os, unsigned long int seed) : output_(os), seed_(seed) { auto rtld_flags = RTLD_LAZY | RTLD_NODELETE; - rtd_dylib_handler_ = dlopen(RTDLIB, rtld_flags); + rtd_dylib_handler_ = dlopen(QIREE_LIGHTNING_RTDLIB, rtld_flags); QIREE_VALIDATE(rtd_dylib_handler_, - << "failed to load Lightning runtime library '" << RTDLIB - << "'"); + << "failed to load Lightning runtime library '" + << QIREE_LIGHTNING_RTDLIB << "'"); // Find device factory - std::string rtd_device = RTDDEVICE; - std::string factory_name = rtd_device + "Factory"; - factory_f_ptr_ = dlsym(rtd_dylib_handler_, factory_name.c_str()); + std::vector const factory_names + = {"LightningSimulatorFactory", + "LightningKokkosSimulatorFactory", + "LightningGPUSimulatorFactory"}; + + for (auto const& factory_name : factory_names) + { + dlerror(); + factory_f_ptr_ = dlsym(rtd_dylib_handler_, factory_name.c_str()); + if (factory_f_ptr_) + { + break; + } + } QIREE_VALIDATE(factory_f_ptr_, - << "failed to find device factory function '" - << factory_name << "'"); + << "failed to find valid device factory function"); } 
//---------------------------------------------------------------------------// diff --git a/src/qirlightning/README.md b/src/qirlightning/README.md index 4058002..cc03c38 100644 --- a/src/qirlightning/README.md +++ b/src/qirlightning/README.md @@ -7,18 +7,20 @@ The [PennyLane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) pl ## Installing a Lightning simulator -For more information on installing Pennylane Lightning simulators from source, please visit the [Lightning installation page](https://docs.pennylane.ai/projects/lightning/en/latest/dev/installation.html). Note: QIREE is tested to work with PennyLane Lightning simulators v0.42. +For more information on installing Pennylane Lightning simulators from source, please visit the [Lightning installation page](https://docs.pennylane.ai/projects/lightning/en/latest/dev/installation.html). + +**Note:** QIREE is tested to work with PennyLane Lightning simulators v0.43. ### Quick start The easiest way to get started is to install a Lightning simulator (`pennylane-lightning`/`pennylane-lightning-gpu`/`pennylane-lightning-kokkos`) from PyPI via pip: ``` -$ pip install pennylane-lightning-kokkos==0.42.0 +$ pip install pennylane-lightning-kokkos==0.43.0 $ pip show pennylane-lightning-kokkos Name: PennyLane_Lightning_Kokkos -Version: 0.42.0 +Version: 0.43.0 Summary: PennyLane-Lightning plugin Home-page: https://github.com/PennyLaneAI/pennylane-lightning Author: @@ -44,24 +46,31 @@ $ ls $PL_PATH ... liblightning_qubit_catalyst.so liblightning_kokkos_catalyst.so ... ``` +The helper script `qiree/scripts/lightning-path.sh ` can be used to obtain the absolute path of the shared library. 
+ ## Compile QIR-EE with Lightning backend To compile QIR-EE with lightning backend: ``` -# Set the path for the lightning simulator shared library -export LIGHTNING_SIM_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/pennylane_lightning')")/liblightning_kokkos_catalyst.so - -# Proceed with usual build instructions, but with `-DQIREE_USE_LIGHTNING=ON` cmake flag cd qiree/ + +# Set the path for the lightning simulator shared library using the +# helper script. Update to qubit / gpu / kokkos as required. + +export LIGHTNING_SIM_PATH=$(bash ./scripts/lightning-path.sh ) + +# Proceed with usual build instructions +# but with the extra `-DQIREE_USE_LIGHTNING=ON` and +# `-DQIREE_LIGHTNING_SIM_PATH` cmake flags + mkdir build; cd build -cmake -DQIREE_USE_LIGHTNING=ON .. +cmake -DQIREE_USE_LIGHTNING=ON -DQIREE_LIGHTNING_SIM_PATH=$LIGHTNING_SIM_PATH .. make ``` **Note:** -- replace `libligghtning_kokkos_catalyst.so` with `liblightning_qubit_catalyst.so` or `liblightning_GPU_catalyst.so` if required. - when running with `lightning.gpu` simulator for Nvidia GPUs, include `cuquantum` libraries in the library path (which will be installed as a dependency from Python), i.e. ``` diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md new file mode 100644 index 0000000..ef08d55 --- /dev/null +++ b/src/qirlightning/simple_demo/README.md @@ -0,0 +1,69 @@ +# Simple Demo for Catalyst/Lightning runtime + +This is a super simple demo for driving Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present). + +Some Catalyst include files are copied here for convenience - they are in `./snapshot_catalyst_runtime/include`. These are required for the QuantumDevice interface. For the qiree source, these files are fetched automatically during CMake, and these are not used. 
+ +## Installing a lightning simulator + +When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulators installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. + +To get started, run `pip install pennylane` or `pip install pennylane-lightning` - this will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. + +Example: +``` +$ pip install pennylane-lightning-kokkos + +$ pip show pennylane-lightning-kokkos +Name: PennyLane_Lightning_Kokkos +Version: 0.40.0 +Summary: PennyLane-Lightning plugin +Home-page: https://github.com/PennyLaneAI/pennylane-lightning +Author: +Author-email: +License: Apache License 2.0 +Location: +Requires: pennylane, pennylane-lightning + +$ ls /pennylane_lightning +... liblightning_kokkos_catalyst.so ... +``` + +You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.qubit simulators. + +## Compilation + +First update the `RTDLIB` in `test_rt_device.cpp` to the local path where lightning is installed (i.e. `` from above). 
+ +To compile: + +``` +$ clang++ --std=c++20 test_rt_device.cpp -I./snapshot_catalyst_runtime/include -o test_rt_device.out +``` + +## Running the example + +To run: + +``` +$ ./test_rt_device.out +Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set + In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads + For best performance with OpenMP 3.1 set OMP_PROC_BIND=true + For unit testing set OMP_PROC_BIND=false + +Num Qubits = 3 +State = +*** State-Vector of Size 8 *** +[(0.707107,0), (0,0), (0,0), (0,0), (0.707107,0), (0,0), (0,0), (0,0)] +Measure on wire 0 = 0 +``` + +## Running on other devices + +To run on other devices, e.g. lightning.gpu, you need to change: +- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu` +In the c++ file: +- replace `RTDLIB` from `kokkos` to `gpu` +- replace `RTDDEVICE` from `Kokkos` to `GPU` +- Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst new file mode 100644 index 0000000..8a881e5 --- /dev/null +++ b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst @@ -0,0 +1,118 @@ +.. runtime-start-inclusion-marker-do-not-remove + +Catalyst Quantum Runtime +######################## + +The Catalyst Runtime is a C++ QIR runtime that enables the execution of Catalyst-compiled +quantum programs, and is currently backed by `PennyLane-Lightning `_ +state-vector simulators, and `Amazon Braket `__ +devices. Additional hardware support, including QPUs, to come. + +The runtime employs the `QuantumDevice `_ +public interface to support an extensible list of backend devices. 
This interface comprises two collections of abstract methods: + +- The Qubit management, device shot noise, and quantum tape recording methods are utilized for the implementation of Quantum Runtime (QR) instructions. + +- The quantum operations, observables, measurements, and gradient methods are used to implement Quantum Instruction Set (QIS) instructions. + +A complete list of instructions supported by the runtime can be found in +`RuntimeCAPI.h `_. + +Contents +======== + +The directory is structured as follows: + +- `include `_: + This contains the public header files of the runtime including the ``QuantumDevice`` API + for backend quantum devices and the runtime CAPI. + +- `lib `_: + The core modules of the runtime are structured into ``lib/capi`` and ``lib/backend``. + `lib/capi `_ implements the semantics for + QIR instructions lowered to our custom runtime. `lib/backend `_ + contains implementations of the ``QuantumDevice`` API for backend simulators. + +- `tests `_: + A collection of C++ tests for modules and methods in the runtime. + +Backend Devices +=============== + +New device backends for the runtime can be realized by implementing the quantum device interface. +The following table shows the available devices along with supported features: + +.. 
list-table:: + :widths: 25 25 25 25 + :header-rows: 0 + + * - **Features** + - **PennyLane-Lightning-Qubit** + - **PennyLane-Lightning-Kokkos** and **PennyLane-Lightning-GPU** + - **Amazon-Braket-OpenQasm** + * - Qubit Management + - Dynamic allocation/deallocation + - Static allocation/deallocation + - Static allocation/deallocation + * - Gate Operations + - `Lightning operations `_ + - `Lightning operations `_ without controlled gates support + - `Braket operations `_ + * - Quantum Observables + - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables + - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables + - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, and Tensor Product of Observables + * - Expectation Value + - All observables; Finite-shots supported + - All observables; Finite-shots supported + - All observables; Finite-shots supported + * - Variance + - All observables; Finite-shots supported + - All observables; Finite-shots supported + - All observables; Finite-shots supported + * - Probability + - Only for the computational basis on the supplied qubits; Finite-shots supported + - Only for the computational basis on the supplied qubits; Finite-shots supported + - The computational basis on all active qubits; Finite-shots supported + * - Sampling + - Only for the computational basis on the supplied qubits + - Only for the computational basis on the supplied qubits + - The computational basis on all active qubits; Finite-shots supported + * - Mid-Circuit Measurement + - Only for the computational basis on the supplied qubit + - Only for the computational basis on the supplied qubit + - Not supported + * - Gradient + - The Adjoint-Jacobian method for expectation values on all observables + - The Adjoint-Jacobian method for expectation values on all observables + - Not 
supported + +Requirements +============ + +To build the runtime from source, it is required to have an up to date version of a C/C++ compiler such as gcc or clang +with support for the C++20 standard library. + +Installation +============ + +By default, the runtime builds all supported backend devices. +You can build the runtime with custom devices from the list of Backend Devices. + +You can use ``ENABLE_OPENQASM=OFF`` to disable building the runtime with `Amazon-Braket-OpenQasm `_: + +.. code-block:: console + + make runtime ENABLE_OPENQASM=OFF + +This device currently offers generators for the `OpenQasm3 `_ specification and +`Amazon Braket `__ assembly extension. +Moreover, the generated assembly can be executed on Amazon Braket devices leveraging `amazon-braket-sdk-python `_. + +To check the runtime test suite from the root directory: + +.. code-block:: console + + make test-runtime + +.. runtime-end-inclusion-marker-do-not-remove diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp new file mode 100644 index 0000000..616b9dc --- /dev/null +++ b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp @@ -0,0 +1,173 @@ +// Copyright 2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +/** + * A multi-dimensional view for MemRef-like and std::vector types. 
+ * + * @tparam T The underlying data type + * @tparam R The Rank (R > 0) + * + * @note A forward iterator is implemented in this view for traversing over the + * entire elements of MemRef types rank-by-rank starting from the last + * dimension (R-1). For example, The DataView iterator for MemRef starts + * from index (0, 0) and traverses elements in the following order: (0, 0), + * ..., (0, sizes[1]-1), (1, 0), ..., (1, sizes[1]-1), ... (sizes[0]-1, + * sizes[1]-1). + */ +template +class DataView +{ + private: + T* data_aligned; + size_t offset; + size_t sizes[R] = {0}; + size_t strides[R] = {0}; + + public: + class iterator + { + private: + DataView const& view; + + int64_t loc; // physical index + size_t indices[R] = {0}; + + public: + using iterator_category = std::forward_iterator_tag; // LCOV_EXCL_LINE + using value_type = T; // LCOV_EXCL_LINE + using difference_type = std::ptrdiff_t; // LCOV_EXCL_LINE + using pointer = T*; // LCOV_EXCL_LINE + using reference = T&; // LCOV_EXCL_LINE + + iterator(DataView const& _view, int64_t begin_idx) + : view(_view), loc(begin_idx) + { + } + pointer operator->() const { return &view.data_aligned[loc]; } + reference operator*() const { return view.data_aligned[loc]; } + iterator& operator++() + { + int64_t next_axis = -1; + int64_t idx; + for (int64_t i = R; i > 0; --i) + { + idx = i - 1; + if (indices[idx]++ < view.sizes[idx] - 1) + { + next_axis = idx; + break; + } + indices[idx] = 0; + loc -= (view.sizes[idx] - 1) * view.strides[idx]; + } + + loc = next_axis == -1 ? -1 : loc + view.strides[next_axis]; + return *this; + } + iterator operator++(int) + { + auto tmp = *this; + int64_t next_axis = -1; + int64_t idx; + for (int64_t i = R; i > 0; --i) + { + idx = i - 1; + if (indices[idx]++ < view.sizes[idx] - 1) + { + next_axis = idx; + break; + } + indices[idx] = 0; + loc -= (view.sizes[idx] - 1) * view.strides[idx]; + } + + loc = next_axis == -1 ? 
-1 : loc + view.strides[next_axis]; + return tmp; + } + bool operator==(iterator const& other) const + { + return (loc == other.loc + && view.data_aligned == other.view.data_aligned); + } + bool operator!=(iterator const& other) const + { + return !(*this == other); + } + }; + + explicit DataView(std::vector& buffer) + : data_aligned(buffer.data()), offset(0) + { + static_assert(R == 1, "[Class: DataView] Assertion: R == 1"); + sizes[0] = buffer.size(); + strides[0] = 1; + } + + explicit DataView(T* _data_aligned, + size_t _offset, + size_t const* _sizes, + size_t const* _strides) + : data_aligned(_data_aligned), offset(_offset) + { + static_assert(R > 0, "[Class: DataView] Assertion: R > 0"); + if (_sizes != nullptr && _strides != nullptr) + { + for (size_t i = 0; i < R; i++) + { + sizes[i] = _sizes[i]; + strides[i] = _strides[i]; + } + } // else sizes = {0}, strides = {0} + } + + [[nodiscard]] auto size() const -> size_t + { + if (!data_aligned) + { + return 0; + } + + size_t tsize = 1; + for (size_t i = 0; i < R; i++) + { + tsize *= sizes[i]; + } + return tsize; + } + + template + T& operator()(I... idxs) const + { + static_assert(sizeof...(idxs) == R, + "[Class: DataView] Error in Catalyst Runtime: Wrong " + "number of indices"); + size_t indices[] = {static_cast(idxs)...}; + + size_t loc = offset; + for (size_t axis = 0; axis < R; axis++) + { + RT_ASSERT(indices[axis] < sizes[axis]); + loc += indices[axis] * strides[axis]; + } + return data_aligned[loc]; + } + + iterator begin() { return iterator{*this, static_cast(offset)}; } + + iterator end() { return iterator{*this, -1}; } +}; diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp new file mode 100644 index 0000000..4e8272d --- /dev/null +++ b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp @@ -0,0 +1,96 @@ +// Copyright 2023 Xanadu Quantum Technologies Inc. 
+ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +/** + * @brief Macro that throws `RuntimeException` with given message. + */ +#define RT_FAIL(message) \ + Catalyst::Runtime::_abort((message), __FILE__, __LINE__, __func__) + +/** + * @brief Macro that throws `RuntimeException` if expression evaluates + * to true. + */ +#define RT_FAIL_IF(expression, message) \ + if ((expression)) \ + { \ + RT_FAIL(message); \ + } + +/** + * @brief Macro that throws `RuntimeException` with the given expression + * and source location if expression evaluates to false. + */ +#define RT_ASSERT(expression) \ + RT_FAIL_IF(!(expression), "Assertion: " #expression) + +namespace Catalyst::Runtime +{ + +/** + * @brief This is the general exception thrown by Catalyst for runtime errors + * that is derived from `std::exception`. 
+ */ +class RuntimeException : public std::exception +{ + private: + std::string const err_msg; + + public: + explicit RuntimeException(std::string msg) noexcept + : err_msg{std::move(msg)} + { + } // LCOV_EXCL_LINE + ~RuntimeException() override = default; // LCOV_EXCL_LINE + + RuntimeException(RuntimeException const&) = default; + RuntimeException(RuntimeException&&) noexcept = default; + + RuntimeException& operator=(RuntimeException const&) = delete; + RuntimeException& operator=(RuntimeException&&) = delete; + + [[nodiscard]] auto what() const noexcept -> char const* override + { + return err_msg.c_str(); + } // LCOV_EXCL_LINE +}; + +/** + * @brief Throws a `RuntimeException` with the given error message. + * + * @note This is not supposed to be called directly. + */ +[[noreturn]] inline void _abort(char const* message, + char const* file_name, + size_t line, + char const* function_name) +{ + std::stringstream sstream; + sstream << "[" << file_name << "][Line:" << line + << "][Function:" << function_name + << "] Error in Catalyst Runtime: " << message; + + throw RuntimeException(sstream.str()); +} // LCOV_EXCL_LINE + +} // namespace Catalyst::Runtime diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp new file mode 100644 index 0000000..6794033 --- /dev/null +++ b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp @@ -0,0 +1,399 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "DataView.hpp" +#include "Types.h" + +// A helper template macro to generate the Factory method by +// calling (kwargs). Check the Custom Devices guideline for +// details: +// https://docs.pennylane.ai/projects/catalyst/en/stable/dev/custom_devices.html +#define GENERATE_DEVICE_FACTORY(IDENTIFIER, CONSTRUCTOR) \ + extern "C" Catalyst::Runtime::QuantumDevice* IDENTIFIER##Factory( \ + const char* kwargs) \ + { \ + return new CONSTRUCTOR(std::string(kwargs)); \ + } + +namespace Catalyst::Runtime +{ + +/** + * @brief struct API for backend quantum devices. + * + * This device API contains, + * - a set of methods to manage qubit allocations and deallocations, device + * shot noise, and quantum tape recording as well as reference values for the + * result data-type; these are used to implement Quantum Runtime (QR) + * instructions. + * + * - a set of methods for quantum operations, observables, measurements, and + * gradient of the device; these are used to implement Quantum Instruction Set + * (QIS) instructions. + * + */ +struct QuantumDevice +{ + QuantumDevice() = default; // LCOV_EXCL_LINE + virtual ~QuantumDevice() = default; // LCOV_EXCL_LINE + + QuantumDevice& operator=(QuantumDevice const&) = delete; + QuantumDevice(QuantumDevice const&) = delete; + QuantumDevice(QuantumDevice&&) = delete; + QuantumDevice& operator=(QuantumDevice&&) = delete; + + /** + * @brief Allocate a qubit. 
+ * + * @return `QubitIdType` + */ + virtual auto AllocateQubit() -> QubitIdType = 0; + + /** + * @brief Allocate a vector of qubits. + * + * @param num_qubits The number of qubits to allocate. + * + * @return `std::vector` + */ + virtual auto AllocateQubits(size_t num_qubits) -> std::vector + = 0; + + /** + * @brief Release a qubit. + * + * @param qubit The id of the qubit + */ + virtual void ReleaseQubit(QubitIdType qubit) = 0; + + /** + * @brief Release all qubits. + */ + virtual void ReleaseAllQubits() = 0; + + /** + * @brief Get the number of allocated qubits. + * + * @return `size_t` + */ + [[nodiscard]] virtual auto GetNumQubits() const -> size_t = 0; + + /** + * @brief Set the number of device shots. + * + * @param shots The number of noise shots + */ + virtual void SetDeviceShots(size_t shots) = 0; + + /** + * @brief Get the number of device shots. + * + * @return `size_t` + */ + [[nodiscard]] virtual auto GetDeviceShots() const -> size_t = 0; + + /** + * @brief Set the PRNG of the device. + * + * The Catalyst runtime enables seeded program execution on non-hardware + * devices. A random number generator instance is managed by the runtime to + * predictably generate results for non-deterministic programs, such as + * those involving `Measure` calls. Devices implementing support for this + * feature do not need to use the provided PRNG instance as their sole + * source of random numbers, but it is expected that the the same instance + * state will predictable and reproducibly generate the same program + * results. It is also expected that the provided PRNG state is evolved + * sufficiently so that two device executions sharing the same instance do + * not produce identical results. The provided PRNG instance is not + * thread-locked, and devices wishing to share it across threads will need + * to provide their own thread-safety. + * + * @param gen The std::mt19937 PRNG object. 
+ */ + virtual void SetDevicePRNG([[maybe_unused]] std::mt19937* gen) {}; + + /** + * @brief Start recording a quantum tape if provided. + * + * @note This is backed by the `Catalyst::Runtime::CacheManager` + * property in the device implementation. + */ + virtual void StartTapeRecording() = 0; + + /** + * @brief Stop recording a quantum tape if provided. + * + * @note This is backed by the `Catalyst::Runtime::CacheManager` + * property in the device implementation. + */ + virtual void StopTapeRecording() = 0; + + /** + * @brief Result value for "Zero" used in the measurement process. + * + * @return `Result` + */ + [[nodiscard]] virtual auto Zero() const -> Result = 0; + + /** + * @brief Result value for "One" used in the measurement process. + * + * @return `Result` + */ + [[nodiscard]] virtual auto One() const -> Result = 0; + + /** + * @brief A helper method to print the state vector of a device. + */ + virtual void PrintState() = 0; + + /** + * @brief Prepare subsystems using the given ket vector in the + * computational basis. + * + * @param state A state vector of size 2**len(wires) + * @param wires The wire(s) the operation acts on + */ + virtual void + SetState([[maybe_unused]] DataView, 1>& state, + [[maybe_unused]] std::vector& wires) + { + RT_FAIL("Unsupported functionality"); + } + + /** + * @brief Prepares a single computational basis state. + * + * @param n Prepares the basis state |n>, where n is an array of integers + * from the set {0, 1} + * @param wires The wire(s) the operation acts on + */ + virtual void SetBasisState([[maybe_unused]] DataView& n, + [[maybe_unused]] std::vector& wires) + { + RT_FAIL("Unsupported functionality"); + } + + /** + * @brief Apply a single gate to the state vector of a device with its name + * if this is supported. 
+ * + * @param name The name of the gate to apply + * @param params Optional parameter list for parametric gates + * @param wires Wires to apply gate to + * @param inverse Indicates whether to use inverse of gate + * @param controlled_wires Optional controlled wires applied to the + * operation + * @param controlled_values Optional controlled values applied to the + * operation + */ + virtual void NamedOperation( + std::string const& name, + std::vector const& params, + std::vector const& wires, + [[maybe_unused]] bool inverse = false, + [[maybe_unused]] std::vector const& controlled_wires = {}, + [[maybe_unused]] std::vector const& controlled_values = {}) + = 0; + + /** + * @brief Apply a given matrix directly to the state vector of a device. + * + * @param matrix The matrix of data in row-major format + * @param wires Wires to apply gate to + * @param inverse Indicates whether to use inverse of gate + * @param controlled_wires Controlled wires applied to the operation + * @param controlled_values Controlled values applied to the operation + */ + virtual void MatrixOperation( + std::vector> const& matrix, + std::vector const& wires, + [[maybe_unused]] bool inverse = false, + [[maybe_unused]] std::vector const& controlled_wires = {}, + [[maybe_unused]] std::vector const& controlled_values = {}) + = 0; + + /** + * @brief Construct a named (Identity, PauliX, PauliY, PauliZ, and + * Hadamard) or Hermitian observable. + * + * @param id The type of the observable + * @param matrix The matrix of data to construct a hermitian observable + * @param wires Wires to apply observable to + * + * @return `ObsIdType` Index of the constructed observable + */ + virtual auto Observable(ObsId id, + std::vector> const& matrix, + std::vector const& wires) -> ObsIdType + = 0; + + /** + * @brief Construct a tensor product of observables. 
+ * + * @param obs The vector of observables indices of type ObsIdType + * + * @return `ObsIdType` Index of the constructed observable + */ + virtual auto TensorObservable(std::vector const& obs) + -> ObsIdType + = 0; + + /** + * @brief Construct a Hamiltonian observable. + * + * @param coeffs The vector of coefficients + * @param obs The vector of observables indices of size `coeffs` + * + * @return `ObsIdType` Index of the constructed observable + */ + virtual auto HamiltonianObservable(std::vector const& coeffs, + std::vector const& obs) + -> ObsIdType + = 0; + + /** + * @brief Compute the expected value of an observable. + * + * @param obsKey The index of the constructed observable + * + * @return `double` The expected value + */ + virtual auto Expval(ObsIdType obsKey) -> double = 0; + + /** + * @brief Compute the variance of an observable. + * + * @param obsKey The index of the constructed observable + * + * @return `double` The variance + */ + virtual auto Var(ObsIdType obsKey) -> double = 0; + + /** + * @brief Get the state-vector of a device. + * + * @param state The pre-allocated `DataView, 1>` + */ + virtual void State(DataView, 1>& state) = 0; + + /** + * @brief Compute the probabilities of each computational basis state. + + * @param probs The pre-allocated `DataView` + */ + virtual void Probs(DataView& probs) = 0; + + /** + * @brief Compute the probabilities for a subset of the full system. + * + * @param probs The pre-allocated `DataView` + * @param wires Wires will restrict probabilities to a subset of the full + * system + */ + virtual void PartialProbs(DataView& probs, + std::vector const& wires) + = 0; + + /** + * @brief Compute samples with the number of shots on the entire wires, + * returing raw samples. + * + * @param samples The pre-allocated `DataView`representing a + * matrix of shape `shots * numQubits`. The built-in iterator in + * `DataView` iterates over all elements of `samples` row-wise. 
+ * @param shots The number of shots + */ + virtual void Sample(DataView& samples, size_t shots) = 0; + + /** + * @brief Compute partial samples with the number of shots on `wires`, + * returing raw samples. + * + * @param samples The pre-allocated `DataView`representing a + * matrix of shape `shots * numWires`. The built-in iterator in + * `DataView` iterates over all elements of `samples` row-wise. + * @param wires Wires to compute samples on + * @param shots The number of shots + */ + virtual void PartialSample(DataView& samples, + std::vector const& wires, + size_t shots) + = 0; + + /** + * @brief Sample with the number of shots on the entire wires, returning + * the number of counts for each sample. + * + * @param eigvals The pre-allocated `DataView` + * @param counts The pre-allocated `DataView` + * @param shots The number of shots + */ + virtual void Counts(DataView& eigvals, + DataView& counts, + size_t shots) + = 0; + + /** + * @brief Partial sample with the number of shots on `wires`, returning the + * number of counts for each sample. + * + * @param eigvals The pre-allocated `DataView` + * @param counts The pre-allocated `DataView` + * @param wires Wires to compute samples on + * @param shots The number of shots + */ + virtual void PartialCounts(DataView& eigvals, + DataView& counts, + std::vector const& wires, + size_t shots) + = 0; + + /** + * @brief A general measurement method that acts on a single wire. + * + * @param wire The wire to compute Measure on + * @param postselect Which basis state to postselect after a mid-circuit + measurement (-1 denotes no post-selection) + + * @return `Result` The measurement result + */ + virtual auto Measure(QubitIdType wire, std::optional postselect) + -> Result + = 0; + + /** + * @brief Compute the gradient of a quantum tape, that is cached using + * `Catalyst::Runtime::Simulator::CacheManager`, for a specific set of + * trainable parameters. 
+ * + * @param gradients The vector of pre-allocated `DataView*` + * to store gradients resutls for the list of cached observables. + * @param trainParams The vector of trainable parameters; if none, all + * parameters would be assumed trainable + * + */ + virtual void Gradient(std::vector>& gradients, + std::vector const& trainParams) + = 0; +}; +} // namespace Catalyst::Runtime diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h new file mode 100644 index 0000000..a90f69d --- /dev/null +++ b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h @@ -0,0 +1,179 @@ +// Copyright 2022-2023 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#ifndef TYPES_H +# define TYPES_H + +# include +# include +# include + +# ifdef __cplusplus +extern "C" { +# endif + +// Qubit, Result and Observable types +struct QUBIT; +using QubitIdType = intptr_t; + +using RESULT = bool; +using Result = RESULT*; +using QirArray = void*; + +using ObsIdType = intptr_t; + +enum ObsId : int8_t +{ + Identity = 0, + PauliX, + PauliY, + PauliZ, + Hadamard, + Hermitian, +}; + +enum ObsType : int8_t +{ + Basic = 0, + TensorProd, + Hamiltonian, +}; + +// complex type +struct CplxT_float +{ + float real; + float imag; +}; + +// complex type +struct CplxT_double +{ + double real; + double imag; +}; + +enum NumericType : int8_t +{ + idx = 0, + i1, + i8, + i16, + i32, + i64, + f32, + f64, + c64, + c128, +}; + +// MemRefT type +struct OpaqueMemRefT +{ + int64_t rank; + void* descriptor; + NumericType datatype; +}; + +// MemRefT, dimension=1> type +struct MemRefT_CplxT_double_1d +{ + CplxT_double* data_allocated; + CplxT_double* data_aligned; + size_t offset; + size_t sizes[1]; + size_t strides[1]; +}; + +// MemRefT, dimension=2> type +struct MemRefT_CplxT_double_2d +{ + CplxT_double* data_allocated; + CplxT_double* data_aligned; + size_t offset; + size_t sizes[2]; + size_t strides[2]; +}; + +// MemRefT type +struct MemRefT_double_1d +{ + double* data_allocated; + double* data_aligned; + size_t offset; + size_t sizes[1]; + size_t strides[1]; +}; + +// MemRefT type +struct MemRefT_double_2d +{ + double* data_allocated; + double* data_aligned; + size_t offset; + size_t sizes[2]; + size_t strides[2]; +}; + +// MemRefT type +struct MemRefT_int64_1d +{ + int64_t* data_allocated; + int64_t* data_aligned; + size_t offset; + size_t sizes[1]; + size_t strides[1]; +}; + +// MemRefT type +struct MemRefT_int8_1d +{ + int8_t* data_allocated; + int8_t* data_aligned; + size_t offset; + size_t sizes[1]; + size_t strides[1]; +}; + +// PairT, MemRefT> type +struct PairT_MemRefT_double_int64_1d +{ + struct MemRefT_double_1d first; + struct 
MemRefT_int64_1d second; +}; + +// Quantum operation modifiers +struct Modifiers +{ + bool adjoint; + size_t num_controlled; + QUBIT* controlled_wires; + bool* controlled_values; +}; + +using CplxT_double = struct CplxT_double; +using MemRefT_CplxT_double_1d = struct MemRefT_CplxT_double_1d; +using MemRefT_CplxT_double_2d = struct MemRefT_CplxT_double_2d; +using MemRefT_double_1d = struct MemRefT_double_1d; +using MemRefT_double_2d = struct MemRefT_double_2d; +using MemRefT_int64_1d = struct MemRefT_int64_1d; +using PairT_MemRefT_double_int64_1d = struct PairT_MemRefT_double_int64_1d; +using Modifiers = struct Modifiers; + +# ifdef __cplusplus +} // extern "C" +# endif + +#endif diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp new file mode 100644 index 0000000..47897f2 --- /dev/null +++ b/src/qirlightning/simple_demo/test_rt_device.cpp @@ -0,0 +1,74 @@ +#include + +#include "QuantumDevice.hpp" + +// Runtime libraries (kokkos/GPU/qubit etc.) 
+// Update these paths to point to the correct library +#define RTDLIB \ + "/" \ + "pennylane_lightning/liblightning_kokkos_catalyst.so"; +#define RTDDEVICE "LightningKokkosSimulator"; + +extern "C" Catalyst::Runtime::QuantumDevice* +GenericDeviceFactory(char const* kwargs); + +using namespace Catalyst::Runtime; + +int main() +{ + try + { + // Load lightning simulation library + std::string rtd_lib = RTDLIB; + std::string rtd_device = RTDDEVICE; + std::string kwargs = {}; + auto rtld_flags = RTLD_LAZY | RTLD_NODELETE; + auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags); + + if (!rtd_dylib_handler) + { + throw std::runtime_error("Failed to load library: " + rtd_lib); + } + + // Find device factory + std::string factory_name = rtd_device + "Factory"; + void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str()); + + if (!f_ptr) + { + dlclose(rtd_dylib_handler); + throw std::runtime_error("Failed to find factory function: " + + factory_name); + } + std::string rtd_kwargs = {}; + auto rtd_qdevice = std::unique_ptr( + reinterpret_cast(f_ptr)( + rtd_kwargs.c_str())); + + // Allocate Qubits + rtd_qdevice->AllocateQubits(3); + + // Get Num Qubits + std::cout << "Num Qubits = " << rtd_qdevice->GetNumQubits() + << std::endl; + + // Apply Gate + rtd_qdevice->NamedOperation("Hadamard", {}, {0}); + + // Print State + std::cout << "State = " << std::endl; + rtd_qdevice->PrintState(); + + // Measure + QubitIdType wire{0}; + Result result = rtd_qdevice->Measure(wire, std::nullopt); + std::cout << "Measure on wire 0 = " << *result << std::endl; + } + catch (std::exception const& e) + { + std::cerr << "Error: " << e.what() << std::endl; + return EXIT_FAILURE; + } + + return EXIT_SUCCESS; +} diff --git a/src/qirlightning/support_catalyst.cmake b/src/qirlightning/support_catalyst.cmake deleted file mode 100644 index ca7df76..0000000 --- a/src/qirlightning/support_catalyst.cmake +++ /dev/null @@ -1,74 +0,0 @@ 
-############################################################################################### -# This file provides macros to process Catalyst. -############################################################################################### - -# Include this only once -include_guard() - -macro(FindCatalyst target_name) - if(LIGHTNING_CATALYST_SRC_PATH) - if(NOT IS_ABSOLUTE ${LIGHTNING_CATALYST_SRC_PATH}) - message(FATAL_ERROR " LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH} must be set to an absolute path") - endif() - if(CATALYST_GIT_TAG) - message(WARN " Setting `LIGHTNING_CATALYST_SRC_PATH=${LIGHTNING_CATALYST_SRC_PATH}` overrides `CATALYST_GIT_TAG=${CATALYST_GIT_TAG}`") - endif() - - # Acquire local git hash and use for CATALYST_GIT_TAG - execute_process(COMMAND git rev-parse --short HEAD - WORKING_DIRECTORY ${LIGHTNING_CATALYST_SRC_PATH} - OUTPUT_VARIABLE CATALYST_GIT_TAG - ) - message(INFO " Building against local Catalyst - path: ${LIGHTNING_CATALYST_SRC_PATH} - GIT TAG: ${CATALYST_GIT_TAG}") - - target_include_directories(${target_name} PUBLIC ${LIGHTNING_CATALYST_SRC_PATH}/runtime/lib/backend/common) - target_include_directories(${target_name} PUBLIC ${LIGHTNING_CATALYST_SRC_PATH}/runtime/include) - - else() - if(NOT CATALYST_GIT_TAG) - set(CATALYST_GIT_TAG "v0.12.0" CACHE STRING "GIT_TAG value to build Catalyst") - endif() - message(INFO " Building against Catalyst GIT TAG ${CATALYST_GIT_TAG}") - - # Fetching /lib/backend/common hpp headers - set(LIB_BACKEND_COMMON_HEADERS CacheManager.hpp - QubitManager.hpp - Utils.hpp - ) - - foreach(HEADER ${LIB_BACKEND_COMMON_HEADERS}) - string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME ${HEADER}) - FetchContent_Declare( - ${HEADER_NAME} - URL https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/lib/backend/common/${HEADER} - DOWNLOAD_NO_EXTRACT True - SOURCE_DIR ../../include - ) - - FetchContent_MakeAvailable(${HEADER_NAME}) - endforeach() - - # Fetching include hpp headers 
- set(INCLUDE_HEADERS DataView.hpp - Exception.hpp - QuantumDevice.hpp - RuntimeCAPI.h - Types.h - ) - - foreach(HEADER ${INCLUDE_HEADERS}) - string(REGEX REPLACE "\\.[^.]*$" "" HEADER_NAME ${HEADER}) - FetchContent_Declare( - ${HEADER_NAME} - URL https://raw.githubusercontent.com/PennyLaneAI/catalyst/${CATALYST_GIT_TAG}/runtime/include/${HEADER} - DOWNLOAD_NO_EXTRACT True - SOURCE_DIR ../../include - ) - - FetchContent_MakeAvailable(${HEADER_NAME}) - endforeach() - - #target_include_directories(${target_name} PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/../../include) - - endif() -endmacro() From 67373005e90b66d805dba847d74aa18700a8d0ae Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 22 Oct 2025 21:41:07 +0000 Subject: [PATCH 61/64] remove simple_demo --- src/qirlightning/simple_demo/README.md | 69 --- .../snapshot_catalyst_runtime/README.rst | 118 ------ .../include/DataView.hpp | 173 -------- .../include/Exception.hpp | 96 ----- .../include/QuantumDevice.hpp | 399 ------------------ .../snapshot_catalyst_runtime/include/Types.h | 179 -------- .../simple_demo/test_rt_device.cpp | 74 ---- 7 files changed, 1108 deletions(-) delete mode 100644 src/qirlightning/simple_demo/README.md delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp delete mode 100644 src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h delete mode 100644 src/qirlightning/simple_demo/test_rt_device.cpp diff --git a/src/qirlightning/simple_demo/README.md b/src/qirlightning/simple_demo/README.md deleted file mode 100644 index ef08d55..0000000 --- a/src/qirlightning/simple_demo/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# Simple Demo for Catalyst/Lightning runtime - -This 
is a super simple demo for driving Lightning devices. The example here uses `lightning.kokkos`, but can easily be updated to target other devices, e.g. lightning.gpu (if an Nvidia GPU is present). - -Some Catalyst include files are copied here for convenience - they are in `./snapshot_catalyst_runtime/include`. These are required for the QuantumDevice interface. For the qiree source, these files are fetched automatically during CMake, and these are not used. - -## Installing a lightning simulator - -When installing [Pennylane-Lightning](https://github.com/PennyLaneAI/pennylane-lightning) from pip or source, you will have the shared objects for each of the simulator installed. These are named `liblightning_kokkos_catalyst.so`/`liblightning_GPU_catalyst.so` etc. - -To get started, run `pip install pennylane` or `pip install pennylane-lightning` - this will install the `lightning.qubit` (CPU) simulator, and other simulators can be installed by running `pip install pennylane-lightning-kokkos / pennylane-lightning-gpu`. - -Example: -``` -$ pip install pennylane-lightning-kokkos - -$ pip show pennylane-lightning-kokkos -Name: PennyLane_Lightning_Kokkos -Version: 0.40.0 -Summary: PennyLane-Lightning plugin -Home-page: https://github.com/PennyLaneAI/pennylane-lightning -Author: -Author-email: -License: Apache License 2.0 -Location: -Requires: pennylane, pennylane-lightning - -$ ls /pennylane_lightning -... liblightning_kokkos_catalyst.so ... -``` - -You can swap `pennylane-lightning-kokkos` for `pennylane-lightning-gpu` for lightning.gpu and `pennylane-lightning` for lightning.gpu simulators. - -## Compilation - -First update the `RTDLIB` in `test_rt_device.cpp` to the local path where lightning is installed (i.e. `` from above). 
- -To compile: - -``` -$ clang++ --std=c++20 test_rt_device.cpp -I./snapshot_catalyst_runtime/include -o test_rt_device.out -``` - -## Running the example - -To run: - -``` -$ ./test_rt_device.out -Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set - In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads - For best performance with OpenMP 3.1 set OMP_PROC_BIND=true - For unit testing set OMP_PROC_BIND=false - -Num Qubits = 3 -State = -*** State-Vector of Size 8 *** -[(0.707107,0), (0,0), (0,0), (0,0), (0.707107,0), (0,0), (0,0), (0,0)] -Measure on wire 0 = 0 -``` - -## Running on other devices - -To run on other devices, e.g. lightning.gpu, you need to change: -- Install pennylane-lightning-gpu: `pip install pennylane-lightning-gpu` -In the c++ file: -- replace `RTDLIB` from `kokkos` to `gpu` -- replace `RTDDEVICE` from `Kokkos` to `GPU` -- Include `cuquantum` libraries when running (which was installed as a dependency), i.e. `LD_LIBRARY_PATH=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")/lib:$LD_LIBRARY_PATH ./test_rt_device.out` diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst deleted file mode 100644 index 8a881e5..0000000 --- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/README.rst +++ /dev/null @@ -1,118 +0,0 @@ -.. runtime-start-inclusion-marker-do-not-remove - -Catalyst Quantum Runtime -######################## - -The Catalyst Runtime is a C++ QIR runtime that enables the execution of Catalyst-compiled -quantum programs, and is currently backed by `PennyLane-Lightning `_ -state-vector simulators, and `Amazon Braket `__ -devices. Additional hardware support, including QPUs, to come. - -The runtime employs the `QuantumDevice `_ -public interface to support an extensible list of backend devices. 
This interface comprises two collections of abstract methods: - -- The Qubit management, device shot noise, and quantum tape recording methods are utilized for the implementation of Quantum Runtime (QR) instructions. - -- The quantum operations, observables, measurements, and gradient methods are used to implement Quantum Instruction Set (QIS) instructions. - -A complete list of instructions supported by the runtime can be found in -`RuntimeCAPI.h `_. - -Contents -======== - -The directory is structured as follows: - -- `include `_: - This contains the public header files of the runtime including the ``QuantumDevice`` API - for backend quantum devices and the runtime CAPI. - -- `lib `_: - The core modules of the runtime are structured into ``lib/capi`` and ``lib/backend``. - `lib/capi `_ implements the semantics for - QIR instructions lowered to our custom runtime. `lib/backend `_ - contains implementations of the ``QuantumDevice`` API for backend simulators. - -- `tests `_: - A collection of C++ tests for modules and methods in the runtime. - -Backend Devices -=============== - -New device backends for the runtime can be realized by implementing the quantum device interface. -The following table shows the available devices along with supported features: - -.. 
list-table:: - :widths: 25 25 25 25 - :header-rows: 0 - - * - **Features** - - **PennyLane-Lightning-Qubit** - - **PennyLane-Lightning-Kokkos** and **PennyLane-Lightning-GPU** - - **Amazon-Braket-OpenQasm** - * - Qubit Management - - Dynamic allocation/deallocation - - Static allocation/deallocation - - Static allocation/deallocation - * - Gate Operations - - `Lightning operations `_ - - `Lightning operations `_ without controlled gates support - - `Braket operations `_ - * - Quantum Observables - - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables - - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, ``Hamiltonian``, and Tensor Product of Observables - - ``Identity``, ``PauliX``, ``PauliY``, ``PauliZ``, ``Hadamard``, ``Hermitian``, and Tensor Product of Observables - * - Expectation Value - - All observables; Finite-shots supported - - All observables; Finite-shots supported - - All observables; Finite-shots supported - * - Variance - - All observables; Finite-shots supported - - All observables; Finite-shots supported - - All observables; Finite-shots supported - * - Probability - - Only for the computational basis on the supplied qubits; Finite-shots supported - - Only for the computational basis on the supplied qubits; Finite-shots supported - - The computational basis on all active qubits; Finite-shots supported - * - Sampling - - Only for the computational basis on the supplied qubits - - Only for the computational basis on the supplied qubits - - The computational basis on all active qubits; Finite-shots supported - * - Mid-Circuit Measurement - - Only for the computational basis on the supplied qubit - - Only for the computational basis on the supplied qubit - - Not supported - * - Gradient - - The Adjoint-Jacobian method for expectation values on all observables - - The Adjoint-Jacobian method for expectation values on all observables - - Not 
supported - -Requirements -============ - -To build the runtime from source, it is required to have an up to date version of a C/C++ compiler such as gcc or clang -with support for the C++20 standard library. - -Installation -============ - -By default, the runtime builds all supported backend devices. -You can build the runtime with custom devices from the list of Backend Devices. - -You can use ``ENABLE_OPENQASM=OFF`` to disable building the runtime with `Amazon-Braket-OpenQasm `_: - -.. code-block:: console - - make runtime ENABLE_OPENQASM=OFF - -This device currently offers generators for the `OpenQasm3 `_ specification and -`Amazon Braket `__ assembly extension. -Moreover, the generated assembly can be executed on Amazon Braket devices leveraging `amazon-braket-sdk-python `_. - -To check the runtime test suite from the root directory: - -.. code-block:: console - - make test-runtime - -.. runtime-end-inclusion-marker-do-not-remove diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp deleted file mode 100644 index 616b9dc..0000000 --- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/DataView.hpp +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright 2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -/** - * A multi-dimensional view for MemRef-like and std::vector types. 
- * - * @tparam T The underlying data type - * @tparam R The Rank (R > 0) - * - * @note A forward iterator is implemented in this view for traversing over the - * entire elements of MemRef types rank-by-rank starting from the last - * dimension (R-1). For example, The DataView iterator for MemRef starts - * from index (0, 0) and traverses elements in the following order: (0, 0), - * ..., (0, sizes[1]-1), (1, 0), ..., (1, sizes[1]-1), ... (sizes[0]-1, - * sizes[1]-1). - */ -template -class DataView -{ - private: - T* data_aligned; - size_t offset; - size_t sizes[R] = {0}; - size_t strides[R] = {0}; - - public: - class iterator - { - private: - DataView const& view; - - int64_t loc; // physical index - size_t indices[R] = {0}; - - public: - using iterator_category = std::forward_iterator_tag; // LCOV_EXCL_LINE - using value_type = T; // LCOV_EXCL_LINE - using difference_type = std::ptrdiff_t; // LCOV_EXCL_LINE - using pointer = T*; // LCOV_EXCL_LINE - using reference = T&; // LCOV_EXCL_LINE - - iterator(DataView const& _view, int64_t begin_idx) - : view(_view), loc(begin_idx) - { - } - pointer operator->() const { return &view.data_aligned[loc]; } - reference operator*() const { return view.data_aligned[loc]; } - iterator& operator++() - { - int64_t next_axis = -1; - int64_t idx; - for (int64_t i = R; i > 0; --i) - { - idx = i - 1; - if (indices[idx]++ < view.sizes[idx] - 1) - { - next_axis = idx; - break; - } - indices[idx] = 0; - loc -= (view.sizes[idx] - 1) * view.strides[idx]; - } - - loc = next_axis == -1 ? -1 : loc + view.strides[next_axis]; - return *this; - } - iterator operator++(int) - { - auto tmp = *this; - int64_t next_axis = -1; - int64_t idx; - for (int64_t i = R; i > 0; --i) - { - idx = i - 1; - if (indices[idx]++ < view.sizes[idx] - 1) - { - next_axis = idx; - break; - } - indices[idx] = 0; - loc -= (view.sizes[idx] - 1) * view.strides[idx]; - } - - loc = next_axis == -1 ? 
-1 : loc + view.strides[next_axis]; - return tmp; - } - bool operator==(iterator const& other) const - { - return (loc == other.loc - && view.data_aligned == other.view.data_aligned); - } - bool operator!=(iterator const& other) const - { - return !(*this == other); - } - }; - - explicit DataView(std::vector& buffer) - : data_aligned(buffer.data()), offset(0) - { - static_assert(R == 1, "[Class: DataView] Assertion: R == 1"); - sizes[0] = buffer.size(); - strides[0] = 1; - } - - explicit DataView(T* _data_aligned, - size_t _offset, - size_t const* _sizes, - size_t const* _strides) - : data_aligned(_data_aligned), offset(_offset) - { - static_assert(R > 0, "[Class: DataView] Assertion: R > 0"); - if (_sizes != nullptr && _strides != nullptr) - { - for (size_t i = 0; i < R; i++) - { - sizes[i] = _sizes[i]; - strides[i] = _strides[i]; - } - } // else sizes = {0}, strides = {0} - } - - [[nodiscard]] auto size() const -> size_t - { - if (!data_aligned) - { - return 0; - } - - size_t tsize = 1; - for (size_t i = 0; i < R; i++) - { - tsize *= sizes[i]; - } - return tsize; - } - - template - T& operator()(I... idxs) const - { - static_assert(sizeof...(idxs) == R, - "[Class: DataView] Error in Catalyst Runtime: Wrong " - "number of indices"); - size_t indices[] = {static_cast(idxs)...}; - - size_t loc = offset; - for (size_t axis = 0; axis < R; axis++) - { - RT_ASSERT(indices[axis] < sizes[axis]); - loc += indices[axis] * strides[axis]; - } - return data_aligned[loc]; - } - - iterator begin() { return iterator{*this, static_cast(offset)}; } - - iterator end() { return iterator{*this, -1}; } -}; diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp deleted file mode 100644 index 4e8272d..0000000 --- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Exception.hpp +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright 2023 Xanadu Quantum Technologies Inc. 
- -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include - -/** - * @brief Macro that throws `RuntimeException` with given message. - */ -#define RT_FAIL(message) \ - Catalyst::Runtime::_abort((message), __FILE__, __LINE__, __func__) - -/** - * @brief Macro that throws `RuntimeException` if expression evaluates - * to true. - */ -#define RT_FAIL_IF(expression, message) \ - if ((expression)) \ - { \ - RT_FAIL(message); \ - } - -/** - * @brief Macro that throws `RuntimeException` with the given expression - * and source location if expression evaluates to false. - */ -#define RT_ASSERT(expression) \ - RT_FAIL_IF(!(expression), "Assertion: " #expression) - -namespace Catalyst::Runtime -{ - -/** - * @brief This is the general exception thrown by Catalyst for runtime errors - * that is derived from `std::exception`. 
- */ -class RuntimeException : public std::exception -{ - private: - std::string const err_msg; - - public: - explicit RuntimeException(std::string msg) noexcept - : err_msg{std::move(msg)} - { - } // LCOV_EXCL_LINE - ~RuntimeException() override = default; // LCOV_EXCL_LINE - - RuntimeException(RuntimeException const&) = default; - RuntimeException(RuntimeException&&) noexcept = default; - - RuntimeException& operator=(RuntimeException const&) = delete; - RuntimeException& operator=(RuntimeException&&) = delete; - - [[nodiscard]] auto what() const noexcept -> char const* override - { - return err_msg.c_str(); - } // LCOV_EXCL_LINE -}; - -/** - * @brief Throws a `RuntimeException` with the given error message. - * - * @note This is not supposed to be called directly. - */ -[[noreturn]] inline void _abort(char const* message, - char const* file_name, - size_t line, - char const* function_name) -{ - std::stringstream sstream; - sstream << "[" << file_name << "][Line:" << line - << "][Function:" << function_name - << "] Error in Catalyst Runtime: " << message; - - throw RuntimeException(sstream.str()); -} // LCOV_EXCL_LINE - -} // namespace Catalyst::Runtime diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp deleted file mode 100644 index 6794033..0000000 --- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/QuantumDevice.hpp +++ /dev/null @@ -1,399 +0,0 @@ -// Copyright 2022-2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include "DataView.hpp" -#include "Types.h" - -// A helper template macro to generate the Factory method by -// calling (kwargs). Check the Custom Devices guideline for -// details: -// https://docs.pennylane.ai/projects/catalyst/en/stable/dev/custom_devices.html -#define GENERATE_DEVICE_FACTORY(IDENTIFIER, CONSTRUCTOR) \ - extern "C" Catalyst::Runtime::QuantumDevice* IDENTIFIER##Factory( \ - const char* kwargs) \ - { \ - return new CONSTRUCTOR(std::string(kwargs)); \ - } - -namespace Catalyst::Runtime -{ - -/** - * @brief struct API for backend quantum devices. - * - * This device API contains, - * - a set of methods to manage qubit allocations and deallocations, device - * shot noise, and quantum tape recording as well as reference values for the - * result data-type; these are used to implement Quantum Runtime (QR) - * instructions. - * - * - a set of methods for quantum operations, observables, measurements, and - * gradient of the device; these are used to implement Quantum Instruction Set - * (QIS) instructions. - * - */ -struct QuantumDevice -{ - QuantumDevice() = default; // LCOV_EXCL_LINE - virtual ~QuantumDevice() = default; // LCOV_EXCL_LINE - - QuantumDevice& operator=(QuantumDevice const&) = delete; - QuantumDevice(QuantumDevice const&) = delete; - QuantumDevice(QuantumDevice&&) = delete; - QuantumDevice& operator=(QuantumDevice&&) = delete; - - /** - * @brief Allocate a qubit. 
- * - * @return `QubitIdType` - */ - virtual auto AllocateQubit() -> QubitIdType = 0; - - /** - * @brief Allocate a vector of qubits. - * - * @param num_qubits The number of qubits to allocate. - * - * @return `std::vector` - */ - virtual auto AllocateQubits(size_t num_qubits) -> std::vector - = 0; - - /** - * @brief Release a qubit. - * - * @param qubit The id of the qubit - */ - virtual void ReleaseQubit(QubitIdType qubit) = 0; - - /** - * @brief Release all qubits. - */ - virtual void ReleaseAllQubits() = 0; - - /** - * @brief Get the number of allocated qubits. - * - * @return `size_t` - */ - [[nodiscard]] virtual auto GetNumQubits() const -> size_t = 0; - - /** - * @brief Set the number of device shots. - * - * @param shots The number of noise shots - */ - virtual void SetDeviceShots(size_t shots) = 0; - - /** - * @brief Get the number of device shots. - * - * @return `size_t` - */ - [[nodiscard]] virtual auto GetDeviceShots() const -> size_t = 0; - - /** - * @brief Set the PRNG of the device. - * - * The Catalyst runtime enables seeded program execution on non-hardware - * devices. A random number generator instance is managed by the runtime to - * predictably generate results for non-deterministic programs, such as - * those involving `Measure` calls. Devices implementing support for this - * feature do not need to use the provided PRNG instance as their sole - * source of random numbers, but it is expected that the the same instance - * state will predictable and reproducibly generate the same program - * results. It is also expected that the provided PRNG state is evolved - * sufficiently so that two device executions sharing the same instance do - * not produce identical results. The provided PRNG instance is not - * thread-locked, and devices wishing to share it across threads will need - * to provide their own thread-safety. - * - * @param gen The std::mt19937 PRNG object. 
- */ - virtual void SetDevicePRNG([[maybe_unused]] std::mt19937* gen) {}; - - /** - * @brief Start recording a quantum tape if provided. - * - * @note This is backed by the `Catalyst::Runtime::CacheManager` - * property in the device implementation. - */ - virtual void StartTapeRecording() = 0; - - /** - * @brief Stop recording a quantum tape if provided. - * - * @note This is backed by the `Catalyst::Runtime::CacheManager` - * property in the device implementation. - */ - virtual void StopTapeRecording() = 0; - - /** - * @brief Result value for "Zero" used in the measurement process. - * - * @return `Result` - */ - [[nodiscard]] virtual auto Zero() const -> Result = 0; - - /** - * @brief Result value for "One" used in the measurement process. - * - * @return `Result` - */ - [[nodiscard]] virtual auto One() const -> Result = 0; - - /** - * @brief A helper method to print the state vector of a device. - */ - virtual void PrintState() = 0; - - /** - * @brief Prepare subsystems using the given ket vector in the - * computational basis. - * - * @param state A state vector of size 2**len(wires) - * @param wires The wire(s) the operation acts on - */ - virtual void - SetState([[maybe_unused]] DataView, 1>& state, - [[maybe_unused]] std::vector& wires) - { - RT_FAIL("Unsupported functionality"); - } - - /** - * @brief Prepares a single computational basis state. - * - * @param n Prepares the basis state |n>, where n is an array of integers - * from the set {0, 1} - * @param wires The wire(s) the operation acts on - */ - virtual void SetBasisState([[maybe_unused]] DataView& n, - [[maybe_unused]] std::vector& wires) - { - RT_FAIL("Unsupported functionality"); - } - - /** - * @brief Apply a single gate to the state vector of a device with its name - * if this is supported. 
- * - * @param name The name of the gate to apply - * @param params Optional parameter list for parametric gates - * @param wires Wires to apply gate to - * @param inverse Indicates whether to use inverse of gate - * @param controlled_wires Optional controlled wires applied to the - * operation - * @param controlled_values Optional controlled values applied to the - * operation - */ - virtual void NamedOperation( - std::string const& name, - std::vector const& params, - std::vector const& wires, - [[maybe_unused]] bool inverse = false, - [[maybe_unused]] std::vector const& controlled_wires = {}, - [[maybe_unused]] std::vector const& controlled_values = {}) - = 0; - - /** - * @brief Apply a given matrix directly to the state vector of a device. - * - * @param matrix The matrix of data in row-major format - * @param wires Wires to apply gate to - * @param inverse Indicates whether to use inverse of gate - * @param controlled_wires Controlled wires applied to the operation - * @param controlled_values Controlled values applied to the operation - */ - virtual void MatrixOperation( - std::vector> const& matrix, - std::vector const& wires, - [[maybe_unused]] bool inverse = false, - [[maybe_unused]] std::vector const& controlled_wires = {}, - [[maybe_unused]] std::vector const& controlled_values = {}) - = 0; - - /** - * @brief Construct a named (Identity, PauliX, PauliY, PauliZ, and - * Hadamard) or Hermitian observable. - * - * @param id The type of the observable - * @param matrix The matrix of data to construct a hermitian observable - * @param wires Wires to apply observable to - * - * @return `ObsIdType` Index of the constructed observable - */ - virtual auto Observable(ObsId id, - std::vector> const& matrix, - std::vector const& wires) -> ObsIdType - = 0; - - /** - * @brief Construct a tensor product of observables. 
- * - * @param obs The vector of observables indices of type ObsIdType - * - * @return `ObsIdType` Index of the constructed observable - */ - virtual auto TensorObservable(std::vector const& obs) - -> ObsIdType - = 0; - - /** - * @brief Construct a Hamiltonian observable. - * - * @param coeffs The vector of coefficients - * @param obs The vector of observables indices of size `coeffs` - * - * @return `ObsIdType` Index of the constructed observable - */ - virtual auto HamiltonianObservable(std::vector const& coeffs, - std::vector const& obs) - -> ObsIdType - = 0; - - /** - * @brief Compute the expected value of an observable. - * - * @param obsKey The index of the constructed observable - * - * @return `double` The expected value - */ - virtual auto Expval(ObsIdType obsKey) -> double = 0; - - /** - * @brief Compute the variance of an observable. - * - * @param obsKey The index of the constructed observable - * - * @return `double` The variance - */ - virtual auto Var(ObsIdType obsKey) -> double = 0; - - /** - * @brief Get the state-vector of a device. - * - * @param state The pre-allocated `DataView, 1>` - */ - virtual void State(DataView, 1>& state) = 0; - - /** - * @brief Compute the probabilities of each computational basis state. - - * @param probs The pre-allocated `DataView` - */ - virtual void Probs(DataView& probs) = 0; - - /** - * @brief Compute the probabilities for a subset of the full system. - * - * @param probs The pre-allocated `DataView` - * @param wires Wires will restrict probabilities to a subset of the full - * system - */ - virtual void PartialProbs(DataView& probs, - std::vector const& wires) - = 0; - - /** - * @brief Compute samples with the number of shots on the entire wires, - * returing raw samples. - * - * @param samples The pre-allocated `DataView`representing a - * matrix of shape `shots * numQubits`. The built-in iterator in - * `DataView` iterates over all elements of `samples` row-wise. 
- * @param shots The number of shots - */ - virtual void Sample(DataView& samples, size_t shots) = 0; - - /** - * @brief Compute partial samples with the number of shots on `wires`, - * returing raw samples. - * - * @param samples The pre-allocated `DataView`representing a - * matrix of shape `shots * numWires`. The built-in iterator in - * `DataView` iterates over all elements of `samples` row-wise. - * @param wires Wires to compute samples on - * @param shots The number of shots - */ - virtual void PartialSample(DataView& samples, - std::vector const& wires, - size_t shots) - = 0; - - /** - * @brief Sample with the number of shots on the entire wires, returning - * the number of counts for each sample. - * - * @param eigvals The pre-allocated `DataView` - * @param counts The pre-allocated `DataView` - * @param shots The number of shots - */ - virtual void Counts(DataView& eigvals, - DataView& counts, - size_t shots) - = 0; - - /** - * @brief Partial sample with the number of shots on `wires`, returning the - * number of counts for each sample. - * - * @param eigvals The pre-allocated `DataView` - * @param counts The pre-allocated `DataView` - * @param wires Wires to compute samples on - * @param shots The number of shots - */ - virtual void PartialCounts(DataView& eigvals, - DataView& counts, - std::vector const& wires, - size_t shots) - = 0; - - /** - * @brief A general measurement method that acts on a single wire. - * - * @param wire The wire to compute Measure on - * @param postselect Which basis state to postselect after a mid-circuit - measurement (-1 denotes no post-selection) - - * @return `Result` The measurement result - */ - virtual auto Measure(QubitIdType wire, std::optional postselect) - -> Result - = 0; - - /** - * @brief Compute the gradient of a quantum tape, that is cached using - * `Catalyst::Runtime::Simulator::CacheManager`, for a specific set of - * trainable parameters. 
- * - * @param gradients The vector of pre-allocated `DataView*` - * to store gradients resutls for the list of cached observables. - * @param trainParams The vector of trainable parameters; if none, all - * parameters would be assumed trainable - * - */ - virtual void Gradient(std::vector>& gradients, - std::vector const& trainParams) - = 0; -}; -} // namespace Catalyst::Runtime diff --git a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h b/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h deleted file mode 100644 index a90f69d..0000000 --- a/src/qirlightning/simple_demo/snapshot_catalyst_runtime/include/Types.h +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright 2022-2023 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#ifndef TYPES_H -# define TYPES_H - -# include -# include -# include - -# ifdef __cplusplus -extern "C" { -# endif - -// Qubit, Result and Observable types -struct QUBIT; -using QubitIdType = intptr_t; - -using RESULT = bool; -using Result = RESULT*; -using QirArray = void*; - -using ObsIdType = intptr_t; - -enum ObsId : int8_t -{ - Identity = 0, - PauliX, - PauliY, - PauliZ, - Hadamard, - Hermitian, -}; - -enum ObsType : int8_t -{ - Basic = 0, - TensorProd, - Hamiltonian, -}; - -// complex type -struct CplxT_float -{ - float real; - float imag; -}; - -// complex type -struct CplxT_double -{ - double real; - double imag; -}; - -enum NumericType : int8_t -{ - idx = 0, - i1, - i8, - i16, - i32, - i64, - f32, - f64, - c64, - c128, -}; - -// MemRefT type -struct OpaqueMemRefT -{ - int64_t rank; - void* descriptor; - NumericType datatype; -}; - -// MemRefT, dimension=1> type -struct MemRefT_CplxT_double_1d -{ - CplxT_double* data_allocated; - CplxT_double* data_aligned; - size_t offset; - size_t sizes[1]; - size_t strides[1]; -}; - -// MemRefT, dimension=2> type -struct MemRefT_CplxT_double_2d -{ - CplxT_double* data_allocated; - CplxT_double* data_aligned; - size_t offset; - size_t sizes[2]; - size_t strides[2]; -}; - -// MemRefT type -struct MemRefT_double_1d -{ - double* data_allocated; - double* data_aligned; - size_t offset; - size_t sizes[1]; - size_t strides[1]; -}; - -// MemRefT type -struct MemRefT_double_2d -{ - double* data_allocated; - double* data_aligned; - size_t offset; - size_t sizes[2]; - size_t strides[2]; -}; - -// MemRefT type -struct MemRefT_int64_1d -{ - int64_t* data_allocated; - int64_t* data_aligned; - size_t offset; - size_t sizes[1]; - size_t strides[1]; -}; - -// MemRefT type -struct MemRefT_int8_1d -{ - int8_t* data_allocated; - int8_t* data_aligned; - size_t offset; - size_t sizes[1]; - size_t strides[1]; -}; - -// PairT, MemRefT> type -struct PairT_MemRefT_double_int64_1d -{ - struct MemRefT_double_1d first; - struct 
MemRefT_int64_1d second; -}; - -// Quantum operation modifiers -struct Modifiers -{ - bool adjoint; - size_t num_controlled; - QUBIT* controlled_wires; - bool* controlled_values; -}; - -using CplxT_double = struct CplxT_double; -using MemRefT_CplxT_double_1d = struct MemRefT_CplxT_double_1d; -using MemRefT_CplxT_double_2d = struct MemRefT_CplxT_double_2d; -using MemRefT_double_1d = struct MemRefT_double_1d; -using MemRefT_double_2d = struct MemRefT_double_2d; -using MemRefT_int64_1d = struct MemRefT_int64_1d; -using PairT_MemRefT_double_int64_1d = struct PairT_MemRefT_double_int64_1d; -using Modifiers = struct Modifiers; - -# ifdef __cplusplus -} // extern "C" -# endif - -#endif diff --git a/src/qirlightning/simple_demo/test_rt_device.cpp b/src/qirlightning/simple_demo/test_rt_device.cpp deleted file mode 100644 index 47897f2..0000000 --- a/src/qirlightning/simple_demo/test_rt_device.cpp +++ /dev/null @@ -1,74 +0,0 @@ -#include - -#include "QuantumDevice.hpp" - -// Runtime libraries (kokkos/GPU/qubit etc.) 
-// Update these paths to point to the correct library -#define RTDLIB \ - "/" \ - "pennylane_lightning/liblightning_kokkos_catalyst.so"; -#define RTDDEVICE "LightningKokkosSimulator"; - -extern "C" Catalyst::Runtime::QuantumDevice* -GenericDeviceFactory(char const* kwargs); - -using namespace Catalyst::Runtime; - -int main() -{ - try - { - // Load lightning simulation library - std::string rtd_lib = RTDLIB; - std::string rtd_device = RTDDEVICE; - std::string kwargs = {}; - auto rtld_flags = RTLD_LAZY | RTLD_NODELETE; - auto rtd_dylib_handler = dlopen(rtd_lib.c_str(), rtld_flags); - - if (!rtd_dylib_handler) - { - throw std::runtime_error("Failed to load library: " + rtd_lib); - } - - // Find device factory - std::string factory_name = rtd_device + "Factory"; - void* f_ptr = dlsym(rtd_dylib_handler, factory_name.c_str()); - - if (!f_ptr) - { - dlclose(rtd_dylib_handler); - throw std::runtime_error("Failed to find factory function: " - + factory_name); - } - std::string rtd_kwargs = {}; - auto rtd_qdevice = std::unique_ptr( - reinterpret_cast(f_ptr)( - rtd_kwargs.c_str())); - - // Allocate Qubits - rtd_qdevice->AllocateQubits(3); - - // Get Num Qubits - std::cout << "Num Qubits = " << rtd_qdevice->GetNumQubits() - << std::endl; - - // Apply Gate - rtd_qdevice->NamedOperation("Hadamard", {}, {0}); - - // Print State - std::cout << "State = " << std::endl; - rtd_qdevice->PrintState(); - - // Measure - QubitIdType wire{0}; - Result result = rtd_qdevice->Measure(wire, std::nullopt); - std::cout << "Measure on wire 0 = " << *result << std::endl; - } - catch (std::exception const& e) - { - std::cerr << "Error: " << e.what() << std::endl; - return EXIT_FAILURE; - } - - return EXIT_SUCCESS; -} From b1344f7c13684bd36b1a97bbd58a773df4bbdd59 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 22 Oct 2025 21:42:28 +0000 Subject: [PATCH 62/64] remove build-lightning workflow --- .github/workflows/pr.yml | 2 -- .github/workflows/push.yml | 2 -- 2 files changed, 4 deletions(-) 
diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index df9d110..02c418a 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -17,8 +17,6 @@ concurrency: jobs: build-fast: uses: ./.github/workflows/build-fast.yml - build-lightning: - uses: ./.github/workflows/build-lightning.yml # Specifying a dependent job allows us to select a single "requires" check in the project GitHub settings all: if: ${{ always() }} diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 5e79f80..6ac89fc 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -12,8 +12,6 @@ concurrency: jobs: build-fast: uses: ./.github/workflows/build-fast.yml - build-lightning: - uses: ./.github/workflows/build-lightning.yml all: needs: - build-fast From 290a7e8c83ab1ef43efee5e7ce924bfeac14e355 Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 22 Oct 2025 21:43:00 +0000 Subject: [PATCH 63/64] remove build-lightning workflow --- .github/workflows/pr.yml | 1 - .github/workflows/push.yml | 1 - 2 files changed, 2 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 02c418a..52f8404 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -22,7 +22,6 @@ jobs: if: ${{ always() }} needs: - build-fast - - build-lightning runs-on: ubuntu-latest steps: - name: Decide whether the needed jobs succeeded or failed diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 6ac89fc..4601abc 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -15,7 +15,6 @@ jobs: all: needs: - build-fast - - build-lightning runs-on: ubuntu-latest steps: - name: Success From d1d1c5cea2694367a323ec6222e8533d2fc1d07f Mon Sep 17 00:00:00 2001 From: Joseph Lee Date: Wed, 22 Oct 2025 21:44:21 +0000 Subject: [PATCH 64/64] improve formatting for cmake/support_catalyst.cmake --- cmake/support_catalyst.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git 
a/cmake/support_catalyst.cmake b/cmake/support_catalyst.cmake index 95c7f73..07ab613 100644 --- a/cmake/support_catalyst.cmake +++ b/cmake/support_catalyst.cmake @@ -36,8 +36,8 @@ macro(FindCatalyst target_name) # Fetching /lib/backend/common hpp headers set(LIB_BACKEND_COMMON_HEADERS CacheManager.hpp - QubitManager.hpp - Utils.hpp + QubitManager.hpp + Utils.hpp ) foreach(HEADER ${LIB_BACKEND_COMMON_HEADERS}) @@ -54,10 +54,10 @@ macro(FindCatalyst target_name) # Fetching include hpp headers set(INCLUDE_HEADERS DataView.hpp - Exception.hpp - QuantumDevice.hpp - RuntimeCAPI.h - Types.h + Exception.hpp + QuantumDevice.hpp + RuntimeCAPI.h + Types.h ) foreach(HEADER ${INCLUDE_HEADERS})