diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index f8251325e..49fdec3b3 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -15,7 +15,7 @@ jobs: native_backend: "all" mb2_jobs: "mb2" mb4_jobs: "mb4_tcm" - mb6_jobs: "mb6" + mb6_jobs: "mb6_ntl" # cmake_args_map holds job specific additional cmake options. compiler flags, native_backend flag and # OpenMP flag are set in generic_workflow.yml cmake_args_map: '{ diff --git a/README.md b/README.md index b762af18c..4f16f0a4d 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ OpenFHE also supports hybrid vectorized schemes, with the goal of enabling the F * Switching between CKKS and FHEW/TFHE to evaluate non-smooth functions, e.g., comparison, using (scalar) FHEW/TFHE functional bootstrapping * Switching between RLWE (a scheme equivalent to the coefficient-encoded additive BFV scheme) and CKKS to evaluate arbitrary lookup tables over vectors of integers, e.g., modular reduction, comparison or S-box, using vectorized functional bootstrapping implemented in CKKS +OpenFHE also supports partial schemes, called schemelets, such as RLWE which is equivalent to the coefficient-encoded additive BFV scheme. In OpenFHE, the RLWE schemelet is the starting point for the vectorized functional bootstrapping capability, which allows the evaluation of arbitrary lookup tables over vectors of integers, e.g., modular reduction, comparison or Sbox, using CKKS in an intermediate step. + OpenFHE also includes the following multiparty extensions of FHE: * Threshold FHE for BGV, BFV, and CKKS schemes * Interactive bootstrapping for Threshold CKKS diff --git a/benchmark/src/ckks-bootstrapping.cpp b/benchmark/src/ckks-bootstrapping.cpp new file mode 100644 index 000000000..8b7c84477 --- /dev/null +++ b/benchmark/src/ckks-bootstrapping.cpp @@ -0,0 +1,128 @@ +//================================================================================== +// BSD 2-Clause License +// +// Copyright (c) 2014-2025, NJIT, Duality Technologies Inc. and other contributors +// +// All rights reserved. +// +// Author TPOC: contact@openfhe.org +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//==================================================================================
+
+#include "benchmark/benchmark.h"
+#include "config_core.h"
+#include "cryptocontext.h"
+#include "gen-cryptocontext.h"
+#include "gtest/gtest.h"
+#include "scheme/ckksrns/ckksrns-fhe.h"
+#include "scheme/ckksrns/ckksrns-utils.h"
+#include "scheme/ckksrns/gen-cryptocontext-ckksrns.h"
+
+#include <vector>
+
+using namespace lbcrypto;
+
+struct boot_config {
+    uint32_t ringDim;
+    uint32_t slots;
+    uint32_t dcrtBits;
+    uint32_t firstMod;
+    uint32_t numDigits;
+    uint32_t lvlsAfter;
+    uint32_t iters;
+    std::vector<uint32_t> lvlb;
+    SecretKeyDist skdst;
+    ScalingTechnique stech;
+};
+
+// clang-format off
+[[maybe_unused]] std::vector<boot_config> boot_configs = {
+    // ringDim, slots, dcrtBits, firstMod, numDigits, lvlsAfter, iters, lvlb, skdst, stech
+    { 1 << 16, 1 << 15, 54, 60, 15,  9, 1, {3, 3}, UNIFORM_TERNARY,     FLEXIBLEAUTO},
+    { 1 << 16, 1 << 15, 50, 57, 11,  9, 2, {3, 3}, UNIFORM_TERNARY,     FLEXIBLEAUTO},
+    { 1 << 16, 1 << 15, 50, 57, 16, 10, 2, {3, 3}, UNIFORM_TERNARY,     FLEXIBLEAUTO},
+    { 1 << 16, 1 << 15, 52, 57, 10,  8, 2, {3, 3}, UNIFORM_TERNARY,     FIXEDMANUAL},
+    { 1 << 16, 1 << 15, 52, 57, 16,  9, 2, {3, 3}, UNIFORM_TERNARY,     FIXEDMANUAL},
+    { 1 << 17, 1 << 16, 59, 60,  0,  5, 1, {4, 4}, SPARSE_TERNARY,      FLEXIBLEAUTO},
+    { 1 << 17, 1 << 16, 59, 60,  0,  5, 1, {4, 4}, SPARSE_ENCAPSULATED, FLEXIBLEAUTO},
+    { 1 << 16, 1 << 5,  59, 60,  0,  5, 1, {1, 1}, SPARSE_TERNARY,      FLEXIBLEAUTO},
+    { 1 << 16, 1 << 5,  59, 60,  0,  5, 1, {1, 1}, SPARSE_ENCAPSULATED, FLEXIBLEAUTO},
+    { 1 << 17, 1 << 5,  59, 60,  0,  5, 1, {1, 1}, SPARSE_TERNARY,      FLEXIBLEAUTO},
+    { 1 << 17, 1 << 5,  59, 60,  0,  5, 1, {1, 1}, SPARSE_ENCAPSULATED, FLEXIBLEAUTO},
+    { 1 << 17, 1 << 16, 59, 60,  0, 10, 1, {4, 4}, SPARSE_ENCAPSULATED, FLEXIBLEAUTO},
+    { 1 << 17, 1 << 5,  59, 60,  0, 10, 1, {1, 1}, SPARSE_ENCAPSULATED, FLEXIBLEAUTO},
+    { 1 << 17, 1 << 16, 59, 60,  0, 10, 2, {4, 4}, SPARSE_ENCAPSULATED, FLEXIBLEAUTO},
+    { 1 << 17, 1 << 5,  59, 60,  0, 10, 2, {1, 1}, SPARSE_ENCAPSULATED, FLEXIBLEAUTO},
+    { 1 << 17, 1 << 16, 78, 96,  0, 10, 2, {4, 4}, SPARSE_TERNARY,      COMPOSITESCALINGAUTO},
+};
+// clang-format on
+
+[[maybe_unused]] static void BootConfigs(benchmark::internal::Benchmark* b) {
+    for (uint32_t i = 0; i < boot_configs.size(); ++i)
+        b->ArgName("Config")->Arg(i);
+}
+
+[[maybe_unused]] static void CKKSBoot(benchmark::State& state) {
+    auto t = boot_configs[state.range(0)];
+
+    CCParams<CryptoContextCKKSRNS> parameters;
+    parameters.SetSecurityLevel(HEStd_128_classic);
+    parameters.SetRingDim(t.ringDim);
+    parameters.SetScalingModSize(t.dcrtBits);
+    parameters.SetFirstModSize(t.firstMod);
+    parameters.SetNumLargeDigits(t.numDigits);
+    parameters.SetSecretKeyDist(t.skdst);
+    parameters.SetScalingTechnique(t.stech);
+    parameters.SetKeySwitchTechnique(HYBRID);
+    uint32_t depth = t.lvlsAfter + FHECKKSRNS::GetBootstrapDepth(t.lvlb, t.skdst) + (t.iters - 1);
+    parameters.SetMultiplicativeDepth(depth);
+
+    auto cc = GenCryptoContext(parameters);
+    cc->Enable(PKE);
+    cc->Enable(KEYSWITCH);
+    cc->Enable(LEVELEDSHE);
+    cc->Enable(ADVANCEDSHE);
+    cc->Enable(FHE);
+
+    cc->EvalBootstrapSetup(t.lvlb, {0, 0}, t.slots);
+
+    auto keyPair = cc->KeyGen();
+    cc->EvalMultKeyGen(keyPair.secretKey);
+    cc->EvalBootstrapKeyGen(keyPair.secretKey, t.slots);
+
+    std::vector<double> x = {0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0, 5.0};
+
+    auto ptxt = cc->MakeCKKSPackedPlaintext(x, 1, depth - 1, nullptr, t.slots);
+    ptxt->SetLength(t.slots);
+
+    auto ctxt = cc->Encrypt(keyPair.publicKey, ptxt);
+
+    while (state.KeepRunning())
+        auto ctxtAfter =
cc->EvalBootstrap(ctxt, t.iters); + + cc->ClearStaticMapsAndVectors(); +} + +BENCHMARK(CKKSBoot)->Unit(benchmark::kSecond)->Iterations(4)->Apply(BootConfigs); + +BENCHMARK_MAIN(); diff --git a/benchmark/src/ckks-functional-bootstrapping.cpp b/benchmark/src/ckks-functional-bootstrapping.cpp new file mode 100644 index 000000000..3fd584863 --- /dev/null +++ b/benchmark/src/ckks-functional-bootstrapping.cpp @@ -0,0 +1,489 @@ +//================================================================================== +// BSD 2-Clause License +// +// Copyright (c) 2014-2025, NJIT, Duality Technologies Inc. and other contributors +// +// All rights reserved. +// +// Author TPOC: contact@openfhe.org +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//================================================================================== + +#include "benchmark/benchmark.h" +#include "config_core.h" +#include "cryptocontext.h" +#include "gen-cryptocontext.h" +#include "gtest/gtest.h" +#include "math/hermite.h" +#include "scheme/ckksrns/ckksrns-fhe.h" +#include "scheme/ckksrns/ckksrns-utils.h" +#include "scheme/ckksrns/gen-cryptocontext-ckksrns.h" +#include "schemelet/rlwe-mp.h" + +#include +#include +#include + +using namespace lbcrypto; + +struct fbt_config { + BigInteger QBFVInit; + BigInteger PInput; + BigInteger POutput; + BigInteger Q; + BigInteger Bigq; + double scaleTHI; + double scaleStepTHI; + size_t order; + uint32_t numSlots; + uint32_t ringDim; + uint32_t dnum; + std::vector lvlb; +}; + +[[maybe_unused]] const BigInteger Q1(BigInteger(1) << 1); +[[maybe_unused]] const BigInteger Q2(BigInteger(1) << 2); +[[maybe_unused]] const BigInteger Q4(BigInteger(1) << 4); +[[maybe_unused]] const BigInteger Q8(BigInteger(1) << 8); +[[maybe_unused]] const BigInteger Q12(BigInteger(1) << 12); +[[maybe_unused]] const BigInteger Q32(BigInteger(1) << 32); +[[maybe_unused]] const BigInteger Q33(BigInteger(1) << 33); +[[maybe_unused]] const BigInteger Q35(BigInteger(1) << 35); +[[maybe_unused]] const BigInteger Q38(BigInteger(1) << 38); +[[maybe_unused]] const BigInteger Q47(BigInteger(1) << 47); +[[maybe_unused]] const BigInteger Q55(BigInteger(1) << 55); +[[maybe_unused]] const BigInteger Q60(BigInteger(1) << 60); +[[maybe_unused]] const BigInteger Q71(BigInteger(1) << 71); +[[maybe_unused]] const BigInteger Q80(BigInteger(1) << 80); + +// clang-format off +[[maybe_unused]] std::map arblut_configs = { + // QBFVInit, PInput, POutput, Q, Bigq, scaleTHI, scaleStepTHI, order, numSlots, ringDim, dnum, lvlBudget + {1, { Q60, Q1, Q1, Q33, Q33, 1.0, 1.0, 1, 1 << 15, 1 << 15, 3, {3, 3}}}, + {2, { Q60, Q2, Q2, Q35, Q35, 16.0, 1.0, 1, 1 << 16, 1 << 16, 3, {4, 4}}}, + {4, { Q60, Q4, Q4, Q38, Q38, 32.0, 1.0, 1, 1 << 16, 1 << 16, 3, {4, 4}}}, + {8, { Q60, Q8, Q8, Q47, Q47, 32.0, 1.0, 1, 1 << 16, 1 << 16, 4, {3, 3}}}, + {12, { Q80, Q12, Q12, Q55, Q55, 2000.0, 1.0, 1, 1 << 17, 1 << 17, 3, {4, 4}}}, + {32, { Q80, Q32, Q4, Q71, Q47, 256.0, 16.0, 1, 1 << 16, 1 << 16, 4, {3, 3}}} +}; +// clang-format on + +[[maybe_unused]] static void ArbLUTBits(benchmark::internal::Benchmark* b) { + for (uint32_t bits : {12, 8, 4, 2, 1}) + b->ArgName("bits")->Arg(bits); +} + +[[maybe_unused]] static void FBTSetup(benchmark::State& state) { + auto t = arblut_configs[12]; + + bool flagSP = (t.numSlots <= t.ringDim / 2); // sparse packing + + auto numSlotsCKKS = flagSP ? 
t.numSlots : t.numSlots / 2; + + auto a = t.PInput.ConvertToInt(); + auto b = t.POutput.ConvertToInt(); + auto f = [a, b](int64_t x) -> int64_t { + return (x % a - a / 2) % b; + }; + + std::vector x = { + (t.PInput.ConvertToInt() / 2), (t.PInput.ConvertToInt() / 2) + 1, 0, 3, 16, 33, 64, + (t.PInput.ConvertToInt() - 1)}; + if (x.size() < t.numSlots) + x = Fill(x, t.numSlots); + + std::vector coeffint; + std::vector> coeffcomp; + bool binaryLUT = (t.PInput.ConvertToInt() == 2) && (t.order == 1); + if (binaryLUT) // coeffs for [1, cos^2(pi x)], not [1, cos(2pi x)] + coeffint = {f(1), f(0) - f(1)}; + else // divided by 2 + coeffcomp = GetHermiteTrigCoefficients(f, t.PInput.ConvertToInt(), t.order, t.scaleTHI); + + uint32_t dcrtBits = t.Bigq.GetMSB() - 1; + CCParams parameters; + parameters.SetSecretKeyDist(SPARSE_ENCAPSULATED); + parameters.SetSecurityLevel(HEStd_NotSet); + parameters.SetScalingModSize(dcrtBits); + parameters.SetScalingTechnique(FIXEDMANUAL); + parameters.SetFirstModSize(dcrtBits); + parameters.SetNumLargeDigits(t.dnum); + parameters.SetBatchSize(numSlotsCKKS); + parameters.SetRingDim(t.ringDim); + + uint32_t depth = 0; + if (binaryLUT) + depth += FHECKKSRNS::GetFBTDepth(t.lvlb, coeffint, t.PInput, t.order, SPARSE_ENCAPSULATED); + else + depth += FHECKKSRNS::GetFBTDepth(t.lvlb, coeffcomp, t.PInput, t.order, SPARSE_ENCAPSULATED); + parameters.SetMultiplicativeDepth(depth); + + auto cc = GenCryptoContext(parameters); + cc->Enable(PKE); + cc->Enable(KEYSWITCH); + cc->Enable(LEVELEDSHE); + cc->Enable(ADVANCEDSHE); + cc->Enable(FHE); + + auto keyPair = cc->KeyGen(); + + while (state.KeepRunning()) { + if (binaryLUT) + cc->EvalFBTSetup(coeffint, numSlotsCKKS, t.PInput, t.POutput, t.Bigq, keyPair.publicKey, {0, 0}, t.lvlb, 0, + 0, t.order); + else + cc->EvalFBTSetup(coeffcomp, numSlotsCKKS, t.PInput, t.POutput, t.Bigq, keyPair.publicKey, {0, 0}, t.lvlb, 0, + 0, t.order); + } + + cc->ClearStaticMapsAndVectors(); +} + +[[maybe_unused]] static void FBTKeyGen(benchmark::State& state) { + auto t = arblut_configs[12]; + + bool flagSP = (t.numSlots <= t.ringDim / 2); // sparse packing + + auto numSlotsCKKS = flagSP ? 
t.numSlots : t.numSlots / 2; + + auto a = t.PInput.ConvertToInt(); + auto b = t.POutput.ConvertToInt(); + auto f = [a, b](int64_t x) -> int64_t { + return (x % a - a / 2) % b; + }; + + std::vector x = { + (t.PInput.ConvertToInt() / 2), (t.PInput.ConvertToInt() / 2) + 1, 0, 3, 16, 33, 64, + (t.PInput.ConvertToInt() - 1)}; + if (x.size() < t.numSlots) + x = Fill(x, t.numSlots); + + std::vector coeffint; + std::vector> coeffcomp; + bool binaryLUT = (t.PInput.ConvertToInt() == 2) && (t.order == 1); + if (binaryLUT) // coeffs for [1, cos^2(pi x)], not [1, cos(2pi x)] + coeffint = {f(1), f(0) - f(1)}; + else // divided by 2 + coeffcomp = GetHermiteTrigCoefficients(f, t.PInput.ConvertToInt(), t.order, t.scaleTHI); + + uint32_t dcrtBits = t.Bigq.GetMSB() - 1; + CCParams parameters; + parameters.SetSecretKeyDist(SPARSE_ENCAPSULATED); + parameters.SetSecurityLevel(HEStd_NotSet); + parameters.SetScalingModSize(dcrtBits); + parameters.SetScalingTechnique(FIXEDMANUAL); + parameters.SetFirstModSize(dcrtBits); + parameters.SetNumLargeDigits(t.dnum); + parameters.SetBatchSize(numSlotsCKKS); + parameters.SetRingDim(t.ringDim); + + uint32_t depth = 0; + if (binaryLUT) + depth += FHECKKSRNS::GetFBTDepth(t.lvlb, coeffint, t.PInput, t.order, SPARSE_ENCAPSULATED); + else + depth += FHECKKSRNS::GetFBTDepth(t.lvlb, coeffcomp, t.PInput, t.order, SPARSE_ENCAPSULATED); + parameters.SetMultiplicativeDepth(depth); + + auto cc = GenCryptoContext(parameters); + cc->Enable(PKE); + cc->Enable(KEYSWITCH); + cc->Enable(LEVELEDSHE); + cc->Enable(ADVANCEDSHE); + cc->Enable(FHE); + + auto keyPair = cc->KeyGen(); + + if (binaryLUT) + cc->EvalFBTSetup(coeffint, numSlotsCKKS, t.PInput, t.POutput, t.Bigq, keyPair.publicKey, {0, 0}, t.lvlb, 0, 0, + t.order); + else + cc->EvalFBTSetup(coeffcomp, numSlotsCKKS, t.PInput, t.POutput, t.Bigq, keyPair.publicKey, {0, 0}, t.lvlb, 0, 0, + t.order); + + while (state.KeepRunning()) { + cc->EvalBootstrapKeyGen(keyPair.secretKey, numSlotsCKKS); + cc->EvalMultKeyGen(keyPair.secretKey); + } + + cc->ClearStaticMapsAndVectors(); +} + +[[maybe_unused]] static void FBTArbLUT(benchmark::State& state) { + auto t = arblut_configs[state.range(0)]; + + bool flagSP = (t.numSlots <= t.ringDim / 2); // sparse packing + + auto numSlotsCKKS = flagSP ? 
t.numSlots : t.numSlots / 2; + + auto a = t.PInput.ConvertToInt(); + auto b = t.POutput.ConvertToInt(); + auto f = [a, b](int64_t x) -> int64_t { + return (x % a - a / 2) % b; + }; + + std::vector x = { + (t.PInput.ConvertToInt() / 2), (t.PInput.ConvertToInt() / 2) + 1, 0, 3, 16, 33, 64, + (t.PInput.ConvertToInt() - 1)}; + if (x.size() < t.numSlots) + x = Fill(x, t.numSlots); + + std::vector coeffint; + std::vector> coeffcomp; + bool binaryLUT = (t.PInput.ConvertToInt() == 2) && (t.order == 1); + if (binaryLUT) // coeffs for [1, cos^2(pi x)], not [1, cos(2pi x)] + coeffint = {f(1), f(0) - f(1)}; + else // divided by 2 + coeffcomp = GetHermiteTrigCoefficients(f, t.PInput.ConvertToInt(), t.order, t.scaleTHI); + + uint32_t dcrtBits = t.Bigq.GetMSB() - 1; + CCParams parameters; + parameters.SetSecretKeyDist(SPARSE_ENCAPSULATED); + parameters.SetSecurityLevel(HEStd_NotSet); + parameters.SetScalingModSize(dcrtBits); + parameters.SetScalingTechnique(FIXEDMANUAL); + parameters.SetFirstModSize(dcrtBits); + parameters.SetNumLargeDigits(t.dnum); + parameters.SetBatchSize(numSlotsCKKS); + parameters.SetRingDim(t.ringDim); + + uint32_t depth = 0; + if (binaryLUT) + depth += FHECKKSRNS::GetFBTDepth(t.lvlb, coeffint, t.PInput, t.order, SPARSE_ENCAPSULATED); + else + depth += FHECKKSRNS::GetFBTDepth(t.lvlb, coeffcomp, t.PInput, t.order, SPARSE_ENCAPSULATED); + parameters.SetMultiplicativeDepth(depth); + + auto cc = GenCryptoContext(parameters); + cc->Enable(PKE); + cc->Enable(KEYSWITCH); + cc->Enable(LEVELEDSHE); + cc->Enable(ADVANCEDSHE); + cc->Enable(FHE); + + auto keyPair = cc->KeyGen(); + + if (binaryLUT) + cc->EvalFBTSetup(coeffint, numSlotsCKKS, t.PInput, t.POutput, t.Bigq, keyPair.publicKey, {0, 0}, t.lvlb, 0, 0, + t.order); + else + cc->EvalFBTSetup(coeffcomp, numSlotsCKKS, t.PInput, t.POutput, t.Bigq, keyPair.publicKey, {0, 0}, t.lvlb, 0, 0, + t.order); + + cc->EvalBootstrapKeyGen(keyPair.secretKey, numSlotsCKKS); + cc->EvalMultKeyGen(keyPair.secretKey); + + auto ep = SchemeletRLWEMP::GetElementParams(keyPair.secretKey, depth); + + auto ctxtBFV = SchemeletRLWEMP::EncryptCoeff(x, t.QBFVInit, t.PInput, keyPair.secretKey, ep); + + SchemeletRLWEMP::ModSwitch(ctxtBFV, t.Q, t.QBFVInit); + + auto ctxt = SchemeletRLWEMP::ConvertRLWEToCKKS(*cc, ctxtBFV, keyPair.publicKey, t.Bigq, numSlotsCKKS, depth); + + while (state.KeepRunning()) { + Ciphertext ctxtAfterFBT; + if (binaryLUT) + ctxtAfterFBT = cc->EvalFBT(ctxt, coeffint, t.PInput.GetMSB() - 1, ep->GetModulus(), t.scaleTHI, 0, t.order); + else + ctxtAfterFBT = + cc->EvalFBT(ctxt, coeffcomp, t.PInput.GetMSB() - 1, ep->GetModulus(), t.scaleTHI, 0, t.order); + ctxtAfterFBT.reset(); + } + + cc->ClearStaticMapsAndVectors(); +} + +[[maybe_unused]] static void FBTSignDigit32(benchmark::State& state) { + auto t = arblut_configs[32]; + + bool flagSP = (t.numSlots <= t.ringDim / 2); // sparse packing + + auto numSlotsCKKS = flagSP ? 
t.numSlots : t.numSlots / 2; + + auto a = t.PInput.ConvertToInt(); + auto b = t.POutput.ConvertToInt(); + + auto funcMod = [b](int64_t x) -> int64_t { + return (x % b); + }; + auto funcStep = [a, b](int64_t x) -> int64_t { + return (x % a) >= (b / 2); + }; + + std::vector x = { + t.PInput.ConvertToInt() / 2, t.PInput.ConvertToInt() / 2 + 1, 0, 3, 16, 33, 64, + t.PInput.ConvertToInt() - 1}; + if (x.size() < t.numSlots) + x = Fill(x, t.numSlots); + + auto exact(x); + std::transform(x.begin(), x.end(), exact.begin(), + [&](const int64_t& elem) { return (elem >= t.PInput.ConvertToDouble() / 2.); }); + + std::vector coeffintMod; + std::vector> coeffcompMod; + std::vector> coeffcompStep; + bool binaryLUT = (t.POutput.ConvertToInt() == 2) && (t.order == 1); + if (binaryLUT) { + coeffintMod = {funcMod(1), funcMod(0) - funcMod(1)}; // coeffs for [1, cos^2(pi x)], not [1, cos(2pi x)] + } + else { + coeffcompMod = + GetHermiteTrigCoefficients(funcMod, t.POutput.ConvertToInt(), t.order, t.scaleTHI); // divided by 2 + coeffcompStep = GetHermiteTrigCoefficients(funcStep, t.POutput.ConvertToInt(), t.order, + t.scaleStepTHI); // divided by 2 + } + + uint32_t dcrtBits = t.Bigq.GetMSB() - 1; + CCParams parameters; + parameters.SetSecretKeyDist(SPARSE_ENCAPSULATED); + parameters.SetSecurityLevel(HEStd_NotSet); + parameters.SetScalingModSize(dcrtBits); + parameters.SetScalingTechnique(FIXEDMANUAL); + parameters.SetFirstModSize(dcrtBits); + parameters.SetNumLargeDigits(t.dnum); + parameters.SetBatchSize(numSlotsCKKS); + parameters.SetRingDim(t.ringDim); + + uint32_t depth = 0; + if (binaryLUT) + depth += FHECKKSRNS::GetFBTDepth(t.lvlb, coeffintMod, t.PInput, t.order, SPARSE_ENCAPSULATED); + else + depth += FHECKKSRNS::GetFBTDepth(t.lvlb, coeffcompMod, t.PInput, t.order, SPARSE_ENCAPSULATED); + parameters.SetMultiplicativeDepth(depth); + + auto cc = GenCryptoContext(parameters); + cc->Enable(PKE); + cc->Enable(KEYSWITCH); + cc->Enable(LEVELEDSHE); + cc->Enable(ADVANCEDSHE); + cc->Enable(FHE); + + auto keyPair = cc->KeyGen(); + + if (binaryLUT) + cc->EvalFBTSetup(coeffintMod, numSlotsCKKS, t.POutput, t.PInput, t.Bigq, keyPair.publicKey, {0, 0}, t.lvlb, 0, + 0, t.order); + else + cc->EvalFBTSetup(coeffcompMod, numSlotsCKKS, t.POutput, t.PInput, t.Bigq, keyPair.publicKey, {0, 0}, t.lvlb, 0, + 0, t.order); + + cc->EvalBootstrapKeyGen(keyPair.secretKey, numSlotsCKKS); + cc->EvalMultKeyGen(keyPair.secretKey); + + auto ep = SchemeletRLWEMP::GetElementParams(keyPair.secretKey, depth); + + std::vector coeffint; + std::vector> coeffcomp; + if (binaryLUT) + coeffint = coeffintMod; + else + coeffcomp = coeffcompMod; + + while (state.KeepRunning()) { + auto ctxtBFV = SchemeletRLWEMP::EncryptCoeff(x, t.QBFVInit, t.PInput, keyPair.secretKey, ep); + + SchemeletRLWEMP::ModSwitch(ctxtBFV, t.Q, t.QBFVInit); + + uint32_t QBFVBits = t.Q.GetMSB() - 1; + + auto Q = t.Q; + auto PInput = t.PInput; + + BigInteger QNew; + + const bool checkgt2 = t.POutput.ConvertToInt() > 2; + const uint32_t pDigitBits = t.POutput.GetMSB() - 1; + + uint64_t scaleTHI = t.scaleTHI; + bool step = false; + bool go = QBFVBits > dcrtBits; + size_t levelsToDrop = 0; + uint32_t postScalingBits = 0; + + // For arbitrary digit size, pNew > 2, the last iteration needs to evaluate step pNew not mod pNew. + // Currently this only works when log(pNew) divides log(p). 
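+        // Outline of the digit-extraction loop below (a summary of the operations it performs):
+        //   1. reduce the running BFV/RLWE ciphertext modulo the small modulus q (SwitchModulus to Bigq);
+        //   2. import the digit into CKKS (ConvertRLWEToCKKS) and bootstrap it with EvalFBT, evaluating
+        //      mod pNew on regular passes and the step function on the final pass (sign extraction);
+        //   3. export the result back to RLWE coefficients (ConvertCKKSToRLWE), subtract the recovered
+        //      digit, and rescale the ciphertext from Q to Q/pNew (MultiplyAndRound + SwitchModulus),
+        //      repeating until only the most significant (sign) digit remains.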
+ while (go) { + auto encryptedDigit = ctxtBFV; + + // Apply mod q + encryptedDigit[0].SwitchModulus(t.Bigq, 1, 0, 0); + encryptedDigit[1].SwitchModulus(t.Bigq, 1, 0, 0); + + auto ctxt = + SchemeletRLWEMP::ConvertRLWEToCKKS(*cc, encryptedDigit, keyPair.publicKey, t.Bigq, numSlotsCKKS, depth); + + // Bootstrap the digit. + Ciphertext ctxtAfterFBT; + if (binaryLUT) + ctxtAfterFBT = cc->EvalFBT(ctxt, coeffint, pDigitBits, ep->GetModulus(), + scaleTHI * (1 << postScalingBits), levelsToDrop, t.order); + else + ctxtAfterFBT = cc->EvalFBT(ctxt, coeffcomp, pDigitBits, ep->GetModulus(), + scaleTHI * (1 << postScalingBits), levelsToDrop, t.order); + + auto polys = SchemeletRLWEMP::ConvertCKKSToRLWE(ctxtAfterFBT, Q); + + if (!step) { + QNew = Q >> pDigitBits; + + // Subtract digit and switch mod from Q to QNew for BFV ciphertext + ctxtBFV[0] = (ctxtBFV[0] - polys[0]).MultiplyAndRound(QNew, Q); + ctxtBFV[0].SwitchModulus(QNew, 1, 0, 0); + ctxtBFV[1] = (ctxtBFV[1] - polys[1]).MultiplyAndRound(QNew, Q); + ctxtBFV[1].SwitchModulus(QNew, 1, 0, 0); + Q >>= pDigitBits; + PInput >>= pDigitBits; + QBFVBits -= pDigitBits; + postScalingBits += pDigitBits; + } + else { + ctxtBFV[0] = std::move(polys[0]); + ctxtBFV[1] = std::move(polys[1]); + } + + go = QBFVBits > dcrtBits; + + if (checkgt2 && !go && !step) { + if (!binaryLUT) + coeffcomp = coeffcompStep; + scaleTHI = t.scaleStepTHI; + step = true; + go = true; + int64_t lvlsToDrop = GetMultiplicativeDepthByCoeffVector(coeffcompMod, true) - + GetMultiplicativeDepthByCoeffVector(coeffcompStep, true); + if (coeffcompMod.size() > 4 && lvlsToDrop > 0) + levelsToDrop = lvlsToDrop; + } + } + } + + cc->ClearStaticMapsAndVectors(); +} + +BENCHMARK(FBTArbLUT)->Unit(benchmark::kSecond)->Iterations(4)->Apply(ArbLUTBits); +BENCHMARK(FBTSignDigit32)->Unit(benchmark::kSecond)->Iterations(4); +BENCHMARK(FBTSetup)->Unit(benchmark::kSecond)->Iterations(10); +BENCHMARK(FBTKeyGen)->Unit(benchmark::kSecond)->Iterations(4); + +BENCHMARK_MAIN(); diff --git a/benchmark/src/poly-benchmark.h b/benchmark/src/poly-benchmark.h index ad1361760..87aa0a198 100644 --- a/benchmark/src/poly-benchmark.h +++ b/benchmark/src/poly-benchmark.h @@ -45,10 +45,10 @@ using namespace lbcrypto; -constexpr size_t POLY_NUM = 16; +constexpr size_t POLY_NUM = 8; constexpr size_t POLY_NUM_M1 = (POLY_NUM - 1); -std::vector tow_args({1, 2, 4, 8, 16}); +std::vector tow_args({1, 2, 4, 8, 16, 32}); std::shared_ptr> NativepolysEval; std::shared_ptr> NativepolysCoef; std::map>> DCRTpolysEval; @@ -219,7 +219,27 @@ static void GenerateDCRTPolys(uint32_t order, uint32_t bits, // ************************************************************************************ -[[maybe_unused]] static void Native_ntt(benchmark::State& state) { +[[maybe_unused]] static void Native_Copy(benchmark::State& state) { + auto polys = NativepolysEval; + NativePoly p; + size_t i{0}; + while (state.KeepRunning()) { + benchmark::DoNotOptimize(p = (*polys)[(i = (i + 1) & POLY_NUM_M1)]); + } +} + +[[maybe_unused]] static void DCRT_Copy(benchmark::State& state) { + auto polys = DCRTpolysEval[state.range(0)]; + DCRTPoly p; + size_t i{0}; + while (state.KeepRunning()) { + benchmark::DoNotOptimize(p = (*polys)[(i = (i + 1) & POLY_NUM_M1)]); + } +} + +// ************************************************************************************ + +[[maybe_unused]] static void Native_Copy_ntt(benchmark::State& state) { std::shared_ptr> polys = NativepolysCoef; NativePoly p; size_t i{POLY_NUM_M1}; @@ -229,7 +249,7 @@ static void GenerateDCRTPolys(uint32_t order, 
uint32_t bits, } } -[[maybe_unused]] static void DCRT_ntt(benchmark::State& state) { +[[maybe_unused]] static void DCRT_Copy_ntt(benchmark::State& state) { std::shared_ptr> polys = DCRTpolysCoef[state.range(0)]; DCRTPoly p; size_t i{POLY_NUM_M1}; @@ -239,7 +259,7 @@ static void GenerateDCRTPolys(uint32_t order, uint32_t bits, } } -[[maybe_unused]] static void Native_intt(benchmark::State& state) { +[[maybe_unused]] static void Native_Copy_intt(benchmark::State& state) { std::shared_ptr> polys = NativepolysEval; NativePoly p; size_t i{POLY_NUM_M1}; @@ -249,7 +269,7 @@ static void GenerateDCRTPolys(uint32_t order, uint32_t bits, } } -[[maybe_unused]] static void DCRT_intt(benchmark::State& state) { +[[maybe_unused]] static void DCRT_Copy_intt(benchmark::State& state) { std::shared_ptr> polys = DCRTpolysEval[state.range(0)]; DCRTPoly p; size_t i{POLY_NUM_M1}; @@ -259,6 +279,26 @@ static void GenerateDCRTPolys(uint32_t order, uint32_t bits, } } +[[maybe_unused]] static void Native_avg_ntt_intt(benchmark::State& state) { + auto polys = *NativepolysCoef; + NativePoly* p; + size_t i{POLY_NUM_M1}; + while (state.KeepRunning()) { + p = &polys[(i = (i + 1) & POLY_NUM_M1)]; + p->SwitchFormat(); + } +} + +[[maybe_unused]] static void DCRT_avg_ntt_intt(benchmark::State& state) { + auto polys = *DCRTpolysCoef[state.range(0)]; + DCRTPoly* p; + size_t i{POLY_NUM_M1}; + while (state.KeepRunning()) { + p = &polys[(i = (i + 1) & POLY_NUM_M1)]; + p->SwitchFormat(); + } +} + [[maybe_unused]] static void Native_ntt_intt(benchmark::State& state) { std::shared_ptr> polys = NativepolysCoef; NativePoly* p; @@ -369,35 +409,47 @@ static void GenerateDCRTPolys(uint32_t order, uint32_t bits, // BENCHMARK(Native_Add)->Unit(benchmark::kMicrosecond); // BENCHMARK(DCRT_Add)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); -BENCHMARK(Native_AddEq)->Unit(benchmark::kMicrosecond); -BENCHMARK(DCRT_AddEq)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); + +BENCHMARK(Native_AddEq)->Unit(benchmark::kMicrosecond)->MinTime(5.0); +BENCHMARK(DCRT_AddEq)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments)->MinTime(5.0); // BENCHMARK(Native_Sub)->Unit(benchmark::kMicrosecond); // BENCHMARK(DCRT_Sub)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); -BENCHMARK(Native_SubEq)->Unit(benchmark::kMicrosecond); -BENCHMARK(DCRT_SubEq)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); + +BENCHMARK(Native_SubEq)->Unit(benchmark::kMicrosecond)->MinTime(5.0); +BENCHMARK(DCRT_SubEq)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments)->MinTime(5.0); // BENCHMARK(Native_Mul)->Unit(benchmark::kMicrosecond); // BENCHMARK(DCRT_Mul)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); -BENCHMARK(Native_MulEq)->Unit(benchmark::kMicrosecond); -BENCHMARK(DCRT_MulEq)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); -BENCHMARK(Native_ntt)->Unit(benchmark::kMicrosecond); -BENCHMARK(DCRT_ntt)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); -BENCHMARK(Native_intt)->Unit(benchmark::kMicrosecond); -BENCHMARK(DCRT_intt)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); +BENCHMARK(Native_MulEq)->Unit(benchmark::kMicrosecond)->MinTime(5.0); +BENCHMARK(DCRT_MulEq)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments)->MinTime(5.0); + +BENCHMARK(Native_Copy)->Unit(benchmark::kMicrosecond)->MinTime(5.0); +BENCHMARK(DCRT_Copy)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments)->MinTime(5.0); + +BENCHMARK(Native_Copy_ntt)->Unit(benchmark::kMicrosecond)->MinTime(5.0); 
+BENCHMARK(DCRT_Copy_ntt)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments)->MinTime(5.0); + +BENCHMARK(Native_Copy_intt)->Unit(benchmark::kMicrosecond)->MinTime(5.0); +BENCHMARK(DCRT_Copy_intt)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments)->MinTime(5.0); + +BENCHMARK(Native_avg_ntt_intt)->Unit(benchmark::kMicrosecond)->MinTime(5.0); +BENCHMARK(DCRT_avg_ntt_intt)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments)->MinTime(5.0); + // BENCHMARK(Native_ntt_intt)->Unit(benchmark::kMicrosecond); // BENCHMARK(DCRT_ntt_intt)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); + // BENCHMARK(Native_intt_ntt)->Unit(benchmark::kMicrosecond); // BENCHMARK(DCRT_intt_ntt)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); -BENCHMARK(Native_CRTInterpolate)->Unit(benchmark::kMicrosecond); -BENCHMARK(DCRT_CRTInterpolate)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); +BENCHMARK(Native_CRTInterpolate)->Unit(benchmark::kMicrosecond)->MinTime(5.0); +BENCHMARK(DCRT_CRTInterpolate)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments)->MinTime(5.0); -BENCHMARK(Native_DecryptionCRTInterpolate)->Unit(benchmark::kMicrosecond); -BENCHMARK(DCRT_DecryptionCRTInterpolate)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); +// BENCHMARK(Native_DecryptionCRTInterpolate)->Unit(benchmark::kMicrosecond); +// BENCHMARK(DCRT_DecryptionCRTInterpolate)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); -BENCHMARK(Native_BaseDecompose)->Unit(benchmark::kMicrosecond); -BENCHMARK(DCRT_BaseDecompose)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); +// BENCHMARK(Native_BaseDecompose)->Unit(benchmark::kMicrosecond); +// BENCHMARK(DCRT_BaseDecompose)->Unit(benchmark::kMicrosecond)->Apply(DCRTArguments); #endif diff --git a/src/binfhe/lib/lwe-pke.cpp b/src/binfhe/lib/lwe-pke.cpp index 5af69ecef..cf356d298 100644 --- a/src/binfhe/lib/lwe-pke.cpp +++ b/src/binfhe/lib/lwe-pke.cpp @@ -301,9 +301,9 @@ LWESwitchingKey LWEEncryptionScheme::KeySwitchGen(const std::shared_ptr>> resultVecA(N); std::vector>> resultVecB(N); - // TODO (cpascoe/dsuponit): this pragma needs to be revised as it may have to be removed completely + // TODO: parallelize loop using fix from KeySwitchHYBRID::KeySwitchGenInternal + // #if !defined(__MINGW32__) && !defined(__MINGW64__) - // #pragma omp parallel for num_threads(N) // #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(N)) // #endif for (size_t i = 0; i < N; ++i) { @@ -377,7 +377,7 @@ LWECiphertext LWEEncryptionScheme::KeySwitch(const std::shared_ptr& params, LWEPlaintext m) const { NativeInteger q(params->Getq()); - return std::make_shared(NativeVector(params->Getn(), q), (q >> 2)*m); + return std::make_shared(NativeVector(params->Getn(), q), (q >> 2) * m); } }; // namespace lbcrypto diff --git a/src/core/include/math/hal/intnat/transformnat-impl.h b/src/core/include/math/hal/intnat/transformnat-impl.h index f9365e822..b52d59d52 100644 --- a/src/core/include/math/hal/intnat/transformnat-impl.h +++ b/src/core/include/math/hal/intnat/transformnat-impl.h @@ -47,6 +47,7 @@ #include "utils/utilities.h" #include +#include #include namespace intnat { @@ -120,12 +121,12 @@ std::map ChineseRemainderTransformArbNat::m_DivisionNTTRootOfUnity; template -std::map ChineseRemainderTransformArbNat::m_nttDivisionDim; +std::map ChineseRemainderTransformArbNat::m_nttDivisionDim; template void NumberTheoreticTransformNat::ForwardTransformIterative(const VecType& element, const VecType& rootOfUnityTable, VecType* result) { - usint n = element.GetLength(); + uint32_t n = 
element.GetLength(); if (result->GetLength() != n) { OPENFHE_THROW("size of input element and size of output element not of same size"); } @@ -134,24 +135,24 @@ void NumberTheoreticTransformNat::ForwardTransformIterative(const VecTy IntType mu = modulus.ComputeMu(); result->SetModulus(modulus); - usint msb = GetMSB(n - 1); + uint32_t msb = GetMSB(n - 1); for (size_t i = 0; i < n; i++) { (*result)[i] = element[ReverseBits(i, msb)]; } IntType omega, omegaFactor, oddVal, evenVal; - usint logm, i, j, indexEven, indexOdd; + uint32_t logm, i, j, indexEven, indexOdd; - usint logn = GetMSB(n - 1); + uint32_t logn = GetMSB(n - 1); for (logm = 1; logm <= logn; logm++) { // calculate the i indexes into the root table one time per loop - std::vector indexes(1 << (logm - 1)); - for (i = 0; i < (usint)(1 << (logm - 1)); i++) { + std::vector indexes(1 << (logm - 1)); + for (i = 0; i < (uint32_t)(1 << (logm - 1)); i++) { indexes[i] = (i << (logn - logm)); } for (j = 0; j < n; j = j + (1 << logm)) { - for (i = 0; i < (usint)(1 << (logm - 1)); i++) { + for (i = 0; i < (uint32_t)(1 << (logm - 1)); i++) { omega = rootOfUnityTable[indexes[i]]; indexEven = j + i; indexOdd = indexEven + (1 << (logm - 1)); @@ -182,14 +183,14 @@ template void NumberTheoreticTransformNat::InverseTransformIterative(const VecType& element, const VecType& rootOfUnityInverseTable, VecType* result) { - usint n = element.GetLength(); + uint32_t n = element.GetLength(); IntType modulus = element.GetModulus(); IntType mu = modulus.ComputeMu(); NumberTheoreticTransformNat().ForwardTransformIterative(element, rootOfUnityInverseTable, result); IntType cycloOrderInv(IntType(n).ModInverse(modulus)); - for (usint i = 0; i < n; i++) { + for (uint32_t i = 0; i < n; i++) { (*result)[i].ModMulEq(cycloOrderInv, modulus, mu); } return; @@ -198,15 +199,15 @@ void NumberTheoreticTransformNat::InverseTransformIterative(const VecTy template void NumberTheoreticTransformNat::ForwardTransformToBitReverseInPlace(const VecType& rootOfUnityTable, VecType* element) { - usint n = element->GetLength(); + uint32_t n = element->GetLength(); IntType modulus = element->GetModulus(); IntType mu = modulus.ComputeMu(); - usint i, m, j1, j2, indexOmega, indexLo, indexHi; + uint32_t i, m, j1, j2, indexOmega, indexLo, indexHi; IntType omega, omegaFactor, loVal, hiVal; - usint t = (n >> 1); - usint logt1 = GetMSB(t); + uint32_t t = (n >> 1); + uint32_t logt1 = GetMSB(t); for (m = 1; m < n; m <<= 1) { for (i = 0; i < m; ++i) { j1 = i << logt1; @@ -243,7 +244,7 @@ template void NumberTheoreticTransformNat::ForwardTransformToBitReverse(const VecType& element, const VecType& rootOfUnityTable, VecType* result) { - usint n = element.GetLength(); + uint32_t n = element.GetLength(); if (result->GetLength() != n) { OPENFHE_THROW("size of input element and size of output element not of same size"); } @@ -252,15 +253,15 @@ void NumberTheoreticTransformNat::ForwardTransformToBitReverse(const Ve IntType mu = modulus.ComputeMu(); result->SetModulus(modulus); - usint i, m, j1, j2, indexOmega, indexLo, indexHi; + uint32_t i, m, j1, j2, indexOmega, indexLo, indexHi; IntType omega, omegaFactor, loVal, hiVal, zero(0); for (i = 0; i < n; ++i) { (*result)[i] = element[i]; } - usint t = (n >> 1); - usint logt1 = GetMSB(t); + uint32_t t = (n >> 1); + uint32_t logt1 = GetMSB(t); for (m = 1; m < n; m <<= 1) { for (i = 0; i < m; ++i) { j1 = i << logt1; @@ -377,7 +378,7 @@ void NumberTheoreticTransformNat::ForwardTransformToBitReverse(const Ve const VecType& rootOfUnityTable, const VecType& 
preconRootOfUnityTable, VecType* result) { - usint n = element.GetLength(); + uint32_t n = element.GetLength(); if (result->GetLength() != n) { OPENFHE_THROW("size of input element and size of output element not of same size"); @@ -395,8 +396,8 @@ void NumberTheoreticTransformNat::ForwardTransformToBitReverse(const Ve NativeInteger preconOmega; IntType omega, omegaFactor, loVal, hiVal, zero(0); - usint t = (n >> 1); - usint logt1 = GetMSB(t); + uint32_t t = (n >> 1); + uint32_t logt1 = GetMSB(t); for (uint32_t m = 1; m < n; m <<= 1, t >>= 1, --logt1) { uint32_t j1, j2; for (uint32_t i = 0; i < m; ++i) { @@ -438,15 +439,15 @@ template void NumberTheoreticTransformNat::InverseTransformFromBitReverseInPlace(const VecType& rootOfUnityInverseTable, const IntType& cycloOrderInv, VecType* element) { - usint n = element->GetLength(); + uint32_t n = element->GetLength(); IntType modulus = element->GetModulus(); IntType mu = modulus.ComputeMu(); IntType loVal, hiVal, omega, omegaFactor; - usint i, m, j1, j2, indexOmega, indexLo, indexHi; + uint32_t i, m, j1, j2, indexOmega, indexLo, indexHi; - usint t = 1; - usint logt1 = 1; + uint32_t t = 1; + uint32_t logt1 = 1; for (m = (n >> 1); m >= 1; m >>= 1) { for (i = 0; i < m; ++i) { j1 = i << logt1; @@ -493,7 +494,7 @@ void NumberTheoreticTransformNat::InverseTransformFromBitReverse(const const VecType& rootOfUnityInverseTable, const IntType& cycloOrderInv, VecType* result) { - usint n = element.GetLength(); + uint32_t n = element.GetLength(); if (result->GetLength() != n) { OPENFHE_THROW("size of input element and size of output element not of same size"); @@ -501,7 +502,7 @@ void NumberTheoreticTransformNat::InverseTransformFromBitReverse(const result->SetModulus(element.GetModulus()); - for (usint i = 0; i < n; i++) { + for (uint32_t i = 0; i < n; i++) { (*result)[i] = element[i]; } InverseTransformFromBitReverseInPlace(rootOfUnityInverseTable, cycloOrderInv, result); @@ -627,14 +628,14 @@ template void NumberTheoreticTransformNat::InverseTransformFromBitReverse( const VecType& element, const VecType& rootOfUnityInverseTable, const VecType& preconRootOfUnityInverseTable, const IntType& cycloOrderInv, const IntType& preconCycloOrderInv, VecType* result) { - usint n = element.GetLength(); + uint32_t n = element.GetLength(); if (result->GetLength() != n) { OPENFHE_THROW("size of input element and size of output element not of same size"); } result->SetModulus(element.GetModulus()); - for (usint i = 0; i < n; i++) { + for (uint32_t i = 0; i < n; i++) { (*result)[i] = element[i]; } InverseTransformFromBitReverseInPlace(rootOfUnityInverseTable, preconRootOfUnityInverseTable, cycloOrderInv, @@ -645,28 +646,12 @@ void NumberTheoreticTransformNat::InverseTransformFromBitReverse( template void ChineseRemainderTransformFTTNat::ForwardTransformToBitReverseInPlace(const IntType& rootOfUnity, - const usint CycloOrder, + const uint32_t cycloOrder, VecType* element) { - if (rootOfUnity == IntType(1) || rootOfUnity == IntType(0)) { + if (rootOfUnity == IntType(1) || rootOfUnity == IntType(0)) return; - } - - if (!IsPowerOfTwo(CycloOrder)) { - OPENFHE_THROW("CyclotomicOrder is not a power of two"); - } - - usint CycloOrderHf = (CycloOrder >> 1); - if (element->GetLength() != CycloOrderHf) { - OPENFHE_THROW("element size must be equal to CyclotomicOrder / 2"); - } - - IntType modulus = element->GetModulus(); - - auto mapSearch = m_rootOfUnityReverseTableByModulus.find(modulus); - if (mapSearch == m_rootOfUnityReverseTableByModulus.end() || mapSearch->second.GetLength() 
!= CycloOrderHf) { - PreCompute(rootOfUnity, CycloOrder, modulus); - } - + auto modulus = element->GetModulus(); + PreCompute(rootOfUnity, cycloOrder, modulus); NumberTheoreticTransformNat().ForwardTransformToBitReverseInPlace( m_rootOfUnityReverseTableByModulus[modulus], m_rootOfUnityPreconReverseTableByModulus[modulus], element); } @@ -674,28 +659,14 @@ void ChineseRemainderTransformFTTNat::ForwardTransformToBitReverseInPla template void ChineseRemainderTransformFTTNat::ForwardTransformToBitReverse(const VecType& element, const IntType& rootOfUnity, - const usint CycloOrder, VecType* result) { + const uint32_t cycloOrder, + VecType* result) { if (rootOfUnity == IntType(1) || rootOfUnity == IntType(0)) { *result = element; return; } - - if (!IsPowerOfTwo(CycloOrder)) { - OPENFHE_THROW("CyclotomicOrder is not a power of two"); - } - - usint CycloOrderHf = (CycloOrder >> 1); - if (result->GetLength() != CycloOrderHf) { - OPENFHE_THROW("result size must be equal to CyclotomicOrder / 2"); - } - - IntType modulus = element.GetModulus(); - - auto mapSearch = m_rootOfUnityReverseTableByModulus.find(modulus); - if (mapSearch == m_rootOfUnityReverseTableByModulus.end() || mapSearch->second.GetLength() != CycloOrderHf) { - PreCompute(rootOfUnity, CycloOrder, modulus); - } - + auto modulus = element.GetModulus(); + PreCompute(rootOfUnity, cycloOrder, modulus); NumberTheoreticTransformNat().ForwardTransformToBitReverse( element, m_rootOfUnityReverseTableByModulus[modulus], m_rootOfUnityPreconReverseTableByModulus[modulus], result); @@ -705,29 +676,13 @@ void ChineseRemainderTransformFTTNat::ForwardTransformToBitReverse(cons template void ChineseRemainderTransformFTTNat::InverseTransformFromBitReverseInPlace(const IntType& rootOfUnity, - const usint CycloOrder, + const uint32_t cycloOrder, VecType* element) { - if (rootOfUnity == IntType(1) || rootOfUnity == IntType(0)) { + if (rootOfUnity == IntType(1) || rootOfUnity == IntType(0)) return; - } - - if (!IsPowerOfTwo(CycloOrder)) { - OPENFHE_THROW("CyclotomicOrder is not a power of two"); - } - - usint CycloOrderHf = (CycloOrder >> 1); - if (element->GetLength() != CycloOrderHf) { - OPENFHE_THROW("element size must be equal to CyclotomicOrder / 2"); - } - - IntType modulus = element->GetModulus(); - - auto mapSearch = m_rootOfUnityReverseTableByModulus.find(modulus); - if (mapSearch == m_rootOfUnityReverseTableByModulus.end() || mapSearch->second.GetLength() != CycloOrderHf) { - PreCompute(rootOfUnity, CycloOrder, modulus); - } - - usint msb = GetMSB(CycloOrderHf - 1); + auto modulus = element->GetModulus(); + PreCompute(rootOfUnity, cycloOrder, modulus); + uint32_t msb = GetMSB((cycloOrder >> 1) - 1); NumberTheoreticTransformNat().InverseTransformFromBitReverseInPlace( m_rootOfUnityInverseReverseTableByModulus[modulus], m_rootOfUnityInversePreconReverseTableByModulus[modulus], m_cycloOrderInverseTableByModulus[modulus][msb], m_cycloOrderInversePreconTableByModulus[modulus][msb], @@ -737,114 +692,78 @@ void ChineseRemainderTransformFTTNat::InverseTransformFromBitReverseInP template void ChineseRemainderTransformFTTNat::InverseTransformFromBitReverse(const VecType& element, const IntType& rootOfUnity, - const usint CycloOrder, VecType* result) { + const uint32_t cycloOrder, + VecType* result) { if (rootOfUnity == IntType(1) || rootOfUnity == IntType(0)) { *result = element; return; } - - if (!IsPowerOfTwo(CycloOrder)) { - OPENFHE_THROW("CyclotomicOrder is not a power of two"); - } - - usint CycloOrderHf = (CycloOrder >> 1); - if (result->GetLength() != 
CycloOrderHf) { - OPENFHE_THROW("result size must be equal to CyclotomicOrder / 2"); - } - - IntType modulus = element.GetModulus(); - - auto mapSearch = m_rootOfUnityReverseTableByModulus.find(modulus); - if (mapSearch == m_rootOfUnityReverseTableByModulus.end() || mapSearch->second.GetLength() != CycloOrderHf) { - PreCompute(rootOfUnity, CycloOrder, modulus); - } - - usint n = element.GetLength(); - result->SetModulus(element.GetModulus()); - for (usint i = 0; i < n; i++) { + auto modulus = element.GetModulus(); + result->SetModulus(modulus); + PreCompute(rootOfUnity, cycloOrder, modulus); + uint32_t n = element.GetLength(); + for (uint32_t i = 0; i < n; ++i) (*result)[i] = element[i]; - } - - usint msb = GetMSB(CycloOrderHf - 1); + uint32_t msb = GetMSB(n - 1); NumberTheoreticTransformNat().InverseTransformFromBitReverseInPlace( m_rootOfUnityInverseReverseTableByModulus[modulus], m_rootOfUnityInversePreconReverseTableByModulus[modulus], m_cycloOrderInverseTableByModulus[modulus][msb], m_cycloOrderInversePreconTableByModulus[modulus][msb], result); - - return; } template -void ChineseRemainderTransformFTTNat::PreCompute(const IntType& rootOfUnity, const usint CycloOrder, +void ChineseRemainderTransformFTTNat::PreCompute(const IntType& rootOfUnity, const uint32_t cycloOrder, const IntType& modulus) { - usint CycloOrderHf = (CycloOrder >> 1); - + auto ringDim = (cycloOrder >> 1); auto mapSearch = m_rootOfUnityReverseTableByModulus.find(modulus); - if (mapSearch == m_rootOfUnityReverseTableByModulus.end() || mapSearch->second.GetLength() != CycloOrderHf) { + if (mapSearch == m_rootOfUnityReverseTableByModulus.end() || mapSearch->second.GetLength() != ringDim) { #pragma omp critical { IntType x(1), xinv(1); - usint msb = GetMSB(CycloOrderHf - 1); - IntType mu = modulus.ComputeMu(); - VecType Table(CycloOrderHf, modulus); - VecType TableI(CycloOrderHf, modulus); + uint32_t msb = GetMSB(ringDim - 1); + IntType mu = modulus.ComputeMu(); IntType rootOfUnityInverse = rootOfUnity.ModInverse(modulus); - usint iinv; - for (usint i = 0; i < CycloOrderHf; i++) { - iinv = ReverseBits(i, msb); - Table[iinv] = x; - TableI[iinv] = xinv; + NativeInteger nModulus = modulus.ConvertToInt(); + VecType Table(ringDim, modulus); + VecType TableI(ringDim, modulus); + VecType preconTable(ringDim, nModulus); + VecType preconTableI(ringDim, nModulus); + for (uint32_t i = 0; i < ringDim; ++i) { + auto iinv = ReverseBits(i, msb); + Table[iinv] = x; + preconTable[iinv] = NativeInteger(x.ConvertToInt()).PrepModMulConst(nModulus); x.ModMulEq(rootOfUnity, modulus, mu); + TableI[iinv] = xinv; + preconTableI[iinv] = NativeInteger(xinv.ConvertToInt()).PrepModMulConst(nModulus); xinv.ModMulEq(rootOfUnityInverse, modulus, mu); } - m_rootOfUnityReverseTableByModulus[modulus] = Table; - m_rootOfUnityInverseReverseTableByModulus[modulus] = TableI; + m_rootOfUnityReverseTableByModulus[modulus] = std::move(Table); + m_rootOfUnityInverseReverseTableByModulus[modulus] = std::move(TableI); + m_rootOfUnityPreconReverseTableByModulus[modulus] = std::move(preconTable); + m_rootOfUnityInversePreconReverseTableByModulus[modulus] = std::move(preconTableI); + IntType coInv(1); VecType TableCOI(msb + 1, modulus); - for (usint i = 0; i < msb + 1; i++) { - IntType coInv(IntType(1 << i).ModInverse(modulus)); - TableCOI[i] = coInv; + VecType preconTableCOI(msb + 1, nModulus); + for (uint32_t i = 0; i <= msb; ++i) { + TableCOI[i] = coInv.ModInverse(modulus); + preconTableCOI[i] = NativeInteger(TableCOI[i].ConvertToInt()).PrepModMulConst(nModulus); 
+ coInv <<= 1; } - m_cycloOrderInverseTableByModulus[modulus] = TableCOI; - - NativeInteger nativeModulus = modulus.ConvertToInt(); - VecType preconTable(CycloOrderHf, nativeModulus); - VecType preconTableI(CycloOrderHf, nativeModulus); - - for (usint i = 0; i < CycloOrderHf; i++) { - preconTable[i] = NativeInteger(m_rootOfUnityReverseTableByModulus[modulus][i].ConvertToInt()) - .PrepModMulConst(nativeModulus); - preconTableI[i] = NativeInteger(m_rootOfUnityInverseReverseTableByModulus[modulus][i].ConvertToInt()) - .PrepModMulConst(nativeModulus); - } - - VecType preconTableCOI(msb + 1, nativeModulus); - for (usint i = 0; i < msb + 1; i++) { - preconTableCOI[i] = NativeInteger(m_cycloOrderInverseTableByModulus[modulus][i].ConvertToInt()) - .PrepModMulConst(nativeModulus); - } - - m_rootOfUnityPreconReverseTableByModulus[modulus] = preconTable; - m_rootOfUnityInversePreconReverseTableByModulus[modulus] = preconTableI; - m_cycloOrderInversePreconTableByModulus[modulus] = preconTableCOI; + m_cycloOrderInverseTableByModulus[modulus] = std::move(TableCOI); + m_cycloOrderInversePreconTableByModulus[modulus] = std::move(preconTableCOI); } } } template -void ChineseRemainderTransformFTTNat::PreCompute(std::vector& rootOfUnity, const usint CycloOrder, +void ChineseRemainderTransformFTTNat::PreCompute(std::vector& rootOfUnity, const uint32_t cycloOrder, std::vector& moduliiChain) { - usint numOfRootU = rootOfUnity.size(); - usint numModulii = moduliiChain.size(); - - if (numOfRootU != numModulii) { + uint32_t numOfRootU = rootOfUnity.size(); + uint32_t numModulii = moduliiChain.size(); + if (numOfRootU != numModulii) OPENFHE_THROW("size of root of unity and size of moduli chain not of same size"); - } - - for (usint i = 0; i < numOfRootU; ++i) { - IntType currentRoot(rootOfUnity[i]); - IntType currentMod(moduliiChain[i]); - PreCompute(currentRoot, CycloOrder, currentMod); - } + for (uint32_t i = 0; i < numOfRootU; ++i) + PreCompute(rootOfUnity[i], cycloOrder, moduliiChain[i]); } template @@ -858,8 +777,8 @@ void ChineseRemainderTransformFTTNat::Reset() { } template -void BluesteinFFTNat::PreComputeDefaultNTTModulusRoot(usint cycloOrder, const IntType& modulus) { - usint nttDim = pow(2, ceil(log2(2 * cycloOrder - 1))); +void BluesteinFFTNat::PreComputeDefaultNTTModulusRoot(uint32_t cycloOrder, const IntType& modulus) { + uint32_t nttDim = pow(2, ceil(log2(2 * cycloOrder - 1))); const auto nttModulus = LastPrime(log2(nttDim) + 2 * modulus.GetMSB(), nttDim); const auto nttRoot = RootOfUnity(nttDim, nttModulus); const ModulusRoot nttModulusRoot = {nttModulus, nttRoot}; @@ -869,44 +788,39 @@ void BluesteinFFTNat::PreComputeDefaultNTTModulusRoot(usint cycloOrder, } template -void BluesteinFFTNat::PreComputeRootTableForNTT(usint cyclotoOrder, +void BluesteinFFTNat::PreComputeRootTableForNTT(uint32_t cyclotoOrder, const ModulusRoot& nttModulusRoot) { - usint nttDim = pow(2, ceil(log2(2 * cyclotoOrder - 1))); + uint32_t nttDim = pow(2, ceil(log2(2 * cyclotoOrder - 1))); const auto& nttModulus = nttModulusRoot.first; const auto& nttRoot = nttModulusRoot.second; IntType root(nttRoot); - auto rootInv = root.ModInverse(nttModulus); - usint nttDimHf = (nttDim >> 1); + uint32_t nttDimHf = (nttDim >> 1); VecType rootTable(nttDimHf, nttModulus); VecType rootTableInverse(nttDimHf, nttModulus); - IntType x(1); - for (usint i = 0; i < nttDimHf; i++) { + IntType x(1), y(1); + for (uint32_t i = 0; i < nttDimHf; ++i) { rootTable[i] = x; - x = x.ModMul(root, nttModulus); - } - - x = 1; - for (usint i = 0; i < nttDimHf; 
i++) { - rootTableInverse[i] = x; - x = x.ModMul(rootInv, nttModulus); + x.ModMulEq(root, nttModulus); + rootTableInverse[i] = y; + y.ModMulEq(rootInv, nttModulus); } - m_rootOfUnityTableByModulusRoot[nttModulusRoot] = rootTable; - m_rootOfUnityInverseTableByModulusRoot[nttModulusRoot] = rootTableInverse; + m_rootOfUnityTableByModulusRoot[nttModulusRoot] = std::move(rootTable); + m_rootOfUnityInverseTableByModulusRoot[nttModulusRoot] = std::move(rootTableInverse); } template -void BluesteinFFTNat::PreComputePowers(usint cycloOrder, const ModulusRoot& modulusRoot) { +void BluesteinFFTNat::PreComputePowers(uint32_t cycloOrder, const ModulusRoot& modulusRoot) { const auto& modulus = modulusRoot.first; const auto& root = modulusRoot.second; VecType powers(cycloOrder, modulus); powers[0] = 1; - for (usint i = 1; i < cycloOrder; i++) { + for (uint32_t i = 1; i < cycloOrder; i++) { auto iSqr = (i * i) % (2 * cycloOrder); auto val = root.ModExp(IntType(iSqr), modulus); powers[i] = val; @@ -915,7 +829,7 @@ void BluesteinFFTNat::PreComputePowers(usint cycloOrder, const ModulusR } template -void BluesteinFFTNat::PreComputeRBTable(usint cycloOrder, const ModulusRootPair& modulusRootPair) { +void BluesteinFFTNat::PreComputeRBTable(uint32_t cycloOrder, const ModulusRootPair& modulusRootPair) { const auto& modulusRoot = modulusRootPair.first; const auto& modulus = modulusRoot.first; const auto& root = modulusRoot.second; @@ -926,11 +840,11 @@ void BluesteinFFTNat::PreComputeRBTable(usint cycloOrder, const Modulus // const auto &nttRoot = nttModulusRoot.second; // assumes rootTable is precomputed const auto& rootTable = m_rootOfUnityTableByModulusRoot[nttModulusRoot]; - usint nttDim = pow(2, ceil(log2(2 * cycloOrder - 1))); + uint32_t nttDim = pow(2, ceil(log2(2 * cycloOrder - 1))); VecType b(2 * cycloOrder - 1, modulus); b[cycloOrder - 1] = 1; - for (usint i = 1; i < cycloOrder; i++) { + for (uint32_t i = 1; i < cycloOrder; i++) { auto iSqr = (i * i) % (2 * cycloOrder); auto val = rootInv.ModExp(IntType(iSqr), modulus); b[cycloOrder - 1 + i] = val; @@ -947,7 +861,7 @@ void BluesteinFFTNat::PreComputeRBTable(usint cycloOrder, const Modulus template VecType BluesteinFFTNat::ForwardTransform(const VecType& element, const IntType& root, - const usint cycloOrder) { + const uint32_t cycloOrder) { const auto& modulus = element.GetModulus(); const auto& nttModulusRoot = m_defaultNTTModulusRoot[modulus]; @@ -955,7 +869,8 @@ VecType BluesteinFFTNat::ForwardTransform(const VecType& element, const } template -VecType BluesteinFFTNat::ForwardTransform(const VecType& element, const IntType& root, const usint cycloOrder, +VecType BluesteinFFTNat::ForwardTransform(const VecType& element, const IntType& root, + const uint32_t cycloOrder, const ModulusRoot& nttModulusRoot) { if (element.GetLength() != cycloOrder) { OPENFHE_THROW("expected size of element vector should be equal to cyclotomic order"); @@ -972,8 +887,8 @@ VecType BluesteinFFTNat::ForwardTransform(const VecType& element, const m_rootOfUnityInverseTableByModulusRoot[nttModulusRoot]; // assumes rootTableInverse is precomputed VecType x = element.ModMul(powers); - usint nttDim = pow(2, ceil(log2(2 * cycloOrder - 1))); - auto Ra = PadZeros(x, nttDim); + uint32_t nttDim = pow(2, ceil(log2(2 * cycloOrder - 1))); + auto Ra = PadZeros(x, nttDim); Ra.SetModulus(nttModulus); VecType RA(nttDim); NumberTheoreticTransformNat().ForwardTransformIterative(Ra, rootTable, &RA); @@ -993,15 +908,15 @@ VecType BluesteinFFTNat::ForwardTransform(const VecType& element, const } 
template -VecType BluesteinFFTNat::PadZeros(const VecType& a, const usint finalSize) { - usint s = a.GetLength(); +VecType BluesteinFFTNat::PadZeros(const VecType& a, const uint32_t finalSize) { + uint32_t s = a.GetLength(); VecType result(finalSize, a.GetModulus()); - for (usint i = 0; i < s; i++) { + for (uint32_t i = 0; i < s; i++) { result[i] = a[i]; } - for (usint i = a.GetLength(); i < finalSize; i++) { + for (uint32_t i = a.GetLength(); i < finalSize; i++) { result[i] = IntType(0); } @@ -1009,10 +924,10 @@ VecType BluesteinFFTNat::PadZeros(const VecType& a, const usint finalSi } template -VecType BluesteinFFTNat::Resize(const VecType& a, usint lo, usint hi) { +VecType BluesteinFFTNat::Resize(const VecType& a, uint32_t lo, uint32_t hi) { VecType result(hi - lo + 1, a.GetModulus()); - for (usint i = lo, j = 0; i <= hi; i++, j++) { + for (uint32_t i = lo, j = 0; i <= hi; i++, j++) { result[j] = a[i]; } @@ -1034,12 +949,12 @@ void ChineseRemainderTransformArbNat::SetCylotomicPolynomial(const VecT } template -void ChineseRemainderTransformArbNat::PreCompute(const usint cyclotoOrder, const IntType& modulus) { +void ChineseRemainderTransformArbNat::PreCompute(const uint32_t cyclotoOrder, const IntType& modulus) { BluesteinFFTNat().PreComputeDefaultNTTModulusRoot(cyclotoOrder, modulus); } template -void ChineseRemainderTransformArbNat::SetPreComputedNTTModulus(usint cyclotoOrder, const IntType& modulus, +void ChineseRemainderTransformArbNat::SetPreComputedNTTModulus(uint32_t cyclotoOrder, const IntType& modulus, const IntType& nttModulus, const IntType& nttRoot) { const ModulusRoot nttModulusRoot = {nttModulus, nttRoot}; @@ -1047,19 +962,19 @@ void ChineseRemainderTransformArbNat::SetPreComputedNTTModulus(usint cy } template -void ChineseRemainderTransformArbNat::SetPreComputedNTTDivisionModulus(usint cyclotoOrder, +void ChineseRemainderTransformArbNat::SetPreComputedNTTDivisionModulus(uint32_t cyclotoOrder, const IntType& modulus, const IntType& nttMod, const IntType& nttRootBig) { OPENFHE_DEBUG_FLAG(false); - usint n = GetTotient(cyclotoOrder); + uint32_t n = GetTotient(cyclotoOrder); OPENFHE_DEBUG("GetTotient(" << cyclotoOrder << ")= " << n); - usint power = cyclotoOrder - n; + uint32_t power = cyclotoOrder - n; m_nttDivisionDim[cyclotoOrder] = 2 * std::pow(2, ceil(log2(power))); - usint nttDimBig = std::pow(2, ceil(log2(2 * cyclotoOrder - 1))); + uint32_t nttDimBig = std::pow(2, ceil(log2(2 * cyclotoOrder - 1))); // Computes the root of unity for the division NTT based on the root of unity // for regular NTT @@ -1068,22 +983,22 @@ void ChineseRemainderTransformArbNat::SetPreComputedNTTDivisionModulus( m_DivisionNTTModulus[modulus] = nttMod; m_DivisionNTTRootOfUnity[modulus] = nttRoot; // part0 setting of rootTable and inverse rootTable - usint nttDim = m_nttDivisionDim[cyclotoOrder]; + uint32_t nttDim = m_nttDivisionDim[cyclotoOrder]; IntType root(nttRoot); auto rootInv = root.ModInverse(nttMod); - usint nttDimHf = (nttDim >> 1); + uint32_t nttDimHf = (nttDim >> 1); VecType rootTable(nttDimHf, nttMod); VecType rootTableInverse(nttDimHf, nttMod); IntType x(1); - for (usint i = 0; i < nttDimHf; i++) { + for (uint32_t i = 0; i < nttDimHf; i++) { rootTable[i] = x; x = x.ModMul(root, nttMod); } x = 1; - for (usint i = 0; i < nttDimHf; i++) { + for (uint32_t i = 0; i < nttDimHf; i++) { rootTableInverse[i] = x; x = x.ModMul(rootInv, nttMod); } @@ -1105,7 +1020,7 @@ void ChineseRemainderTransformArbNat::SetPreComputedNTTDivisionModulus( const auto& cycloPoly = m_cyclotomicPolyMap[modulus]; 
VecType QForwardTransform(nttDim, nttMod); - for (usint i = 0; i < cycloPoly.GetLength(); i++) { + for (uint32_t i = 0; i < cycloPoly.GetLength(); i++) { QForwardTransform[i] = cycloPoly[i]; } @@ -1117,17 +1032,17 @@ void ChineseRemainderTransformArbNat::SetPreComputedNTTDivisionModulus( template VecType ChineseRemainderTransformArbNat::InversePolyMod(const VecType& cycloPoly, const IntType& modulus, - usint power) { + uint32_t power) { VecType result(power, modulus); - usint r = ceil(log2(power)); + uint32_t r = ceil(log2(power)); VecType h(1, modulus); // h is a unit polynomial h[0] = 1; // Precompute the Barrett mu parameter IntType mu = modulus.ComputeMu(); - for (usint i = 0; i < r; i++) { - usint qDegree = std::pow(2, i + 1); + for (uint32_t i = 0; i < r; i++) { + uint32_t qDegree = std::pow(2, i + 1); VecType q(qDegree + 1, modulus); // q = x^(2^i+1) q[qDegree] = 1; auto hSquare = PolynomialMultiplication(h, h); @@ -1135,7 +1050,7 @@ VecType ChineseRemainderTransformArbNat::InversePolyMod(const VecType& auto a = h * IntType(2); auto b = PolynomialMultiplication(hSquare, cycloPoly); // b = 2h - gh^2 - for (usint j = 0; j < b.GetLength(); j++) { + for (uint32_t j = 0; j < b.GetLength(); j++) { if (j < a.GetLength()) { b[j] = a[j].ModSub(b[j], modulus, mu); } @@ -1146,7 +1061,7 @@ VecType ChineseRemainderTransformArbNat::InversePolyMod(const VecType& h = PolyMod(b, q, modulus); } // take modulo x^power - for (usint i = 0; i < power; i++) { + for (uint32_t i = 0; i < power; i++) { result[i] = h[i]; } @@ -1156,8 +1071,8 @@ VecType ChineseRemainderTransformArbNat::InversePolyMod(const VecType& template VecType ChineseRemainderTransformArbNat::ForwardTransform(const VecType& element, const IntType& root, const IntType& nttModulus, const IntType& nttRoot, - const usint cycloOrder) { - usint phim = GetTotient(cycloOrder); + const uint32_t cycloOrder) { + uint32_t phim = GetTotient(cycloOrder); if (element.GetLength() != phim) { OPENFHE_THROW("element size should be equal to phim"); } @@ -1194,8 +1109,8 @@ VecType ChineseRemainderTransformArbNat::ForwardTransform(const VecType template VecType ChineseRemainderTransformArbNat::InverseTransform(const VecType& element, const IntType& root, const IntType& nttModulus, const IntType& nttRoot, - const usint cycloOrder) { - usint phim = GetTotient(cycloOrder); + const uint32_t cycloOrder) { + uint32_t phim = GetTotient(cycloOrder); if (element.GetLength() != phim) { OPENFHE_THROW("element size should be equal to phim"); } @@ -1231,20 +1146,20 @@ VecType ChineseRemainderTransformArbNat::InverseTransform(const VecType } template -VecType ChineseRemainderTransformArbNat::Pad(const VecType& element, const usint cycloOrder, bool forward) { - usint n = GetTotient(cycloOrder); +VecType ChineseRemainderTransformArbNat::Pad(const VecType& element, const uint32_t cycloOrder, bool forward) { + uint32_t n = GetTotient(cycloOrder); const auto& modulus = element.GetModulus(); VecType inputToBluestein(cycloOrder, modulus); if (forward) { // Forward transform padding - for (usint i = 0; i < n; i++) { + for (uint32_t i = 0; i < n; i++) { inputToBluestein[i] = element[i]; } } else { // Inverse transform padding auto tList = GetTotientList(cycloOrder); - usint i = 0; + uint32_t i = 0; for (auto& coprime : tList) { inputToBluestein[coprime] = element[i++]; } @@ -1254,16 +1169,16 @@ VecType ChineseRemainderTransformArbNat::Pad(const VecType& element, co } template -VecType ChineseRemainderTransformArbNat::Drop(const VecType& element, const usint cycloOrder, bool forward, 
+VecType ChineseRemainderTransformArbNat::Drop(const VecType& element, const uint32_t cycloOrder, bool forward, const IntType& bigMod, const IntType& bigRoot) { - usint n = GetTotient(cycloOrder); + uint32_t n = GetTotient(cycloOrder); const auto& modulus = element.GetModulus(); VecType output(n, modulus); if (forward) { // Forward transform drop auto tList = GetTotientList(cycloOrder); - for (usint i = 0; i < n; i++) { + for (uint32_t i = 0; i < n; i++) { output[i] = element[tList[i]]; } } @@ -1273,7 +1188,7 @@ VecType ChineseRemainderTransformArbNat::Drop(const VecType& element, c // cycloOrder is prime: Reduce mod Phi_{n+1}(x) // Reduction involves subtracting the coeff of x^n from all terms auto coeff_n = element[n]; - for (usint i = 0; i < n; i++) { + for (uint32_t i = 0; i < n; i++) { output[i] = element[i].ModSub(coeff_n, modulus, mu); } } @@ -1282,7 +1197,7 @@ VecType ChineseRemainderTransformArbNat::Drop(const VecType& element, c // cycloOrder is 2*prime: 2 Step reduction // First reduce mod x^(n+1)+1 (=(x+1)*Phi_{2*(n+1)}(x)) // Subtract co-efficient of x^(i+n+1) from x^(i) - for (usint i = 0; i < n; i++) { + for (uint32_t i = 0; i < n; i++) { auto coeff_i = element[i]; auto coeff_ip = element[i + n + 1]; output[i] = coeff_i.ModSub(coeff_ip, modulus, mu); @@ -1290,7 +1205,7 @@ VecType ChineseRemainderTransformArbNat::Drop(const VecType& element, c auto coeff_n = element[n].ModSub(element[2 * n + 1], modulus, mu); // Now reduce mod Phi_{2*(n+1)}(x) // Similar to the prime case but with alternating signs - for (usint i = 0; i < n; i++) { + for (uint32_t i = 0; i < n; i++) { if (i % 2 == 0) { output[i].ModSubEq(coeff_n, modulus, mu); } @@ -1314,8 +1229,8 @@ VecType ChineseRemainderTransformArbNat::Drop(const VecType& element, c const auto& rootTable = m_rootOfUnityDivisionTableByModulus[nttMod]; VecType aPadded2(m_nttDivisionDim[cycloOrder], nttMod); // perform mod operation - usint power = cycloOrder - n; - for (usint i = n; i < element.GetLength(); i++) { + uint32_t power = cycloOrder - n; + for (uint32_t i = n; i < element.GetLength(); i++) { aPadded2[power - (i - n) - 1] = element[i]; } VecType A(m_nttDivisionDim[cycloOrder]); @@ -1326,7 +1241,7 @@ VecType ChineseRemainderTransformArbNat::Drop(const VecType& element, c NumberTheoreticTransformNat().InverseTransformIterative(AB, rootTableInverse, &a); VecType quotient(m_nttDivisionDim[cycloOrder], modulus); - for (usint i = 0; i < power; i++) { + for (uint32_t i = 0; i < power; i++) { quotient[i] = a[i]; } quotient.ModEq(modulus); @@ -1344,7 +1259,7 @@ VecType ChineseRemainderTransformArbNat::Drop(const VecType& element, c IntType mu = modulus.ComputeMu(); // Precompute the Barrett mu parameter - for (usint i = 0; i < n; i++) { + for (uint32_t i = 0; i < n; i++) { output[i] = element[i].ModSub(newQuotient2[cycloOrder - 1 - i], modulus, mu); } } diff --git a/src/pke/examples/functional-bootstrapping-ckks.cpp b/src/pke/examples/functional-bootstrapping-ckks.cpp index bb3878e16..98540fb08 100644 --- a/src/pke/examples/functional-bootstrapping-ckks.cpp +++ b/src/pke/examples/functional-bootstrapping-ckks.cpp @@ -163,13 +163,12 @@ void ArbitraryLUT(BigInteger QBFVInit, BigInteger PInput, BigInteger POutput, Bi parameters.SetNumLargeDigits(dnum); parameters.SetBatchSize(numSlotsCKKS); parameters.SetRingDim(ringDim); - uint32_t depth = levelsAvailableAfterBootstrap; + uint32_t depth = levelsAvailableAfterBootstrap; if (binaryLUT) depth += FHECKKSRNS::GetFBTDepth(lvlb, coeffint, PInput, order, secretKeyDist); else depth += 
FHECKKSRNS::GetFBTDepth(lvlb, coeffcomp, PInput, order, secretKeyDist); - parameters.SetMultiplicativeDepth(depth); auto cc = GenCryptoContext(parameters); @@ -321,13 +320,12 @@ void MultiValueBootstrapping(BigInteger QBFVInit, BigInteger PInput, BigInteger parameters.SetNumLargeDigits(dnum); parameters.SetBatchSize(numSlotsCKKS); parameters.SetRingDim(ringDim); - uint32_t depth = levelsAvailableAfterBootstrap + levelsComputation; + uint32_t depth = levelsAvailableAfterBootstrap + levelsComputation; if (binaryLUT) depth += FHECKKSRNS::GetFBTDepth(lvlb, coeffint1, PInput, order, secretKeyDist); else depth += FHECKKSRNS::GetFBTDepth(lvlb, coeffcomp1, PInput, order, secretKeyDist); - parameters.SetMultiplicativeDepth(depth); auto cc = GenCryptoContext(parameters); @@ -568,12 +566,10 @@ void MultiPrecisionSign(BigInteger QBFVInit, BigInteger PInput, BigInteger PDigi parameters.SetRingDim(ringDim); uint32_t depth = levelsAvailableAfterBootstrap; - if (binaryLUT) depth += FHECKKSRNS::GetFBTDepth(lvlb, coeffintMod, PDigit, order, secretKeyDist); else depth += FHECKKSRNS::GetFBTDepth(lvlb, coeffcompMod, PDigit, order, secretKeyDist); - parameters.SetMultiplicativeDepth(depth); auto cc = GenCryptoContext(parameters); diff --git a/src/pke/examples/simple-ckks-bootstrapping.cpp b/src/pke/examples/simple-ckks-bootstrapping.cpp index 03fcf7b94..b47bf2bb9 100644 --- a/src/pke/examples/simple-ckks-bootstrapping.cpp +++ b/src/pke/examples/simple-ckks-bootstrapping.cpp @@ -35,8 +35,6 @@ Example for CKKS bootstrapping with full packing */ -#define PROFILE - #include "openfhe.h" using namespace lbcrypto; @@ -78,12 +76,12 @@ void SimpleBootstrapExample() { */ #if NATIVEINT == 128 ScalingTechnique rescaleTech = FIXEDAUTO; - usint dcrtBits = 78; - usint firstMod = 89; + uint32_t dcrtBits = 78; + uint32_t firstMod = 89; #else ScalingTechnique rescaleTech = FLEXIBLEAUTO; - usint dcrtBits = 59; - usint firstMod = 60; + uint32_t dcrtBits = 59; + uint32_t firstMod = 60; #endif parameters.SetScalingModSize(dcrtBits); @@ -99,11 +97,11 @@ void SimpleBootstrapExample() { */ std::vector levelBudget = {4, 4}; - // Note that the actual number of levels avalailable after bootstrapping before next bootstrapping + // Note that the actual number of levels available after bootstrapping before next bootstrapping // will be levelsAvailableAfterBootstrap - 1 because an additional level // is used for scaling the ciphertext before next bootstrapping (in 64-bit CKKS bootstrapping) uint32_t levelsAvailableAfterBootstrap = 10; - usint depth = levelsAvailableAfterBootstrap + FHECKKSRNS::GetBootstrapDepth(levelBudget, secretKeyDist); + uint32_t depth = levelsAvailableAfterBootstrap + FHECKKSRNS::GetBootstrapDepth(levelBudget, secretKeyDist); parameters.SetMultiplicativeDepth(depth); CryptoContext cryptoContext = GenCryptoContext(parameters); @@ -114,10 +112,10 @@ void SimpleBootstrapExample() { cryptoContext->Enable(ADVANCEDSHE); cryptoContext->Enable(FHE); - usint ringDim = cryptoContext->GetRingDimension(); + uint32_t ringDim = cryptoContext->GetRingDimension(); // This is the maximum number of slots that can be used for full packing.
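Editor's illustration (not part of the patch): a CKKS ciphertext over a ring of dimension N carries at most N/2 complex slots, which is why full packing uses half of the ring dimension obtained from GetRingDimension() in the surrounding code. A minimal sketch with an assumed example value:

    uint32_t ringDim  = 1 << 16;       // assumed example value; the surrounding code queries GetRingDimension()
    uint32_t numSlots = ringDim / 2;   // 32768 slots for full packing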
- usint numSlots = ringDim / 2; - std::cout << "CKKS scheme is using ring dimension " << ringDim << std::endl << std::endl; + uint32_t numSlots = ringDim / 2; + std::cout << "CKKS scheme ring dimension: " << ringDim << "\n\n"; cryptoContext->EvalBootstrapSetup(levelBudget); @@ -132,22 +130,26 @@ void SimpleBootstrapExample() { Plaintext ptxt = cryptoContext->MakeCKKSPackedPlaintext(x, 1, depth - 1); ptxt->SetLength(encodedLength); - std::cout << "Input: " << ptxt << std::endl; + std::cout << "Input: " << ptxt << "\n"; Ciphertext ciph = cryptoContext->Encrypt(keyPair.publicKey, ptxt); - std::cout << "Initial number of levels remaining: " << depth - ciph->GetLevel() << std::endl; + std::cout << "Initial number of levels remaining: " << depth - ciph->GetLevel() << "\n\n"; + + // auto start = std::chrono::high_resolution_clock::now(); // Perform the bootstrapping operation. The goal is to increase the number of levels remaining // for HE computation. auto ciphertextAfter = cryptoContext->EvalBootstrap(ciph); + // auto stop = std::chrono::high_resolution_clock::now(); + // std::cout << "Bootstrapping time: " << std::chrono::duration(stop - start).count() << " s\n\n"; + std::cout << "Number of levels remaining after bootstrapping: " - << depth - ciphertextAfter->GetLevel() - (ciphertextAfter->GetNoiseScaleDeg() - 1) << std::endl - << std::endl; + << depth - ciphertextAfter->GetLevel() - (ciphertextAfter->GetNoiseScaleDeg() - 1) << "\n\n"; Plaintext result; cryptoContext->Decrypt(keyPair.secretKey, ciphertextAfter, &result); result->SetLength(encodedLength); - std::cout << "Output after bootstrapping \n\t" << result << std::endl; + std::cout << "Output after bootstrapping: " << result << "\n"; } diff --git a/src/pke/include/ciphertext-fwd.h b/src/pke/include/ciphertext-fwd.h index b36467720..35dcbcd41 100644 --- a/src/pke/include/ciphertext-fwd.h +++ b/src/pke/include/ciphertext-fwd.h @@ -57,12 +57,12 @@ template struct seriesPowers { std::vector> powersRe; std::vector> powers2Re; - ConstCiphertext power2km1Re; + Ciphertext power2km1Re; uint32_t k; uint32_t m; std::vector> powersIm; std::vector> powers2Im; - ConstCiphertext power2km1Im; + Ciphertext power2km1Im; seriesPowers() = default; @@ -74,13 +74,13 @@ struct seriesPowers { : powersRe(powers0), powersIm(powers1) {} seriesPowers(const std::vector>& powers0, const std::vector>& powers20, - ConstCiphertext& power2km10, uint32_t k0, uint32_t m0) + const Ciphertext& power2km10, uint32_t k0, uint32_t m0) : powersRe(powers0), powers2Re(powers20), power2km1Re(power2km10), k(k0), m(m0) {} seriesPowers(const std::vector>& powers0, const std::vector>& powers20, - ConstCiphertext& power2km10, uint32_t k0, uint32_t m0, + const Ciphertext& power2km10, uint32_t k0, uint32_t m0, const std::vector>& powers1, const std::vector>& powers21, - ConstCiphertext& power2km11) + const Ciphertext& power2km11) : powersRe(powers0), powers2Re(powers20), power2km1Re(power2km10), diff --git a/src/pke/include/cryptocontext.h b/src/pke/include/cryptocontext.h index 5837b5150..a5c6c3456 100644 --- a/src/pke/include/cryptocontext.h +++ b/src/pke/include/cryptocontext.h @@ -631,6 +631,11 @@ class CryptoContextImpl : public Serializable { return !(a == b); } + /** + * @brief Clears various caches within the library + */ + static void ClearStaticMapsAndVectors(); + /** * @brief Serializes either all EvalMult keys (if keyTag is empty) or the EvalMult keys for keyTag * diff --git a/src/pke/include/scheme/ckksrns/ckksrns-fhe.h b/src/pke/include/scheme/ckksrns/ckksrns-fhe.h index 
233234e61..45b879874 100644 --- a/src/pke/include/scheme/ckksrns/ckksrns-fhe.h +++ b/src/pke/include/scheme/ckksrns/ckksrns-fhe.h @@ -1,7 +1,7 @@ //================================================================================== // BSD 2-Clause License // -// Copyright (c) 2014-2022, NJIT, Duality Technologies Inc. and other contributors +// Copyright (c) 2014-2025, NJIT, Duality Technologies Inc. and other contributors // // All rights reserved. // @@ -63,23 +63,13 @@ class CKKSBootstrapPrecom { CKKSBootstrapPrecom(CKKSBootstrapPrecom&& rhs) noexcept = default; - // number of slots for which the bootstrapping is performed - uint32_t m_slots; - - // the inner dimension in the baby-step giant-step strategy - uint32_t m_dim1; - uint32_t m_gs; - - uint32_t m_levelEnc; - uint32_t m_levelDec; - // level budget for homomorphic encoding, number of layers to collapse in one level, // number of layers remaining to be collapsed in one level to have exactly the number // of levels specified in the level budget, the number of rotations in one level, // the baby step and giant step in the baby-step giant-step strategy, the number of // rotations in the remaining level, the baby step and giant step in the baby-step // giant-step strategy for the remaining level - std::vector m_paramsEnc = std::vector(CKKS_BOOT_PARAMS::TOTAL_ELEMENTS); + struct ckks_boot_params m_paramsEnc; // level budget for homomorphic decoding, number of layers to collapse in one level, // number of layers remaining to be collapsed in one level to have exactly the number @@ -87,7 +77,10 @@ class CKKSBootstrapPrecom { // the baby step and giant step in the baby-step giant-step strategy, the number of // rotations in the remaining level, the baby step and giant step in the baby-step // giant-step strategy for the remaining level - std::vector m_paramsDec = std::vector(CKKS_BOOT_PARAMS::TOTAL_ELEMENTS); + struct ckks_boot_params m_paramsDec; + + // number of slots for which the bootstrapping is performed + uint32_t m_slots; // Linear map U0; used in decoding std::vector m_U0Pre; @@ -106,20 +99,20 @@ class CKKSBootstrapPrecom { template void save(Archive& ar) const { - ar(cereal::make_nvp("dim1_Enc", m_dim1)); - ar(cereal::make_nvp("dim1_Dec", m_paramsDec[CKKS_BOOT_PARAMS::GIANT_STEP])); + ar(cereal::make_nvp("dim1_Enc", m_paramsEnc.g)); + ar(cereal::make_nvp("dim1_Dec", m_paramsDec.g)); ar(cereal::make_nvp("slots", m_slots)); - ar(cereal::make_nvp("lEnc", m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET])); - ar(cereal::make_nvp("lDec", m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET])); + ar(cereal::make_nvp("lEnc", m_paramsEnc.lvlb)); + ar(cereal::make_nvp("lDec", m_paramsDec.lvlb)); } template void load(Archive& ar) { - ar(cereal::make_nvp("dim1_Enc", m_dim1)); - ar(cereal::make_nvp("dim1_Dec", m_paramsDec[CKKS_BOOT_PARAMS::GIANT_STEP])); + ar(cereal::make_nvp("dim1_Enc", m_paramsEnc.g)); + ar(cereal::make_nvp("dim1_Dec", m_paramsDec.g)); ar(cereal::make_nvp("slots", m_slots)); - ar(cereal::make_nvp("lEnc", m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET])); - ar(cereal::make_nvp("lDec", m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET])); + ar(cereal::make_nvp("lEnc", m_paramsEnc.lvlb)); + ar(cereal::make_nvp("lDec", m_paramsDec.lvlb)); } }; @@ -395,7 +388,7 @@ class FHECKKSRNS : public FHERNS { size_t order = 1); template - Ciphertext EvalMVBNoDecodingInternal(std::shared_ptr> ciphertext, + Ciphertext EvalMVBNoDecodingInternal(const std::shared_ptr>& ciphertext, const std::vector& coefficients, uint32_t digitBitSize, size_t order = 1); diff --git 
a/src/pke/include/scheme/ckksrns/ckksrns-utils.h b/src/pke/include/scheme/ckksrns/ckksrns-utils.h index e90c2cade..bdbd92a29 100644 --- a/src/pke/include/scheme/ckksrns/ckksrns-utils.h +++ b/src/pke/include/scheme/ckksrns/ckksrns-utils.h @@ -59,6 +59,9 @@ inline bool IsNotEqualOne(double v, double delta = 0x1p-44) { inline bool IsNotEqualZero(double v, double delta = 0x1p-44) { return std::abs(v) > delta; } +inline bool IsNotEqualNegOne(double v, double delta = 0x1p-44) { + return std::abs(v + 1.0) > delta; +} inline bool IsNotEqualOne(std::complex val, double delta = 0x1p-44) { return IsNotEqualOne(val.real(), delta) || IsNotEqualZero(val.imag(), delta); } @@ -253,15 +256,25 @@ uint32_t ReduceRotation(int32_t index, uint32_t slots); /** * Computes all parameters needed for the homomorphic encoding and decoding in the bootstrapping - * operation and returns them as a vector. The returned vector's data can be accessed using - * enum'ed indices from CKKS_BOOT_PARAMS that are defined below. + * operation and returns them as a struct. * * @param slots number of slots * @param levelBudget the allocated level budget for the computation. * @param dim1 the value for the inner dimension in the baby-step giant-step strategy - * @return vector with parameters for the homomorphic encoding and decoding in bootstrapping + * @return struct with parameters for the homomorphic encoding and decoding in bootstrapping */ -std::vector GetCollapsedFFTParams(uint32_t slots, uint32_t levelBudget = 4, uint32_t dim1 = 0); +struct ckks_boot_params { + uint32_t lvlb; // level budget + uint32_t layersCollapse; // layers to collapse in one level + uint32_t remCollapse; // remaining layers to collapse + uint32_t numRotations; // number of rotations in one level + uint32_t b; // baby step in the baby-step giant-step strategy + uint32_t g; // giant step in the baby-step giant-step strategy + uint32_t numRotationsRem; // number of rotations in the remaining level + uint32_t bRem; // baby step in the baby-step giant-step strategy for the remaining level + uint32_t gRem; // giant step in the baby-step giant-step strategy for the remaining level +}; +struct ckks_boot_params GetCollapsedFFTParams(uint32_t slots, uint32_t levelBudget = 4, uint32_t dim1 = 0); /** * Gets inner loop dimension for baby step giant step algorithm for linear transform, @@ -291,23 -304,6 @@ std::vector FindLTRotationIndicesSwitch(uint32_t dim1, uint32_t m, uint */ std::vector FindLTRotationIndicesSwitchArgmin(uint32_t m, uint32_t blockDimension, uint32_t cols); -namespace CKKS_BOOT_PARAMS { - -// Enums representing indices for the vector returned by GetCollapsedFFTParams() -enum { - LEVEL_BUDGET, // the level budget - LAYERS_COLL, // the number of layers to collapse in one level - LAYERS_REM, // the number of layers remaining to be collapsed in one level to have exactly the number of levels specified in the level budget - NUM_ROTATIONS, // the number of rotations in one level - BABY_STEP, // the baby step in the baby-step giant-step strategy - GIANT_STEP, // the giant step in the baby-step giant-step strategy - NUM_ROTATIONS_REM, // the number of rotations in the remaining level - BABY_STEP_REM, // the baby step in the baby-step giant-step strategy for the remaining level - GIANT_STEP_REM, // the giant step in the baby-step giant-step strategy for the remaining level - TOTAL_ELEMENTS // total number of elements in the vector -}; -} // namespace CKKS_BOOT_PARAMS - } // namespace lbcrypto #endif diff --git a/src/pke/lib/cryptocontext.cpp
b/src/pke/lib/cryptocontext.cpp index fe26edc97..ee2a31749 100644 --- a/src/pke/lib/cryptocontext.cpp +++ b/src/pke/lib/cryptocontext.cpp @@ -48,6 +48,23 @@ template std::map>>> CryptoContextImpl::s_evalAutomorphismKeyMap{}; +template +void CryptoContextImpl::ClearStaticMapsAndVectors() { + CryptoContextImpl::s_evalAutomorphismKeyMap.clear(); + CryptoContextImpl::s_evalMultKeyMap.clear(); + PackedEncoding::Destroy(); + intnat::ChineseRemainderTransformFTTNat().Reset(); +#ifdef WITH_BE2 + bigintfxd::ChineseRemainderTransformFTTFxd().Reset(); +#endif +#ifdef WITH_BE4 + bigintdyn::ChineseRemainderTransformFTTDyn().Reset(); +#endif +#ifdef WITH_NTL + NTL::ChineseRemainderTransformFTTNtl().Reset(); +#endif +} + template void CryptoContextImpl::SetKSTechniqueInScheme() { // check if the scheme is an RNS scheme diff --git a/src/pke/lib/keyswitch/keyswitch-bv.cpp b/src/pke/lib/keyswitch/keyswitch-bv.cpp index 93ee3b08b..04b3fe201 100644 --- a/src/pke/lib/keyswitch/keyswitch-bv.cpp +++ b/src/pke/lib/keyswitch/keyswitch-bv.cpp @@ -50,8 +50,6 @@ namespace lbcrypto { EvalKey KeySwitchBV::KeySwitchGenInternal(const PrivateKey oldKey, const PrivateKey newKey) const { - EvalKeyRelin ek(std::make_shared>(newKey->GetCryptoContext())); - const auto cryptoParams = std::dynamic_pointer_cast(newKey->GetCryptoParameters()); const DCRTPoly& sNew = newKey->GetPrivateElement(); @@ -86,6 +84,8 @@ EvalKey KeySwitchBV::KeySwitchGenInternal(const PrivateKey o std::vector av(nWindows); std::vector bv(nWindows); + // TODO: parallelize loop using fix from KeySwitchHYBRID::KeySwitchGenInternal + if (digitSize > 0) { for (usint i = 0; i < sOld.GetNumOfElements(); i++) { std::vector sOldDecomposed = sOld.GetElementAtIndex(i).PowersOfBase(digitSize); @@ -115,6 +115,7 @@ EvalKey KeySwitchBV::KeySwitchGenInternal(const PrivateKey o } } + EvalKeyRelin ek(std::make_shared>(newKey->GetCryptoContext())); ek->SetAVector(std::move(av)); ek->SetBVector(std::move(bv)); ek->SetKeyTag(newKey->GetKeyTag()); @@ -124,8 +125,6 @@ EvalKey KeySwitchBV::KeySwitchGenInternal(const PrivateKey o EvalKey KeySwitchBV::KeySwitchGenInternal(const PrivateKey oldKey, const PrivateKey newKey, const EvalKey ek) const { - EvalKeyRelin evalKey(std::make_shared>(newKey->GetCryptoContext())); - const auto cryptoParams = std::dynamic_pointer_cast(oldKey->GetCryptoParameters()); const DCRTPoly& sNew = newKey->GetPrivateElement(); @@ -161,6 +160,8 @@ EvalKey KeySwitchBV::KeySwitchGenInternal(const PrivateKey o std::vector av(nWindows); std::vector bv(nWindows); + // TODO: parallelize loop using fix from KeySwitchHYBRID::KeySwitchGenInternal + if (digitSize > 0) { for (usint i = 0; i < sizeSOld; i++) { std::vector sOldDecomposed = sOld.GetElementAtIndex(i).PowersOfBase(digitSize); @@ -200,6 +201,7 @@ EvalKey KeySwitchBV::KeySwitchGenInternal(const PrivateKey o } } + EvalKeyRelin evalKey(std::make_shared>(newKey->GetCryptoContext())); evalKey->SetAVector(std::move(av)); evalKey->SetBVector(std::move(bv)); evalKey->SetKeyTag(newKey->GetKeyTag()); diff --git a/src/pke/lib/keyswitch/keyswitch-hybrid.cpp b/src/pke/lib/keyswitch/keyswitch-hybrid.cpp index 552fee8fc..868e57d45 100644 --- a/src/pke/lib/keyswitch/keyswitch-hybrid.cpp +++ b/src/pke/lib/keyswitch/keyswitch-hybrid.cpp @@ -35,13 +35,12 @@ */ #define PROFILE -#include "keyswitch/keyswitch-hybrid.h" - +#include "ciphertext.h" +#include "key/evalkeyrelin.h" #include "key/privatekey.h" #include "key/publickey.h" -#include "key/evalkeyrelin.h" +#include "keyswitch/keyswitch-hybrid.h" #include 
"scheme/ckksrns/ckksrns-cryptoparameters.h" -#include "ciphertext.h" namespace lbcrypto { @@ -53,81 +52,78 @@ EvalKey KeySwitchHYBRID::KeySwitchGenInternal(const PrivateKey KeySwitchHYBRID::KeySwitchGenInternal(const PrivateKey oldKey, const PrivateKey newKey, const EvalKey ekPrev) const { - EvalKeyRelin ek(std::make_shared>(newKey->GetCryptoContext())); - const auto cryptoParams = std::dynamic_pointer_cast(newKey->GetCryptoParameters()); - - const std::shared_ptr paramsQ = cryptoParams->GetElementParams(); - const std::shared_ptr paramsQP = cryptoParams->GetParamsQP(); - - size_t sizeQ = paramsQ->GetParams().size(); - size_t sizeQP = paramsQP->GetParams().size(); - - DCRTPoly sOld = oldKey->GetPrivateElement(); - DCRTPoly sNew = newKey->GetPrivateElement().Clone(); + const auto& paramsQ = cryptoParams->GetElementParams(); + const auto& paramsQP = cryptoParams->GetParamsQP(); + const auto& pparamsQP = paramsQP->GetParams(); // skNew is currently in basis Q. This extends it to basis QP. - sNew.SetFormat(Format::COEFFICIENT); - - DCRTPoly sNewExt(paramsQP, Format::COEFFICIENT, true); - // The part with basis Q - for (size_t i = 0; i < sizeQ; i++) { - sNewExt.SetElementAtIndex(i, sNew.GetElementAtIndex(i)); - } + DCRTPoly sNewExt(paramsQP, Format::EVALUATION, true); + const auto& sNew = newKey->GetPrivateElement(); - // The part with basis P - for (size_t j = sizeQ; j < sizeQP; j++) { - const NativeInteger& pj = paramsQP->GetParams()[j]->GetModulus(); - const NativeInteger& rootj = paramsQP->GetParams()[j]->GetRootOfUnity(); - auto sNew0 = sNew.GetElementAtIndex(0); - sNew0.SwitchModulus(pj, rootj, 0, 0); - sNewExt.SetElementAtIndex(j, std::move(sNew0)); - } + auto sNew0 = sNew.GetElementAtIndex(0); + sNew0.SetFormat(Format::COEFFICIENT); - sNewExt.SetFormat(Format::EVALUATION); + const uint32_t sizeQ = paramsQ->GetParams().size(); + const uint32_t sizeQP = paramsQP->GetParams().size(); - const auto ns = cryptoParams->GetNoiseScale(); - const DggType& dgg = cryptoParams->GetDiscreteGaussianGenerator(); - DugType dug; +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(sizeQP)) + for (uint32_t i = 0; i < sizeQP; ++i) { + if (i < sizeQ) { + auto tmp = sNew.GetElementAtIndex(i); + tmp.SetFormat(Format::EVALUATION); + sNewExt.SetElementAtIndex(i, std::move(tmp)); + } + else { + auto tmp = sNew0; + tmp.SwitchModulus(pparamsQP[i]->GetModulus(), pparamsQP[i]->GetRootOfUnity(), 0, 0); + tmp.SetFormat(Format::EVALUATION); + sNewExt.SetElementAtIndex(i, std::move(tmp)); + } + } - size_t numPartQ = cryptoParams->GetNumPartQ(); + const auto ns = cryptoParams->GetNoiseScale(); + const uint32_t numPerPartQ = cryptoParams->GetNumPerPartQ(); + const uint32_t numPartQ = cryptoParams->GetNumPartQ(); std::vector av(numPartQ); std::vector bv(numPartQ); - std::vector PModq = cryptoParams->GetPModq(); - size_t numPerPartQ = cryptoParams->GetNumPerPartQ(); + DugType dug; + auto dgg = cryptoParams->GetDiscreteGaussianGenerator(); + + const auto& sOld = oldKey->GetPrivateElement(); + const auto& PModq = cryptoParams->GetPModq(); - for (size_t part = 0; part < numPartQ; ++part) { - DCRTPoly a = (ekPrev == nullptr) ? DCRTPoly(dug, paramsQP, Format::EVALUATION) : // single-key HE - ekPrev->GetAVector()[part]; // threshold HE +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(numPartQ)) private(dug, dgg) + for (uint32_t part = 0; part < numPartQ; ++part) { + auto a = (ekPrev == nullptr) ? 
DCRTPoly(dug, paramsQP, Format::EVALUATION) : // single-key HE + ekPrev->GetAVector()[part]; // threshold HE DCRTPoly e(dgg, paramsQP, Format::EVALUATION); DCRTPoly b(paramsQP, Format::EVALUATION, true); - // starting and ending position of current part - size_t startPartIdx = numPerPartQ * part; - size_t endPartIdx = (sizeQ > (startPartIdx + numPerPartQ)) ? (startPartIdx + numPerPartQ) : sizeQ; + const uint32_t startPartIdx = numPerPartQ * part; + const uint32_t endPartIdx = (sizeQ > (startPartIdx + numPerPartQ)) ? (startPartIdx + numPerPartQ) : sizeQ; - for (size_t i = 0; i < sizeQP; ++i) { - auto ai = a.GetElementAtIndex(i); - auto ei = e.GetElementAtIndex(i); - auto sNewi = sNewExt.GetElementAtIndex(i); + for (uint32_t i = 0; i < sizeQP; ++i) { + const auto& ai = a.GetElementAtIndex(i); + const auto& ei = e.GetElementAtIndex(i); + const auto& sni = sNewExt.GetElementAtIndex(i); if (i < startPartIdx || i >= endPartIdx) { - b.SetElementAtIndex(i, -ai * sNewi + ns * ei); + b.SetElementAtIndex(i, (-ai * sni) + (ns * ei)); } else { - // P * sOld is only applied for the current part - auto sOldi = sOld.GetElementAtIndex(i); - b.SetElementAtIndex(i, -ai * sNewi + PModq[i] * sOldi + ns * ei); + const auto& soi = sOld.GetElementAtIndex(i); + b.SetElementAtIndex(i, (-ai * sni) + (ns * ei) + (PModq[i] * soi)); } } - - av[part] = a; - bv[part] = b; + av[part] = std::move(a); + bv[part] = std::move(b); } + EvalKeyRelin ek(std::make_shared>(newKey->GetCryptoContext())); ek->SetAVector(std::move(av)); ek->SetBVector(std::move(bv)); ek->SetKeyTag(newKey->GetKeyTag()); @@ -136,72 +132,65 @@ EvalKey KeySwitchHYBRID::KeySwitchGenInternal(const PrivateKey KeySwitchHYBRID::KeySwitchGenInternal(const PrivateKey oldKey, const PublicKey newKey) const { - EvalKeyRelin ek = std::make_shared>(newKey->GetCryptoContext()); - const auto cryptoParams = std::dynamic_pointer_cast(newKey->GetCryptoParameters()); + const auto& paramsQ = cryptoParams->GetElementParams(); + const auto& paramsQP = cryptoParams->GetParamsQP(); - const std::shared_ptr paramsQ = cryptoParams->GetElementParams(); - const std::shared_ptr paramsQP = cryptoParams->GetParamsQP(); - - usint sizeQ = paramsQ->GetParams().size(); - usint sizeQP = paramsQP->GetParams().size(); + const uint32_t sizeQ = paramsQ->GetParams().size(); + const uint32_t sizeQP = paramsQP->GetParams().size(); - DCRTPoly sOld = oldKey->GetPrivateElement(); - - DCRTPoly newp0 = newKey->GetPublicElements().at(0); - DCRTPoly newp1 = newKey->GetPublicElements().at(1); - - const auto ns = cryptoParams->GetNoiseScale(); - const DggType& dgg = cryptoParams->GetDiscreteGaussianGenerator(); - TugType tug; - - auto numPartQ = cryptoParams->GetNumPartQ(); + const auto ns = cryptoParams->GetNoiseScale(); + const uint32_t numPerPartQ = cryptoParams->GetNumPerPartQ(); + const uint32_t numPartQ = cryptoParams->GetNumPartQ(); std::vector av(numPartQ); std::vector bv(numPartQ); - std::vector PModq = cryptoParams->GetPModq(); - usint numPerPartQ = cryptoParams->GetNumPerPartQ(); + TugType tug; + auto dgg = cryptoParams->GetDiscreteGaussianGenerator(); - for (usint part = 0; part < numPartQ; part++) { - DCRTPoly u = (cryptoParams->GetSecretKeyDist() == GAUSSIAN) ? 
DCRTPoly(dgg, paramsQP, Format::EVALUATION) : - DCRTPoly(tug, paramsQP, Format::EVALUATION); + const auto& sOld = oldKey->GetPrivateElement(); + const auto& newp0 = newKey->GetPublicElements().at(0); + const auto& newp1 = newKey->GetPublicElements().at(1); + const auto& PModq = cryptoParams->GetPModq(); +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(numPartQ)) private(dgg, tug) + for (uint32_t part = 0; part < numPartQ; ++part) { + auto u = (cryptoParams->GetSecretKeyDist() == GAUSSIAN) ? DCRTPoly(dgg, paramsQP, Format::EVALUATION) : + DCRTPoly(tug, paramsQP, Format::EVALUATION); DCRTPoly e0(dgg, paramsQP, Format::EVALUATION); DCRTPoly e1(dgg, paramsQP, Format::EVALUATION); - DCRTPoly a(paramsQP, Format::EVALUATION, true); DCRTPoly b(paramsQP, Format::EVALUATION, true); // starting and ending position of current part - usint startPartIdx = numPerPartQ * part; - usint endPartIdx = (sizeQ > startPartIdx + numPerPartQ) ? (startPartIdx + numPerPartQ) : sizeQ; + const uint32_t startPartIdx = numPerPartQ * part; + const uint32_t endPartIdx = (sizeQ > startPartIdx + numPerPartQ) ? (startPartIdx + numPerPartQ) : sizeQ; - for (usint i = 0; i < sizeQP; i++) { - auto e0i = e0.GetElementAtIndex(i); - auto e1i = e1.GetElementAtIndex(i); + for (uint32_t i = 0; i < sizeQP; ++i) { + const auto& ui = u.GetElementAtIndex(i); - auto ui = u.GetElementAtIndex(i); + const auto& e0i = e0.GetElementAtIndex(i); + const auto& e1i = e1.GetElementAtIndex(i); - auto newp0i = newp0.GetElementAtIndex(i); - auto newp1i = newp1.GetElementAtIndex(i); + const auto& newp0i = newp0.GetElementAtIndex(i); + const auto& newp1i = newp1.GetElementAtIndex(i); a.SetElementAtIndex(i, newp1i * ui + ns * e1i); if (i < startPartIdx || i >= endPartIdx) { - b.SetElementAtIndex(i, newp0i * ui + ns * e0i); + b.SetElementAtIndex(i, (newp0i * ui) + (ns * e0i)); } else { - // P * sOld is only applied for the current part - auto sOldi = sOld.GetElementAtIndex(i); - b.SetElementAtIndex(i, newp0i * ui + ns * e0i + PModq[i] * sOldi); + const auto& soi = sOld.GetElementAtIndex(i); + b.SetElementAtIndex(i, (newp0i * ui) + (ns * e0i) + (PModq[i] * soi)); } } - - av[part] = a; - bv[part] = b; + av[part] = std::move(a); + bv[part] = std::move(b); } + EvalKeyRelin ek = std::make_shared>(newKey->GetCryptoContext()); ek->SetAVector(std::move(av)); ek->SetBVector(std::move(bv)); ek->SetKeyTag(newKey->GetKeyTag()); @@ -209,114 +198,110 @@ EvalKey KeySwitchHYBRID::KeySwitchGenInternal(const PrivateKey& ciphertext, const EvalKey ek) const { - std::vector& cv = ciphertext->GetElements(); - - std::shared_ptr> ba = (cv.size() == 2) ? 
KeySwitchCore(cv[1], ek) : KeySwitchCore(cv[2], ek); + auto& cv = ciphertext->GetElements(); + auto ba = KeySwitchCore(cv.back(), ek); cv[0].SetFormat((*ba)[0].GetFormat()); cv[0] += (*ba)[0]; cv[1].SetFormat((*ba)[1].GetFormat()); - if (cv.size() > 2) { + if (cv.size() > 2) cv[1] += (*ba)[1]; - } - else { + else cv[1] = (*ba)[1]; - } + cv.resize(2); } Ciphertext KeySwitchHYBRID::KeySwitchExt(ConstCiphertext ciphertext, bool addFirst) const { const auto cryptoParams = std::dynamic_pointer_cast(ciphertext->GetCryptoParameters()); - const std::vector& cv = ciphertext->GetElements(); + const auto& cv = ciphertext->GetElements(); + const auto& PModq = cryptoParams->GetPModq(); - const auto paramsQl = cv[0].GetParams(); const auto paramsP = cryptoParams->GetParamsP(); + const auto paramsQl = cv[0].GetParams(); const auto paramsQlP = cv[0].GetExtendedCRTBasis(paramsP); - uint32_t sizeQl = paramsQl->GetParams().size(); - uint32_t sizeCv = cv.size(); - std::vector resultElements(sizeCv); + const uint32_t sizeCv = cv.size(); + const uint32_t sizeQl = paramsQl->GetParams().size(); + std::vector elements(sizeCv); + for (uint32_t k = 0; k < sizeCv; ++k) { - resultElements[k] = DCRTPoly(paramsQlP, Format::EVALUATION, true); + elements[k] = DCRTPoly(paramsQlP, Format::EVALUATION, true); if ((addFirst) || (k > 0)) { - auto cMult = cv[k].TimesNoCheck(cryptoParams->GetPModq()); + auto cMult = cv[k].TimesNoCheck(PModq); for (uint32_t i = 0; i < sizeQl; ++i) { - resultElements[k].SetElementAtIndex(i, std::move(cMult.GetElementAtIndex(i))); + elements[k].SetElementAtIndex(i, std::move(cMult.GetElementAtIndex(i))); } } } auto result = ciphertext->CloneEmpty(); - result->SetElements(std::move(resultElements)); + result->SetElements(std::move(elements)); return result; } Ciphertext KeySwitchHYBRID::KeySwitchDown(ConstCiphertext ciphertext) const { - const auto cryptoParams = std::dynamic_pointer_cast(ciphertext->GetCryptoParameters()); + const auto& cv = ciphertext->GetElements(); + const auto paramsQlP = cv[0].GetParams(); - const auto paramsP = cryptoParams->GetParamsP(); - const auto paramsQlP = ciphertext->GetElements()[0].GetParams(); + const auto cryptoParams = std::dynamic_pointer_cast(ciphertext->GetCryptoParameters()); + const auto paramsP = cryptoParams->GetParamsP(); // TODO : (Andrey) precompute paramsQl in cryptoparameters - usint sizeQl = paramsQlP->GetParams().size() - paramsP->GetParams().size(); + const uint32_t sizeQl = paramsQlP->GetParams().size() - paramsP->GetParams().size(); std::vector moduliQ(sizeQl); std::vector rootsQ(sizeQl); - for (size_t i = 0; i < sizeQl; i++) { + for (uint32_t i = 0; i < sizeQl; ++i) { moduliQ[i] = paramsQlP->GetParams()[i]->GetModulus(); rootsQ[i] = paramsQlP->GetParams()[i]->GetRootOfUnity(); } - auto paramsQl = std::make_shared(2 * paramsQlP->GetRingDimension(), moduliQ, rootsQ); - - auto cTilda = ciphertext->GetElements(); - - PlaintextModulus t = (cryptoParams->GetNoiseScale() == 1) ? 
0 : cryptoParams->GetPlaintextModulus(); - - DCRTPoly ct0 = cTilda[0].ApproxModDown(paramsQl, cryptoParams->GetParamsP(), cryptoParams->GetPInvModq(), - cryptoParams->GetPInvModqPrecon(), cryptoParams->GetPHatInvModp(), - cryptoParams->GetPHatInvModpPrecon(), cryptoParams->GetPHatModq(), - cryptoParams->GetModqBarrettMu(), cryptoParams->GettInvModp(), - cryptoParams->GettInvModpPrecon(), t, cryptoParams->GettModqPrecon()); - - DCRTPoly ct1 = cTilda[1].ApproxModDown(paramsQl, cryptoParams->GetParamsP(), cryptoParams->GetPInvModq(), - cryptoParams->GetPInvModqPrecon(), cryptoParams->GetPHatInvModp(), - cryptoParams->GetPHatInvModpPrecon(), cryptoParams->GetPHatModq(), - cryptoParams->GetModqBarrettMu(), cryptoParams->GettInvModp(), - cryptoParams->GettInvModpPrecon(), t, cryptoParams->GettModqPrecon()); + const auto paramsQl = std::make_shared(paramsQlP->GetCyclotomicOrder(), moduliQ, rootsQ); + + const PlaintextModulus t = (cryptoParams->GetNoiseScale() == 1) ? 0 : cryptoParams->GetPlaintextModulus(); + + std::vector elements(2); + elements[0] = cv[0].ApproxModDown(paramsQl, cryptoParams->GetParamsP(), cryptoParams->GetPInvModq(), + cryptoParams->GetPInvModqPrecon(), cryptoParams->GetPHatInvModp(), + cryptoParams->GetPHatInvModpPrecon(), cryptoParams->GetPHatModq(), + cryptoParams->GetModqBarrettMu(), cryptoParams->GettInvModp(), + cryptoParams->GettInvModpPrecon(), t, cryptoParams->GettModqPrecon()); + elements[1] = cv[1].ApproxModDown(paramsQl, cryptoParams->GetParamsP(), cryptoParams->GetPInvModq(), + cryptoParams->GetPInvModqPrecon(), cryptoParams->GetPHatInvModp(), + cryptoParams->GetPHatInvModpPrecon(), cryptoParams->GetPHatModq(), + cryptoParams->GetModqBarrettMu(), cryptoParams->GettInvModp(), + cryptoParams->GettInvModpPrecon(), t, cryptoParams->GettModqPrecon()); auto result = ciphertext->CloneEmpty(); - result->SetElements({std::move(ct0), std::move(ct1)}); + result->SetElements(std::move(elements)); return result; } DCRTPoly KeySwitchHYBRID::KeySwitchDownFirstElement(ConstCiphertext ciphertext) const { - const auto cryptoParams = std::dynamic_pointer_cast(ciphertext->GetCryptoParameters()); - - const std::vector& cTilda = ciphertext->GetElements(); + const auto& cv = ciphertext->GetElements()[0]; + const auto paramsQlP = cv.GetParams(); - const auto paramsP = cryptoParams->GetParamsP(); - const auto paramsQlP = cTilda[0].GetParams(); + const auto cryptoParams = std::dynamic_pointer_cast(ciphertext->GetCryptoParameters()); + const auto paramsP = cryptoParams->GetParamsP(); // TODO : (Andrey) precompute paramsQl in cryptoparameters - usint sizeQl = paramsQlP->GetParams().size() - paramsP->GetParams().size(); + const uint32_t sizeQl = paramsQlP->GetParams().size() - paramsP->GetParams().size(); std::vector moduliQ(sizeQl); std::vector rootsQ(sizeQl); - for (size_t i = 0; i < sizeQl; i++) { + for (uint32_t i = 0; i < sizeQl; ++i) { moduliQ[i] = paramsQlP->GetParams()[i]->GetModulus(); rootsQ[i] = paramsQlP->GetParams()[i]->GetRootOfUnity(); } - auto paramsQl = std::make_shared(2 * paramsQlP->GetRingDimension(), moduliQ, rootsQ); - - PlaintextModulus t = (cryptoParams->GetNoiseScale() == 1) ? 
0 : cryptoParams->GetPlaintextModulus(); + const auto paramsQl = std::make_shared(paramsQlP->GetCyclotomicOrder(), moduliQ, rootsQ); - DCRTPoly cv0 = cTilda[0].ApproxModDown(paramsQl, cryptoParams->GetParamsP(), cryptoParams->GetPInvModq(), - cryptoParams->GetPInvModqPrecon(), cryptoParams->GetPHatInvModp(), - cryptoParams->GetPHatInvModpPrecon(), cryptoParams->GetPHatModq(), - cryptoParams->GetModqBarrettMu(), cryptoParams->GettInvModp(), - cryptoParams->GettInvModpPrecon(), t, cryptoParams->GettModqPrecon()); + const PlaintextModulus t = (cryptoParams->GetNoiseScale() == 1) ? 0 : cryptoParams->GetPlaintextModulus(); - return cv0; + return cv.ApproxModDown(paramsQl, cryptoParams->GetParamsP(), cryptoParams->GetPInvModq(), + cryptoParams->GetPInvModqPrecon(), cryptoParams->GetPHatInvModp(), + cryptoParams->GetPHatInvModpPrecon(), cryptoParams->GetPHatModq(), + cryptoParams->GetModqBarrettMu(), cryptoParams->GettInvModp(), + cryptoParams->GettInvModpPrecon(), t, cryptoParams->GettModqPrecon()); } std::shared_ptr> KeySwitchHYBRID::KeySwitchCore(const DCRTPoly& a, @@ -329,86 +314,67 @@ std::shared_ptr> KeySwitchHYBRID::EvalKeySwitchPrecomputeC const DCRTPoly& c, std::shared_ptr> cryptoParamsBase) const { const auto cryptoParams = std::dynamic_pointer_cast(cryptoParamsBase); - const std::shared_ptr paramsQl = c.GetParams(); - const std::shared_ptr paramsP = cryptoParams->GetParamsP(); - const std::shared_ptr paramsQlP = c.GetExtendedCRTBasis(paramsP); - - size_t sizeQl = paramsQl->GetParams().size(); - size_t sizeP = paramsP->GetParams().size(); - size_t sizeQlP = sizeQl + sizeP; + const auto paramsQl = c.GetParams(); + const auto paramsP = cryptoParams->GetParamsP(); + const auto paramsQlP = c.GetExtendedCRTBasis(paramsP); - uint32_t alpha = cryptoParams->GetNumPerPartQ(); + const uint32_t sizeQl = paramsQl->GetParams().size(); + const uint32_t sizeP = paramsP->GetParams().size(); + const uint32_t sizeQlP = sizeQl + sizeP; + const uint32_t alpha = cryptoParams->GetNumPerPartQ(); // The number of digits of the current ciphertext - uint32_t numPartQl = ceil((static_cast(sizeQl)) / alpha); + uint32_t numPartQl = std::ceil(static_cast(sizeQl) / alpha); if (numPartQl > cryptoParams->GetNumberOfQPartitions()) numPartQl = cryptoParams->GetNumberOfQPartitions(); - std::vector partsCt(numPartQl); + auto result = std::make_shared>(numPartQl); // Digit decomposition // Zero-padding and split - for (uint32_t part = 0; part < numPartQl; part++) { + for (uint32_t part = 0; part < numPartQl; ++part) { + DCRTPoly partsCt; if (part == numPartQl - 1) { - auto paramsPartQ = cryptoParams->GetParamsPartQ(part); + const auto& paramsPartQ = cryptoParams->GetParamsPartQ(part); uint32_t sizePartQl = sizeQl - alpha * part; - std::vector moduli(sizePartQl); std::vector roots(sizePartQl); - - for (uint32_t i = 0; i < sizePartQl; i++) { + for (uint32_t i = 0; i < sizePartQl; ++i) { moduli[i] = paramsPartQ->GetParams()[i]->GetModulus(); roots[i] = paramsPartQ->GetParams()[i]->GetRootOfUnity(); } - - auto params = DCRTPoly::Params(paramsPartQ->GetCyclotomicOrder(), moduli, roots); - - partsCt[part] = DCRTPoly(std::make_shared(params), Format::EVALUATION, true); + auto&& params = std::make_shared(paramsPartQ->GetCyclotomicOrder(), moduli, roots); + partsCt = DCRTPoly(params, Format::EVALUATION, true); } else { - partsCt[part] = DCRTPoly(cryptoParams->GetParamsPartQ(part), Format::EVALUATION, true); + partsCt = DCRTPoly(cryptoParams->GetParamsPartQ(part), Format::EVALUATION, true); } - usint sizePartQl = 
partsCt[part].GetNumOfElements(); - usint startPartIdx = alpha * part; - for (uint32_t i = 0, idx = startPartIdx; i < sizePartQl; i++, idx++) { - partsCt[part].SetElementAtIndex(i, c.GetElementAtIndex(idx)); - } + const uint32_t sizePartQl = partsCt.GetNumOfElements(); + const uint32_t startPartIdx = alpha * part; + for (uint32_t i = 0, idx = startPartIdx; i < sizePartQl; ++i, ++idx) + partsCt.SetElementAtIndex(i, c.GetElementAtIndex(idx)); + + partsCt.SetFormat(Format::COEFFICIENT); + auto partsCtCompl = partsCt.ApproxSwitchCRTBasis(cryptoParams->GetParamsPartQ(part), + cryptoParams->GetParamsComplPartQ(sizeQl - 1, part), + cryptoParams->GetPartQlHatInvModq(part, sizePartQl - 1), + cryptoParams->GetPartQlHatInvModqPrecon(part, sizePartQl - 1), + cryptoParams->GetPartQlHatModp(sizeQl - 1, part), + cryptoParams->GetmodComplPartqBarrettMu(sizeQl - 1, part)); + partsCtCompl.SetFormat(Format::EVALUATION); + + (*result)[part] = DCRTPoly(paramsQlP, Format::EVALUATION, true); + + const uint32_t endPartIdx = startPartIdx + sizePartQl; + for (uint32_t i = 0; i < startPartIdx; ++i) + (*result)[part].SetElementAtIndex(i, std::move(partsCtCompl.GetElementAtIndex(i))); + for (uint32_t i = startPartIdx; i < endPartIdx; ++i) + (*result)[part].SetElementAtIndex(i, c.GetElementAtIndex(i)); + for (uint32_t i = endPartIdx; i < sizeQlP; ++i) + (*result)[part].SetElementAtIndex(i, std::move(partsCtCompl.GetElementAtIndex(i - sizePartQl))); } - - std::vector partsCtCompl(numPartQl); - std::vector partsCtExt(numPartQl); - - for (uint32_t part = 0; part < numPartQl; part++) { - auto partCtClone = partsCt[part].Clone(); - partCtClone.SetFormat(Format::COEFFICIENT); - - uint32_t sizePartQl = partsCt[part].GetNumOfElements(); - partsCtCompl[part] = partCtClone.ApproxSwitchCRTBasis( - cryptoParams->GetParamsPartQ(part), cryptoParams->GetParamsComplPartQ(sizeQl - 1, part), - cryptoParams->GetPartQlHatInvModq(part, sizePartQl - 1), - cryptoParams->GetPartQlHatInvModqPrecon(part, sizePartQl - 1), - cryptoParams->GetPartQlHatModp(sizeQl - 1, part), - cryptoParams->GetmodComplPartqBarrettMu(sizeQl - 1, part)); - - partsCtCompl[part].SetFormat(Format::EVALUATION); - - partsCtExt[part] = DCRTPoly(paramsQlP, Format::EVALUATION, true); - - usint startPartIdx = alpha * part; - usint endPartIdx = startPartIdx + sizePartQl; - for (usint i = 0; i < startPartIdx; i++) { - partsCtExt[part].SetElementAtIndex(i, partsCtCompl[part].GetElementAtIndex(i)); - } - for (usint i = startPartIdx, idx = 0; i < endPartIdx; i++, idx++) { - partsCtExt[part].SetElementAtIndex(i, partsCt[part].GetElementAtIndex(idx)); - } - for (usint i = endPartIdx; i < sizeQlP; ++i) { - partsCtExt[part].SetElementAtIndex(i, partsCtCompl[part].GetElementAtIndex(i - sizePartQl)); - } - } - - return std::make_shared>(std::move(partsCtExt)); + return result; } std::shared_ptr> KeySwitchHYBRID::EvalFastKeySwitchCore( @@ -416,67 +382,55 @@ std::shared_ptr> KeySwitchHYBRID::EvalFastKeySwitchCore( const std::shared_ptr paramsQl) const { const auto cryptoParams = std::dynamic_pointer_cast(evalKey->GetCryptoParameters()); - std::shared_ptr> cTilda = EvalFastKeySwitchCoreExt(digits, evalKey, paramsQl); - - PlaintextModulus t = (cryptoParams->GetNoiseScale() == 1) ? 0 : cryptoParams->GetPlaintextModulus(); + const PlaintextModulus t = (cryptoParams->GetNoiseScale() == 1) ? 
0 : cryptoParams->GetPlaintextModulus(); - DCRTPoly ct0 = (*cTilda)[0].ApproxModDown(paramsQl, cryptoParams->GetParamsP(), cryptoParams->GetPInvModq(), + auto result = EvalFastKeySwitchCoreExt(digits, evalKey, paramsQl); + (*result)[0] = (*result)[0].ApproxModDown(paramsQl, cryptoParams->GetParamsP(), cryptoParams->GetPInvModq(), cryptoParams->GetPInvModqPrecon(), cryptoParams->GetPHatInvModp(), cryptoParams->GetPHatInvModpPrecon(), cryptoParams->GetPHatModq(), cryptoParams->GetModqBarrettMu(), cryptoParams->GettInvModp(), cryptoParams->GettInvModpPrecon(), t, cryptoParams->GettModqPrecon()); - - DCRTPoly ct1 = (*cTilda)[1].ApproxModDown(paramsQl, cryptoParams->GetParamsP(), cryptoParams->GetPInvModq(), + (*result)[1] = (*result)[1].ApproxModDown(paramsQl, cryptoParams->GetParamsP(), cryptoParams->GetPInvModq(), cryptoParams->GetPInvModqPrecon(), cryptoParams->GetPHatInvModp(), cryptoParams->GetPHatInvModpPrecon(), cryptoParams->GetPHatModq(), cryptoParams->GetModqBarrettMu(), cryptoParams->GettInvModp(), cryptoParams->GettInvModpPrecon(), t, cryptoParams->GettModqPrecon()); - - return std::make_shared>(std::initializer_list{std::move(ct0), std::move(ct1)}); + return result; } std::shared_ptr> KeySwitchHYBRID::EvalFastKeySwitchCoreExt( const std::shared_ptr> digits, const EvalKey evalKey, const std::shared_ptr paramsQl) const { - const auto cryptoParams = std::dynamic_pointer_cast(evalKey->GetCryptoParameters()); - const std::vector& bv = evalKey->GetBVector(); - const std::vector& av = evalKey->GetAVector(); - - const std::shared_ptr paramsP = cryptoParams->GetParamsP(); - const std::shared_ptr paramsQlP = (*digits)[0].GetParams(); - - size_t sizeQl = paramsQl->GetParams().size(); - size_t sizeQlP = paramsQlP->GetParams().size(); - size_t sizeQ = cryptoParams->GetElementParams()->GetParams().size(); - - DCRTPoly cTilda0(paramsQlP, Format::EVALUATION, true); - DCRTPoly cTilda1(paramsQlP, Format::EVALUATION, true); - - for (uint32_t j = 0; j < digits->size(); j++) { - const DCRTPoly& cj = (*digits)[j]; - const DCRTPoly& bj = bv[j]; - const DCRTPoly& aj = av[j]; - - for (usint i = 0; i < sizeQl; i++) { - const auto& cji = cj.GetElementAtIndex(i); - const auto& aji = aj.GetElementAtIndex(i); - const auto& bji = bj.GetElementAtIndex(i); - - cTilda0.SetElementAtIndex(i, cTilda0.GetElementAtIndex(i) + cji * bji); - cTilda1.SetElementAtIndex(i, cTilda1.GetElementAtIndex(i) + cji * aji); - } - for (usint i = sizeQl, idx = sizeQ; i < sizeQlP; i++, idx++) { - const auto& cji = cj.GetElementAtIndex(i); - const auto& aji = aj.GetElementAtIndex(idx); - const auto& bji = bj.GetElementAtIndex(idx); - - cTilda0.SetElementAtIndex(i, cTilda0.GetElementAtIndex(i) + cji * bji); - cTilda1.SetElementAtIndex(i, cTilda1.GetElementAtIndex(i) + cji * aji); + const auto paramsQlP = (*digits)[0].GetParams(); + const uint32_t sizeQlP = paramsQlP->GetParams().size(); + + const uint32_t limit = digits->size(); + const uint32_t sizeQl = paramsQl->GetParams().size(); + auto&& cryptoParams = std::dynamic_pointer_cast(evalKey->GetCryptoParameters()); + const uint32_t delta = cryptoParams->GetElementParams()->GetParams().size() - sizeQl; + + const auto& av = evalKey->GetAVector(); + const auto& bv = evalKey->GetBVector(); + + auto result = std::make_shared>(); + result->reserve(2); + result->emplace_back(paramsQlP, Format::EVALUATION, true); + result->emplace_back(paramsQlP, Format::EVALUATION, true); + auto& elements = (*result); + + for (uint32_t j = 0; j < limit; ++j) { +#pragma omp parallel for 
num_threads(OpenFHEParallelControls.GetThreadLimit(sizeQlP)) + for (uint32_t i = 0; i < sizeQlP; ++i) { + const auto idx = (i >= sizeQl) ? i + delta : i; + const auto& cji = (*digits)[j].GetElementAtIndex(i); + const auto& bji = bv[j].GetElementAtIndex(idx); + const auto& aji = av[j].GetElementAtIndex(idx); + elements[0].SetElementAtIndex(i, elements[0].GetElementAtIndex(i) + cji * bji); + elements[1].SetElementAtIndex(i, elements[1].GetElementAtIndex(i) + cji * aji); } } - return std::make_shared>( - std::initializer_list{std::move(cTilda0), std::move(cTilda1)}); + return result; } } // namespace lbcrypto diff --git a/src/pke/lib/scheme/ckksrns/ckksrns-advancedshe.cpp b/src/pke/lib/scheme/ckksrns/ckksrns-advancedshe.cpp index d9b59abd6..decd7c7aa 100644 --- a/src/pke/lib/scheme/ckksrns/ckksrns-advancedshe.cpp +++ b/src/pke/lib/scheme/ckksrns/ckksrns-advancedshe.cpp @@ -33,8 +33,6 @@ CKKS implementation. See https://eprint.iacr.org/2020/1118 for details. */ -#define PROFILE - #include "cryptocontext.h" #include "scheme/ckksrns/ckksrns-cryptoparameters.h" #include "scheme/ckksrns/ckksrns-advancedshe.h" @@ -48,38 +46,36 @@ namespace lbcrypto { Ciphertext AdvancedSHECKKSRNS::EvalMultMany(const std::vector>& ciphertextVec, const std::vector>& evalKeys) const { - const size_t inSize = ciphertextVec.size(); + const uint32_t inSize = ciphertextVec.size(); if (inSize == 0) OPENFHE_THROW("Input ciphertext vector is empty."); if (inSize == 1) - return ciphertextVec[0]; + return ciphertextVec[0]->Clone(); - const size_t lim = inSize * 2 - 2; - std::vector> ciphertextMultVec; - ciphertextMultVec.resize(inSize - 1); + const uint32_t lim = inSize * 2 - 2; + std::vector> ciphertextMultVec(inSize - 1); auto algo = ciphertextVec[0]->GetCryptoContext()->GetScheme(); const auto cryptoParams = std::dynamic_pointer_cast(ciphertextVec[0]->GetCryptoParameters()); uint32_t levelsToDrop = cryptoParams->GetCompositeDegree(); - size_t ctrIndex = 0; - size_t i = 0; + uint32_t i = 0, j = 0; for (; i < (inSize - 1); i += 2) { - ciphertextMultVec[ctrIndex] = algo->EvalMultAndRelinearize(ciphertextVec[i], ciphertextVec[(i + 1)], evalKeys); - algo->ModReduceInPlace(ciphertextMultVec[ctrIndex++], levelsToDrop); + ciphertextMultVec[j] = algo->EvalMultAndRelinearize(ciphertextVec[i], ciphertextVec[i + 1], evalKeys); + algo->ModReduceInPlace(ciphertextMultVec[j++], levelsToDrop); } if (i < inSize) { - ciphertextMultVec[ctrIndex] = + ciphertextMultVec[j] = algo->EvalMultAndRelinearize(ciphertextVec[i], ciphertextMultVec[i + 1 - inSize], evalKeys); - algo->ModReduceInPlace(ciphertextMultVec[ctrIndex++], levelsToDrop); + algo->ModReduceInPlace(ciphertextMultVec[j++], levelsToDrop); i += 2; } for (; i < lim; i += 2) { - ciphertextMultVec[ctrIndex] = + ciphertextMultVec[j] = algo->EvalMultAndRelinearize(ciphertextMultVec[i - inSize], ciphertextMultVec[i + 1 - inSize], evalKeys); - algo->ModReduceInPlace(ciphertextMultVec[ctrIndex++], levelsToDrop); + algo->ModReduceInPlace(ciphertextMultVec[j++], levelsToDrop); } return ciphertextMultVec.back(); @@ -90,27 +86,30 @@ Ciphertext AdvancedSHECKKSRNS::EvalMultMany(const std::vector -static inline Ciphertext internalEvalLinearWSum(std::vector>& ciphertexts, - const std::vector& constants) { - std::vector> cts(ciphertexts.size()); - for (uint32_t i = 0; i < ciphertexts.size(); i++) +Ciphertext internalEvalLinearWSum(const std::vector>& ciphertexts, + const std::vector& constants) { + const uint32_t limit = ciphertexts.size(); + std::vector> cts(limit); + for (uint32_t i = 0; i < limit; 
++i) cts[i] = ciphertexts[i]->Clone(); return internalEvalLinearWSumMutable(cts, constants); } template -static inline Ciphertext internalEvalLinearWSumMutable(std::vector>& ciphertexts, - const std::vector& constants) { +Ciphertext internalEvalLinearWSumMutable(std::vector>& ciphertexts, + const std::vector& constants) { const auto cryptoParams = std::dynamic_pointer_cast(ciphertexts[0]->GetCryptoParameters()); auto cc = ciphertexts[0]->GetCryptoContext(); + const uint32_t limit = ciphertexts.size(); + if (cryptoParams->GetScalingTechnique() != FIXEDMANUAL) { // Check to see if input ciphertexts are of same level // and adjust if needed to the max level among them uint32_t maxLevel = ciphertexts[0]->GetLevel(); uint32_t maxIdx = 0; - for (uint32_t i = 1; i < ciphertexts.size(); ++i) { + for (uint32_t i = 1; i < limit; ++i) { if ((ciphertexts[i]->GetLevel() > maxLevel) || ((ciphertexts[i]->GetLevel() == maxLevel) && (ciphertexts[i]->GetNoiseScaleDeg() == 2))) { maxLevel = ciphertexts[i]->GetLevel(); @@ -121,28 +120,76 @@ static inline Ciphertext internalEvalLinearWSumMutable(std::vectorGetScheme(); for (uint32_t i = 0; i < maxIdx; ++i) algo->AdjustLevelsAndDepthInPlace(ciphertexts[i], ciphertexts[maxIdx]); - for (uint32_t i = maxIdx + 1; i < ciphertexts.size(); ++i) + for (uint32_t i = maxIdx + 1; i < limit; ++i) algo->AdjustLevelsAndDepthInPlace(ciphertexts[i], ciphertexts[maxIdx]); uint32_t compositeDegree = cryptoParams->GetCompositeDegree(); if (ciphertexts[maxIdx]->GetNoiseScaleDeg() == 2) { - for (uint32_t i = 0; i < ciphertexts.size(); ++i) { + for (uint32_t i = 0; i < limit; ++i) algo->ModReduceInternalInPlace(ciphertexts[i], compositeDegree); - } } } - Ciphertext weightedSum = cc->EvalMult(ciphertexts[0], constants[0]); - - Ciphertext tmp; - for (uint32_t i = 1; i < ciphertexts.size(); i++) { - tmp = cc->EvalMult(ciphertexts[i], constants[i]); - cc->EvalAddInPlace(weightedSum, tmp); + cc->EvalMultInPlace(ciphertexts[0], constants[0]); + for (uint32_t i = 1; i < limit; ++i) { + cc->EvalMultInPlace(ciphertexts[i], constants[i]); + cc->EvalAddInPlaceNoCheck(ciphertexts[0], ciphertexts[i]); } + cc->ModReduceInPlace(ciphertexts[0]); + return ciphertexts[0]; +} + +template +Ciphertext EvalPartialLinearWSum(const std::vector>& ciphertexts, + const std::vector& constants, uint32_t limit = 0) { + if (0 == limit) + limit = ciphertexts.size(); + + const auto cryptoParams = std::dynamic_pointer_cast(ciphertexts[0]->GetCryptoParameters()); + + auto cc = ciphertexts[0]->GetCryptoContext(); - cc->ModReduceInPlace(weightedSum); + std::vector> cts(limit); + if (cryptoParams->GetScalingTechnique() != FIXEDMANUAL) { + cts[0] = ciphertexts[0]->Clone(); + // Check to see if input ciphertexts are of same level + // and adjust if needed to the max level among them + uint32_t maxLevel = cts[0]->GetLevel(); + uint32_t maxIdx = 0; + for (uint32_t i = 1; i < limit; ++i) { + cts[i] = ciphertexts[i]->Clone(); + if ((cts[i]->GetLevel() > maxLevel) || + ((cts[i]->GetLevel() == maxLevel) && (cts[i]->GetNoiseScaleDeg() == 2))) { + maxLevel = cts[i]->GetLevel(); + maxIdx = i; + } + } - return weightedSum; + auto algo = cc->GetScheme(); + auto& ctm = cts[maxIdx]; + for (uint32_t i = 0; i < maxIdx; ++i) + algo->AdjustLevelsAndDepthInPlace(cts[i], ctm); + for (uint32_t i = maxIdx + 1; i < limit; ++i) + algo->AdjustLevelsAndDepthInPlace(cts[i], ctm); + + uint32_t compositeDegree = cryptoParams->GetCompositeDegree(); + if (ctm->GetNoiseScaleDeg() == 2) { + for (uint32_t i = 0; i < limit; ++i) + 
algo->ModReduceInternalInPlace(cts[i], compositeDegree); + } + } + else { + for (uint32_t i = 0; i < limit; ++i) + cts[i] = ciphertexts[i]->Clone(); + } + + cc->EvalMultInPlace(cts[0], constants[1]); + for (uint32_t i = 1; i < limit; ++i) { + cc->EvalMultInPlace(cts[i], constants[i + 1]); + cc->EvalAddInPlaceNoCheck(cts[0], cts[i]); + } + cc->ModReduceInPlace(cts[0]); + return cts[0]; } Ciphertext AdvancedSHECKKSRNS::EvalLinearWSum(std::vector>& ciphertexts, @@ -178,57 +225,49 @@ Ciphertext AdvancedSHECKKSRNS::EvalLinearWSumMutable( template std::shared_ptr> internalEvalPowersLinear(ConstCiphertext& x, const std::vector& coefficients) { - uint32_t k = coefficients.size() - 1; - std::vector indices(k); - // set the indices for the powers of x that need to be computed to 1 + const uint32_t k = coefficients.size() - 1; + std::vector indices(k); + + // find indices for powers of x that need to be computed for (uint32_t i = k; i > 0; --i) { - if (!(i & (i - 1))) { - // if i is a power of 2 - indices[i - 1] = 1; + if (0 == (i & (i - 1))) { // if i is a power of 2 + indices[i - 1] = true; } - else { - // non-power of 2 + else { // non-power of 2 if (IsNotEqualZero(coefficients[i])) { - indices[i - 1] = 1; - int64_t powerOf2 = int64_t(1) << static_cast(std::floor(std::log2(i))); - int64_t rem = i % powerOf2; - if (indices[rem - 1] == 0) - indices[rem - 1] = 1; + uint32_t rem = i; // while rem is not a power of 2 // set indices required to compute rem to 1 - while ((rem & (rem - 1))) { - powerOf2 = 1 << static_cast(std::floor(std::log2(rem))); - rem = rem % powerOf2; - if (indices[rem - 1] == 0) - indices[rem - 1] = 1; + while (0 != (rem & (rem - 1))) { + indices[rem - 1] = true; + rem &= (uint64_t(1) << (GetMSB(rem) - 1)) - 1; } } } } std::vector> powers(k); - powers[0] = x->Clone(); - auto cc = x->GetCryptoContext(); + powers[0] = x->Clone(); + auto cc = x->GetCryptoContext(); + auto cryptoParams = std::dynamic_pointer_cast(x->GetCryptoParameters()); uint32_t compositeDegree = cryptoParams->GetCompositeDegree(); // computes all powers up to k for x for (uint32_t i = 2; i <= k; ++i) { - if (!(i & (i - 1))) { - // if i is a power of two - powers[i - 1] = cc->EvalMult(powers[i / 2 - 1], powers[i / 2 - 1]); + if (0 == (i & (i - 1))) { + powers[i - 1] = cc->EvalSquare(powers[i / 2 - 1]); cc->ModReduceInPlace(powers[i - 1]); } else { - if (indices[i - 1] == 1) { - // non-power of 2 - int64_t powerOf2 = int64_t(1) << static_cast(std::floor(std::log2(i))); - int64_t rem = i % powerOf2; - uint32_t levelDiff = powers[powerOf2 - 1]->GetLevel() - powers[rem - 1]->GetLevel(); - cc->LevelReduceInPlace(powers[rem - 1], nullptr, levelDiff / compositeDegree); - - powers[i - 1] = cc->EvalMult(powers[powerOf2 - 1], powers[rem - 1]); + if (indices[i - 1]) { + uint64_t p = (uint64_t(1) << (GetMSB(i) - 1)) - 1; + uint64_t r = (i & p) - 1; + uint32_t diff = powers[p]->GetLevel() - powers[r]->GetLevel(); + cc->LevelReduceInPlace(powers[r], nullptr, diff / compositeDegree); + + powers[i - 1] = cc->EvalMult(powers[p], powers[r]); cc->ModReduceInPlace(powers[i - 1]); } } @@ -236,26 +275,22 @@ std::shared_ptr> internalEvalPowersLinear(ConstCiphertext // brings all powers of x to the same level for (uint32_t i = 1; i < k; ++i) { - if (indices[i - 1] == 1) { - uint32_t levelDiff = powers[k - 1]->GetLevel() - powers[i - 1]->GetLevel(); - cc->LevelReduceInPlace(powers[i - 1], nullptr, levelDiff / compositeDegree); + if (indices[i - 1]) { + uint32_t diff = powers[k - 1]->GetLevel() - powers[i - 1]->GetLevel(); + 
cc->LevelReduceInPlace(powers[i - 1], nullptr, diff / compositeDegree); } } - return std::make_shared>(powers); + return std::make_shared>(std::move(powers)); } -template -std::shared_ptr> internalEvalPowersPS(ConstCiphertext& x, - const std::vector& coefficients) { - auto n = Degree(coefficients); - auto degs = ComputeDegreesPS(n); +std::shared_ptr> internalEvalPowersPS(ConstCiphertext& x, uint32_t degree) { + auto degs = ComputeDegreesPS(degree); uint32_t k = degs[0]; uint32_t m = degs[1]; - std::vector> powers; - powers.reserve(k); - powers.push_back(x->Clone()); + std::vector> powers(k); + powers[0] = x->Clone(); auto cc = x->GetCryptoContext(); uint32_t compositeDegree = @@ -264,74 +299,71 @@ std::shared_ptr> internalEvalPowersPS(ConstCiphertextEvalSquare(powers[(powerOf2 >> 1) - 1])); + powers[i - 1] = cc->EvalSquare(powers[(powerOf2 >> 1) - 1]); } else { - uint32_t levelDiff = powers[powerOf2 - 1]->GetLevel() - powers[rem - 1]->GetLevel(); - cc->LevelReduceInPlace(powers[rem - 1], nullptr, levelDiff / compositeDegree); - powers.push_back(cc->EvalMult(powers[powerOf2 - 1], powers[rem - 1])); + uint32_t diff = powers[powerOf2 - 1]->GetLevel() - powers[rem - 1]->GetLevel(); + cc->LevelReduceInPlace(powers[rem - 1], nullptr, diff / compositeDegree); + powers[i - 1] = cc->EvalMult(powers[powerOf2 - 1], powers[rem - 1]); } - cc->ModReduceInPlace(powers[powerOf2 - 1 + rem]); + if (++rem == powerOf2) { powerOf2 <<= 1; rem = 0; } + cc->ModReduceInPlace(powers[i - 1]); } const auto cryptoParams = std::dynamic_pointer_cast(powers[k - 1]->GetCryptoParameters()); - auto algo = cc->GetScheme(); - if (cryptoParams->GetScalingTechnique() == FIXEDMANUAL) { // brings all powers of x to the same level - for (size_t i = 1; i < k; i++) { - uint32_t levelDiff = powers[k - 1]->GetLevel() - powers[i - 1]->GetLevel(); - cc->LevelReduceInPlace(powers[i - 1], nullptr, levelDiff); - } + uint32_t levelk = powers[k - 1]->GetLevel(); + for (uint32_t i = 1; i < k; ++i) + cc->LevelReduceInPlace(powers[i - 1], nullptr, levelk - powers[i - 1]->GetLevel()); } else { - for (size_t i = 1; i < k; i++) { - algo->AdjustLevelsAndDepthInPlace(powers[i - 1], powers[k - 1]); - } + for (uint32_t i = 1; i < k; ++i) + cc->GetScheme()->AdjustLevelsAndDepthInPlace(powers[i - 1], powers[k - 1]); } // computes powers of form k*2^i for x and the product of the powers in power2, that yield x^{k(2*m - 1)} - std::vector> powers2; - powers2.reserve(m); - powers2.push_back(powers.back()->Clone()); - auto power2km1 = powers.back()->Clone(); + std::vector> powers2(m); + powers2[0] = powers.back(); + + auto power2km1 = powers.back(); - for (uint32_t i = 1; i < m; i++) { - powers2.push_back(cc->EvalSquare(powers2[i - 1])); + for (uint32_t i = 1; i < m; ++i) { + powers2[i] = cc->EvalSquare(powers2[i - 1]); cc->ModReduceInPlace(powers2[i]); - power2km1 = cc->EvalMult(power2km1, powers2.back()); + power2km1 = cc->EvalMult(powers2[i], power2km1); cc->ModReduceInPlace(power2km1); } - return std::make_shared>(powers, powers2, power2km1, k, m); + return std::make_shared>(std::move(powers), std::move(powers2), std::move(power2km1), k, m); } std::shared_ptr> AdvancedSHECKKSRNS::EvalPowers(ConstCiphertext& x, const std::vector& coefficients) const { - return (Degree(coefficients) < 5) ? internalEvalPowersLinear(x, coefficients) : - internalEvalPowersPS(x, coefficients); + uint32_t d = Degree(coefficients); + return (d < 5) ? 
internalEvalPowersLinear(x, coefficients) : internalEvalPowersPS(x, d); } std::shared_ptr> AdvancedSHECKKSRNS::EvalPowers(ConstCiphertext& x, const std::vector& coefficients) const { - return (Degree(coefficients) < 5) ? internalEvalPowersLinear(x, coefficients) : - internalEvalPowersPS(x, coefficients); + uint32_t d = Degree(coefficients); + return (d < 5) ? internalEvalPowersLinear(x, coefficients) : internalEvalPowersPS(x, d); } std::shared_ptr> AdvancedSHECKKSRNS::EvalPowers( ConstCiphertext& x, const std::vector>& coefficients) const { - return (Degree(coefficients) < 5) ? internalEvalPowersLinear(x, coefficients) : - internalEvalPowersPS(x, coefficients); + uint32_t d = Degree(coefficients); + return (d < 5) ? internalEvalPowersLinear(x, coefficients) : internalEvalPowersPS(x, d); } template -static inline Ciphertext internalEvalPolyLinearWithPrecomp(std::vector>& powers, - const std::vector& coefficients) { - uint32_t k = coefficients.size() - 1; +Ciphertext internalEvalPolyLinearWithPrecomp(std::vector>& powers, + const std::vector& coefficients) { + const uint32_t k = coefficients.size() - 1; if (k <= 1) OPENFHE_THROW("The coefficients vector should contain at least 2 elements"); @@ -361,315 +393,119 @@ static inline Ciphertext internalEvalPolyLinearWithPrecomp(std::vector } template -static Ciphertext InnerEvalPolyPS(ConstCiphertext& x, - const std::vector& coefficients, uint32_t k, uint32_t m, - std::vector>& powers, - std::vector>& powers2) { - auto cc = x->GetCryptoContext(); - +Ciphertext InnerEvalPolyPS(ConstCiphertext& x, const std::vector& coefficients, + uint32_t k, uint32_t m, const std::vector>& powers, + const std::vector>& powers2) { // Compute k*2^m because we use it often uint32_t k2m2k = k * (1 << (m - 1)) - k; // Divide coefficients by x^{k*2^{m-1}} - std::vector xkm(static_cast(k2m2k + k) + 1, 0.0); + std::vector xkm(k2m2k + k + 1); xkm.back() = 1; - auto divqr = LongDivisionPoly(coefficients, xkm); // Subtract x^{k(2^{m-1} - 1)} from r - auto r2 = divqr->r; - if (static_cast(k2m2k - Degree(divqr->r)) <= 0) { - r2[static_cast(k2m2k)] -= 1; - r2.resize(Degree(r2) + 1); + auto& r2 = divqr->r; + if (auto n = Degree(r2); static_cast(k2m2k - n) <= 0) { + r2.resize(n + 1); + r2[k2m2k] -= 1; } else { - r2.resize(static_cast(k2m2k + 1), 0.0); + r2.resize(k2m2k + 1); r2.back() = -1; } - // Divide r2 by q auto divcs = LongDivisionPoly(r2, divqr->q); + auto cc = x->GetCryptoContext(); - // Add x^{k(2^{m-1} - 1)} to s - auto s2 = divcs->r; - s2.resize(static_cast(k2m2k + 1), 0.0); - s2.back() = 1; + Ciphertext cu, qu, su; - Ciphertext cu; - uint32_t dc = Degree(divcs->q); - bool flag_c = false; +#pragma omp task shared(qu) + { + // Evaluate q and s2 at u. + // If their degrees are larger than k, then recursively apply the Paterson-Stockmeyer algorithm. 
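// --------------------------------------------------------------------------
// Editor's note: illustrative plaintext sketch, not part of this patch.
// The surrounding routine evaluates a degree-n polynomial with on the order of
// sqrt(n) ciphertext multiplications: powers x^1..x^k act as "baby steps",
// powers x^k, x^{2k}, x^{4k}, ... as "giant steps", and the coefficient vector
// is split by long division so that only low-degree pieces are combined with
// the precomputed powers. The standalone double-valued analogue below uses the
// simpler non-recursive grouping f(x) = sum_j ( sum_{i<k} c_{jk+i} x^i ) * x^{jk}
// to show the baby-step/giant-step idea; it is a sketch of the principle, not
// the exact recursion used by InnerEvalPolyPS.
#include <cmath>
#include <cstdio>
#include <vector>

double EvalPolyBSGS(const std::vector<double>& c, double x) {
    const size_t n = c.size();
    const size_t k = static_cast<size_t>(std::ceil(std::sqrt(static_cast<double>(n))));
    std::vector<double> pow(k + 1, 1.0);           // baby steps: x^0 .. x^k
    for (size_t i = 1; i <= k; ++i)
        pow[i] = pow[i - 1] * x;
    double result = 0.0;
    double giant  = 1.0;                           // giant step: (x^k)^j
    for (size_t j = 0; j * k < n; ++j) {
        double block = 0.0;                        // inner sum of at most k terms
        for (size_t i = 0; i < k && j * k + i < n; ++i)
            block += c[j * k + i] * pow[i];
        result += block * giant;
        giant *= pow[k];
    }
    return result;
}

int main() {
    // 1 + 2x + 3x^2 + 4x^3 + 5x^4 at x = 0.5 equals 3.5625
    std::printf("%f\n", EvalPolyBSGS({1, 2, 3, 4, 5}, 0.5));
    return 0;
}
// --------------------------------------------------------------------------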
- if (dc >= 1) { - if (dc == 1) { - if (IsNotEqualOne(divcs->q[1])) { - cu = cc->EvalMult(powers.front(), divcs->q[1]); - cc->ModReduceInPlace(cu); - } - else { - cu = powers.front()->Clone(); - } + if (Degree(divqr->q) > k) { + qu = InnerEvalPolyPS(x, divqr->q, k, m - 1, powers, powers2); } else { - std::vector> ctxs(dc); - std::vector weights(dc); - - for (uint32_t i = 0; i < dc; i++) { - ctxs[i] = powers[i]; - weights[i] = divcs->q[i + 1]; - } - - cu = cc->EvalLinearWSumMutable(ctxs, weights); + qu = cc->EvalAdd(powers[k - 1], divqr->q.front()); + divqr->q.resize(k); + if (uint32_t n = Degree(divqr->q); n > 0) + cc->EvalAddInPlace(qu, EvalPartialLinearWSum(powers, divqr->q, n)); } - - // adds the free term (at x^0) - cc->EvalAddInPlace(cu, divcs->q.front()); - flag_c = true; } - // Evaluate q and s2 at u. If their degrees are larger than k, then recursively apply the Paterson-Stockmeyer algorithm. - Ciphertext qu; - - if (Degree(divqr->q) > k) { - qu = InnerEvalPolyPS(x, divqr->q, k, m - 1, powers, powers2); - } - else { - // dq = k from construction - // perform scalar multiplication for all other terms and sum them up if there are non-zero coefficients - auto qcopy = divqr->q; - qcopy.resize(k); - if (Degree(qcopy) > 0) { - std::vector> ctxs(Degree(qcopy)); - std::vector weights(Degree(qcopy)); - - for (uint32_t i = 0; i < Degree(qcopy); i++) { - ctxs[i] = powers[i]; - weights[i] = divqr->q[i + 1]; - } +#pragma omp task shared(su) + { + // Add x^{k(2^{m-1} - 1)} to s + auto& s2 = divcs->r; + s2.resize(k2m2k + 1); + s2.back() = 1; - qu = cc->EvalLinearWSumMutable(ctxs, weights); - // the highest order term will always be 1 because q is monic - cc->EvalAddInPlace(qu, powers[k - 1]); + if (Degree(s2) > k) { + su = InnerEvalPolyPS(x, s2, k, m - 1, powers, powers2); } else { - qu = powers[k - 1]->Clone(); + su = cc->EvalAdd(powers[k - 1], s2.front()); + s2.resize(k); + if (uint32_t n = Degree(s2); n > 0) + cc->EvalAddInPlace(su, EvalPartialLinearWSum(powers, s2, n)); } - // adds the free term (at x^0) - cc->EvalAddInPlace(qu, divqr->q.front()); } - uint32_t ds = Degree(s2); - Ciphertext su; - - if (std::equal(s2.begin(), s2.end(), divqr->q.begin())) { - su = qu->Clone(); + if (uint32_t n = Degree(divcs->q); n == 0) { + cu = cc->EvalAdd(powers2[m - 1], divcs->q.front()); } - else { - if (ds > k) { - su = InnerEvalPolyPS(x, s2, k, m - 1, powers, powers2); + else if (n == 1) { + if (IsNotEqualOne(divcs->q[1])) { + cu = cc->EvalMult(powers.front(), divcs->q[1]); + cc->ModReduceInPlace(cu); + cc->EvalAddInPlace(cu, powers2[m - 1]); } else { - // ds = k from construction - // perform scalar multiplication for all other terms and sum them up if there are non-zero coefficients - auto scopy = s2; - scopy.resize(k); - if (Degree(scopy) > 0) { - std::vector> ctxs(Degree(scopy)); - std::vector weights(Degree(scopy)); - - for (uint32_t i = 0; i < Degree(scopy); ++i) { - ctxs[i] = powers[i]; - weights[i] = s2[i + 1]; - } - - su = cc->EvalLinearWSumMutable(ctxs, weights); - // the highest order term will always be 1 because q is monic - cc->EvalAddInPlace(su, powers[k - 1]); - } - else { - su = powers[k - 1]->Clone(); - } - // adds the free term (at x^0) - cc->EvalAddInPlace(su, s2.front()); + cu = cc->EvalAdd(powers2[m - 1], powers.front()); } - } - - Ciphertext result; - - if (flag_c) { - result = cc->EvalAdd(powers2[m - 1], cu); + cc->EvalAddInPlace(cu, divcs->q.front()); } else { - result = cc->EvalAdd(powers2[m - 1], divcs->q.front()); + cu = cc->EvalAdd(powers2[m - 1], EvalPartialLinearWSum(powers, 
divcs->q, n)); + cc->EvalAddInPlace(cu, divcs->q.front()); } - result = cc->EvalMult(result, qu); +#pragma omp taskwait + + auto result = cc->EvalMult(cu, qu); cc->ModReduceInPlace(result); cc->EvalAddInPlace(result, su); - return result; } template -static inline Ciphertext internalEvalPolyPSWithPrecomp(std::shared_ptr> ctxtPowers, - const std::vector& coefficients) { - auto f2 = coefficients; - auto n = Degree(f2); - f2.resize(n + 1); - - auto powers = ctxtPowers->powersRe; - auto powers2 = ctxtPowers->powers2Re; - auto power2km1 = ctxtPowers->power2km1Re; - auto k = ctxtPowers->k; - auto m = ctxtPowers->m; +Ciphertext internalEvalPolyPSWithPrecomp(const std::shared_ptr>& ctxtPowers, + const std::vector& coefficients) { + auto& powers = ctxtPowers->powersRe; + auto& powers2 = ctxtPowers->powers2Re; + auto& power2km1 = ctxtPowers->power2km1Re; + auto k = ctxtPowers->k; + auto m = ctxtPowers->m; // Compute k*2^{m-1}-k because we use it a lot uint32_t k2m2k = k * (1 << (m - 1)) - k; - // Add x^{k(2^m - 1)} to the polynomial that has to be evaluated - // std::vector f2 = coefficients; - f2.resize(2 * k2m2k + k + 1, 0.0); + // Add T^{k(2^m - 1)}(y) to the polynomial that has to be evaluated + auto f2 = coefficients; + f2.resize(Degree(f2) + 1); + f2.resize(2 * k2m2k + k + 1); f2.back() = 1; - // Divide f2 by x^{k*2^{m-1}} - std::vector xkm(static_cast(k2m2k + k) + 1); - xkm.back() = 1; - auto divqr = LongDivisionPoly(f2, xkm); - - // Subtract x^{k(2^{m-1} - 1)} from r - auto r2 = divqr->r; - if (static_cast(k2m2k - Degree(divqr->r)) <= 0) { - r2[static_cast(k2m2k)] -= 1; - r2.resize(Degree(r2) + 1); - } - else { - r2.resize(static_cast(k2m2k + 1), 0.0); - r2.back() = -1; - } - - // Divide r2 by q - auto divcs = LongDivisionPoly(r2, divqr->q); - - // Add x^{k(2^{m-1} - 1)} to s - auto s2 = divcs->r; - s2.resize(static_cast(k2m2k + 1), 0.0); - s2.back() = 1; - - auto cc = powers[0]->GetCryptoContext(); - - // Evaluate c at u - Ciphertext cu; - uint32_t dc = Degree(divcs->q); - bool flag_c = false; - - if (dc >= 1) { - if (dc == 1) { - if (IsNotEqualOne(divcs->q[1])) { - cu = cc->EvalMult(powers.front(), divcs->q[1]); - // Do rescaling after scalar multiplication - cc->ModReduceInPlace(cu); - } - else { - cu = powers.front()->Clone(); - } - } - else { - std::vector> ctxs(dc); - std::vector weights(dc); - - for (uint32_t i = 0; i < dc; i++) { - ctxs[i] = powers[i]; - weights[i] = divcs->q[i + 1]; - } - - cu = cc->EvalLinearWSumMutable(ctxs, weights); - } - - // adds the free term (at x^0) - cc->EvalAddInPlace(cu, divcs->q.front()); - flag_c = true; - } - - // Evaluate q and s2 at u. If their degrees are larger than k, then recursively apply the Paterson-Stockmeyer algorithm. 
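// --------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch. The rewritten
// InnerEvalPolyPS above evaluates the two independent sub-polynomials qu and
// su in separate OpenMP tasks and joins them with "taskwait"; the caller
// (internalEvalPolyPSWithPrecomp) supplies the enclosing "parallel"/"single"
// region, with the thread count taken from OpenFHEParallelControls. A minimal
// standalone version of that pattern (compile with -fopenmp; the pragmas are
// simply ignored without it):
#include <cstdio>

static double WorkA() { return 1.0; }   // stands in for evaluating qu
static double WorkB() { return 2.0; }   // stands in for evaluating su

int main() {
    double a = 0.0, b = 0.0;
#pragma omp parallel
    {
#pragma omp single
        {
            // the single region ensures the tasks are created only once
#pragma omp task shared(a)
            a = WorkA();                 // may run on one thread
#pragma omp task shared(b)
            b = WorkB();                 // may run concurrently on another
#pragma omp taskwait                     // both results are ready past this point
            std::printf("%f\n", a + b);
        }
    }
    return 0;
}
// --------------------------------------------------------------------------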
- Ciphertext qu; - - if (Degree(divqr->q) > k) { - qu = InnerEvalPolyPS(powers[0], divqr->q, k, m - 1, powers, powers2); - } - else { - // dq = k from construction - // perform scalar multiplication for all other terms and sum them up if there are non-zero coefficients - auto qcopy = divqr->q; - qcopy.resize(k); - if (Degree(qcopy) > 0) { - std::vector> ctxs(Degree(qcopy)); - std::vector weights(Degree(qcopy)); - - for (uint32_t i = 0; i < Degree(qcopy); i++) { - ctxs[i] = powers[i]; - weights[i] = divqr->q[i + 1]; - } - - qu = cc->EvalLinearWSumMutable(ctxs, weights); - // the highest order term will always be 1 because q is monic - cc->EvalAddInPlace(qu, powers[k - 1]); - } - else { - qu = powers[k - 1]->Clone(); - } - // adds the free term (at x^0) - cc->EvalAddInPlace(qu, divqr->q.front()); - } - - uint32_t ds = Degree(s2); - Ciphertext su; - - if (std::equal(s2.begin(), s2.end(), divqr->q.begin())) { - su = qu->Clone(); - } - else { - if (ds > k) { - su = InnerEvalPolyPS(powers[0], s2, k, m - 1, powers, powers2); - } - else { - // ds = k from construction - // perform scalar multiplication for all other terms and sum them up if there are non-zero coefficients - auto scopy = s2; - scopy.resize(k); - if (Degree(scopy) > 0) { - std::vector> ctxs(Degree(scopy)); - std::vector weights(Degree(scopy)); - - for (uint32_t i = 0; i < Degree(scopy); i++) { - ctxs[i] = powers[i]; - weights[i] = s2[i + 1]; - } - - su = cc->EvalLinearWSumMutable(ctxs, weights); - // the highest order term will always be 1 because q is monic - cc->EvalAddInPlace(su, powers[k - 1]); - } - else { - su = powers[k - 1]->Clone(); - } - // adds the free term (at x^0) - cc->EvalAddInPlace(su, s2.front()); - } - } - Ciphertext result; - - if (flag_c) { - result = cc->EvalAdd(powers2[m - 1], cu); +#pragma omp parallel num_threads(OpenFHEParallelControls.GetThreadLimit(6 * m + 2)) + { +#pragma omp single + result = + powers[0]->GetCryptoContext()->EvalSub(InnerEvalPolyPS(powers[0], f2, k, m, powers, powers2), power2km1); } - else { - result = cc->EvalAdd(powers2[m - 1], divcs->q.front()); - } - - result = cc->EvalMult(result, qu); - cc->ModReduceInPlace(result); - cc->EvalAddInPlace(result, su); - cc->EvalSubInPlace(result, power2km1); - return result; } @@ -717,15 +553,15 @@ Ciphertext AdvancedSHECKKSRNS::EvalPolyLinear(ConstCiphertext AdvancedSHECKKSRNS::EvalPolyPS(ConstCiphertext& x, const std::vector& coeffs) const { - return internalEvalPolyPSWithPrecomp(internalEvalPowersPS(x, coeffs), coeffs); + return internalEvalPolyPSWithPrecomp(internalEvalPowersPS(x, Degree(coeffs)), coeffs); } Ciphertext AdvancedSHECKKSRNS::EvalPolyPS(ConstCiphertext& x, const std::vector& coeffs) const { - return internalEvalPolyPSWithPrecomp(internalEvalPowersPS(x, coeffs), coeffs); + return internalEvalPolyPSWithPrecomp(internalEvalPowersPS(x, Degree(coeffs)), coeffs); } Ciphertext AdvancedSHECKKSRNS::EvalPolyPS(ConstCiphertext& x, const std::vector>& coeffs) const { - return internalEvalPolyPSWithPrecomp(internalEvalPowersPS(x, coeffs), coeffs); + return internalEvalPolyPSWithPrecomp(internalEvalPowersPS(x, Degree(coeffs)), coeffs); } //------------------------------------------------------------------------------ @@ -736,93 +572,65 @@ template std::shared_ptr> internalEvalChebyPolysLinear(ConstCiphertext& x, const std::vector& coefficients, double a, double b) { - auto cc = x->GetCryptoContext(); - uint32_t k = coefficients.size() - 1; + const uint32_t k = coefficients.size() - 1; std::vector> T(k); + auto cc = x->GetCryptoContext(); + // 
computes linear transformation y = -1 + 2 (x-a)/(b-a) // consumes one level when a <> -1 && b <> 1 - if ((a - std::round(a) < 1e-10) && (b - std::round(b) < 1e-10) && (std::round(a) == -1.0) && - (std::round(b) == 1.0)) { + if (!IsNotEqualNegOne(a) && !IsNotEqualOne(b)) { T[0] = x->Clone(); } else { // linear transformation is needed double alpha = 2 / (b - a); - double beta = 2 * a / (b - a); + double beta = a * alpha; T[0] = cc->EvalMult(x, alpha); cc->ModReduceInPlace(T[0]); cc->EvalAddInPlace(T[0], -1.0 - beta); } - Ciphertext yReduced = T[0]->Clone(); - uint32_t compositeDegree = - std::dynamic_pointer_cast(x->GetCryptoParameters())->GetCompositeDegree(); - // Computes Chebyshev polynomials up to degree k // for y: T_1(y) = y, T_2(y), ... , T_k(y) // uses binary tree multiplication for (uint32_t i = 2; i <= k; ++i) { - // if i is a power of two - if (!(i & (i - 1))) { - // compute T_{2i}(y) = 2*T_i(y)^2 - 1 - auto square = cc->EvalSquare(T[i / 2 - 1]); - T[i - 1] = cc->EvalAdd(square, square); + if (i & 0x1) { // if i is odd + // compute T_{2i+1}(y) = 2*T_i(y)*T_{i+1}(y) - y + T[i - 1] = cc->EvalMult(T[i / 2 - 1], T[i / 2]); + cc->EvalAddInPlaceNoCheck(T[i - 1], T[i - 1]); cc->ModReduceInPlace(T[i - 1]); - cc->EvalAddInPlace(T[i - 1], -1.0); - // TODO: (Andrey) Do we need this? - if (i == 2) { - cc->LevelReduceInPlace(T[i / 2 - 1], nullptr); - cc->LevelReduceInPlace(yReduced, nullptr); - } - cc->LevelReduceInPlace(yReduced, nullptr); // depth log_2 i + 1 - - // i/2 will now be used only at a lower level - if (i / 2 > 1) { - cc->LevelReduceInPlace(T[i / 2 - 1], nullptr); - } - // TODO: (Andrey) until here. - // If we need it, we can also add it in EvalChebyshevSeriesPS + cc->EvalSubInPlace(T[i - 1], T[0]); } else { - // non-power of 2 - if (i % 2 == 1) { - // if i is odd - // compute T_{2i+1}(y) = 2*T_i(y)*T_{i+1}(y) - y - auto prod = cc->EvalMult(T[i / 2 - 1], T[i / 2]); - T[i - 1] = cc->EvalAdd(prod, prod); - cc->ModReduceInPlace(T[i - 1]); - cc->EvalSubInPlace(T[i - 1], yReduced); - } - else { - // i is even but not power of 2 - // compute T_{2i}(y) = 2*T_i(y)^2 - 1 - auto square = cc->EvalSquare(T[i / 2 - 1]); - T[i - 1] = cc->EvalAdd(square, square); - cc->ModReduceInPlace(T[i - 1]); - cc->EvalAddInPlace(T[i - 1], -1.0); - } + // compute T_{2i}(y) = 2*T_i(y)^2 - 1 + T[i - 1] = cc->EvalSquare(T[i / 2 - 1]); + cc->EvalAddInPlaceNoCheck(T[i - 1], T[i - 1]); + cc->ModReduceInPlace(T[i - 1]); + cc->EvalAddInPlace(T[i - 1], -1.0); } } - for (uint32_t i = 1; i < k; ++i) { - uint32_t levelDiff = T[k - 1]->GetLevel() - T[i - 1]->GetLevel(); - cc->LevelReduceInPlace(T[i - 1], nullptr, levelDiff / compositeDegree); - } - return std::make_shared>(T); + + uint32_t compositeDegree = + std::dynamic_pointer_cast(x->GetCryptoParameters())->GetCompositeDegree(); + for (uint32_t i = 1; i < k; ++i) + cc->LevelReduceInPlace(T[i - 1], nullptr, (T[k - 1]->GetLevel() - T[i - 1]->GetLevel()) / compositeDegree); + + return std::make_shared>(std::move(T)); } template -static inline Ciphertext internalEvalChebyshevSeriesLinearWithPrecomp( - std::vector>& T, const std::vector& coefficients) { - auto cc = T[0]->GetCryptoContext(); - uint32_t k = coefficients.size() - 1; +Ciphertext internalEvalChebyshevSeriesLinearWithPrecomp(std::vector>& T, + const std::vector& coefficients) { + const uint32_t k = coefficients.size() - 2; // perform scalar multiplication for the highest-order term - auto result = cc->EvalMult(T[k - 1], coefficients[k]); + auto cc = T[0]->GetCryptoContext(); + auto result = cc->EvalMult(T[k], 
coefficients[k + 1]); // perform scalar multiplication for all other terms and sum them up - for (uint32_t i = 0; i < k - 1; ++i) { + for (uint32_t i = 0; i < k; ++i) { if (IsNotEqualZero(coefficients[i + 1])) { cc->EvalMultInPlace(T[i], coefficients[i + 1]); cc->EvalAddInPlace(result, T[i]); @@ -839,178 +647,122 @@ static inline Ciphertext internalEvalChebyshevSeriesLinearWithPrecomp( } template -static Ciphertext InnerEvalChebyshevPS(ConstCiphertext& x, - const std::vector& coefficients, uint32_t k, - uint32_t m, std::vector>& T, - std::vector>& T2) { - auto cc = x->GetCryptoContext(); - uint32_t compositeDegree = - std::dynamic_pointer_cast(x->GetCryptoParameters())->GetCompositeDegree(); - +Ciphertext InnerEvalChebyshevPS(ConstCiphertext& x, const std::vector& coefficients, + uint32_t k, uint32_t m, const std::vector>& T, + const std::vector>& T2) { // Compute k*2^{m-1}-k because we use it a lot uint32_t k2m2k = k * (1 << (m - 1)) - k; // Divide coefficients by T^{k*2^{m-1}} - std::vector Tkm(static_cast(k2m2k + k) + 1); + std::vector Tkm(k2m2k + k + 1); Tkm.back() = 1; auto divqr = LongDivisionChebyshev(coefficients, Tkm); // Subtract x^{k(2^{m-1} - 1)} from r - auto r2 = divqr->r; - if (static_cast(k2m2k - Degree(divqr->r)) <= 0) { - r2[static_cast(k2m2k)] -= 1; - r2.resize(Degree(r2) + 1); + auto& r2 = divqr->r; + if (uint32_t n = Degree(r2); static_cast(k2m2k - n) <= 0) { + r2.resize(n + 1); + r2[k2m2k] -= 1; } else { - r2.resize(static_cast(k2m2k + 1)); + r2.resize(k2m2k + 1); r2.back() = -1; } - // Divide r2 by q auto divcs = LongDivisionChebyshev(r2, divqr->q); + auto cc = x->GetCryptoContext(); - // Add x^{k(2^{m-1} - 1)} to s - auto s2 = divcs->r; - s2.resize(static_cast(k2m2k + 1), 0.0); - s2.back() = 1; - - // Evaluate c at u - Ciphertext cu; - uint32_t dc = Degree(divcs->q); - bool flag_c = false; - if (dc >= 1) { - if (dc == 1) { - if (IsNotEqualOne(divcs->q[1])) { - cu = cc->EvalMult(T.front(), divcs->q[1]); - cc->ModReduceInPlace(cu); - } - else { - cu = T.front()->Clone(); - } + Ciphertext cu, qu, su; + + { + // Evaluate q and s2 at u. + // If their degrees are larger than k, then recursively apply the Paterson-Stockmeyer algorithm. + if (Degree(divqr->q) > k) { + qu = InnerEvalChebyshevPS(x, divqr->q, k, m - 1, T, T2); } else { - std::vector> ctxs(dc); - std::vector weights(dc); - - for (uint32_t i = 0; i < dc; ++i) { - ctxs[i] = T[i]; - weights[i] = divcs->q[i + 1]; - } + // dq = k from construction + // perform scalar multiplication for all other terms and sum them up if there are non-zero coefficients - cu = internalEvalLinearWSumMutable(ctxs, weights); - } + // the highest order coefficient will always be a power of two up to 2^{m-1} because q is "monic" but the Chebyshev rule adds a factor of 2 + // we don't need to increase the depth by multiplying the highest order coefficient, but instead checking and summing, since we work with m <= 4. + qu = T[k - 1]->Clone(); + const uint32_t limit = std::log2(ToReal(divqr->q.back())); + for (uint32_t i = 0; i < limit; ++i) + cc->EvalAddInPlaceNoCheck(qu, qu); - // adds the free term (at x^0) - cc->EvalAddInPlace(cu, divcs->q.front() / 2.0); - // Need to reduce levels up to the level of T2[m-1]. - uint32_t levelDiff = T2[m - 1]->GetLevel() - cu->GetLevel(); - cc->LevelReduceInPlace(cu, nullptr, levelDiff / compositeDegree); + // adds the free term (at x^0) + cc->EvalAddInPlace(qu, divqr->q.front() / 2.0); + // The number of levels of qu is the same as the number of levels of T[k-1] + 1. 
+ // Will only get here when m = 2, so the number of levels of qu and T2[m-1] will be the same. - flag_c = true; + divqr->q.resize(k); + if (uint32_t n = Degree(divqr->q); n > 0) + cc->EvalAddInPlace(qu, EvalPartialLinearWSum(T, divqr->q, n)); + } } - // Evaluate q and s2 at u. If their degrees are larger than k, then recursively apply the Paterson-Stockmeyer algorithm. - Ciphertext qu; - - if (Degree(divqr->q) > k) { - qu = InnerEvalChebyshevPS(x, divqr->q, k, m - 1, T, T2); - } - else { - // dq = k from construction - // perform scalar multiplication for all other terms and sum them up if there are non-zero coefficients - auto qcopy = divqr->q; - qcopy.resize(k); - if (Degree(qcopy) > 0) { - std::vector> ctxs(Degree(qcopy)); - std::vector weights(Degree(qcopy)); - - for (uint32_t i = 0; i < Degree(qcopy); i++) { - ctxs[i] = T[i]; - weights[i] = divqr->q[i + 1]; - } + { + // Add x^{k(2^{m-1} - 1)} to s + auto& s2 = divcs->r; + s2.resize(k2m2k + 1); + s2.back() = 1; - qu = cc->EvalLinearWSumMutable(ctxs, weights); - // the highest order coefficient will always be a power of two up to 2^{m-1} because q is "monic" but the Chebyshev rule adds a factor of 2 - // we don't need to increase the depth by multiplying the highest order coefficient, but instead checking and summing, since we work with m <= 4. - Ciphertext sum = T[k - 1]->Clone(); - uint32_t limit = log2(ToReal(divqr->q.back())); - for (uint32_t i = 0; i < limit; ++i) { - sum = cc->EvalAdd(sum, sum); - } - cc->EvalAddInPlace(qu, sum); + if (Degree(s2) > k) { + su = InnerEvalChebyshevPS(x, s2, k, m - 1, T, T2); } else { - Ciphertext sum = T[k - 1]->Clone(); - uint32_t limit = log2(ToReal(divqr->q.back())); - for (uint32_t i = 0; i < limit; ++i) { - sum = cc->EvalAdd(sum, sum); - } - qu = sum; - } + // the highest order coefficient will always be 1 because s2 is monic. + su = T[k - 1]->Clone(); - // adds the free term (at x^0) - cc->EvalAddInPlace(qu, divqr->q.front() / 2.0); - // The number of levels of qu is the same as the number of levels of T[k-1] or T[k-1] + 1. - // No need to reduce it to T2[m-1] because it only reaches here when m = 2. - } + // ds = k from construction + // perform scalar multiplication for all other terms and sum them up if there are non-zero coefficients + s2.resize(k); + if (uint32_t n = Degree(s2); n > 0) + cc->EvalAddInPlace(su, EvalPartialLinearWSum(T, s2, n)); - Ciphertext su; + // adds the free term (at x^0) + cc->EvalAddInPlace(su, s2.front() / 2.0); - if (Degree(s2) > k) { - su = InnerEvalChebyshevPS(x, s2, k, m - 1, T, T2); + // The number of levels of su is the same as the number of levels of T[k-1] or T[k-1] + 1. Need to reduce it to T2[m-1] + 1. + cc->LevelReduceInPlace(su, nullptr); + } } - else { - // ds = k from construction - // perform scalar multiplication for all other terms and sum them up if there are non-zero coefficients - auto scopy = s2; - scopy.resize(k); - if (Degree(scopy) > 0) { - std::vector> ctxs(Degree(scopy)); - std::vector weights(Degree(scopy)); - - for (uint32_t i = 0; i < Degree(scopy); i++) { - ctxs[i] = T[i]; - weights[i] = s2[i + 1]; - } - su = cc->EvalLinearWSumMutable(ctxs, weights); - // the highest order coefficient will always be 1 because s2 is monic. 
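// --------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch. In the Chebyshev
// variant the quotient's leading coefficient is a power of two (the basis
// change introduces factors of 2), so instead of a ciphertext-constant
// multiplication, which would increase the depth, the code above doubles the
// ciphertext log2(coefficient) times using plain additions. On plaintext
// values the two are identical:
#include <cmath>
#include <cstdio>

int main() {
    const double leading = 8.0;                    // 2^3, as produced by LongDivisionChebyshev
    const double t       = 0.3141592653589793;     // stands in for T[k-1]

    double sum = t;
    const unsigned limit = static_cast<unsigned>(std::log2(leading));
    for (unsigned i = 0; i < limit; ++i)
        sum += sum;                                // doubling via addition

    std::printf("by additions: %.15f\n", sum);
    std::printf("by multiply : %.15f\n", leading * t);  // same value, but additions are cheap in CKKS
    return 0;
}
// --------------------------------------------------------------------------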
- cc->EvalAddInPlace(su, T[k - 1]); + if (uint32_t n = Degree(divcs->q); n >= 1) { + if (n == 1) { + if (IsNotEqualOne(divcs->q[1])) { + cu = cc->EvalMult(T.front(), divcs->q[1]); + cc->ModReduceInPlace(cu); + } + else { + cu = T.front()->Clone(); + } } else { - su = T[k - 1]->Clone(); + cu = EvalPartialLinearWSum(T, divcs->q, n); } // adds the free term (at x^0) - cc->EvalAddInPlace(su, s2.front() / 2.0); - // The number of levels of su is the same as the number of levels of T[k-1] or T[k-1] + 1. Need to reduce it to T2[m-1] + 1. - // su = cc->LevelReduce(su, nullptr, su->GetElements()[0].GetNumOfElements() - Lm + 1) ; - cc->LevelReduceInPlace(su, nullptr); - } - - Ciphertext result; + cc->EvalAddInPlace(cu, divcs->q.front() / 2.0); - if (flag_c) { - result = cc->EvalAdd(T2[m - 1], cu); - } - else { - result = cc->EvalAdd(T2[m - 1], divcs->q.front() / 2.0); + // Need to reduce levels up to the level of T2[m-1]. + uint32_t cd = + std::dynamic_pointer_cast(x->GetCryptoParameters())->GetCompositeDegree(); + cc->LevelReduceInPlace(cu, nullptr, (T2[m - 1]->GetLevel() - cu->GetLevel()) / cd); } - result = cc->EvalMult(result, qu); - cc->ModReduceInPlace(result); + cu = cu ? cc->EvalAdd(T2[m - 1], cu) : cc->EvalAdd(T2[m - 1], divcs->q.front() / 2.0); + auto result = cc->EvalMult(cu, qu); + cc->ModReduceInPlace(result); cc->EvalAddInPlace(result, su); - return result; } -template -std::shared_ptr> internalEvalChebyPolysPS(ConstCiphertext& x, - const std::vector& coefficients, +std::shared_ptr> internalEvalChebyPolysPS(ConstCiphertext& x, uint32_t degree, double a, double b) { - auto n = Degree(coefficients); - auto degs = ComputeDegreesPS(n); + auto degs = ComputeDegreesPS(degree); uint32_t k = degs[0]; uint32_t m = degs[1]; @@ -1018,8 +770,7 @@ std::shared_ptr> internalEvalChebyPolysPS(ConstCiphertext // consumes one level when a <> -1 && b <> 1 auto cc = x->GetCryptoContext(); std::vector> T(k); - if ((a - std::round(a) < 1e-10) && (b - std::round(b) < 1e-10) && (std::round(a) == -1.0) && - (std::round(b) == 1.0)) { + if (!IsNotEqualNegOne(a) && !IsNotEqualOne(b)) { // no linear transformation is needed if a = -1, b = 1 // T_1(y) = y T[0] = x->Clone(); @@ -1027,291 +778,105 @@ std::shared_ptr> internalEvalChebyPolysPS(ConstCiphertext else { // linear transformation is needed double alpha = 2 / (b - a); - double beta = 2 * a / (b - a); + double beta = a * alpha; T[0] = cc->EvalMult(x, alpha); cc->ModReduceInPlace(T[0]); cc->EvalAddInPlace(T[0], -1.0 - beta); } - Ciphertext y = T[0]->Clone(); - // Computes Chebyshev polynomials up to degree k // for y: T_1(y) = y, T_2(y), ... 
, T_k(y) // uses binary tree multiplication for (uint32_t i = 2; i <= k; ++i) { - // if i is a power of two - if (!(i & (i - 1))) { - // compute T_{2i}(y) = 2*T_i(y)^2 - 1 - auto square = cc->EvalSquare(T[i / 2 - 1]); - T[i - 1] = cc->EvalAdd(square, square); + if (i & 0x1) { // if i is odd + // compute T_{2i+1}(y) = 2*T_i(y)*T_{i+1}(y) - y + T[i - 1] = cc->EvalMult(T[i / 2 - 1], T[i / 2]); + cc->EvalAddInPlaceNoCheck(T[i - 1], T[i - 1]); cc->ModReduceInPlace(T[i - 1]); - cc->EvalAddInPlace(T[i - 1], -1.0); + cc->EvalSubInPlace(T[i - 1], T[0]); } else { - // non-power of 2 - if (i % 2 == 1) { - // if i is odd - // compute T_{2i+1}(y) = 2*T_i(y)*T_{i+1}(y) - y - auto prod = cc->EvalMult(T[i / 2 - 1], T[i / 2]); - T[i - 1] = cc->EvalAdd(prod, prod); - - cc->ModReduceInPlace(T[i - 1]); - cc->EvalSubInPlace(T[i - 1], y); - } - else { - // i is even but not power of 2 - // compute T_{2i}(y) = 2*T_i(y)^2 - 1 - auto square = cc->EvalSquare(T[i / 2 - 1]); - T[i - 1] = cc->EvalAdd(square, square); - cc->ModReduceInPlace(T[i - 1]); - cc->EvalAddInPlace(T[i - 1], -1.0); - } + // compute T_{2i}(y) = 2*T_i(y)^2 - 1 + T[i - 1] = cc->EvalSquare(T[i / 2 - 1]); + cc->EvalAddInPlaceNoCheck(T[i - 1], T[i - 1]); + cc->ModReduceInPlace(T[i - 1]); + cc->EvalAddInPlace(T[i - 1], -1.0); } } const auto cryptoParams = std::dynamic_pointer_cast(T[k - 1]->GetCryptoParameters()); - - auto algo = cc->GetScheme(); - if (cryptoParams->GetScalingTechnique() == FIXEDMANUAL) { // brings all powers of x to the same level - for (uint32_t i = 1; i < k; ++i) { - uint32_t levelDiff = T[k - 1]->GetLevel() - T[i - 1]->GetLevel(); - cc->LevelReduceInPlace(T[i - 1], nullptr, levelDiff); - } + for (uint32_t i = 1; i < k; ++i) + cc->LevelReduceInPlace(T[i - 1], nullptr, T[k - 1]->GetLevel() - T[i - 1]->GetLevel()); } else { - for (uint32_t i = 1; i < k; ++i) { - algo->AdjustLevelsAndDepthInPlace(T[i - 1], T[k - 1]); - } + for (uint32_t i = 1; i < k; ++i) + cc->GetScheme()->AdjustLevelsAndDepthInPlace(T[i - 1], T[k - 1]); } std::vector> T2(m); - // Compute the Chebyshev polynomials T_k(y), T_{2k}(y), T_{4k}(y), ... , T_{2^{m-1}k}(y) // T2[0] is used as a placeholder - T2.front() = T.back(); - for (uint32_t i = 1; i < m; i++) { - auto square = cc->EvalSquare(T2[i - 1]); - T2[i] = cc->EvalAdd(square, square); + T2[0] = T.back(); + + // computes T_{k(2*m - 1)}(y) + auto T2km1 = T.back(); + + for (uint32_t i = 1; i < m; ++i) { + // Compute the Chebyshev polynomials T_k(y), T_{2k}(y), T_{4k}(y), ... , T_{2^{m-1}k}(y) + T2[i] = cc->EvalSquare(T2[i - 1]); + cc->EvalAddInPlaceNoCheck(T2[i], T2[i]); cc->ModReduceInPlace(T2[i]); cc->EvalAddInPlace(T2[i], -1.0); - } - // computes T_{k(2*m - 1)}(y) - auto T2km1 = T2.front(); - for (uint32_t i = 1; i < m; i++) { // compute T_{k(2*m - 1)} = 2*T_{k(2^{m-1}-1)}(y)*T_{k*2^{m-1}}(y) - T_k(y) - auto prod = cc->EvalMult(T2km1, T2[i]); - T2km1 = cc->EvalAdd(prod, prod); + T2km1 = cc->EvalMult(T2km1, T2[i]); + cc->EvalAddInPlaceNoCheck(T2km1, T2km1); cc->ModReduceInPlace(T2km1); - cc->EvalSubInPlace(T2km1, T2.front()); + cc->EvalSubInPlace(T2km1, T2[0]); } - // We also need to reduce the number of levels of T[k-1] and of T2[0] by another level. 
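// --------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch. The binary tree
// above (and the T2 doubling that follows) relies on the product identities
// 2*T_i(y)^2 = T_{2i}(y) + 1 and 2*T_i(y)*T_{i+1}(y) = T_{2i+1}(y) + T_1(y),
// so each new Chebyshev polynomial costs one ciphertext multiplication. On
// plaintext inputs in [-1, 1] the construction can be checked directly against
// the closed form T_k(y) = cos(k*acos(y)):
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const unsigned k = 13;
    const double y   = 0.37;
    std::vector<double> T(k);                  // T[i-1] holds T_i(y)
    T[0] = y;                                  // T_1(y) = y
    for (unsigned i = 2; i <= k; ++i) {
        if (i & 0x1)                           // odd: T_{2j+1} = 2*T_j*T_{j+1} - T_1
            T[i - 1] = 2.0 * T[i / 2 - 1] * T[i / 2] - T[0];
        else                                   // even: T_{2j} = 2*T_j^2 - 1
            T[i - 1] = 2.0 * T[i / 2 - 1] * T[i / 2 - 1] - 1.0;
        std::printf("T_%u: tree=%+.12f  closed=%+.12f\n",
                    i, T[i - 1], std::cos(i * std::acos(y)));
    }
    return 0;
}
// --------------------------------------------------------------------------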
- // cc->LevelReduceInPlace(T[k-1], nullptr); - // cc->LevelReduceInPlace(T2.front(), nullptr); - - return std::make_shared>(T, T2, T2km1, k, m); + return std::make_shared>(std::move(T), std::move(T2), std::move(T2km1), k, m); } template -static inline Ciphertext internalEvalChebyshevSeriesPSWithPrecomp( - std::shared_ptr> ctxtPolys, const std::vector& coefficients) { - auto f2 = coefficients; - auto n = Degree(f2); - f2.resize(n + 1); - - auto T = ctxtPolys->powersRe; - auto T2 = ctxtPolys->powers2Re; - auto T2km1 = ctxtPolys->power2km1Re; - auto k = ctxtPolys->k; - auto m = ctxtPolys->m; +Ciphertext internalEvalChebyshevSeriesPSWithPrecomp(const std::shared_ptr>& ctxtPolys, + const std::vector& coefficients) { + auto& T = ctxtPolys->powersRe; + auto& T2 = ctxtPolys->powers2Re; + auto& T2km1 = ctxtPolys->power2km1Re; + auto k = ctxtPolys->k; + auto m = ctxtPolys->m; // Compute k*2^{m-1}-k because we use it a lot uint32_t k2m2k = k * (1 << (m - 1)) - k; // Add T^{k(2^m - 1)}(y) to the polynomial that has to be evaluated - f2.resize(2 * k2m2k + k + 1, 0.0); + auto f2 = coefficients; + f2.resize(Degree(f2) + 1); + f2.resize(2 * k2m2k + k + 1); f2.back() = 1; - // Divide f2 by T^{k*2^{m-1}} - std::vector Tkm(k2m2k + k + 1); - Tkm.back() = 1; - auto divqr = LongDivisionChebyshev(f2, Tkm); - - // Subtract x^{k(2^{m-1} - 1)} from r - auto r2 = divqr->r; - if (static_cast(k2m2k - Degree(r2)) <= 0) { - r2[static_cast(k2m2k)] -= 1; - r2.resize(Degree(r2) + 1); - } - else { - r2.resize(static_cast(k2m2k + 1)); - r2.back() = -1; - } - - // Divide r2 by q - auto divcs = LongDivisionChebyshev(r2, divqr->q); - - // Add x^{k(2^{m-1} - 1)} to s - auto s2 = divcs->r; - s2.resize(k2m2k + 1); - s2.back() = 1; - - auto cc = T[0]->GetCryptoContext(); - - // Evaluate c at u - Ciphertext cu; - uint32_t dc = Degree(divcs->q); - bool flag_c = false; - if (dc >= 1) { - if (dc == 1) { - if (IsNotEqualOne(divcs->q[1])) { - cu = cc->EvalMult(T.front(), divcs->q[1]); - cc->ModReduceInPlace(cu); - } - else { - cu = T.front()->Clone(); - } - } - else { - std::vector> ctxs(dc); - std::vector weights(dc); - - for (uint32_t i = 0; i < dc; i++) { - ctxs[i] = T[i]; - weights[i] = divcs->q[i + 1]; - } - - cu = cc->EvalLinearWSumMutable(ctxs, weights); - } - - // adds the free term (at x^0) - cc->EvalAddInPlace(cu, divcs->q.front() / 2.0); - // TODO : Andrey why not T2[m-1]->GetLevel() instead? - // Need to reduce levels to the level of T2[m-1]. - // uint32_t levelDiff = y->GetLevel() - cu->GetLevel() + ceil(log2(k)) + m - 1; - // cc->LevelReduceInPlace(cu, nullptr, levelDiff); - - flag_c = true; - } - - // Evaluate q and s2 at u. If their degrees are larger than k, then recursively apply the Paterson-Stockmeyer algorithm. 
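// --------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch. LongDivisionPoly
// and LongDivisionChebyshev split a coefficient vector f into a quotient q and
// remainder r with f = q * d + r and deg(r) < deg(d); the Paterson-Stockmeyer
// code always divides by a monic (mon-like) basis element such as x^{k*2^{m-1}}.
// In the power basis, with a monic divisor and coefficients stored lowest
// degree first, the schoolbook version is simply:
#include <utility>
#include <vector>

struct DivRem {
    std::vector<double> q;
    std::vector<double> r;
};

// f and d are coefficient vectors, lowest degree first; d is assumed monic.
DivRem LongDivide(std::vector<double> f, const std::vector<double>& d) {
    const size_t degD = d.size() - 1;
    std::vector<double> q(f.size() > degD ? f.size() - degD : 1, 0.0);
    for (size_t i = f.size(); i-- > degD; ) {  // cancel leading terms, highest first
        const double c = f[i];                 // coefficient to eliminate
        q[i - degD] = c;
        for (size_t j = 0; j <= degD; ++j)
            f[i - degD + j] -= c * d[j];
    }
    f.resize(degD ? degD : 1);                 // what is left is the remainder
    return {std::move(q), std::move(f)};
}
// --------------------------------------------------------------------------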
- Ciphertext qu; - - if (Degree(divqr->q) > k) { - qu = InnerEvalChebyshevPS(T[0], divqr->q, k, m - 1, T, T2); - } - else { - // dq = k from construction - // perform scalar multiplication for all other terms and sum them up if there are non-zero coefficients - auto qcopy = divqr->q; - qcopy.resize(k); - if (Degree(qcopy) > 0) { - std::vector> ctxs(Degree(qcopy)); - std::vector weights(Degree(qcopy)); - - for (uint32_t i = 0; i < Degree(qcopy); ++i) { - ctxs[i] = T[i]; - weights[i] = divqr->q[i + 1]; - } - qu = internalEvalLinearWSumMutable(ctxs, weights); - // the highest order coefficient will always be a power of two up to 2^{m-1} because q is "monic" but the Chebyshev rule adds a factor of 2 - // we don't need to increase the depth by multiplying the highest order coefficient, but instead checking and summing, since we work with m <= 4. - Ciphertext sum = T[k - 1]->Clone(); - uint32_t limit = log2(ToReal(divqr->q.back())); - for (uint32_t i = 0; i < limit; ++i) { - sum = cc->EvalAdd(sum, sum); - } - cc->EvalAddInPlace(qu, sum); - } - else { - Ciphertext sum = T[k - 1]->Clone(); - uint32_t limit = log2(ToReal(divqr->q.back())); - for (uint32_t i = 0; i < limit; ++i) { - sum = cc->EvalAdd(sum, sum); - } - qu = sum; - } - - // adds the free term (at x^0) - cc->EvalAddInPlace(qu, divqr->q.front() / 2.0); - // The number of levels of qu is the same as the number of levels of T[k-1] + 1. - // Will only get here when m = 2, so the number of levels of qu and T2[m-1] will be the same. - } - - Ciphertext su; - - if (Degree(s2) > k) { - su = InnerEvalChebyshevPS(T[0], s2, k, m - 1, T, T2); - } - else { - // ds = k from construction - // perform scalar multiplication for all other terms and sum them up if there are non-zero coefficients - auto scopy = s2; - scopy.resize(k); - if (Degree(scopy) > 0) { - std::vector> ctxs(Degree(scopy)); - std::vector weights(Degree(scopy)); - - for (uint32_t i = 0; i < Degree(scopy); ++i) { - ctxs[i] = T[i]; - weights[i] = s2[i + 1]; - } - - su = cc->EvalLinearWSumMutable(ctxs, weights); - // the highest order coefficient will always be 1 because s2 is monic. - cc->EvalAddInPlace(su, T[k - 1]); - } - else { - su = T[k - 1]; - } - - // adds the free term (at x^0) - cc->EvalAddInPlace(su, s2.front() / 2.0); - // The number of levels of su is the same as the number of levels of T[k-1] + 1. - // Will only get here when m = 2, so need to reduce the number of levels by 1. - } - - // TODO : Andrey : here is different from 895 line - // Reduce number of levels of su to number of levels of T2km1. - // cc->LevelReduceInPlace(su, nullptr); - - Ciphertext result; - - if (flag_c) { - result = cc->EvalAdd(T2[m - 1], cu); - } - else { - result = cc->EvalAdd(T2[m - 1], divcs->q.front() / 2.0); - } - - result = cc->EvalMult(result, qu); - cc->ModReduceInPlace(result); - - cc->EvalAddInPlace(result, su); - cc->EvalSubInPlace(result, T2km1); - - return result; + return T[0]->GetCryptoContext()->EvalSub(InnerEvalChebyshevPS(T[0], f2, k, m, T, T2), T2km1); } std::shared_ptr> AdvancedSHECKKSRNS::EvalChebyPolys(ConstCiphertext& x, const std::vector& coefficients, double a, double b) const { - return (Degree(coefficients) < 5) ? internalEvalChebyPolysLinear(x, coefficients, a, b) : - internalEvalChebyPolysPS(x, coefficients, a, b); + uint32_t d = Degree(coefficients); + return (d < 5) ? 
internalEvalChebyPolysLinear(x, coefficients, a, b) : internalEvalChebyPolysPS(x, d, a, b); } std::shared_ptr> AdvancedSHECKKSRNS::EvalChebyPolys(ConstCiphertext& x, const std::vector& coefficients, double a, double b) const { - return (Degree(coefficients) < 5) ? internalEvalChebyPolysLinear(x, coefficients, a, b) : - internalEvalChebyPolysPS(x, coefficients, a, b); + uint32_t d = Degree(coefficients); + return (d < 5) ? internalEvalChebyPolysLinear(x, coefficients, a, b) : internalEvalChebyPolysPS(x, d, a, b); } std::shared_ptr> AdvancedSHECKKSRNS::EvalChebyPolys( ConstCiphertext& x, const std::vector>& coefficients, double a, double b) const { - return (Degree(coefficients) < 5) ? internalEvalChebyPolysLinear(x, coefficients, a, b) : - internalEvalChebyPolysPS(x, coefficients, a, b); + uint32_t d = Degree(coefficients); + return (d < 5) ? internalEvalChebyPolysLinear(x, coefficients, a, b) : internalEvalChebyPolysPS(x, d, a, b); } Ciphertext AdvancedSHECKKSRNS::EvalChebyshevSeries(ConstCiphertext& x, @@ -1333,12 +898,12 @@ Ciphertext AdvancedSHECKKSRNS::EvalChebyshevSeries(ConstCiphertext AdvancedSHECKKSRNS::EvalChebyshevSeriesWithPrecomp( std::shared_ptr> ctxtPowers, const std::vector& coeffs) const { return (Degree(coeffs) < 5) ? internalEvalChebyshevSeriesLinearWithPrecomp(ctxtPowers->powersRe, coeffs) : - internalEvalPolyPSWithPrecomp(ctxtPowers, coeffs); + internalEvalChebyshevSeriesPSWithPrecomp(ctxtPowers, coeffs); } Ciphertext AdvancedSHECKKSRNS::EvalChebyshevSeriesWithPrecomp( std::shared_ptr> ctxtPowers, const std::vector& coeffs) const { return (Degree(coeffs) < 5) ? internalEvalChebyshevSeriesLinearWithPrecomp(ctxtPowers->powersRe, coeffs) : - internalEvalPolyPSWithPrecomp(ctxtPowers, coeffs); + internalEvalChebyshevSeriesPSWithPrecomp(ctxtPowers, coeffs); } Ciphertext AdvancedSHECKKSRNS::EvalChebyshevSeriesWithPrecomp( std::shared_ptr> ctxtPowers, const std::vector>& coeffs) const { @@ -1368,21 +933,17 @@ Ciphertext AdvancedSHECKKSRNS::EvalChebyshevSeriesLinear(ConstCipherte Ciphertext AdvancedSHECKKSRNS::EvalChebyshevSeriesPS(ConstCiphertext& x, const std::vector& coeffs, double a, double b) const { - return internalEvalChebyshevSeriesPSWithPrecomp(internalEvalChebyPolysPS(x, coeffs, a, b), coeffs); + return internalEvalChebyshevSeriesPSWithPrecomp(internalEvalChebyPolysPS(x, Degree(coeffs), a, b), coeffs); } Ciphertext AdvancedSHECKKSRNS::EvalChebyshevSeriesPS(ConstCiphertext& x, const std::vector& coeffs, double a, double b) const { - return internalEvalChebyshevSeriesPSWithPrecomp(internalEvalChebyPolysPS(x, coeffs, a, b), coeffs); + return internalEvalChebyshevSeriesPSWithPrecomp(internalEvalChebyPolysPS(x, Degree(coeffs), a, b), coeffs); } Ciphertext AdvancedSHECKKSRNS::EvalChebyshevSeriesPS(ConstCiphertext& x, const std::vector>& coeffs, double a, double b) const { - return internalEvalChebyshevSeriesPSWithPrecomp(internalEvalChebyPolysPS(x, coeffs, a, b), coeffs); + return internalEvalChebyshevSeriesPSWithPrecomp(internalEvalChebyPolysPS(x, Degree(coeffs), a, b), coeffs); } -//------------------------------------------------------------------------------ -// EVAL LINEAR TRANSFORMATION -//------------------------------------------------------------------------------ - } // namespace lbcrypto diff --git a/src/pke/lib/scheme/ckksrns/ckksrns-fhe.cpp b/src/pke/lib/scheme/ckksrns/ckksrns-fhe.cpp index c895b3e3e..0d44ebf2d 100644 --- a/src/pke/lib/scheme/ckksrns/ckksrns-fhe.cpp +++ b/src/pke/lib/scheme/ckksrns/ckksrns-fhe.cpp @@ -120,32 +120,32 @@ void 
FHECKKSRNS::EvalBootstrapSetup(const CryptoContextImpl& cc, std:: auto& precom = m_bootPrecomMap[slots]; precom->m_slots = slots; - precom->m_dim1 = dim1[0]; // even for the case of a single slot we need one level for rescaling uint32_t logSlots = (slots < 3) ? 1 : std::log2(slots); // Perform some checks on the level budget and compute parameters - std::vector newBudget = levelBudget; - if (newBudget[0] > logSlots) { + uint32_t newBudget0 = levelBudget[0]; + if (newBudget0 > logSlots) { std::cerr << "\nWarning, the level budget for encoding is too large. Setting it to " << logSlots << std::endl; - newBudget[0] = logSlots; + newBudget0 = logSlots; } - if (newBudget[0] < 1) { + if (newBudget0 < 1) { std::cerr << "\nWarning, the level budget for encoding can not be zero. Setting it to 1" << std::endl; - newBudget[0] = 1; + newBudget0 = 1; } - if (newBudget[1] > logSlots) { + uint32_t newBudget1 = levelBudget[1]; + if (newBudget1 > logSlots) { std::cerr << "\nWarning, the level budget for decoding is too large. Setting it to " << logSlots << std::endl; - newBudget[1] = logSlots; + newBudget1 = logSlots; } - if (newBudget[1] < 1) { + if (newBudget1 < 1) { std::cerr << "\nWarning, the level budget for decoding can not be zero. Setting it to 1" << std::endl; - newBudget[1] = 1; + newBudget1 = 1; } - precom->m_paramsEnc = GetCollapsedFFTParams(slots, newBudget[0], dim1[0]); - precom->m_paramsDec = GetCollapsedFFTParams(slots, newBudget[1], dim1[1]); + precom->m_paramsEnc = GetCollapsedFFTParams(slots, newBudget0, dim1[0]); + precom->m_paramsDec = GetCollapsedFFTParams(slots, newBudget1, dim1[1]); if (precompute) { uint32_t m = 4 * slots; @@ -197,48 +197,51 @@ void FHECKKSRNS::EvalBootstrapSetup(const CryptoContextImpl& cc, std:: uint32_t approxModDepth = GetModDepthInternal(cryptoParams->GetSecretKeyDist()); - uint32_t depthBT = approxModDepth + precom->m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET] + - precom->m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET]; + uint32_t depthBT = approxModDepth + precom->m_paramsEnc.lvlb + precom->m_paramsDec.lvlb; // compute # of levels to remain when encoding the coefficients // for FLEXIBLEAUTOEXT we do not need extra modulus in auxiliary plaintexts auto st = cryptoParams->GetScalingTechnique(); uint32_t L0 = cryptoParams->GetElementParams()->GetParams().size() - (st == FLEXIBLEAUTOEXT); - uint32_t lEnc = L0 - compositeDegree * (precom->m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET] + 1); + uint32_t lEnc = L0 - compositeDegree * (precom->m_paramsEnc.lvlb + 1); uint32_t lDec = L0 - compositeDegree * depthBT; - bool isLTBootstrap = (precom->m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET] == 1) && - (precom->m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET] == 1); + bool isLTBootstrap = (precom->m_paramsEnc.lvlb == 1) && (precom->m_paramsDec.lvlb == 1); if (isLTBootstrap) { - // allocate all vectors - std::vector>> U0(slots, std::vector>(slots)); - std::vector>> U1(slots, std::vector>(slots)); - std::vector>> U0hatT(slots, std::vector>(slots)); - std::vector>> U1hatT(slots, std::vector>(slots)); - - for (uint32_t i = 0; i < slots; i++) { - for (uint32_t j = 0; j < slots; j++) { - U0[i][j] = ksiPows[(j * rotGroup[i]) & mmask]; - U0hatT[j][i] = std::conj(U0[i][j]); - U1[i][j] = std::complex(0, 1) * U0[i][j]; - U1hatT[j][i] = std::conj(U1[i][j]); + if (isSparse) { + std::vector>> U0(slots, std::vector>(slots)); + std::vector>> U0hatT(slots, std::vector>(slots)); + std::vector>> U1(slots, std::vector>(slots)); + std::vector>> U1hatT(slots, std::vector>(slots)); + for (uint32_t i = 0; i 
< slots; ++i) { + for (uint32_t j = 0; j < slots; ++j) { + U0[i][j] = ksiPows[(j * rotGroup[i]) & mmask]; + U0hatT[j][i] = std::conj(U0[i][j]); + U1[i][j] = std::complex(0, 1) * U0[i][j]; + U1hatT[j][i] = std::conj(U1[i][j]); + } } - } - - if (!isSparse) { - precom->m_U0hatTPre = EvalLinearTransformPrecompute(cc, U0hatT, scaleEnc, lEnc); - precom->m_U0Pre = EvalLinearTransformPrecompute(cc, U0, scaleDec, lDec); + precom->m_U0Pre = EvalLinearTransformPrecompute(cc, U0, U1, 1, scaleDec, lDec); + precom->m_U0hatTPre = EvalLinearTransformPrecompute(cc, U0hatT, U1hatT, 0, scaleEnc, lEnc); } else { - precom->m_U0hatTPre = EvalLinearTransformPrecompute(cc, U0hatT, U1hatT, 0, scaleEnc, lEnc); - precom->m_U0Pre = EvalLinearTransformPrecompute(cc, U0, U1, 1, scaleDec, lDec); + std::vector>> U0(slots, std::vector>(slots)); + std::vector>> U0hatT(slots, std::vector>(slots)); + for (uint32_t i = 0; i < slots; ++i) { + for (uint32_t j = 0; j < slots; ++j) { + U0[i][j] = ksiPows[(j * rotGroup[i]) & mmask]; + U0hatT[j][i] = std::conj(U0[i][j]); + } + } + precom->m_U0Pre = EvalLinearTransformPrecompute(cc, U0, scaleDec, lDec); + precom->m_U0hatTPre = EvalLinearTransformPrecompute(cc, U0hatT, scaleEnc, lEnc); } } else { - precom->m_U0hatTPreFFT = EvalCoeffsToSlotsPrecompute(cc, ksiPows, rotGroup, false, scaleEnc, lEnc); precom->m_U0PreFFT = EvalSlotsToCoeffsPrecompute(cc, ksiPows, rotGroup, false, scaleDec, lDec); + precom->m_U0hatTPreFFT = EvalCoeffsToSlotsPrecompute(cc, ksiPows, rotGroup, false, scaleEnc, lEnc); } } } @@ -258,22 +261,19 @@ std::shared_ptr>> FHECKKSRNS::EvalBootstrap auto algo = cc->GetScheme(); auto M = cc->GetCyclotomicOrder(); - if (slots == 0) - slots = M / 4; + slots = (slots == 0) ? M / 4 : slots; // computing all indices for baby-step giant-step procedure auto evalKeys = algo->EvalAtIndexKeyGen(nullptr, privateKey, FindBootstrapRotationIndices(slots, M)); - auto conjKey = ConjugateKeyGen(privateKey); - (*evalKeys)[M - 1] = conjKey; + (*evalKeys)[M - 1] = ConjugateKeyGen(privateKey); if (cryptoParams->GetSecretKeyDist() == SPARSE_ENCAPSULATED) { DCRTPoly::TugType tug; - DCRTPoly sNew(tug, cryptoParams->GetElementParams(), Format::EVALUATION, 32); // sparse key used for the modraising step auto skNew = std::make_shared>(cc); - skNew->SetPrivateElement(std::move(sNew)); + skNew->SetPrivateElement(DCRTPoly(tug, cryptoParams->GetElementParams(), Format::EVALUATION, 32)); // we reserve M-4 and M-2 for the sparse encapsulation switching keys // Even autorphism indices are not possible, so there will not be any conflict @@ -298,11 +298,9 @@ void FHECKKSRNS::EvalBootstrapPrecompute(const CryptoContextImpl& cc, uint32_t slots = (numSlots == 0) ? 
M / 4 : numSlots; auto& p = GetBootPrecom(slots); - std::vector dim1{p.m_dim1, static_cast(p.m_paramsDec[CKKS_BOOT_PARAMS::GIANT_STEP])}; - std::vector newBudget{static_cast(p.m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET]), - static_cast(p.m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET])}; - p.m_paramsEnc = GetCollapsedFFTParams(slots, newBudget[0], dim1[0]); - p.m_paramsDec = GetCollapsedFFTParams(slots, newBudget[1], dim1[1]); + + p.m_paramsEnc = GetCollapsedFFTParams(slots, p.m_paramsEnc.lvlb, p.m_paramsEnc.g); + p.m_paramsDec = GetCollapsedFFTParams(slots, p.m_paramsDec.lvlb, p.m_paramsDec.g); uint32_t m = 4 * slots; uint32_t mmask = m - 1; // assumes m is power of 2 @@ -353,48 +351,51 @@ void FHECKKSRNS::EvalBootstrapPrecompute(const CryptoContextImpl& cc, uint32_t approxModDepth = GetModDepthInternal(cryptoParams->GetSecretKeyDist()); - uint32_t depthBT = - approxModDepth + p.m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET] + p.m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET]; + uint32_t depthBT = approxModDepth + p.m_paramsEnc.lvlb + p.m_paramsDec.lvlb; // compute # of levels to remain when encoding the coefficients // for FLEXIBLEAUTOEXT we do not need extra modulus in auxiliary plaintexts auto st = cryptoParams->GetScalingTechnique(); uint32_t L0 = cryptoParams->GetElementParams()->GetParams().size() - (st == FLEXIBLEAUTOEXT); - uint32_t lEnc = L0 - compositeDegree * (p.m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET] + 1); + uint32_t lEnc = L0 - compositeDegree * (p.m_paramsEnc.lvlb + 1); uint32_t lDec = L0 - compositeDegree * depthBT; - bool isLTBootstrap = - (p.m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET] == 1) && (p.m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET] == 1); + bool isLTBootstrap = (p.m_paramsEnc.lvlb == 1) && (p.m_paramsDec.lvlb == 1); if (isLTBootstrap) { - // allocate all vectors - std::vector>> U0(slots, std::vector>(slots)); - std::vector>> U1(slots, std::vector>(slots)); - std::vector>> U0hatT(slots, std::vector>(slots)); - std::vector>> U1hatT(slots, std::vector>(slots)); - - for (size_t i = 0; i < slots; i++) { - for (size_t j = 0; j < slots; j++) { - U0[i][j] = ksiPows[(j * rotGroup[i]) & mmask]; - U0hatT[j][i] = std::conj(U0[i][j]); - U1[i][j] = std::complex(0, 1) * U0[i][j]; - U1hatT[j][i] = std::conj(U1[i][j]); + if (isSparse) { + std::vector>> U0(slots, std::vector>(slots)); + std::vector>> U0hatT(slots, std::vector>(slots)); + std::vector>> U1(slots, std::vector>(slots)); + std::vector>> U1hatT(slots, std::vector>(slots)); + for (uint32_t i = 0; i < slots; ++i) { + for (uint32_t j = 0; j < slots; ++j) { + U0[i][j] = ksiPows[(j * rotGroup[i]) & mmask]; + U0hatT[j][i] = std::conj(U0[i][j]); + U1[i][j] = std::complex(0, 1) * U0[i][j]; + U1hatT[j][i] = std::conj(U1[i][j]); + } } - } - - if (!isSparse) { - p.m_U0hatTPre = EvalLinearTransformPrecompute(cc, U0hatT, scaleEnc, lEnc); - p.m_U0Pre = EvalLinearTransformPrecompute(cc, U0, scaleDec, lDec); + p.m_U0Pre = EvalLinearTransformPrecompute(cc, U0, U1, 1, scaleDec, lDec); + p.m_U0hatTPre = EvalLinearTransformPrecompute(cc, U0hatT, U1hatT, 0, scaleEnc, lEnc); } else { - p.m_U0hatTPre = EvalLinearTransformPrecompute(cc, U0hatT, U1hatT, 0, scaleEnc, lEnc); - p.m_U0Pre = EvalLinearTransformPrecompute(cc, U0, U1, 1, scaleDec, lDec); + std::vector>> U0(slots, std::vector>(slots)); + std::vector>> U0hatT(slots, std::vector>(slots)); + for (uint32_t i = 0; i < slots; ++i) { + for (uint32_t j = 0; j < slots; ++j) { + U0[i][j] = ksiPows[(j * rotGroup[i]) & mmask]; + U0hatT[j][i] = std::conj(U0[i][j]); + } + } + p.m_U0Pre = 
EvalLinearTransformPrecompute(cc, U0, scaleDec, lDec); + p.m_U0hatTPre = EvalLinearTransformPrecompute(cc, U0hatT, scaleEnc, lEnc); } } else { - p.m_U0hatTPreFFT = EvalCoeffsToSlotsPrecompute(cc, ksiPows, rotGroup, false, scaleEnc, lEnc); p.m_U0PreFFT = EvalSlotsToCoeffsPrecompute(cc, ksiPows, rotGroup, false, scaleDec, lDec); + p.m_U0hatTPreFFT = EvalCoeffsToSlotsPrecompute(cc, ksiPows, rotGroup, false, scaleEnc, lEnc); } } @@ -404,11 +405,13 @@ Ciphertext FHECKKSRNS::EvalBootstrap(ConstCiphertext& cipher if (cryptoParams->GetKeySwitchTechnique() != HYBRID) OPENFHE_THROW("CKKS Bootstrapping only supported with Hybrid key switching."); + auto st = cryptoParams->GetScalingTechnique(); #if NATIVEINT == 128 if (st == FLEXIBLEAUTO || st == FLEXIBLEAUTOEXT) OPENFHE_THROW("128-bit CKKS Bootstrapping only supported for FIXEDMANUAL and FIXEDAUTO."); #endif + if (numIterations != 1 && numIterations != 2) OPENFHE_THROW("CKKS Bootstrapping only supported for 1 or 2 iterations."); @@ -428,28 +431,28 @@ Ciphertext FHECKKSRNS::EvalBootstrap(ConstCiphertext& cipher // Step 1: Get the input. uint32_t powerOfTwoModulus = 1 << precision; - // Step 2: Scale up by powerOfTwoModulus, and extend the modulus to powerOfTwoModulus * q. - // Note that we extend the modulus implicitly without any code calls because the value always stays 0. - // We multiply by powerOfTwoModulus, and leave the last CRT value to be 0 (mod powerOfTwoModulus). - auto ctScaledUp = cc->EvalMultNoCheck(ciphertext, powerOfTwoModulus); - ctScaledUp->SetLevel(L0 - ctScaledUp->GetElements()[0].GetNumOfElements()); - // Step 3: Bootstrap the initial ciphertext. - auto ctInitialBootstrap = cc->EvalBootstrap(ciphertext, numIterations - 1, precision); + auto ctInitialBootstrap = EvalBootstrap(ciphertext, numIterations - 1, precision); cc->GetScheme()->ModReduceInternalInPlace(ctInitialBootstrap, compositeDegree); // Step 4: Scale up by powerOfTwoModulus. cc->GetScheme()->MultByIntegerInPlace(ctInitialBootstrap, powerOfTwoModulus); + // If we start with more towers, than we obtain from bootstrapping, return the original ciphertext. + auto bootstrappingSizeQ = ctInitialBootstrap->GetElements()[0].GetNumOfElements(); + if (bootstrappingSizeQ <= initSizeQ) + return ciphertext->Clone(); + + // Step 2: Scale up by powerOfTwoModulus, and extend the modulus to powerOfTwoModulus * q. + // Note that we extend the modulus implicitly without any code calls because the value always stays 0. + auto ctScaledUp = ciphertext->Clone(); + // We multiply by powerOfTwoModulus, and leave the last CRT value to be 0 (mod powerOfTwoModulus). + cc->GetScheme()->MultByIntegerInPlace(ctScaledUp, powerOfTwoModulus); + ctScaledUp->SetLevel(L0 - ctScaledUp->GetElements()[0].GetNumOfElements()); + // Step 5: Mod-down to powerOfTwoModulus * q // We mod down, and leave the last CRT value to be 0 because it's divisible by powerOfTwoModulus. auto ctBootstrappedScaledDown = ctInitialBootstrap->Clone(); - auto bootstrappingSizeQ = ctBootstrappedScaledDown->GetElements()[0].GetNumOfElements(); - - // If we start with more towers, than we obtain from bootstrapping, return the original ciphertext. 
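// --------------------------------------------------------------------------
// Editor's note: illustrative sketch, not part of this patch. The two-iteration
// path above implements the usual "bootstrap the bootstrapping error"
// refinement: the first bootstrap output (m + e1) is scaled by 2^precision and
// compared against the scaled-up input, the difference 2^precision * e1 is
// bootstrapped on its own, and the result is scaled back down and subtracted,
// leaving an error of roughly e^2 instead of e. A toy plaintext model, with
// one bootstrap replaced by "add noise of size eps" and the scaled-up original
// ciphertext played by scale * m:
#include <cmath>
#include <cstdio>
#include <random>

int main() {
    std::mt19937 gen(1);
    const double eps = 1e-4;                               // single-bootstrap accuracy (~2^-precision)
    std::uniform_real_distribution<double> noise(-eps, eps);
    auto boot = [&](double x) { return x + noise(gen); };  // model of one bootstrap

    const double m     = 0.123456789;
    const double scale = 1.0 / eps;                        // plays the role of 2^precision

    double ct1     = boot(m);                              // first bootstrap: m + e1
    double err     = scale * ct1 - scale * m;              // isolate scale * e1
    double errBoot = boot(err);                            // bootstrap the (now O(1)) error
    double refined = ct1 - errBoot / scale;                // m - e2 / scale

    std::printf("one iteration : |error| = %.3e\n", std::abs(ct1 - m));
    std::printf("two iterations: |error| = %.3e\n", std::abs(refined - m));
    return 0;
}
// --------------------------------------------------------------------------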
- if (bootstrappingSizeQ <= initSizeQ) { - return ciphertext->Clone(); - } // TODO: YSP Can be removed for FLEXIBLE* scaling techniques as well as the closeness of 2^p to moduli is no longer needed if (st != COMPOSITESCALINGAUTO && st != COMPOSITESCALINGMANUAL) { @@ -462,7 +465,7 @@ Ciphertext FHECKKSRNS::EvalBootstrap(ConstCiphertext& cipher auto ctBootstrappingError = cc->EvalSub(ctBootstrappedScaledDown, ctScaledUp); // Step 8: Bootstrap the error. - auto ctBootstrappedError = cc->EvalBootstrap(ctBootstrappingError, 1, 0); + auto ctBootstrappedError = EvalBootstrap(ctBootstrappingError, 1, 0); cc->GetScheme()->ModReduceInternalInPlace(ctBootstrappedError, compositeDegree); // Step 9: Subtract the bootstrapped error from the initial bootstrap to get even lower error. @@ -526,42 +529,40 @@ Ciphertext FHECKKSRNS::EvalBootstrap(ConstCiphertext& cipher uint32_t N = cc->GetRingDimension(); if (compositeDegree > 1) { // RNS basis extension from level 0 RNS limbs to the raised RNS basis - auto& ctxtDCRT = raised->GetElements(); - ExtendCiphertext(ctxtDCRT, *cc, elementParamsRaisedPtr); - raised->SetLevel(L0 - ctxtDCRT[0].GetNumOfElements()); + auto& ctxtDCRTs = raised->GetElements(); + ExtendCiphertext(ctxtDCRTs, *cc, elementParamsRaisedPtr); + raised->SetLevel(L0 - ctxtDCRTs[0].GetNumOfElements()); } else { if (cryptoParams->GetSecretKeyDist() == SPARSE_ENCAPSULATED) { - auto evalKeyMap = cc->GetEvalAutomorphismKeyMap(raised->GetKeyTag()); + auto& evalKeyMap = cc->GetEvalAutomorphismKeyMap(raised->GetKeyTag()); // transform from a denser secret to a sparser one raised = KeySwitchSparse(raised, evalKeyMap.at(2 * N - 4)); // Only level 0 ciphertext used here. Other towers ignored to make CKKS bootstrapping faster. - auto& ctxtDCRT = raised->GetElements(); - for (auto& poly : ctxtDCRT) { - poly.SetFormat(COEFFICIENT); - DCRTPoly temp(elementParamsRaisedPtr, COEFFICIENT); - temp = poly.GetElementAtIndex(0); - temp.SetFormat(EVALUATION); - poly = std::move(temp); + auto& ctxtDCRTs = raised->GetElements(); + for (auto& dcrt : ctxtDCRTs) { + dcrt.SetFormat(COEFFICIENT); + DCRTPoly tmp(dcrt.GetElementAtIndex(0), elementParamsRaisedPtr); + tmp.SetFormat(EVALUATION); + dcrt = std::move(tmp); } - raised->SetLevel(L0 - ctxtDCRT[0].GetNumOfElements()); + raised->SetLevel(L0 - ctxtDCRTs[0].GetNumOfElements()); // go back to a denser secret algo->KeySwitchInPlace(raised, evalKeyMap.at(2 * N - 2)); } else { // Only level 0 ciphertext used here. Other towers ignored to make CKKS bootstrapping faster. 
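For readers following the reordered two-iteration flow above, the refinement is driven entirely through the public EvalBootstrap entry point. A minimal usage sketch, assuming a CryptoContext already configured with EvalBootstrapSetup and EvalBootstrapKeyGen; the precision value is a placeholder, not taken from this patch:

#include "openfhe.h"

using namespace lbcrypto;

// Two-iteration (iterative) CKKS bootstrapping: the ciphertext is bootstrapped
// once, the residual error is isolated, bootstrapped again, and subtracted,
// which is the path implemented in the hunk above.
Ciphertext<DCRTPoly> DoubleBootstrap(const CryptoContext<DCRTPoly>& cc,
                                     const Ciphertext<DCRTPoly>& ct) {
    constexpr uint32_t numIterations = 2;   // triggers the error-refinement branch
    constexpr uint32_t precision     = 17;  // assumed precision (bits) of a single bootstrap
    return cc->EvalBootstrap(ct, numIterations, precision);
}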
- auto& ctxtDCRT = raised->GetElements(); - for (auto& poly : ctxtDCRT) { - poly.SetFormat(COEFFICIENT); - DCRTPoly temp(elementParamsRaisedPtr, COEFFICIENT); - temp = poly.GetElementAtIndex(0); - temp.SetFormat(EVALUATION); - poly = std::move(temp); + auto& ctxtDCRTs = raised->GetElements(); + for (auto& dcrt : ctxtDCRTs) { + dcrt.SetFormat(COEFFICIENT); + DCRTPoly tmp(dcrt.GetElementAtIndex(0), elementParamsRaisedPtr); + tmp.SetFormat(EVALUATION); + dcrt = std::move(tmp); } - raised->SetLevel(L0 - ctxtDCRT[0].GetNumOfElements()); + raised->SetLevel(L0 - ctxtDCRTs[0].GetNumOfElements()); } } @@ -602,13 +603,12 @@ Ciphertext FHECKKSRNS::EvalBootstrap(ConstCiphertext& cipher cc->EvalMultInPlace(raised, pre * (1.0 / (k * N))); // no linear transformations are needed for Chebyshev series as the range has been normalized to [-1,1] - double coeffLowerBound = -1; - double coeffUpperBound = 1; + double coeffLowerBound = -1.0; + double coeffUpperBound = 1.0; auto& p = GetBootPrecom(slots); - bool isLTBootstrap = - (p.m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET] == 1) && (p.m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET] == 1); + bool isLTBootstrap = (p.m_paramsEnc.lvlb == 1) && (p.m_paramsDec.lvlb == 1); Ciphertext ctxtDec; if (slots == cc->GetCyclotomicOrder() / 4) { @@ -631,11 +631,11 @@ Ciphertext FHECKKSRNS::EvalBootstrap(ConstCiphertext& cipher auto ctxtEnc = (isLTBootstrap) ? EvalLinearTransform(p.m_U0hatTPre, raised) : EvalCoeffsToSlots(p.m_U0hatTPreFFT, raised); - auto evalKeyMap = cc->GetEvalAutomorphismKeyMap(ctxtEnc->GetKeyTag()); - auto conj = Conjugate(ctxtEnc, evalKeyMap); - auto ctxtEncI = cc->EvalSub(ctxtEnc, conj); + auto& evalKeyMap = cc->GetEvalAutomorphismKeyMap(ctxtEnc->GetKeyTag()); + auto conj = Conjugate(ctxtEnc, evalKeyMap); + auto ctxtEncI = cc->EvalSub(ctxtEnc, conj); + cc->EvalAddInPlace(ctxtEnc, conj); algo->MultByMonomialInPlace(ctxtEncI, 3 * slots); - cc->EvalAddInPlaceNoCheck(ctxtEnc, conj); if (st == FIXEDMANUAL) { while (ctxtEnc->GetNoiseScaleDeg() > 1) { @@ -655,8 +655,8 @@ Ciphertext FHECKKSRNS::EvalBootstrap(ConstCiphertext& cipher //------------------------------------------------------------------------------ // Evaluate Chebyshev series for the sine wave - ctxtEnc = cc->EvalChebyshevSeries(ctxtEnc, coefficients, coeffLowerBound, coeffUpperBound); - ctxtEncI = cc->EvalChebyshevSeries(ctxtEncI, coefficients, coeffLowerBound, coeffUpperBound); + ctxtEnc = algo->EvalChebyshevSeries(ctxtEnc, coefficients, coeffLowerBound, coeffUpperBound); + ctxtEncI = algo->EvalChebyshevSeries(ctxtEncI, coefficients, coeffLowerBound, coeffUpperBound); // Double-angle iterations if (st != FIXEDMANUAL) { @@ -707,8 +707,9 @@ Ciphertext FHECKKSRNS::EvalBootstrap(ConstCiphertext& cipher // Running PartialSum //------------------------------------------------------------------------------ - for (uint32_t j = 1; j < N / (2 * slots); j <<= 1) - cc->EvalAddInPlaceNoCheck(raised, cc->EvalRotate(raised, j * slots)); + const auto limit = N / (2 * slots); + for (uint32_t j = 1; j < limit; j <<= 1) + cc->EvalAddInPlace(raised, cc->EvalRotate(raised, j * slots)); #ifdef BOOTSTRAPTIMING TIC(t); @@ -723,9 +724,8 @@ Ciphertext FHECKKSRNS::EvalBootstrap(ConstCiphertext& cipher auto ctxtEnc = (isLTBootstrap) ? 
EvalLinearTransform(p.m_U0hatTPre, raised) : EvalCoeffsToSlots(p.m_U0hatTPreFFT, raised); - auto evalKeyMap = cc->GetEvalAutomorphismKeyMap(ctxtEnc->GetKeyTag()); - auto conj = Conjugate(ctxtEnc, evalKeyMap); - cc->EvalAddInPlaceNoCheck(ctxtEnc, conj); + auto& evalKeyMap = cc->GetEvalAutomorphismKeyMap(ctxtEnc->GetKeyTag()); + cc->EvalAddInPlace(ctxtEnc, Conjugate(ctxtEnc, evalKeyMap)); if (st == FIXEDMANUAL) { while (ctxtEnc->GetNoiseScaleDeg() > 1) { @@ -750,7 +750,7 @@ Ciphertext FHECKKSRNS::EvalBootstrap(ConstCiphertext& cipher //------------------------------------------------------------------------------ // Evaluate Chebyshev series for the sine wave - ctxtEnc = cc->EvalChebyshevSeries(ctxtEnc, coefficients, coeffLowerBound, coeffUpperBound); + ctxtEnc = algo->EvalChebyshevSeries(ctxtEnc, coefficients, coeffLowerBound, coeffUpperBound); // Double-angle iterations if (st != FIXEDMANUAL) @@ -808,24 +808,20 @@ Ciphertext FHECKKSRNS::EvalBootstrap(ConstCiphertext& cipher //------------------------------------------------------------------------------ std::vector FHECKKSRNS::FindBootstrapRotationIndices(uint32_t slots, uint32_t M) { - auto& p = GetBootPrecom(slots); - bool isLTBootstrap = - (p.m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET] == 1) && (p.m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET] == 1); + const auto& p = GetBootPrecom(slots); - std::vector fullIndexList; - if (isLTBootstrap) { - fullIndexList = FindLinearTransformRotationIndices(slots, M); + // Remove possible duplicates and remove automorphisms corresponding to 0 and M/4 by using std::set + std::set s; + if (p.m_paramsEnc.lvlb == 1 && p.m_paramsDec.lvlb == 1) { + auto tmp = FindLinearTransformRotationIndices(slots, M); + s.insert(tmp.begin(), tmp.end()); } else { - fullIndexList = FindCoeffsToSlotsRotationIndices(slots, M); - - std::vector indexListStC{FindSlotsToCoeffsRotationIndices(slots, M)}; - fullIndexList.insert(fullIndexList.end(), std::make_move_iterator(indexListStC.begin()), - std::make_move_iterator(indexListStC.end())); + auto tmp = FindCoeffsToSlotsRotationIndices(slots, M); + s.insert(tmp.begin(), tmp.end()); + tmp = FindSlotsToCoeffsRotationIndices(slots, M); + s.insert(tmp.begin(), tmp.end()); } - - // Remove possible duplicates and remove automorphisms corresponding to 0 and M/4 by using std::set - std::set s(fullIndexList.begin(), fullIndexList.end()); s.erase(0); s.erase(M / 4); @@ -837,29 +833,28 @@ std::vector FHECKKSRNS::FindBootstrapRotationIndices(uint32_t slots, ui // This method completely depends on FindBootstrapRotationIndices() to do that. std::vector FHECKKSRNS::FindLinearTransformRotationIndices(uint32_t slots, uint32_t M) { // Computing the baby-step g and the giant-step h. - auto& p = GetBootPrecom(slots); - uint32_t g = (p.m_dim1 == 0) ? static_cast(std::ceil(std::sqrt(slots))) : p.m_dim1; - uint32_t h = static_cast(std::ceil(static_cast(slots) / g)); + const auto& p = GetBootPrecom(slots); + const uint32_t g = (p.m_paramsEnc.g == 0) ? 
std::ceil(std::sqrt(slots)) : p.m_paramsEnc.g; + const uint32_t h = std::ceil(static_cast(slots) / g); - std::vector indexList; // To avoid overflowing uint32_t variables, we do some math operations below in a specific order // computing all indices for baby-step giant-step procedure - int32_t indexListSz = static_cast(g) + h + M - 2; + const int32_t indexListSz = static_cast(g) + h + M - 2; if (indexListSz < 0) OPENFHE_THROW("indexListSz can not be negative"); + std::vector indexList; indexList.reserve(indexListSz); - for (size_t i = 1; i <= g; ++i) + + for (uint32_t i = 1; i <= g; ++i) indexList.emplace_back(i); - for (size_t i = 2; i < h; ++i) - indexList.emplace_back(g * i); + for (uint32_t i = 2; i < h; ++i) + indexList.emplace_back(i * g); // additional automorphisms are needed for sparse bootstrapping - uint32_t m = slots * 4; - if (m != M) { - for (size_t j = 1; j < M / m; j <<= 1) { + if (uint32_t m = slots * 4; m != M) { + for (uint32_t j = 1; j < M / m; j <<= 1) indexList.emplace_back(j * slots); - } } return indexList; @@ -869,114 +864,86 @@ std::vector FHECKKSRNS::FindLinearTransformRotationIndices(uint32_t sl // so it DOES NOT remove possible duplicates and automorphisms corresponding to 0 and M/4. // This method completely depends on FindBootstrapRotationIndices() to do that. std::vector FHECKKSRNS::FindCoeffsToSlotsRotationIndices(uint32_t slots, uint32_t M) { - auto& p = GetBootPrecom(slots); + const auto& p = GetBootPrecom(slots).m_paramsEnc; - uint32_t levelBudget = p.m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET]; - uint32_t layersCollapse = p.m_paramsEnc[CKKS_BOOT_PARAMS::LAYERS_COLL]; - uint32_t remCollapse = p.m_paramsEnc[CKKS_BOOT_PARAMS::LAYERS_REM]; - uint32_t numRotations = p.m_paramsEnc[CKKS_BOOT_PARAMS::NUM_ROTATIONS]; - uint32_t b = p.m_paramsEnc[CKKS_BOOT_PARAMS::BABY_STEP]; - uint32_t g = p.m_paramsEnc[CKKS_BOOT_PARAMS::GIANT_STEP]; - uint32_t numRotationsRem = p.m_paramsEnc[CKKS_BOOT_PARAMS::NUM_ROTATIONS_REM]; - uint32_t bRem = p.m_paramsEnc[CKKS_BOOT_PARAMS::BABY_STEP_REM]; - uint32_t gRem = p.m_paramsEnc[CKKS_BOOT_PARAMS::GIANT_STEP_REM]; - - uint32_t flagRem = (remCollapse == 0) ? 
0 : 1; - - std::vector indexList; // To avoid overflowing uint32_t variables, we do some math operations below in a specific order // Computing all indices for baby-step giant-step procedure for encoding and decoding - int32_t indexListSz = static_cast(b) + g - 2 + bRem + gRem - 2 + 1 + M; + const int32_t indexListSz = static_cast(p.b) + p.g - 2 + p.bRem + p.gRem - 2 + 1 + M; if (indexListSz < 0) OPENFHE_THROW("indexListSz can not be negative"); + + std::vector indexList; indexList.reserve(indexListSz); - for (int32_t s = static_cast(levelBudget) - 1; s >= static_cast(flagRem); --s) { - const uint32_t scalingFactor = 1U << ((s - flagRem) * layersCollapse + remCollapse); - const int32_t halfRots = (1 - (numRotations + 1) / 2); - for (int32_t j = halfRots; j < static_cast(g + halfRots); ++j) { - indexList.emplace_back(ReduceRotation(j * scalingFactor, slots)); - } - for (size_t i = 0; i < b; i++) { - indexList.emplace_back(ReduceRotation((g * i) * scalingFactor, M / 4)); - } + // additional automorphisms are needed for sparse bootstrapping + if (uint32_t m = slots * 4; m != M) { + for (uint32_t j = 1; j < M / m; j <<= 1) + indexList.emplace_back(j * slots); } - if (flagRem) { - const int32_t halfRots = (1 - (numRotationsRem + 1) / 2); - for (int32_t j = halfRots; j < static_cast(gRem + halfRots); ++j) { - indexList.emplace_back(ReduceRotation(j, slots)); - } - for (size_t i = 0; i < bRem; i++) { - indexList.emplace_back(ReduceRotation(gRem * i, M / 4)); - } + M >>= 2; + const int32_t flagRem = (p.remCollapse == 0) ? 0 : 1; + const int32_t halfRots = 1 - (p.numRotations + 1) / 2; + const int32_t halfRotsg = halfRots + p.g; + for (int32_t s = -1 + p.lvlb; s >= flagRem; --s) { + const uint32_t scalingFactor = 1U << ((s - flagRem) * p.layersCollapse + p.remCollapse); + for (int32_t j = halfRots; j < halfRotsg; ++j) + indexList.emplace_back(ReduceRotation(j * scalingFactor, slots)); + for (uint32_t i = 0; i < p.b; ++i) + indexList.emplace_back(ReduceRotation(i * p.g * scalingFactor, M)); } - uint32_t m = slots * 4; - // additional automorphisms are needed for sparse bootstrapping - if (m != M) { - for (size_t j = 1; j < M / m; j <<= 1) { - indexList.emplace_back(j * slots); - } + if (flagRem == 1) { + const int32_t halfRotsRem = (1 - (p.numRotationsRem + 1) / 2); + const int32_t halfRotsRemg = halfRotsRem + p.gRem; + for (int32_t j = halfRotsRem; j < halfRotsRemg; ++j) + indexList.emplace_back(ReduceRotation(j, slots)); + for (uint32_t i = 0; i < p.bRem; ++i) + indexList.emplace_back(ReduceRotation(i * p.gRem, M)); } return indexList; } std::vector FHECKKSRNS::FindSlotsToCoeffsRotationIndices(uint32_t slots, uint32_t M) { - auto& p = GetBootPrecom(slots); + const auto& p = GetBootPrecom(slots).m_paramsDec; - uint32_t levelBudget = p.m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET]; - uint32_t layersCollapse = p.m_paramsDec[CKKS_BOOT_PARAMS::LAYERS_COLL]; - uint32_t remCollapse = p.m_paramsDec[CKKS_BOOT_PARAMS::LAYERS_REM]; - uint32_t numRotations = p.m_paramsDec[CKKS_BOOT_PARAMS::NUM_ROTATIONS]; - uint32_t b = p.m_paramsDec[CKKS_BOOT_PARAMS::BABY_STEP]; - uint32_t g = p.m_paramsDec[CKKS_BOOT_PARAMS::GIANT_STEP]; - uint32_t numRotationsRem = p.m_paramsDec[CKKS_BOOT_PARAMS::NUM_ROTATIONS_REM]; - uint32_t bRem = p.m_paramsDec[CKKS_BOOT_PARAMS::BABY_STEP_REM]; - uint32_t gRem = p.m_paramsDec[CKKS_BOOT_PARAMS::GIANT_STEP_REM]; - - uint32_t flagRem = (remCollapse == 0) ? 
0 : 1; - if (levelBudget < flagRem) - OPENFHE_THROW("levelBudget can not be less than flagRem"); - - std::vector indexList; // To avoid overflowing uint32_t variables, we do some math operations below in a specific order // Computing all indices for baby-step giant-step procedure for encoding and decoding - int32_t indexListSz = static_cast(b) + g - 2 + bRem + gRem - 2 + 1 + M; + const int32_t indexListSz = static_cast(p.b) + p.g - 2 + p.bRem + p.gRem - 2 + 1 + M; if (indexListSz < 0) OPENFHE_THROW("indexListSz can not be negative"); + + std::vector indexList; indexList.reserve(indexListSz); - for (size_t s = 0; s < (levelBudget - flagRem); ++s) { - const uint32_t scalingFactor = 1U << (s * layersCollapse); - const int32_t halfRots = (1 - (numRotations + 1) / 2); - for (int32_t j = halfRots; j < static_cast(g + halfRots); ++j) { - indexList.emplace_back(ReduceRotation(j * scalingFactor, M / 4)); - } - for (size_t i = 0; i < b; ++i) { - indexList.emplace_back(ReduceRotation((g * i) * scalingFactor, M / 4)); - } + // additional automorphisms are needed for sparse bootstrapping + if (uint32_t m = slots * 4; m != M) { + for (uint32_t j = 1; j < M / m; j <<= 1) + indexList.emplace_back(j * slots); } - if (flagRem) { - uint32_t s = levelBudget - flagRem; - const uint32_t scalingFactor = 1U << (s * layersCollapse); - const int32_t halfRots = (1 - (numRotationsRem + 1) / 2); - for (int32_t j = halfRots; j < static_cast(gRem + halfRots); ++j) { - indexList.emplace_back(ReduceRotation(j * scalingFactor, M / 4)); - } - for (size_t i = 0; i < bRem; ++i) { - indexList.emplace_back(ReduceRotation((gRem * i) * scalingFactor, M / 4)); - } + M >>= 2; + const uint32_t flagRem = (p.remCollapse == 0) ? 0 : 1; + const uint32_t smax = p.lvlb - flagRem; + const int32_t halfRots = (1 - (p.numRotations + 1) / 2); + const int32_t halfRotsg = halfRots + p.g; + for (uint32_t s = 0; s < smax; ++s) { + const uint32_t scalingFactor = 1U << (s * p.layersCollapse); + for (int32_t j = halfRots; j < halfRotsg; ++j) + indexList.emplace_back(ReduceRotation(j * scalingFactor, M)); + for (uint32_t i = 0; i < p.b; ++i) + indexList.emplace_back(ReduceRotation(i * p.g * scalingFactor, M)); } - uint32_t m = slots * 4; - // additional automorphisms are needed for sparse bootstrapping - if (m != M) { - for (size_t j = 1; j < M / m; j <<= 1) { - indexList.emplace_back(j * slots); - } + if (flagRem == 1) { + const uint32_t scalingFactor = 1U << (smax * p.layersCollapse); + const int32_t halfRotsRem = (1 - (p.numRotationsRem + 1) / 2); + const int32_t halfRotsRemg = halfRotsRem + p.gRem; + for (int32_t j = halfRotsRem; j < halfRotsRemg; ++j) + indexList.emplace_back(ReduceRotation(j * scalingFactor, M)); + for (uint32_t i = 0; i < p.bRem; ++i) + indexList.emplace_back(ReduceRotation(i * p.gRem * scalingFactor, M)); } return indexList; @@ -989,8 +956,8 @@ std::vector FHECKKSRNS::FindSlotsToCoeffsRotationIndices(uint32_t slot std::vector FHECKKSRNS::EvalLinearTransformPrecompute( const CryptoContextImpl& cc, const std::vector>>& A, double scale, uint32_t L) const { - uint32_t slots = A.size(); - if (slots != A[0].size()) + const int32_t slots = A.size(); + if (slots != static_cast(A[0].size())) OPENFHE_THROW("The matrix passed to EvalLTPrecompute is not square"); // make sure the plaintext is created only with the necessary amount of moduli @@ -1017,29 +984,20 @@ std::vector FHECKKSRNS::EvalLinearTransformPrecompute( } auto elementParamsPtr = std::make_shared>(cc.GetCyclotomicOrder(), moduli, roots); - // Computing the baby-step bStep and 
the giant-step gStep. - auto& p = GetBootPrecom(slots); - int bStep = (p.m_dim1 == 0) ? std::ceil(std::sqrt(slots)) : p.m_dim1; - int gStep = std::ceil(static_cast(slots) / bStep); + auto g = GetBootPrecom(slots).m_paramsEnc.g; + + const int32_t step = (g == 0) ? std::ceil(std::sqrt(slots)) : g; std::vector result(slots); -// parallelizing the loop (below) with OMP causes a segfault on MinGW -// see https://github.com/openfheorg/openfhe-development/issues/176 #if !defined(__MINGW32__) && !defined(__MINGW64__) - #pragma omp parallel for + #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(slots)) #endif - for (int j = 0; j < gStep; j++) { - int offset = -bStep * j; - for (int i = 0; i < bStep; i++) { - if (bStep * j + i < static_cast(slots)) { - auto diag = ExtractShiftedDiagonal(A, bStep * j + i); - for (uint32_t k = 0; k < diag.size(); k++) - diag[k] *= scale; - - result[bStep * j + i] = - MakeAuxPlaintext(cc, elementParamsPtr, Rotate(diag, offset), 1, towersToDrop, diag.size()); - } - } + for (int32_t ji = 0; ji < slots; ++ji) { + auto diag = ExtractShiftedDiagonal(A, ji); + for (auto& d : diag) + d *= scale; + result[ji] = + MakeAuxPlaintext(cc, elementParamsPtr, Rotate(diag, -step * (ji / step)), 1, towersToDrop, diag.size()); } return result; } @@ -1072,33 +1030,27 @@ std::vector FHECKKSRNS::EvalLinearTransformPrecompute( } auto elementParamsPtr = std::make_shared>(cc.GetCyclotomicOrder(), moduli, roots); - uint32_t slots = A.size(); + const int32_t slots = static_cast(A.size()); - // Computing the baby-step bStep and the giant-step gStep. - auto& p = GetBootPrecom(slots); - int bStep = (p.m_dim1 == 0) ? ceil(sqrt(slots)) : p.m_dim1; - int gStep = ceil(static_cast(slots) / bStep); + auto g = GetBootPrecom(slots).m_paramsEnc.g; + + const int32_t step = (g == 0) ? 
std::ceil(std::sqrt(slots)) : g; std::vector result(slots); if (orientation == 0) { // vertical concatenation - used during homomorphic encoding - // #pragma omp parallel for - for (int j = 0; j < gStep; j++) { - int offset = -bStep * j; - for (int i = 0; i < bStep; i++) { - if (bStep * j + i < static_cast(slots)) { - auto vecA = ExtractShiftedDiagonal(A, bStep * j + i); - auto vecB = ExtractShiftedDiagonal(B, bStep * j + i); - - vecA.insert(vecA.end(), vecB.begin(), vecB.end()); - for (uint32_t k = 0; k < vecA.size(); k++) - vecA[k] *= scale; - - result[bStep * j + i] = - MakeAuxPlaintext(cc, elementParamsPtr, Rotate(vecA, offset), 1, towersToDrop, vecA.size()); - } - } +#if !defined(__MINGW32__) && !defined(__MINGW64__) + #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(slots)) +#endif + for (int32_t ji = 0; ji < slots; ++ji) { + auto vecA = ExtractShiftedDiagonal(A, ji); + auto vecB = ExtractShiftedDiagonal(B, ji); + vecA.insert(vecA.end(), vecB.begin(), vecB.end()); + for (auto& v : vecA) + v *= scale; + result[ji] = + MakeAuxPlaintext(cc, elementParamsPtr, Rotate(vecA, -step * (ji / step)), 1, towersToDrop, vecA.size()); } } else { @@ -1106,27 +1058,23 @@ std::vector FHECKKSRNS::EvalLinearTransformPrecompute( std::vector>> newA(slots); // A and B are concatenated horizontally - for (uint32_t i = 0; i < slots; ++i) { + for (int32_t i = 0; i < slots; ++i) { newA[i].reserve(A[i].size() + B[i].size()); newA[i].insert(newA[i].end(), A[i].begin(), A[i].end()); newA[i].insert(newA[i].end(), B[i].begin(), B[i].end()); } -#pragma omp parallel for - for (int j = 0; j < gStep; j++) { - int offset = -bStep * j; - for (int i = 0; i < bStep; i++) { - if (bStep * j + i < static_cast(slots)) { - // shifted diagonal is computed for rectangular map newA of dimension - // slots x 2*slots - auto vec = ExtractShiftedDiagonal(newA, bStep * j + i); - for (uint32_t k = 0; k < vec.size(); k++) - vec[k] *= scale; - - result[bStep * j + i] = - MakeAuxPlaintext(cc, elementParamsPtr, Rotate(vec, offset), 1, towersToDrop, vec.size()); - } - } +#if !defined(__MINGW32__) && !defined(__MINGW64__) + #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(slots)) +#endif + for (int32_t ji = 0; ji < slots; ++ji) { + // shifted diagonal is computed for rectangular map newA of dimension + // slots x 2*slots + auto vec = ExtractShiftedDiagonal(newA, ji); + for (auto& v : vec) + v *= scale; + result[ji] = + MakeAuxPlaintext(cc, elementParamsPtr, Rotate(vec, -step * (ji / step)), 1, towersToDrop, vec.size()); } } @@ -1136,33 +1084,21 @@ std::vector FHECKKSRNS::EvalLinearTransformPrecompute( std::vector> FHECKKSRNS::EvalCoeffsToSlotsPrecompute( const CryptoContextImpl& cc, const std::vector>& A, const std::vector& rotGroup, bool flag_i, double scale, uint32_t L) const { - uint32_t slots = rotGroup.size(); + const uint32_t slots = rotGroup.size(); - auto& p = GetBootPrecom(slots); + const auto& p = GetBootPrecom(slots).m_paramsEnc; - int32_t levelBudget = p.m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET]; - int32_t layersCollapse = p.m_paramsEnc[CKKS_BOOT_PARAMS::LAYERS_COLL]; - int32_t remCollapse = p.m_paramsEnc[CKKS_BOOT_PARAMS::LAYERS_REM]; - int32_t numRotations = p.m_paramsEnc[CKKS_BOOT_PARAMS::NUM_ROTATIONS]; - int32_t b = p.m_paramsEnc[CKKS_BOOT_PARAMS::BABY_STEP]; - int32_t g = p.m_paramsEnc[CKKS_BOOT_PARAMS::GIANT_STEP]; - int32_t numRotationsRem = p.m_paramsEnc[CKKS_BOOT_PARAMS::NUM_ROTATIONS_REM]; - int32_t bRem = p.m_paramsEnc[CKKS_BOOT_PARAMS::BABY_STEP_REM]; - int32_t 
gRem = p.m_paramsEnc[CKKS_BOOT_PARAMS::GIANT_STEP_REM]; + // result is the rotated plaintext version of the coefficients + std::vector> result(p.lvlb, std::vector(p.numRotations)); int32_t stop = -1; int32_t flagRem = 0; - - if (remCollapse != 0) { + if (p.remCollapse != 0) { stop = 0; flagRem = 1; - } - // result is the rotated plaintext version of the coefficients - std::vector> result(levelBudget, std::vector(numRotations)); - if (flagRem == 1 && levelBudget >= 1) { // remainder corresponds to index 0 in encoding and to last index in decoding - result[0].resize(numRotationsRem); + result[0].resize(p.numRotationsRem); } // make sure the plaintext is created only with the necessary amount of moduli @@ -1171,11 +1107,11 @@ std::vector> FHECKKSRNS::EvalCoeffsToSlotsPrecomp auto elementParams = *(cryptoParams->GetElementParams()); - uint32_t towersToDrop = (L == 0) ? 0 : elementParams.GetParams().size() - L - compositeDegree * levelBudget; + uint32_t towersToDrop = (L == 0) ? 0 : elementParams.GetParams().size() - L - compositeDegree * p.lvlb; for (uint32_t i = 0; i < towersToDrop; ++i) elementParams.PopLastParam(); - uint32_t level0 = towersToDrop + compositeDegree * (levelBudget - 1); + uint32_t level0 = towersToDrop + compositeDegree * (p.lvlb - 1); auto paramsQ = elementParams.GetParams(); uint32_t sizeQ = paramsQ.size(); @@ -1188,7 +1124,6 @@ std::vector> FHECKKSRNS::EvalCoeffsToSlotsPrecomp moduli[i] = paramsQ[i]->GetModulus(); roots[i] = paramsQ[i]->GetRootOfUnity(); } - for (uint32_t i = 0; i < sizeP; ++i) { moduli[sizeQ + i] = paramsP[i]->GetModulus(); roots[sizeQ + i] = paramsP[i]->GetRootOfUnity(); @@ -1196,8 +1131,8 @@ std::vector> FHECKKSRNS::EvalCoeffsToSlotsPrecomp // we need to pre-compute the plaintexts in the extended basis P*Q uint32_t M = cc.GetCyclotomicOrder(); - std::vector>> paramsVector(levelBudget - stop); - for (int32_t s = levelBudget - 1; s >= stop; s--) { + std::vector>> paramsVector(p.lvlb - stop); + for (int32_t s = -1 + p.lvlb; s >= stop; --s) { paramsVector[s - stop] = std::make_shared>(M, moduli, roots); for (uint32_t j = 0; j < compositeDegree; ++j, --sizeQ) { moduli.erase(moduli.begin() + sizeQ - 1); @@ -1205,52 +1140,48 @@ std::vector> FHECKKSRNS::EvalCoeffsToSlotsPrecomp } } - if (slots == M / 4) { + if (uint32_t M4 = M / 4; slots == M4) { //------------------------------------------------------------------------------ // fully-packed mode //------------------------------------------------------------------------------ - auto coeff = CoeffEncodingCollapse(A, rotGroup, levelBudget, flag_i); + auto coeff = CoeffEncodingCollapse(A, rotGroup, p.lvlb, flag_i); - for (int32_t s = levelBudget - 1; s > stop; s--) { - for (int32_t i = 0; i < b; i++) { + for (int32_t s = -1 + p.lvlb; s > stop; --s) { + const int32_t rotScale = (1 << ((s - flagRem) * p.layersCollapse + p.remCollapse)) * p.g; + const uint32_t limit = p.b * p.g; #if !defined(__MINGW32__) && !defined(__MINGW64__) - #pragma omp parallel for + #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(limit)) #endif - for (int32_t j = 0; j < g; j++) { - if (g * i + j != static_cast(numRotations)) { - uint32_t rot = - ReduceRotation(-g * i * (1 << ((s - flagRem) * layersCollapse + remCollapse)), slots); - if ((flagRem == 0) && (s == stop + 1)) { - // do the scaling only at the last set of coefficients - for (uint32_t k = 0; k < slots; k++) { - coeff[s][g * i + j][k] *= scale; - } - } - - auto rotateTemp = Rotate(coeff[s][g * i + j], rot); - - result[s][g * i + j] = MakeAuxPlaintext(cc, 
paramsVector[s - stop], rotateTemp, 1, - level0 - compositeDegree * s, rotateTemp.size()); + for (uint32_t ij = 0; ij < limit; ++ij) { + if (ij != p.numRotations) { + if ((flagRem == 0) && (s == stop + 1)) { + // do the scaling only at the last set of coefficients + for (auto& c : coeff[s][ij]) + c *= scale; } + + auto rot = Rotate(coeff[s][ij], ReduceRotation(-rotScale * (ij / p.g), slots)); + + result[s][ij] = + MakeAuxPlaintext(cc, paramsVector[s - stop], rot, 1, level0 - compositeDegree * s, rot.size()); } } } - if (flagRem) { - for (int32_t i = 0; i < bRem; i++) { -#pragma omp parallel for - for (int32_t j = 0; j < gRem; j++) { - if (gRem * i + j != static_cast(numRotationsRem)) { - uint32_t rot = ReduceRotation(-gRem * i, slots); - for (uint32_t k = 0; k < slots; k++) { - coeff[stop][gRem * i + j][k] *= scale; - } + if (flagRem == 1) { + const uint32_t limit = p.bRem * p.gRem; +#if !defined(__MINGW32__) && !defined(__MINGW64__) + #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(limit)) +#endif + for (uint32_t ij = 0; ij < limit; ++ij) { + if (ij != p.numRotationsRem) { + for (auto& c : coeff[stop][ij]) + c *= scale; - auto rotateTemp = Rotate(coeff[stop][gRem * i + j], rot); - result[stop][gRem * i + j] = - MakeAuxPlaintext(cc, paramsVector[0], rotateTemp, 1, level0, rotateTemp.size()); - } + auto rot = Rotate(coeff[stop][ij], ReduceRotation(-p.gRem * (ij / p.gRem), slots)); + + result[stop][ij] = MakeAuxPlaintext(cc, paramsVector[0], rot, 1, level0, rot.size()); } } } @@ -1260,55 +1191,52 @@ std::vector> FHECKKSRNS::EvalCoeffsToSlotsPrecomp // sparsely-packed mode //------------------------------------------------------------------------------ - auto coeff = CoeffEncodingCollapse(A, rotGroup, levelBudget, false); - auto coeffi = CoeffEncodingCollapse(A, rotGroup, levelBudget, true); + auto coeff = CoeffEncodingCollapse(A, rotGroup, p.lvlb, false); + auto coeffi = CoeffEncodingCollapse(A, rotGroup, p.lvlb, true); - for (int32_t s = levelBudget - 1; s > stop; s--) { - for (int32_t i = 0; i < b; i++) { + for (int32_t s = -1 + p.lvlb; s > stop; --s) { + const int32_t rotScale = (1 << ((s - flagRem) * p.layersCollapse + p.remCollapse)) * p.g; + const uint32_t limit = p.b * p.g; #if !defined(__MINGW32__) && !defined(__MINGW64__) - #pragma omp parallel for + #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(limit)) #endif - for (int32_t j = 0; j < g; j++) { - if (g * i + j != static_cast(numRotations)) { - uint32_t rot = - ReduceRotation(-g * i * (1 << ((s - flagRem) * layersCollapse + remCollapse)), M / 4); - // concatenate the coefficients horizontally on their third dimension, which corresponds to the # of slots - auto clearTemp = coeff[s][g * i + j]; - auto clearTempi = coeffi[s][g * i + j]; - clearTemp.insert(clearTemp.end(), clearTempi.begin(), clearTempi.end()); - if ((flagRem == 0) && (s == stop + 1)) { - // do the scaling only at the last set of coefficients - for (uint32_t k = 0; k < clearTemp.size(); k++) { - clearTemp[k] *= scale; - } - } - - auto rotateTemp = Rotate(clearTemp, rot); - result[s][g * i + j] = MakeAuxPlaintext(cc, paramsVector[s - stop], rotateTemp, 1, - level0 - compositeDegree * s, rotateTemp.size()); + for (uint32_t ij = 0; ij < limit; ++ij) { + if (ij != p.numRotations) { + // concatenate the coefficients horizontally on their third dimension, which corresponds to the # of slots + auto clearTmp = coeff[s][ij]; + auto& clearTmpi = coeffi[s][ij]; + clearTmp.insert(clearTmp.end(), clearTmpi.begin(), 
clearTmpi.end()); + if ((flagRem == 0) && (s == stop + 1)) { + // do the scaling only at the last set of coefficients + for (auto& c : clearTmp) + c *= scale; } + + auto rot = Rotate(clearTmp, ReduceRotation(-rotScale * (ij / p.g), M4)); + + result[s][ij] = + MakeAuxPlaintext(cc, paramsVector[s - stop], rot, 1, level0 - compositeDegree * s, rot.size()); } } } - if (flagRem) { - for (int32_t i = 0; i < bRem; i++) { -#pragma omp parallel for - for (int32_t j = 0; j < gRem; j++) { - if (gRem * i + j != static_cast(numRotationsRem)) { - uint32_t rot = ReduceRotation(-gRem * i, M / 4); - // concatenate the coefficients on their third dimension, which corresponds to the # of slots - auto clearTemp = coeff[stop][gRem * i + j]; - auto clearTempi = coeffi[stop][gRem * i + j]; - clearTemp.insert(clearTemp.end(), clearTempi.begin(), clearTempi.end()); - for (uint32_t k = 0; k < clearTemp.size(); k++) { - clearTemp[k] *= scale; - } - - auto rotateTemp = Rotate(clearTemp, rot); - result[stop][gRem * i + j] = - MakeAuxPlaintext(cc, paramsVector[0], rotateTemp, 1, level0, rotateTemp.size()); - } + if (flagRem == 1) { + const uint32_t limit = p.bRem * p.gRem; +#if !defined(__MINGW32__) && !defined(__MINGW64__) + #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(limit)) +#endif + for (uint32_t ij = 0; ij < limit; ++ij) { + if (ij != p.numRotationsRem) { + // concatenate the coefficients on their third dimension, which corresponds to the # of slots + auto clearTmp = coeff[stop][ij]; + auto& clearTmpi = coeffi[stop][ij]; + clearTmp.insert(clearTmp.end(), clearTmpi.begin(), clearTmpi.end()); + for (auto& c : clearTmp) + c *= scale; + + auto rot = Rotate(clearTmp, ReduceRotation(-p.gRem * (ij / p.gRem), M4)); + + result[stop][ij] = MakeAuxPlaintext(cc, paramsVector[0], rot, 1, level0, rot.size()); } } } @@ -1319,27 +1247,17 @@ std::vector> FHECKKSRNS::EvalCoeffsToSlotsPrecomp std::vector> FHECKKSRNS::EvalSlotsToCoeffsPrecompute( const CryptoContextImpl& cc, const std::vector>& A, const std::vector& rotGroup, bool flag_i, double scale, uint32_t L) const { - uint32_t slots = rotGroup.size(); + const uint32_t slots = rotGroup.size(); - auto& p = GetBootPrecom(slots); - - int32_t levelBudget = p.m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET]; - int32_t layersCollapse = p.m_paramsDec[CKKS_BOOT_PARAMS::LAYERS_COLL]; - int32_t remCollapse = p.m_paramsDec[CKKS_BOOT_PARAMS::LAYERS_REM]; - int32_t numRotations = p.m_paramsDec[CKKS_BOOT_PARAMS::NUM_ROTATIONS]; - int32_t b = p.m_paramsDec[CKKS_BOOT_PARAMS::BABY_STEP]; - int32_t g = p.m_paramsDec[CKKS_BOOT_PARAMS::GIANT_STEP]; - int32_t numRotationsRem = p.m_paramsDec[CKKS_BOOT_PARAMS::NUM_ROTATIONS_REM]; - int32_t bRem = p.m_paramsDec[CKKS_BOOT_PARAMS::BABY_STEP_REM]; - int32_t gRem = p.m_paramsDec[CKKS_BOOT_PARAMS::GIANT_STEP_REM]; + const auto& p = GetBootPrecom(slots).m_paramsDec; - int32_t flagRem = (remCollapse == 0) ? 0 : 1; + const int32_t flagRem = (p.remCollapse == 0) ? 
0 : 1; // result is the rotated plaintext version of coeff - std::vector> result(levelBudget, std::vector(numRotations)); - if (flagRem == 1 && levelBudget >= 1) { + std::vector> result(p.lvlb, std::vector(p.numRotations)); + if (flagRem == 1) { // remainder corresponds to index 0 in encoding and to last index in decoding - result[levelBudget - 1].resize(numRotationsRem); + result[p.lvlb - 1].resize(p.numRotationsRem); } // make sure the plaintext is created only with the necessary amount of moduli @@ -1348,12 +1266,10 @@ std::vector> FHECKKSRNS::EvalSlotsToCoeffsPrecomp uint32_t compositeDegree = cryptoParams->GetCompositeDegree(); auto elementParams = *(cryptoParams->GetElementParams()); - uint32_t towersToDrop = (L == 0) ? 0 : elementParams.GetParams().size() - L - compositeDegree * levelBudget; + const uint32_t towersToDrop = (L == 0) ? 0 : elementParams.GetParams().size() - L - compositeDegree * p.lvlb; for (uint32_t i = 0; i < towersToDrop; ++i) elementParams.PopLastParam(); - uint32_t level0 = towersToDrop; - auto paramsQ = elementParams.GetParams(); uint32_t sizeQ = paramsQ.size(); auto paramsP = cryptoParams->GetParamsP()->GetParams(); @@ -1370,8 +1286,9 @@ std::vector> FHECKKSRNS::EvalSlotsToCoeffsPrecomp } // we need to pre-compute the plaintexts in the extended basis P*Q - std::vector>> paramsVector(levelBudget - flagRem + 1); - for (int32_t s = 0; s < levelBudget - flagRem + 1; ++s) { + const uint32_t pvlen = p.lvlb + 1 - flagRem; + std::vector>> paramsVector(pvlen); + for (uint32_t s = 0; s < pvlen; ++s) { paramsVector[s] = std::make_shared>(cc.GetCyclotomicOrder(), moduli, roots); for (uint32_t i = 0; i < compositeDegree; ++i, --sizeQ) { moduli.erase(moduli.begin() + sizeQ - 1); @@ -1379,47 +1296,47 @@ std::vector> FHECKKSRNS::EvalSlotsToCoeffsPrecomp } } - uint32_t M4 = cc.GetCyclotomicOrder() / 4; - if (slots == M4) { + if (uint32_t M4 = cc.GetCyclotomicOrder() / 4; M4 == slots) { // fully-packed - auto coeff = CoeffDecodingCollapse(A, rotGroup, levelBudget, flag_i); - - for (int32_t s = 0; s < levelBudget - flagRem; s++) { - for (int32_t i = 0; i < b; i++) { -#pragma omp parallel for - for (int32_t j = 0; j < g; j++) { - if (g * i + j != static_cast(numRotations)) { - uint32_t rot = ReduceRotation(-g * i * (1 << (s * layersCollapse)), slots); - if ((flagRem == 0) && (s == levelBudget - flagRem - 1)) { - // do the scaling only at the last set of coefficients - for (uint32_t k = 0; k < slots; k++) { - coeff[s][g * i + j][k] *= scale; - } - } - - auto rotateTemp = Rotate(coeff[s][g * i + j], rot); - result[s][g * i + j] = MakeAuxPlaintext(cc, paramsVector[s], rotateTemp, 1, - level0 + compositeDegree * s, rotateTemp.size()); + auto coeff = CoeffDecodingCollapse(A, rotGroup, p.lvlb, flag_i); + const uint32_t smax = p.lvlb - flagRem; + for (uint32_t s = 0; s < smax; ++s) { + const int32_t rotScale = (1 << (s * p.layersCollapse)) * p.g; + const uint32_t limit = p.b * p.g; +#if !defined(__MINGW32__) && !defined(__MINGW64__) + #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(limit)) +#endif + for (uint32_t ij = 0; ij < limit; ++ij) { + if (ij != p.numRotations) { + if ((flagRem == 0) && (s + 1 == smax)) { + // do the scaling only at the last set of coefficients + for (auto& c : coeff[s][ij]) + c *= scale; } + + auto rot = Rotate(coeff[s][ij], ReduceRotation(-rotScale * (ij / p.g), slots)); + + result[s][ij] = + MakeAuxPlaintext(cc, paramsVector[s], rot, 1, towersToDrop + compositeDegree * s, rot.size()); } } } - if (flagRem) { - int32_t s = levelBudget 
- flagRem; - for (int32_t i = 0; i < bRem; i++) { -#pragma omp parallel for - for (int32_t j = 0; j < gRem; j++) { - if (gRem * i + j != static_cast(numRotationsRem)) { - uint32_t rot = ReduceRotation(-gRem * i * (1 << (s * layersCollapse)), slots); - for (uint32_t k = 0; k < slots; k++) { - coeff[s][gRem * i + j][k] *= scale; - } - - auto rotateTemp = Rotate(coeff[s][gRem * i + j], rot); - result[s][gRem * i + j] = MakeAuxPlaintext(cc, paramsVector[s], rotateTemp, 1, - level0 + compositeDegree * s, rotateTemp.size()); - } + if (flagRem == 1) { + const int32_t rotScale = (1 << (smax * p.layersCollapse)) * p.gRem; + const uint32_t limit = p.bRem * p.gRem; +#if !defined(__MINGW32__) && !defined(__MINGW64__) + #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(limit)) +#endif + for (uint32_t ij = 0; ij < limit; ++ij) { + if (ij != p.numRotationsRem) { + for (auto& c : coeff[smax][ij]) + c *= scale; + + auto rot = Rotate(coeff[smax][ij], ReduceRotation(-rotScale * (ij / p.g), slots)); + + result[smax][ij] = MakeAuxPlaintext(cc, paramsVector[smax], rot, 1, + towersToDrop + compositeDegree * smax, rot.size()); } } } @@ -1429,53 +1346,55 @@ std::vector> FHECKKSRNS::EvalSlotsToCoeffsPrecomp // sparsely-packed mode //------------------------------------------------------------------------------ - auto coeff = CoeffDecodingCollapse(A, rotGroup, levelBudget, false); - auto coeffi = CoeffDecodingCollapse(A, rotGroup, levelBudget, true); - - for (int32_t s = 0; s < levelBudget - flagRem; s++) { - for (int32_t i = 0; i < b; i++) { -#pragma omp parallel for - for (int32_t j = 0; j < g; j++) { - if (g * i + j != static_cast(numRotations)) { - uint32_t rot = ReduceRotation(-g * i * (1 << (s * layersCollapse)), M4); - // concatenate the coefficients horizontally on their third dimension, which corresponds to the # of slots - auto clearTemp = coeff[s][g * i + j]; - auto clearTempi = coeffi[s][g * i + j]; - clearTemp.insert(clearTemp.end(), clearTempi.begin(), clearTempi.end()); - if ((flagRem == 0) && (s == levelBudget - flagRem - 1)) { - // do the scaling only at the last set of coefficients - for (uint32_t k = 0; k < clearTemp.size(); k++) { - clearTemp[k] *= scale; - } - } - - auto rotateTemp = Rotate(clearTemp, rot); - result[s][g * i + j] = MakeAuxPlaintext(cc, paramsVector[s], rotateTemp, 1, - level0 + compositeDegree * s, rotateTemp.size()); + auto coeff = CoeffDecodingCollapse(A, rotGroup, p.lvlb, false); + auto coeffi = CoeffDecodingCollapse(A, rotGroup, p.lvlb, true); + + const uint32_t smax = p.lvlb - flagRem; + for (uint32_t s = 0; s < smax; ++s) { + const int32_t rotScale = (1 << (s * p.layersCollapse)) * p.g; + const uint32_t limit = p.b * p.g; +#if !defined(__MINGW32__) && !defined(__MINGW64__) + #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(limit)) +#endif + for (uint32_t ij = 0; ij < limit; ++ij) { + if (ij != p.numRotations) { + // concatenate the coefficients horizontally on their third dimension, which corresponds to the # of slots + auto clearTmp = coeff[s][ij]; + auto& clearTmpi = coeffi[s][ij]; + clearTmp.insert(clearTmp.end(), clearTmpi.begin(), clearTmpi.end()); + if ((flagRem == 0) && (s + 1 == smax)) { + // do the scaling only at the last set of coefficients + for (auto& c : clearTmp) + c *= scale; } + + auto rot = Rotate(clearTmp, ReduceRotation(-rotScale * (ij / p.g), M4)); + + result[s][ij] = + MakeAuxPlaintext(cc, paramsVector[s], rot, 1, towersToDrop + compositeDegree * s, rot.size()); } } } - if (flagRem) { - int32_t s = 
levelBudget - flagRem; - for (int32_t i = 0; i < bRem; i++) { -#pragma omp parallel for - for (int32_t j = 0; j < gRem; j++) { - if (gRem * i + j != static_cast(numRotationsRem)) { - uint32_t rot = ReduceRotation(-gRem * i * (1 << (s * layersCollapse)), M4); - // concatenate the coefficients horizontally on their third dimension, which corresponds to the # of slots - auto clearTemp = coeff[s][gRem * i + j]; - auto clearTempi = coeffi[s][gRem * i + j]; - clearTemp.insert(clearTemp.end(), clearTempi.begin(), clearTempi.end()); - for (uint32_t k = 0; k < clearTemp.size(); k++) { - clearTemp[k] *= scale; - } - - auto rotateTemp = Rotate(clearTemp, rot); - result[s][gRem * i + j] = MakeAuxPlaintext(cc, paramsVector[s], rotateTemp, 1, - level0 + compositeDegree * s, rotateTemp.size()); - } + if (flagRem == 1) { + const int32_t rotScale = (1 << (smax * p.layersCollapse)) * p.g; + const uint32_t limit = p.bRem * p.gRem; +#if !defined(__MINGW32__) && !defined(__MINGW64__) + #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(limit)) +#endif + for (uint32_t ij = 0; ij < limit; ++ij) { + if (ij != p.numRotationsRem) { + // concatenate the coefficients on their third dimension, which corresponds to the # of slots + auto clearTmp = coeff[smax][ij]; + auto& clearTmpi = coeffi[smax][ij]; + clearTmp.insert(clearTmp.end(), clearTmpi.begin(), clearTmpi.end()); + for (auto& c : clearTmp) + c *= scale; + + auto rot = Rotate(clearTmp, ReduceRotation(-rotScale * (ij / p.g), M4)); + + result[smax][ij] = MakeAuxPlaintext(cc, paramsVector[smax], rot, 1, + towersToDrop + compositeDegree * smax, rot.size()); } } } @@ -1490,26 +1409,23 @@ std::vector> FHECKKSRNS::EvalSlotsToCoeffsPrecomp Ciphertext FHECKKSRNS::EvalLinearTransform(const std::vector& A, ConstCiphertext& ct) const { // Computing the baby-step bStep and the giant-step gStep. - uint32_t slots = A.size(); - auto& p = GetBootPrecom(slots); - uint32_t bStep = (p.m_dim1 == 0) ? ceil(sqrt(slots)) : p.m_dim1; - uint32_t gStep = ceil(static_cast(slots) / bStep); + const uint32_t slots = A.size(); + const auto& p = GetBootPrecom(slots); + const uint32_t bStep = (p.m_paramsEnc.g == 0) ? std::ceil(std::sqrt(slots)) : p.m_paramsEnc.g; + const uint32_t gStep = std::ceil(static_cast(slots) / bStep); - auto cc = ct->GetCryptoContext(); - uint32_t M = cc->GetCyclotomicOrder(); - uint32_t N = cc->GetRingDimension(); - - // computes the NTTs for each CRT limb (for the hoisted automorphisms used - // later on) + auto cc = ct->GetCryptoContext(); auto digits = cc->EvalFastRotationPrecompute(ct); - std::vector> fastRotation(bStep - 1); - // hoisted automorphisms -#pragma omp parallel for + std::vector> fastRotation(bStep - 1); +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(bStep - 1)) for (uint32_t j = 1; j < bStep; ++j) fastRotation[j - 1] = cc->EvalFastRotationExt(ct, j, digits, true); + const uint32_t M = cc->GetCyclotomicOrder(); + const uint32_t N = cc->GetRingDimension(); + std::vector map(N); Ciphertext result; DCRTPoly first; for (uint32_t j = 0; j < gStep; ++j) { @@ -1530,7 +1446,6 @@ Ciphertext FHECKKSRNS::EvalLinearTransform(const std::vectorKeySwitchDown(inner); // Find the automorphism index that corresponds to rotation index index. 
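The baby-step/giant-step structure that EvalLinearTransform (and the CoeffsToSlots/SlotsToCoeffs loops further down) evaluates homomorphically can be checked in the clear. Below is a plaintext-domain sketch of the same identity using ordinary vectors and no OpenFHE types; the pre-rotation of each generalized diagonal by -g*i mirrors what the precompute routines bake into the auxiliary plaintexts:

#include <cstdint>
#include <vector>

// Rot(v, k)[t] = v[(t + k) mod n], with negative k handled.
static std::vector<double> Rot(const std::vector<double>& v, int64_t k) {
    const int64_t n = static_cast<int64_t>(v.size());
    std::vector<double> out(v.size());
    for (int64_t t = 0; t < n; ++t)
        out[t] = v[((t + k) % n + n) % n];
    return out;
}

// A*v via the diagonal method in baby-step/giant-step form:
//   A*v = sum_i Rot( sum_j diag'_{g*i+j} .* Rot(v, j), g*i ),
// where diag'_{g*i+j}[t] = A[(t - g*i) mod n][(t + j) mod n] is the
// (g*i+j)-th generalized diagonal of A pre-rotated by -g*i.
std::vector<double> BsgsMatVec(const std::vector<std::vector<double>>& A,
                               const std::vector<double>& v, uint32_t g) {
    const uint32_t n = static_cast<uint32_t>(v.size());
    std::vector<std::vector<double>> babies(g);  // baby-step rotations, reused by every giant step
    for (uint32_t j = 0; j < g; ++j)
        babies[j] = Rot(v, j);

    std::vector<double> acc(n, 0.0);
    for (uint32_t i = 0; i * g < n; ++i) {
        std::vector<double> inner(n, 0.0);
        for (uint32_t j = 0; j < g && i * g + j < n; ++j)
            for (uint32_t t = 0; t < n; ++t)
                inner[t] += A[(t + n - (i * g) % n) % n][(t + j) % n] * babies[j][t];
        auto outer = Rot(inner, static_cast<int64_t>(i) * g);  // one giant-step rotation per group
        for (uint32_t t = 0; t < n; ++t)
            acc[t] += outer[t];
    }
    return acc;
}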
uint32_t autoIndex = FindAutomorphismIndex2nComplex(bStep * j, M); - std::vector map(N); PrecomputeAutoMap(N, autoIndex, &map); first += inner->GetElements()[0].AutomorphismTransform(autoIndex, map); @@ -1545,104 +1460,90 @@ Ciphertext FHECKKSRNS::EvalLinearTransform(const std::vector FHECKKSRNS::EvalCoeffsToSlots(const std::vector>& A, ConstCiphertext& ctxt) const { - uint32_t slots = ctxt->GetSlots(); + const uint32_t slots = ctxt->GetSlots(); - auto& p = GetBootPrecom(slots); + const auto& p = GetBootPrecom(slots).m_paramsEnc; - int32_t levelBudget = p.m_paramsEnc[CKKS_BOOT_PARAMS::LEVEL_BUDGET]; - int32_t layersCollapse = p.m_paramsEnc[CKKS_BOOT_PARAMS::LAYERS_COLL]; - int32_t remCollapse = p.m_paramsEnc[CKKS_BOOT_PARAMS::LAYERS_REM]; - int32_t numRotations = p.m_paramsEnc[CKKS_BOOT_PARAMS::NUM_ROTATIONS]; - int32_t b = p.m_paramsEnc[CKKS_BOOT_PARAMS::BABY_STEP]; - int32_t g = p.m_paramsEnc[CKKS_BOOT_PARAMS::GIANT_STEP]; - int32_t numRotationsRem = p.m_paramsEnc[CKKS_BOOT_PARAMS::NUM_ROTATIONS_REM]; - int32_t bRem = p.m_paramsEnc[CKKS_BOOT_PARAMS::BABY_STEP_REM]; - int32_t gRem = p.m_paramsEnc[CKKS_BOOT_PARAMS::GIANT_STEP_REM]; + // precompute the inner and outer rotations + std::vector> rot_out(p.lvlb, std::vector(p.b + p.bRem)); + std::vector> rot_in(p.lvlb, std::vector(p.numRotations + 1)); int32_t stop = -1; int32_t flagRem = 0; - - auto cc = ctxt->GetCryptoContext(); - auto algo = cc->GetScheme(); - const auto cryptoParams = std::dynamic_pointer_cast(cc->GetCryptoParameters()); - uint32_t compositeDegree = cryptoParams->GetCompositeDegree(); - - if (remCollapse != 0) { + if (p.remCollapse != 0) { stop = 0; flagRem = 1; - } - // precompute the inner and outer rotations - std::vector> rot_in(levelBudget, std::vector(numRotations + 1)); - if (flagRem == 1) { // remainder corresponds to index 0 in encoding and to last index in decoding - rot_in[0].resize(numRotationsRem + 1); + rot_in[0].resize(p.numRotationsRem + 1); } - std::vector> rot_out(levelBudget, std::vector(b + bRem)); - uint32_t M = cc->GetCyclotomicOrder(); - for (int32_t s = levelBudget - 1; s > stop; --s) { - for (int32_t j = 0; j < g; ++j) { - rot_in[s][j] = ReduceRotation((j - static_cast((numRotations + 1) / 2) + 1) * - (1 << ((s - flagRem) * layersCollapse + remCollapse)), - slots); - } - for (int32_t i = 0; i < b; ++i) - rot_out[s][i] = ReduceRotation((g * i) * (1 << ((s - flagRem) * layersCollapse + remCollapse)), M / 4); + auto cc = ctxt->GetCryptoContext(); + + const uint32_t M4 = cc->GetCyclotomicOrder() / 4; + + int32_t offset = static_cast((p.numRotations + 1) / 2) - 1; + for (int32_t s = p.lvlb - 1; s > stop; --s) { + int32_t scale = (1 << ((s - flagRem) * p.layersCollapse + p.remCollapse)); + for (uint32_t i = 0; i < p.b; ++i) + rot_out[s][i] = ReduceRotation(scale * p.g * i, M4); + for (uint32_t j = 0; j < p.g; ++j) + rot_in[s][j] = ReduceRotation(scale * (j - offset), slots); } - if (flagRem) { - for (int32_t j = 0; j < gRem; ++j) - rot_in[stop][j] = ReduceRotation((j - static_cast((numRotationsRem + 1) / 2) + 1), slots); - for (int32_t i = 0; i < bRem; ++i) - rot_out[stop][i] = ReduceRotation((gRem * i), M / 4); + if (flagRem == 1) { + offset = static_cast((p.numRotationsRem + 1) / 2) - 1; + for (uint32_t i = 0; i < p.bRem; ++i) + rot_out[stop][i] = ReduceRotation(p.gRem * i, M4); + for (uint32_t j = 0; j < p.gRem; ++j) + rot_in[stop][j] = ReduceRotation(j - offset, slots); } - uint32_t N = cc->GetRingDimension(); auto result = ctxt->Clone(); + uint32_t N = cc->GetRingDimension(); + std::vector map(N); + 
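The rot_in/rot_out tables built here are filled with ReduceRotation, which the code relies on to fold signed rotation amounts back into the valid range. A hedged sketch of the assumed semantics follows; the library routine may additionally exploit power-of-two ranges, so this is purely illustrative:

#include <cstdint>

// Assumed behaviour of ReduceRotation: map a possibly negative rotation
// amount into the canonical range [0, range) for a cyclic rotation over
// `range` positions.
uint32_t ReduceRotationSketch(int64_t index, uint32_t range) {
    int64_t r = index % static_cast<int64_t>(range);
    if (r < 0)
        r += static_cast<int64_t>(range);
    return static_cast<uint32_t>(r);
}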
+ auto algo = cc->GetScheme(); + const auto cryptoParams = std::dynamic_pointer_cast(cc->GetCryptoParameters()); + uint32_t compositeDegree = cryptoParams->GetCompositeDegree(); + // hoisted automorphisms - for (int32_t s = levelBudget - 1; s > stop; --s) { - if (s != levelBudget - 1) + const int32_t smax = -1 + p.lvlb; + for (int32_t s = smax; s > stop; --s) { + if (s != smax) algo->ModReduceInternalInPlace(result, compositeDegree); // computes the NTTs for each CRT limb (for the hoisted automorphisms used later on) auto digits = cc->EvalFastRotationPrecompute(result); - - std::vector> fastRotation(g); -#pragma omp parallel for - for (int32_t j = 0; j < g; j++) { - if (rot_in[s][j] != 0) - fastRotation[j] = cc->EvalFastRotationExt(result, rot_in[s][j], digits, true); - else - fastRotation[j] = cc->KeySwitchExt(result, true); - } + std::vector> fastRotation(p.g); +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(p.g)) + for (uint32_t j = 0; j < p.g; ++j) + fastRotation[j] = (rot_in[s][j] != 0) ? cc->EvalFastRotationExt(result, rot_in[s][j], digits, true) : + cc->KeySwitchExt(result, true); Ciphertext outer; DCRTPoly first; - for (int32_t i = 0; i < b; i++) { + for (uint32_t i = 0; i < p.b; ++i) { // for the first iteration with j=0: - int32_t G = g * i; + uint32_t G = p.g * i; auto inner = EvalMultExt(fastRotation[0], A[s][G]); // continue the loop - for (int32_t j = 1; j < g; ++j) { - if ((G + j) != static_cast(numRotations)) + for (uint32_t j = 1; j < p.g; ++j) { + if ((G + j) != p.numRotations) EvalAddExtInPlace(inner, EvalMultExt(fastRotation[j], A[s][G + j])); } if (i == 0) { - first = cc->KeySwitchDownFirstElement(inner); - auto elements = inner->GetElements(); - elements[0].SetValuesToZero(); - inner->SetElements(std::move(elements)); + first = cc->KeySwitchDownFirstElement(inner); outer = std::move(inner); + outer->GetElements()[0].SetValuesToZero(); } else { if (rot_out[s][i] != 0) { inner = cc->KeySwitchDown(inner); // Find the automorphism index that corresponds to rotation index index. 
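The recurring comment about finding the automorphism index refers to the standard power-of-two CKKS convention: a rotation by rot slots corresponds to the Galois automorphism X -> X^(5^rot mod M), where M is the cyclotomic order. A small sketch of that mapping, offered as an illustration of the convention rather than of the library routine FindAutomorphismIndex2nComplex itself:

#include <cstdint>

// Modular exponentiation 5^rot mod M; M is the cyclotomic order (a power of
// two well below 2^32, so the 64-bit intermediate products cannot overflow).
uint32_t RotationToAutomorphismIndex(uint32_t rot, uint32_t M) {
    uint64_t k    = 1;
    uint64_t base = 5 % M;
    for (uint32_t e = rot; e > 0; e >>= 1) {
        if (e & 1)
            k = (k * base) % M;
        base = (base * base) % M;
    }
    return static_cast<uint32_t>(k);
}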
- uint32_t autoIndex = FindAutomorphismIndex2nComplex(rot_out[s][i], M); - std::vector map(N); + uint32_t autoIndex = FindAutomorphismIndex2nComplex(rot_out[s][i], cc->GetCyclotomicOrder()); PrecomputeAutoMap(N, autoIndex, &map); first += inner->GetElements()[0].AutomorphismTransform(autoIndex, map); auto&& innerDigits = cc->EvalFastRotationPrecompute(inner); @@ -1650,9 +1551,8 @@ Ciphertext FHECKKSRNS::EvalCoeffsToSlots(const std::vectorKeySwitchDownFirstElement(inner); - auto elements = inner->GetElements(); + auto& elements = inner->GetElements(); elements[0].SetValuesToZero(); - inner->SetElements(std::move(elements)); EvalAddExtInPlace(outer, inner); } } @@ -1661,46 +1561,40 @@ Ciphertext FHECKKSRNS::EvalCoeffsToSlots(const std::vectorGetElements()[0] += first; } - if (flagRem) { + if (flagRem == 1) { algo->ModReduceInternalInPlace(result, compositeDegree); // computes the NTTs for each CRT limb (for the hoisted automorphisms used later on) auto digits = cc->EvalFastRotationPrecompute(result); - std::vector> fastRotation(gRem); - -#pragma omp parallel for - for (int32_t j = 0; j < gRem; ++j) { - if (rot_in[stop][j] != 0) - fastRotation[j] = cc->EvalFastRotationExt(result, rot_in[stop][j], digits, true); - else - fastRotation[j] = cc->KeySwitchExt(result, true); - } + std::vector> fastRotationRem(p.gRem); +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(p.gRem)) + for (uint32_t j = 0; j < p.gRem; ++j) + fastRotationRem[j] = (rot_in[stop][j] != 0) ? + cc->EvalFastRotationExt(result, rot_in[stop][j], digits, true) : + cc->KeySwitchExt(result, true); Ciphertext outer; DCRTPoly first; - for (int32_t i = 0; i < bRem; i++) { + for (uint32_t i = 0; i < p.bRem; ++i) { // for the first iteration with j=0: - int32_t GRem = gRem * i; - auto inner = EvalMultExt(fastRotation[0], A[stop][GRem]); + int32_t GRem = p.gRem * i; + auto inner = EvalMultExt(fastRotationRem[0], A[stop][GRem]); // continue the loop - for (int32_t j = 1; j < gRem; ++j) { - if ((GRem + j) != static_cast(numRotationsRem)) - EvalAddExtInPlace(inner, EvalMultExt(fastRotation[j], A[stop][GRem + j])); + for (uint32_t j = 1; j < p.gRem; ++j) { + if ((GRem + j) != p.numRotationsRem) + EvalAddExtInPlace(inner, EvalMultExt(fastRotationRem[j], A[stop][GRem + j])); } if (i == 0) { - first = cc->KeySwitchDownFirstElement(inner); - auto elements = inner->GetElements(); - elements[0].SetValuesToZero(); - inner->SetElements(std::move(elements)); + first = cc->KeySwitchDownFirstElement(inner); outer = std::move(inner); + outer->GetElements()[0].SetValuesToZero(); } else { if (rot_out[stop][i] != 0) { inner = cc->KeySwitchDown(inner); // Find the automorphism index that corresponds to rotation index index. 
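The num_threads(OpenFHEParallelControls.GetThreadLimit(...)) clauses introduced throughout these loops cap the OpenMP team size by the loop's trip count, so the short remainder loops (bRem, gRem) do not spin up idle threads. A self-contained sketch of the same pattern on an unrelated loop; the header path and the min(threads, n) behaviour of GetThreadLimit are assumptions inferred from its use in this file:

#include <cstdint>
#include <vector>

#include "utils/parallel.h"  // OpenFHEParallelControls (header path assumed)

// Each iteration is independent, so the loop parallelizes safely; the
// num_threads clause keeps tiny trip counts from oversubscribing cores.
std::vector<double> ScaleAll(const std::vector<double>& in, double scale) {
    const int32_t n = static_cast<int32_t>(in.size());
    std::vector<double> out(in.size());
#if !defined(__MINGW32__) && !defined(__MINGW64__)
    #pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(n))
#endif
    for (int32_t i = 0; i < n; ++i)
        out[i] = in[i] * scale;
    return out;
}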
- uint32_t autoIndex = FindAutomorphismIndex2nComplex(rot_out[stop][i], M); - std::vector map(N); + uint32_t autoIndex = FindAutomorphismIndex2nComplex(rot_out[stop][i], cc->GetCyclotomicOrder()); PrecomputeAutoMap(N, autoIndex, &map); first += inner->GetElements()[0].AutomorphismTransform(autoIndex, map); auto&& innerDigits = cc->EvalFastRotationPrecompute(inner); @@ -1723,83 +1617,73 @@ Ciphertext FHECKKSRNS::EvalCoeffsToSlots(const std::vector FHECKKSRNS::EvalSlotsToCoeffs(const std::vector>& A, ConstCiphertext& ctxt) const { - uint32_t slots = ctxt->GetSlots(); + const uint32_t slots = ctxt->GetSlots(); - auto& p = GetBootPrecom(slots); - - int32_t levelBudget = p.m_paramsDec[CKKS_BOOT_PARAMS::LEVEL_BUDGET]; - int32_t layersCollapse = p.m_paramsDec[CKKS_BOOT_PARAMS::LAYERS_COLL]; - int32_t remCollapse = p.m_paramsDec[CKKS_BOOT_PARAMS::LAYERS_REM]; - int32_t numRotations = p.m_paramsDec[CKKS_BOOT_PARAMS::NUM_ROTATIONS]; - int32_t b = p.m_paramsDec[CKKS_BOOT_PARAMS::BABY_STEP]; - int32_t g = p.m_paramsDec[CKKS_BOOT_PARAMS::GIANT_STEP]; - int32_t numRotationsRem = p.m_paramsDec[CKKS_BOOT_PARAMS::NUM_ROTATIONS_REM]; - int32_t bRem = p.m_paramsDec[CKKS_BOOT_PARAMS::BABY_STEP_REM]; - int32_t gRem = p.m_paramsDec[CKKS_BOOT_PARAMS::GIANT_STEP_REM]; - - auto cc = ctxt->GetCryptoContext(); - auto algo = cc->GetScheme(); - const auto cryptoParams = std::dynamic_pointer_cast(cc->GetCryptoParameters()); - uint32_t compositeDegree = cryptoParams->GetCompositeDegree(); - - int32_t flagRem = (remCollapse == 0) ? 0 : 1; + const auto& p = GetBootPrecom(slots).m_paramsDec; // precompute the inner and outer rotations - std::vector> rot_in(levelBudget, std::vector(numRotations + 1)); + std::vector> rot_out(p.lvlb, std::vector(p.b + p.bRem)); + std::vector> rot_in(p.lvlb, std::vector(p.numRotations + 1)); + const int32_t flagRem = (p.remCollapse == 0) ? 
0 : 1; if (flagRem == 1) { // remainder corresponds to index 0 in encoding and to last index in decoding - rot_in[levelBudget - 1].resize(numRotationsRem + 1); + rot_in[p.lvlb - 1].resize(p.numRotationsRem + 1); } - std::vector> rot_out(levelBudget, std::vector(b + bRem)); - uint32_t M = cc->GetCyclotomicOrder(); - for (int32_t s = 0; s < levelBudget - flagRem; ++s) { - for (int32_t j = 0; j < g; ++j) - rot_in[s][j] = ReduceRotation( - (j - static_cast((numRotations + 1) / 2) + 1) * (1 << (s * layersCollapse)), M / 4); - for (int32_t i = 0; i < b; i++) - rot_out[s][i] = ReduceRotation((g * i) * (1 << (s * layersCollapse)), M / 4); + auto cc = ctxt->GetCryptoContext(); + + const uint32_t M4 = cc->GetCyclotomicOrder() / 4; + const int32_t smax = p.lvlb - flagRem; + const int32_t offset = static_cast((p.numRotations + 1) / 2) - 1; + for (int32_t s = 0; s < smax; ++s) { + const int32_t scale = 1 << (s * p.layersCollapse); + for (uint32_t j = 0; j < p.g; ++j) + rot_in[s][j] = ReduceRotation((j - offset) * scale, M4); + for (uint32_t i = 0; i < p.b; ++i) + rot_out[s][i] = ReduceRotation((p.g * i) * scale, M4); } - if (flagRem) { - int32_t s = levelBudget - flagRem; - for (int32_t j = 0; j < gRem; ++j) - rot_in[s][j] = ReduceRotation( - (j - static_cast((numRotationsRem + 1) / 2) + 1) * (1 << (s * layersCollapse)), M / 4); - for (int32_t i = 0; i < bRem; ++i) - rot_out[s][i] = ReduceRotation((gRem * i) * (1 << (s * layersCollapse)), M / 4); + if (flagRem == 1) { + const int32_t scaleRem = 1 << (smax * p.layersCollapse); + const int32_t offsetRem = static_cast((p.numRotationsRem + 1) / 2) - 1; + for (uint32_t j = 0; j < p.gRem; ++j) + rot_in[smax][j] = ReduceRotation((j - offsetRem) * scaleRem, M4); + for (uint32_t i = 0; i < p.bRem; ++i) + rot_out[smax][i] = ReduceRotation((p.gRem * i) * scaleRem, M4); } // No need for Encrypted Bit Reverse auto result = ctxt->Clone(); - uint32_t N = cc->GetRingDimension(); + + uint32_t N = cc->GetRingDimension(); + std::vector map(N); + + auto algo = cc->GetScheme(); + const auto cryptoParams = std::dynamic_pointer_cast(cc->GetCryptoParameters()); + uint32_t compositeDegree = cryptoParams->GetCompositeDegree(); // hoisted automorphisms - for (int32_t s = 0; s < levelBudget - flagRem; ++s) { + for (int32_t s = 0; s < smax; ++s) { if (s != 0) algo->ModReduceInternalInPlace(result, compositeDegree); // computes the NTTs for each CRT limb (for the hoisted automorphisms used later on) auto digits = cc->EvalFastRotationPrecompute(result); - - std::vector> fastRotation(g); -#pragma omp parallel for - for (int32_t j = 0; j < g; ++j) { - if (rot_in[s][j] != 0) - fastRotation[j] = cc->EvalFastRotationExt(result, rot_in[s][j], digits, true); - else - fastRotation[j] = cc->KeySwitchExt(result, true); - } + std::vector> fastRotation(p.g); +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(p.g)) + for (uint32_t j = 0; j < p.g; ++j) + fastRotation[j] = (rot_in[s][j] != 0) ? 
cc->EvalFastRotationExt(result, rot_in[s][j], digits, true) : + cc->KeySwitchExt(result, true); Ciphertext outer; DCRTPoly first; - for (int32_t i = 0; i < b; ++i) { + for (uint32_t i = 0; i < p.b; ++i) { // for the first iteration with j=0: - int32_t G = g * i; + uint32_t G = i * p.g; auto inner = EvalMultExt(fastRotation[0], A[s][G]); // continue the loop - for (int32_t j = 1; j < g; ++j) { - if ((G + j) != static_cast(numRotations)) + for (uint32_t j = 1; j < p.g; ++j) { + if ((G + j) != p.numRotations) EvalAddExtInPlace(inner, EvalMultExt(fastRotation[j], A[s][G + j])); } @@ -1814,8 +1698,7 @@ Ciphertext FHECKKSRNS::EvalSlotsToCoeffs(const std::vectorKeySwitchDown(inner); // Find the automorphism index that corresponds to rotation index index. - uint32_t autoIndex = FindAutomorphismIndex2nComplex(rot_out[s][i], M); - std::vector map(N); + auto autoIndex = FindAutomorphismIndex2nComplex(rot_out[s][i], cc->GetCyclotomicOrder()); PrecomputeAutoMap(N, autoIndex, &map); first += inner->GetElements()[0].AutomorphismTransform(autoIndex, map); auto&& innerDigits = cc->EvalFastRotationPrecompute(inner); @@ -1834,31 +1717,28 @@ Ciphertext FHECKKSRNS::EvalSlotsToCoeffs(const std::vectorGetElements()[0] += first; } - if (flagRem) { + if (flagRem == 1) { algo->ModReduceInternalInPlace(result, compositeDegree); + // computes the NTTs for each CRT limb (for the hoisted automorphisms used later on) auto digits = cc->EvalFastRotationPrecompute(result); - std::vector> fastRotation(gRem); - - int32_t s = levelBudget - flagRem; -#pragma omp parallel for - for (int32_t j = 0; j < gRem; ++j) { - if (rot_in[s][j] != 0) - fastRotation[j] = cc->EvalFastRotationExt(result, rot_in[s][j], digits, true); - else - fastRotation[j] = cc->KeySwitchExt(result, true); - } + std::vector> fastRotationRem(p.gRem); +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(p.gRem)) + for (uint32_t j = 0; j < p.gRem; ++j) + fastRotationRem[j] = (rot_in[smax][j] != 0) ? + cc->EvalFastRotationExt(result, rot_in[smax][j], digits, true) : + cc->KeySwitchExt(result, true); Ciphertext outer; DCRTPoly first; - for (int32_t i = 0; i < bRem; i++) { + for (uint32_t i = 0; i < p.bRem; ++i) { // for the first iteration with j=0: - int32_t GRem = gRem * i; - auto inner = EvalMultExt(fastRotation[0], A[s][GRem]); + uint32_t GRem = i * p.gRem; + auto inner = EvalMultExt(fastRotationRem[0], A[smax][GRem]); // continue the loop - for (int32_t j = 1; j < gRem; ++j) { - if ((GRem + j) != static_cast(numRotationsRem)) - EvalAddExtInPlace(inner, EvalMultExt(fastRotation[j], A[s][GRem + j])); + for (uint32_t j = 1; j < p.gRem; ++j) { + if ((GRem + j) != p.numRotationsRem) + EvalAddExtInPlace(inner, EvalMultExt(fastRotationRem[j], A[smax][GRem + j])); } if (i == 0) { @@ -1869,15 +1749,14 @@ Ciphertext FHECKKSRNS::EvalSlotsToCoeffs(const std::vectorKeySwitchDown(inner); // Find the automorphism index that corresponds to rotation index index. 
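// Standalone sketch (not part of the patch): how the rot_in/rot_out tables used above are laid
// out for one decoding level s. The values below are hypothetical; ReduceRotationDemo is a
// simple stand-in for the library's ReduceRotation and just folds an index into [0, M/4).
#include <cstdint>
#include <iostream>
#include <vector>

static int32_t ReduceRotationDemo(int32_t index, int32_t slots) {
    return ((index % slots) + slots) % slots;
}

int main() {
    const int32_t M4             = 1 << 15;                              // cyclotomic order / 4 (assumed ring dim 2^16)
    const int32_t layersCollapse = 3;                                    // layers merged per level (assumed)
    const int32_t numRotations   = (1 << (layersCollapse + 1)) - 1;      // 15
    const int32_t g              = 1 << (layersCollapse / 2 + 1 + (numRotations > 7));  // giant step, rule as in GetCollapsedFFTParams
    const int32_t b              = (numRotations + 1) / g;               // baby step
    const int32_t s              = 1;                                    // level index
    const int32_t scale          = 1 << (s * layersCollapse);
    const int32_t offset         = (numRotations + 1) / 2 - 1;

    std::vector<int32_t> rot_in(numRotations + 1), rot_out(b);
    for (int32_t j = 0; j < g; ++j)
        rot_in[j] = ReduceRotationDemo((j - offset) * scale, M4);        // inner (hoisted) rotations
    for (int32_t i = 0; i < b; ++i)
        rot_out[i] = ReduceRotationDemo((g * i) * scale, M4);            // outer rotations

    for (int32_t j = 0; j < g; ++j) std::cout << rot_in[j] << ' ';
    std::cout << '\n';
    for (int32_t i = 0; i < b; ++i) std::cout << rot_out[i] << ' ';
    std::cout << '\n';
}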
- uint32_t autoIndex = FindAutomorphismIndex2nComplex(rot_out[s][i], M); - std::vector map(N); + auto autoIndex = FindAutomorphismIndex2nComplex(rot_out[smax][i], cc->GetCyclotomicOrder()); PrecomputeAutoMap(N, autoIndex, &map); first += inner->GetElements()[0].AutomorphismTransform(autoIndex, map); auto innerDigits = cc->EvalFastRotationPrecompute(inner); - EvalAddExtInPlace(outer, cc->EvalFastRotationExt(inner, rot_out[s][i], innerDigits, false)); + EvalAddExtInPlace(outer, cc->EvalFastRotationExt(inner, rot_out[smax][i], innerDigits, false)); } else { first += cc->KeySwitchDownFirstElement(inner); @@ -1980,7 +1859,7 @@ void FHECKKSRNS::AdjustCiphertextFBT(Ciphertext& ciphertext, double co #endif } -void FHECKKSRNS::ExtendCiphertext(std::vector& ctxtDCRT, const CryptoContextImpl& cc, +void FHECKKSRNS::ExtendCiphertext(std::vector& ctxtDCRTs, const CryptoContextImpl& cc, const std::shared_ptr elementParamsRaisedPtr) const { // TODO: YSP We should be able to use one of the DCRTPoly methods for this; If not, we can define a new method there and use it here @@ -1999,83 +1878,72 @@ void FHECKKSRNS::ExtendCiphertext(std::vector& ctxtDCRT, const CryptoC std::vector qhat_modqj(compositeDegree); qhat_modqj[0] = qj[1].Mod(qj[0]); qhat_modqj[1] = qj[0].Mod(qj[1]); - - std::vector qhat_inv_modqj(compositeDegree); - for (uint32_t d = 2; d < compositeDegree; d++) { - for (uint32_t j = 0; j < d; ++j) { + for (uint32_t j = 0; j < d; ++j) qhat_modqj[j] = qj[d].ModMul(qhat_modqj[j], qj[j]); - } qhat_modqj[d] = qj[1].ModMul(qj[0], qj[d]); - for (uint32_t j = 2; j < d; ++j) { + for (uint32_t j = 2; j < d; ++j) qhat_modqj[d] = qj[j].ModMul(qhat_modqj[d], qj[d]); - } } - for (uint32_t j = 0; j < compositeDegree; ++j) { + std::vector qhat_inv_modqj(compositeDegree); + for (uint32_t j = 0; j < compositeDegree; ++j) qhat_inv_modqj[j] = qhat_modqj[j].ModInverse(qj[j]); - } NativeInteger qjProduct = std::accumulate(qj.begin() + 1, qj.end(), NativeInteger{1}, std::multiplies()); uint32_t init_element_index = compositeDegree; - for (size_t i = 0; i < ctxtDCRT.size(); i++) { - std::vector temp(compositeDegree + 1, DCRTPoly(elementParamsRaisedPtr, COEFFICIENT)); - std::vector ctxtDCRT_modq(compositeDegree, DCRTPoly(elementParamsRaisedPtr, COEFFICIENT)); - - ctxtDCRT[i].SetFormat(COEFFICIENT); - for (size_t j = 0; j < ctxtDCRT[i].GetNumOfElements(); j++) { - for (size_t k = 0; k < compositeDegree; k++) - ctxtDCRT_modq[k].SetElementAtIndex(j, ctxtDCRT[i].GetElementAtIndex(j) * qhat_inv_modqj[k]); - } - //========================================================================================================= - temp[0] = ctxtDCRT_modq[0].GetElementAtIndex(0); - for (auto& el : temp[0].GetAllElements()) { - el *= qjProduct; + + for (auto& dcrt : ctxtDCRTs) { + dcrt.SetFormat(COEFFICIENT); + + std::vector tmp(compositeDegree + 1, DCRTPoly(elementParamsRaisedPtr, COEFFICIENT)); + std::vector ctxtDCRTs_modq(compositeDegree, DCRTPoly(elementParamsRaisedPtr, COEFFICIENT)); + +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(dcrt.GetNumOfElements())) + for (size_t j = 0; j < dcrt.GetNumOfElements(); ++j) { + for (uint32_t k = 0; k < compositeDegree; ++k) + ctxtDCRTs_modq[k].SetElementAtIndex(j, dcrt.GetElementAtIndex(j) * qhat_inv_modqj[k]); } - //========================================================================================================= - for (size_t d = 1; d < compositeDegree; d++) { - temp[init_element_index] = ctxtDCRT_modq[d].GetElementAtIndex(d); - for (size_t k = 0; k < 
compositeDegree; k++) { - if (k != d) { - temp[d].SetElementAtIndex(k, temp[0].GetElementAtIndex(k) * qj[k]); - } - } - //========================================================================================================= + tmp[0] = ctxtDCRTs_modq[0].GetElementAtIndex(0); + +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(tmp[0].GetAllElements().size())) + for (auto& el : tmp[0].GetAllElements()) + el *= qjProduct; + + for (uint32_t d = 1; d < compositeDegree; ++d) { + tmp[init_element_index] = ctxtDCRTs_modq[d].GetElementAtIndex(d); + NativeInteger qjProductD{1}; - for (size_t k = 0; k < compositeDegree; k++) { - if (k != d) + for (uint32_t k = 0; k < compositeDegree; ++k) { + if (k != d) { qjProductD *= qj[k]; + tmp[d].SetElementAtIndex(k, tmp[0].GetElementAtIndex(k) * qj[k]); + } } - for (size_t j = compositeDegree; j < elementParamsRaisedPtr->GetParams().size(); j++) { - auto value = temp[init_element_index].GetElementAtIndex(j) * qjProductD; - temp[d].SetElementAtIndex(j, value); - } - //========================================================================================================= - { - auto value = temp[init_element_index].GetElementAtIndex(d) * qjProductD; - temp[d].SetElementAtIndex(d, value); - } - //========================================================================================================= - temp[0] += temp[d]; + for (uint32_t j = compositeDegree; j < elementParamsRaisedPtr->GetParams().size(); ++j) + tmp[d].SetElementAtIndex(j, tmp[init_element_index].GetElementAtIndex(j) * qjProductD); + + tmp[d].SetElementAtIndex(d, tmp[init_element_index].GetElementAtIndex(d) * qjProductD); + tmp[0] += tmp[d]; } - temp[0].SetFormat(EVALUATION); - ctxtDCRT[i] = temp[0]; + tmp[0].SetFormat(EVALUATION); + dcrt = std::move(tmp[0]); } } void FHECKKSRNS::ApplyDoubleAngleIterations(Ciphertext& ciphertext, uint32_t numIter) const { + constexpr double twoPi = 2.0 * M_PI; + auto cc = ciphertext->GetCryptoContext(); - const int32_t r = numIter; - for (int32_t j = 1; j <= r; ++j) { + for (int32_t i = 1 - numIter; i <= 0; ++i) { + double scalar = -std::pow(twoPi, -std::pow(2.0, i)); cc->EvalSquareInPlace(ciphertext); - ciphertext = cc->EvalAdd(ciphertext, ciphertext); - double scalar = -1.0 / std::pow((2.0 * M_PI), std::pow(2.0, j - r)); - cc->EvalAddInPlace(ciphertext, scalar); + cc->EvalAddInPlace(ciphertext, cc->EvalAdd(ciphertext, scalar)); cc->ModReduceInPlace(ciphertext); } } @@ -2105,8 +1973,8 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co double powP = std::pow(2.0, MAX_DOUBLE_PRECISION); int32_t pCurrent = pBits - MAX_DOUBLE_PRECISION; - std::vector temp(2 * slots); - for (size_t i = 0; i < slots; ++i) { + std::vector tmp(2 * slots); + for (uint32_t i = 0; i < slots; ++i) { // extract the mantissa of real part and multiply it by 2^52 int32_t n1 = 0; double dre = std::frexp(inverse[i].real(), &n1) * powP; @@ -2181,10 +2049,10 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co im = pPowRemaining * im64; } - temp[i] = (re < 0) ? Max128BitValue() + re : re; - temp[i + slots] = (im < 0) ? Max128BitValue() + im : im; + tmp[i] = (re < 0) ? Max128BitValue() + re : re; + tmp[i + slots] = (im < 0) ? 
Max128BitValue() + im : im; - if (is128BitOverflow(temp[i]) || is128BitOverflow(temp[i + slots])) { + if (is128BitOverflow(tmp[i]) || is128BitOverflow(tmp[i + slots])) { OPENFHE_THROW("Overflow, try to decrease scaling factor"); } } @@ -2194,7 +2062,7 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co for (size_t i = 0; i < nativeParams.size(); i++) { NativeVector nativeVec(N, nativeParams[i]->GetModulus()); - FitToNativeVector(N, temp, Max128BitValue(), &nativeVec); + FitToNativeVector(N, tmp, Max128BitValue(), &nativeVec); NativePoly element = plainElement.GetElementAtIndex(i); element.SetValues(std::move(nativeVec), Format::COEFFICIENT); plainElement.SetElementAtIndex(i, std::move(element)); @@ -2211,8 +2079,8 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co auto currPowP = crtPowP; - // We want to scale temp by 2^(pd), and the loop starts from j=2 - // because temp is already scaled by 2^p in the re/im loop above, + // We want to scale tmp by 2^(pd), and the loop starts from j=2 + // because tmp is already scaled by 2^p in the re/im loop above, // and currPowP already is 2^p. for (size_t i = 2; i < noiseScaleDeg; i++) { currPowP = CKKSPackedEncoding::CRTMult(currPowP, crtPowP, moduli); @@ -2233,7 +2101,7 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co uint32_t level, uint32_t slots) const { const auto cryptoParams = std::dynamic_pointer_cast(cc.GetCryptoParameters()); - double scFact = cryptoParams->GetScalingFactorReal(level); + const double scFact = cryptoParams->GetScalingFactorReal(level); Plaintext p = Plaintext(std::make_shared(params, cc.GetEncodingParams(), value, noiseScaleDeg, level, scFact, slots, COMPLEX)); @@ -2243,7 +2111,6 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co uint32_t N = cc.GetRingDimension(); std::vector> inverse = value; - inverse.resize(slots); DiscreteFourierTransform::FFTSpecialInv(inverse, N * 2); @@ -2256,12 +2123,12 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co for (uint32_t i = 0; i < slots; ++i) { inverse[i] *= powP; if (inverse[i].real() != 0) { - int32_t logci = static_cast(ceil(log2(std::abs(inverse[i].real())))); + int32_t logci = static_cast(std::ceil(std::log2(std::abs(inverse[i].real())))); if (logc < logci) logc = logci; } if (inverse[i].imag() != 0) { - int32_t logci = static_cast(ceil(log2(std::abs(inverse[i].imag())))); + int32_t logci = static_cast(std::ceil(std::log2(std::abs(inverse[i].imag())))); if (logc < logci) logc = logci; } @@ -2272,11 +2139,11 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co int32_t logValid = (logc <= MAX_BITS_IN_WORD) ? logc : MAX_BITS_IN_WORD; int32_t logApprox = logc - logValid; - double approxFactor = pow(2, logApprox); + double approxFactor = std::pow(2, logApprox); - std::vector temp(2 * slots); + std::vector tmp(2 * slots); - for (size_t i = 0; i < slots; ++i) { + for (uint32_t i = 0; i < slots; ++i) { // Scale down by approxFactor in case the value exceeds a 64-bit integer. 
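// Standalone sketch (not part of the patch): the scale-down step above. When the scaled slot
// values would overflow a 64-bit word, the encoder clamps the usable bit width, divides by
// approxFactor = 2^logApprox before rounding, and multiplies the factor back in later via CRT.
// MAX_BITS_IN_WORD_DEMO is a placeholder, not the library constant.
#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
    const int32_t MAX_BITS_IN_WORD_DEMO = 62;           // assumed bound
    const double powP  = std::pow(2.0, 80);             // scaling factor wider than 64 bits
    const double value = 1.25;                           // one (real) slot value

    const double scaled      = value * powP;
    const int32_t logc       = static_cast<int32_t>(std::ceil(std::log2(std::abs(scaled))));
    const int32_t logValid   = (logc <= MAX_BITS_IN_WORD_DEMO) ? logc : MAX_BITS_IN_WORD_DEMO;
    const int32_t logApprox  = logc - logValid;
    const double approxFactor = std::pow(2.0, logApprox);

    // Rounding the reduced value now fits comfortably in int64_t.
    const int64_t rounded = std::llround(scaled / approxFactor);
    std::cout << "logc=" << logc << " logApprox=" << logApprox << " rounded=" << rounded << '\n';
}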
double dre = inverse[i].real() / approxFactor; double dim = inverse[i].imag() / approxFactor; @@ -2291,27 +2158,22 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co double realMax = -1, imagMax = -1; uint32_t realMaxIdx = -1, imagMaxIdx = -1; - for (uint32_t idx = 0; idx < inverse.size(); idx++) { - // exp( j*2*pi*n*k/N ) - std::complex expFactor = {cos((factor * idx) / invLen), sin((factor * idx) / invLen)}; - + for (uint32_t idx = 0; idx < inverse.size(); ++idx) { // X[k] * exp( j*2*pi*n*k/N ) - std::complex prodFactor = inverse[idx] * expFactor; + auto prodFactor = inverse[idx] * std::complex{std::cos((factor * idx) / invLen), + std::sin((factor * idx) / invLen)}; - double realVal = prodFactor.real(); - double imagVal = prodFactor.imag(); - - if (realVal > realMax) { - realMax = realVal; + if (prodFactor.real() > realMax) { + realMax = prodFactor.real(); realMaxIdx = idx; } - if (imagVal > imagMax) { - imagMax = imagVal; + if (prodFactor.imag() > imagMax) { + imagMax = prodFactor.imag(); imagMaxIdx = idx; } } - auto scaledInputSize = ceil(log2(dre)); + auto scaledInputSize = std::ceil(std::log2(dre)); std::stringstream buffer; buffer << std::endl @@ -2327,18 +2189,18 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co } int64_t re = std::llround(dre); - int64_t im = std::llround(dim); + tmp[i] = (re < 0) ? Max64BitValue() + re : re; - temp[i] = (re < 0) ? Max64BitValue() + re : re; - temp[i + slots] = (im < 0) ? Max64BitValue() + im : im; + int64_t im = std::llround(dim); + tmp[i + slots] = (im < 0) ? Max64BitValue() + im : im; } - const std::shared_ptr> bigParams = plainElement.GetParams(); - const std::vector>& nativeParams = bigParams->GetParams(); + const auto& bigParams = plainElement.GetParams(); + const auto& nativeParams = bigParams->GetParams(); - for (size_t i = 0; i < nativeParams.size(); i++) { + for (size_t i = 0; i < nativeParams.size(); ++i) { NativeVector nativeVec(N, nativeParams[i]->GetModulus()); - FitToNativeVector(N, temp, Max64BitValue(), &nativeVec); + FitToNativeVector(N, tmp, Max64BitValue(), &nativeVec); NativePoly element = plainElement.GetElementAtIndex(i); element.SetValues(std::move(nativeVec), Format::COEFFICIENT); plainElement.SetElementAtIndex(i, std::move(element)); @@ -2346,7 +2208,7 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co uint32_t numTowers = nativeParams.size(); std::vector moduli(numTowers); - for (uint32_t i = 0; i < numTowers; i++) { + for (uint32_t i = 0; i < numTowers; ++i) { moduli[i] = nativeParams[i]->GetModulus(); } @@ -2355,7 +2217,7 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co cryptoParams->GetScalingTechnique() == COMPOSITESCALINGMANUAL) { // Duhyeong: Support the case powP > 2^64 // Later we might need to use the NATIVE_INT=128 version of FHECKKSRNS::MakeAuxPlaintext for higher precision - int32_t logPowP = static_cast(ceil(log2(fabs(powP)))); + int32_t logPowP = static_cast(std::ceil(std::log2(std::abs(powP)))); if (logPowP > 64) { // Compute approxFactor, a value to scale down by, in case the value exceeds a 64-bit integer. @@ -2364,17 +2226,17 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co LargeScalingFactorConstants::MAX_BITS_IN_WORD; int32_t logApprox_PowP = logPowP - logValid; if (logApprox_PowP > 0) { - int32_t logStep = (logApprox <= LargeScalingFactorConstants::MAX_LOG_STEP) ? 
- logApprox_PowP : - LargeScalingFactorConstants::MAX_LOG_STEP; - DCRTPoly::Integer intStep = static_cast(1) << logStep; + int32_t logStep = (logApprox <= LargeScalingFactorConstants::MAX_LOG_STEP) ? + logApprox_PowP : + LargeScalingFactorConstants::MAX_LOG_STEP; + auto intStep = DCRTPoly::Integer(1) << logStep; std::vector crtApprox(numTowers, intStep); logApprox_PowP -= logStep; while (logApprox_PowP > 0) { - int32_t logStep = (logApprox <= LargeScalingFactorConstants::MAX_LOG_STEP) ? - logApprox : - LargeScalingFactorConstants::MAX_LOG_STEP; - DCRTPoly::Integer intStep = static_cast(1) << logStep; + int32_t logStep = (logApprox <= LargeScalingFactorConstants::MAX_LOG_STEP) ? + logApprox : + LargeScalingFactorConstants::MAX_LOG_STEP; + auto intStep = DCRTPoly::Integer(1) << logStep; std::vector crtStep(numTowers, intStep); crtApprox = CKKSPackedEncoding::CRTMult(crtApprox, crtStep, moduli); logApprox_PowP -= logStep; @@ -2382,7 +2244,7 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co crtPowP = CKKSPackedEncoding::CRTMult(crtPowP, crtApprox, moduli); } else { - double approxFactor = pow(2, logApprox_PowP); + double approxFactor = std::pow(2, logApprox_PowP); DCRTPoly::Integer intPowP{static_cast(std::llround(powP / approxFactor))}; crtPowP = std::vector(numTowers, intPowP); } @@ -2399,10 +2261,10 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co auto currPowP = crtPowP; - // We want to scale temp by 2^(pd), and the loop starts from j=2 - // because temp is already scaled by 2^p in the re/im loop above, + // We want to scale tmp by 2^(pd), and the loop starts from j=2 + // because tmp is already scaled by 2^p in the re/im loop above, // and currPowP already is 2^p. - for (size_t i = 2; i < noiseScaleDeg; i++) + for (size_t i = 2; i < noiseScaleDeg; ++i) currPowP = CKKSPackedEncoding::CRTMult(currPowP, crtPowP, moduli); if (noiseScaleDeg > 1) plainElement = plainElement.Times(currPowP); @@ -2410,13 +2272,13 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co // Scale back up by the approxFactor to get the correct encoding. if (logApprox > 0) { int32_t logStep = (logApprox <= MAX_LOG_STEP) ? logApprox : MAX_LOG_STEP; - auto intStep = DCRTPoly::Integer(static_cast(1) << logStep); + auto intStep = DCRTPoly::Integer(1) << logStep; std::vector crtApprox(numTowers, intStep); logApprox -= logStep; while (logApprox > 0) { logStep = (logApprox <= MAX_LOG_STEP) ? 
logApprox : MAX_LOG_STEP; - intStep = DCRTPoly::Integer(static_cast(1) << logStep); + intStep = DCRTPoly::Integer(1) << logStep; std::vector crtSF(numTowers, intStep); crtApprox = CKKSPackedEncoding::CRTMult(crtApprox, crtSF, moduli); logApprox -= logStep; @@ -2433,7 +2295,6 @@ Plaintext FHECKKSRNS::MakeAuxPlaintext(const CryptoContextImpl& cc, co Ciphertext FHECKKSRNS::EvalMultExt(ConstCiphertext ciphertext, ConstPlaintext plaintext) const { auto pt = plaintext->GetElement(); pt.SetFormat(Format::EVALUATION); - auto result = ciphertext->Clone(); for (auto& c : result->GetElements()) c *= pt; @@ -2461,12 +2322,10 @@ EvalKey FHECKKSRNS::ConjugateKeyGen(const PrivateKey private uint32_t N = privateKey->GetPrivateElement().GetRingDimension(); std::vector vec(N); PrecomputeAutoMap(N, 2 * N - 1, &vec); - const auto cc = privateKey->GetCryptoContext(); auto pkPermuted = std::make_shared>(cc); pkPermuted->SetPrivateElement(privateKey->GetPrivateElement().AutomorphismTransform(2 * N - 1, vec)); pkPermuted->SetKeyTag(privateKey->GetKeyTag()); - return cc->GetScheme()->KeySwitchGen(privateKey, pkPermuted); } @@ -2496,7 +2355,7 @@ void FHECKKSRNS::FitToNativeVector(uint32_t ringDim, const std::vector& NativeInteger diff = bigBound - modulus; uint32_t dslots = vec.size(); uint32_t gap = ringDim / dslots; - for (uint32_t i = 0; i < vec.size(); i++) { + for (uint32_t i = 0; i < dslots; ++i) { NativeInteger n(vec[i]); if (n > bigValueHf) { (*nativeVec)[gap * i] = n.ModSub(diff, modulus); @@ -2517,7 +2376,7 @@ void FHECKKSRNS::FitToNativeVector(uint32_t ringDim, const std::vector NativeInteger diff = NativeInteger((uint128_t)bigBound) - modulus; uint32_t dslots = vec.size(); uint32_t gap = ringDim / dslots; - for (uint32_t i = 0; i < vec.size(); i++) { + for (uint32_t i = 0; i < dslots; ++i) { NativeInteger n((uint128_t)vec[i]); if (n > bigValueHf) { (*nativeVec)[gap * i] = n.ModSub(diff, modulus); @@ -2548,8 +2407,6 @@ void FHECKKSRNS::EvalFBTSetupInternal(const CryptoContextImpl& cc, con auto& precom = m_bootPrecomMap[slots]; precom->m_slots = slots; - precom->m_dim1 = dim1[0]; - precom->m_gs = dim1[1]; // even for the case of a single slot we need one level for rescaling uint32_t logSlots = (slots < 3) ? 1 : std::log2(slots); @@ -2562,8 +2419,6 @@ void FHECKKSRNS::EvalFBTSetupInternal(const CryptoContextImpl& cc, con if (levelBudget[0] < 1 || levelBudget[1] < 1) OPENFHE_THROW("The level budget cannot be zero. Please set it to be at least one and at most log(slots)."); - precom->m_levelEnc = levelBudget[0]; - precom->m_levelDec = levelBudget[1]; precom->m_paramsEnc = GetCollapsedFFTParams(slots, levelBudget[0], dim1[0]); precom->m_paramsDec = GetCollapsedFFTParams(slots, levelBudget[1], dim1[1]); @@ -2700,7 +2555,7 @@ Ciphertext FHECKKSRNS::EvalHomDecoding(ConstCiphertext& ciph // linear transform for decoding auto slots = ciphertext->GetSlots(); auto& p = GetBootPrecom(slots); - auto isLTBS = (p.m_levelEnc == 1) && (p.m_levelDec == 1); + auto isLTBS = (p.m_paramsEnc.lvlb == 1) && (p.m_paramsDec.lvlb == 1); auto ctxtDec = (isLTBS) ? EvalLinearTransform(p.m_U0Pre, ctxtEnc) : EvalSlotsToCoeffs(p.m_U0PreFFT, ctxtEnc); if (slots != cc->GetCyclotomicOrder() / 4) { @@ -2716,6 +2571,7 @@ Ciphertext FHECKKSRNS::EvalHomDecoding(ConstCiphertext& ciph cc->GetScheme()->MultByIntegerInPlace(ctxtDec, postScaling); cc->ModReduceInPlace(ctxtDec); + // 64-bit only: No need to scale back the message to its original scale. 
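// Standalone sketch (not part of the patch): scaling back up by 2^logApprox in chunks, as in
// the loop above. Each chunk of at most MAX_LOG_STEP bits becomes one small constant, so every
// per-modulus multiplication stays within native range. The moduli below are toy values, not
// actual NTT-friendly primes.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const int32_t MAX_LOG_STEP_DEMO = 15;                                // assumed chunk size
    int32_t logApprox = 40;                                              // total bits to restore
    const std::vector<uint64_t> moduli = {1099511627689ULL, 1099511627581ULL};

    // crtApprox starts at 1 mod each q_i and accumulates 2^logApprox chunk by chunk.
    std::vector<uint64_t> crtApprox(moduli.size(), 1);
    while (logApprox > 0) {
        const int32_t logStep = (logApprox <= MAX_LOG_STEP_DEMO) ? logApprox : MAX_LOG_STEP_DEMO;
        const uint64_t step   = uint64_t(1) << logStep;                  // fits in a native word
        for (size_t i = 0; i < moduli.size(); ++i)
            crtApprox[i] = (crtApprox[i] * step) % moduli[i];            // element-wise CRT multiply
        logApprox -= logStep;
    }
    for (auto r : crtApprox) std::cout << r << ' ';                      // residues of 2^40 mod q_i
    std::cout << '\n';
}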
return ctxtDec; } @@ -2771,30 +2627,30 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecomputeInternal( raised = KeySwitchSparse(raised, evalKeyMap.at(2 * N - 4)); // Only level 0 ciphertext used here. Other towers ignored to make CKKS bootstrapping faster. - auto& ctxtDCRT = raised->GetElements(); - for (auto& poly : ctxtDCRT) { - poly.SetFormat(COEFFICIENT); - DCRTPoly temp(elementParamsRaisedPtr, COEFFICIENT); - temp = poly.GetElementAtIndex(0); - temp.SetFormat(EVALUATION); - poly = std::move(temp); + auto& ctxtDCRTs = raised->GetElements(); + + for (auto& dcrt : ctxtDCRTs) { + dcrt.SetFormat(COEFFICIENT); + DCRTPoly tmp(dcrt.GetElementAtIndex(0), elementParamsRaisedPtr); + tmp.SetFormat(EVALUATION); + dcrt = std::move(tmp); } - raised->SetLevel(L0 - ctxtDCRT[0].GetNumOfElements()); + raised->SetLevel(L0 - ctxtDCRTs[0].GetNumOfElements()); // go back to a denser secret algo->KeySwitchInPlace(raised, evalKeyMap.at(2 * N - 2)); } else { // Only level 0 ciphertext used here. Other towers ignored to make CKKS bootstrapping faster. - auto& ctxtDCRT = raised->GetElements(); - for (auto& poly : ctxtDCRT) { - poly.SetFormat(COEFFICIENT); - DCRTPoly temp(elementParamsRaisedPtr, COEFFICIENT); - temp = poly.GetElementAtIndex(0); - temp.SetFormat(EVALUATION); - poly = std::move(temp); + auto& ctxtDCRTs = raised->GetElements(); + + for (auto& dcrt : ctxtDCRTs) { + dcrt.SetFormat(COEFFICIENT); + DCRTPoly tmp(dcrt.GetElementAtIndex(0), elementParamsRaisedPtr); + tmp.SetFormat(EVALUATION); + dcrt = std::move(tmp); } - raised->SetLevel(L0 - ctxtDCRT[0].GetNumOfElements()); + raised->SetLevel(L0 - ctxtDCRTs[0].GetNumOfElements()); } #ifdef BOOTSTRAPTIMING @@ -2809,8 +2665,7 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecomputeInternal( auto skd = cryptoParams->GetSecretKeyDist(); double k = (skd == SPARSE_TERNARY || skd == SPARSE_ENCAPSULATED) ? 1.0 : K_UNIFORM; - double constantEvalMult = 1.0 / (k * N); - cc->EvalMultInPlace(raised, constantEvalMult); + cc->EvalMultInPlace(raised, 1.0 / (k * N)); // no linear transformations are needed for Chebyshev series as the range has been normalized to [-1,1] double coeffLowerBound = -1.0; @@ -2818,7 +2673,7 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecomputeInternal( auto slots = ciphertext->GetSlots(); auto& p = GetBootPrecom(slots); - bool isLTBootstrap = (p.m_levelEnc == 1) && (p.m_levelDec == 1); + bool isLTBootstrap = (p.m_paramsEnc.lvlb == 1) && (p.m_paramsDec.lvlb == 1); std::vector> ctxtEnc; std::shared_ptr> ctxtPowers; @@ -2865,21 +2720,21 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecomputeInternal( if (digitBitSize == 1 && order == 1) { auto& coeff_cos = (skd == SPARSE_ENCAPSULATED) ? 
coeff_cos_16_double : coeff_cos_25_double; - ctxtEnc[0] = cc->EvalChebyshevSeries(ctxtEnc[0], coeff_cos, coeffLowerBound, coeffUpperBound); - ctxtEnc[1] = cc->EvalChebyshevSeries(ctxtEnc[1], coeff_cos, coeffLowerBound, coeffUpperBound); + ctxtEnc[0] = algo->EvalChebyshevSeries(ctxtEnc[0], coeff_cos, coeffLowerBound, coeffUpperBound); + ctxtEnc[1] = algo->EvalChebyshevSeries(ctxtEnc[1], coeff_cos, coeffLowerBound, coeffUpperBound); // Double angle-iterations to get cos(pi*x) cc->EvalSquareInPlace(ctxtEnc[0]); cc->EvalAddInPlaceNoCheck(ctxtEnc[0], ctxtEnc[0]); cc->EvalSubInPlace(ctxtEnc[0], 1.0); cc->ModReduceInPlace(ctxtEnc[0]); // cos(pi x) + cc->EvalSquareInPlace(ctxtEnc[0]); + cc->ModReduceInPlace(ctxtEnc[0]); // cos^2(pi x) + cc->EvalSquareInPlace(ctxtEnc[1]); cc->EvalAddInPlaceNoCheck(ctxtEnc[1], ctxtEnc[1]); cc->EvalSubInPlace(ctxtEnc[1], 1.0); cc->ModReduceInPlace(ctxtEnc[1]); // cos(pi x) - - cc->EvalSquareInPlace(ctxtEnc[0]); - cc->ModReduceInPlace(ctxtEnc[0]); // cos^2(pi x) cc->EvalSquareInPlace(ctxtEnc[1]); cc->ModReduceInPlace(ctxtEnc[1]); // cos^2(pi x) } @@ -2889,8 +2744,8 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecomputeInternal( coeff_exp_25_double_58; // Obtain exp(Pi/2*i*x) approximation via Chebyshev Basis Polynomial Interpolation - ctxtEnc[0] = cc->EvalChebyshevSeries(ctxtEnc[0], coeff_exp, coeffLowerBound, coeffUpperBound); - ctxtEnc[1] = cc->EvalChebyshevSeries(ctxtEnc[1], coeff_exp, coeffLowerBound, coeffUpperBound); + ctxtEnc[0] = algo->EvalChebyshevSeries(ctxtEnc[0], coeff_exp, coeffLowerBound, coeffUpperBound); + ctxtEnc[1] = algo->EvalChebyshevSeries(ctxtEnc[1], coeff_exp, coeffLowerBound, coeffUpperBound); // Double angle-iterations to get exp(2*Pi*i*x) cc->EvalSquareInPlace(ctxtEnc[0]); @@ -2904,8 +2759,9 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecomputeInternal( cc->ModReduceInPlace(ctxtEnc[1]); } - auto ctxtPowersRe = cc->EvalPowers(ctxtEnc[0], coefficients); - auto ctxtPowersIm = cc->EvalPowers(ctxtEnc[1], coefficients); + auto ctxtPowersRe = algo->EvalPowers(ctxtEnc[0], coefficients); + auto ctxtPowersIm = algo->EvalPowers(ctxtEnc[1], coefficients); + if (ctxtPowersRe->powers2Re.size() == 0) { ctxtPowers = std::make_shared>(ctxtPowersRe->powersRe, ctxtPowersIm->powersRe); } @@ -2924,8 +2780,9 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecomputeInternal( // Running PartialSum //------------------------------------------------------------------------------ - for (uint32_t j = 1; j < N / (2 * slots); j <<= 1) - cc->EvalAddInPlaceNoCheck(raised, cc->EvalRotate(raised, j * slots)); + const uint32_t limit = N / (2 * slots); + for (uint32_t j = 1; j < limit; j <<= 1) + cc->EvalAddInPlace(raised, cc->EvalRotate(raised, j * slots)); //------------------------------------------------------------------------------ // Running CoeffsToSlots @@ -2936,9 +2793,8 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecomputeInternal( ctxtEnc.emplace_back((isLTBootstrap) ? 
EvalLinearTransform(p.m_U0hatTPre, raised) : EvalCoeffsToSlots(p.m_U0hatTPreFFT, raised)); - auto evalKeyMap = cc->GetEvalAutomorphismKeyMap(ctxtEnc[0]->GetKeyTag()); - auto conj = Conjugate(ctxtEnc[0], evalKeyMap); - cc->EvalAddInPlaceNoCheck(ctxtEnc[0], conj); + auto& evalKeyMap = cc->GetEvalAutomorphismKeyMap(ctxtEnc[0]->GetKeyTag()); + cc->EvalAddInPlace(ctxtEnc[0], Conjugate(ctxtEnc[0], evalKeyMap)); if (cryptoParams->GetScalingTechnique() == FIXEDMANUAL) { while (ctxtEnc[0]->GetNoiseScaleDeg() > 1) @@ -2956,14 +2812,13 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecomputeInternal( if (digitBitSize == 1 && order == 1) { auto& coeff_cos = (skd == SPARSE_ENCAPSULATED) ? coeff_cos_16_double : coeff_cos_25_double; - ctxtEnc[0] = cc->EvalChebyshevSeries(ctxtEnc[0], coeff_cos, coeffLowerBound, coeffUpperBound); + ctxtEnc[0] = algo->EvalChebyshevSeries(ctxtEnc[0], coeff_cos, coeffLowerBound, coeffUpperBound); // Double angle-iterations to get cos(pi*x) cc->EvalSquareInPlace(ctxtEnc[0]); cc->EvalAddInPlaceNoCheck(ctxtEnc[0], ctxtEnc[0]); cc->EvalSubInPlace(ctxtEnc[0], 1.0); cc->ModReduceInPlace(ctxtEnc[0]); // cos(pi x) - cc->EvalSquareInPlace(ctxtEnc[0]); cc->ModReduceInPlace(ctxtEnc[0]); // cos^2(pi x) } @@ -2973,7 +2828,7 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecomputeInternal( coeff_exp_25_double_58; // Obtain exp(Pi/2*i*x) approximation via Chebyshev Basis Polynomial Interpolation - ctxtEnc[0] = cc->EvalChebyshevSeries(ctxtEnc[0], coeff_exp, coeffLowerBound, coeffUpperBound); + ctxtEnc[0] = algo->EvalChebyshevSeries(ctxtEnc[0], coeff_exp, coeffLowerBound, coeffUpperBound); // Double angle-iterations to get exp(2*Pi*i*x) cc->EvalSquareInPlace(ctxtEnc[0]); @@ -2983,7 +2838,7 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecomputeInternal( } // No need to scale the message back up after Chebyshev interpolation - ctxtPowers = cc->EvalPowers(ctxtEnc[0], coefficients); + ctxtPowers = algo->EvalPowers(ctxtEnc[0], coefficients); } // 64-bit only: No need to scale back the message to its original scale. 
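// Standalone sketch (not part of the patch): the double-angle step used above. Starting from an
// approximation of cos(pi*x/2), the update 2*y^2 - 1 yields cos(pi*x), and one more squaring
// gives cos^2(pi*x); homomorphically these are EvalSquare/EvalAdd/EvalSub followed by a rescale.
#include <cmath>
#include <cstdio>

int main() {
    const double x = 0.1875;                  // sample input in [-1, 1]
    double y = std::cos(M_PI * x / 2.0);      // what the Chebyshev series approximates

    y = 2.0 * y * y - 1.0;                    // double angle: now y == cos(pi * x)
    std::printf("cos(pi x):   %+.12f vs %+.12f\n", y, std::cos(M_PI * x));

    y = y * y;                                // one more squaring: cos^2(pi * x)
    std::printf("cos^2(pi x): %+.12f vs %+.12f\n", y, std::pow(std::cos(M_PI * x), 2));
}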
@@ -3003,7 +2858,7 @@ std::shared_ptr> FHECKKSRNS::EvalMVBPrecompute(ConstCiphe } template -Ciphertext FHECKKSRNS::EvalMVBNoDecodingInternal(const std::shared_ptr> ciphertexts, +Ciphertext FHECKKSRNS::EvalMVBNoDecodingInternal(const std::shared_ptr>& ciphertexts, const std::vector& coefficients, uint32_t digitBitSize, size_t order) { const auto cryptoParams = @@ -3014,13 +2869,13 @@ Ciphertext FHECKKSRNS::EvalMVBNoDecodingInternal(const std::shared_ptr OPENFHE_THROW("CKKS Bootstrapping is only supported for the Hybrid key switching method."); auto cc = ciphertexts->powersRe[0]->GetCryptoContext(); - uint32_t M = cc->GetCyclotomicOrder(); + uint32_t M4 = cc->GetCyclotomicOrder() / 4; uint32_t slots = ciphertexts->powersRe[0]->GetSlots(); auto algo = cc->GetScheme(); Ciphertext ctxtEnc; - if (slots == M / 4) { + if (slots == M4) { //------------------------------------------------------------------------------ // FULLY PACKED CASE //------------------------------------------------------------------------------ @@ -3069,14 +2924,13 @@ Ciphertext FHECKKSRNS::EvalMVBNoDecodingInternal(const std::shared_ptr // Take the real part // Division by 2 was already performed ctxtEnc = cc->EvalPolyWithPrecomp(ctxtPowersRe, coefficients); - cc->EvalAddInPlaceNoCheck(ctxtEnc, Conjugate(ctxtEnc, cc->GetEvalAutomorphismKeyMap(ctxtEnc->GetKeyTag()))); + cc->EvalAddInPlace(ctxtEnc, Conjugate(ctxtEnc, cc->GetEvalAutomorphismKeyMap(ctxtEnc->GetKeyTag()))); ctxtEncI = cc->EvalPolyWithPrecomp(ctxtPowersIm, coefficients); - cc->EvalAddInPlaceNoCheck(ctxtEncI, - Conjugate(ctxtEncI, cc->GetEvalAutomorphismKeyMap(ctxtEnc->GetKeyTag()))); + cc->EvalAddInPlace(ctxtEncI, Conjugate(ctxtEncI, cc->GetEvalAutomorphismKeyMap(ctxtEnc->GetKeyTag()))); } - algo->MultByMonomialInPlace(ctxtEncI, M / 4); - cc->EvalAddInPlaceNoCheck(ctxtEnc, ctxtEncI); + algo->MultByMonomialInPlace(ctxtEncI, M4); + cc->EvalAddInPlace(ctxtEnc, ctxtEncI); // No need to scale the message back up after Chebyshev interpolation } else { @@ -3189,9 +3043,8 @@ template Ciphertext FHECKKSRNS::EvalHermiteTrigSeriesInternal( ConstCiphertext& ciphertext, const std::vector>& coefficientsCheb, double a, double b, const std::vector& coefficientsHerm, size_t precomp) { - auto cc = ciphertext->GetCryptoContext(); - auto slots = ciphertext->GetSlots(); - auto& p = GetBootPrecom(slots); + auto cc = ciphertext->GetCryptoContext(); + auto& p = GetBootPrecom(ciphertext->GetSlots()); auto& ctxt_exp = (precomp == 0 || precomp == 2) ? p.m_precompExp : p.m_precompExpI; if (precomp == 0 || precomp == 1) { @@ -3208,7 +3061,6 @@ Ciphertext FHECKKSRNS::EvalHermiteTrigSeriesInternal( // Obtain the complex Hermite Trigonometric Interpolation via Power Basis Polynomial Interpolation // Coefficients are divided by 2 auto result = cc->EvalPoly(ctxt_exp, coefficientsHerm); - // Take the real part // Division by 2 was already performed cc->EvalAddInPlaceNoCheck(result, Conjugate(result, cc->GetEvalAutomorphismKeyMap(result->GetKeyTag()))); @@ -3360,6 +3212,7 @@ Ciphertext FHECKKSRNS::KeySwitchSparse(Ciphertext& ciphertex // modswitch cvRes from p*q to q, i.e., compute round(cvRes/p) mod q // In RNS, we use the technique described in Appendix B.2.2 of https://eprint.iacr.org/2021/204 for the BFV case, i.e., for t=1. 
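// Standalone sketch (not part of the patch): the slot-level effect of the recombination steps
// above. Adding a ciphertext to its conjugate doubles the real part (the series coefficients
// were pre-divided by 2), and MultByMonomialInPlace with power M/4 multiplies every slot by i,
// so the imaginary-branch result folds back into a single ciphertext.
#include <complex>
#include <cstdio>

int main() {
    using C = std::complex<double>;
    const C zRe{0.42, 0.17};   // slot value of the "real" evaluation branch (halved coefficients)
    const C zIm{0.10, -0.33};  // slot value of the "imaginary" evaluation branch

    const C re = zRe + std::conj(zRe);          // 2 * Re(zRe): real-part extraction
    const C im = zIm + std::conj(zIm);          // 2 * Re(zIm)
    const C combined = re + C{0.0, 1.0} * im;   // monomial X^{M/4} acts as multiplication by i

    std::printf("combined slot = %+.4f %+.4fi\n", combined.real(), combined.imag());
}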
+ for (uint32_t i = 0; i < 2; ++i) { auto polyP = cvRes[i].GetElementAtIndex(1); polyP.SetFormat(Format::COEFFICIENT); diff --git a/src/pke/lib/scheme/ckksrns/ckksrns-leveledshe.cpp b/src/pke/lib/scheme/ckksrns/ckksrns-leveledshe.cpp index 237368624..f4e666685 100644 --- a/src/pke/lib/scheme/ckksrns/ckksrns-leveledshe.cpp +++ b/src/pke/lib/scheme/ckksrns/ckksrns-leveledshe.cpp @@ -58,8 +58,13 @@ Ciphertext LeveledSHECKKSRNS::EvalAdd(ConstCiphertext& ciphe } void LeveledSHECKKSRNS::EvalAddInPlace(Ciphertext& ciphertext, double operand) const { - auto& cv = ciphertext->GetElements(); - cv[0] = cv[0] + GetElementForEvalAddOrSub(ciphertext, operand); + auto elmnts = GetElementForEvalAddOrSub(ciphertext, operand); + auto& polys = ciphertext->GetElements()[0].GetAllElements(); + + const uint32_t limit = polys.size(); +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(limit)) + for (uint32_t i = 0; i < limit; ++i) + polys[i] += elmnts[i]; } Ciphertext LeveledSHECKKSRNS::EvalAdd(ConstCiphertext& ciphertext, @@ -79,11 +84,11 @@ void LeveledSHECKKSRNS::EvalAddInPlace(Ciphertext& ciphertext, std::co auto posimag = operand.imag() > 0.; DCRTPoly elemsComplex(cv[0].GetParams(), Format::COEFFICIENT, true); - uint32_t sizeQl = elemsComplex.GetNumOfElements(); + const uint32_t sizeQl = elemsComplex.GetNumOfElements(); for (uint32_t i = 0; i < sizeQl; ++i) { auto element = cv[0].GetElementAtIndex(i); auto modulus = element.GetModulus(); - NativeVector vec(N, modulus.ConvertToInt()); + NativeVector vec(N, modulus); vec[0] = posreal ? NativeInteger(elemsRe[i].Mod(modulus)) : modulus.ModSub(elemsRe[i], modulus); vec[Nhalf] = posimag ? NativeInteger(elemsIm[i].Mod(modulus)) : modulus.ModSub(elemsIm[i], modulus); element.SetValues(std::move(vec), Format::COEFFICIENT); @@ -105,8 +110,13 @@ Ciphertext LeveledSHECKKSRNS::EvalSub(ConstCiphertext& ciphe } void LeveledSHECKKSRNS::EvalSubInPlace(Ciphertext& ciphertext, double operand) const { - auto& cv = ciphertext->GetElements(); - cv[0] = cv[0] - GetElementForEvalAddOrSub(ciphertext, operand); + auto elmnts = GetElementForEvalAddOrSub(ciphertext, operand); + auto& polys = ciphertext->GetElements()[0].GetAllElements(); + + const uint32_t limit = polys.size(); +#pragma omp parallel for num_threads(OpenFHEParallelControls.GetThreadLimit(limit)) + for (uint32_t i = 0; i < limit; ++i) + polys[i] -= elmnts[i]; } ///////////////////////////////////////// @@ -130,7 +140,7 @@ void LeveledSHECKKSRNS::EvalMultInPlace(Ciphertext& ciphertext, double Ciphertext LeveledSHECKKSRNS::EvalMult(ConstCiphertext& ciphertext, std::complex operand) const { - Ciphertext result = ciphertext->Clone(); + auto result = ciphertext->Clone(); EvalMultInPlace(result, operand); return result; } @@ -168,17 +178,13 @@ void LeveledSHECKKSRNS::ModReduceInternalInPlace(Ciphertext& ciphertex size_t sizeQl = cv[0].GetNumOfElements(); size_t diffQl = sizeQ - sizeQl; - for (size_t l = 0; l < levels; ++l) { - for (size_t i = 0; i < cv.size(); ++i) { - cv[i].DropLastElementAndScale(cryptoParams->GetQlQlInvModqlDivqlModq(diffQl + l), - cryptoParams->GetqlInvModq(diffQl + l)); - } - } - ciphertext->SetNoiseScaleDeg(ciphertext->GetNoiseScaleDeg() - levels / cryptoParams->GetCompositeDegree()); ciphertext->SetLevel(ciphertext->GetLevel() + levels); for (size_t i = 0; i < levels; ++i) { + for (auto& dcrtpoly : cv) + dcrtpoly.DropLastElementAndScale(cryptoParams->GetQlQlInvModqlDivqlModq(diffQl + i), + cryptoParams->GetqlInvModq(diffQl + i)); double modReduceFactor = 
cryptoParams->GetModReduceFactor(sizeQl - 1 - i); ciphertext->SetScalingFactor(ciphertext->GetScalingFactor() / modReduceFactor); } @@ -210,13 +216,12 @@ std::vector LeveledSHECKKSRNS::GetElementForEvalAddOrSub(Cons uint32_t precision = 52; double powP = std::pow(2, precision); - const std::vector& cv = ciphertext->GetElements(); - uint32_t numTowers = cv[0].GetNumOfElements(); + const auto& cv = ciphertext->GetElements(); + uint32_t numTowers = cv[0].GetNumOfElements(); std::vector moduli(numTowers); - for (uint32_t i = 0; i < numTowers; i++) { + for (uint32_t i = 0; i < numTowers; ++i) moduli[i] = cv[0].GetElementAtIndex(i).GetModulus(); - } // the idea is to break down real numbers // expressed as input_mantissa * 2^input_exponent @@ -251,21 +256,18 @@ std::vector LeveledSHECKKSRNS::GetElementForEvalAddOrSub(Cons std::vector currPowP(numTowers, scaledConstant); // multiply c*powP with powP a total of (depth-1) times to get c*powP^d - for (size_t i = 0; i < ciphertext->GetNoiseScaleDeg() - 1; i++) { + for (uint32_t i = 0; i < ciphertext->GetNoiseScaleDeg() - 1; ++i) currPowP = CKKSPackedEncoding::CRTMult(currPowP, crtPowP, moduli); - } - return currPowP; } #else // NATIVEINT == 64 std::vector LeveledSHECKKSRNS::GetElementForEvalAddOrSub(ConstCiphertext& ciphertext, double operand) const { - const std::vector& cv = ciphertext->GetElements(); - uint32_t sizeQl = cv[0].GetNumOfElements(); + const auto& polys = ciphertext->GetElements()[0].GetAllElements(); + const uint32_t sizeQl = polys.size(); std::vector moduli(sizeQl); - for (uint32_t i = 0; i < sizeQl; i++) { - moduli[i] = cv[0].GetElementAtIndex(i).GetModulus(); - } + for (uint32_t i = 0; i < sizeQl; ++i) + moduli[i] = polys[i].GetModulus(); const auto cryptoParams = std::dynamic_pointer_cast(ciphertext->GetCryptoParameters()); @@ -346,47 +348,42 @@ std::vector LeveledSHECKKSRNS::GetElementForEvalAddOrSub(Cons if (logSF_cp < 64) { DCRTPoly::Integer intScFactor = static_cast(scFactor + 0.5); std::vector crtScFactor(sizeQl, intScFactor); - for (uint32_t i = 1; i < ciphertext->GetNoiseScaleDeg(); i++) { + for (uint32_t i = 1; i < ciphertext->GetNoiseScaleDeg(); ++i) crtConstant = CKKSPackedEncoding::CRTMult(crtConstant, crtScFactor, moduli); - } } else { // Multiply scFactor in two steps: scFactor / approxFactor and then approxFactor DCRTPoly::Integer intScFactor = static_cast(scFactor / approxFactor + 0.5); std::vector crtScFactor(sizeQl, intScFactor); - for (uint32_t i = 1; i < ciphertext->GetNoiseScaleDeg(); i++) { + for (uint32_t i = 1; i < ciphertext->GetNoiseScaleDeg(); ++i) crtConstant = CKKSPackedEncoding::CRTMult(crtConstant, crtScFactor, moduli); - } if (logApprox_cp > 0) { - int32_t logStep = (logApprox_cp <= LargeScalingFactorConstants::MAX_LOG_STEP) ? - logApprox_cp : - LargeScalingFactorConstants::MAX_LOG_STEP; - DCRTPoly::Integer intStep = static_cast(1) << logStep; + int32_t logStep = (logApprox_cp <= LargeScalingFactorConstants::MAX_LOG_STEP) ? + logApprox_cp : + LargeScalingFactorConstants::MAX_LOG_STEP; + auto intStep = DCRTPoly::Integer(1) << logStep; std::vector crtApprox(sizeQl, intStep); logApprox_cp -= logStep; while (logApprox_cp > 0) { - int32_t logStep = (logApprox_cp <= LargeScalingFactorConstants::MAX_LOG_STEP) ? - logApprox_cp : - LargeScalingFactorConstants::MAX_LOG_STEP; - DCRTPoly::Integer intStep = static_cast(1) << logStep; + int32_t logStep = (logApprox_cp <= LargeScalingFactorConstants::MAX_LOG_STEP) ? 
+ logApprox_cp : + LargeScalingFactorConstants::MAX_LOG_STEP; + auto intStep = DCRTPoly::Integer(1) << logStep; std::vector crtSF(sizeQl, intStep); crtApprox = CKKSPackedEncoding::CRTMult(crtApprox, crtSF, moduli); logApprox_cp -= logStep; } - for (uint32_t i = 1; i < ciphertext->GetNoiseScaleDeg(); i++) { + for (uint32_t i = 1; i < ciphertext->GetNoiseScaleDeg(); ++i) crtConstant = CKKSPackedEncoding::CRTMult(crtConstant, crtApprox, moduli); - } } } } else { DCRTPoly::Integer intScFactor = static_cast(scFactor + 0.5); std::vector crtScFactor(sizeQl, intScFactor); - - for (uint32_t i = 1; i < ciphertext->GetNoiseScaleDeg(); i++) { + for (uint32_t i = 1; i < ciphertext->GetNoiseScaleDeg(); ++i) crtConstant = CKKSPackedEncoding::CRTMult(crtConstant, crtScFactor, moduli); - } } return crtConstant; @@ -420,8 +417,8 @@ std::vector LeveledSHECKKSRNS::GetElementForEvalMult(ConstCip scaled128 = ppRemaining * scaled64; } - const std::vector& cv = ciphertext->GetElements(); - uint32_t numTowers = cv[0].GetNumOfElements(); + const auto& cv = ciphertext->GetElements(); + uint32_t numTowers = cv[0].GetNumOfElements(); std::vector factors(numTowers); for (uint32_t i = 0; i < numTowers; i++) { @@ -448,9 +445,8 @@ std::vector LeveledSHECKKSRNS::GetElementForEvalMult(ConstCip const std::vector& cv = ciphertext->GetElements(); uint32_t numTowers = cv[0].GetNumOfElements(); std::vector moduli(numTowers); - for (uint32_t i = 0; i < numTowers; i++) { + for (uint32_t i = 0; i < numTowers; ++i) moduli[i] = cv[0].GetElementAtIndex(i).GetModulus(); - } double scFactor = cryptoParams->GetScalingFactorReal(ciphertext->GetLevel()); @@ -519,10 +515,10 @@ std::vector LeveledSHECKKSRNS::GetElementForEvalMult(ConstCip logApprox -= logStep; while (logApprox > 0) { - int32_t logStep = (logApprox <= LargeScalingFactorConstants::MAX_LOG_STEP) ? - logApprox : - LargeScalingFactorConstants::MAX_LOG_STEP; - DCRTPoly::Integer intStep = static_cast(1) << logStep; + int32_t logStep = (logApprox <= LargeScalingFactorConstants::MAX_LOG_STEP) ? + logApprox : + LargeScalingFactorConstants::MAX_LOG_STEP; + auto intStep = DCRTPoly::Integer(1) << logStep; std::vector crtSF(numTowers, intStep); crtApprox = CKKSPackedEncoding::CRTMult(crtApprox, crtSF, moduli); logApprox -= logStep; @@ -543,49 +539,45 @@ Ciphertext LeveledSHECKKSRNS::EvalFastRotationExt( // return result; // } - const auto cc = ciphertext->GetCryptoContext(); - const auto cryptoParams = std::dynamic_pointer_cast(ciphertext->GetCryptoParameters()); - uint32_t N = cryptoParams->GetElementParams()->GetRingDimension(); - uint32_t M = cryptoParams->GetElementParams()->GetCyclotomicOrder(); + const uint32_t M = cryptoParams->GetElementParams()->GetCyclotomicOrder(); // Find the automorphism index that corresponds to rotation index index. - uint32_t autoIndex = FindAutomorphismIndex2nComplex(index, M); + const uint32_t autoIndex = FindAutomorphismIndex2nComplex(index, M); // Retrieve the automorphism key that corresponds to the auto index. 
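// Standalone sketch (not part of the patch): the core idea behind GetElementForEvalAddOrSub in
// the 64-bit path above. The double operand is scaled by the current scaling factor, rounded,
// and reduced modulo every RNS modulus, with negatives wrapping to q_i - |v|. The real routine
// additionally raises the constant to the ciphertext's noise scale degree and handles scaling
// factors above 2^64; the moduli and factor below are assumed toy values.
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
    const double operand  = -0.75;
    const double scFactor = std::pow(2.0, 40);
    const std::vector<uint64_t> moduli = {1099511590913ULL, 1099511592961ULL};

    const int64_t scaled  = std::llround(operand * scFactor);            // signed integer constant
    const uint64_t absVal = (scaled < 0) ? uint64_t(-scaled) : uint64_t(scaled);

    std::vector<uint64_t> crtConstant(moduli.size());
    for (size_t i = 0; i < moduli.size(); ++i) {
        const uint64_t mag = absVal % moduli[i];
        crtConstant[i]     = (scaled < 0 && mag != 0) ? moduli[i] - mag : mag;  // residue added to c0 later
    }
    for (auto r : crtConstant) std::cout << r << ' ';
    std::cout << '\n';
}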
auto evalKeyIterator = evalKeys.find(autoIndex); - if (evalKeyIterator == evalKeys.end()) { + if (evalKeyIterator == evalKeys.end()) OPENFHE_THROW("EvalKey for index [" + std::to_string(autoIndex) + "] is not found."); - } - auto evalKey = evalKeyIterator->second; + auto& evalKey = evalKeyIterator->second; - const std::vector& cv = ciphertext->GetElements(); - const auto paramsQl = cv[0].GetParams(); - - auto algo = cc->GetScheme(); + const auto& cv = ciphertext->GetElements(); + const auto paramsQl = cv[0].GetParams(); - std::shared_ptr> cTilda = algo->EvalFastKeySwitchCoreExt(digits, evalKey, paramsQl); + const auto cc = ciphertext->GetCryptoContext(); + auto cTilda = *cc->GetScheme()->EvalFastKeySwitchCoreExt(digits, evalKey, paramsQl); if (addFirst) { - const auto paramsQlP = (*cTilda)[0].GetParams(); - size_t sizeQl = paramsQl->GetParams().size(); - DCRTPoly psiC0 = DCRTPoly(paramsQlP, Format::EVALUATION, true); - auto cMult = ciphertext->GetElements()[0].TimesNoCheck(cryptoParams->GetPModq()); - for (uint32_t i = 0; i < sizeQl; i++) { + DCRTPoly psiC0(cTilda[0].GetParams(), Format::EVALUATION, true); + auto cMult = cv[0].TimesNoCheck(cryptoParams->GetPModq()); + + const uint32_t sizeQl = paramsQl->GetParams().size(); + for (uint32_t i = 0; i < sizeQl; ++i) psiC0.SetElementAtIndex(i, std::move(cMult.GetElementAtIndex(i))); - } - (*cTilda)[0] += psiC0; + + cTilda[0] += psiC0; } + const uint32_t N = cryptoParams->GetElementParams()->GetRingDimension(); std::vector vec(N); PrecomputeAutoMap(N, autoIndex, &vec); - (*cTilda)[0] = (*cTilda)[0].AutomorphismTransform(autoIndex, vec); - (*cTilda)[1] = (*cTilda)[1].AutomorphismTransform(autoIndex, vec); + cTilda[0] = cTilda[0].AutomorphismTransform(autoIndex, vec); + cTilda[1] = cTilda[1].AutomorphismTransform(autoIndex, vec); auto result = ciphertext->CloneEmpty(); - result->SetElements({std::move((*cTilda)[0]), std::move((*cTilda)[1])}); + result->SetElements(std::move(cTilda)); return result; } @@ -603,21 +595,20 @@ Ciphertext LeveledSHECKKSRNS::MultByInteger(ConstCiphertext& } void LeveledSHECKKSRNS::MultByIntegerInPlace(Ciphertext& ciphertext, uint64_t integer) const { - std::vector& cv = ciphertext->GetElements(); - - for (uint32_t i = 0; i < cv.size(); i++) + auto& cv = ciphertext->GetElements(); + for (uint32_t i = 0; i < cv.size(); ++i) cv[i] = cv[i].Times(NativeInteger(integer)); } void LeveledSHECKKSRNS::AdjustLevelsAndDepthInPlace(Ciphertext& ciphertext1, Ciphertext& ciphertext2) const { + const uint32_t c1lvl = ciphertext1->GetLevel(); + const uint32_t c2lvl = ciphertext2->GetLevel(); + const uint32_t c1depth = ciphertext1->GetNoiseScaleDeg(); + const uint32_t c2depth = ciphertext2->GetNoiseScaleDeg(); + const uint32_t sizeQl1 = ciphertext1->GetElements()[0].GetNumOfElements(); + const uint32_t sizeQl2 = ciphertext2->GetElements()[0].GetNumOfElements(); const auto cryptoParams = std::dynamic_pointer_cast(ciphertext1->GetCryptoParameters()); - uint32_t c1lvl = ciphertext1->GetLevel(); - uint32_t c2lvl = ciphertext2->GetLevel(); - uint32_t c1depth = ciphertext1->GetNoiseScaleDeg(); - uint32_t c2depth = ciphertext2->GetNoiseScaleDeg(); - auto sizeQl1 = ciphertext1->GetElements()[0].GetNumOfElements(); - auto sizeQl2 = ciphertext2->GetElements()[0].GetNumOfElements(); uint32_t compositeDegree = cryptoParams->GetCompositeDegree(); if (c1lvl < c2lvl) { @@ -627,14 +618,12 @@ void LeveledSHECKKSRNS::AdjustLevelsAndDepthInPlace(Ciphertext& cipher double scf2 = ciphertext2->GetScalingFactor(); double scf = 
cryptoParams->GetScalingFactorReal(c1lvl); double q1 = cryptoParams->GetModReduceFactor(sizeQl1 - 1); - for (uint32_t j = 1; j < compositeDegree; j++) { + for (uint32_t j = 1; j < compositeDegree; ++j) q1 *= cryptoParams->GetModReduceFactor(sizeQl1 - j - 1); - } EvalMultCoreInPlace(ciphertext1, scf2 / scf1 * q1 / scf); ModReduceInternalInPlace(ciphertext1, compositeDegree); - if (c1lvl + compositeDegree < c2lvl) { + if (c1lvl + compositeDegree < c2lvl) LevelReduceInternalInPlace(ciphertext1, c2lvl - c1lvl - compositeDegree); - } ciphertext1->SetScalingFactor(ciphertext2->GetScalingFactor()); } else { @@ -646,14 +635,12 @@ void LeveledSHECKKSRNS::AdjustLevelsAndDepthInPlace(Ciphertext& cipher double scf2 = cryptoParams->GetScalingFactorRealBig(c2lvl - compositeDegree); double scf = cryptoParams->GetScalingFactorReal(c1lvl); double q1 = cryptoParams->GetModReduceFactor(sizeQl1 - 1); - for (uint32_t j = 1; j < compositeDegree; j++) { + for (uint32_t j = 1; j < compositeDegree; ++j) q1 *= cryptoParams->GetModReduceFactor(sizeQl1 - j - 1); - } EvalMultCoreInPlace(ciphertext1, scf2 / scf1 * q1 / scf); ModReduceInternalInPlace(ciphertext1, compositeDegree); - if (c1lvl + 2 * compositeDegree < c2lvl) { + if (c1lvl + 2 * compositeDegree < c2lvl) LevelReduceInternalInPlace(ciphertext1, c2lvl - c1lvl - 2 * compositeDegree); - } ModReduceInternalInPlace(ciphertext1, compositeDegree); ciphertext1->SetScalingFactor(ciphertext2->GetScalingFactor()); } @@ -673,9 +660,8 @@ void LeveledSHECKKSRNS::AdjustLevelsAndDepthInPlace(Ciphertext& cipher double scf2 = cryptoParams->GetScalingFactorRealBig(c2lvl - compositeDegree); double scf = cryptoParams->GetScalingFactorReal(c1lvl); EvalMultCoreInPlace(ciphertext1, scf2 / scf1 / scf); - if (c1lvl + compositeDegree < c2lvl) { + if (c1lvl + compositeDegree < c2lvl) LevelReduceInternalInPlace(ciphertext1, c2lvl - c1lvl - compositeDegree); - } ModReduceInternalInPlace(ciphertext1, compositeDegree); ciphertext1->SetScalingFactor(ciphertext2->GetScalingFactor()); } @@ -688,14 +674,12 @@ void LeveledSHECKKSRNS::AdjustLevelsAndDepthInPlace(Ciphertext& cipher double scf1 = ciphertext1->GetScalingFactor(); double scf = cryptoParams->GetScalingFactorReal(c2lvl); double q2 = cryptoParams->GetModReduceFactor(sizeQl2 - 1); - for (uint32_t j = 1; j < compositeDegree; j++) { + for (uint32_t j = 1; j < compositeDegree; ++j) q2 *= cryptoParams->GetModReduceFactor(sizeQl2 - j - 1); - } EvalMultCoreInPlace(ciphertext2, scf1 / scf2 * q2 / scf); ModReduceInternalInPlace(ciphertext2, compositeDegree); - if (c2lvl + compositeDegree < c1lvl) { + if (c2lvl + compositeDegree < c1lvl) LevelReduceInternalInPlace(ciphertext2, c1lvl - c2lvl - compositeDegree); - } ciphertext2->SetScalingFactor(ciphertext1->GetScalingFactor()); } else { @@ -707,14 +691,12 @@ void LeveledSHECKKSRNS::AdjustLevelsAndDepthInPlace(Ciphertext& cipher double scf1 = cryptoParams->GetScalingFactorRealBig(c1lvl - compositeDegree); double scf = cryptoParams->GetScalingFactorReal(c2lvl); double q2 = cryptoParams->GetModReduceFactor(sizeQl2 - 1); - for (uint32_t j = 1; j < compositeDegree; j++) { + for (uint32_t j = 1; j < compositeDegree; ++j) q2 *= cryptoParams->GetModReduceFactor(sizeQl2 - j - 1); - } EvalMultCoreInPlace(ciphertext2, scf1 / scf2 * q2 / scf); ModReduceInternalInPlace(ciphertext2, compositeDegree); - if (c2lvl + 2 * compositeDegree < c1lvl) { + if (c2lvl + 2 * compositeDegree < c1lvl) LevelReduceInternalInPlace(ciphertext2, c1lvl - c2lvl - 2 * compositeDegree); - } ModReduceInternalInPlace(ciphertext2, 
compositeDegree); ciphertext2->SetScalingFactor(ciphertext1->GetScalingFactor()); } @@ -734,9 +716,8 @@ void LeveledSHECKKSRNS::AdjustLevelsAndDepthInPlace(Ciphertext& cipher double scf1 = cryptoParams->GetScalingFactorRealBig(c1lvl - compositeDegree); double scf = cryptoParams->GetScalingFactorReal(c2lvl); EvalMultCoreInPlace(ciphertext2, scf1 / scf2 / scf); - if (c2lvl + compositeDegree < c1lvl) { + if (c2lvl + compositeDegree < c1lvl) LevelReduceInternalInPlace(ciphertext2, c1lvl - c2lvl - compositeDegree); - } ModReduceInternalInPlace(ciphertext2, compositeDegree); ciphertext2->SetScalingFactor(ciphertext1->GetScalingFactor()); } @@ -765,62 +746,40 @@ void LeveledSHECKKSRNS::AdjustLevelsAndDepthToOneInPlace(Ciphertext& c } void LeveledSHECKKSRNS::EvalMultCoreInPlace(Ciphertext& ciphertext, double operand) const { - const auto cryptoParams = std::dynamic_pointer_cast(ciphertext->GetCryptoParameters()); - - auto factors = GetElementForEvalMult(ciphertext, operand); auto& cv = ciphertext->GetElements(); - uint32_t len = cv.size(); - for (uint32_t i = 0; i < len; ++i) + auto factors = GetElementForEvalMult(ciphertext, operand); + for (uint32_t i = 0; i < cv.size(); ++i) cv[i] = cv[i] * factors; + ciphertext->SetNoiseScaleDeg(ciphertext->GetNoiseScaleDeg() + 1); - double scFactor = cryptoParams->GetScalingFactorReal(ciphertext->GetLevel()); + auto cryptoParams = std::dynamic_pointer_cast(ciphertext->GetCryptoParameters()); + double scFactor = cryptoParams->GetScalingFactorReal(ciphertext->GetLevel()); ciphertext->SetScalingFactor(ciphertext->GetScalingFactor() * scFactor); } void LeveledSHECKKSRNS::EvalMultCoreInPlace(Ciphertext& ciphertext, std::complex operand) const { - const auto cryptoParams = std::dynamic_pointer_cast(ciphertext->GetCryptoParameters()); - - double operandRe = operand.real(); - double operandIm = operand.imag(); - - auto factorsRe = GetElementForEvalMult(ciphertext, operandRe); - auto factorsIm = GetElementForEvalMult(ciphertext, operandIm); - auto& cv = ciphertext->GetElements(); - std::vector cvRe; - cvRe.reserve(cv.size()); - std::vector cvIm; - cvIm.reserve(cv.size()); - for (uint32_t i = 0; i < cv.size(); ++i) { - cvRe.emplace_back(cv[i] * factorsRe); - cvIm.emplace_back(cv[i] * factorsIm); - } + auto& cv = ciphertext->GetElements(); // MultByMonomialInPlace - const auto& elemParams = cv[0].GetParams(); - const auto& paramsNative = elemParams->GetParams()[0]; - uint32_t N = elemParams->GetRingDimension(); - uint32_t M = 2 * N; - - NativePoly monomial(paramsNative, Format::COEFFICIENT, true); - - uint32_t power = M / 4; - uint32_t powerReduced = power % M; - uint32_t index = power % N; - monomial[index] = powerReduced < N ? 
NativeInteger(1) : paramsNative->GetModulus() - NativeInteger(1); + const auto& elemParams = cv[0].GetParams(); + NativePoly monomial(elemParams->GetParams()[0], Format::COEFFICIENT, true); + monomial[elemParams->GetCyclotomicOrder() >> 2] = NativeInteger(1); DCRTPoly monomialDCRT(elemParams, Format::COEFFICIENT, true); monomialDCRT = monomial; monomialDCRT.SetFormat(Format::EVALUATION); - for (uint32_t i = 0; i < cv.size(); ++i) { - cvIm[i] *= monomialDCRT; - cv[i] = cvRe[i] + cvIm[i]; - } + auto factorsRe = GetElementForEvalMult(ciphertext, operand.real()); + auto factorsIm = monomialDCRT * GetElementForEvalMult(ciphertext, operand.imag()); + + for (uint32_t i = 0; i < cv.size(); ++i) + cv[i] = (cv[i] * factorsRe) + (cv[i] * factorsIm); ciphertext->SetNoiseScaleDeg(ciphertext->GetNoiseScaleDeg() + 1); - double scFactor = cryptoParams->GetScalingFactorReal(ciphertext->GetLevel()); + auto cryptoParams = std::dynamic_pointer_cast(ciphertext->GetCryptoParameters()); + double scFactor = cryptoParams->GetScalingFactorReal(ciphertext->GetLevel()); ciphertext->SetScalingFactor(ciphertext->GetScalingFactor() * scFactor); } diff --git a/src/pke/lib/scheme/ckksrns/ckksrns-utils.cpp b/src/pke/lib/scheme/ckksrns/ckksrns-utils.cpp index 1586663a5..2696618c1 100644 --- a/src/pke/lib/scheme/ckksrns/ckksrns-utils.cpp +++ b/src/pke/lib/scheme/ckksrns/ckksrns-utils.cpp @@ -697,7 +697,7 @@ std::vector>>> CoeffDecodingCollaps return coeff; } -std::vector GetCollapsedFFTParams(uint32_t slots, uint32_t levelBudget, uint32_t dim1) { +struct ckks_boot_params GetCollapsedFFTParams(uint32_t slots, uint32_t levelBudget, uint32_t dim1) { if (slots == 0) OPENFHE_THROW("slots can not be 0"); if (levelBudget == 0) @@ -706,15 +706,15 @@ std::vector GetCollapsedFFTParams(uint32_t slots, uint32_t levelBudget, // even for the case of (slots = 1) we need one level for rescaling as (std::log2(1) = 0) uint32_t logSlots = (slots < 3) ? 1 : std::log2(slots); - std::vector dims = SelectLayers(logSlots, levelBudget); // Need to compute how many layers are collapsed in each of the level from the budget. // If there is no exact division between the maximum number of possible levels (log(slots)) and the // level budget, the last level will contain the remaining layers collapsed. - const uint32_t layersCollapse = dims[0]; - const uint32_t remCollapse = dims[2]; + auto dims = SelectLayers(logSlots, levelBudget); + uint32_t layersCollapse = dims[0]; + uint32_t remCollapse = dims[2]; - const uint32_t numRotations = (1U << (layersCollapse + 1)) - 1; - const uint32_t numRotationsRem = (1U << (remCollapse + 1)) - 1; + uint32_t numRotations = (1U << (layersCollapse + 1)) - 1; + uint32_t numRotationsRem = (1U << (remCollapse + 1)) - 1; // Computing the baby-step b and the giant-step g for the collapsed layers for decoding. uint32_t g = (dim1 == 0 || dim1 > numRotations) ? (1U << (layersCollapse / 2 + 1 + (numRotations > 7))) : dim1; @@ -723,16 +723,7 @@ std::vector GetCollapsedFFTParams(uint32_t slots, uint32_t levelBudget, uint32_t gRem = (remCollapse != 0) ? (1U << (remCollapse / 2 + 1 + (numRotationsRem > 7))) : 0; uint32_t bRem = (remCollapse != 0) ? 
(numRotationsRem + 1) / gRem : 0; - // If this return statement changes then CKKS_BOOT_PARAMS should be altered as well - return {static_cast(levelBudget), - static_cast(layersCollapse), - static_cast(remCollapse), - static_cast(numRotations), - static_cast(b), - static_cast(g), - static_cast(numRotationsRem), - static_cast(bRem), - static_cast(gRem)}; + return {levelBudget, layersCollapse, remCollapse, numRotations, b, g, numRotationsRem, bRem, gRem}; } uint32_t getRatioBSGSLT(uint32_t slots) { // returns powers of two diff --git a/src/pke/lib/schemebase/base-leveledshe.cpp b/src/pke/lib/schemebase/base-leveledshe.cpp index c56d2a4a8..f1a5fdc44 100644 --- a/src/pke/lib/schemebase/base-leveledshe.cpp +++ b/src/pke/lib/schemebase/base-leveledshe.cpp @@ -355,8 +355,8 @@ std::shared_ptr>> LeveledSHEBase::E const auto cc = privateKey->GetCryptoContext(); const auto& s = privateKey->GetPrivateElement(); - uint32_t N = s.GetRingDimension(); - uint32_t M = 2 * N; + const uint32_t N = s.GetRingDimension(); + const uint32_t M = s.GetCyclotomicOrder(); // we already have checks on higher level? // if (indexList.size() > N - 1) @@ -366,18 +366,17 @@ std::shared_ptr>> LeveledSHEBase::E // we should be able to assign values to the map without using "omp critical" as all evalKeys' elements would // have already been created auto evalKeys = std::make_shared>>(); - for (auto indx : indexList) { + for (auto indx : indexList) (*evalKeys)[indx]; - } - const size_t sz = indexList.size(); -#pragma omp parallel for - for (size_t i = 0; i < sz; ++i) { - auto privateKeyPermuted = std::make_shared>(cc); - uint32_t index = NativeInteger(indexList[i]).ModInverse(M).ConvertToInt(); + const uint32_t sz = indexList.size(); +#pragma omp parallel for + for (uint32_t i = 0; i < sz; ++i) { + auto index = NativeInteger(indexList[i]).ModInverse(M).ConvertToInt(); std::vector vec(N); PrecomputeAutoMap(N, index, &vec); + auto privateKeyPermuted = std::make_shared>(cc); privateKeyPermuted->SetPrivateElement(s.AutomorphismTransform(index, vec)); (*evalKeys)[indexList[i]] = cc->GetScheme()->KeySwitchGen(privateKey, privateKeyPermuted); } @@ -453,19 +452,19 @@ Ciphertext LeveledSHEBase::EvalFastRotation( const auto cryptoParams = ciphertext->GetCryptoParameters(); - uint32_t N = cryptoParams->GetElementParams()->GetRingDimension(); + const uint32_t N = cryptoParams->GetElementParams()->GetRingDimension(); std::vector vec(N); PrecomputeAutoMap(N, autoIndex, &vec); const auto& cv = ciphertext->GetElements(); - auto ba = cc->GetScheme()->EvalFastKeySwitchCore(digits, evalKey, cv[0].GetParams()); - (*ba)[0] += cv[0]; - (*ba)[0] = (*ba)[0].AutomorphismTransform(autoIndex, vec); - (*ba)[1] = (*ba)[1].AutomorphismTransform(autoIndex, vec); + auto ba = *cc->GetScheme()->EvalFastKeySwitchCore(digits, evalKey, cv[0].GetParams()); + ba[0] += cv[0]; + ba[0] = ba[0].AutomorphismTransform(autoIndex, vec); + ba[1] = ba[1].AutomorphismTransform(autoIndex, vec); - auto result = ciphertext->Clone(); - result->SetElements({std::move((*ba)[0]), std::move((*ba)[1])}); + auto result = ciphertext->CloneEmpty(); + result->SetElements(std::move(ba)); return result; } diff --git a/src/pke/unittest/utckksrns/UnitTestFBT.cpp b/src/pke/unittest/utckksrns/UnitTestFBT.cpp index e5ef41a3e..9dc7c5c33 100644 --- a/src/pke/unittest/utckksrns/UnitTestFBT.cpp +++ b/src/pke/unittest/utckksrns/UnitTestFBT.cpp @@ -438,6 +438,8 @@ class UTCKKSRNS_FBT : public ::testing::TestWithParam { // std::cerr << "\n=======Error count: " << std::accumulate(exact.begin(), 
exact.end(), 0) << "\n"; // std::cerr << "\n=======Max absolute error: " << *max_error_it << "\n"; checkEquality((*max_error_it), int64_t(0), 0.0001, failmsg + " LUT evaluation fails"); + + cc->ClearStaticMapsAndVectors(); } catch (std::exception& e) { std::cerr << "Exception thrown from " << __func__ << "(): " << e.what() << std::endl; @@ -675,6 +677,8 @@ class UTCKKSRNS_FBT : public ::testing::TestWithParam { levelsToDrop = lvlsToDrop; } } + + cc->ClearStaticMapsAndVectors(); } catch (std::exception& e) { std::cerr << "Exception thrown from " << __func__ << "(): " << e.what() << std::endl; @@ -888,6 +892,8 @@ class UTCKKSRNS_FBT : public ::testing::TestWithParam { // std::cerr << "\n=======Error count: " << std::accumulate(exact.begin(), exact.end(), 0) << "\n"; // std::cerr << "\n=======Max absolute error: " << *max_error_it << "\n"; checkEquality((*max_error_it), int64_t(0), 0.0001, failmsg + " LUT evaluation fails"); + + cc->ClearStaticMapsAndVectors(); } catch (std::exception& e) { std::cerr << "Exception thrown from " << __func__ << "(): " << e.what() << std::endl; @@ -1087,6 +1093,8 @@ class UTCKKSRNS_FBT : public ::testing::TestWithParam { // std::cerr << "\n=======Error count: " << std::accumulate(exact.begin(), exact.end(), 0) << "\n"; // std::cerr << "\n=======Max absolute error: " << *max_error_it << "\n"; checkEquality((*max_error_it), int64_t(0), 0.0001, failmsg + " LUT evaluation fails"); + + cc->ClearStaticMapsAndVectors(); } catch (std::exception& e) { std::cerr << "Exception thrown from " << __func__ << "(): " << e.what() << std::endl;
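// Standalone sketch (not part of the patch): how the baby-step/giant-step parameters returned by
// GetCollapsedFFTParams above relate to each other. layersCollapse and remCollapse are taken as
// given here (the library derives them via SelectLayers), and the formula for b is assumed to
// mirror the one shown for bRem.
#include <cstdint>
#include <iostream>

int main() {
    const uint32_t layersCollapse = 3;   // layers merged per full level (assumed)
    const uint32_t remCollapse    = 2;   // leftover layers in the last level (assumed)
    const uint32_t dim1           = 0;   // 0 lets the library pick the giant step

    const uint32_t numRotations    = (1U << (layersCollapse + 1)) - 1;   // 15
    const uint32_t numRotationsRem = (1U << (remCollapse + 1)) - 1;      // 7

    const uint32_t g = (dim1 == 0 || dim1 > numRotations)
                           ? (1U << (layersCollapse / 2 + 1 + (numRotations > 7)))
                           : dim1;                                        // giant step
    const uint32_t b    = (numRotations + 1) / g;                         // baby step (assumed rule)
    const uint32_t gRem = (remCollapse != 0) ? (1U << (remCollapse / 2 + 1 + (numRotationsRem > 7))) : 0;
    const uint32_t bRem = (remCollapse != 0) ? (numRotationsRem + 1) / gRem : 0;

    std::cout << "numRotations=" << numRotations << " g=" << g << " b=" << b
              << " numRotationsRem=" << numRotationsRem << " gRem=" << gRem
              << " bRem=" << bRem << '\n';
}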