
Commit b740eb6

mkannwischer authored and hanno-becker committed
SLOTHY: Superoptimize ntt.S for the Neoverse N1
This commit adds the optimized backend dev/aarch64_opt. For now, this backend differs from the clean backend only in the NTT, which is superoptimized using SLOTHY for the Neoverse N1; all other files are simple copies of the clean backend. A Makefile is added that performs the optimization, and CI is adjusted to test both the clean and the opt backend.

The first loop of the NTT can be optimized in one go. The second loop is too large, and we hence use the split heuristic.

I have experimented with the Cortex-A55 model as well. That results in significantly faster code on the A55, but a noticeable slowdown elsewhere, especially on the A72 (see performance results in the pull request). A72 performance seems more important than A55 performance.

I have also experimented with applying some other optimizations from the SLOTHY paper:

- Using st4 instead of the manual transposition
- Using scalar loads instead of vector loads

While those result in much better performance on the Cortex-A55, they slow down the code on other platforms (see the pull request for details).

The autogen script is extended to allow running the optimization through the --slothy flag.

Signed-off-by: Matthias J. Kannwischer <[email protected]>
1 parent dbebfc6 · commit b740eb6
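The commit message mentions that the autogen script now accepts a `--slothy` flag to run the optimization. A minimal sketch of that invocation, assuming the script lives at `scripts/autogen` (the exact path is an assumption, not confirmed by the commit):

```sh
# Sketch: run the SLOTHY optimization via the extended autogen script.
# The --slothy flag is named in the commit message; the script path
# scripts/autogen is an assumption.
./scripts/autogen --slothy
```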


47 files changed (+6453, -271 lines)

.github/workflows/base.yml

Lines changed: 2 additions & 3 deletions
```diff
@@ -221,9 +221,8 @@ jobs:
         backend:
           - arg: '--aarch64-clean'
             name: Clean
-          # TODO: add backend option after we have optimized/clean seperation
-          # - arg: ''
-          #   name: Optimized
+          - arg: ''
+            name: Optimized
         simplify:
           - arg: ''
             name: Simplified
```

BIBLIOGRAPHY.md

Lines changed: 6 additions & 0 deletions
```diff
@@ -158,6 +158,9 @@ source code and documentation.
 * Referenced from:
   - [dev/aarch64_clean/src/intt.S](dev/aarch64_clean/src/intt.S)
   - [dev/aarch64_clean/src/ntt.S](dev/aarch64_clean/src/ntt.S)
+  - [dev/aarch64_opt/README.md](dev/aarch64_opt/README.md)
+  - [dev/aarch64_opt/src/intt.S](dev/aarch64_opt/src/intt.S)
+  - [dev/aarch64_opt/src/ntt.S](dev/aarch64_opt/src/ntt.S)
   - [mldsa/src/native/aarch64/src/intt.S](mldsa/src/native/aarch64/src/intt.S)
   - [mldsa/src/native/aarch64/src/ntt.S](mldsa/src/native/aarch64/src/ntt.S)

@@ -266,6 +269,9 @@ source code and documentation.
 * Referenced from:
   - [dev/aarch64_clean/src/intt.S](dev/aarch64_clean/src/intt.S)
   - [dev/aarch64_clean/src/ntt.S](dev/aarch64_clean/src/ntt.S)
+  - [dev/aarch64_opt/README.md](dev/aarch64_opt/README.md)
+  - [dev/aarch64_opt/src/intt.S](dev/aarch64_opt/src/intt.S)
+  - [dev/aarch64_opt/src/ntt.S](dev/aarch64_opt/src/ntt.S)
   - [mldsa/src/native/aarch64/src/intt.S](mldsa/src/native/aarch64/src/intt.S)
   - [mldsa/src/native/aarch64/src/ntt.S](mldsa/src/native/aarch64/src/ntt.S)
```

dev/aarch64_opt/README.md

Lines changed: 15 additions & 0 deletions
New file:

```markdown
[//]: # (SPDX-License-Identifier: CC-BY-4.0)

# AArch64 backend (little endian)

This directory contains a native backend for little endian AArch64 systems. It is derived from [^NeonNTT] [^SLOTHY_Paper].

## Variants

This backend comes in two versions: "clean" and optimized. The "clean" backend is handwritten and meant to be easy to read and modify; for example, it heavily leverages register aliases and assembly macros. This directory contains the optimized version, which is automatically generated from the clean one via [SLOTHY](https://github.com/slothy-optimizer/slothy). Currently, the target architecture is Neoverse N1, but you can easily re-optimize the code for a different microarchitecture supported by SLOTHY by adjusting the parameters in the [Makefile](src/Makefile).

Performance on in-order CPUs such as the Arm Cortex-A55 can be significantly improved by re-optimizing for the specific CPU, which may, however, degrade performance on other CPUs.

<!--- bibliography --->
[^NeonNTT]: Becker, Hwang, Kannwischer, Yang, Yang: Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1, [https://eprint.iacr.org/2021/986](https://eprint.iacr.org/2021/986)
[^SLOTHY_Paper]: Abdulrahman, Becker, Kannwischer, Klein: Fast and Clean: Auditable high-performance assembly via constraint solving, [https://eprint.iacr.org/2022/1303](https://eprint.iacr.org/2022/1303)
```
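As a concrete illustration of the re-optimization mentioned above, here is a hedged sketch of targeting the Cortex-A55 instead of the Neoverse N1. The variable name `TARGET_MICROARCH` and the value `Arm_Cortex_A55` are taken from the Makefile shown further below; the invocation itself relies only on standard Make behavior:

```sh
# Rebuild the optimized sources for the Cortex-A55 (sketch).
# Overriding TARGET_MICROARCH on the command line takes precedence
# over the assignment in the Makefile.
make -C dev/aarch64_opt/src clean
make -C dev/aarch64_opt/src TARGET_MICROARCH=Arm_Cortex_A55
```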

dev/aarch64_opt/meta.h

Lines changed: 188 additions & 0 deletions
New file:

```c
/*
 * Copyright (c) The mlkem-native project authors
 * Copyright (c) The mldsa-native project authors
 * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
 */

#ifndef MLD_NATIVE_AARCH64_META_H
#define MLD_NATIVE_AARCH64_META_H

/* Set of primitives that this backend replaces */
#define MLD_USE_NATIVE_NTT
#define MLD_USE_NATIVE_INTT
#define MLD_USE_NATIVE_REJ_UNIFORM
#define MLD_USE_NATIVE_REJ_UNIFORM_ETA2
#define MLD_USE_NATIVE_REJ_UNIFORM_ETA4
#define MLD_USE_NATIVE_POLY_DECOMPOSE_32
#define MLD_USE_NATIVE_POLY_DECOMPOSE_88
#define MLD_USE_NATIVE_POLY_CADDQ
#define MLD_USE_NATIVE_POLY_USE_HINT_32
#define MLD_USE_NATIVE_POLY_USE_HINT_88
#define MLD_USE_NATIVE_POLY_CHKNORM
#define MLD_USE_NATIVE_POLYZ_UNPACK_17
#define MLD_USE_NATIVE_POLYZ_UNPACK_19
#define MLD_USE_NATIVE_POINTWISE_MONTGOMERY
#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L4
#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L5
#define MLD_USE_NATIVE_POLYVECL_POINTWISE_ACC_MONTGOMERY_L7

/* Identifier for this backend so that source and assembly files
 * in the build can be appropriately guarded. */
#define MLD_ARITH_BACKEND_AARCH64


#if !defined(__ASSEMBLER__)
#include "src/arith_native_aarch64.h"

static MLD_INLINE void mld_ntt_native(int32_t data[MLDSA_N])
{
  mld_ntt_asm(data, mld_aarch64_ntt_zetas_layer123456,
              mld_aarch64_ntt_zetas_layer78);
}

static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
{
  mld_intt_asm(data, mld_aarch64_intt_zetas_layer78,
               mld_aarch64_intt_zetas_layer123456);
}

static MLD_INLINE int mld_rej_uniform_native(int32_t *r, unsigned len,
                                             const uint8_t *buf,
                                             unsigned buflen)
{
  if (len != MLDSA_N || buflen % 24 != 0)
  {
    return -1;
  }

  /* Safety: outlen is at most MLDSA_N, hence, this cast is safe. */
  return (int)mld_rej_uniform_asm(r, buf, buflen, mld_rej_uniform_table);
}

static MLD_INLINE int mld_rej_uniform_eta2_native(int32_t *r, unsigned len,
                                                  const uint8_t *buf,
                                                  unsigned buflen)
{
  unsigned int outlen;
  /* AArch64 implementation assumes specific buffer lengths */
  if (len != MLDSA_N || buflen != MLD_AARCH64_REJ_UNIFORM_ETA2_BUFLEN)
  {
    return -1;
  }
  /* Constant time: Inputs and outputs to this function are secret.
   * It is safe to leak which coefficients are accepted/rejected.
   * The assembly implementation must not leak any other information about the
   * accepted coefficients. Constant-time testing cannot cover this, and we
   * hence have to manually verify the assembly.
   * We declassify the input data prior to the call and mark the outputs as
   * secret. */
  MLD_CT_TESTING_DECLASSIFY(buf, buflen);
  outlen = mld_rej_uniform_eta2_asm(r, buf, buflen, mld_rej_uniform_eta_table);
  MLD_CT_TESTING_SECRET(r, sizeof(int32_t) * outlen);
  /* Safety: outlen is at most MLDSA_N and, hence, this cast is safe. */
  return (int)outlen;
}

static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
                                                  const uint8_t *buf,
                                                  unsigned buflen)
{
  unsigned int outlen;
  /* AArch64 implementation assumes specific buffer lengths */
  if (len != MLDSA_N || buflen != MLD_AARCH64_REJ_UNIFORM_ETA4_BUFLEN)
  {
    return -1;
  }
  /* Constant time: Inputs and outputs to this function are secret.
   * It is safe to leak which coefficients are accepted/rejected.
   * The assembly implementation must not leak any other information about the
   * accepted coefficients. Constant-time testing cannot cover this, and we
   * hence have to manually verify the assembly.
   * We declassify the input data prior to the call and mark the outputs as
   * secret. */
  MLD_CT_TESTING_DECLASSIFY(buf, buflen);
  outlen = mld_rej_uniform_eta4_asm(r, buf, buflen, mld_rej_uniform_eta_table);
  MLD_CT_TESTING_SECRET(r, sizeof(int32_t) * outlen);
  /* Safety: outlen is at most MLDSA_N and, hence, this cast is safe. */
  return (int)outlen;
}

static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
                                                    const int32_t *a)
{
  mld_poly_decompose_32_asm(a1, a0, a);
}

static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
                                                    const int32_t *a)
{
  mld_poly_decompose_88_asm(a1, a0, a);
}

static MLD_INLINE void mld_poly_caddq_native(int32_t a[MLDSA_N])
{
  mld_poly_caddq_asm(a);
}

static MLD_INLINE void mld_poly_use_hint_32_native(int32_t *b, const int32_t *a,
                                                   const int32_t *h)
{
  mld_poly_use_hint_32_asm(b, a, h);
}

static MLD_INLINE void mld_poly_use_hint_88_native(int32_t *b, const int32_t *a,
                                                   const int32_t *h)
{
  mld_poly_use_hint_88_asm(b, a, h);
}

static MLD_INLINE int mld_poly_chknorm_native(const int32_t *a, int32_t B)
{
  return mld_poly_chknorm_asm(a, B);
}

static MLD_INLINE void mld_polyz_unpack_17_native(int32_t *r,
                                                  const uint8_t *buf)
{
  mld_polyz_unpack_17_asm(r, buf, mld_polyz_unpack_17_indices);
}

static MLD_INLINE void mld_polyz_unpack_19_native(int32_t *r,
                                                  const uint8_t *buf)
{
  mld_polyz_unpack_19_asm(r, buf, mld_polyz_unpack_19_indices);
}

static MLD_INLINE void mld_poly_pointwise_montgomery_native(
    int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
    const int32_t in1[MLDSA_N])
{
  mld_poly_pointwise_montgomery_asm(out, in0, in1);
}

static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l4_native(
    int32_t w[MLDSA_N], const int32_t u[4][MLDSA_N],
    const int32_t v[4][MLDSA_N])
{
  mld_polyvecl_pointwise_acc_montgomery_l4_asm(w, (const int32_t *)u,
                                               (const int32_t *)v);
}

static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l5_native(
    int32_t w[MLDSA_N], const int32_t u[5][MLDSA_N],
    const int32_t v[5][MLDSA_N])
{
  mld_polyvecl_pointwise_acc_montgomery_l5_asm(w, (const int32_t *)u,
                                               (const int32_t *)v);
}

static MLD_INLINE void mld_polyvecl_pointwise_acc_montgomery_l7_native(
    int32_t w[MLDSA_N], const int32_t u[7][MLDSA_N],
    const int32_t v[7][MLDSA_N])
{
  mld_polyvecl_pointwise_acc_montgomery_l7_asm(w, (const int32_t *)u,
                                               (const int32_t *)v);
}

#endif /* !__ASSEMBLER__ */
#endif /* !MLD_NATIVE_AARCH64_META_H */
```

dev/aarch64_opt/src/Makefile

Lines changed: 122 additions & 0 deletions
New file:

```make
# Copyright (c) The mldsa-native project authors
# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT

######
# To run, see the README.md file
######
.PHONY: all clean

# ISA to optimize for
TARGET_ISA=Arm_AArch64

# MicroArch target to optimize for
# Changing this to Arm_Cortex_A55 results in significantly better performance
# on the Cortex-A55, but may result in worse performance on other CPUs.
TARGET_MICROARCH=Arm_Neoverse_N1_experimental

SLOTHY_EXTRA_FLAGS ?=

SLOTHY_FLAGS=-c sw_pipelining.enabled=true \
	-c inputs_are_outputs \
	-c sw_pipelining.minimize_overlapping=False \
	-c sw_pipelining.allow_post \
	-c variable_size \
	-c constraints.stalls_first_attempt=64 \
	$(SLOTHY_EXTRA_FLAGS)

SLOTHY_FLAGS_SPLIT=-c inputs_are_outputs \
	-c variable_size \
	-c constraints.stalls_first_attempt=64 \
	-c split_heuristic=true \
	-c split_heuristic_factor=1.5 \
	-c split_heuristic_repeat=2 \
	-c sw_pipelining.enabled=true \
	-c sw_pipelining.halving_heuristic=True \
	$(SLOTHY_EXTRA_FLAGS)

# For kernels which stash callee-saved v8-v15 but don't stash callee-saved GPRs x19-x30.
# Allow SLOTHY to use all V-registers, but only caller-saved GPRs.
RESERVE_X_ONLY_FLAG=-c reserved_regs="[x18--x30,sp]"

# Used for kernels which don't stash callee-saved registers.
# Restrict SLOTHY to caller-saved registers.
RESERVE_ALL_FLAG=-c reserved_regs="[x18--x30,sp,v8--v15]"

all: ntt.S \
	intt.S \
	mld_polyvecl_pointwise_acc_montgomery_l4.S \
	mld_polyvecl_pointwise_acc_montgomery_l5.S \
	mld_polyvecl_pointwise_acc_montgomery_l7.S \
	pointwise_montgomery.S \
	poly_caddq_asm.S \
	poly_chknorm_asm.S \
	poly_decompose_32_asm.S \
	poly_decompose_88_asm.S \
	poly_use_hint_32_asm.S \
	poly_use_hint_88_asm.S \
	polyz_unpack_17_asm.S \
	polyz_unpack_19_asm.S \
	rej_uniform_asm.S \
	rej_uniform_eta2_asm.S \
	rej_uniform_eta4_asm.S

# These units explicitly save and restore registers v8-v15, so SLOTHY can freely use
# those registers.
ntt.S: ../../aarch64_clean/src/ntt.S
	# optimize first loop in one go and write to temp file
	$(eval TMPFILE := $(shell mktemp))
	slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $(TMPFILE) -l ntt_layer123_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG)
	# optimize second loop using split heuristic
	slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $(TMPFILE) -o $@ -l ntt_layer45678_start $(SLOTHY_FLAGS_SPLIT) $(RESERVE_X_ONLY_FLAG)

# Copy remaining files without optimization for now
intt.S: ../../aarch64_clean/src/intt.S
	cp $< $@

mld_polyvecl_pointwise_acc_montgomery_l4.S: ../../aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l4.S
	cp $< $@

mld_polyvecl_pointwise_acc_montgomery_l5.S: ../../aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l5.S
	cp $< $@

mld_polyvecl_pointwise_acc_montgomery_l7.S: ../../aarch64_clean/src/mld_polyvecl_pointwise_acc_montgomery_l7.S
	cp $< $@

pointwise_montgomery.S: ../../aarch64_clean/src/pointwise_montgomery.S
	cp $< $@

poly_caddq_asm.S: ../../aarch64_clean/src/poly_caddq_asm.S
	cp $< $@

poly_chknorm_asm.S: ../../aarch64_clean/src/poly_chknorm_asm.S
	cp $< $@

poly_decompose_32_asm.S: ../../aarch64_clean/src/poly_decompose_32_asm.S
	cp $< $@

poly_decompose_88_asm.S: ../../aarch64_clean/src/poly_decompose_88_asm.S
	cp $< $@

poly_use_hint_32_asm.S: ../../aarch64_clean/src/poly_use_hint_32_asm.S
	cp $< $@

poly_use_hint_88_asm.S: ../../aarch64_clean/src/poly_use_hint_88_asm.S
	cp $< $@

polyz_unpack_17_asm.S: ../../aarch64_clean/src/polyz_unpack_17_asm.S
	cp $< $@

polyz_unpack_19_asm.S: ../../aarch64_clean/src/polyz_unpack_19_asm.S
	cp $< $@

rej_uniform_asm.S: ../../aarch64_clean/src/rej_uniform_asm.S
	cp $< $@

rej_uniform_eta2_asm.S: ../../aarch64_clean/src/rej_uniform_eta2_asm.S
	cp $< $@

rej_uniform_eta4_asm.S: ../../aarch64_clean/src/rej_uniform_eta4_asm.S
	cp $< $@

clean:
	-$(RM) -rf *.S
```
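Since `$(SLOTHY_EXTRA_FLAGS)` is appended to both flag sets, extra SLOTHY options can be injected without editing the Makefile. A hedged sketch follows; the option shown already appears in the Makefile with value 64, and it is an assumption that a repeated `-c` setting passed later overrides the earlier one:

```sh
# Force re-optimization of the NTT only, with a different stall budget.
# make -B forces the rebuild; the override semantics of a repeated -c
# option are an assumption about slothy-cli, not documented here.
make -B ntt.S SLOTHY_EXTRA_FLAGS="-c constraints.stalls_first_attempt=128"
```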

dev/aarch64_opt/src/README.md

Lines changed: 15 additions & 0 deletions
New file:

```markdown
[//]: # (SPDX-License-Identifier: CC-BY-4.0)

# mldsa-native AArch64 backend SLOTHY-optimized code

This directory contains the AArch64 backend after it has been optimized by [SLOTHY](https://github.com/slothy-optimizer/slothy/).

## Re-running SLOTHY

If the "clean" sources [`../../aarch64_clean/src/*.S`](../../aarch64_clean/src/) change, take the following steps to re-optimize and install them into the main source tree:

1. Run `make` to re-generate the optimized sources using SLOTHY. This assumes a working SLOTHY setup, as established e.g. by the default nix shell for mldsa-native. See also the [SLOTHY README](https://github.com/slothy-optimizer/slothy/).

2. Run `autogen` to transfer the newly optimized files into the main source tree [mldsa/src/native](../../../mldsa/src/native).

3. Run `./scripts/tests all --opt=OPT` to check that the new assembly is still functional.
```
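Putting the three steps together, here is a hedged end-to-end sketch, run from the repository root. The `scripts/autogen` path and the `OPT` placeholder follow the steps above; both are assumptions beyond what this README states:

```sh
# 1. Re-generate the optimized sources with SLOTHY (requires a working
#    slothy-cli, e.g. from the default nix shell for mldsa-native).
make -C dev/aarch64_opt/src

# 2. Transfer the newly optimized files into mldsa/src/native
#    (script location assumed).
./scripts/autogen

# 3. Check that the new assembly is still functional; substitute OPT
#    with the desired optimization setting.
./scripts/tests all --opt=OPT
```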
