Wire the AES-CTR32 code (bit-sliced and AES-NI variants, 128- and 256-bit) into
the build: add-include flags and a bundle for the main distribution, an ignore
bundle for the WASM distribution, AES-NI compiler flags for the generated
objects, 128-bit load/store macros, and two AES-128 test/benchmark drivers.

NOTE(review): line breaks, blank lines and indentation in this patch were
reconstructed from a whitespace-collapsed copy; verify the context lines
against the tree before applying.

diff --git a/Makefile b/Makefile
index c7c31b9a52..84e0d5b60e 100644
--- a/Makefile
+++ b/Makefile
@@ -684,7 +684,10 @@ INTRINSIC_FLAGS = \
   -add-include 'Hacl_SHA2_Vec256.c:"libintvector.h"' \
   \
   -add-include 'Hacl_Hash_Blake2b_Simd256:"libintvector.h"' \
-  -add-include 'Hacl_MAC_Poly1305_Simd256:"libintvector.h"'
+  -add-include 'Hacl_MAC_Poly1305_Simd256:"libintvector.h"' \
+  \
+  -add-include 'Hacl_AES_128_CTR32_NI:"libintvector.h"' \
+  -add-include 'Hacl_AES_256_CTR32_NI:"libintvector.h"'
 
 # Disabled for distributions that don't include code based on intrinsics.
 INTRINSIC_INT_FLAGS = \
@@ -746,7 +749,8 @@ BUNDLE_FLAGS =\
   $(INTTYPES_128_BUNDLE) \
   $(RSAPSS_BUNDLE) \
   $(FFDHE_BUNDLE) \
-  $(LEGACY_BUNDLE)
+  $(LEGACY_BUNDLE) \
+  $(AES_CTR32_BUNDLE)
 
 DEFAULT_FLAGS = \
   $(HAND_WRITTEN_LIB_FLAGS) \
@@ -758,6 +762,8 @@ DEFAULT_FLAGS = \
   $(REQUIRED_FLAGS) \
   $(TARGET_H_INCLUDE)
 
+IGNORE_AES_BUNDLE = -bundle Hacl.AES_128.*,Hacl.AES_256.*,Hacl.Impl.*
+
 # WASM distribution
 # -----------------
 #
@@ -816,6 +822,9 @@ dist/wasm/Makefile.basic: POLY_BUNDLE = \
   -bundle 'Hacl.Poly1305_128,Hacl.Poly1305_256,Hacl.Impl.Poly1305.*' \
   -bundle 'Hacl.Streaming.Poly1305_128,Hacl.Streaming.Poly1305_256'
 
+# Disabling AES
+dist/wasm/Makefile.basic: AES_CTR32_BUNDLE = $(IGNORE_AES_BUNDLE)
+
 dist/wasm/Makefile.basic: CTR_BUNDLE =
 dist/wasm/Makefile.basic: RSAPSS_BUNDLE = -bundle Hacl.RSAPSS,Hacl.Impl.RSAPSS.*,Hacl.Impl.RSAPSS
 dist/wasm/Makefile.basic: FFDHE_BUNDLE = -bundle Hacl.FFDHE,Hacl.Impl.FFDHE.*,Hacl.Impl.FFDHE
diff --git a/Makefile.common b/Makefile.common
index 2ae897cc7b..f8aff90d3c 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -225,6 +225,7 @@ BIGNUM_BUNDLE= \
   -bundle Hacl.Bignum.Base,Hacl.Bignum.Addition,Hacl.Bignum.Convert,Hacl.Bignum.Lib,Hacl.Bignum.Multiplication[rename=Hacl_Bignum_Base] \
   -static-header Hacl.Bignum.Base,Hacl.Bignum.Addition,Hacl.Bignum.Convert,Hacl.Bignum.Lib,Hacl.Bignum.Multiplication \
   -bundle Hacl.Bignum,Hacl.Bignum.*[rename=Hacl_Bignum]
+AES_CTR32_BUNDLE=-bundle Hacl.AES_128.CTR32.BitSlice=Hacl.Impl.AES.* -bundle Hacl.AES_128.CTR32.NI=Hacl.Impl.AES.* -bundle Hacl.AES_256.CTR32.BitSlice=Hacl.Impl.AES.* -bundle Hacl.AES_256.CTR32.NI=Hacl.Impl.AES.*
 
 # 3. OCaml
 
diff --git a/code/aes/Makefile b/code/aes/Makefile
index 0ceed270b0..ffa3b39e24 100644
--- a/code/aes/Makefile
+++ b/code/aes/Makefile
@@ -3,21 +3,21 @@ HACL_HOME=$(realpath ../..)
 # CUSTOMIZE HERE: determine what is the main target of this Makefile, e.g. a C
 # test, a Low* test, or just a binary archive (like libcurve.a).
 all: \
-  dist/aes128-bitslice-test.exe \
-  dist/aes128-ni-test.exe
+  dist/aes128-ctr32-bitslice-test.exe \
+  dist/aes128-ctr32-ni-test.exe
 
 # TODO: why are the aes128 tests succeeding?
 # they should not even compile!
 test: all
-	dist/aes128-bitslice-test.exe
-	dist/aes128-ni-test.exe
+	dist/aes128-ctr32-bitslice-test.exe
+	dist/aes128-ctr32-ni-test.exe
 
 # Defines rules for producing .checked, .krml, .depend, etc.
 include $(HACL_HOME)/Makefile.local
 
 #BASE_FLAGS+= -funroll-loops 4
 
-CFLAGS += -I$(HACL_HOME)/lib/c -maes -mpclmul -msse4.1 -O3
+CFLAGS += -I$(HACL_HOME)/lib/c -maes -msse4.1 -O3
 #CFLAGS += -I../../../lib/c -march=native -mtune=native -O3
 export CFLAGS
 
@@ -32,10 +32,10 @@ dist/Makefile.basic: $(filter-out %/prims.krml,$(ALL_KRML_FILES))
 	  -add-include '"libintvector.h"' \
 	  -skip-compilation
 
-dist/aes128-bitslice-test.exe: $(HACL_HOME)/tests/aes128-bitslice-test.c dist/libaes.a
-dist/aes256-bitslice-test.exe: $(HACL_HOME)/tests/aes256-bitslice-test.c dist/libaes.a
-dist/aes128-ni-test.exe: $(HACL_HOME)/tests/aes128-ni-test.c dist/libaes.a
-dist/aes256-ni-test.exe: $(HACL_HOME)/tests/aes256-ni-test.c dist/libaes.a
+dist/aes128-ctr32-bitslice-test.exe: $(HACL_HOME)/tests/aes128-ctr32-bitslice-test.c dist/libaes.a
+dist/aes256-ctr32-bitslice-test.exe: $(HACL_HOME)/tests/aes256-ctr32-bitslice-test.c dist/libaes.a
+dist/aes128-ctr32-ni-test.exe: $(HACL_HOME)/tests/aes128-ctr32-ni-test.c dist/libaes.a
+dist/aes256-ctr32-ni-test.exe: $(HACL_HOME)/tests/aes256-ctr32-ni-test.c dist/libaes.a
 
 %.exe:
 	$(CC) $(CFLAGS) -flto $^ -o $@
diff --git a/dist/Makefile.tmpl b/dist/Makefile.tmpl
index a320e07ca5..eb4dbea098 100644
--- a/dist/Makefile.tmpl
+++ b/dist/Makefile.tmpl
@@ -49,6 +49,10 @@ include Makefile.basic
 
 CFLAGS += -Wno-parentheses -Wno-deprecated-declarations -Wno-\#warnings -Wno-error=cpp -Wno-cpp -g -std=gnu11 -O3
 
+# Extra instruction-set flags applied only to the two AES-NI objects.
+CFLAGS_AES_NI = -maes -msse4.1
+Hacl_AES_128_CTR32_NI.o Hacl_AES_256_CTR32_NI.o: CFLAGS += $(CFLAGS_AES_NI)
+
 Hacl_MAC_Poly1305_Simd128.o Hacl_Chacha20_Vec128.o Hacl_AEAD_Chacha20Poly1305_Simd128.o Hacl_Hash_Blake2s_Simd128.o Hacl_HMAC_Blake2s_128.o Hacl_HKDF_Blake2s_128.o Hacl_SHA2_Vec128.o: CFLAGS += $(CFLAGS_128)
 Hacl_MAC_Poly1305_Simd256.o Hacl_Chacha20_Vec256.o Hacl_AEAD_Chacha20Poly1305_Simd256.o Hacl_Hash_Blake2b_Simd256.o Hacl_HMAC_Blake2b_256.o Hacl_HKDF_Blake2b_256.o Hacl_SHA2_Vec256.o: CFLAGS += $(CFLAGS_256)
 
diff --git a/lib/c/libintvector.h b/lib/c/libintvector.h
index 99d1133694..6e0c7cf336 100644
--- a/lib/c/libintvector.h
+++ b/lib/c/libintvector.h
@@ -121,12 +121,18 @@ typedef __m128i Lib_IntVector_Intrinsics_vec128;
 #define Lib_IntVector_Intrinsics_vec128_load64_le(x0) \
   (_mm_loadu_si128((__m128i*)(x0)))
 
+#define Lib_IntVector_Intrinsics_vec128_load128_le(x0) \
+  (_mm_loadu_si128((__m128i*)(x0)))
+
 #define Lib_IntVector_Intrinsics_vec128_store32_le(x0, x1) \
   (_mm_storeu_si128((__m128i*)(x0), x1))
 
 #define Lib_IntVector_Intrinsics_vec128_store64_le(x0, x1) \
   (_mm_storeu_si128((__m128i*)(x0), x1))
 
+#define Lib_IntVector_Intrinsics_vec128_store128_le(x0, x1) \
+  (_mm_storeu_si128((__m128i*)(x0), x1))
+
 #define Lib_IntVector_Intrinsics_vec128_load_be(x0) \
   (_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(x0)), _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)))
 
diff --git a/tests/aes128-ctr32-bitslice-test.c b/tests/aes128-ctr32-bitslice-test.c
new file mode 100644
index 0000000000..3b674cdb4a
--- /dev/null
+++ b/tests/aes128-ctr32-bitslice-test.c
@@ -0,0 +1,135 @@
+/*
+ * Known-answer test and throughput benchmark for the bit-sliced AES-128
+ * CTR32 code.  The process exits non-zero if the known-answer check fails.
+ *
+ * NOTE(review): the <...> header names were missing from the reviewed copy
+ * of this patch and are reconstructed here -- confirm against the original.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <time.h>
+
+typedef uint64_t cycles;
+
+/* Runs rdtscp and returns its rdx:rax outputs combined into 64 bits. */
+static __inline__ cycles cpucycles_begin(void)
+{
+  uint64_t rax,rdx,aux;
+  asm volatile ( "rdtscp\n" : "=a" (rax), "=d" (rdx), "=c" (aux) : : );
+  return (rdx << 32) + rax;
+  //  unsigned hi, lo;
+  //__asm__ __volatile__ ("CPUID\n\t"  "RDTSC\n\t"  "mov %%edx, %0\n\t"  "mov %%eax, %1\n\t": "=r" (hi), "=r" (lo):: "%rax", "%rbx", "%rcx", "%rdx");
+  //return ( (uint64_t)lo)|( ((uint64_t)hi)<<32 );
+}
+
+/* Identical read to cpucycles_begin(); taken after the timed loop. */
+static __inline__ cycles cpucycles_end(void)
+{
+  uint64_t rax,rdx,aux;
+  asm volatile ( "rdtscp\n" : "=a" (rax), "=d" (rdx), "=c" (aux) : : );
+  return (rdx << 32) + rax;
+  //  unsigned hi, lo;
+  //__asm__ __volatile__ ("RDTSCP\n\t"  "mov %%edx, %0\n\t"  "mov %%eax, %1\n\t"  "CPUID\n\t": "=r" (hi), "=r" (lo)::     "%rax", "%rbx", "%rcx", "%rdx");
+  //return ( (uint64_t)lo)|( ((uint64_t)hi)<<32 );
+}
+
+extern void Hacl_AES_128_CTR32_BitSlice_aes128_ctr_encrypt(int in_len, uint8_t* out, uint8_t* in, uint8_t* k, uint8_t* n, uint32_t c);
+extern void Hacl_AES_128_CTR32_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce);
+extern void
+Hacl_AES_128_CTR32_BitSlice_aes_ctr(
+  uint32_t len,
+  uint8_t *out,
+  uint8_t *inp,
+  uint64_t *ctx,
+  uint32_t counter,
+  uint32_t rounds);
+
+#define ROUNDS 40960
+#define SIZE   16384
+
+int main() {
+  int in_len = 32;
+  uint8_t in[32] = {
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
+    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
+    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
+    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F};
+  uint8_t k[16] = {
+    0x7E,0x24,0x06,0x78,0x17,0xFA,0xE0,0xD7,
+    0x43,0xD6,0xCE,0x1F,0x32,0x53,0x91,0x63};
+  uint8_t n[12] = {
+    0x00,0x6C,0xB6,0xDB,0xC0,0x54,0x3B,0x59,
+    0xDA,0x48,0xD9,0x0B};
+  uint8_t exp[32] = {
+    0x51,0x04,0xA1,0x06,0x16,0x8A,0x72,0xD9,
+    0x79,0x0D,0x41,0xEE,0x8E,0xDA,0xD3,0x88,
+    0xEB,0x2E,0x1E,0xFC,0x46,0xDA,0x57,0xC8,
+    0xFC,0xE6,0x30,0xDF,0x91,0x41,0xBE,0x28};
+  uint8_t comp[32] = {0};
+  bool ok = true;
+
+  /* Only referenced by the commented-out init/aes_ctr variant below. */
+  uint64_t ctx[(uint32_t)8U + (uint32_t)15U * (uint32_t)8U] = {0};
+
+  Hacl_AES_128_CTR32_BitSlice_aes128_ctr_encrypt(in_len,comp,in,k,n,1);
+
+  printf("AES-BitSlice computed:");
+  for (int i = 0; i < 32; i++)
+    printf("%02x",comp[i]);
+  printf("\n");
+  printf("AES-BitSlice expected:");
+  for (int i = 0; i < 32; i++)
+    printf("%02x",exp[i]);
+  printf("\n");
+  ok = true;
+  for (int i = 0; i < 32; i++)
+    ok = ok & (exp[i] == comp[i]);
+  if (!ok) {
+    /* Surface the mismatch in the exit status so `make test` fails. */
+    printf("Failure!\n");
+    return 1;
+  }
+  printf("Success!\n");
+
+  uint8_t plain[SIZE];
+  uint8_t key[16];
+  uint8_t nonce[12];
+  cycles a,b;
+  clock_t t1,t2;
+  /* Widen before multiplying so larger ROUNDS/SIZE cannot overflow int. */
+  uint64_t count = (uint64_t)ROUNDS * SIZE;
+  memset(plain,'P',SIZE);
+  memset(key,'K',16);
+  memset(nonce,'N',12);
+
+  /* Untimed pass over the same buffer before the measurement starts. */
+  for (int j = 0; j < ROUNDS; j++) {
+    Hacl_AES_128_CTR32_BitSlice_aes128_ctr_encrypt(SIZE,plain,plain,key,nonce,1);
+  }
+
+  t1 = clock();
+  a = cpucycles_begin();
+  for (int j = 0; j < ROUNDS; j++) {
+    Hacl_AES_128_CTR32_BitSlice_aes128_ctr_encrypt(SIZE,plain,plain,key,nonce,1);
+    // Hacl_AES_128_CTR32_BitSlice_aes128_init(ctx,key,nonce);
+    // Hacl_AES_128_CTR32_BitSlice_aes_ctr(SIZE,plain,plain,ctx,1,10);
+  }
+  b = cpucycles_end();
+  t2 = clock();
+  clock_t tdiff2 = t2 - t1;
+  cycles cdiff2 = b - a;
+
+  printf("AES-BitSlice PERF:\n");
+  printf("cycles for %" PRIu64 " bytes: %" PRIu64 " (%.2fcycles/byte)\n",count,(uint64_t)cdiff2,(double)cdiff2/count);
+  printf("time for %" PRIu64 " bytes: %" PRIu64 " (%.2fus/byte)\n",count,(uint64_t)tdiff2,(double)tdiff2/count);
+  printf("bw %8.2f MB/s\n",(double)count/(((double)tdiff2 / CLOCKS_PER_SEC) * 1000000.0));
+
+  return 0;
+}
diff --git a/tests/aes128-ctr32-ni-test.c b/tests/aes128-ctr32-ni-test.c
new file mode 100644
index 0000000000..a5839e4525
--- /dev/null
+++ b/tests/aes128-ctr32-ni-test.c
@@ -0,0 +1,120 @@
+/*
+ * Known-answer test and throughput benchmark for the AES-NI AES-128
+ * CTR32 code.  The process exits non-zero if the known-answer check fails.
+ *
+ * NOTE(review): the <...> header names were missing from the reviewed copy
+ * of this patch and are reconstructed here -- confirm against the original.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <time.h>
+
+typedef uint64_t cycles;
+
+/* Runs rdtscp and returns its rdx:rax outputs combined into 64 bits. */
+static __inline__ cycles cpucycles_begin(void)
+{
+  uint64_t rax,rdx,aux;
+  asm volatile ( "rdtscp\n" : "=a" (rax), "=d" (rdx), "=c" (aux) : : );
+  return (rdx << 32) + rax;
+  //  unsigned hi, lo;
+  //__asm__ __volatile__ ("CPUID\n\t"  "RDTSC\n\t"  "mov %%edx, %0\n\t"  "mov %%eax, %1\n\t": "=r" (hi), "=r" (lo):: "%rax", "%rbx", "%rcx", "%rdx");
+  //return ( (uint64_t)lo)|( ((uint64_t)hi)<<32 );
+}
+
+/* Identical read to cpucycles_begin(); taken after the timed loop. */
+static __inline__ cycles cpucycles_end(void)
+{
+  uint64_t rax,rdx,aux;
+  asm volatile ( "rdtscp\n" : "=a" (rax), "=d" (rdx), "=c" (aux) : : );
+  return (rdx << 32) + rax;
+  //  unsigned hi, lo;
+  //__asm__ __volatile__ ("RDTSCP\n\t"  "mov %%edx, %0\n\t"  "mov %%eax, %1\n\t"  "CPUID\n\t": "=r" (hi), "=r" (lo)::     "%rax", "%rbx", "%rcx", "%rdx");
+  //return ( (uint64_t)lo)|( ((uint64_t)hi)<<32 );
+}
+
+extern void Hacl_AES_128_CTR32_NI_aes128_ctr_encrypt(int in_len, uint8_t* out, uint8_t* in, uint8_t* k, uint8_t* n, uint32_t c);
+
+#define ROUNDS 100000
+#define SIZE   16384
+
+int main() {
+  int in_len = 32;
+  uint8_t in[32] = {
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
+    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
+    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
+    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F};
+  uint8_t k[16] = {
+    0x7E,0x24,0x06,0x78,0x17,0xFA,0xE0,0xD7,
+    0x43,0xD6,0xCE,0x1F,0x32,0x53,0x91,0x63};
+  uint8_t n[12] = {
+    0x00,0x6C,0xB6,0xDB,0xC0,0x54,0x3B,0x59,
+    0xDA,0x48,0xD9,0x0B};
+  uint8_t exp[32] = {
+    0x51,0x04,0xA1,0x06,0x16,0x8A,0x72,0xD9,
+    0x79,0x0D,0x41,0xEE,0x8E,0xDA,0xD3,0x88,
+    0xEB,0x2E,0x1E,0xFC,0x46,0xDA,0x57,0xC8,
+    0xFC,0xE6,0x30,0xDF,0x91,0x41,0xBE,0x28};
+  uint8_t comp[32] = {0};
+  bool ok = true;
+
+  Hacl_AES_128_CTR32_NI_aes128_ctr_encrypt(in_len,comp,in,k,n,1);
+  printf("AES-NI computed:");
+  for (int i = 0; i < 32; i++)
+    printf("%02x",comp[i]);
+  printf("\n");
+  printf("AES_NI expected:");
+  for (int i = 0; i < 32; i++)
+    printf("%02x",exp[i]);
+  printf("\n");
+  ok = true;
+  for (int i = 0; i < 32; i++)
+    ok = ok & (exp[i] == comp[i]);
+  if (!ok) {
+    /* Surface the mismatch in the exit status so `make test` fails. */
+    printf("Failure!\n");
+    return 1;
+  }
+  printf("Success!\n");
+
+  uint8_t plain[SIZE];
+  uint8_t key[16];
+  uint8_t nonce[12];
+  cycles a,b;
+  clock_t t1,t2;
+  /* Widen before multiplying so larger ROUNDS/SIZE cannot overflow int. */
+  uint64_t count = (uint64_t)ROUNDS * SIZE;
+  memset(plain,'P',SIZE);
+  memset(key,'K',16);
+  memset(nonce,'N',12);
+
+  /* Untimed pass over the same buffer before the measurement starts. */
+  for (int j = 0; j < ROUNDS; j++) {
+    Hacl_AES_128_CTR32_NI_aes128_ctr_encrypt(SIZE,plain,plain,key,nonce,1);
+  }
+
+  t1 = clock();
+  a = cpucycles_begin();
+  for (int j = 0; j < ROUNDS; j++) {
+    Hacl_AES_128_CTR32_NI_aes128_ctr_encrypt(SIZE,plain,plain,key,nonce,1);
+  }
+  b = cpucycles_end();
+  t2 = clock();
+  clock_t tdiff1 = t2 - t1;
+  cycles cdiff1 = b - a;
+
+  printf("AES-NI PERF:\n");
+  printf("cycles for %" PRIu64 " bytes: %" PRIu64 " (%.2fcycles/byte)\n",count,(uint64_t)cdiff1,(double)cdiff1/count);
+  printf("time for %" PRIu64 " bytes: %" PRIu64 " (%.2fus/byte)\n",count,(uint64_t)tdiff1,(double)tdiff1/count);
+  printf("bw %8.2f MB/s\n",(double)count/(((double)tdiff1 / CLOCKS_PER_SEC) * 1000000.0));
+
+  return 0;
+}