Skip to content

Commit

Permalink
sm3: ppc64x, s390x mult4 add comments
Browse files Browse the repository at this point in the history
  • Loading branch information
emmansun authored Sep 6, 2024
1 parent 2d32207 commit 0799942
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 36 deletions.
74 changes: 40 additions & 34 deletions sm3/sm3blocks_ppc64x.s
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
// For instruction emulation
#define ESPERMW V31 // Endian swapping permute into BE

#define TEMP R19

DATA ·mask+0x00(SB)/8, $0x0b0a09080f0e0d0c // byte swap per word
DATA ·mask+0x08(SB)/8, $0x0302010007060504
DATA ·mask+0x10(SB)/8, $0x0001020310111213 // Permute for transpose matrix
Expand Down Expand Up @@ -59,13 +61,17 @@ GLOBL ·mask(SB), RODATA, $80
#endif // defined(GOARCH_ppc64le)

// r = s <<< n
// Due to VSPLTISW's limitation, n MUST be in the range [0, 15].
// If n > 15, PROLD has to be applied multiple times (e.g. 19 = 10 + 9).
// VSPLTISW takes a 5-bit signed immediate value as its operand.
// I also did NOT find a vector instruction that takes an immediate value for ROTL.
#define PROLD(s, r, n) \
VSPLTISW $n, TMP5 \
VRLW s, TMP5, r

#define loadWordByIndex(W, i) \
MOVD $(16*(i)), R19 \
LXVW4X (R19)(statePtr), W
MOVD $(16*(i)), TEMP \
LXVW4X (TEMP)(statePtr), W

// one word is 16 bytes
#define prepare4Words \
Expand Down Expand Up @@ -95,12 +101,12 @@ GLOBL ·mask(SB), RODATA, $80
VPERM TMP2, TMP3, M3, T3

// Load constant T. How to simplify it?
// Solution 1: big constant table
// Solution 1: big constant table like sha256block_ppc64x.s
// Solution 2: two constants T, rotated left by one bit every time
// Which solution's performance is better?
// Solution 1's performance is better but it uses more memory.
#define LOAD_T(index, const, target) \
MOVD $const, R19 \
MTVSRWZ R19, target \
MOVD $const, TEMP \
MTVSRWZ TEMP, target \
VSPLTW $1, target, target

#define ROUND_00_11(index, const, a, b, c, d, e, f, g, h) \
Expand Down Expand Up @@ -128,7 +134,7 @@ GLOBL ·mask(SB), RODATA, $80
PROLD(TMP4, b, 9) \ // b = b <<< 9
VOR TMP1, TMP1, h \ // h = TT1
PROLD(f, TMP4, 10) \
PROLD(TMP4, f, 9) \ // f = f <<< 19
PROLD(TMP4, f, 9) \ // f = f <<< 19, Here we had to ROTL twice: ROTL 10, then ROTL 9
PROLD(TMP3, TMP4, 9) \ // TMP4 = TT2 <<< 9
PROLD(TMP4, TMP0, 8) \ // TMP0 = TT2 <<< 17
VXOR TMP3, TMP4, TMP4 \ // TMP4 = TT2 XOR (TT2 <<< 9)
Expand Down Expand Up @@ -185,7 +191,7 @@ GLOBL ·mask(SB), RODATA, $80
PROLD(TMP1, b, 9) \ // b = b <<< 9
VOR TMP4, TMP4, h \ // h = TT1
PROLD(f, TMP1, 10) \
PROLD(TMP1, f, 9) \ // f = f <<< 19
PROLD(TMP1, f, 9) \ // f = f <<< 19, Here we had to ROTL twice: ROTL 10, then ROTL 9
PROLD(TMP3, TMP1, 9) \ // TMP1 = TT2 <<< 9
PROLD(TMP1, TMP0, 8) \ // TMP0 = TT2 <<< 17
VXOR TMP3, TMP1, TMP1 \ // TMP1 = TT2 XOR (TT2 <<< 9)
Expand All @@ -198,7 +204,7 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
MOVD $16, R16
MOVD $24, R17
MOVD $32, R18
MOVD $48, R19
MOVD $48, TEMP
#ifdef NEEDS_PERMW
MOVD $·mask(SB), R4
LVX (R4), ESPERMW
Expand All @@ -209,7 +215,7 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
LXVD2X (R0)(R4), M0
LXVD2X (R16)(R4), M1
LXVD2X (R18)(R4), M2
LXVD2X (R19)(R4), M3
LXVD2X (TEMP)(R4), M3
#define digPtr R11
#define srcPtrPtr R5
#define statePtr R4
Expand All @@ -225,18 +231,18 @@ TEXT ·blockMultBy4(SB), NOSPLIT, $0
MOVD blocks+24(FP), blockCount

// load state
MOVD (R0)(digPtr), R19
LXVW4X (R0)(R19), a
LXVW4X (R16)(R19), e
MOVD (R15)(digPtr), R19
LXVW4X (R0)(R19), b
LXVW4X (R16)(R19), f
MOVD (R16)(digPtr), R19
LXVW4X (R0)(R19), c
LXVW4X (R16)(R19), g
MOVD (R17)(digPtr), R19
LXVW4X (R0)(R19), d
LXVW4X (R16)(R19), h
MOVD (R0)(digPtr), TEMP
LXVW4X (R0)(TEMP), a
LXVW4X (R16)(TEMP), e
MOVD (R15)(digPtr), TEMP
LXVW4X (R0)(TEMP), b
LXVW4X (R16)(TEMP), f
MOVD (R16)(digPtr), TEMP
LXVW4X (R0)(TEMP), c
LXVW4X (R16)(TEMP), g
MOVD (R17)(digPtr), TEMP
LXVW4X (R0)(TEMP), d
LXVW4X (R16)(TEMP), h

TRANSPOSE_MATRIX(a, b, c, d)
TRANSPOSE_MATRIX(e, f, g, h)
Expand Down Expand Up @@ -352,18 +358,18 @@ end:
TRANSPOSE_MATRIX(e, f, g, h)

// save state
MOVD (R0)(digPtr), R19
STXVW4X a, (R0)(R19)
STXVW4X e, (R16)(R19)
MOVD (R15)(digPtr), R19
STXVW4X b, (R0)(R19)
STXVW4X f, (R16)(R19)
MOVD (R16)(digPtr), R19
STXVW4X c, (R0)(R19)
STXVW4X g, (R16)(R19)
MOVD (R17)(digPtr), R19
STXVW4X d, (R0)(R19)
STXVW4X h, (R16)(R19)
MOVD (R0)(digPtr), TEMP
STXVW4X a, (R0)(TEMP)
STXVW4X e, (R16)(TEMP)
MOVD (R15)(digPtr), TEMP
STXVW4X b, (R0)(TEMP)
STXVW4X f, (R16)(TEMP)
MOVD (R16)(digPtr), TEMP
STXVW4X c, (R0)(TEMP)
STXVW4X g, (R16)(TEMP)
MOVD (R17)(digPtr), TEMP
STXVW4X d, (R0)(TEMP)
STXVW4X h, (R16)(TEMP)

RET

Expand Down
3 changes: 1 addition & 2 deletions sm3/sm3blocks_s390x.s
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
//go:build !purego

#include "textflag.h"
#include "go_asm.h"

DATA mask<>+0x00(SB)/8, $0x0001020310111213
DATA mask<>+0x08(SB)/8, $0x0405060714151617
Expand All @@ -15,7 +14,7 @@ DATA mask<>+0x20(SB)/8, $0x0001020304050607
DATA mask<>+0x28(SB)/8, $0x1011121314151617
DATA mask<>+0x30(SB)/8, $0x08090a0b0c0d0e0f
DATA mask<>+0x38(SB)/8, $0x18191a1b1c1d1e1f
GLOBL mask<>(SB), 8, $64
GLOBL mask<>(SB), RODATA, $64

#define a V0
#define e V1
Expand Down

0 comments on commit 0799942

Please sign in to comment.