// SM4 encryption of four 16-byte blocks in parallel. The S-box is evaluated
// with the AES-NI AESENCLAST instruction, wrapped in two nibble-table affine
// maps.
//
// Syntax/target: Go assembler (Plan 9 operand order: op src, dst), amd64 only
// (X8-X11 exist only in 64-bit mode), stack-based Go calling convention.
//
// All 128-bit moves use MOVOU, matching the transpose macros; this settles
// the former "MOVUPS or MOVOU?" TODO. Both forms move the same 16 bytes.

#include "textflag.h"

// Number of rounds executed by encrypt4; one 32-bit round key per round.
#define SM4_ROUNDS 32

// ---------------------------------------------------------------------------
// Constant tables. None of them is ever written, so they are placed in
// read-only data. Several are used directly as 16-byte memory operands of
// PSHUFB / PAND / AESENCLAST.
// ---------------------------------------------------------------------------

// All-zero round key for AESENCLAST.
DATA zero<>+0(SB)/8, $0x0
DATA zero<>+8(SB)/8, $0x0
GLOBL zero<>(SB), (NOPTR+RODATA), $16

// Note: no "broadcast low word" shuffle table is needed; the round key is
// broadcast with an immediate-controlled shuffle instead.

// PSHUFB mask: zero the low 8 bytes, copy source bytes 0..7 to the high half.
// Only used by transposeLoad0.
DATA l64<>+0(SB)/8, $0xffffffffffffffff
DATA l64<>+8(SB)/8, $0x0706050403020100
GLOBL l64<>(SB), (NOPTR+RODATA), $16

// eRCl PSHUFB masks: take 32-bit word R of the source, reverse its bytes,
// place it in word C of the destination and zero every other destination
// byte (index 0xff has the top bit set, which makes PSHUFB write zero).
DATA e00l<>+0(SB)/8, $0xffffffff00010203
DATA e00l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e00l<>(SB), (NOPTR+RODATA), $16

DATA e01l<>+0(SB)/8, $0x00010203ffffffff
DATA e01l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e01l<>(SB), (NOPTR+RODATA), $16

DATA e02l<>+0(SB)/8, $0xffffffffffffffff
DATA e02l<>+8(SB)/8, $0xffffffff00010203
GLOBL e02l<>(SB), (NOPTR+RODATA), $16

DATA e03l<>+0(SB)/8, $0xffffffffffffffff
DATA e03l<>+8(SB)/8, $0x00010203ffffffff
GLOBL e03l<>(SB), (NOPTR+RODATA), $16

DATA e10l<>+0(SB)/8, $0xffffffff04050607
DATA e10l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e10l<>(SB), (NOPTR+RODATA), $16

DATA e11l<>+0(SB)/8, $0x04050607ffffffff
DATA e11l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e11l<>(SB), (NOPTR+RODATA), $16

DATA e12l<>+0(SB)/8, $0xffffffffffffffff
DATA e12l<>+8(SB)/8, $0xffffffff04050607
GLOBL e12l<>(SB), (NOPTR+RODATA), $16

DATA e13l<>+0(SB)/8, $0xffffffffffffffff
DATA e13l<>+8(SB)/8, $0x04050607ffffffff
GLOBL e13l<>(SB), (NOPTR+RODATA), $16

DATA e20l<>+0(SB)/8, $0xffffffff08090a0b
DATA e20l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e20l<>(SB), (NOPTR+RODATA), $16

DATA e21l<>+0(SB)/8, $0x08090a0bffffffff
DATA e21l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e21l<>(SB), (NOPTR+RODATA), $16

DATA e22l<>+0(SB)/8, $0xffffffffffffffff
DATA e22l<>+8(SB)/8, $0xffffffff08090a0b
GLOBL e22l<>(SB), (NOPTR+RODATA), $16

DATA e23l<>+0(SB)/8, $0xffffffffffffffff
DATA e23l<>+8(SB)/8, $0x08090a0bffffffff
GLOBL e23l<>(SB), (NOPTR+RODATA), $16

DATA e30l<>+0(SB)/8, $0xffffffff0c0d0e0f
DATA e30l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e30l<>(SB), (NOPTR+RODATA), $16

DATA e31l<>+0(SB)/8, $0x0c0d0e0fffffffff
DATA e31l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e31l<>(SB), (NOPTR+RODATA), $16

DATA e32l<>+0(SB)/8, $0xffffffffffffffff
DATA e32l<>+8(SB)/8, $0xffffffff0c0d0e0f
GLOBL e32l<>(SB), (NOPTR+RODATA), $16

DATA e33l<>+0(SB)/8, $0xffffffffffffffff
DATA e33l<>+8(SB)/8, $0x0c0d0e0fffffffff
GLOBL e33l<>(SB), (NOPTR+RODATA), $16

// Low-nibble mask applied before every 16-entry PSHUFB table lookup.
DATA c0f<>+0(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA c0f<>+8(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL c0f<>(SB), (NOPTR+RODATA), $16

// PSHUFB mask reversing the bytes of each 32-bit word. Not referenced by the
// current code path: the eRCl masks already perform the byte reversal.
DATA flp<>+0(SB)/8, $0x0405060700010203
DATA flp<>+8(SB)/8, $0x0C0D0E0F08090A0B
GLOBL flp<>(SB), (NOPTR+RODATA), $16

// PSHUFB mask applied just before AESENCLAST to cancel its ShiftRows step.
DATA shr<>+0(SB)/8, $0x0B0E0104070A0D00
DATA shr<>+8(SB)/8, $0x0306090C0F020508
GLOBL shr<>(SB), (NOPTR+RODATA), $16

// Inner affine map (applied before AESENCLAST): tables indexed by the low
// (m1l) and high (m1h) nibble of each byte; the two lookups are XORed.
DATA m1l<>+0(SB)/8, $0x9197E2E474720701
DATA m1l<>+8(SB)/8, $0xC7C1B4B222245157
GLOBL m1l<>(SB), (NOPTR+RODATA), $16

DATA m1h<>+0(SB)/8, $0xE240AB09EB49A200
DATA m1h<>+8(SB)/8, $0xF052B91BF95BB012
GLOBL m1h<>(SB), (NOPTR+RODATA), $16

// Outer affine map (applied after AESENCLAST), same low/high nibble layout.
DATA m2l<>+0(SB)/8, $0x5B67F2CEA19D0834
DATA m2l<>+8(SB)/8, $0xEDD14478172BBE82
GLOBL m2l<>(SB), (NOPTR+RODATA), $16

DATA m2h<>+0(SB)/8, $0xAE7201DD73AFDC00
DATA m2h<>+8(SB)/8, $0x11CDBE62CC1063BF
GLOBL m2h<>(SB), (NOPTR+RODATA), $16

// PSHUFB masks rotating every 32-bit word left by 8, 16 and 24 bits.
DATA r08<>+0(SB)/8, $0x0605040702010003
DATA r08<>+8(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08<>(SB), (NOPTR+RODATA), $16

DATA r16<>+0(SB)/8, $0x0504070601000302
DATA r16<>+8(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16<>(SB), (NOPTR+RODATA), $16

DATA r24<>+0(SB)/8, $0x0407060500030201
DATA r24<>+8(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24<>(SB), (NOPTR+RODATA), $16

// ---------------------------------------------------------------------------
// Macros. Do not put comments on continued macro lines: a // comment would
// swallow the trailing backslash and end the macro early.
// ---------------------------------------------------------------------------

// transposeLoad0(reg, begin, out, tmp)
// Scalar alternative to transposeLoad, currently unused by encrypt4.
// Gathers the 32-bit words at begin+0, +16, +32, +48 from reg into out
// (lanes 0..3) without byte reversal.
// Clobbers: AX, BX, CX, DX, tmp, flags.
#define transposeLoad0(reg, begin, out, tmp) \
	MOVL	(begin+0)(reg), AX \
	MOVL	(begin+16)(reg), BX \
	MOVL	(begin+32)(reg), CX \
	MOVL	(begin+48)(reg), DX \
	SHLQ	$32, DX \
	XORQ	CX, DX \
	SHLQ	$32, BX \
	XORQ	AX, BX \
	MOVQ	DX, out \
	MOVQ	BX, tmp \
	PSHUFB	l64<>(SB), out \
	PXOR	tmp, out

// transposeLoad(out, mask0, mask1, mask2, mask3)
// out = shuffle(X8, mask0) ^ shuffle(X9, mask1) ^ shuffle(X10, mask2)
//       ^ shuffle(X11, mask3).
// With the eRCl masks this collects byte-reversed word R of each of the four
// input blocks into one register, block i landing in lane i.
// Reads X8-X11 (left unchanged); clobbers X4-X7.
#define transposeLoad(out, mask0, mask1, mask2, mask3) \
	MOVOU	X8, X4 \
	MOVOU	X9, X5 \
	MOVOU	X10, X6 \
	MOVOU	X11, X7 \
	PSHUFB	mask0, X4 \
	PSHUFB	mask1, X5 \
	PSHUFB	mask2, X6 \
	PSHUFB	mask3, X7 \
	PXOR	X5, X4 \
	PXOR	X6, X4 \
	PXOR	X7, X4 \
	MOVOU	X4, out

// storeTranspose(mask0, mask1, mask2, mask3, begin)
// Writes shuffle(X3, mask0) ^ shuffle(X2, mask1) ^ shuffle(X1, mask2)
// ^ shuffle(X0, mask3) to begin(DI): one output block, assembled from the
// same lane of X3, X2, X1, X0 (in that order) with the bytes of each word
// reversed back.
// Reads X0-X3 (left unchanged); clobbers X4-X7.
// The final line deliberately carries no trailing backslash so the macro
// cannot absorb whatever follows it.
#define storeTranspose(mask0, mask1, mask2, mask3, begin) \
	MOVOU	X3, X4 \
	MOVOU	X2, X5 \
	MOVOU	X1, X6 \
	MOVOU	X0, X7 \
	PSHUFB	mask0, X4 \
	PSHUFB	mask1, X5 \
	PSHUFB	mask2, X6 \
	PSHUFB	mask3, X7 \
	PXOR	X5, X4 \
	PXOR	X6, X4 \
	PXOR	X7, X4 \
	MOVOU	X4, (begin)(DI)

// func encrypt4(dst []byte, src []byte, rk []uint32)
//
// Reads four 16-byte blocks (64 bytes) from src, applies SM4_ROUNDS rounds
// consuming rk[0], rk[1], ... in order, and writes 64 bytes to dst.
//
// No length checks are performed here: the caller must guarantee
// len(src) >= 64, len(dst) >= 64 and len(rk) >= SM4_ROUNDS.
//
// Arguments: three slice headers, 3 * 24 = 72 bytes, no results.
// Registers:
//   DI       dst base
//   SI       src base
//   BX       pointer to the next round key
//   CX       rounds remaining
//   X0-X3    cipher state; Xn holds state word n of all four blocks,
//            lane i belonging to block i
//   X4-X7    scratch
//   X8-X11   the four raw input blocks
TEXT ·encrypt4(SB), NOSPLIT, $0-72
	MOVQ	dst_base+0(FP), DI
	MOVQ	src_base+24(FP), SI
	MOVQ	rk_base+48(FP), BX

	// Load the four input blocks.
	MOVOU	0(SI), X8
	MOVOU	16(SI), X9
	MOVOU	32(SI), X10
	MOVOU	48(SI), X11

	// Transpose to word-sliced form; the masks also convert each word from
	// big-endian byte order to native order.
	transposeLoad(X0, e00l<>(SB), e01l<>(SB), e02l<>(SB), e03l<>(SB))
	transposeLoad(X1, e10l<>(SB), e11l<>(SB), e12l<>(SB), e13l<>(SB))
	transposeLoad(X2, e20l<>(SB), e21l<>(SB), e22l<>(SB), e23l<>(SB))
	transposeLoad(X3, e30l<>(SB), e31l<>(SB), e32l<>(SB), e33l<>(SB))

	MOVQ	$SM4_ROUNDS, CX

roundLoop:
	// X4 = rk ^ X1 ^ X2 ^ X3, with the round key broadcast to all lanes.
	MOVL	(BX), X4
	PSHUFD	$0, X4, X4
	ADDQ	$4, BX
	PXOR	X1, X4
	PXOR	X2, X4
	PXOR	X3, X4

	// Inner affine map: X4 = m1l[low nibble] ^ m1h[high nibble], per byte.
	MOVOU	X4, X5
	PAND	c0f<>(SB), X5
	MOVOU	m1l<>(SB), X6
	PSHUFB	X5, X6
	MOVOU	X6, X5
	PSRLQ	$4, X4
	PAND	c0f<>(SB), X4
	MOVOU	m1h<>(SB), X6
	PSHUFB	X4, X6
	MOVOU	X6, X4
	PXOR	X5, X4

	// Undo ShiftRows in advance, then let AESENCLAST with an all-zero round
	// key do the byte substitution.
	PSHUFB	shr<>(SB), X4
	AESENCLAST	zero<>(SB), X4

	// Outer affine map: X4 = m2l[low nibble] ^ m2h[high nibble], per byte.
	MOVOU	X4, X5
	PAND	c0f<>(SB), X5
	MOVOU	m2l<>(SB), X6
	PSHUFB	X5, X6
	MOVOU	X6, X5
	PSRLQ	$4, X4
	PAND	c0f<>(SB), X4
	MOVOU	m2h<>(SB), X6
	PSHUFB	X4, X6
	MOVOU	X6, X4
	PXOR	X5, X4

	// Four parallel linear transforms:
	//   X5 = rotl32(x ^ rotl32(x, 8) ^ rotl32(x, 16), 2)
	//   X4 = x ^ rotl32(x, 24) ^ X5
	MOVOU	X4, X5
	MOVOU	X4, X6
	PSHUFB	r08<>(SB), X5
	PSHUFB	r16<>(SB), X6
	PXOR	X4, X5
	PXOR	X6, X5
	MOVOU	X5, X6
	PSLLL	$2, X6
	PSRLL	$30, X5
	PXOR	X6, X5
	MOVOU	X4, X6
	PSHUFB	r24<>(SB), X6
	PXOR	X6, X4
	PXOR	X5, X4

	// New word = X0 ^ transformed value; slide the state window by one word.
	PXOR	X0, X4
	MOVOU	X1, X0
	MOVOU	X2, X1
	MOVOU	X3, X2
	MOVOU	X4, X3

	DECQ	CX
	JNZ	roundLoop

	// Write X3 || X2 || X1 || X0 per block, transposing back; the masks also
	// restore big-endian byte order within each word.
	storeTranspose(e00l<>(SB), e01l<>(SB), e02l<>(SB), e03l<>(SB), 0)
	storeTranspose(e10l<>(SB), e11l<>(SB), e12l<>(SB), e13l<>(SB), 16)
	storeTranspose(e20l<>(SB), e21l<>(SB), e22l<>(SB), e23l<>(SB), 32)
	storeTranspose(e30l<>(SB), e31l<>(SB), e32l<>(SB), e33l<>(SB), 48)
	RET