init: v1.0.0
This commit is contained in:
@@ -0,0 +1,13 @@
|
||||
package simd
|
||||
|
||||
// encrypt4 runs the SM4 round function over four 16-byte blocks at once.
// It is implemented in assembly: it reads 64 bytes from src and 32 round keys
// from rk, and writes 64 bytes to dst. The assembly performs no length checks,
// so callers must pass slices that are at least that long.
//
//go:noescape
|
||||
func encrypt4(dst []byte, src []byte, rk []uint32)
|
||||
|
||||
// mmSet copies the first four 32-bit words of src into the first 16 bytes of
// dst by way of an XMM register. It is implemented in assembly and performs no
// length checks: src needs at least 4 elements and dst at least 16 bytes.
//
//go:noescape
|
||||
func mmSet(dst []byte, src []uint32)
|
||||
|
||||
// mmExtLoad32 broadcasts the constant 0x04030201 into all four 32-bit lanes of
// an XMM register and stores the result, so the first 16 bytes of dst become
// 01 02 03 04 repeated four times. It is implemented in assembly and performs
// no length check on dst.
//
//go:noescape
|
||||
func mmExtLoad32(dst []byte)
|
||||
|
||||
// mmTranspose treats the first 64 bytes of src as a 4x4 matrix of 32-bit words
// and writes its transpose to the first 64 bytes of dst; the bytes inside each
// word keep their order. It is implemented in assembly and performs no length
// checks.
//
//go:noescape
|
||||
func mmTranspose(dst []byte, src []byte)
|
||||
@@ -0,0 +1,316 @@
|
||||
/**
|
||||
Encrypts four SM4 blocks in parallel using AES-NI instructions.
|
||||
|
||||
TODO: MOVUPS or MOVOU?
|
||||
*/
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
// zero is an all-zero 128-bit value. encrypt4 hands it to AESENCLAST as the
// round key so that the instruction's final XOR leaves the state untouched.
DATA zero<>+0(SB)/8, $0x0
DATA zero<>+8(SB)/8, $0x0
GLOBL zero<>(SB), (NOPTR+RODATA), $16

// ext is a PSHUFB mask that copies source bytes 0-3 into every 32-bit lane,
// i.e. it broadcasts the low dword across the register.
DATA ext<>+0(SB)/8, $0x0302010003020100
DATA ext<>+8(SB)/8, $0x0302010003020100
GLOBL ext<>(SB), (NOPTR+RODATA), $16

// l64 is a PSHUFB mask that moves the low 8 bytes into the high half and
// clears the low half (index bytes with the top bit set produce zero).
DATA l64<>+0(SB)/8, $0xffffffffffffffff
DATA l64<>+8(SB)/8, $0x0706050403020100
GLOBL l64<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
// Output masks used with storeTranspose.
//
// eNKl is a PSHUFB mask that takes dword N of the source register, reverses
// its four bytes, places the result in dword K of the destination and zeroes
// the remaining twelve bytes (index 0xff selects zero). XORing the four
// results for K = 0..3 therefore assembles one 16-byte output block.
// The tables are read-only, so they are placed in RODATA.

// Source dword 0.
DATA e00l<>+0(SB)/8, $0xffffffff00010203
DATA e00l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e00l<>(SB), (NOPTR+RODATA), $16

DATA e01l<>+0(SB)/8, $0x00010203ffffffff
DATA e01l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e01l<>(SB), (NOPTR+RODATA), $16

DATA e02l<>+0(SB)/8, $0xffffffffffffffff
DATA e02l<>+8(SB)/8, $0xffffffff00010203
GLOBL e02l<>(SB), (NOPTR+RODATA), $16

DATA e03l<>+0(SB)/8, $0xffffffffffffffff
DATA e03l<>+8(SB)/8, $0x00010203ffffffff
GLOBL e03l<>(SB), (NOPTR+RODATA), $16

// Source dword 1.
DATA e10l<>+0(SB)/8, $0xffffffff04050607
DATA e10l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e10l<>(SB), (NOPTR+RODATA), $16

DATA e11l<>+0(SB)/8, $0x04050607ffffffff
DATA e11l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e11l<>(SB), (NOPTR+RODATA), $16

DATA e12l<>+0(SB)/8, $0xffffffffffffffff
DATA e12l<>+8(SB)/8, $0xffffffff04050607
GLOBL e12l<>(SB), (NOPTR+RODATA), $16

DATA e13l<>+0(SB)/8, $0xffffffffffffffff
DATA e13l<>+8(SB)/8, $0x04050607ffffffff
GLOBL e13l<>(SB), (NOPTR+RODATA), $16

// Source dword 2.
DATA e20l<>+0(SB)/8, $0xffffffff08090a0b
DATA e20l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e20l<>(SB), (NOPTR+RODATA), $16

DATA e21l<>+0(SB)/8, $0x08090a0bffffffff
DATA e21l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e21l<>(SB), (NOPTR+RODATA), $16

DATA e22l<>+0(SB)/8, $0xffffffffffffffff
DATA e22l<>+8(SB)/8, $0xffffffff08090a0b
GLOBL e22l<>(SB), (NOPTR+RODATA), $16

DATA e23l<>+0(SB)/8, $0xffffffffffffffff
DATA e23l<>+8(SB)/8, $0x08090a0bffffffff
GLOBL e23l<>(SB), (NOPTR+RODATA), $16

// Source dword 3.
DATA e30l<>+0(SB)/8, $0xffffffff0c0d0e0f
DATA e30l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e30l<>(SB), (NOPTR+RODATA), $16

DATA e31l<>+0(SB)/8, $0x0c0d0e0fffffffff
DATA e31l<>+8(SB)/8, $0xffffffffffffffff
GLOBL e31l<>(SB), (NOPTR+RODATA), $16

DATA e32l<>+0(SB)/8, $0xffffffffffffffff
DATA e32l<>+8(SB)/8, $0xffffffff0c0d0e0f
GLOBL e32l<>(SB), (NOPTR+RODATA), $16

DATA e33l<>+0(SB)/8, $0xffffffffffffffff
DATA e33l<>+8(SB)/8, $0x0c0d0e0fffffffff
GLOBL e33l<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
|
||||
// Constants for the round function. All of them are read-only lookup tables
// or PSHUFB masks, so they are placed in RODATA.

// c0f keeps the low nibble of every byte.
DATA c0f<>+0(SB)/8, $0x0F0F0F0F0F0F0F0F
DATA c0f<>+8(SB)/8, $0x0F0F0F0F0F0F0F0F
GLOBL c0f<>(SB), (NOPTR+RODATA), $16

// flp reverses the byte order inside each 32-bit lane (endianness swap).
DATA flp<>+0(SB)/8, $0x0405060700010203
DATA flp<>+8(SB)/8, $0x0C0D0E0F08090A0B
GLOBL flp<>(SB), (NOPTR+RODATA), $16

// shr is the inverse of the AES ShiftRows byte permutation; applying it before
// AESENCLAST cancels the ShiftRows step that instruction performs.
DATA shr<>+0(SB)/8, $0x0B0E0104070A0D00
DATA shr<>+8(SB)/8, $0x0306090C0F020508
GLOBL shr<>(SB), (NOPTR+RODATA), $16

// m1l / m1h: low-nibble and high-nibble lookup tables of the inner affine map
// applied before the AES S-box.
DATA m1l<>+0(SB)/8, $0x9197E2E474720701
DATA m1l<>+8(SB)/8, $0xC7C1B4B222245157
GLOBL m1l<>(SB), (NOPTR+RODATA), $16

DATA m1h<>+0(SB)/8, $0xE240AB09EB49A200
DATA m1h<>+8(SB)/8, $0xF052B91BF95BB012
GLOBL m1h<>(SB), (NOPTR+RODATA), $16

// m2l / m2h: low-nibble and high-nibble lookup tables of the outer affine map
// applied after the AES S-box.
DATA m2l<>+0(SB)/8, $0x5B67F2CEA19D0834
DATA m2l<>+8(SB)/8, $0xEDD14478172BBE82
GLOBL m2l<>(SB), (NOPTR+RODATA), $16

DATA m2h<>+0(SB)/8, $0xAE7201DD73AFDC00
DATA m2h<>+8(SB)/8, $0x11CDBE62CC1063BF
GLOBL m2h<>(SB), (NOPTR+RODATA), $16

// r08, r16 and r24 are PSHUFB masks that rotate every 32-bit lane left by
// 8, 16 and 24 bits respectively.
DATA r08<>+0(SB)/8, $0x0605040702010003
DATA r08<>+8(SB)/8, $0x0E0D0C0F0A09080B
GLOBL r08<>(SB), (NOPTR+RODATA), $16

DATA r16<>+0(SB)/8, $0x0504070601000302
DATA r16<>+8(SB)/8, $0x0D0C0F0E09080B0A
GLOBL r16<>(SB), (NOPTR+RODATA), $16

DATA r24<>+0(SB)/8, $0x0407060500030201
DATA r24<>+8(SB)/8, $0x0C0F0E0D080B0A09
GLOBL r24<>(SB), (NOPTR+RODATA), $16
|
||||
|
||||
|
||||
// transposeLoad gathers one column of a 4x4 matrix of 32-bit words.
// It reads the dwords at byte offsets begin, begin+16, begin+32 and begin+48
// from the buffer addressed by reg and packs them, in that order, into the
// four lanes of X (lowest lane first). XT is an XMM scratch register.
// Clobbers AX, BX, CX and DX.
#define transposeLoad(reg, begin, X, XT) \
	MOVL (begin+0)(reg), AX; \
	MOVL (begin+16)(reg), BX; \
	SHLQ $32, BX; \
	XORQ AX, BX; \
	MOVQ BX, XT; \
	MOVL (begin+32)(reg), CX; \
	MOVL (begin+48)(reg), DX; \
	SHLQ $32, DX; \
	XORQ CX, DX; \
	MOVQ DX, X; \
	PSHUFB l64<>(SB), X; \
	PXOR XT, X
|
||||
|
||||
// storeTranspose builds one 16-byte block and stores it at begin(DI).
// mask0..mask3 are PSHUFB masks applied to copies of X3, X2, X1 and X0
// respectively; the four shuffled values are XORed together and written out.
// encrypt4 passes the eNKl tables, each of which keeps a single byte-reversed
// dword and zeroes everything else, so the XOR simply merges the four dwords.
// Uses X4-X7 as scratch; X0-X3 are left unchanged.
//
// The last line deliberately has no trailing backslash, so the macro ends here
// regardless of what follows it in the file.
#define storeTranspose(mask0, mask1, mask2, mask3, begin) \
	MOVOU X3, X4; \
	MOVOU X2, X5; \
	MOVOU X1, X6; \
	MOVOU X0, X7; \
	PSHUFB mask0, X4; \
	PSHUFB mask1, X5; \
	PSHUFB mask2, X6; \
	PSHUFB mask3, X7; \
	PXOR X5, X4; \
	PXOR X6, X4; \
	PXOR X7, X4; \
	MOVOU X4, (begin)(DI)
|
||||
|
||||
// sm4Affine applies a byte-wise affine map to X4 using two 16-entry PSHUFB
// lookup tables: lo is indexed by the low nibble of every byte, hi by the high
// nibble, and the two lookups are XORed together.
// Uses X5 and X6 as scratch.
#define sm4Affine(lo, hi) \
	MOVUPS X4, X5; \
	PAND c0f<>(SB), X5; \
	MOVUPS lo, X6; \
	PSHUFB X5, X6; \
	MOVUPS X6, X5; \
	PSRLQ $4, X4; \
	PAND c0f<>(SB), X4; \
	MOVUPS hi, X6; \
	PSHUFB X4, X6; \
	MOVUPS X6, X4; \
	PXOR X5, X4

// func encrypt4(dst []byte, src []byte, rk []uint32)
//
// Runs the 32 SM4 rounds over four 16-byte blocks at once. The S-box is
// evaluated with AESENCLAST wrapped in two nibble-table affine maps.
//
//   src: 64 input bytes (four consecutive blocks)
//   rk:  32 round keys, consumed in order
//   dst: receives 64 output bytes
//
// Slice lengths are not checked.
// Clobbers AX, BX, CX, DX, SI, DI and X0-X7.
TEXT ·encrypt4(SB), NOSPLIT, $0-72
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI

	// Transposed load: Xi collects 32-bit word i of each of the four blocks.
	transposeLoad(SI, 0, X0, X4)
	transposeLoad(SI, 4, X1, X4)
	transposeLoad(SI, 8, X2, X4)
	transposeLoad(SI, 12, X3, X4)

	// Byte-swap every dword so the big-endian input words can be handled as
	// native integers.
	PSHUFB flp<>(SB), X0
	PSHUFB flp<>(SB), X1
	PSHUFB flp<>(SB), X2
	PSHUFB flp<>(SB), X3

	MOVQ rk_base+48(FP), BX // BX walks the round keys
	XORQ CX, CX             // CX counts completed rounds

round:
	CMPQ CX, $32
	JGE  done

	// X4 = rk[i] (broadcast to every lane) ^ X1 ^ X2 ^ X3
	MOVL (BX), X4
	PSHUFB ext<>(SB), X4
	PXOR X1, X4
	PXOR X2, X4
	PXOR X3, X4

	// S-box: inner affine map, AES SubBytes, outer affine map.
	// AESENCLAST also performs ShiftRows, so that permutation is undone in
	// advance with shr; the zero round key turns its final XOR into a no-op.
	sm4Affine(m1l<>(SB), m1h<>(SB))
	PSHUFB shr<>(SB), X4
	AESENCLAST zero<>(SB), X4
	sm4Affine(m2l<>(SB), m2h<>(SB))

	// Linear transform, applied to each of the four dwords:
	//   X4 = X4 ^ rotl(X4, 2) ^ rotl(X4, 10) ^ rotl(X4, 18) ^ rotl(X4, 24)
	// X5 first accumulates X4 ^ rotl8 ^ rotl16 and is then rotated left by 2.
	MOVUPS X4, X5
	MOVUPS X4, X6
	PSHUFB r08<>(SB), X5
	PSHUFB r16<>(SB), X6
	PXOR X4, X5
	PXOR X6, X5
	MOVUPS X5, X6
	PSLLL $2, X6
	PSRLL $30, X5
	PXOR X6, X5
	MOVUPS X4, X6
	PSHUFB r24<>(SB), X6
	PXOR X6, X4
	PXOR X5, X4

	// Fold in the oldest state word, then slide the window:
	// (X0, X1, X2, X3) <- (X1, X2, X3, new word).
	PXOR X0, X4
	MOVUPS X1, X0
	MOVUPS X2, X1
	MOVUPS X3, X2
	MOVUPS X4, X3

	ADDQ $4, BX
	INCQ CX
	JMP  round

done:
	// Write the state back in reversed word order (X3, X2, X1, X0).
	// The eNKl masks transpose the lanes back into per-block layout and also
	// restore the original byte order of every dword, so no separate flp
	// shuffle is needed here.
	storeTranspose(e00l<>(SB), e01l<>(SB), e02l<>(SB), e03l<>(SB), 0)
	storeTranspose(e10l<>(SB), e11l<>(SB), e12l<>(SB), e13l<>(SB), 16)
	storeTranspose(e20l<>(SB), e21l<>(SB), e22l<>(SB), e23l<>(SB), 32)
	storeTranspose(e30l<>(SB), e31l<>(SB), e32l<>(SB), e33l<>(SB), 48)
	RET
|
||||
|
||||
|
||||
// mm_set builds a 128-bit value in X from four 32-bit operands, with a0 in the
// lowest lane and a3 in the highest. The words are staged through the 16-byte
// stack slot tmp-16(SP), so the enclosing TEXT must declare a frame of at
// least 16 bytes. Clobbers AX.
#define mm_set(X, a3, a2, a1, a0) \
	MOVL a0, AX; \
	MOVL AX, tmp-16(SP); \
	MOVL a1, AX; \
	MOVL AX, tmp-12(SP); \
	MOVL a2, AX; \
	MOVL AX, tmp-8(SP); \
	MOVL a3, AX; \
	MOVL AX, tmp-4(SP); \
	MOVUPS tmp-16(SP), X
|
||||
|
||||
// mm_store is the counterpart of mm_set: it spills X to the 16-byte stack slot
// tmp-16(SP) and copies the four 32-bit lanes to a0 (lowest lane) through a3
// (highest lane). The enclosing TEXT must declare a frame of at least 16
// bytes. Clobbers AX.
#define mm_store(X, a3, a2, a1, a0) \
	MOVUPS X, tmp-16(SP); \
	MOVL tmp-16(SP), AX; \
	MOVL AX, a0; \
	MOVL tmp-12(SP), AX; \
	MOVL AX, a1; \
	MOVL tmp-8(SP), AX; \
	MOVL AX, a2; \
	MOVL tmp-4(SP), AX; \
	MOVL AX, a3
|
||||
|
||||
// func mmSet(dst []byte, src []uint32)
//
// Loads src[0..3] into X0 through the mm_set macro (src[0] ends up in the
// lowest lane) and stores the register to the first 16 bytes of dst.
// The 16-byte frame is the scratch slot mm_set requires.
// Slice lengths are not checked.
TEXT ·mmSet(SB), NOSPLIT, $16-48
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI

	mm_set(X0, 12(SI), 8(SI), 4(SI), 0(SI))
	MOVUPS X0, (DI)
	RET
|
||||
|
||||
|
||||
|
||||
|
||||
// func mmExtLoad32(dst []byte)
//
// Demonstrates the ext broadcast mask: the constant 0x04030201 is placed in
// the low dword of X0, replicated into all four lanes with PSHUFB, and the
// resulting 16 bytes are stored to dst. The length of dst is not checked.
TEXT ·mmExtLoad32(SB), NOSPLIT, $0-24
	MOVQ dst_base+0(FP), DI

	MOVQ $0x04030201, AX
	MOVQ AX, X0           // low dword of X0 = AX
	PSHUFB ext<>(SB), X0  // X0 = AX | AX | AX | AX
	MOVUPS X0, (DI)
	RET
|
||||
|
||||
|
||||
// func mmTranspose(dst []byte, src []byte)
//
// Transposes a 4x4 matrix of 32-bit words: row i of the result (16 bytes at
// dst+16*i) holds word i of each of the four 16-byte rows of src.
// Slice lengths are not checked.
// Clobbers AX, BX, CX, DX, SI, DI and X0-X4.
TEXT ·mmTranspose(SB), NOSPLIT, $0-48
	MOVQ dst_base+0(FP), DI
	MOVQ src_base+24(FP), SI

	// Xi = column i of the source matrix.
	transposeLoad(SI, 0, X0, X4)
	transposeLoad(SI, 4, X1, X4)
	transposeLoad(SI, 8, X2, X4)
	transposeLoad(SI, 12, X3, X4)

	// Each gathered column becomes one row of the destination.
	MOVUPS X0, 0(DI)
	MOVUPS X1, 16(DI)
	MOVUPS X2, 32(DI)
	MOVUPS X3, 48(DI)
	RET
|
||||
|
||||
|
||||
@@ -0,0 +1,75 @@
|
||||
package simd
|
||||
|
||||
import (
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"xdx.jelly/xgcl/sm/sm4"
|
||||
)
|
||||
|
||||
func TestEncrypt4(t *testing.T) {
|
||||
key, _ := hex.DecodeString("AF07B5BDDF77A3727E9E5FEC48DA1D9E")
|
||||
// test 4 blocks for aesni
|
||||
src, _ := hex.DecodeString(
|
||||
"B358A63B7587FCCB46CD41FFE778D5C1" +
|
||||
"B358A63B7587FCCB46CD41FFE778D5C1" +
|
||||
"B358A63B7587FCCB46CD41FFE778D5C1" +
|
||||
"B358A63B7587FCCB46CD41FFE778D5C1",
|
||||
)
|
||||
|
||||
dst := make([]byte, len(src))
|
||||
encKey := make([]uint32, 32)
|
||||
decKey := make([]uint32, 32)
|
||||
sm4.ExpandKey(key, encKey, decKey)
|
||||
encrypt4(dst, src, encKey)
|
||||
// dst = 6845268B 91394C00 6648E71A 2D6D68C1 * 4
|
||||
for i := 0; i < 64; i++ {
|
||||
fmt.Printf("%02x ", dst[i])
|
||||
if i > 0 && (i+1)%16 == 0 {
|
||||
fmt.Println()
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestMMSet(t *testing.T) {
|
||||
// src := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
|
||||
src := []uint32{0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210}
|
||||
dst := make([]byte, 16)
|
||||
mmSet(dst, src)
|
||||
fmt.Printf("%x\n", dst)
|
||||
}
|
||||
|
||||
// 67452301 efcdab89 98badcfe 10325476
|
||||
|
||||
func TestMMExtLoad32(t *testing.T) {
|
||||
dst := make([]byte, 16)
|
||||
mmExtLoad32(dst)
|
||||
fmt.Printf("%x\n", dst)
|
||||
}
|
||||
|
||||
// printBlock4 hex-dumps up to four 16-byte blocks of s to standard output.
// Every byte is printed as two hex digits, a space follows each 4th byte and
// a newline each 16th, so a full 64-byte buffer appears as four rows of four
// words. Bytes beyond the first 64 are ignored; slices shorter than 64 bytes
// are printed as far as they go instead of causing an index panic.
func printBlock4(s []byte) {
	const maxBytes = 64 // four 16-byte blocks
	if len(s) > maxBytes {
		s = s[:maxBytes]
	}
	for i, b := range s {
		fmt.Printf("%02x", b)
		if (i+1)%4 == 0 {
			fmt.Print(" ")
		}
		if (i+1)%16 == 0 {
			fmt.Println()
		}
	}
}
|
||||
func TestTranspose(t *testing.T) {
|
||||
var src = make([]byte, 64)
|
||||
for i := 0; i < 64; i++ {
|
||||
src[i] = byte(i)
|
||||
}
|
||||
|
||||
dst := make([]byte, 64)
|
||||
mmTranspose(dst, src)
|
||||
|
||||
printBlock4(src)
|
||||
fmt.Println("----------")
|
||||
printBlock4(dst)
|
||||
}
|
||||
Reference in New Issue
Block a user