init: v1.0.0
This commit is contained in:
@@ -0,0 +1,598 @@
|
||||
#include "textflag.h"
|
||||
|
||||
// SM3 block routine.
|
||||
//
|
||||
// No "function stitching" now.
|
||||
//
|
||||
// The algorithm is detailed in GB/T 32905-2016
|
||||
|
||||
// FFt(x,y,z) = GGt(x,y,z) = Parity(x,y,z) for 0 <= t <= 15
|
||||
// FFt(x,y,z) = Maj(x,y,z) for 15 <= t <= 63
|
||||
// GGt(x,y,z) = Ch(x,y,z) for 15 <= t <= 63
|
||||
//
|
||||
// Wt = Mt; for 0 <= t <= 15
|
||||
// Wt = P1(Wt-16 xor Wt-9 xor ROTL(Wt-3,15))
|
||||
// xor ROTL(Wi-13, 7) xor Wt-6 for 16 <= t <= 67
|
||||
// W't = Wt xor Wt+4 for 0 <= t <= 63.
|
||||
//
|
||||
// a = V0
|
||||
// b = V1
|
||||
// c = V2
|
||||
// d = V3
|
||||
// e = V4
|
||||
// f = V5
|
||||
// g = V6
|
||||
// h = V7
|
||||
//
|
||||
// for t = 0 to 63 {
|
||||
// SS1 = ROTL(ROTL(a,12) + E + ROTL(Tt, t mod 32), 7)
|
||||
// SS2 = SS1 xor ROTL(a,12)
|
||||
// TT1 = FFt(a,b,c) + D + SS2 +W't
|
||||
// TT2 = GGt(e,f,g) + h + SS1 +Wt
|
||||
// d = c
|
||||
// c = ROTL(b,9)
|
||||
// b = a
|
||||
// a = TT1
|
||||
// h = g
|
||||
// g = ROTL(f,19)
|
||||
// f = e
|
||||
// e = P0(TT2)
|
||||
// }
|
||||
//
|
||||
// V0 = a xor V0
|
||||
// V1 = b xor V1
|
||||
// V2 = c xor V2
|
||||
// V3 = d xor V3
|
||||
// V4 = e xor V4
|
||||
// V5 = f xor V5
|
||||
// V6 = g xor V6
|
||||
// V7 = h xor V7
|
||||
|
||||
// Definitions for AVX2 version
|
||||
|
||||
// xorm (mem), reg
|
||||
// Xor reg to mem using reg-mem xor and store
|
||||
#define xorm(P1, P2) \
|
||||
XORL P2, P1; \
|
||||
MOVL P1, P2
|
||||
|
||||
#define XDWORD0 Y4
|
||||
#define XDWORD1 Y5
|
||||
#define XDWORD2 Y6
|
||||
#define XDWORD3 Y7
|
||||
|
||||
#define XWORD0 X4
|
||||
#define XWORD1 X5
|
||||
#define XWORD2 X6
|
||||
#define XWORD3 X7
|
||||
|
||||
#define XTMP0 Y0
|
||||
#define XTMP1 Y1
|
||||
#define XTMP2 Y2
|
||||
#define XTMP3 Y3
|
||||
#define XTMP4 Y8
|
||||
#define XTMP5 Y11
|
||||
|
||||
#define XFER Y9
|
||||
|
||||
#define BYTE_FLIP_MASK Y13 // mask to convert LE -> BE
|
||||
#define X_BYTE_FLIP_MASK X13
|
||||
|
||||
#define NUM_BYTES DX
|
||||
#define INP DI
|
||||
|
||||
#define CTX SI // Beginning of digest in memory (a, b, c, ... , h)
|
||||
|
||||
#define a AX
|
||||
#define b BX
|
||||
#define c CX
|
||||
#define d R8
|
||||
#define e DX
|
||||
#define f R9
|
||||
#define g R10
|
||||
#define h R11
|
||||
|
||||
#define old_h R11
|
||||
|
||||
#define TBL BP
|
||||
|
||||
#define SRND SI // SRND is same register as CTX
|
||||
|
||||
#define T1 R12
|
||||
|
||||
#define y0 R13
|
||||
#define y1 R14
|
||||
#define y2 R15
|
||||
#define y3 DI
|
||||
|
||||
// Offsets
|
||||
#define XFER_SIZE 2*2*68*4
|
||||
#define INP_END_SIZE 8
|
||||
#define INP_SIZE 8
|
||||
|
||||
#define _XFER 0
|
||||
#define _INP_END _XFER + XFER_SIZE
|
||||
#define _INP _INP_END + INP_END_SIZE
|
||||
#define STACK_SIZE _INP + INP_SIZE
|
||||
|
||||
|
||||
// update XDWORD0
|
||||
#define SM3_SCHED(XDWORD0, XDWORD1, XDWORD2, XDWORD3)\
|
||||
; \ // ################################### Message Schedule ###########################
|
||||
VPALIGNR $8, XDWORD2, XDWORD3, XTMP0; \ // XTMP0 = W[-6]
|
||||
VPALIGNR $12, XDWORD0, XDWORD1, XTMP1; \ // XTMP1 = W[-13]
|
||||
VPSLLD $7, XTMP1, XTMP2; \
|
||||
VPSRLD $(32-7), XTMP1, XTMP1; \
|
||||
VPXOR XTMP2, XTMP1, XTMP1; \ // XTMP1 = (w[-13] <<< 7)
|
||||
VPXOR XTMP1, XTMP0, XTMP0; \ // XTMP0 = (w[-13] <<< 7) ^ w[-6]
|
||||
; \
|
||||
VPALIGNR $12, XDWORD1, XDWORD2, XTMP1; \ // XTMP1 = W[-9]
|
||||
VPXOR XDWORD0, XTMP1, XTMP1; \ // XTMP1 = W[-9]^W[-16]
|
||||
; \
|
||||
VPSHUFD $0xA5, XDWORD3, XTMP2; \ // XTMP2 = W[-3] {BBAA}
|
||||
VPSRLQ $17, XTMP2, XTMP4; \ // XTMP4 = W[-3] <<< 15 {xBxA}
|
||||
VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4; \ // XTMP4 = W[-3] <<< 15 {00BA}
|
||||
VPXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9]^W[-16] ^ (W[-3] <<< 15) {xxBA}
|
||||
VPSHUFD $0x50, XTMP4, XTMP4; \ // XTMP4 = W[-9]^W[-16] ^ (W[-3] <<< 15) {BBAA}
|
||||
VPSRLQ $17, XTMP4,XTMP2; \ // {xBxA}
|
||||
VPSRLQ $9, XTMP4,XTMP3; \ // {xBxA}
|
||||
VPXOR XTMP2, XTMP4, XTMP4; \ //
|
||||
VPXOR XTMP3, XTMP4, XTMP4; \ // XTMP4 = p1 {xBxA}
|
||||
VPSHUFB shuff_00BA<>(SB), XTMP4, XTMP4; \ // XTMP4 = p1 {00BA}
|
||||
VPXOR XTMP4, XTMP0, XTMP5; \ // XTMP5 = {..., ..., W[1], W[0]}
|
||||
; \
|
||||
VPALIGNR $4, XDWORD3, XTMP5, XTMP2; \ // XTMP2 = {W[0], W[-1], W[-2], W[-3]}
|
||||
VPSHUFD $0xFA, XTMP2, XTMP2; \ // XTMP2 = {W[0], W[0], W[-1], W[-1]} {DDCC}
|
||||
VPSRLQ $17, XTMP2, XTMP4; \ // XTMP4 = W[-3] <<< 15 {xDxC}
|
||||
VPSHUFB shuff_DC00<>(SB), XTMP4, XTMP4; \ // XTMP4 = W[-3] <<< 15 {DC00}
|
||||
VPXOR XTMP1, XTMP4, XTMP4; \ // XTMP4 = W[-9]^W[-16] ^ (W[-3] <<< 15) {DCxx}
|
||||
VPSHUFD $0xFA, XTMP4, XTMP4; \ // XTMP4 = W[-9]^W[-16] ^ (W[-3] <<< 15) {DDCC}
|
||||
VPSRLQ $17, XTMP4,XTMP2; \
|
||||
VPSRLQ $9, XTMP4,XTMP3; \
|
||||
VPXOR XTMP2, XTMP4, XTMP4; \
|
||||
VPXOR XTMP3, XTMP4, XTMP4; \ // XTMP4 = p1 {xDxC}
|
||||
VPSHUFB shuff_DC00<>(SB), XTMP4, XTMP4; \ // XTMP4 = p1 {DC00}
|
||||
VPXOR XTMP4, XTMP5,XDWORD0
|
||||
|
||||
|
||||
// 前16轮计算-0
|
||||
// SRND = # of {4 round}
|
||||
// T is saved in (dist_T)(TBL)(SRND*1)
|
||||
// W is saved in (dist_W)(SP)(SRND*2)
|
||||
// W' is saved in (dist_W + 68*4*2)(SP)(SRND*1)
|
||||
#define DO_ROUND16(dist_T, dist_W, a, b, c, d, e, f, g, h) \
|
||||
; \ // ################################### RND 0 - 15 ###########################
|
||||
RORXL $(32-12), a, y2; \ // y2 = a <<< 12
|
||||
MOVL y2, y3; \
|
||||
ADDL e, y3; \ // y3 = (a <<< 12) + e
|
||||
ADDL (dist_T)(TBL)(SRND*1), y3; \ // y3 = (a <<< 12) + e + T
|
||||
RORXL $(32-7), y3, y3; \ // y3 = ss1
|
||||
ADDL y3, h; \ // h = h+ss1
|
||||
XORL y2, y3; \ // y3 = ss2
|
||||
ADDL y3, d; \ // d = d+ss2
|
||||
; \
|
||||
ADDL (dist_W)(SP)(SRND*2), h; \ // h = h + ss1 + w
|
||||
ADDL (dist_W+68*4*2)(SP)(SRND*2), d; \ // d = d + ss2 + w'
|
||||
; \
|
||||
MOVL a, y1; \ // y1 = a //FF //PARITY
|
||||
XORL b, y1; \ // y1 = a^b //FF //PARITY
|
||||
XORL c, y1; \ // y1 = a^b^c //FF //PARITY
|
||||
ADDL y1, d; \ // d = TT1
|
||||
; \
|
||||
MOVL e, y2; \ // y2 = e // GG //PARITY
|
||||
XORL f, y2; \ // y2 = e^f // GG //PARITY
|
||||
XORL g, y2; \ // y2 = e^f^g // GG //PARITY
|
||||
ADDL y2, h; \ // h = TT2
|
||||
; \
|
||||
RORXL $(32-8), h, y0; \ // y0 = TT2<<<8
|
||||
XORL h,y0; \ // y0 = (TT2<<<8)^TT2
|
||||
RORXL $(32-9), y0, y0; \ // y0 = ((TT2<<<8)^TT2)<<<9
|
||||
XORL y0, h; \ // h = p0(TT2)
|
||||
; \
|
||||
RORXL $(32-9), b, b; \
|
||||
RORXL $(32-19), f, f
|
||||
|
||||
// 后48轮计算
|
||||
#define DO_ROUND48(dist_T, dist_W, a, b, c, d, e, f, g, h) \
|
||||
; \ // ################################### RND 16 - 63 ###########################
|
||||
RORXL $(32-12), a, y2; \ // y2 = a <<< 12
|
||||
MOVL y2, y3; \
|
||||
ADDL e, y3; \ // y3 = (a <<< 12) + e
|
||||
ADDL (dist_T)(TBL)(SRND*1), y3; \ // y3 = (a <<< 12) + e + T
|
||||
RORXL $(32-7), y3, y3; \ // y3 = ss1
|
||||
ADDL y3, h; \ // h = h+ss1
|
||||
XORL y2, y3; \ // y3 = ss2
|
||||
ADDL y3, d; \ // d = d+ss2
|
||||
; \
|
||||
ADDL (dist_W)(SP)(SRND*2), h; \ // h = h + ss1 + w
|
||||
ADDL (dist_W+68*4*2)(SP)(SRND*2), d; \ // d = d + ss2 + w'
|
||||
; \
|
||||
MOVL a, y3; \ // y3 = a //FF MAJA
|
||||
ORL c, y3; \ // y3 = a|c // MAJA
|
||||
ANDL b, y3; \ // y3 = (a|c)&b // MAJA
|
||||
MOVL a, T1; \ // T1 = a // MAJB
|
||||
ANDL c, T1; \ // T1 = a&c // MAJB
|
||||
ORL T1, y3; \ // y3 = MAJ = ((a|c)&b)|(a&c) // MAJ
|
||||
ADDL y3, d; \ // d = TT1
|
||||
; \
|
||||
MOVL f, y2; \ // y2 = f //GG CH
|
||||
XORL g, y2; \ // y2 = f^g // CH
|
||||
ANDL e, y2; \ // y2 = (f^g)&e // CH
|
||||
XORL g, y2; \ // y2 = CH = ((f^g)&e)^g // CH
|
||||
ADDL y2, h; \ // h = TT2
|
||||
; \
|
||||
RORXL $(32-8), h, y0; \ // y0 = TT2<<<8
|
||||
XORL h,y0; \ // y0 = (TT2<<<8)^TT2
|
||||
RORXL $(32-9), y0, y0; \ // y0 = ((TT2<<<8)^TT2)<<<9
|
||||
XORL y0, h; \ // h = p0(TT2)
|
||||
; \
|
||||
RORXL $(32-9), b, b; \
|
||||
RORXL $(32-19), f, f
|
||||
|
||||
|
||||
// stack:
|
||||
// block0 block1
|
||||
// 0*8: W[0:4] V[0:4]
|
||||
// 1*8: W[4:8] V[4:8]
|
||||
// ...
|
||||
// 67*8: W[64:68] V[64:68]
|
||||
// 68*8: W'[0:4] V'[0:4]
|
||||
// 69*8: W'[4:8] V'[4:8]
|
||||
// ...
|
||||
// 135*8: W'[64:68] V'[64:68]
|
||||
// 136*8: _INP_END(SP) - Pointer to the last block
|
||||
// 137*8: _INP(SP) - Save INP in round computation.
|
||||
//
|
||||
// STACK_SIZE = 1088+8+8 = 1104
|
||||
// func blockAsmAVX2(dig *digest, p []byte)
|
||||
TEXT ·blockAsmAVX2(SB), 0, $1104-32
|
||||
MOVQ dig+0(FP), CTX
|
||||
MOVQ p_base+8(FP), INP
|
||||
MOVQ p_len+16(FP), NUM_BYTES
|
||||
|
||||
LEAQ -64(INP)(NUM_BYTES*1), NUM_BYTES // Pointer to the last block
|
||||
MOVQ NUM_BYTES, _INP_END(SP) // save to stack
|
||||
|
||||
CMPQ NUM_BYTES, INP
|
||||
JE avx2_only_one_block
|
||||
|
||||
// Load initial digest
|
||||
MOVL 0(CTX), a // a = H0
|
||||
MOVL 4(CTX), b // b = H1
|
||||
MOVL 8(CTX), c // c = H2
|
||||
MOVL 12(CTX), d // d = H3
|
||||
MOVL 16(CTX), e // e = H4
|
||||
MOVL 20(CTX), f // f = H5
|
||||
MOVL 24(CTX), g // g = H6
|
||||
MOVL 28(CTX), h // h = H7
|
||||
|
||||
avx2_loop0: // at each iteration works with one block (512 bit)
|
||||
|
||||
// load two blocks,64*2 bytes
|
||||
VMOVDQU (0*32)(INP), XTMP0 // p[0:4]
|
||||
VMOVDQU (1*32)(INP), XTMP1
|
||||
VMOVDQU (2*32)(INP), XTMP2
|
||||
VMOVDQU (3*32)(INP), XTMP3
|
||||
|
||||
VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
|
||||
|
||||
// Apply Byte Flip Mask: LE -> BE
|
||||
VPSHUFB BYTE_FLIP_MASK, XTMP0, XTMP0
|
||||
VPSHUFB BYTE_FLIP_MASK, XTMP1, XTMP1
|
||||
VPSHUFB BYTE_FLIP_MASK, XTMP2, XTMP2
|
||||
VPSHUFB BYTE_FLIP_MASK, XTMP3, XTMP3
|
||||
|
||||
// XTMP0:XTMP1 - first block
|
||||
// XTMP0: w7, w6, w5, w4, w3, w2, w1,w0
|
||||
// XTMP1: w15,w14,w13,w12,w11,w10,w9,w8
|
||||
// XTMP3:XTMP2 - second block
|
||||
// XTMP2: u7, u6, u5, u4, u3, u2, u1,u0
|
||||
// XTMP3: u15,u14,u13,u12,u11,u10,u9,u8
|
||||
|
||||
// XDWORD0: u3, u2, u1, u0, w3, w2, w1,w0
|
||||
// XDWORD1: u7, u6, u5, u4, w7, w6, w5, w4
|
||||
// XDWORD2: u11,u10,u9, u8, w11,w10,w9,w8
|
||||
// XDWORD3: u15,u14,u13,u12,w15,w14,w13,w12
|
||||
|
||||
// Transpose data into high/low parts
|
||||
VPERM2I128 $0x20, XTMP2, XTMP0, XDWORD0 // w3, w2, w1, w0
|
||||
VPERM2I128 $0x31, XTMP2, XTMP0, XDWORD1 // w7, w6, w5, w4
|
||||
VPERM2I128 $0x20, XTMP3, XTMP1, XDWORD2 // w11, w10, w9, w8
|
||||
VPERM2I128 $0x31, XTMP3, XTMP1, XDWORD3 // w15, w14, w13, w12
|
||||
MOVQ $T256<>(SB), TBL // Loading address of table with round-specific constants
|
||||
|
||||
avx2_last_block_enter:
|
||||
ADDQ $64, INP
|
||||
MOVQ INP, _INP(SP)
|
||||
XORQ SRND, SRND // SRND increace 16 of each 4 rounds (dist of T)
|
||||
|
||||
// for w0 - w15
|
||||
// Do 4 rounds and scheduling
|
||||
VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD0, XDWORD1, XFER
|
||||
VMOVDQU XFER, (_XFER + 0*32+68*4*2)(SP)(SRND*2)
|
||||
SM3_SCHED(XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
DO_ROUND16(0*4, 0*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND16(1*4, 0*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND16(2*4, 0*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND16(3*4, 0*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
VMOVDQU XDWORD1, (_XFER + 1*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD1, XDWORD2, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*32+68*4*2)(SP)(SRND*2)
|
||||
SM3_SCHED(XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
DO_ROUND16(4*4, 1*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND16(5*4, 1*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND16(6*4, 1*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND16(7*4, 1*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
VMOVDQU XDWORD2, (_XFER + 2*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD2, XDWORD3, XFER
|
||||
VMOVDQU XFER, (_XFER + 2*32+68*4*2)(SP)(SRND*2)
|
||||
SM3_SCHED(XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
DO_ROUND16(8*4, 2*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND16(9*4, 2*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND16(10*4,2*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND16(11*4,2*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
VMOVDQU XDWORD3, (_XFER + 3*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD3, XDWORD0, XFER
|
||||
VMOVDQU XFER, (_XFER + 3*32+68*4*2)(SP)(SRND*2)
|
||||
SM3_SCHED(XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
DO_ROUND16(12*4, 3*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND16(13*4, 3*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND16(14*4, 3*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND16(15*4, 3*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
ADDQ $4*16, SRND
|
||||
|
||||
avx2_loop1:
|
||||
// for w16 - w47 with scheduling (32 rounds)
|
||||
VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD0, XDWORD1, XFER
|
||||
VMOVDQU XFER, (_XFER + 0*32+68*4*2)(SP)(SRND*2)
|
||||
SM3_SCHED(XDWORD0, XDWORD1, XDWORD2, XDWORD3)
|
||||
DO_ROUND48(0*4, 0*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND48(1*4, 0*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND48(2*4, 0*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND48(3*4, 0*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
VMOVDQU XDWORD1, (_XFER + 1*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD1, XDWORD2, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*32+68*4*2)(SP)(SRND*2)
|
||||
SM3_SCHED(XDWORD1, XDWORD2, XDWORD3, XDWORD0)
|
||||
DO_ROUND48(4*4, 1*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND48(5*4, 1*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND48(6*4, 1*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND48(7*4, 1*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
VMOVDQU XDWORD2, (_XFER + 2*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD2, XDWORD3, XFER
|
||||
VMOVDQU XFER, (_XFER + 2*32+68*4*2)(SP)(SRND*2)
|
||||
SM3_SCHED(XDWORD2, XDWORD3, XDWORD0, XDWORD1)
|
||||
DO_ROUND48(8*4, 2*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND48(9*4, 2*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND48(10*4,2*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND48(11*4,2*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
VMOVDQU XDWORD3, (_XFER + 3*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD3, XDWORD0, XFER
|
||||
VMOVDQU XFER, (_XFER + 3*32+68*4*2)(SP)(SRND*2)
|
||||
SM3_SCHED(XDWORD3, XDWORD0, XDWORD1, XDWORD2)
|
||||
DO_ROUND48(12*4,3*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND48(13*4,3*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND48(14*4,3*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND48(15*4,3*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
ADDQ $4*16, SRND
|
||||
CMPQ SRND, $12*16
|
||||
JB avx2_loop1
|
||||
|
||||
// w48 - w63 processed with one scheduling (last 16 rounds)
|
||||
VMOVDQU XDWORD0, (_XFER + 0*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD0, XDWORD1, XFER
|
||||
VMOVDQU XFER, (_XFER + 0*32+68*4*2)(SP)(SRND*2)
|
||||
SM3_SCHED(XDWORD0, XDWORD1, XDWORD2, XDWORD3) // scheduling XDWORD0 for W64-W67
|
||||
DO_ROUND48(0*4, 0*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND48(1*4, 0*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND48(2*4, 0*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND48(3*4, 0*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
VMOVDQU XDWORD1, (_XFER + 1*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD1, XDWORD2, XFER
|
||||
VMOVDQU XFER, (_XFER + 1*32+68*4*2)(SP)(SRND*2)
|
||||
DO_ROUND48(4*4, 1*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND48(5*4, 1*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND48(6*4, 1*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND48(7*4, 1*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
VMOVDQU XDWORD2, (_XFER + 2*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD2, XDWORD3, XFER
|
||||
VMOVDQU XFER, (_XFER + 2*32+68*4*2)(SP)(SRND*2)
|
||||
DO_ROUND48(8*4, 2*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND48(9*4, 2*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND48(10*4,2*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND48(11*4,2*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
VMOVDQU XDWORD3, (_XFER + 3*32)(SP)(SRND*2)
|
||||
VPXOR XDWORD3, XDWORD0, XFER
|
||||
VMOVDQU XFER, (_XFER + 3*32+68*4*2)(SP)(SRND*2)
|
||||
DO_ROUND48(12*4, 3*32+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND48(13*4, 3*32+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND48(14*4, 3*32+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND48(15*4, 3*32+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
MOVQ dig+0(FP), CTX // d.h[8]
|
||||
MOVQ _INP(SP), INP
|
||||
|
||||
xorm( 0(CTX), a)
|
||||
xorm( 4(CTX), b)
|
||||
xorm( 8(CTX), c)
|
||||
xorm( 12(CTX), d)
|
||||
xorm( 16(CTX), e)
|
||||
xorm( 20(CTX), f)
|
||||
xorm( 24(CTX), g)
|
||||
xorm( 28(CTX), h)
|
||||
|
||||
CMPQ _INP_END(SP), INP
|
||||
JB done_hash
|
||||
|
||||
XORQ SRND, SRND
|
||||
|
||||
avx2_loop3: // Do second block using previously scheduled results
|
||||
DO_ROUND16(0, 16+0, a, b, c, d, e, f, g, h)
|
||||
DO_ROUND16(4, 16+4, d, a, b, c, h, e, f, g)
|
||||
DO_ROUND16(8, 16+8, c, d, a, b, g, h, e, f)
|
||||
DO_ROUND16(12, 16+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
ADDQ $16, SRND
|
||||
CMPQ SRND, $4*16
|
||||
JB avx2_loop3
|
||||
avx2_loop4:
|
||||
DO_ROUND48(0, 16+0,a, b, c, d, e, f, g, h)
|
||||
DO_ROUND48(4, 16+4,d, a, b, c, h, e, f, g)
|
||||
DO_ROUND48(8, 16+8,c, d, a, b, g, h, e, f)
|
||||
DO_ROUND48(12, 16+12, b, c, d, a, f, g, h, e)
|
||||
|
||||
ADDQ $16, SRND
|
||||
CMPQ SRND, $16*16
|
||||
JB avx2_loop4
|
||||
|
||||
MOVQ dig+0(FP), CTX // d.h[8]
|
||||
MOVQ _INP(SP), INP
|
||||
ADDQ $64, INP
|
||||
|
||||
xorm( 0(CTX), a)
|
||||
xorm( 4(CTX), b)
|
||||
xorm( 8(CTX), c)
|
||||
xorm( 12(CTX), d)
|
||||
xorm( 16(CTX), e)
|
||||
xorm( 20(CTX), f)
|
||||
xorm( 24(CTX), g)
|
||||
xorm( 28(CTX), h)
|
||||
|
||||
CMPQ _INP_END(SP), INP
|
||||
JA avx2_loop0
|
||||
JB done_hash
|
||||
|
||||
avx2_do_last_block:
|
||||
|
||||
VMOVDQU 0(INP), XWORD0
|
||||
VMOVDQU 16(INP), XWORD1
|
||||
VMOVDQU 32(INP), XWORD2
|
||||
VMOVDQU 48(INP), XWORD3
|
||||
|
||||
VMOVDQU flip_mask<>(SB), BYTE_FLIP_MASK
|
||||
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD0, XWORD0
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD1, XWORD1
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD2, XWORD2
|
||||
VPSHUFB X_BYTE_FLIP_MASK, XWORD3, XWORD3
|
||||
MOVQ $T256<>(SB), TBL
|
||||
|
||||
JMP avx2_last_block_enter
|
||||
|
||||
avx2_only_one_block:
|
||||
// Load initial digest
|
||||
MOVL 0(CTX), a // a = H0
|
||||
MOVL 4(CTX), b // b = H1
|
||||
MOVL 8(CTX), c // c = H2
|
||||
MOVL 12(CTX), d // d = H3
|
||||
MOVL 16(CTX), e // e = H4
|
||||
MOVL 20(CTX), f // f = H5
|
||||
MOVL 24(CTX), g // g = H6
|
||||
MOVL 28(CTX), h // h = H7
|
||||
|
||||
JMP avx2_do_last_block
|
||||
|
||||
done_hash:
|
||||
VZEROUPPER
|
||||
RET
|
||||
|
||||
// shuffle byte order from LE to BE
|
||||
DATA flip_mask<>+0x00(SB)/8, $0x0405060700010203
|
||||
DATA flip_mask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
|
||||
DATA flip_mask<>+0x10(SB)/8, $0x0405060700010203
|
||||
DATA flip_mask<>+0x18(SB)/8, $0x0c0d0e0f08090a0b
|
||||
GLOBL flip_mask<>(SB), 8, $32
|
||||
|
||||
// shuffle xBxA -> 00BA
|
||||
DATA shuff_00BA<>+0x00(SB)/8, $0x0b0a090803020100
|
||||
DATA shuff_00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
|
||||
DATA shuff_00BA<>+0x10(SB)/8, $0x0b0a090803020100
|
||||
DATA shuff_00BA<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
|
||||
GLOBL shuff_00BA<>(SB), 8, $32
|
||||
|
||||
// shuffle xDxC -> DC00
|
||||
DATA shuff_DC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
|
||||
DATA shuff_DC00<>+0x08(SB)/8, $0x0b0a090803020100
|
||||
DATA shuff_DC00<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
|
||||
DATA shuff_DC00<>+0x18(SB)/8, $0x0b0a090803020100
|
||||
GLOBL shuff_DC00<>(SB), 8, $32
|
||||
|
||||
// rotate of Tj: rotT[i] = Tj << (j mod 32)
|
||||
DATA T256<>+0x0(SB)/4, $0x79cc4519
|
||||
DATA T256<>+0x4(SB)/4, $0xf3988a32
|
||||
DATA T256<>+0x8(SB)/4, $0xe7311465
|
||||
DATA T256<>+0xc(SB)/4, $0xce6228cb
|
||||
DATA T256<>+0x10(SB)/4, $0x9cc45197
|
||||
DATA T256<>+0x14(SB)/4, $0x3988a32f
|
||||
DATA T256<>+0x18(SB)/4, $0x7311465e
|
||||
DATA T256<>+0x1c(SB)/4, $0xe6228cbc
|
||||
DATA T256<>+0x20(SB)/4, $0xcc451979
|
||||
DATA T256<>+0x24(SB)/4, $0x988a32f3
|
||||
DATA T256<>+0x28(SB)/4, $0x311465e7
|
||||
DATA T256<>+0x2c(SB)/4, $0x6228cbce
|
||||
DATA T256<>+0x30(SB)/4, $0xc451979c
|
||||
DATA T256<>+0x34(SB)/4, $0x88a32f39
|
||||
DATA T256<>+0x38(SB)/4, $0x11465e73
|
||||
DATA T256<>+0x3c(SB)/4, $0x228cbce6
|
||||
DATA T256<>+0x40(SB)/4, $0x9d8a7a87
|
||||
DATA T256<>+0x44(SB)/4, $0x3b14f50f
|
||||
DATA T256<>+0x48(SB)/4, $0x7629ea1e
|
||||
DATA T256<>+0x4c(SB)/4, $0xec53d43c
|
||||
DATA T256<>+0x50(SB)/4, $0xd8a7a879
|
||||
DATA T256<>+0x54(SB)/4, $0xb14f50f3
|
||||
DATA T256<>+0x58(SB)/4, $0x629ea1e7
|
||||
DATA T256<>+0x5c(SB)/4, $0xc53d43ce
|
||||
DATA T256<>+0x60(SB)/4, $0x8a7a879d
|
||||
DATA T256<>+0x64(SB)/4, $0x14f50f3b
|
||||
DATA T256<>+0x68(SB)/4, $0x29ea1e76
|
||||
DATA T256<>+0x6c(SB)/4, $0x53d43cec
|
||||
DATA T256<>+0x70(SB)/4, $0xa7a879d8
|
||||
DATA T256<>+0x74(SB)/4, $0x4f50f3b1
|
||||
DATA T256<>+0x78(SB)/4, $0x9ea1e762
|
||||
DATA T256<>+0x7c(SB)/4, $0x3d43cec5
|
||||
DATA T256<>+0x80(SB)/4, $0x7a879d8a
|
||||
DATA T256<>+0x84(SB)/4, $0xf50f3b14
|
||||
DATA T256<>+0x88(SB)/4, $0xea1e7629
|
||||
DATA T256<>+0x8c(SB)/4, $0xd43cec53
|
||||
DATA T256<>+0x90(SB)/4, $0xa879d8a7
|
||||
DATA T256<>+0x94(SB)/4, $0x50f3b14f
|
||||
DATA T256<>+0x98(SB)/4, $0xa1e7629e
|
||||
DATA T256<>+0x9c(SB)/4, $0x43cec53d
|
||||
DATA T256<>+0xa0(SB)/4, $0x879d8a7a
|
||||
DATA T256<>+0xa4(SB)/4, $0x0f3b14f5
|
||||
DATA T256<>+0xa8(SB)/4, $0x1e7629ea
|
||||
DATA T256<>+0xac(SB)/4, $0x3cec53d4
|
||||
DATA T256<>+0xb0(SB)/4, $0x79d8a7a8
|
||||
DATA T256<>+0xb4(SB)/4, $0xf3b14f50
|
||||
DATA T256<>+0xb8(SB)/4, $0xe7629ea1
|
||||
DATA T256<>+0xbc(SB)/4, $0xcec53d43
|
||||
DATA T256<>+0xc0(SB)/4, $0x9d8a7a87
|
||||
DATA T256<>+0xc4(SB)/4, $0x3b14f50f
|
||||
DATA T256<>+0xc8(SB)/4, $0x7629ea1e
|
||||
DATA T256<>+0xcc(SB)/4, $0xec53d43c
|
||||
DATA T256<>+0xd0(SB)/4, $0xd8a7a879
|
||||
DATA T256<>+0xd4(SB)/4, $0xb14f50f3
|
||||
DATA T256<>+0xd8(SB)/4, $0x629ea1e7
|
||||
DATA T256<>+0xdc(SB)/4, $0xc53d43ce
|
||||
DATA T256<>+0xe0(SB)/4, $0x8a7a879d
|
||||
DATA T256<>+0xe4(SB)/4, $0x14f50f3b
|
||||
DATA T256<>+0xe8(SB)/4, $0x29ea1e76
|
||||
DATA T256<>+0xec(SB)/4, $0x53d43cec
|
||||
DATA T256<>+0xf0(SB)/4, $0xa7a879d8
|
||||
DATA T256<>+0xf4(SB)/4, $0x4f50f3b1
|
||||
DATA T256<>+0xf8(SB)/4, $0x9ea1e762
|
||||
DATA T256<>+0xfc(SB)/4, $0x3d43cec5
|
||||
GLOBL T256<>(SB), (NOPTR + RODATA), $256
|
||||
Reference in New Issue
Block a user