// +build amd64,!generic

// Field arithmetic on 256-bit values for amd64, written in Go assembler
// syntax (operands are "source, destination").
//
// Every value handled here is four 64-bit limbs stored least-significant
// limb first (offsets 0, 8, 16, 24). The modulus is read from the package
// symbol ·p2 and R mod p (R = 2^256) from the package symbol ·r; both are
// defined outside this file and are expected to use the same limb layout.

// storeBlock writes the four limbs a0..a3 to the 32 bytes at memory operand r.
#define storeBlock(a0,a1,a2,a3, r) \
	MOVQ a0,  0+r \
	MOVQ a1,  8+r \
	MOVQ a2, 16+r \
	MOVQ a3, 24+r

// loadBlock reads the 32 bytes at memory operand r into the limbs a0..a3.
#define loadBlock(r, a0,a1,a2,a3) \
	MOVQ  0+r, a0 \
	MOVQ  8+r, a1 \
	MOVQ 16+r, a2 \
	MOVQ 24+r, a3

// loadModulus copies the four limbs of ·p2 into m0..m3.
#define loadModulus(m0,m1,m2,m3) \
	MOVQ ·p2+0(SB), m0 \
	MOVQ ·p2+8(SB), m1 \
	MOVQ ·p2+16(SB), m2 \
	MOVQ ·p2+24(SB), m3

// loadR copies the four limbs of ·r into r0..r3.
#define loadR(r0,r1,r2,r3) \
	MOVQ ·r+0(SB), r0 \
	MOVQ ·r+8(SB), r1 \
	MOVQ ·r+16(SB), r2 \
	MOVQ ·r+24(SB), r3

// gfpCarry performs one conditional subtraction of the modulus.
// a4:a0 is a five-limb input; b0..b4 are scratch registers that are
// overwritten. On exit a3:a0 holds a - p if a >= p and a otherwise.
// a4 is left unchanged. Flags are clobbered.
#define gfpCarry(a0,a1,a2,a3,a4, b0,b1,b2,b3,b4) \
	\ // b = a-p
	MOVQ a0, b0 \
	MOVQ a1, b1 \
	MOVQ a2, b2 \
	MOVQ a3, b3 \
	MOVQ a4, b4 \
	\
	SUBQ ·p2+0(SB), b0 \
	SBBQ ·p2+8(SB), b1 \
	SBBQ ·p2+16(SB), b2 \
	SBBQ ·p2+24(SB), b3 \
	SBBQ $0, b4 \
	\
	\ // if b is negative then return a
	\ // else return b
	\ // CMOVQCC = CMOVAEQ (AT&T), i.e., a >= p then move a-p to a.
	CMOVQCC b0, a0 \
	CMOVQCC b1, a1 \
	CMOVQCC b2, a2 \
	CMOVQCC b3, a3

#include "mul_amd64.h"
#include "mul_bmi2_amd64.h"

// gfpNeg stores -a mod p at c.
//
// Arguments (from the frame layout): c+0(FP) is the destination pointer,
// a+8(FP) is the source pointer.
//
// Unlike the upstream cloudflare version kept below for reference, this
// also handles inputs with a > p.
// Clobbers AX, BX, DI, R8-R15 and flags.
TEXT ·gfpNeg(SB),0,$0-16
	loadModulus(R8,R9,R10,R11)
	loadR(R12,R13,R14,R15)

	// Let R = 2^{256} and r = R % p
	//           /- p - a          if p >= a
	// R11:R8 = -|
	//           \- R + p - a      if p < a
	MOVQ a+8(FP), DI
	SUBQ 0(DI), R8
	SBBQ 8(DI), R9
	SBBQ 16(DI), R10
	SBBQ 24(DI), R11

	//            /- 0    if p >= a
	// R15:R12 = -|
	//            \- r    if p < a
	// MOVQ (not XORQ) is required here: the borrow flag produced by the
	// subtraction above must survive until the conditional moves.
	MOVQ $0, AX
	CMOVQCC AX, R12
	CMOVQCC AX, R13
	CMOVQCC AX, R14
	CMOVQCC AX, R15

	//           /- p - a            if p >= a
	// R11:R8 = -|
	//           \- R + p - a - r    if p < a
	// Note R + p - a - r = p - a mod p and
	// 0 < R + p - a - r < R
	SUBQ R12, R8
	SBBQ R13, R9
	SBBQ R14, R10
	SBBQ R15, R11

	// For a == 0 the value above is exactly p, which is congruent to 0 but
	// not reduced. One conditional subtraction of p folds it back to 0.
	// AX is still zero and acts as the empty fifth limb; R12..R15 and BX
	// are dead at this point and serve as scratch.
	gfpCarry(R8,R9,R10,R11,AX, R12,R13,R14,R15,BX)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// Upstream cloudflare code, kept for reference.
// Its handling of a > p is incorrect: after the borrow the value is left
// reduced modulo R = 2^256 rather than modulo p (the original note reads
// "mod r"; R is presumably what was meant).
// TEXT ·gfpNeg(SB),0,$0-16
// 	MOVQ ·p2+0(SB), R8
// 	MOVQ ·p2+8(SB), R9
// 	MOVQ ·p2+16(SB), R10
// 	MOVQ ·p2+24(SB), R11

// 	MOVQ a+8(FP), DI
// 	SUBQ 0(DI), R8 // p - a
// 	SBBQ 8(DI), R9
// 	SBBQ 16(DI), R10
// 	SBBQ 24(DI), R11

// 	MOVQ $0, AX
// 	gfpCarry(R8,R9,R10,R11,AX, R12,R13,R14,R15,BX)

// 	MOVQ c+0(FP), DI
// 	storeBlock(R8,R9,R10,R11, 0(DI))
// 	RET

// gfpAdd stores a + b at c after one conditional subtraction of p.
//
// Arguments: c+0(FP) destination pointer, a+8(FP) and b+16(FP) source
// pointers. The 257-bit sum is kept in R12:R8 so the carry out of the top
// limb takes part in the comparison against p.
// Clobbers AX, BX, DI, SI, R8-R15 and flags.
TEXT ·gfpAdd(SB),0,$0-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	loadBlock(0(DI), R8,R9,R10,R11)
	MOVQ $0, R12

	ADDQ  0(SI), R8
	ADCQ  8(SI), R9
	ADCQ 16(SI), R10
	ADCQ 24(SI), R11
	ADCQ $0, R12

	gfpCarry(R8,R9,R10,R11,R12, R13,R14,R15,AX,BX)

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpSub stores a - b at c, adding p back when the subtraction borrows.
//
// Arguments: c+0(FP) destination pointer, a+8(FP) and b+16(FP) source
// pointers.
// Clobbers AX, DI, SI, R8-R15 and flags.
TEXT ·gfpSub(SB),0,$0-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	loadBlock(0(DI), R8,R9,R10,R11)

	MOVQ ·p2+0(SB), R12
	MOVQ ·p2+8(SB), R13
	MOVQ ·p2+16(SB), R14
	MOVQ ·p2+24(SB), R15
	MOVQ $0, AX

	// R11:R8 = a - b, or R + a - b if the subtraction borrowed.
	SUBQ  0(SI), R8
	SBBQ  8(SI), R9
	SBBQ 16(SI), R10
	SBBQ 24(SI), R11

	// No borrow: the correction term R15:R12 becomes 0, otherwise it stays p.
	CMOVQCC AX, R12
	CMOVQCC AX, R13
	CMOVQCC AX, R14
	CMOVQCC AX, R15

	// Subtract 0, or subtract r = R % p = R - p.
	// Subtracting r modulo R is the same as adding p.
	ADDQ R12, R8
	ADCQ R13, R9
	ADCQ R14, R10
	ADCQ R15, R11

	MOVQ c+0(FP), DI
	storeBlock(R8,R9,R10,R11, 0(DI))
	RET

// gfpMul stores the reduced product of a and b at c.
//
// Arguments: c+0(FP) destination pointer, a+8(FP) and b+16(FP) source
// pointers. A 160-byte stack frame is reserved as scratch for the
// multiply/reduce macros.
//
// The macros mul, mulBMI2, gfpReduce and gfpReduceBMI2 come from the
// included headers and are not visible here; both paths are expected to
// leave the reduced result in R15:R12, which is what gets stored.
// NOTE(review): the exact reduction performed (presumably Montgomery) and
// the full clobber list depend on those headers - confirm there.
TEXT ·gfpMul(SB),0,$160-24
	MOVQ a+8(FP), DI
	MOVQ b+16(FP), SI

	// Jump to a slightly different implementation if MULX isn't supported.
	CMPB ·hasBMI2(SB), $0
	JE   nobmi2Mul

	// BMI2 path: the product arrives in R15:R8 and is spilled to the stack
	// before reduction.
	mulBMI2(0(DI),8(DI),16(DI),24(DI), 0(SI))
	storeBlock( R8, R9,R10,R11,  0(SP))
	storeBlock(R12,R13,R14,R15, 32(SP))
	gfpReduceBMI2()
	JMP end

nobmi2Mul:
	// Generic path: the product is written to 0(SP) by the macro itself.
	mul(0(DI),8(DI),16(DI),24(DI), 0(SI), 0(SP))
	gfpReduce(0(SP))

end:
	MOVQ c+0(FP), DI
	storeBlock(R12,R13,R14,R15, 0(DI))
	RET