// +build arm64,!generic

// Go (Plan 9 syntax) arm64 assembly for 256-bit modular arithmetic.
//
// Every operand is addressed through a pointer and is read/written as four
// 64-bit words at byte offsets 0, 8, 16 and 24; the word at offset 0 is the
// least significant one (it is where every carry chain below starts).
//
// Notation used in the comments:
//   Rx:Ry   multi-word value whose most significant word is Rx and whose
//           least significant word is Ry.
//   p       the four-word constant stored at ·p2 (the modulus).
//   r       the four-word constant stored at ·r. It is defined outside this
//           file; the original author's note in gfpSub relies on R - r = p.
//   R       presumably 2^256, the wrap-around of a four-word value
//           (TODO confirm against the definition of ·r).
//
// Carry-flag reminder (arm64): after SUBS/SBCS the carry flag is SET when
// the subtraction did NOT borrow and CLEAR when it borrowed.
// "CSEL CS, x, y, d" writes x to d when the carry flag is set, else y.

// storeBlock writes registers a0..a3 to memory operand r at offsets 0..24.
#define storeBlock(a0,a1,a2,a3, r) \
	MOVD a0,  0+r \
	MOVD a1,  8+r \
	MOVD a2, 16+r \
	MOVD a3, 24+r

// loadBlock reads four words from memory operand r (offsets 0..24) into
// registers a0..a3.
#define loadBlock(r, a0,a1,a2,a3) \
	MOVD  0+r, a0 \
	MOVD  8+r, a1 \
	MOVD 16+r, a2 \
	MOVD 24+r, a3

// loadModulus reads the four words of the global ·p2 into p0..p3.
// NOTE(review): the macro parameter p2 is spelled like the symbol ·p2; this
// relies on the preprocessor treating ·p2 as a single identifier so that the
// parameter is not substituted inside it.
#define loadModulus(p0,p1,p2,p3) \
	MOVD ·p2+0(SB), p0 \
	MOVD ·p2+8(SB), p1 \
	MOVD ·p2+16(SB), p2 \
	MOVD ·p2+24(SB), p3

// loadR reads the four words of the global ·r into p0..p3.
#define loadR(p0,p1,p2,p3) \
	MOVD ·r+0(SB), p0 \
	MOVD ·r+8(SB), p1 \
	MOVD ·r+16(SB), p2 \
	MOVD ·r+24(SB), p3

#include "mul_arm64.h"

// gfpNeg stores the modular negation of *a into *c.
//
// Arguments (16 bytes, no locals):
//   c+0(FP)  pointer to the four-word result
//   a+8(FP)  pointer to the four-word operand
// Uses R0-R8 and the condition flags.
TEXT ·gfpNeg(SB),0,$0-16
	MOVD a+8(FP), R0
	loadBlock(0(R0), R1,R2,R3,R4)
	loadModulus(R5,R6,R7,R8)

	// R8:R5 = p - a (mod R), flags record whether the subtraction borrowed.
	SUBS R1,R5, R5
	SBCS R2,R6, R6
	SBCS R3,R7, R7
	SBCS R4,R8, R8

	// Carry set   (no borrow): p >= a and R8:R5 = p - a.
	// Carry clear (borrow):    p <  a and R8:R5 = R + p - a.
	// In the borrow case r has to be subtracted to compensate for the
	// wrap-around, so select R4:R1 = 0 when carry is set and R4:R1 = r
	// when carry is clear.
	loadR(R1,R2,R3,R4)
	MOVD $0, R0
	CSEL CS, R0, R1, R1
	CSEL CS, R0, R2, R2
	CSEL CS, R0, R3, R3
	CSEL CS, R0, R4, R4

	// R8:R5 -= R4:R1 (either 0 or r); the resulting flags are not used.
	SUBS R1, R5, R5
	SBCS R2, R6, R6
	SBCS R3, R7, R7
	SBCS R4, R8, R8

	MOVD c+0(FP), R0
	storeBlock(R5,R6,R7,R8, 0(R0))
	RET

// gfpAdd stores *a + *b, conditionally reduced by p, into *c.
//
// Arguments (24 bytes, no locals):
//   c+0(FP)   pointer to the four-word result
//   a+8(FP)   pointer to the first operand
//   b+16(FP)  pointer to the second operand
// Uses R0-R12 and the condition flags.
TEXT ·gfpAdd(SB),0,$0-24
	MOVD a+8(FP), R0
	loadBlock(0(R0), R1,R2,R3,R4)
	MOVD b+16(FP), R0
	loadBlock(0(R0), R5,R6,R7,R8)
	loadModulus(R9,R10,R11,R12)
	MOVD ZR, R0

	// (R0,R4,R3,R2,R1) = a + b; R0 receives the carry out of the top word.
	ADDS R5, R1
	ADCS R6, R2
	ADCS R7, R3
	ADCS R8, R4
	ADCS ZR, R0

	// (R0,R8,R7,R6,R5) = a + b - p, computed over all five words so the
	// final carry reflects the full-width comparison of a + b with p.
	SUBS  R9, R1, R5
	SBCS R10, R2, R6
	SBCS R11, R3, R7
	SBCS R12, R4, R8
	SBCS  ZR, R0, R0

	// Carry set   (no borrow): a + b >= p, keep the reduced value R8:R5.
	// Carry clear (borrow):    a + b <  p, keep the plain sum    R4:R1.
	CSEL CS, R5, R1, R1
	CSEL CS, R6, R2, R2
	CSEL CS, R7, R3, R3
	CSEL CS, R8, R4, R4

	MOVD c+0(FP), R0
	storeBlock(R1,R2,R3,R4, 0(R0))
	RET

// gfpSub stores *a - *b, with p added back when the subtraction borrowed,
// into *c.
//
// Arguments (24 bytes, no locals):
//   c+0(FP)   pointer to the four-word result
//   a+8(FP)   pointer to the minuend
//   b+16(FP)  pointer to the subtrahend
// Uses R0-R12 and the condition flags.
TEXT ·gfpSub(SB),0,$0-24
	MOVD a+8(FP), R0
	loadBlock(0(R0), R1,R2,R3,R4)
	MOVD b+16(FP), R0
	loadBlock(0(R0), R5,R6,R7,R8)
	loadModulus(R9,R10,R11,R12)

	// R4:R1 = a - b        when a >= b (carry set, no borrow)
	// R4:R1 = R + a - b    when a <  b (carry clear, borrow)
	SUBS R5, R1
	SBCS R6, R2
	SBCS R7, R3
	SBCS R8, R4

	// R12:R9 = 0 when carry is set (no borrow), p when carry is clear.
	CSEL CS, ZR,  R9,  R9
	CSEL CS, ZR, R10, R10
	CSEL CS, ZR, R11, R11
	CSEL CS, ZR, R12, R12

	// Strictly speaking r should be subtracted when R4:R1 = R + a - b,
	// but R4:R1 - r = (R - r) + a - b = p + a - b, so subtracting r is the
	// same as adding p.
	// NOTE(review): the original comment also stated that for a < b this
	// addition "carries 0"; no instruction below reads the resulting carry.
	ADDS  R9, R1
	ADCS R10, R2
	ADCS R11, R3
	ADCS R12, R4

	MOVD c+0(FP), R0
	storeBlock(R1,R2,R3,R4, 0(R0))
	RET

// gfpMul stores the reduced product of *a and *b into *c.
//
// Arguments (24 bytes, no locals):
//   c+0(FP)   pointer to the four-word result
//   a+8(FP)   pointer to the first operand
//   b+16(FP)  pointer to the second operand
// The mul and gfpReduce macros are not defined in this file (presumably they
// come from mul_arm64.h); their register usage beyond what is visible here
// must be checked there.
TEXT ·gfpMul(SB),0,$0-24
	MOVD a+8(FP), R0
	loadBlock(0(R0), R1,R2,R3,R4)
	MOVD b+16(FP), R0
	loadBlock(0(R0), R5,R6,R7,R8)

	// R16:R9 = R4:R1 * R8:R5 = a * b (eight-word product).
	mul(R9,R10,R11,R12,R13,R14,R15,R16)

	// gfpReduce is expected to leave the reduced four-word result in R4:R1,
	// which is what gets stored below.
	gfpReduce()

	MOVD c+0(FP), R0
	storeBlock(R1,R2,R3,R4, 0(R0))
	RET