/*
 * mul computes [c7:c0] = [R4:R1] * [R8:R5]: a 256x256 -> 512-bit schoolbook
 * multiplication over 64-bit limbs. On both operands and on the result the
 * lowest-numbered register holds the least significant limb.
 *
 * c0..c7 are the result registers, chosen by the caller.
 * R0, R26, R27 and R29 are used as scratch and are overwritten.
 * R1 is overwritten too: from the second row onward it is reused to hold the
 * low word of Rk*R5, so the multiplier's lowest limb is lost on return.
 * The condition flags are clobbered (ADDS/ADCS carry chains).
 * c0..c7 must not alias R0-R8, R26, R27 or R29: results are written while the
 * operands and scratch registers are still being read.
 * NOTE(review): the Go arm64 assembler guide lists R27 as reserved by the
 * compiler/linker and R29 as the frame pointer - confirm every call site
 * tolerates both being overwritten here.
 *
 * Row 1 (multiplier limb R1) writes straight into the result:
 *
 *                    R8  R7  R6  R5
 *   x                            R1
 *   -------------------------------
 *                            c1  c0      c1:c0 = R1*R5
 *   +                    c2  R0          c2:R0 = R1*R6
 *   +                c3  R0              c3:R0 = R1*R7
 *   +            c4  R0                  c4:R0 = R1*R8
 *   -------------------------------
 *                c4  c3  c2  c1  c0
 *
 * Rows 2..4 (multiplier limb Rk, k = 2, 3, 4) first build the 5-limb partial
 * product [c(k+3), R29, R27, R26, R1]:
 *
 *                    R8  R7  R6  R5
 *   x                            Rk
 *   -------------------------------
 *                           R26  R1      R26:R1    = Rk*R5
 *   +                   R27  R0          R27:R0    = Rk*R6
 *   +               R29  R0              R29:R0    = Rk*R7
 *   +        c(k+3)  R0                  c(k+3):R0 = Rk*R8
 *   -------------------------------
 *            c(k+3) R29 R27 R26  R1
 *
 * and then add its four low limbs into c(k-1)..c(k+2), with the final carry
 * going into c(k+3). After row k the running result is [c(k+3):c0].
 */
#define mul(c0,c1,c2,c3,c4,c5,c6,c7) \
	MUL R1, R5, c0 /* c0 = bits 0-63 of R1*R5 */ \
	UMULH R1, R5, c1 /* c1 = bits 64-127 of R1*R5 */ \
	MUL R1, R6, R0 \
	ADDS R0, c1 /* c1 += low(R1*R6); starts the carry chain */ \
	UMULH R1, R6, c2 \
	MUL R1, R7, R0 \
	ADCS R0, c2 /* c2 += low(R1*R7) + carry out of the c1 addition */ \
	UMULH R1, R7, c3 \
	MUL R1, R8, R0 \
	ADCS R0, c3 \
	UMULH R1, R8, c4 \
	ADCS ZR, c4 /* absorb the last carry into the top limb */ \
	/* now [c4:c0] = R1 * [R8:R5]; row 2 follows (multiplier limb R2) */ \
	MUL R2, R5, R1 /* R1 is reused as scratch from here on */ \
	UMULH R2, R5, R26 \
	MUL R2, R6, R0 \
	ADDS R0, R26 \
	UMULH R2, R6, R27 \
	MUL R2, R7, R0 \
	ADCS R0, R27 \
	UMULH R2, R7, R29 \
	MUL R2, R8, R0 \
	ADCS R0, R29 \
	UMULH R2, R8, c5 \
	ADCS ZR, c5 /* [c5,R29,R27,R26,R1] = R2 * [R8:R5] */ \
	ADDS R1, c1 /* add the row into the result, one limb up */ \
	ADCS R26, c2 \
	ADCS R27, c3 \
	ADCS R29, c4 \
	ADCS ZR, c5 \
	/* row 3: multiplier limb R3, added in starting at c2 */ \
	MUL R3, R5, R1 \
	UMULH R3, R5, R26 \
	MUL R3, R6, R0 \
	ADDS R0, R26 \
	UMULH R3, R6, R27 \
	MUL R3, R7, R0 \
	ADCS R0, R27 \
	UMULH R3, R7, R29 \
	MUL R3, R8, R0 \
	ADCS R0, R29 \
	UMULH R3, R8, c6 \
	ADCS ZR, c6 /* [c6,R29,R27,R26,R1] = R3 * [R8:R5] */ \
	ADDS R1, c2 \
	ADCS R26, c3 \
	ADCS R27, c4 \
	ADCS R29, c5 \
	ADCS ZR, c6 \
	/* row 4: multiplier limb R4, added in starting at c3 */ \
	MUL R4, R5, R1 \
	UMULH R4, R5, R26 \
	MUL R4, R6, R0 \
	ADDS R0, R26 \
	UMULH R4, R6, R27 \
	MUL R4, R7, R0 \
	ADCS R0, R27 \
	UMULH R4, R7, R29 \
	MUL R4, R8, R0 \
	ADCS R0, R29 \
	UMULH R4, R8, c7 \
	ADCS ZR, c7 /* [c7,R29,R27,R26,R1] = R4 * [R8:R5] */ \
	ADDS R1, c3 \
	ADCS R26, c4 \
	ADCS R27, c5 \
	ADCS R29, c6 \
	ADCS ZR, c7

// gfpReduce computes
//   [R4:R1] = [R16:R9] * R^{-1} mod p
//           = [R16:R13] + [the higher half of ([R12:R9] * np mod R) * p]
// (plus the carry out of the low halves, followed by one conditional
// subtraction of the modulus).
//
// Input:  the 512-bit value T in R9..R16, R9 being the least significant limb.
// Output: R1..R4, R1 being the least significant limb.
// R is 2^256 here: only the low four 64-bit limbs of T_low * np are kept.
// np is read from the symbol ·np, which is defined outside this file section
// (presumably the Montgomery constant -p^{-1} mod R - confirm).
// loadModulus is also defined elsewhere; presumably it loads p into R5..R8,
// which is how the rest of the macro uses those registers - confirm.
//
// Overwritten: R0-R4, R5-R8 (via loadModulus), R10-R13, R17, R19-R27, R29 and
// the condition flags. R26, R27, R29 and the second write of R1 come from mul.
// R9 and R14-R16 are only read.
#define gfpReduce() \
	/* Step 1: m = (T_low * np) mod R. np goes to [R20,R19,R25,R17], m ends up in [R4:R1]. */ \
	MOVD ·np+0(SB), R17 \
	MOVD ·np+8(SB), R25 \
	MOVD ·np+16(SB), R19 \
	MOVD ·np+24(SB), R20 \
	\
	/* [R4:R1] <- [R20,R19,R25,R17] * R9 mod R (the high word of R9*R20 is never computed) */ \
	MUL R9, R17, R1 \
	UMULH R9, R17, R2 \
	MUL R9, R25, R0 \
	ADDS R0, R2 \
	UMULH R9, R25, R3 \
	MUL R9, R19, R0 \
	ADCS R0, R3 \
	UMULH R9, R19, R4 \
	MUL R9, R20, R0 \
	ADCS R0, R4 /* carry out of the top limb is dropped: arithmetic is mod R */ \
	\
	/* [R23,R22,R21,0] <- [R20,R19,R25,R17] * R10 mod R */ \
	/* [R4:R1] <- [R4:R1] + [R23,R22,R21,0] mod R */ \
	MUL R10, R17, R21 \
	UMULH R10, R17, R22 \
	MUL R10, R25, R0 \
	ADDS R0, R22 \
	UMULH R10, R25, R23 \
	MUL R10, R19, R0 \
	ADCS R0, R23 \
	ADDS R21, R2 \
	ADCS R22, R3 \
	ADCS R23, R4 \
	\
	/* [R22,R21,0,0] <- [R20,R19,R25,R17] * R11 mod R */ \
	/* [R4:R1] <- [R4:R1] + [R22,R21,0,0] mod R */ \
	MUL R11, R17, R21 \
	UMULH R11, R17, R22 \
	MUL R11, R25, R0 \
	ADDS R0, R22 /* this carry would land above limb 3, so it is discarded */ \
	ADDS R21, R3 \
	ADCS R22, R4 \
	\
	MUL R12, R17, R21 /* only the low word of R12*R17 reaches limb 3 */ \
	ADDS R21, R4 \
	\
	/* now [R4:R1] = m = [R12:R9] * np mod R */ \
	loadModulus(R5, R6, R7, R8) \
	/* Step 2: [R24,R23,R22,R21,R20,R19,R25,R17] = m * [R8:R5] */ \
	mul(R17, R25, R19, R20, R21, R22, R23, R24) \
	\
	/* Step 3: add the 512-bit input T to m*p. */ \
	/* The low four limbs of the sum (R17, R25, R19, R20) are always 0 afterwards, */ \
	/* but the low-half addition cannot be skipped: its carry must reach the high half. */ \
	MOVD ZR , R0 \
	ADDS R9 , R17 /* R17=0 */ \
	ADCS R10, R25 /* R25=0 */ \
	ADCS R11, R19 /* R19=0 */ \
	ADCS R12, R20 /* R20=0 */ \
	ADCS R13, R21 /* if any of R9..R12 is non-zero, the low half carries into this limb */ \
	ADCS R14, R22 \
	ADCS R15, R23 \
	ADCS R16, R24 \
	ADCS ZR , R0 /* R0 = carry out of the 512-bit addition */ \
	\
	/* Step 4: the result is [R0,R24,R23,R22,R21]. Try subtracting [R8:R5] into [R13:R10]. */ \
	SUBS R5, R21, R10 \
	SBCS R6, R22, R11 \
	SBCS R7, R23, R12 \
	SBCS R8, R24, R13 \
	SBCS $0, R0, R0\
	/* carry set (no borrow): keep the subtracted value; otherwise keep the unsubtracted sum */ \
	CSEL CS, R10, R21, R1 \
	CSEL CS, R11, R22, R2 \
	CSEL CS, R12, R23, R3 \
	CSEL CS, R13, R24, R4