2507 lines
48 KiB
ArmAsm
2507 lines
48 KiB
ArmAsm
// +build amd64
|
||
// +build !generic,!generic32,!generic64
|
||
|
||
// Copyright (c) 2018 xdx. All rights reserved.
|
||
// Written by xdx
|
||
//
|
||
// Assembly implementation of the SM2 256-bit curve for amd64, modeled on the P-256 implementation in the Go standard library.
|
||
|
||
// Copyright 2015 The Go Authors. All rights reserved.
|
||
// Use of this source code is governed by a BSD-style
|
||
// license that can be found in the LICENSE file.
|
||
|
||
// This file contains constant-time, 64-bit assembly implementation of
|
||
// P256. The optimizations performed here are described in detail in:
|
||
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
|
||
// 256-bit primes"
|
||
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
|
||
// https://eprint.iacr.org/2013/816.pdf
|
||
|
||
|
||
#include "textflag.h"
|
||
|
||
// Register aliases used by the field (mod p) and scalar (mod n) routines that
// follow. Pointer aliases are reused as scratch registers once the pointer
// they hold is no longer needed.
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

// 64-bit accumulator limbs and two general temporaries.
#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15
|
||
|
||
// The field prime p, as four little-endian 64-bit limbs
// (c256const0 is the least significant limb):
//   p = 0xfffffffeffffffff_ffffffffffffffff_ffffffff00000000_ffffffffffffffff
DATA c256const0<>+0x00(SB)/8, $0xffffffffffffffff
DATA c256const1<>+0x00(SB)/8, $0xffffffff00000000
DATA c256const2<>+0x00(SB)/8, $0xffffffffffffffff
DATA c256const3<>+0x00(SB)/8, $0xfffffffeffffffff

// Montgomery constant for the group order n, with B = 2^64:
//   k0 * ord[0] = -1 mod B, i.e. k0 = -ord[0]^{-1} mod B
DATA c256ordK0<>+0x00(SB)/8, $0x327f9e8872350975

// The group order n, little-endian limbs.
DATA c256ord<>+0x00(SB)/8, $0x53bbf40939d54123
DATA c256ord<>+0x08(SB)/8, $0x7203df6b21c6052b
DATA c256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA c256ord<>+0x18(SB)/8, $0xfffffffeffffffff

// 1*R mod p with R = 2^256, i.e. the value one in Montgomery form.
DATA c256one<>+0x00(SB)/8, $0x0000000000000001
DATA c256one<>+0x08(SB)/8, $0x00000000ffffffff
DATA c256one<>+0x10(SB)/8, $0x0000000000000000
DATA c256one<>+0x18(SB)/8, $0x0000000100000000

// All tables are read-only; RODATA comes from textflag.h.
GLOBL c256const0<>(SB), RODATA, $8
GLOBL c256const1<>(SB), RODATA, $8
GLOBL c256const2<>(SB), RODATA, $8
GLOBL c256const3<>(SB), RODATA, $8
GLOBL c256ordK0<>(SB), RODATA, $8
GLOBL c256ord<>(SB), RODATA, $32
GLOBL c256one<>(SB), RODATA, $32
|
||
|
||
|
||
// montReduceQW: one limb of Montgomery reduction modulo p (B = 2^64).
//
// in:   a = [a0,a1,a2,a3,a4,a5] (little-endian limbs), a5 <= 1
// out:  a0 = 0, [a1,a2,a3,a4,a5] = (a + a0*p)/B = a/B mod p
// tmp0: scratch register, clobbered
//
// The low limb of p is 2^64-1, so -p^{-1} mod B = 1 and the reduction
// multiplier is a0 itself. With
//   p = B^4 - 2^32*B^3 - 2^32*B + B - 1
// the macro first adds a0*(B^4 + B) and then subtracts a0*(2^32*B^3 + 2^32*B);
// the "- a0" term cancels limb 0, which is simply cleared at the end.
//
// Bound: a < 2*B^5 and a0*p < B^5, so a + a0*p < 3*B^5 and the top output
// limb a5 is at most 2.
#define montReduceQW(a0,a1,a2,a3,a4,a5,tmp0) \
	ADDQ a0, a1 \
	ADCQ $0, a2 \
	ADCQ $0, a3 \
	ADCQ a0, a4 \
	ADCQ $0, a5 \
	\
	MOVQ a0, tmp0 \
	SHLQ $32, a0 \
	SHRQ $32, tmp0 \
	\
	SUBQ a0, a1 \
	SBBQ tmp0, a2 \
	SBBQ a0, a3 \
	SBBQ tmp0, a4 \
	SBBQ $0, a5 \
	XORQ a0, a0
|
||
|
||
// montMulOneQW: one limb of the Montgomery multiplication a*1 (B = 2^64).
//
// in:   a = [a0,a1,a2,a3] (little-endian limbs)
// out:  [a1,a2,a3,a0] = (a + a0*p)/B = a*B^{-1} mod p
//       (the result is rotated by one limb: a0 becomes the top limb)
// tmp0, tmp1: scratch registers, clobbered
//
// (a + a0*p)/B = [a1,a2,a3] + a0*(B^3 - 2^32*B^2 - 2^32 + 1), so the macro
// adds a0 at limb 0 and limb 3 and subtracts a0*2^32 (split into tmp0 = low
// half, tmp1 = high half) at limbs 0/1 and 2/3.
#define montMulOneQW(a0,a1,a2,a3,tmp0,tmp1) \
	MOVQ a0, tmp0 \
	SHLQ $32, tmp0 \
	MOVQ a0, tmp1 \
	SHRQ $32, tmp1 \
	ADDQ a0, a1 \
	ADCQ $0, a2 \
	ADCQ $0, a3 \
	ADCQ $0, a0 \
	SUBQ tmp0, a1 \
	SBBQ tmp1, a2 \
	SBBQ tmp0, a3 \
	SBBQ tmp1, a0
|
||
|
||
|
||
// montMulOne: full Montgomery reduction of a 256-bit value, i.e. MontMul(a, 1).
//
// in:   a = [a0,a1,a2,a3] (little-endian limbs)
// out:  [a0,a1,a2,a3] = a*R^{-1} mod p, R = 2^256
// tmp0, tmp1: scratch registers, clobbered
//
// Each montMulOneQW call rotates the limbs by one position, so after four
// calls the result is back in the original register order.
// The result is not fully reduced: it may be as large as p.
#define montMulOne(a0,a1,a2,a3,tmp0,tmp1) \
	montMulOneQW(a0,a1,a2,a3,tmp0,tmp1) \
	montMulOneQW(a1,a2,a3,a0,tmp0,tmp1) \
	montMulOneQW(a2,a3,a0,a1,tmp0,tmp1) \
	montMulOneQW(a3,a0,a1,a2,tmp0,tmp1)
|
||
|
||
// subCond: constant-time conditional subtraction of p.
//
// in:   [a0,a1,a2,a3,a4], a five-limb value (a4 is the overflow limb)
// out:  [a0,a1,a2,a3] = value - p if value >= p, otherwise unchanged
// s0..s3: scratch registers that receive a copy of a0..a3, clobbered
//
// p is subtracted unconditionally; if that borrows (CF = 1) the value was
// smaller than p and the saved limbs are moved back with CMOV.
// a4 is not restored and must be treated as clobbered by the caller.
#define subCond(a0, a1, a2, a3, a4, s0, s1, s2, s3) \
	MOVQ a0, s0 \
	MOVQ a1, s1 \
	MOVQ a2, s2 \
	MOVQ a3, s3 \
	\
	SUBQ $-1, a0 \
	SBBQ c256const1<>(SB), a1 \
	SBBQ $-1, a2 \
	SBBQ c256const3<>(SB), a3 \
	SBBQ $0, a4 \
	\
	CMOVQCS s0, a0 \
	CMOVQCS s1, a1 \
	CMOVQCS s2, a2 \
	CMOVQCS s3, a3
|
||
|
||
/* ---------------------------------------*/
|
||
// func c256LittleToBig(res []byte, in []uint64)
//
// Converts four little-endian 64-bit limbs into 32 big-endian bytes.
// The conversion (byte-swap each word and reverse the word order) is its own
// inverse and both functions take two slice arguments, so this simply
// tail-jumps to c256BigToLittle, which reads the same argument frame.
TEXT ·c256LittleToBig(SB),NOSPLIT,$0
	JMP ·c256BigToLittle(SB)
|
||
/* ---------------------------------------*/
|
||
// func c256BigToLittle(res []uint64, in []byte)
//
// Reads 32 bytes from in, byte-swaps each 8-byte word and stores the words in
// reverse order into res, turning a big-endian 256-bit integer into four
// little-endian 64-bit limbs. Slice lengths are not checked here.
TEXT ·c256BigToLittle(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	BSWAPQ acc0
	BSWAPQ acc1
	BSWAPQ acc2
	BSWAPQ acc3

	// First input word is the most significant, so it lands in the top limb.
	MOVQ acc3, (8*0)(res_ptr)
	MOVQ acc2, (8*1)(res_ptr)
	MOVQ acc1, (8*2)(res_ptr)
	MOVQ acc0, (8*3)(res_ptr)

	RET
|
||
/* ---------------------------------------*/
|
||
// func c256MovCond(res, a, b []uint64, cond int)
//
// Constant-time select of a 96-byte (12-limb) value:
//   cond == 0  =>  res = b
//   otherwise  =>  res = a
// Only the low 32 bits of cond are examined (the low dword is broadcast and
// compared lane-wise against zero).
TEXT ·c256MovCond(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ a+24(FP), x_ptr
	MOVQ b+48(FP), y_ptr
	MOVQ cond+72(FP), X12

	PXOR X13, X13
	PSHUFD $0, X12, X12
	PCMPEQL X13, X12	// X12 = all ones if cond == 0, otherwise X12 = 0

	// X0..X5 = ^X12 & a
	MOVOU X12, X0
	MOVOU (16*0)(x_ptr), X6
	PANDN X6, X0
	MOVOU X12, X1
	MOVOU (16*1)(x_ptr), X7
	PANDN X7, X1
	MOVOU X12, X2
	MOVOU (16*2)(x_ptr), X8
	PANDN X8, X2
	MOVOU X12, X3
	MOVOU (16*3)(x_ptr), X9
	PANDN X9, X3
	MOVOU X12, X4
	MOVOU (16*4)(x_ptr), X10
	PANDN X10, X4
	MOVOU X12, X5
	MOVOU (16*5)(x_ptr), X11
	PANDN X11, X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	// X6..X11 = X12 & b
	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	// res = (^X12 & a) ^ (X12 & b); exactly one of the two terms is non-zero.
	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	RET
|
||
/* ---------------------------------------*/
|
||
// func c256NegCond(val []uint64, cond int)
//
// Constant-time conditional negation modulo p, in place:
//   cond != 0  =>  val = p - val
//   cond == 0  =>  val unchanged
// p - val is always computed; CMOV then picks the result. Note that val = 0
// with cond != 0 yields p itself rather than 0.
// x_ptr and y_ptr are used here as plain scratch registers.
TEXT ·c256NegCond(SB),NOSPLIT,$0
	MOVQ val+0(FP), res_ptr
	MOVQ cond+24(FP), t0

	// acc = p
	MOVQ c256const0<>(SB), acc0
	MOVQ c256const1<>(SB), acc1
	MOVQ c256const2<>(SB), acc2
	MOVQ c256const3<>(SB), acc3

	// Load the original value
	MOVQ (8*0)(res_ptr), acc4
	MOVQ (8*1)(res_ptr), x_ptr
	MOVQ (8*2)(res_ptr), y_ptr
	MOVQ (8*3)(res_ptr), t1

	// acc = p - val
	SUBQ acc4, acc0
	SBBQ x_ptr, acc1
	SBBQ y_ptr, acc2
	SBBQ t1, acc3

	// If cond is 0, move the original value back
	TESTQ t0, t0
	CMOVQEQ acc4, acc0
	CMOVQEQ x_ptr, acc1
	CMOVQEQ y_ptr, acc2
	CMOVQEQ t1, acc3

	// Store result
	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET
|
||
|
||
/* ---------------------------------------*/
|
||
// func c256Sqr(res, in []uint64, n int)
//
// Montgomery squaring modulo p, repeated n times: the first pass squares in,
// every later pass squares the previous result stored in res.
// The loop body runs before the counter is tested, so n is expected to be
// at least 1.
//
// Per pass: compute the 512-bit square in
//   [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
// Montgomery-reduce the low half, add the high half and subtract p once if
// needed. x_ptr and y_ptr double as limb registers and x_ptr is reloaded
// from res_ptr at the end of each pass.
TEXT ·c256Sqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr
	MOVQ n+48(FP), BX

sqrLoop:
	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0
	MOVQ (8*1)(x_ptr), AX
	MULQ t0			// implicit operand AX: t0 * AX => DX:AX
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4

	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5
	// Q: DX <= B-2 after MULQ; can the two ADCQ $0, DX above overflow DX?
	// A: No.
	//   [0, acc1, acc2, acc3, acc4, acc5, carry]
	//     = (y1*B + y2*B^2 + y3*B^3)*y0 + (y2*B^2 + y3*B^3)*y1
	//    <= (B^3-1)*B*(B-1) + (B^2-1)*B^2*(B-1)
	//     < B^6

	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1
	// [0, acc1, acc2, acc3, acc4, acc5, y_ptr] = \sum_{i<j} yi*yj

	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1
	// [0, acc1, acc2, acc3, acc4, acc5, y_ptr, t1] = 2 * \sum_{i<j} yi*yj

	// Missing products: add the squares yi^2. Each square contributes its
	// low word to limb 2i and its high word (carried in t0) to limb 2i+1.
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr		// top limb now lives in x_ptr; t1 is free again

	// Reduction step: [acc0..acc3] = [acc0..acc3] / R mod p
	montMulOne(acc0,acc1,acc2,acc3, t0, t1)

	// Add bits [511:256] of the square; XORQ clears t0 and CF, so the first
	// ADCQ acts as a plain add. t0 receives the overflow limb.
	XORQ t0,t0
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	// Subtract p if [acc0, acc1, acc2, acc3, t0] >= p.
	subCond(acc0, acc1, acc2, acc3, t0, acc4, acc5, y_ptr, t1)

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)
	MOVQ res_ptr, x_ptr	// next pass squares the value just stored
	DECQ BX
	JNE sqrLoop

	RET
|
||
|
||
|
||
|
||
/* ---------------------------------------*/
|
||
// func c256Mul(res, in1, in2 []uint64)
//
// Montgomery multiplication modulo p: res = in1 * in2 * R^{-1} mod p, with
// R = 2^256 and B = 2^64. The product is built one limb of in2 at a time
// and each partial sum is reduced by one limb (montReduceQW) right away.
// The accumulator window rotates through acc0..acc5 by one register per step.
TEXT ·c256Mul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+24(FP), x_ptr
	MOVQ in2+48(FP), y_ptr

	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
	// [acc0, acc1, acc2, acc3, acc4, acc5(=0)] = [x0, x1, x2, x3] * y0

	// First reduction step
	montReduceQW(acc0,acc1,acc2,acc3,acc4, acc5, t0)
	// [acc1, acc2, acc3, acc4, acc5]
	//   = ([acc0, acc1, acc2, acc3, acc4, acc5] + acc0*p) / B
	//   = [x0, x1, x2, x3] * y0 / B mod p

	// x * y[1], added into [acc1, acc2, acc3, acc4, acc5, acc0].
	// t1 carries the high word of each partial product into the next limb.
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0		// acc0 was cleared by the previous reduction

	// Second reduction step
	montReduceQW(acc1, acc2, acc3, acc4, acc5, acc0, t0)

	// x * y[2], added into [acc2, acc3, acc4, acc5, acc0, acc1]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1

	// Third reduction step
	montReduceQW(acc2, acc3, acc4, acc5, acc0, acc1, t0)

	// x * y[3], added into [acc3, acc4, acc5, acc0, acc1, acc2]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2

	// Last reduction step
	montReduceQW(acc3, acc4, acc5, acc0, acc1, acc2, t0)

	// Note:
	//   [acc4, acc5, acc0, acc1, acc2]
	//     = ((((x*y0 + u0*p)/B + x*y1 + u1*p)/B + x*y2 + u2*p)/B + x*y3 + u3*p)/B
	//     = (x*y + u*p)/R
	//     < (p*p + R*p)/R      if x, y < p
	//     < 2p.
	//
	//   If only x, y < R is assumed, then
	//     (x*y + u*p)/R <= ((R-1)^2 + (R-1)*p)/R = (R-1)/R * (R-1+p) < R-1+p,
	//   so after subtracting p once the result is still less than R.

	// Subtract p if [acc4, acc5, acc0, acc1, acc2] >= p.
	subCond(acc4, acc5, acc0, acc1, acc2, x_ptr, acc3, t0, t1)

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
|
||
/* ---------------------------------------*/
|
||
// func c256FromMont(res, in []uint64)
//
// Converts in out of Montgomery form: res = in * R^{-1} mod p, R = 2^256.
// This is a Montgomery multiplication by 1 (montMulOne) followed by one
// constant-time conditional subtraction of p.
TEXT ·c256FromMont(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	montMulOne(acc0, acc1, acc2, acc3, t0, t1)

	// Keep a copy, then subtract p unconditionally.
	MOVQ acc0, x_ptr
	MOVQ acc1, acc4
	MOVQ acc2, t0
	MOVQ acc3, t1

	SUBQ $-1, acc0
	SBBQ c256const1<>(SB), acc1
	SBBQ $-1, acc2
	SBBQ c256const3<>(SB), acc3

	// Borrow (CF = 1) means the value was already below p: restore the copy.
	CMOVQCS x_ptr, acc0
	CMOVQCS acc4, acc1
	CMOVQCS t0, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET
|
||
|
||
/* ---------------------------------------*/
|
||
// Constant time point access to arbitrary point table.
// The table is scanned as 16 entries of 96 bytes each; entry k (0-based) is
// selected when idx == k+1. Every entry is read regardless of idx.
// If idx matches no entry (e.g. idx == 0, the implicit point at infinity),
// all 96 output bytes are zero. Only the low 32 bits of idx are compared.
// func c256Select(point, table []uint64, idx int)
TEXT ·c256Select(SB),NOSPLIT,$0
	MOVQ idx+48(FP),AX
	MOVQ table+24(FP),DI
	MOVQ point+0(FP),DX

	PXOR X15, X15		// X15 = 0
	PCMPEQL X14, X14	// X14 = -1
	PSUBL X14, X15		// X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14	// X14 = idx in every 32-bit lane

	// Accumulator for the selected entry.
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5
	MOVQ $16, AX		// number of table entries to scan

	MOVOU X15, X13		// X13 = running entry number, starting at 1

loop_select:

	// X12 = all ones if the current entry number equals idx, else 0.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X6
	MOVOU (16*1)(DI), X7
	MOVOU (16*2)(DI), X8
	MOVOU (16*3)(DI), X9
	MOVOU (16*4)(DI), X10
	MOVOU (16*5)(DI), X11
	ADDQ $(16*6), DI

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	DECQ AX
	JNE loop_select

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)
	MOVOU X4, (16*4)(DX)
	MOVOU X5, (16*5)(DX)

	RET
|
||
/* ---------------------------------------*/
|
||
// Constant time point access to base point table.
// The table is scanned as 32 entries of 64 bytes each, two entries per loop
// iteration; entry k (0-based) is selected when idx == k+1. Every entry is
// read regardless of idx. If idx matches no entry the 64 output bytes are
// zero. Only the low 32 bits of idx are compared.
// func c256SelectBase(point, table []uint64, idx int)
TEXT ·c256SelectBase(SB),NOSPLIT,$0
	MOVQ idx+48(FP),AX
	MOVQ table+24(FP),DI
	MOVQ point+0(FP),DX

	PXOR X15, X15		// X15 = 0
	PCMPEQL X14, X14	// X14 = -1
	PSUBL X14, X15		// X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14	// X14 = idx in every 32-bit lane

	// Accumulator for the selected entry.
	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	MOVQ $16, AX		// loop iterations; each one handles two entries

	MOVOU X15, X13		// X13 = running entry number, starting at 1

loop_select_base:

	// Mask for the first entry of the pair.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X4
	MOVOU (16*1)(DI), X5
	MOVOU (16*2)(DI), X6
	MOVOU (16*3)(DI), X7

	MOVOU (16*4)(DI), X8
	MOVOU (16*5)(DI), X9
	MOVOU (16*6)(DI), X10
	MOVOU (16*7)(DI), X11

	ADDQ $(16*8), DI

	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7

	// Mask for the second entry of the pair.
	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3

	DECQ AX
	JNE loop_select_base

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)

	RET
|
||
/* ---------------------------------------*/
|
||
// func c256OrdMul(res, in1, in2 []uint64)
//
// Montgomery multiplication modulo the group order n:
//   res = in1 * in2 * R^{-1} mod n, with R = 2^256 and limb base B = 2^64.
// Uses the generic word-by-word Montgomery method: after adding x*y[i] the
// lowest live limb is cancelled by adding m*n, where m = limb * k0 mod B and
// k0 = -n^{-1} mod B (kept in BX). The accumulator window rotates through
// acc0..acc5 by one register per step; t1 carries high words between limbs.
TEXT ·c256OrdMul(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+24(FP), x_ptr
	MOVQ in2+48(FP), y_ptr

	// x * y[0] -> [acc0, acc1, acc2, acc3, acc4], acc5 = 0
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5

	MOVQ c256ordK0<>(SB), BX

	// First reduction step
	// t0 = acc0 * k0 mod B
	MOVQ acc0, AX
	MULQ BX
	MOVQ AX, t0

	// [acc0, ..., acc5] += t0 * n; this makes acc0 zero.
	MOVQ c256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5

	// x * y[1]
	// [acc1, acc2, acc3, acc4, acc5, acc0] += x * y1
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0

	// Second reduction step
	MOVQ acc1, AX
	MULQ BX
	MOVQ AX, t0

	MOVQ c256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0

	// x * y[2]
	// [acc2, acc3, acc4, acc5, acc0, acc1] += x * y2
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1

	// Third reduction step
	MOVQ acc2, AX
	MULQ BX
	MOVQ AX, t0

	MOVQ c256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1

	// x * y[3]
	// [acc3, acc4, acc5, acc0, acc1, acc2] += x * y3
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2

	// Last reduction step
	MOVQ acc3, AX
	MULQ BX
	MOVQ AX, t0

	MOVQ c256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2

	// Result is [acc4, acc5, acc0, acc1] with overflow limb acc2.
	// Keep a copy of the low 256 bits.
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1

	// Subtract the order n.
	SUBQ c256ord<>+0x00(SB), acc4
	SBBQ c256ord<>+0x08(SB), acc5
	SBBQ c256ord<>+0x10(SB), acc0
	SBBQ c256ord<>+0x18(SB), acc1
	SBBQ $0, acc2

	// Borrow (CF = 1) means the value was below n: restore the copy.
	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET
|
||
|
||
/* ---------------------------------------*/
|
||
// func c256OrdSqr(res, in []uint64, n int)
//
// Montgomery squaring modulo the group order N, repeated n times: the first
// pass squares in, every later pass squares the previous result stored in
// res. The loop body runs before the counter is tested, so n is expected to
// be at least 1.
//
// Per pass: compute the 512-bit square in
//   [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
// Montgomery-reduce the low half limb by limb, add the high half and
// subtract N once if needed. The reduction exploits the shape of the two
// top limbs of N (B = 2^64):
//   N2 = B - 1,  N3 = B - 2^32 - 1
// so t0*N2 and t0*N3 are formed with shifts, adds and subtracts instead of
// multiplications.
TEXT ·c256OrdSqr(SB),NOSPLIT,$0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr
	MOVQ n+48(FP), BX

ordSqrLoop:

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4

	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5

	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1

	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1

	// Missing products: add the squares y[i]^2
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr

	// in*in = [acc0, acc1, acc2, acc3, acc4, acc5, y_ptr, x_ptr]
	// in*in / R mod N = [acc0, acc1, acc2, acc3] / R mod N + [acc4, acc5, y_ptr, x_ptr]

	// First reduction step
	// [acc1, acc2, acc3, acc0] = ([acc0, acc1, acc2, acc3] + t0 * N) / B
	// where t0 = acc0*k0 mod B and k0 = -N^-1 mod B.
	MOVQ acc0, AX
	MULQ c256ordK0<>(SB)
	MOVQ AX, t0		// t0 = acc0*k0 mod B

	// + t0 * N0
	MOVQ c256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	// + t0 * N1; the carry of the last ADDQ is left in CF for the next block
	MOVQ c256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1

	// + t0 * N2 with N2 = B - 1: add t0 one limb up (t1), subtract t0 here.
	// MOVQ does not touch flags, so ADCQ DX still sees the carry from above.
	// No carry out of t1 is possible:
	//   [acc0, acc1, acc2, t1] = [acc0, acc1, acc2] + t0 * [N0, N1, N2]
	//                         <= B^3 - 1 + t0 * (B^3 - 1)
	//                          = (t0 + 1)(B^3 - 1)
	//                         <= B(B^3 - 1)
	//                          < B^4
	MOVQ t0, t1
	ADCQ DX, acc2
	ADCQ $0, t1
	SUBQ t0, acc2
	SBBQ $0, t1

	// + t0 * N3 with N3 = B - 2^32 - 1: the new top limb (acc0) starts as t0,
	// then t0*2^32 (AX = low half, DX = high half) and t0 are subtracted.
	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc0
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc3
	ADCQ $0, acc0
	SUBQ AX, acc3
	SBBQ DX, acc0
	SUBQ t0, acc3
	SBBQ $0, acc0

	// Second reduction step
	MOVQ acc1, AX
	MULQ c256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ c256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2

	MOVQ t0, t1
	ADCQ DX, acc3
	ADCQ $0, t1
	SUBQ t0, acc3
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc1
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc0
	ADCQ $0, acc1
	SUBQ AX, acc0
	SBBQ DX, acc1
	SUBQ t0, acc0
	SBBQ $0, acc1

	// Third reduction step
	MOVQ acc2, AX
	MULQ c256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ c256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3

	MOVQ t0, t1
	ADCQ DX, acc0
	ADCQ $0, t1
	SUBQ t0, acc0
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc2
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc1
	ADCQ $0, acc2
	SUBQ AX, acc1
	SBBQ DX, acc2
	SUBQ t0, acc1
	SBBQ $0, acc2

	// Last reduction step
	// Same carry handling as the three steps above: the carry of
	// ADDQ AX, acc0 is consumed directly by ADCQ DX, acc1.
	MOVQ acc3, AX
	MULQ c256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ c256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ c256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0

	MOVQ t0, t1
	ADCQ DX, acc1
	ADCQ $0, t1
	SUBQ t0, acc1
	SBBQ $0, t1

	MOVQ t0, AX
	MOVQ t0, DX
	MOVQ t0, acc3
	SHLQ $32, AX
	SHRQ $32, DX

	ADDQ t1, acc2
	ADCQ $0, acc3
	SUBQ AX, acc2
	SBBQ DX, acc3
	SUBQ t0, acc2
	SBBQ $0, acc3

	// Add the high half of the square; XORQ clears t0 and CF, so the first
	// ADCQ acts as a plain add. t0 receives the overflow limb.
	XORQ t0, t0
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	// Subtract N from [acc0, acc1, acc2, acc3, t0] if necessary:
	// keep a copy, subtract, and restore the copy on borrow.
	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1

	SUBQ c256ord<>+0x00(SB), acc0
	SBBQ c256ord<>+0x08(SB), acc1
	SBBQ c256ord<>+0x10(SB), acc2
	SBBQ c256ord<>+0x18(SB), acc3
	SBBQ $0, t0

	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)
	MOVQ res_ptr, x_ptr	// next pass squares the value just stored
	DECQ BX
	JNE ordSqrLoop

	RET
|
||
/* ---------------------------------------*/
|
||
// arithmetic for Fp and Fn complete.
|
||
// Drop the register aliases used by the Fp/Fn arithmetic routines above.
#undef acc0
#undef acc1
#undef acc2
#undef acc3
#undef acc4
#undef acc5
#undef t0
#undef t1

#undef res_ptr
#undef x_ptr
#undef y_ptr

/* ---------------------------------------*/
// Register aliases for the internal field helpers and the point routines
// that follow.

// AX/DX: implicit low/high operands of MULQ; AX doubles as generic scratch.
#define mul0 AX
#define mul1 DX

// Eight 64-bit accumulator limbs.
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13

// Second-operand / temporary limbs.
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI

// Extra scratch register. NOTE: this is the frame-pointer register.
#define hlp BP
|
||
|
||
/* ---------------------------------------*/
|
||
// c256SubInternal is an internal, register-only helper: it takes no Go stack
// arguments and saves/restores nothing.
//
//   in:       a = [acc4,acc5,acc6,acc7], b = [t0,t1,t2,t3]
//   out:      [acc4,acc5,acc6,acc7] = a - b, with p added back once when the
//             raw 256-bit subtraction borrowed
//   clobbers: mul0, acc0..acc3, flags
TEXT c256SubInternal(SB),NOSPLIT,$0
	XORQ mul0, mul0

	// Raw a - b; the final borrow ends up in mul0 (0 or all ones).
	SUBQ t0, acc4
	SBBQ t1, acc5
	SBBQ t2, acc6
	SBBQ t3, acc7
	SBBQ $0, mul0

	// Remember the raw difference.
	MOVQ acc7, acc3
	MOVQ acc6, acc2
	MOVQ acc5, acc1
	MOVQ acc4, acc0

	// Speculatively add p (limbs: -1, c256const1, -1, c256const3).
	ADDQ $-1, acc4
	ADCQ c256const1<>(SB), acc5
	ADCQ $-1, acc6
	ADCQ c256const3<>(SB), acc7

	// ZF is set iff there was no borrow; in that case take the raw difference.
	ANDQ $1, mul0
	CMOVQEQ acc0, acc4
	CMOVQEQ acc1, acc5
	CMOVQEQ acc2, acc6
	CMOVQEQ acc3, acc7

	RET
|
||
/* ---------------------------------------*/
|
||
// c256MulInternal is an internal, register-only helper.
//
//   in:       a = [acc4,acc5,acc6,acc7], t = [t0,t1,t2,t3]
//   out:      [acc4,acc5,acc6,acc7] = low-half-reduced a*t, conditionally
//             reduced by one subtraction of p (Montgomery product, assuming
//             montMulOne -- defined elsewhere in this file -- performs the
//             R^-1 reduction of [acc0..acc3]; confirm against its definition)
//   clobbers: mul0, mul1, acc0..acc3, flags
//   t0..t3 are not written by the instructions visible here.
//
// The frame size is 8 rather than 0 on purpose: hlp is BP, and with a
// non-zero frame the Go assembler saves BP on entry and restores it before
// RET. With a $0 NOSPLIT frame the caller's frame pointer would be destroyed.
TEXT c256MulInternal(SB),NOSPLIT,$8
	// a[0] * t -> [acc0..acc4]
	MOVQ acc4, mul0
	MULQ t0
	MOVQ mul0, acc0
	MOVQ mul1, acc1

	MOVQ acc4, mul0
	MULQ t1
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ t2
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ t3
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, acc4

	// a[1] * t added into [acc1..acc5]; hlp carries the running high word.
	MOVQ acc5, mul0
	MULQ t0
	ADDQ mul0, acc1
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t1
	ADDQ hlp, acc2
	ADCQ $0, mul1
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t2
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ t3
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, acc5

	// a[2] * t added into [acc2..acc6]
	MOVQ acc6, mul0
	MULQ t0
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t1
	ADDQ hlp, acc3
	ADCQ $0, mul1
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t2
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc6, mul0
	MULQ t3
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, acc6

	// a[3] * t added into [acc3..acc7]
	MOVQ acc7, mul0
	MULQ t0
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t1
	ADDQ hlp, acc4
	ADCQ $0, mul1
	ADDQ mul0, acc4
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t2
	ADDQ hlp, acc5
	ADCQ $0, mul1
	ADDQ mul0, acc5
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc7, mul0
	MULQ t3
	ADDQ hlp, acc6
	ADCQ $0, mul1
	ADDQ mul0, acc6
	ADCQ $0, mul1
	MOVQ mul1, acc7

	// The 512-bit product is now in [acc0, acc1, ..., acc7].
	// Compute [acc0,acc1,acc2,acc3]*R^{-1} + [acc4,acc5,acc6,acc7].

	// Reduce the low half.
	montMulOne(acc0,acc1,acc2,acc3, mul0, hlp)

	// Add bits [511:256] of the product; hlp receives the carry-out.
	XORQ hlp, hlp
	ADDQ acc0, acc4
	ADCQ acc1, acc5
	ADCQ acc2, acc6
	ADCQ acc3, acc7
	ADCQ $0, hlp

	// Keep the unreduced sum.
	MOVQ acc4, acc0
	MOVQ acc5, acc1
	MOVQ acc6, acc2
	MOVQ acc7, acc3

	// Speculatively subtract p (limbs: -1, c256const1, -1, c256const3).
	SUBQ $-1, acc4
	SBBQ c256const1<>(SB), acc5
	SBBQ $-1, acc6
	SBBQ c256const3<>(SB), acc7
	SBBQ $0, hlp

	// If the subtraction went negative, restore the unreduced sum.
	CMOVQCS acc0, acc4
	CMOVQCS acc1, acc5
	CMOVQCS acc2, acc6
	CMOVQCS acc3, acc7

	RET
|
||
/* ---------------------------------------*/
|
||
// c256SqrInternal is an internal, register-only helper.
//
//   in:       a = [acc4,acc5,acc6,acc7]
//   out:      [acc4,acc5,acc6,acc7] = low-half-reduced a^2, conditionally
//             reduced by one subtraction of p (Montgomery square, assuming
//             montMulOne -- defined elsewhere in this file -- performs the
//             R^-1 reduction of [acc0..acc3]; confirm against its definition)
//   clobbers: mul0, mul1, acc0..acc3, t0..t3, flags
//
// The frame size is 8 rather than 0 on purpose: hlp is BP, and with a
// non-zero frame the Go assembler saves BP on entry and restores it before
// RET. With a $0 NOSPLIT frame the caller's frame pointer would be destroyed.
TEXT c256SqrInternal(SB),NOSPLIT,$8
	// Off-diagonal products a[i]*a[j] (i < j), accumulated into
	// [acc1, acc2, acc3, t0, t1, t2].
	MOVQ acc4, mul0
	MULQ acc5
	MOVQ mul0, acc1
	MOVQ mul1, acc2

	MOVQ acc4, mul0
	MULQ acc6
	ADDQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc3

	MOVQ acc4, mul0
	MULQ acc7
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, t0

	MOVQ acc5, mul0
	MULQ acc6
	ADDQ mul0, acc3
	ADCQ $0, mul1
	MOVQ mul1, hlp

	MOVQ acc5, mul0
	MULQ acc7
	ADDQ hlp, t0
	ADCQ $0, mul1
	ADDQ mul0, t0
	ADCQ $0, mul1
	MOVQ mul1, t1

	MOVQ acc6, mul0
	MULQ acc7
	ADDQ mul0, t1
	ADCQ $0, mul1
	MOVQ mul1, t2
	XORQ t3, t3

	// Double the off-diagonal sum; t3 receives the carry-out.
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ t0, t0
	ADCQ t1, t1
	ADCQ t2, t2
	ADCQ $0, t3

	// Add the diagonal products a[i]^2; acc4 is reused as the carried
	// high word once a[0] has been consumed.
	MOVQ acc4, mul0
	MULQ mul0
	MOVQ mul0, acc0
	MOVQ mul1, acc4

	MOVQ acc5, mul0
	MULQ mul0
	ADDQ acc4, acc1
	ADCQ mul0, acc2
	ADCQ $0, mul1
	MOVQ mul1, acc4

	MOVQ acc6, mul0
	MULQ mul0
	ADDQ acc4, acc3
	ADCQ mul0, t0
	ADCQ $0, mul1
	MOVQ mul1, acc4

	MOVQ acc7, mul0
	MULQ mul0
	ADDQ acc4, t1
	ADCQ mul0, t2
	ADCQ mul1, t3

	// The 512-bit square is now in [acc0..acc3] (low) and [t0..t3] (high).
	// Reduce the low half.
	montMulOne(acc0,acc1,acc2,acc3, mul0, hlp)

	// Add bits [511:256] of the square; hlp receives the carry-out.
	XORQ hlp, hlp
	ADDQ acc0, t0
	ADCQ acc1, t1
	ADCQ acc2, t2
	ADCQ acc3, t3
	ADCQ $0, hlp

	// Work on a copy so the unreduced sum stays in t0..t3.
	MOVQ t0, acc4
	MOVQ t1, acc5
	MOVQ t2, acc6
	MOVQ t3, acc7

	// Speculatively subtract p (limbs: -1, c256const1, -1, c256const3).
	SUBQ $-1, acc4
	SBBQ c256const1<>(SB), acc5
	SBBQ $-1, acc6
	SBBQ c256const3<>(SB), acc7
	SBBQ $0, hlp

	// If the subtraction went negative, restore the unreduced sum.
	CMOVQCS t0, acc4
	CMOVQCS t1, acc5
	CMOVQCS t2, acc6
	CMOVQCS t3, acc7

	RET
|
||
/* ---------------------------------------*/
|
||
// c256MulBy2Inline: t0:t3 = 2 * acc[4:8] mod p.
// The reduced result is delivered in t0..t3; acc4..acc7 are left holding the
// low 256 bits of the doubled (unreduced) value. Clobbers mul0 and flags.
#define c256MulBy2Inline \
	XORQ mul0, mul0; \
	ADDQ acc4, acc4; ADCQ acc5, acc5; ADCQ acc6, acc6; ADCQ acc7, acc7; \
	ADCQ $0, mul0; \
	MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3; \
	SUBQ $-1, t0; \
	SBBQ c256const1<>(SB), t1; \
	SBBQ $-1, t2; \
	SBBQ c256const3<>(SB), t3; \
	SBBQ $0, mul0; \
	CMOVQCS acc4, t0; CMOVQCS acc5, t1; CMOVQCS acc6, t2; CMOVQCS acc7, t3;
|
||
/* ---------------------------------------*/
|
||
// c256AddInline: t0:t3 = (t0:t3 + acc[4:8]) mod p.
// The reduced result is delivered in t0..t3; acc4..acc7 are left holding the
// low 256 bits of the raw (unreduced) sum. Clobbers mul0 and flags.
#define c256AddInline \
	XORQ mul0, mul0; \
	ADDQ t0, acc4; ADCQ t1, acc5; ADCQ t2, acc6; ADCQ t3, acc7; \
	ADCQ $0, mul0; \
	MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3; \
	SUBQ $-1, t0; \
	SBBQ c256const1<>(SB), t1; \
	SBBQ $-1, t2; \
	SBBQ c256const3<>(SB), t3; \
	SBBQ $0, mul0; \
	CMOVQCS acc4, t0; CMOVQCS acc5, t1; CMOVQCS acc6, t2; CMOVQCS acc7, t3;
|
||
/* ---------------------------------------*/
|
||
// Four-limb move helpers. src/dst is a function-like stack-slot macro taking
// a byte offset (for example x1in); limbs are transferred in ascending order.

// LDacc: load a slot into acc4..acc7.
#define LDacc(src) \
	MOVQ src(8*0), acc4; \
	MOVQ src(8*1), acc5; \
	MOVQ src(8*2), acc6; \
	MOVQ src(8*3), acc7

// LDt: load a slot into t0..t3.
#define LDt(src) \
	MOVQ src(8*0), t0; \
	MOVQ src(8*1), t1; \
	MOVQ src(8*2), t2; \
	MOVQ src(8*3), t3

// ST: store acc4..acc7 into a slot.
#define ST(dst) \
	MOVQ acc4, dst(8*0); \
	MOVQ acc5, dst(8*1); \
	MOVQ acc6, dst(8*2); \
	MOVQ acc7, dst(8*3)

// STt: store t0..t3 into a slot.
#define STt(dst) \
	MOVQ t0, dst(8*0); \
	MOVQ t1, dst(8*1); \
	MOVQ t2, dst(8*2); \
	MOVQ t3, dst(8*3)

// acc2t: copy acc4..acc7 into t0..t3.
#define acc2t \
	MOVQ acc4, t0; \
	MOVQ acc5, t1; \
	MOVQ acc6, t2; \
	MOVQ acc7, t3

// t2acc: copy t0..t3 into acc4..acc7.
#define t2acc \
	MOVQ t0, acc4; \
	MOVQ t1, acc5; \
	MOVQ t2, acc6; \
	MOVQ t3, acc7
|
||
/* ---------------------------------------*/
|
||
// Stack frame of ·c256PointAddAffineAsm: fifteen 32-byte field-element slots,
// then the saved result pointer and the two 32-bit selector words
// (32*16 = 512 bytes in total).
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define xout(off) (32*5 + off)(SP)
#define yout(off) (32*6 + off)(SP)
#define zout(off) (32*7 + off)(SP)
#define s2(off) (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off) (32*10 + off)(SP)
#define r(off) (32*11 + off)(SP)
#define hsqr(off) (32*12 + off)(SP)
#define rsqr(off) (32*13 + off)(SP)
#define hcub(off) (32*14 + off)(SP)
#define rptr (32*15)(SP)
#define sel_save (32*15 + 8)(SP)
#define zero_save (32*15 + 8 + 4)(SP)

// func c256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
//
// Mixed addition: in1 is read as three 4-limb coordinates (x, y, z), in2 as
// two (x, y). When sign != 0 the y coordinate of in2 is negated first.
// The output written to res is chosen without branching:
//   zero == 0 -> (x2, +/-y2, c256one)
//   sel  == 0 -> in1 unchanged (only effective when zero != 0)
//   otherwise -> the computed sum
// Only the low 32 bits of sel and zero are stored and tested.
//
// Formulas: special case of
// https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
// with Z2 = 1 in the Montgomery domain. The general form is
//   Z1Z1 = Z1^2, Z2Z2 = Z2^2
//   U1 = X1*Z2Z2, U2 = X2*Z1Z1
//   S1 = Y1*Z2*Z2Z2, S2 = Y2*Z1*Z1Z1
//   H = U2-U1, I = (2*H)^2, J = H*I, r = 2*(S2-S1), V = U1*I
//   X3 = r^2-J-2*V
//   Y3 = r*(V-X3)-2*S1*J
//   Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H
// With Z2 = 1:
//   X3 = 4 * ((S2-S1)^2 - H^3 - 2*U1*H^2)
//   Y3 = 8 * ((S2-S1)(U1*H^2-X3/4) - S1*H^3)
//   Z3 = 2 * Z1 * H
// After dropping the common scale factors, the code evaluates:
//   Z1Z1 = Z1^2
//   U1 = X1, U2 = X2*Z1Z1
//   S1 = Y1, S2 = Y2*Z1*Z1Z1
//   H = U2 - X1, r = S2 - S1
//   X3 = r^2 - H^3 - 2*X1*H^2
//   Y3 = r*(X1*H^2 - X3) - Y1*H^3
//   Z3 = Z1 * H
TEXT ·c256PointAddAffineAsm(SB),0,$512-96
	// Fetch all arguments up front; the registers are reused below.
	MOVQ res+0(FP), AX
	MOVQ in1+24(FP), BX
	MOVQ in2+48(FP), CX
	MOVQ sign+72(FP), DX
	MOVQ sel+80(FP), t1
	MOVQ zero+88(FP), t2

	// Park the result pointer and both selectors on the stack.
	MOVQ AX, rptr
	MOVL t1, sel_save
	MOVL t2, zero_save

	// Copy in1 = [x, y, z] (12 uint64, moved as six 128-bit halves).
	MOVOU (16*0)(BX), X0 // x
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2 // y
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4 // z
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	// Copy x of in2.
	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)

	// Conditionally negate y of in2.
	// acc[4:8] = y2; acc[0:4] starts as p. From here BX and CX (acc0, acc1)
	// no longer hold the input pointers.
	MOVQ (16*2 + 8*0)(CX), acc4
	MOVQ (16*2 + 8*1)(CX), acc5
	MOVQ (16*2 + 8*2)(CX), acc6
	MOVQ (16*2 + 8*3)(CX), acc7
	MOVQ $-1, acc0
	MOVQ c256const1<>(SB), acc1
	MOVQ $-1, acc2
	MOVQ c256const3<>(SB), acc3
	XORQ mul0, mul0

	// Speculatively compute p - y2; mul0 collects the borrow.
	SUBQ acc4, acc0
	SBBQ acc5, acc1
	SBBQ acc6, acc2
	SBBQ acc7, acc3
	SBBQ $0, mul0

	// t0:t3 = p - y2 (kept as the alternative result).
	MOVQ acc0, t0
	MOVQ acc1, t1
	MOVQ acc2, t2
	MOVQ acc3, t3

	// Add p again in case the operand was > p.
	ADDQ $-1, acc0
	ADCQ c256const1<>(SB), acc1
	ADCQ $-1, acc2
	ADCQ c256const3<>(SB), acc3

	// Same scheme as the Go standard library implementation.
	// When p > y2in > p - (R-p), [acc0..acc3] ends up as 2p - y2in > p;
	// for example y2in = p-1 yields p+1. The original author considers
	// this acceptable.
	ADCQ $0, mul0
	CMOVQNE t0, acc0
	CMOVQNE t1, acc1
	CMOVQNE t2, acc2
	CMOVQNE t3, acc3
	// TODO (from the original author, not verified): this could be replaced
	// by CMPQ mul0, $0 followed by CMOVQEQ of t0..t3 into acc0..acc3.

	// sign == 0: keep the original y2.
	TESTQ DX, DX
	CMOVQEQ acc4, acc0
	CMOVQEQ acc5, acc1
	CMOVQEQ acc6, acc2
	CMOVQEQ acc7, acc3

	// y2in = (sign ? -y2 : y2)
	MOVQ acc0, y2in(8*0)
	MOVQ acc1, y2in(8*1)
	MOVQ acc2, y2in(8*2)
	MOVQ acc3, y2in(8*3)

	// ---- point addition ----
	LDacc (z1in)
	CALL c256SqrInternal(SB) // z1^2
	ST (z1sqr)

	LDt (x2in)
	CALL c256MulInternal(SB) // u2 = x2 * z1^2

	LDt (x1in)
	CALL c256SubInternal(SB) // h = u2 - u1
	ST (h)

	LDt (z1in)
	CALL c256MulInternal(SB) // z3 = h * z1
	ST (zout)

	// t0:t3 is still z1 here.
	LDacc (z1sqr)
	CALL c256MulInternal(SB) // z1^3

	LDt (y2in)
	CALL c256MulInternal(SB) // s2 = y2 * z1^3
	ST (s2)

	LDt (y1in)
	CALL c256SubInternal(SB) // r = s2 - s1
	ST (r)

	CALL c256SqrInternal(SB) // rsqr = r^2
	ST (rsqr)

	LDacc (h)
	CALL c256SqrInternal(SB) // hsqr = h^2
	ST (hsqr)

	LDt (h)
	CALL c256MulInternal(SB) // hcub = h^3
	ST (hcub)

	LDt (y1in)
	CALL c256MulInternal(SB) // y1 * h^3
	ST (s2) // the s2 slot is reused for y1 * h^3

	LDacc (x1in)
	LDt (hsqr)
	CALL c256MulInternal(SB) // u1 * h^2
	ST (h) // the h slot is reused for u1 * h^2

	c256MulBy2Inline // t = 2 * u1 * h^2
	LDacc (rsqr)
	CALL c256SubInternal(SB) // r^2 - 2 * u1 * h^2

	LDt (hcub)
	CALL c256SubInternal(SB) // x3 = r^2 - 2 * u1 * h^2 - h^3
	ST (xout)

	acc2t // t = x3
	LDacc (h)
	CALL c256SubInternal(SB) // u1 * h^2 - x3

	LDt (r)
	CALL c256MulInternal(SB) // r * (u1 * h^2 - x3)

	LDt (s2)
	CALL c256SubInternal(SB) // y3 = r * (u1 * h^2 - x3) - y1 * h^3
	ST (yout)

	// ---- branch-free result selection ----
	MOVQ rptr, AX
	MOVL sel_save, BX
	MOVL zero_save, CX

	// X0..X5 = computed (x3, y3, z3).
	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5

	MOVL BX, X6
	MOVL CX, X7

	PXOR X8, X8 // X8 = 0
	PCMPEQL X9, X9 // X9 = all ones

	// Broadcast the low 32 bits of sel / zero to every lane.
	PSHUFD $0, X6, X6
	PSHUFD $0, X7, X7

	// X6 = all ones iff sel == 0; X7 = all ones iff zero == 0.
	PCMPEQL X8, X6
	PCMPEQL X8, X7

	// X15 = ~X6
	MOVOU X6, X15
	PANDN X9, X15

	MOVOU x1in(16*0), X9
	MOVOU x1in(16*1), X10
	MOVOU y1in(16*0), X11
	MOVOU y1in(16*1), X12
	MOVOU z1in(16*0), X13
	MOVOU z1in(16*1), X14

	// out = (out & ~X6) ^ (in1 & X6): in1 is taken when sel == 0.
	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X6, X9
	PAND X6, X10
	PAND X6, X11
	PAND X6, X12
	PAND X6, X13
	PAND X6, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5

	// Same again for zero == 0, selecting (x2, y2in, c256one).
	PCMPEQL X9, X9
	MOVOU X7, X15
	PANDN X9, X15

	MOVOU x2in(16*0), X9
	MOVOU x2in(16*1), X10
	MOVOU y2in(16*0), X11
	MOVOU y2in(16*1), X12
	MOVOU c256one<>+0x00(SB), X13
	MOVOU c256one<>+0x10(SB), X14

	PAND X15, X0
	PAND X15, X1
	PAND X15, X2
	PAND X15, X3
	PAND X15, X4
	PAND X15, X5

	PAND X7, X9
	PAND X7, X10
	PAND X7, X11
	PAND X7, X12
	PAND X7, X13
	PAND X7, X14

	PXOR X9, X0
	PXOR X10, X1
	PXOR X11, X2
	PXOR X12, X3
	PXOR X13, X4
	PXOR X14, X5

	// Write the selected point to res and clear the saved pointer slot.
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)
	MOVQ $0, rptr

	RET

#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save
|
||
|
||
// c256IsZero is an internal, register-only helper.
//
//   in:       [acc4,acc5,acc6,acc7]
//   out:      AX = 1 if the input is all-zero or equal to p (both
//             representations of zero), AX = 0 otherwise
//   clobbers: acc4..acc7, t0, t1, flags
TEXT c256IsZero(SB),NOSPLIT,$0
	// CMOV cannot take an immediate source, so keep the constant 1 in t1.
	MOVQ $1, t1
	// AX is the result flag; it starts out cleared.
	XORQ AX, AX

	// First test: OR of all four limbs is zero.
	MOVQ acc4, t0
	ORQ acc5, t0
	ORQ acc6, t0
	ORQ acc7, t0
	CMOVQEQ t1, AX

	// Second test: XOR every limb with the matching limb of p
	// (-1, c256const1, -1, c256const3) and check for all-zero again.
	XORQ $-1, acc4
	XORQ c256const1<>(SB), acc5
	XORQ $-1, acc6
	XORQ c256const3<>(SB), acc7
	ORQ acc5, acc4
	ORQ acc6, acc4
	ORQ acc7, acc4
	CMOVQEQ t1, AX

	RET
|
||
|
||
/* ---------------------------------------*/
|
||
// Stack frame of ·c256PointAddAsm: twenty 32-byte field-element slots, then
// the saved result pointer and the points-equal flag.
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)

#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)

#define u1(off) (32*9 + off)(SP)
#define u2(off) (32*10 + off)(SP)
#define s1(off) (32*11 + off)(SP)
#define s2(off) (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off) (32*15 + off)(SP)
#define r(off) (32*16 + off)(SP)
#define hsqr(off) (32*17 + off)(SP)
#define rsqr(off) (32*18 + off)(SP)
#define hcub(off) (32*19 + off)(SP)
#define rptr (32*20)(SP)
#define points_eq (32*20+8)(SP)

// func c256PointAddAsm(res, in1, in2 []uint64) int
//
// Full Jacobian addition of in1 and in2 (each read as x, y, z of four limbs);
// the sum is written to res. See
// https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
//
// The return value is c256IsZero(r) AND c256IsZero(h), i.e. 1 when both
// s2 - s1 and u2 - u1 are zero. The sum is written to res regardless;
// presumably the caller must handle that case separately -- confirm at the
// call site.
TEXT ·c256PointAddAsm(SB),0,$680-80
	MOVQ res+0(FP), AX
	MOVQ in1+24(FP), BX
	MOVQ in2+48(FP), CX

	// Copy in1 to the stack so the registers can be reused.
	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x1in(16*0)
	MOVOU X1, x1in(16*1)
	MOVOU X2, y1in(16*0)
	MOVOU X3, y1in(16*1)
	MOVOU X4, z1in(16*0)
	MOVOU X5, z1in(16*1)

	// Copy in2 likewise.
	MOVOU (16*0)(CX), X0
	MOVOU (16*1)(CX), X1
	MOVOU (16*2)(CX), X2
	MOVOU (16*3)(CX), X3
	MOVOU (16*4)(CX), X4
	MOVOU (16*5)(CX), X5

	MOVOU X0, x2in(16*0)
	MOVOU X1, x2in(16*1)
	MOVOU X2, y2in(16*0)
	MOVOU X3, y2in(16*1)
	MOVOU X4, z2in(16*0)
	MOVOU X5, z2in(16*1)

	// Keep the result pointer for the very end.
	MOVQ AX, rptr

	// ---- point addition ----
	LDacc (z2in)
	CALL c256SqrInternal(SB) // z2^2
	ST (z2sqr)
	LDt (z2in)
	CALL c256MulInternal(SB) // z2^3
	LDt (y1in)
	CALL c256MulInternal(SB) // s1 = z2^3 * y1
	ST (s1)

	LDacc (z1in)
	CALL c256SqrInternal(SB) // z1^2
	ST (z1sqr)
	LDt (z1in)
	CALL c256MulInternal(SB) // z1^3
	LDt (y2in)
	CALL c256MulInternal(SB) // s2 = z1^3 * y2
	ST (s2)

	LDt (s1)
	CALL c256SubInternal(SB) // r = s2 - s1
	ST (r)
	CALL c256IsZero(SB) // AX = (r == 0)
	MOVQ AX, points_eq

	LDacc (z2sqr)
	LDt (x1in)
	CALL c256MulInternal(SB) // u1 = x1 * z2^2
	ST (u1)
	LDacc (z1sqr)
	LDt (x2in)
	CALL c256MulInternal(SB) // u2 = x2 * z1^2
	ST (u2)

	LDt (u1)
	CALL c256SubInternal(SB) // h = u2 - u1
	ST (h)
	CALL c256IsZero(SB) // AX = (h == 0)
	ANDQ points_eq, AX
	MOVQ AX, points_eq // points_eq = (r == 0) & (h == 0)

	LDacc (r)
	CALL c256SqrInternal(SB) // rsqr = r^2
	ST (rsqr)

	LDacc (h)
	CALL c256SqrInternal(SB) // hsqr = h^2
	ST (hsqr)

	LDt (h)
	CALL c256MulInternal(SB) // hcub = h^3
	ST (hcub)

	LDt (s1)
	CALL c256MulInternal(SB) // s1 * h^3
	ST (s2) // the s2 slot is reused for s1 * h^3

	LDacc (z1in)
	LDt (z2in)
	CALL c256MulInternal(SB) // z1 * z2
	LDt (h)
	CALL c256MulInternal(SB) // z3 = z1 * z2 * h
	ST (zout)

	LDacc (hsqr)
	LDt (u1)
	CALL c256MulInternal(SB) // h^2 * u1
	ST (u2) // the u2 slot is reused for u1 * h^2

	c256MulBy2Inline // t = 2 * u1 * h^2
	LDacc (rsqr)
	CALL c256SubInternal(SB) // r^2 - 2 * u1 * h^2

	LDt (hcub)
	CALL c256SubInternal(SB) // x3 = r^2 - 2 * u1 * h^2 - h^3
	ST (xout)

	acc2t // t = x3
	LDacc (u2)
	CALL c256SubInternal(SB) // u1 * h^2 - x3

	LDt (r)
	CALL c256MulInternal(SB) // r * (u1 * h^2 - x3)

	LDt (s2)
	CALL c256SubInternal(SB) // y3 = r * (u1 * h^2 - x3) - s1 * h^3
	ST (yout)

	// Gather the result from the stack.
	MOVOU xout(16*0), X0
	MOVOU xout(16*1), X1
	MOVOU yout(16*0), X2
	MOVOU yout(16*1), X3
	MOVOU zout(16*0), X4
	MOVOU zout(16*1), X5

	// Write it to res and clear the saved pointer slot.
	MOVQ rptr, AX
	MOVQ $0, rptr
	MOVOU X0, (16*0)(AX)
	MOVOU X1, (16*1)(AX)
	MOVOU X2, (16*2)(AX)
	MOVOU X3, (16*3)(AX)
	MOVOU X4, (16*4)(AX)
	MOVOU X5, (16*5)(AX)

	// Return the points-equal flag.
	MOVQ points_eq, AX
	MOVQ AX, ret+72(FP)

	RET

#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef s1
#undef s2
#undef u1
#undef u2
#undef z1sqr
#undef z2sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
|
||
/* ---------------------------------------*/
|
||
// Stack frame of ·c256PointDoubleAsm: seven 32-byte field-element slots
// followed by the saved result pointer.
#define x(off) (32*0 + off)(SP)
#define y(off) (32*1 + off)(SP)
#define z(off) (32*2 + off)(SP)

#define s(off) (32*3 + off)(SP)
#define m(off) (32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)

// func c256PointDoubleAsm(res, in []uint64)
//
// Jacobian point doubling; in is read as x, y, z of four limbs each and the
// result is written to res. Values computed, in order:
//   zsqr = z^2
//   z3   = 2 * y * z
//   m    = 3 * (x - zsqr) * (x + zsqr)
//   s    = x * (2y)^2
//   y'   = (2y)^4 / 2
//   x3   = m^2 - 2*s
//   y3   = m * (s - x3) - y'
// NOTE: the original author left a "check once more" TODO on this routine.
TEXT ·c256PointDoubleAsm(SB),NOSPLIT,$256-48
	MOVQ res+0(FP), AX
	MOVQ in+24(FP), BX

	// Copy the input point to the stack so the registers can be reused.
	MOVOU (16*0)(BX), X0
	MOVOU (16*1)(BX), X1
	MOVOU (16*2)(BX), X2
	MOVOU (16*3)(BX), X3
	MOVOU (16*4)(BX), X4
	MOVOU (16*5)(BX), X5

	MOVOU X0, x(16*0)
	MOVOU X1, x(16*1)
	MOVOU X2, y(16*0)
	MOVOU X3, y(16*1)
	MOVOU X4, z(16*0)
	MOVOU X5, z(16*1)

	// Keep the result pointer; AX (mul0) is clobbered by the helpers.
	MOVQ AX, rptr

	// ---- point doubling ----
	LDacc (z)
	CALL c256SqrInternal(SB) // zsqr = z^2
	ST (zsqr)

	LDt (x)
	c256AddInline // t = x + zsqr
	STt (m)

	LDacc (z)
	LDt (y)
	CALL c256MulInternal(SB) // y * z
	c256MulBy2Inline // t = 2 * y * z

	// Store z3 straight into the result.
	MOVQ rptr, AX
	MOVQ t0, (16*4 + 8*0)(AX)
	MOVQ t1, (16*4 + 8*1)(AX)
	MOVQ t2, (16*4 + 8*2)(AX)
	MOVQ t3, (16*4 + 8*3)(AX)

	LDacc (x)
	LDt (zsqr)
	CALL c256SubInternal(SB) // x - zsqr
	LDt (m)
	CALL c256MulInternal(SB) // (x - zsqr) * (x + zsqr)
	ST (m)

	// Multiply by 3: t = 2*m, then t = 2*m + m.
	c256MulBy2Inline
	LDacc (m)
	c256AddInline
	STt (m)

	// s = (2y)^2, then acc = (2y)^4.
	LDacc (y)
	c256MulBy2Inline
	t2acc
	CALL c256SqrInternal(SB)
	ST (s)
	CALL c256SqrInternal(SB)

	// Divide by 2 modulo p: when the value is odd, add p first (carry kept
	// in mul0), then shift the whole quantity right by one bit.
	XORQ mul0, mul0
	acc2t

	ADDQ $-1, acc4
	ADCQ c256const1<>(SB), acc5
	ADCQ $-1, acc6
	ADCQ c256const3<>(SB), acc7
	ADCQ $0, mul0
	TESTQ $1, t0

	// Even input: discard the addition of p.
	CMOVQEQ t0, acc4
	CMOVQEQ t1, acc5
	CMOVQEQ t2, acc6
	CMOVQEQ t3, acc7
	// Keep the carry only when the input was odd.
	ANDQ t0, mul0

	SHRQ $1, acc5, acc4
	SHRQ $1, acc6, acc5
	SHRQ $1, acc7, acc6
	SHRQ $1, mul0, acc7
	ST (y) // the y slot now holds (2y)^4 / 2

	LDacc (x)
	LDt (s)
	CALL c256MulInternal(SB) // s = x * (2y)^2
	ST (s)
	c256MulBy2Inline // t = 2 * s
	STt (tmp)

	LDacc (m)
	CALL c256SqrInternal(SB) // m^2
	LDt (tmp)
	CALL c256SubInternal(SB) // x3 = m^2 - 2*s

	// Store x3.
	MOVQ rptr, AX
	MOVQ acc4, (16*0 + 8*0)(AX)
	MOVQ acc5, (16*0 + 8*1)(AX)
	MOVQ acc6, (16*0 + 8*2)(AX)
	MOVQ acc7, (16*0 + 8*3)(AX)

	acc2t // t = x3
	LDacc (s)
	CALL c256SubInternal(SB) // s - x3

	LDt (m)
	CALL c256MulInternal(SB) // m * (s - x3)

	LDt (y)
	CALL c256SubInternal(SB) // y3 = m * (s - x3) - (2y)^4 / 2

	// Store y3 and clear the saved pointer slot.
	MOVQ rptr, AX
	MOVQ acc4, (16*2 + 8*0)(AX)
	MOVQ acc5, (16*2 + 8*1)(AX)
	MOVQ acc6, (16*2 + 8*2)(AX)
	MOVQ acc7, (16*2 + 8*3)(AX)
	MOVQ $0, rptr

	RET
|
||
/* ---------------------------------------*/
|
||
|