// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file contains the Go wrapper for the constant-time, 64-bit assembly
// implementation of P256. The optimizations performed here are described in
// detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
//                          256-bit primes"
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

//go:build (arm64 || amd64) && !generic && !generic32 && !generic64
// +build arm64 amd64
// +build !generic
// +build !generic32
// +build !generic64

package ec256

import (
	"crypto/elliptic"
	"math/big"
)

// All 256-bit constants below are split into four 64-bit limbs. Limb 0 is
// the first element of the slices built from them (see rr, RR and basePoint),
// which matches the least-significant-word-first layout produced by fromBig.
const (
	// Montgomery form of one: 1*R mod p.
	montOne0 = 0x0000000000000001
	montOne1 = 0x00000000ffffffff
	montOne2 = 0x0000000000000000
	montOne3 = 0x0000000100000000

	// Montgomery form of the base point (x, then y).
	montBaseX0 = 0x61328990f418029e
	montBaseX1 = 0x3e7981eddca6c050
	montBaseX2 = 0xd6a1ed99ac24c3c3
	montBaseX3 = 0x91167a5ee1c13b05
	montBaseY0 = 0xc1354e593c2d0ddd
	montBaseY1 = 0xc1f5e5788d3295fa
	montBaseY2 = 0x8d4cfb066e2a48f8
	montBaseY3 = 0x63cd65d481d735bd

	// R*R mod n (n is the group order used by Inverse).
	rrModN0 = 0x901192af7c114f20
	rrModN1 = 0x3464504ade6fa2fa
	rrModN2 = 0x620fc84c3affe0d4
	rrModN3 = 0x1eb5e412a22b3d3b

	// R*R mod p (p is the field prime used by c256Mul).
	rrModP0 = 0x0000000200000003
	rrModP1 = 0x00000002ffffffff
	rrModP2 = 0x0000000100000001
	rrModP3 = 0x0000000400000002
)

// c256Point is the Jacobian representation of a point, with x, y and z held
// in the Montgomery domain: xyz[0:4] is x, xyz[4:8] is y and xyz[8:12] is z.
type c256Point struct {
	xyz [12]uint64
}

var (
	// c256Precomputed is the base-point table filled in by initTable:
	// 43 rows (one per 6-bit window), each holding 32 affine points of
	// 8 words (x then y).
	c256Precomputed *[43][32 * 8]uint64
)

// init builds the base-point table once at package load time.
func init() {
	initTable()
}

// Params returns the elliptic.CurveParams held by curve.
func (curve SM2CurveParam) Params() *elliptic.CurveParams {
	return curve.CurveParams
}

//go:noescape
// func c256Add(res, in1, in2 []uint64)

// Functions implemented in c256_asm_*64.s
// Montgomery multiplication modulo P256
//
//go:noescape
func c256Mul(res, in1, in2 []uint64)

// Montgomery square modulo P256, repeated n times (n >= 1)
//
//go:noescape
func c256Sqr(res, in []uint64, n int)

// Montgomery multiplication by 1, montMul(in, 1)
//
//go:noescape
func c256FromMont(res, in []uint64)

// iff cond != 0 val <- -val
//
//go:noescape
func c256NegCond(val []uint64, cond int)

// if cond == 0 res <- b; else res <- a
//
//go:noescape
func c256MovCond(res, a, b []uint64, cond int)

// Endianness swap: converts 32 big-endian bytes into 4 little-endian uint64
// words.
//
//go:noescape
func c256BigToLittle(res []uint64, in []byte)

// c256LittleToBig is used in this file to serialize a 4-word value into the
// 32 bytes handed to big.Int.SetBytes (which expects big-endian input).
// NOTE(review): presumably the exact inverse of c256BigToLittle — confirm
// against the assembly.
//
//go:noescape
func c256LittleToBig(res []byte, in []uint64)

// Constant time table access
// idx = 0, returns infinity. idx = i > 0, returns table[i-1].
//
//go:noescape
func c256Select(point, table []uint64, idx int)

// c256SelectBase is called in this file with an 8-word destination and one
// row of c256Precomputed.
// NOTE(review): presumably the affine (x, y only) counterpart of c256Select
// with the same idx convention — confirm against the assembly.
//
//go:noescape
func c256SelectBase(point, table []uint64, idx int)

// Montgomery multiplication modulo Ord(G)
//
//go:noescape
func c256OrdMul(res, in1, in2 []uint64)

// Montgomery square modulo Ord(G), repeated n times
//
//go:noescape
func c256OrdSqr(res, in []uint64, n int)

// Point add with in2 being affine point
// If sign == 1 -> in2 = -in2
// If sel == 0 -> res = in1
// if zero == 0 -> res = in2
//
//go:noescape
func c256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)

// Point add. Returns one if the two input points were equal and zero
// otherwise. (Note that, due to the way that the equations work out, some
// representations of ∞ are considered equal to everything by this function.)
//
//go:noescape
func c256PointAddAsm(res, in1, in2 []uint64) int

// Point double
//
//go:noescape
func c256PointDoubleAsm(res, in []uint64)

// c256ToMont writes the Montgomery form of in to res by Montgomery-multiplying
// in with rr (R*R mod p).
func c256ToMont(res, in []uint64) {
	c256Mul(res, in, rr)
}

// Inverse returns k^-1 mod N, where N is c256.N.
//
// in: k = k0 mod N
// out: k0^{-1} mod N
// use montgomery power: k -> k*R -> k^{N-2}*R -> k^{N-2}
// Done - FIXME, need improve
func (curve SM2CurveParam) Inverse(k *big.Int) *big.Int {
	if k.Sign() < 0 {
		// This should never happen.
		k = new(big.Int).Neg(k)
	}

	if k.Cmp(c256.N) >= 0 {
		// This should never happen.
		k = new(big.Int).Mod(k, c256.N)
	}

	// table will store precomputed powers of x. Each entry is a 4-word
	// slice into the same backing array; the name of an entry is the binary
	// form of the exponent it holds once the chain below has run.
	var table [4 * 10]uint64
	var (
		_1      = table[4*0 : 4*1] // 1
		_11     = table[4*1 : 4*2] // 3
		_101    = table[4*2 : 4*3] // 5
		_111    = table[4*3 : 4*4] // 7
		_1111   = table[4*4 : 4*5] // 15
		_10101  = table[4*5 : 4*6] // 21
		_101111 = table[4*6 : 4*7] // 47
		x       = table[4*7 : 4*8] // running accumulator
		t       = table[4*8 : 4*9] // scratch
		s       = table[4*9 : 4*10] // scratch
	)

	fromBig(x[:], k)

	// This code operates in the Montgomery domain where R = 2^256 mod n
	// and n is the order of the scalar field. Elements in the Montgomery
	// domain take the form a×R, and Montgomery multiplication of x and y
	// calculates (x × y × R^-1) mod n. RR is R×R mod n, thus the Montgomery
	// multiplication of x and RR gives x×R, i.e. converts x into the
	// Montgomery domain.
	// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
	RR := []uint64{rrModN0, rrModN1, rrModN2, rrModN3} // sm2-p256

	// FIXME: the ladder need improve
	// SM2-p256:
	// N-2 = 0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54121
	//
	// In the trailing comments an exponent written in binary or hex stands
	// for "k raised to that exponent, in Montgomery form".
	c256OrdMul(_1, x, RR)        // _1
	c256OrdSqr(x, _1, 1)         // x = _10
	c256OrdMul(_11, x, _1)       // _11
	c256OrdMul(_101, x, _11)     // _101
	c256OrdMul(_111, x, _101)    // _111
	c256OrdSqr(x, _101, 1)       // x = _1010
	c256OrdMul(_1111, _101, x)   // _1111
	c256OrdSqr(t, x, 1)          // t = _10100
	c256OrdMul(_10101, t, _1)    // _10101
	c256OrdSqr(x, _10101, 1)     // x = _101010
	c256OrdMul(_101111, _101, x) // _101111 (not referenced by the muls table below)
	c256OrdMul(x, _10101, x)     // x = _111111 = x6
	c256OrdSqr(s, x, 1)          // s = _1111110
	c256OrdMul(s, s, _1)         // s = x7
	c256OrdSqr(x, s, 1)          // x = _11111110 = 0xfe
	c256OrdMul(s, x, _1)         // s = x8 = 0xff
	c256OrdSqr(t, s, 8)          // t = ff00
	c256OrdMul(x, t, x)          // x = fffe
	c256OrdMul(s, t, s)          // s = ffff
	c256OrdSqr(t, s, 16)         // t = ffff0000
	c256OrdMul(x, t, x)          // x = fffffffe
	c256OrdMul(t, x, _1)         // t = ffffffff

	c256OrdSqr(x, x, 32) // x = fffffffe00000000
	c256OrdMul(x, x, t)  // x = fffffffeffffffff
	c256OrdSqr(x, x, 32) // x = fffffffeffffffff00000000
	c256OrdMul(x, x, t)  // x = fffffffeffffffffffffffff
	c256OrdSqr(x, x, 32) // x = fffffffeffffffffffffffff00000000
	c256OrdMul(x, x, t)  // x = fffffffeffffffffffffffffffffffff

	// The low 128 bits of N-2 are consumed window by window:
	// 7203df6b21c6052b53bbf40939d54121 =
	// 01110010000000111101111101101011001000011100011000000101001010110101001110111011111101000000100100111001110101010100000100100001 =
	// 0111 001 00000001111 01111 101
	// 101 011 001 0000111 00011
	// 000000101 0010101 10101 00111 0111
	// 011 1111 01 0000001 001
	// 00111 00111 010101 01 000001
	// 001 00001
	//
	// sqrs[i] is the width of window i (number of squarings) and muls[i] is
	// the precomputed power matching the window's value.
	sqrs := []uint8{
		4, 3, 11, 5, 3, 3, 3, 3, 7, 5,
		9, 7, 5, 5, 4, 3, 4, 2, 7, 3,
		5, 5, 6, 2, 6, 3, 5,
	}
	muls := [][]uint64{
		_111, _1, _1111, _1111, _101, _101, _11, _1, _111, _11,
		_101, _10101, _10101, _111, _111, _11, _1111, _1, _1, _1,
		_111, _111, _10101, _1, _1, _1, _1,
	}

	// The loop variable s shadows the scratch slice s declared above; the
	// scratch slice is no longer needed at this point.
	for i, s := range sqrs {
		c256OrdSqr(x, x, int(s))
		c256OrdMul(x, x, muls[i])
	}

	// Multiplying by one in the Montgomery domain converts a Montgomery
	// value out of the domain.
	one := []uint64{1, 0, 0, 0}
	c256OrdMul(x, x, one)

	xOut := make([]byte, 32)
	c256LittleToBig(xOut, x)
	return new(big.Int).SetBytes(xOut)
}

// fromBig converts a *big.Int into a format used by this code: out is zeroed
// and then filled with the words of big.Bits(), least-significant word first.
// It indexes out by word position, so it panics if big has more words than
// len(out); callers in this file reduce their inputs first. The uint64
// conversion is lossless on the 64-bit targets selected by the build tags.
func fromBig(out []uint64, big *big.Int) {
	for i := range out {
		out[i] = 0
	}

	for i, v := range big.Bits() {
		out[i] = uint64(v)
	}
}

// c256GetScalar endian-swaps the big-endian scalar value from in and writes it
// to out. If the scalar is equal or greater than the order of the group, it's
// reduced modulo that order.
func c256GetScalar(out []uint64, in []byte) {
	n := new(big.Int).SetBytes(in)

	if n.Cmp(c256.N) >= 0 {
		n.Mod(n, c256.N)
	}
	fromBig(out, n)
}

// c256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
// underlying field of the curve. Thus rr here is R×R mod p. See the comment in
// Inverse about how such a value is used to enter the Montgomery domain.
var rr = []uint64{rrModP0, rrModP1, rrModP2, rrModP3} // changed to sm2

// maybeReduceModP returns in itself when in < c256.P (the common case) and a
// newly allocated in mod c256.P otherwise. The argument is never modified.
func maybeReduceModP(in *big.Int) *big.Int {
	if in.Cmp(c256.P) < 0 {
		return in
	}
	return new(big.Int).Mod(in, c256.P)
}

// CombinedMult returns the affine coordinates of
// [baseScalar]G + [scalar](bigX, bigY), where G is the base point handled by
// c256BaseMult.
//
// Both scalars are big-endian byte strings and are reduced modulo the group
// order by c256GetScalar. The final selection between the sum, the doubled
// point and the single operands is done with masked copies rather than
// branches.
func CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
	scalarReversed := make([]uint64, 4)
	var r1, r2 c256Point

	// r1 = [baseScalar]G
	c256GetScalar(scalarReversed, baseScalar)
	r1IsInfinity := scalarIsZero(scalarReversed)
	r1.c256BaseMult(scalarReversed)

	// r2 = [scalar](bigX, bigY)
	c256GetScalar(scalarReversed, scalar)
	r2IsInfinity := scalarIsZero(scalarReversed)
	r2.c256PointFromAffine(bigX, bigY)
	r2.c256ScalarMult(scalarReversed)

	// The generic addition reports (rather than handles) equal inputs, so
	// the doubling is always computed and selected when needed. A zero
	// scalar means the corresponding operand is the point at infinity, in
	// which case the other operand is the answer.
	var sum, double c256Point
	pointsEqual := c256PointAddAsm(sum.xyz[:], r1.xyz[:], r2.xyz[:])
	c256PointDoubleAsm(double.xyz[:], r1.xyz[:])
	sum.CopyConditional(&double, pointsEqual)
	sum.CopyConditional(&r1, r2IsInfinity)
	sum.CopyConditional(&r2, r1IsInfinity)

	return sum.c256PointToAffine()
}

// ScalarBaseMult returns the affine coordinates of [scalar]G, using the
// precomputed base-point table. scalar is a big-endian byte string and is
// reduced modulo the group order by c256GetScalar.
func (curve SM2CurveParam) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
	scalarReversed := make([]uint64, 4)
	c256GetScalar(scalarReversed, scalar)

	var r c256Point
	r.c256BaseMult(scalarReversed)
	return r.c256PointToAffine()
}

// ScalarMult returns the affine coordinates of [scalar](bigX, bigY). scalar
// is a big-endian byte string and is reduced modulo the group order by
// c256GetScalar; the coordinates are reduced modulo p if necessary.
func (curve SM2CurveParam) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
	scalarReversed := make([]uint64, 4)
	c256GetScalar(scalarReversed, scalar)

	// Load (bigX, bigY) as a Jacobian point with Z = 1, all in Montgomery
	// form. This is the same conversion CombinedMult, Add and Double use.
	var r c256Point
	r.c256PointFromAffine(bigX, bigY)

	r.c256ScalarMult(scalarReversed)
	return r.c256PointToAffine()
}

// Add returns the affine coordinates of (x1, y1) + (x2, y2).
//
// The function branches on whether the two inputs are equal, so it is not
// constant time; in most cases the two input points differ and the timing
// difference was deliberately accepted by the original author.
//
// NOTE(review): inputs are always loaded with Z = 1, so a caller encoding the
// point at infinity as (0, 0) is not special-cased here — confirm that no
// caller relies on that.
func (curve SM2CurveParam) Add(x1, y1, x2, y2 *big.Int) (x, y *big.Int) {
	var r1, r2 c256Point
	r1.c256PointFromAffine(x1, y1)
	r2.c256PointFromAffine(x2, y2)

	// c256PointAddAsm returns 1 when its inputs are equal, in which case its
	// output is not the sum and the result must come from a doubling. r1 has
	// been overwritten by then, so the still-intact r2 (equal to the
	// original r1) is doubled instead.
	if c256PointAddAsm(r1.xyz[:], r1.xyz[:], r2.xyz[:]) == 1 {
		c256PointDoubleAsm(r1.xyz[:], r2.xyz[:])
	}
	return r1.c256PointToAffine()
}

// Double returns the affine coordinates of 2*(x1, y1).
func (curve SM2CurveParam) Double(x1, y1 *big.Int) (x, y *big.Int) {
	var r c256Point
	r.c256PointFromAffine(x1, y1)
	c256PointDoubleAsm(r.xyz[:], r.xyz[:])
	return r.c256PointToAffine()
}

// uint64IsZero returns 1 if x is zero and zero otherwise.
//
// It is branch-free: x is inverted and the halves are repeatedly ANDed
// together, so bit 0 ends up set only if every bit of the original x was
// clear.
func uint64IsZero(x uint64) int {
	x = ^x
	x &= x >> 32
	x &= x >> 16
	x &= x >> 8
	x &= x >> 4
	x &= x >> 2
	x &= x >> 1
	return int(x & 1)
}

// scalarIsZero returns 1 if scalar represents the zero value, and zero
// otherwise. scalar must hold at least four words; only the first four are
// examined.
func scalarIsZero(scalar []uint64) int {
	return uint64IsZero(scalar[0] | scalar[1] | scalar[2] | scalar[3])
}

// c256PointFromAffine loads the affine point (x, y) into p as a Jacobian
// point in the Montgomery domain. The coordinates are reduced modulo p if
// necessary, multiplied by rr to enter the Montgomery domain, and Z is set to
// the Montgomery form of one.
func (p *c256Point) c256PointFromAffine(x, y *big.Int) {
	xyz := p.xyz[:]
	fromBig(xyz[0:4], maybeReduceModP(x))
	fromBig(xyz[4:8], maybeReduceModP(y))
	c256Mul(xyz[0:4], xyz[0:4], rr[:])
	c256Mul(xyz[4:8], xyz[4:8], rr[:])

	xyz[8] = montOne0
	xyz[9] = montOne1
	xyz[10] = montOne2
	xyz[11] = montOne3
}

// c256PointToAffine converts the Jacobian, Montgomery-domain point p to
// affine big.Int coordinates: x = X * Z^-2 and y = Y * Z^-3, both taken out of
// the Montgomery domain. p itself is not modified.
func (p *c256Point) c256PointToAffine() (x, y *big.Int) {
	zInv := make([]uint64, 4)
	zInvSq := make([]uint64, 4)
	c256Inverse(zInv, p.xyz[8:12])
	c256Sqr(zInvSq, zInv, 1)    // zInvSq = Z^-2
	c256Mul(zInv, zInv, zInvSq) // zInv = Z^-3

	// The scratch slices are reused to hold the affine coordinates.
	c256Mul(zInvSq, p.xyz[0:4], zInvSq) // x, Montgomery form
	c256Mul(zInv, p.xyz[4:8], zInv)     // y, Montgomery form

	c256FromMont(zInvSq, zInvSq)
	c256FromMont(zInv, zInv)

	xOut := make([]byte, 32)
	yOut := make([]byte, 32)
	c256LittleToBig(xOut, zInvSq)
	c256LittleToBig(yOut, zInv)

	return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut)
}

// CopyConditional overwrites p with src if v == 1, and leaves p
// unchanged if v == 0.
func (p *c256Point) CopyConditional(src *c256Point, v int) {
	// v == 1 gives pMask = 0 and srcMask = all ones (take src);
	// v == 0 gives pMask = all ones and srcMask = 0 (keep p).
	// The selection is done with masks so no branch depends on v.
	pMask := uint64(v) - 1
	srcMask := ^pMask

	for i, n := range p.xyz {
		p.xyz[i] = (n & pMask) | (src.xyz[i] & srcMask)
	}
}

// c256Inverse sets out to in^-1 mod p.
// in*R => in^{-1} * R = mont_power(in*R, p-2)
// Tested Done
//
// Both in and out are 4-word field elements in Montgomery form. Two addition
// chains for the exponent p-2 are kept in the source; the first one is
// disabled by the constant condition and only the second one runs.
func c256Inverse(out, in []uint64) {
	if false {
		// Disabled chain, kept for reference. In the trailing comments
		// "e*p" stands for in raised to the hex/binary exponent e.
		var stack [8 * 4]uint64
		p2 := stack[4*0 : 4*0+4]
		p4 := stack[4*1 : 4*1+4]
		p8 := stack[4*2 : 4*2+4]
		p16 := stack[4*3 : 4*3+4]
		p32 := stack[4*4 : 4*4+4]
		p28e := stack[4*5 : 4*6] // fffffffe
		p28c := stack[4*6 : 4*7] // fffffffc
		t := stack[4*7 : 4*8]
		// Target exponent p-2:
		// 0xfffffffe ffffffff ffffffff ffffffff ffffffff 00000000 ffffffff fffffffd
		c256Sqr(p28e, in, 1)  // 10*p (p28e is used as scratch here)
		c256Mul(p2, p28e, in) // 11*p

		c256Sqr(t, p2, 2)  // 1100*p
		c256Mul(p4, t, p2) // f*p

		c256Sqr(t, p4, 4)  // f0*p
		c256Mul(p8, t, p4) // ff*p

		c256Sqr(t, p8, 8)   // ff00*p
		c256Mul(p16, t, p8) // ffff*p

		c256Sqr(t, p16, 8) // ffff00*p
		c256Mul(t, t, p8)  // ffffff*p
		c256Sqr(t, t, 4)   // ffffff0*p
		c256Mul(t, t, p4)  // fffffff*p
		c256Sqr(t, t, 2)   // fffffff_(00)*p
		c256Mul(t, t, p2)  // fffffff_(11)*p

		c256Sqr(p28c, t, 2)       // fffffffc*p
		c256Mul(p28e, p28e, p28c) // fffffffe*p (10*p times fffffffc*p)
		c256Mul(p32, p28e, in)    // ffffffff*p

		c256Sqr(t, p28e, 32)
		c256Mul(t, t, p32) // fffffffe ffffffff
		c256Sqr(t, t, 32)
		c256Mul(t, t, p32) // fffffffe ffffffff ffffffff
		c256Sqr(t, t, 32)
		c256Mul(t, t, p32) // fffffffe ffffffff ffffffff ffffffff
		c256Sqr(t, t, 32)
		c256Mul(t, t, p32) // fffffffe ffffffff ffffffff ffffffff ffffffff
		c256Sqr(t, t, 64)
		c256Mul(t, t, p32) // fffffffe ffffffff ffffffff ffffffff ffffffff 00000000 ffffffff
		c256Sqr(t, t, 32)
		c256Mul(t, t, p28c) // fffffffe ffffffff ffffffff ffffffff ffffffff 00000000 ffffffff fffffffc
		c256Mul(out, t, in) // fffffffe ffffffff ffffffff ffffffff ffffffff 00000000 ffffffff fffffffd
		// total 255 sqr + 16 mul
	} else {
		// Active chain. Names follow the addition-chain convention used
		// in the trailing comments: _b is the exponent with binary digits
		// b, xN is the exponent made of N one bits, and iN is an
		// intermediate value. "a<<n" is n squarings and "a + b" is one
		// multiplication.
		var stack [17 * 4]uint64
		_10 := stack[4*0 : 4*0+4]
		_11 := stack[4*1 : 4*1+4]
		_110 := stack[4*2 : 4*2+4]
		_111 := stack[4*3 : 4*3+4]
		_111000 := stack[4*4 : 4*4+4]
		_111111 := stack[4*5 : 4*6]
		_1111110 := stack[4*6 : 4*7]
		_1111111 := stack[4*7 : 4*8]
		x12 := stack[4*8 : 4*9] // _111111111111
		x24 := stack[4*9 : 4*10]
		x31 := stack[4*10 : 4*11]
		i39 := stack[4*11 : 4*12]
		i68 := stack[4*12 : 4*13]
		x62 := stack[4*13 : 4*14]
		i71 := stack[4*14 : 4*15]
		x64 := stack[4*15 : 4*16]
		i265 := stack[4*16 : 4*17]

		c256Sqr(_10, in, 1)             // _10 = 2 * 1
		c256Mul(_11, _10, in)           // _11 = 1 + _10
		c256Sqr(_110, _11, 1)           // _110 = 2 * _11
		c256Mul(_111, _110, in)         // _111 = 1 + _110
		c256Sqr(_111000, _111, 3)       // _111000 = _111 << 3
		c256Mul(_111111, _111, _111000) // _111111 = _111 + _111000
		c256Sqr(_1111110, _111111, 1)   // _1111110 = 2 * _111111
		c256Mul(_1111111, _1111110, in) // _1111111 = 1 + _1111110

		// x12 = _1111110<<5 + _111111
		c256Sqr(x12, _1111110, 5)
		c256Mul(x12, x12, _111111)

		// x24 = x12<<12 + x12
		c256Sqr(x24, x12, 12)
		c256Mul(x24, x24, x12)

		// x31 = x24<<7 + _1111111
		c256Sqr(x31, x24, 7)
		c256Mul(x31, x31, _1111111)

		c256Sqr(i39, x31, 2)   // i39 = x31 << 2
		c256Sqr(i68, i39, 29)  // i68 = i39 << 29
		c256Mul(x62, x31, i68) // x62 = x31 + i68
		c256Sqr(i71, i68, 2)   // i71 = i68 << 2

		// x64 = i39 + i71 + _11
		c256Mul(x64, i39, i71)
		c256Mul(x64, x64, _11)

		// i265 = ((i71<<32 + x64)<<64 + x64) << 94
		c256Sqr(i265, i71, 32)
		c256Mul(i265, i265, x64)
		c256Sqr(i265, i265, 64)
		c256Mul(i265, i265, x64)
		c256Sqr(i265, i265, 94)

		// return (x62 + i265)<<2 + 1
		c256Mul(i265, i265, x62)
		c256Sqr(i265, i265, 2)
		c256Mul(out, i265, in)
		// 255 sqr + 14 mul
	}
}

// c256StorePoint copies the 12 words of p into slot index of the table r
// (slot i occupies r[i*12 : i*12+12]).
func (p *c256Point) c256StorePoint(r *[16 * 4 * 3]uint64, index int) {
	copy(r[index*12:], p.xyz[:])
}

// boothW5 recodes a 6-bit window value (5 window bits plus, as the lowest
// bit, the top bit of the previous window) into a table index and a sign.
// The first result is the index in 0..16 and the second is 1 when the
// selected point must be negated. The computation is branch-free: s is an
// all-ones mask when bit 5 of in is set and zero otherwise.
func boothW5(in uint) (int, int) {
	var s uint = ^((in >> 5) - 1)
	var d uint = (1 << 6) - in - 1
	d = (d & s) | (in & (^s))
	d = (d >> 1) + (d & 1)
	return int(d), int(s & 1)
}

/*
boothW6 is the 6-bit-window counterpart of boothW5.
Only the low 7 bits of in are significant: i0,i1,i2,...,i6.
*/
func boothW6(in uint) (int, int) {
	if true {
		// Branch-free form, same structure as boothW5 with one more bit.
		var s uint = ^((in >> 6) - 1)
		var d uint = (1 << 7) - in - 1
		d = (d & s) | (in & (^s))
		d = (d >> 1) + (d & 1)
		return int(d), int(s & 1)
	} else {
		// Disabled, branching form of the same recoding, kept for
		// reference.
		//
		var sel, sign uint = 0, 0
		in = in & 0x7f // Keep only the low 7 bits; the lowest bit is the top bit of the previous window.
		// sign is the 7th bit.
		if (in >> 6) == 1 {
			sign = 1
		} else {
			sign = 0
		}
		if sign == 1 {
			sel = in >> 1
			sel = (^sel) & 0x3f
			sel++
			if in&1 == 1 {
				sel--
			}
		} else {
			sel = (in + 1) >> 1
		}
		return int(sel), int(sign)
	}
}

// initTable allocates and fills c256Precomputed, the table used by
// c256BaseMult.
func initTable() {
	/*
		Entry j of row i (8 words: x then y, affine, Montgomery form) holds
		[2^{6i}]·[j+1]G:

		         j=0            j=1                j=2          ...  j=31
		i=0      G              [2]G               [3]G         ...  [32]G
		i=1      [2^{6*1}]G     [2^{6*1}][2]G      ...
		i=2      [2^{6*2}]G     [2^{6*2}][2]G      ...
		·························
		i=42     [2^{6*42}]G    ...

		Expressed as multiples of G:
		    1          2          3       ...  32
		    64         64*2       64*3    ...  64*32
		    64*64      64*64*2    ...
		    (43 rows of 32 entries)
	*/
	c256Precomputed = new([43][32 * 8]uint64)

	basePoint := []uint64{
		montBaseX0, montBaseX1, montBaseX2, montBaseX3,
		montBaseY0, montBaseY1, montBaseY2, montBaseY3,
		montOne0, montOne1, montOne2, montOne3,
	}
	// t2 walks through G, 2G, 3G, ... (one step per column j); t1 is the
	// working copy that is repeatedly multiplied by 2^6 down the rows.
	t1 := make([]uint64, 12)
	t2 := make([]uint64, 12)
	copy(t2, basePoint)

	zInv := make([]uint64, 4)
	zInvSq := make([]uint64, 4)
	for j := 0; j < 32; j++ {
		copy(t1, t2)
		for i := 0; i < 43; i++ {
			// The window size is 6 so we need to double 6 times.
			if i != 0 {
				for k := 0; k < 6; k++ {
					c256PointDoubleAsm(t1, t1)
				}
			}
			// Convert the point to affine form. (Its values are
			// still in Montgomery form however.)
			c256Inverse(zInv, t1[8:12])
			c256Sqr(zInvSq, zInv, 1)
			c256Mul(zInv, zInv, zInvSq)

			c256Mul(t1[:4], t1[:4], zInvSq)
			c256Mul(t1[4:8], t1[4:8], zInv)

			// Z becomes one again so t1 stays a valid Jacobian point
			// for the next round of doublings.
			copy(t1[8:12], basePoint[8:12])
			// Update the table entry
			copy(c256Precomputed[i][j*8:], t1[:8])
		}
		// Advance t2 to the next multiple of G. The generic addition
		// cannot add a point to itself, hence the doubling for G + G.
		if j == 0 {
			c256PointDoubleAsm(t2, basePoint)
		} else {
			c256PointAddAsm(t2, t2, basePoint)
		}
	}
}

// c256SelectBaseOfGo is a plain-Go stand-in for c256SelectBase: for idx > 0
// it copies the 8 words of table entry idx-1 into point. Unlike the
// documented contract of the assembly selectors, it leaves point untouched
// when idx == 0, and its memory access pattern depends on idx. The assembly
// path is disabled by the constant condition.
// NOTE(review): no caller is visible in this file — confirm whether it is
// still needed.
func c256SelectBaseOfGo(point, table []uint64, idx int) {
	if false {
		c256SelectBase(point, table, idx)
		return
	} else {
		if idx == 0 {
			return
		}
		copy(point[:8], table[8*(idx-1):])
	}
}

// c256BaseMult sets p to [scalar]G using the precomputed table. scalar is
// four 64-bit words, least-significant first. The scalar is consumed in 43
// Booth-recoded windows of 6 bits; each window reads 7 bits because it
// overlaps the previous window by one bit.
func (p *c256Point) c256BaseMult(scalar []uint64) {
	// First window: bits 0..5, with an implicit zero bit below bit 0.
	wvalue := (scalar[0] << 1) & 0x7f
	sel, sign := boothW6(uint(wvalue))
	c256SelectBase(p.xyz[0:8], c256Precomputed[0][0:], sel)
	c256NegCond(p.xyz[4:8], sign)

	// (This is one, in the Montgomery domain.)
	p.xyz[8] = montOne0
	p.xyz[9] = montOne1
	p.xyz[10] = montOne2
	p.xyz[11] = montOne3

	var t0 c256Point
	// (This is one, in the Montgomery domain.)
	t0.xyz[8] = montOne0
	t0.xyz[9] = montOne1
	t0.xyz[10] = montOne2
	t0.xyz[11] = montOne3

	// 191 = 6*31 + 5
	index := uint(5)
	// zero stays 0 until some window has selected a non-zero table entry,
	// i.e. while the accumulator p is still the point at infinity.
	zero := sel

	for i := 1; i < 43; i++ {
		// Below bit 192 a window may straddle two words, so bits from the
		// next word are shifted in; in the top word there is no next word.
		if index < 192 {
			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
		} else {
			wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
		}
		index += 6
		sel, sign = boothW6(uint(wvalue))
		c256SelectBase(t0.xyz[0:8], c256Precomputed[i][0:], sel)
		c256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero)
		zero |= sel
	}
}

// c256ScalarMult sets p to [scalar]p. scalar is four 64-bit words,
// least-significant first. It builds a 16-entry table of small multiples of
// p and then scans the scalar from the top in Booth-recoded 5-bit windows.
func (p *c256Point) c256ScalarMult(scalar []uint64) {
	// precomp is a table of precomputed points that stores the multiples
	// of p from 1*p to 16*p; slot i holds (i+1)*p.
	var precomp [16 * 4 * 3]uint64
	var t0, t1, t2, t3 c256Point

	// Prepare the table. The trailing comment on each store is the
	// multiple of p being stored.
	p.c256StorePoint(&precomp, 0) // 1

	c256PointDoubleAsm(t0.xyz[:], p.xyz[:])
	c256PointDoubleAsm(t1.xyz[:], t0.xyz[:])
	c256PointDoubleAsm(t2.xyz[:], t1.xyz[:])
	c256PointDoubleAsm(t3.xyz[:], t2.xyz[:])
	t0.c256StorePoint(&precomp, 1)  // 2
	t1.c256StorePoint(&precomp, 3)  // 4
	t2.c256StorePoint(&precomp, 7)  // 8
	t3.c256StorePoint(&precomp, 15) // 16

	c256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
	c256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
	c256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
	t0.c256StorePoint(&precomp, 2) // 3
	t1.c256StorePoint(&precomp, 4) // 5
	t2.c256StorePoint(&precomp, 8) // 9

	c256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
	c256PointDoubleAsm(t1.xyz[:], t1.xyz[:])
	t0.c256StorePoint(&precomp, 5) // 6
	t1.c256StorePoint(&precomp, 9) // 10

	c256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:])
	c256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
	t2.c256StorePoint(&precomp, 6)  // 7
	t1.c256StorePoint(&precomp, 10) // 11

	c256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
	c256PointDoubleAsm(t2.xyz[:], t2.xyz[:])
	t0.c256StorePoint(&precomp, 11) // 12
	t2.c256StorePoint(&precomp, 13) // 14

	c256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
	c256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
	t0.c256StorePoint(&precomp, 12) // 13
	t2.c256StorePoint(&precomp, 14) // 15

	// Start scanning the window from top bit
	index := uint(254)
	var sel, sign int

	// The top window is taken without a sign: its recoded sign is
	// discarded and the selected entry is loaded directly into p.
	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
	sel, _ = boothW5(uint(wvalue))

	c256Select(p.xyz[0:12], precomp[0:], sel)
	// zero stays 0 until some window has selected a non-zero table entry,
	// i.e. while the accumulator p is still the point at infinity.
	zero := sel

	for index > 4 {
		index -= 5
		// Shift the accumulator up by one 5-bit window.
		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
		c256PointDoubleAsm(p.xyz[:], p.xyz[:])

		// Below bit 192 a window may straddle two words, so bits from the
		// next word are shifted in; in the top word there is no next word.
		if index < 192 {
			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
		} else {
			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
		}

		sel, sign = boothW5(uint(wvalue))

		c256Select(t0.xyz[0:], precomp[0:], sel)
		c256NegCond(t0.xyz[4:8], sign)
		c256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
		// sel == 0: nothing to add, keep p. zero == 0: p was still
		// infinity, so the result is the selected point itself.
		c256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
		c256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
		zero |= sel
	}

	// Last window: bits 0..3, with an implicit zero bit below bit 0.
	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
	c256PointDoubleAsm(p.xyz[:], p.xyz[:])

	wvalue = (scalar[0] << 1) & 0x3f
	sel, sign = boothW5(uint(wvalue))

	c256Select(t0.xyz[0:], precomp[0:], sel)
	c256NegCond(t0.xyz[4:8], sign)
	c256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
	c256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
	c256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
}