init: v1.0.0

2026-05-27 23:03:00 +08:00
commit 8d97f750eb
466 changed files with 80067 additions and 0 deletions
@@ -0,0 +1,814 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains the Go wrapper for the constant-time, 64-bit assembly
+// implementation of P256. The optimizations performed here are described in
+// detail in:
+// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
+//                          256-bit primes"
+// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
+// https://eprint.iacr.org/2013/816.pdf
+
+//go:build (arm64 || amd64) && !generic && !generic32 && !generic64
+// +build arm64 amd64
+// +build !generic
+// +build !generic32
+// +build !generic64
+
+package ec256
+
+import (
+	"crypto/elliptic"
+	"math/big"
+)
+
+// Curve constants used by the assembly backend. Every 256-bit value is split
+// into four 64-bit limbs, numbered from the least significant limb (0) to the
+// most significant limb (3).
+const (
+	// Montgomery form of one: 1*R mod p.
+	montOne0 = 0x0000000000000001
+	montOne1 = 0x00000000ffffffff
+	montOne2 = 0x0000000000000000
+	montOne3 = 0x0000000100000000
+
+	// Montgomery form of the base point, X coordinate.
+	montBaseX0 = 0x61328990f418029e
+	montBaseX1 = 0x3e7981eddca6c050
+	montBaseX2 = 0xd6a1ed99ac24c3c3
+	montBaseX3 = 0x91167a5ee1c13b05
+
+	// Montgomery form of the base point, Y coordinate.
+	montBaseY0 = 0xc1354e593c2d0ddd
+	montBaseY1 = 0xc1f5e5788d3295fa
+	montBaseY2 = 0x8d4cfb066e2a48f8
+	montBaseY3 = 0x63cd65d481d735bd
+
+	// R*R mod n; Inverse multiplies by this value to move a scalar into the
+	// Montgomery domain modulo n.
+	rrModN0 = 0x901192af7c114f20
+	rrModN1 = 0x3464504ade6fa2fa
+	rrModN2 = 0x620fc84c3affe0d4
+	rrModN3 = 0x1eb5e412a22b3d3b
+
+	// R*R mod p; collected in rr and multiplied in to move a field element
+	// into the Montgomery domain modulo p.
+	rrModP0 = 0x0000000200000003
+	rrModP1 = 0x00000002ffffffff
+	rrModP2 = 0x0000000100000001
+	rrModP3 = 0x0000000400000002
+)
+
+// c256Point is a point in Jacobian coordinates whose X, Y and Z values are
+// kept in the Montgomery domain. xyz[0:4] holds X, xyz[4:8] holds Y and
+// xyz[8:12] holds Z, each as four little-endian 64-bit limbs.
+type c256Point struct {
+	xyz [12]uint64
+}
+
+var (
+	// c256Precomputed is the base-point table filled in by initTable: 43
+	// rows of 32 affine points, each point stored as 8 limbs (X then Y).
+	c256Precomputed *[43][32 * 8]uint64
+)
+
+// init builds the precomputed base-point table when the package is loaded.
+func init() {
+	initTable()
+}
+
+// Params returns the CurveParams value carried by curve.
+func (curve SM2CurveParam) Params() *elliptic.CurveParams {
+	return curve.CurveParams
+}
+
+//go:noescape
+// func c256Add(res, in1, in2 []uint64)
+
+// The functions below are implemented in c256_asm_*64.s.
+
+// c256Mul sets res to the Montgomery product of in1 and in2 modulo the field
+// prime p.
+//
+//go:noescape
+func c256Mul(res, in1, in2 []uint64)
+
+// c256Sqr sets res to the Montgomery square of in modulo the field prime p,
+// repeated n times (n >= 1).
+//
+//go:noescape
+func c256Sqr(res, in []uint64, n int)
+
+// c256FromMont takes in out of the Montgomery domain by multiplying it by
+// one: res = montMul(in, 1).
+//
+//go:noescape
+func c256FromMont(res, in []uint64)
+
+// c256NegCond negates val in place iff cond != 0.
+//
+//go:noescape
+func c256NegCond(val []uint64, cond int)
+
+// c256MovCond sets res to b if cond == 0 and to a otherwise.
+//
+//go:noescape
+func c256MovCond(res, a, b []uint64, cond int)
+
+// c256BigToLittle performs an endianness swap: it converts 32 bytes in
+// big-endian order into four uint64 limbs in little-endian order.
+//
+//go:noescape
+func c256BigToLittle(res []uint64, in []byte)
+
+// c256LittleToBig writes the limbs of in to res as bytes. Callers in this
+// file pass four limbs and a 32-byte res, then read res as a big-endian
+// integer. NOTE(review): presumably the exact inverse of c256BigToLittle;
+// confirm against the assembly.
+//
+//go:noescape
+func c256LittleToBig(res []byte, in []uint64)
+
+// c256Select is a constant-time table access.
+// idx = 0 returns infinity; idx = i > 0 returns table[i-1].
+//
+//go:noescape
+func c256Select(point, table []uint64, idx int)
+
+// c256SelectBase is the table access used with rows of c256Precomputed, whose
+// entries are 8 limbs (affine X and Y) each. NOTE(review): presumably follows
+// the same idx convention as c256Select; confirm against the assembly.
+//
+//go:noescape
+func c256SelectBase(point, table []uint64, idx int)
+
+// c256OrdMul sets res to the Montgomery product of in1 and in2 modulo Ord(G),
+// the order of the base point.
+//
+//go:noescape
+func c256OrdMul(res, in1, in2 []uint64)
+
+// c256OrdSqr sets res to the Montgomery square of in modulo Ord(G), repeated
+// n times.
+//
+//go:noescape
+func c256OrdSqr(res, in []uint64, n int)
+
+// c256PointAddAffineAsm adds the affine point in2 to in1.
+// If sign == 1 -> in2 = -in2
+// If sel == 0 -> res = in1
+// If zero == 0 -> res = in2
+//
+//go:noescape
+func c256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
+
+// c256PointAddAsm adds two points. It returns one if the two input points
+// were equal and zero otherwise. (Note that, due to the way that the
+// equations work out, some representations of ∞ are considered equal to
+// everything by this function.)
+//
+//go:noescape
+func c256PointAddAsm(res, in1, in2 []uint64) int
+
+// c256PointDoubleAsm sets res to the double of the point in.
+//
+//go:noescape
+func c256PointDoubleAsm(res, in []uint64)
+
+// c256ToMont converts in into the Montgomery domain modulo p by
+// Montgomery-multiplying it with rr (R*R mod p), and stores the result in res.
+func c256ToMont(res, in []uint64) {
+	c256Mul(res, in, rr)
+}
+
+// Inverse returns k^-1 mod N, where N is the order of the base point
+// (c256.N).
+//
+// The inverse is computed as k^(N-2) mod N with a fixed addition chain over
+// Montgomery arithmetic modulo N:
+//
+//	k -> k*R -> k^(N-2)*R -> k^(N-2)
+//
+// k is expected to lie in [0, N). Any other value, including a negative one,
+// is first reduced to its canonical representative modulo N, so the result is
+// always the inverse of k mod N.
+func (curve SM2CurveParam) Inverse(k *big.Int) *big.Int {
+	if k.Sign() < 0 || k.Cmp(c256.N) >= 0 {
+		// This should never happen. big.Int.Mod is the Euclidean modulus, so
+		// the reduced value is in [0, N) even when k is negative; simply
+		// negating k would produce the inverse of |k| instead.
+		k = new(big.Int).Mod(k, c256.N)
+	}
+
+	// table stores the precomputed small powers of k plus three scratch
+	// values, four limbs each.
+	var table [4 * 10]uint64
+	var (
+		_1      = table[4*0 : 4*1] // 1
+		_11     = table[4*1 : 4*2] // 3
+		_101    = table[4*2 : 4*3] // 5
+		_111    = table[4*3 : 4*4] // 7
+		_1111   = table[4*4 : 4*5] // 15
+		_10101  = table[4*5 : 4*6] // 21
+		_101111 = table[4*6 : 4*7] // 47
+		x       = table[4*7 : 4*8]
+		t       = table[4*8 : 4*9]
+		s       = table[4*9 : 4*10]
+	)
+
+	fromBig(x[:], k)
+	// This code operates in the Montgomery domain where R = 2^256 mod n and n
+	// is the order of the scalar field. Elements in the Montgomery domain take
+	// the form a×R, and multiplying x and y in the domain calculates
+	// (x × y × R^-1) mod n. RR is R×R mod n, thus the Montgomery
+	// multiplication of x and RR gives x×R, i.e. converts x into the
+	// Montgomery domain.
+	// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
+	RR := []uint64{rrModN0, rrModN1, rrModN2, rrModN3} // sm2-p256
+
+	// FIXME: the ladder could be improved.
+	// SM2-p256:
+	// N-2 = 0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54121
+	c256OrdMul(_1, x, RR)      // _1
+	c256OrdSqr(x, _1, 1)       // x = _10
+	c256OrdMul(_11, x, _1)     // _11
+	c256OrdMul(_101, x, _11)   // _101
+	c256OrdMul(_111, x, _101)  // _111
+	c256OrdSqr(x, _101, 1)     // x = _1010
+	c256OrdMul(_1111, _101, x) // _1111
+
+	c256OrdSqr(t, x, 1)          // t = _10100
+	c256OrdMul(_10101, t, _1)    // _10101
+	c256OrdSqr(x, _10101, 1)     // x = _101010
+	c256OrdMul(_101111, _101, x) // _101111
+	c256OrdMul(x, _10101, x)     // x = _111111 (six ones)
+	c256OrdSqr(s, x, 1)          // s = _1111110
+	c256OrdMul(s, s, _1)         // s = _1111111 (seven ones)
+	c256OrdSqr(x, s, 1)          // x = _11111110 = 0xfe
+	c256OrdMul(s, x, _1)         // s = 0xff
+	c256OrdSqr(t, s, 8)          // t = 0xff00
+	c256OrdMul(x, t, x)          // x = 0xfffe
+	c256OrdMul(s, t, s)          // s = 0xffff
+	c256OrdSqr(t, s, 16)         // t = 0xffff0000
+	c256OrdMul(x, t, x)          // x = 0xfffffffe
+	c256OrdMul(t, x, _1)         // t = 0xffffffff
+
+	c256OrdSqr(x, x, 32) // x = 0xfffffffe00000000
+	c256OrdMul(x, x, t)  // x = 0xfffffffeffffffff
+	c256OrdSqr(x, x, 32) // x = 0xfffffffeffffffff00000000
+	c256OrdMul(x, x, t)  // x = 0xfffffffeffffffffffffffff
+	c256OrdSqr(x, x, 32) // x = 0xfffffffeffffffffffffffff00000000
+	c256OrdMul(x, x, t)  // x = 0xfffffffeffffffffffffffffffffffff
+
+	// The remaining low 128 bits of N-2 are consumed window by window:
+	// 7203df6b21c6052b53bbf40939d54121 =
+	// 01110010000000111101111101101011001000011100011000000101001010110101001110111011111101000000100100111001110101010100000100100001 =
+	// 0111 001 00000001111 01111 101
+	// 101 011 001 0000111 00011
+	// 000000101 0010101 10101 00111 0111
+	// 011 1111 01 0000001 001
+	// 00111 00111 010101 01 000001
+	// 001 00001
+	//
+	// sqrs[i] is the width of window i and muls[i] the power of k holding
+	// that window's bit pattern.
+	sqrs := []uint8{
+		4, 3, 11, 5, 3,
+		3, 3, 3, 7, 5,
+		9, 7, 5, 5, 4,
+		3, 4, 2, 7, 3,
+		5, 5, 6, 2, 6,
+		3, 5,
+	}
+	muls := [][]uint64{
+		_111, _1, _1111, _1111, _101,
+		_101, _11, _1, _111, _11,
+		_101, _10101, _10101, _111, _111,
+		_11, _1111, _1, _1, _1,
+		_111, _111, _10101, _1, _1,
+		_1, _1,
+	}
+
+	for i, width := range sqrs {
+		c256OrdSqr(x, x, int(width))
+		c256OrdMul(x, x, muls[i])
+	}
+
+	// Multiplying by one in the Montgomery domain converts a Montgomery
+	// value out of the domain.
+	one := []uint64{1, 0, 0, 0}
+	c256OrdMul(x, x, one)
+
+	xOut := make([]byte, 32)
+	c256LittleToBig(xOut, x)
+	return new(big.Int).SetBytes(xOut)
+}
+
+// fromBig stores the magnitude of big in out as little-endian 64-bit limbs.
+// out is cleared first, so limbs beyond the length of big are zero. out must
+// be at least as long as big.Bits().
+func fromBig(out []uint64, big *big.Int) {
+	for i := 0; i < len(out); i++ {
+		out[i] = 0
+	}
+
+	words := big.Bits()
+	for i := 0; i < len(words); i++ {
+		out[i] = uint64(words[i])
+	}
+}
+
+// c256GetScalar reads in as a big-endian integer and writes it to out as
+// little-endian limbs. A value equal to or greater than the group order is
+// reduced modulo that order first.
+func c256GetScalar(out []uint64, in []byte) {
+	k := new(big.Int).SetBytes(in)
+	if order := c256.N; k.Cmp(order) >= 0 {
+		k.Mod(k, order)
+	}
+	fromBig(out, k)
+}
+
+// rr is R×R mod p in little-endian limbs. c256Mul operates in a Montgomery
+// domain with R = 2^256 mod p, where p is the prime of the curve's underlying
+// field, so Montgomery-multiplying a value by rr converts it into that domain.
+// See the comment in Inverse about how this is used.
+var rr = []uint64{rrModP0, rrModP1, rrModP2, rrModP3} // changed to the SM2 values
+
+// maybeReduceModP returns in itself when it is already below the field prime
+// p, which is the common case, and a newly allocated in mod p otherwise.
+func maybeReduceModP(in *big.Int) *big.Int {
+	if prime := c256.P; in.Cmp(prime) >= 0 {
+		return new(big.Int).Mod(in, prime)
+	}
+	return in
+}
+
+// CombinedMult returns the affine coordinates of
+// baseScalar*G + scalar*(bigX, bigY), where G is the base point.
+//
+// Both scalars are big-endian byte strings. The final selection between the
+// plain sum, the doubling (when both partial results are equal) and a single
+// partial result (when the other scalar is zero) is done with conditional
+// copies rather than branches.
+func CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
+	limbs := make([]uint64, 4)
+	var basePart, pointPart c256Point
+
+	// basePart = baseScalar * G
+	c256GetScalar(limbs, baseScalar)
+	basePartIsInfinity := scalarIsZero(limbs)
+	basePart.c256BaseMult(limbs)
+
+	// pointPart = scalar * (bigX, bigY); limbs is reused for the second scalar.
+	c256GetScalar(limbs, scalar)
+	pointPartIsInfinity := scalarIsZero(limbs)
+	pointPart.c256PointFromAffine(bigX, bigY)
+	pointPart.c256ScalarMult(limbs)
+
+	var total, doubled c256Point
+	sameInputs := c256PointAddAsm(total.xyz[:], basePart.xyz[:], pointPart.xyz[:])
+	c256PointDoubleAsm(doubled.xyz[:], basePart.xyz[:])
+	total.CopyConditional(&doubled, sameInputs)
+	total.CopyConditional(&basePart, pointPartIsInfinity)
+	total.CopyConditional(&pointPart, basePartIsInfinity)
+	return total.c256PointToAffine()
+}
+
+// ScalarBaseMult returns the affine coordinates of scalar*G, where G is the
+// base point and scalar is a big-endian byte string. It uses the precomputed
+// base-point table instead of the generic ScalarMult.
+func (curve SM2CurveParam) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
+	limbs := make([]uint64, 4)
+	c256GetScalar(limbs, scalar)
+
+	var product c256Point
+	product.c256BaseMult(limbs)
+	return product.c256PointToAffine()
+}
+
+// ScalarMult returns the affine coordinates of scalar*(bigX, bigY), where
+// scalar is a big-endian byte string.
+//
+// The point is loaded with c256PointFromAffine, the same helper used by
+// CombinedMult, Add and Double, so the affine-to-Jacobian conversion lives in
+// exactly one place.
+func (curve SM2CurveParam) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	c256GetScalar(scalarReversed, scalar)
+
+	var r c256Point
+	// Reduce the coordinates mod p, move them into the Montgomery domain and
+	// set Z to one.
+	r.c256PointFromAffine(bigX, bigY)
+
+	r.c256ScalarMult(scalarReversed)
+	return r.c256PointToAffine()
+}
+
+// Add returns the affine sum of the points (x1, y1) and (x2, y2).
+//
+// c256PointAddAsm reports equal inputs by returning 1; in that case the
+// result is recomputed with a point doubling. In most cases the two inputs
+// differ, and the timing difference of this rare branch is accepted here.
+func (curve SM2CurveParam) Add(x1, y1, x2, y2 *big.Int) (x, y *big.Int) {
+	var r1, r2 c256Point
+	r1.c256PointFromAffine(x1, y1)
+	r2.c256PointFromAffine(x2, y2)
+
+	if c256PointAddAsm(r1.xyz[:], r1.xyz[:], r2.xyz[:]) == 1 {
+		// r1 was overwritten by the addition, so double r2, which equals the
+		// original r1 on this path.
+		c256PointDoubleAsm(r1.xyz[:], r2.xyz[:])
+	}
+	return r1.c256PointToAffine()
+}
+
+// Double returns the affine coordinates of 2*(x1, y1).
+func (curve SM2CurveParam) Double(x1, y1 *big.Int) (x, y *big.Int) {
+	var pt c256Point
+	pt.c256PointFromAffine(x1, y1)
+
+	// The doubling is done in place on the Jacobian limbs.
+	limbs := pt.xyz[:]
+	c256PointDoubleAsm(limbs, limbs)
+	return pt.c256PointToAffine()
+}
+
+// uint64IsZero returns 1 if x is zero and 0 otherwise, without branching on
+// the value of x.
+func uint64IsZero(x uint64) int {
+	// After inversion every bit is set exactly when x was zero. Repeatedly
+	// AND-ing the upper half onto the lower half (32, 16, 8, 4, 2, 1) leaves
+	// the AND of all 64 bits in bit 0.
+	folded := ^x
+	for shift := uint(32); shift > 0; shift >>= 1 {
+		folded &= folded >> shift
+	}
+	return int(folded & 1)
+}
+
+// scalarIsZero returns 1 if the four-limb scalar is zero and 0 otherwise.
+func scalarIsZero(scalar []uint64) int {
+	// OR the limbs together: the result is zero only if every limb is zero.
+	merged := scalar[0]
+	merged |= scalar[1]
+	merged |= scalar[2]
+	merged |= scalar[3]
+	return uint64IsZero(merged)
+}
+
+// c256PointFromAffine loads the affine point (x, y) into p as a Jacobian
+// point in the Montgomery domain: both coordinates are reduced modulo p if
+// necessary, multiplied by rr to enter the domain, and Z is set to one.
+func (p *c256Point) c256PointFromAffine(x, y *big.Int) {
+	xLimbs, yLimbs := p.xyz[0:4], p.xyz[4:8]
+
+	fromBig(xLimbs, maybeReduceModP(x))
+	fromBig(yLimbs, maybeReduceModP(y))
+	c256Mul(xLimbs, xLimbs, rr[:])
+	c256Mul(yLimbs, yLimbs, rr[:])
+
+	// Z = 1 in the Montgomery domain.
+	p.xyz[8], p.xyz[9], p.xyz[10], p.xyz[11] = montOne0, montOne1, montOne2, montOne3
+}
+
+// c256PointToAffine converts the Jacobian point p into affine coordinates
+// outside the Montgomery domain: x = X/Z^2 and y = Y/Z^3.
+func (p *c256Point) c256PointToAffine() (x, y *big.Int) {
+	// yAcc starts out as Z^-1 and ends up as the affine y; xAcc starts out
+	// as Z^-2 and ends up as the affine x.
+	yAcc := make([]uint64, 4)
+	xAcc := make([]uint64, 4)
+
+	c256Inverse(yAcc, p.xyz[8:12]) // Z^-1
+	c256Sqr(xAcc, yAcc, 1)         // Z^-2
+	c256Mul(yAcc, yAcc, xAcc)      // Z^-3
+
+	c256Mul(xAcc, p.xyz[0:4], xAcc) // X * Z^-2
+	c256Mul(yAcc, p.xyz[4:8], yAcc) // Y * Z^-3
+
+	// Leave the Montgomery domain.
+	c256FromMont(xAcc, xAcc)
+	c256FromMont(yAcc, yAcc)
+
+	xBytes := make([]byte, 32)
+	yBytes := make([]byte, 32)
+	c256LittleToBig(xBytes, xAcc)
+	c256LittleToBig(yBytes, yAcc)
+
+	x = new(big.Int).SetBytes(xBytes)
+	y = new(big.Int).SetBytes(yBytes)
+	return x, y
+}
+
+// CopyConditional overwrites p with src if v == 1 and leaves p unchanged if
+// v == 0. The choice is made with bit masks instead of a branch.
+func (p *c256Point) CopyConditional(src *c256Point, v int) {
+	keep := uint64(v) - 1 // all ones when v == 0, zero when v == 1
+	take := ^keep
+
+	for i := range p.xyz {
+		p.xyz[i] = (p.xyz[i] & keep) | (src.xyz[i] & take)
+	}
+}
+
+// c256Inverse sets out to in^-1 mod p, with both values in the Montgomery
+// domain: given in = a*R it produces a^-1 * R.
+//
+// The inverse is computed as in^(p-2) with a fixed addition chain, where
+//
+//	p-2 = 0xfffffffe ffffffff ffffffff ffffffff ffffffff 00000000 ffffffff fffffffd
+//
+// The names below describe the exponent reached so far: _111 is binary 111,
+// xN is N consecutive one bits, and iN is an intermediate value of the chain.
+// The chain costs 255 squarings and 14 multiplications.
+func c256Inverse(out, in []uint64) {
+	var stack [17 * 4]uint64
+	_10 := stack[4*0 : 4*0+4]
+	_11 := stack[4*1 : 4*1+4]
+	_110 := stack[4*2 : 4*2+4]
+	_111 := stack[4*3 : 4*3+4]
+	_111000 := stack[4*4 : 4*4+4]
+	_111111 := stack[4*5 : 4*6]
+	_1111110 := stack[4*6 : 4*7]
+	_1111111 := stack[4*7 : 4*8]
+	x12 := stack[4*8 : 4*9] // _111111111111
+	x24 := stack[4*9 : 4*10]
+	x31 := stack[4*10 : 4*11]
+	i39 := stack[4*11 : 4*12]
+	i68 := stack[4*12 : 4*13]
+	x62 := stack[4*13 : 4*14]
+	i71 := stack[4*14 : 4*15]
+	x64 := stack[4*15 : 4*16]
+	i265 := stack[4*16 : 4*17]
+
+	c256Sqr(_10, in, 1)             // _10 = 2 * 1
+	c256Mul(_11, _10, in)           // _11 = 1 + _10
+	c256Sqr(_110, _11, 1)           // _110 = 2 * _11
+	c256Mul(_111, _110, in)         // _111 = 1 + _110
+	c256Sqr(_111000, _111, 3)       // _111000 = _111 << 3
+	c256Mul(_111111, _111, _111000) // _111111 = _111 + _111000
+	c256Sqr(_1111110, _111111, 1)   // _1111110 = 2 * _111111
+	c256Mul(_1111111, _1111110, in) // _1111111 = 1 + _1111110
+	c256Sqr(x12, _1111110, 5)       // x12 = _1111110<<5 + _111111
+	c256Mul(x12, x12, _111111)
+
+	c256Sqr(x24, x12, 12) // x24 = x12<<12 + x12
+	c256Mul(x24, x24, x12)
+
+	c256Sqr(x31, x24, 7) // x31 = x24<<7 + _1111111
+	c256Mul(x31, x31, _1111111)
+
+	c256Sqr(i39, x31, 2)   // i39 = x31 << 2
+	c256Sqr(i68, i39, 29)  // i68 = i39 << 29
+	c256Mul(x62, x31, i68) // x62 = x31 + i68
+	c256Sqr(i71, i68, 2)   // i71 = i68 << 2
+	c256Mul(x64, i39, i71) // x64 = i39 + i71 + _11
+	c256Mul(x64, x64, _11)
+	c256Sqr(i265, i71, 32) // i265 = ((i71<<32+x64)<<64 + x64) << 94
+	c256Mul(i265, i265, x64)
+	c256Sqr(i265, i265, 64)
+	c256Mul(i265, i265, x64)
+	c256Sqr(i265, i265, 94)
+
+	c256Mul(i265, i265, x62) // return (x62+i265)<<2 + 1
+	c256Sqr(i265, i265, 2)
+	c256Mul(out, i265, in)
+}
+
+func (p *c256Point) c256StorePoint(r *[16 * 4 * 3]uint64, index int) {
+	copy(r[index*12:], p.xyz[:])
+}
+
+func boothW5(in uint) (int, int) {
+	var s uint = ^((in >> 5) - 1)
+	var d uint = (1 << 6) - in - 1
+	d = (d & s) | (in & (^s))
+	d = (d >> 1) + (d & 1)
+	return int(d), int(s & 1)
+}
+
+// boothW6 recodes a 7-bit window value into a signed digit for the 6-bit
+// windowed base-point multiplication.
+//
+// Only the low 7 bits of in are meaningful; the lowest of them is the top bit
+// of the previous window and bit 6 decides the sign. The function returns the
+// magnitude of the digit (0..32 for a 7-bit input) and a sign flag that is 1
+// when the digit is negative. It contains no data-dependent branches.
+func boothW6(in uint) (int, int) {
+	// s is all ones when bit 6 of in is set and zero otherwise.
+	var s uint = ^((in >> 6) - 1)
+	// d is the 7-bit complement of in, used for negative digits.
+	var d uint = (1 << 7) - in - 1
+	d = (d & s) | (in & (^s))
+	// Halve, rounding up.
+	d = (d >> 1) + (d & 1)
+	return int(d), int(s & 1)
+}
+
+// initTable allocates c256Precomputed and fills it with affine multiples of
+// the base point G, kept in Montgomery form.
+//
+// Entry j (0 <= j < 32) of row i (0 <= i < 43) holds [2^(6*i)]*[j+1]G as
+// eight limbs, X followed by Y:
+//
+//	row 0:   G          [2]G          [3]G   ...  [32]G
+//	row 1:   [2^6]G     [2^6][2]G     ...
+//	row 2:   [2^12]G    [2^12][2]G    ...
+//	...
+//	row 42:  [2^252]G   ...
+func initTable() {
+	c256Precomputed = new([43][32 * 8]uint64)
+
+	basePoint := []uint64{
+		montBaseX0, montBaseX1, montBaseX2, montBaseX3,
+		montBaseY0, montBaseY1, montBaseY2, montBaseY3,
+		montOne0, montOne1, montOne2, montOne3,
+	}
+	// t2 walks through G, 2G, ..., 32G (one per column); t1 is the working
+	// copy that is doubled down the rows of the current column.
+	t1 := make([]uint64, 12)
+	t2 := make([]uint64, 12)
+	copy(t2, basePoint)
+
+	zInv := make([]uint64, 4)
+	zInvSq := make([]uint64, 4)
+	for j := 0; j < 32; j++ {
+		copy(t1, t2)
+		for i := 0; i < 43; i++ {
+			// The window size is 6 so we need to double 6 times.
+			if i != 0 {
+				for k := 0; k < 6; k++ {
+					c256PointDoubleAsm(t1, t1)
+				}
+			}
+			// Convert the point to affine form. (Its values are
+			// still in Montgomery form however.)
+			c256Inverse(zInv, t1[8:12])
+			c256Sqr(zInvSq, zInv, 1)
+			c256Mul(zInv, zInv, zInvSq)
+
+			c256Mul(t1[:4], t1[:4], zInvSq)
+			c256Mul(t1[4:8], t1[4:8], zInv)
+
+			// X and Y are affine now, so reset Z to one (Montgomery form).
+			copy(t1[8:12], basePoint[8:12])
+			// Update the table entry
+			copy(c256Precomputed[i][j*8:], t1[:8])
+		}
+		// Advance t2 to the next multiple of G. The first step is a doubling
+		// because t2 still equals basePoint at that time.
+		if j == 0 {
+			c256PointDoubleAsm(t2, basePoint)
+		} else {
+			c256PointAddAsm(t2, t2, basePoint)
+		}
+	}
+}
+
+// c256SelectBaseOfGo is a plain-Go table lookup over 8-limb entries.
+//
+// idx == 0 leaves point untouched; idx == i > 0 copies entry i-1 of table
+// into point[:8]. Unlike a constant-time selection, this version branches on
+// idx and reads a table position that depends on idx, so it must not be used
+// with secret indices.
+func c256SelectBaseOfGo(point, table []uint64, idx int) {
+	if idx == 0 {
+		return
+	}
+	copy(point[:8], table[8*(idx-1):])
+}
+// c256BaseMult sets p to scalar*G using the precomputed table
+// c256Precomputed. scalar is given as four little-endian 64-bit limbs.
+//
+// The scalar is consumed in 43 windows. Each window value is 7 bits wide, is
+// recoded into a signed digit by boothW6 and overlaps the previous window by
+// one bit; row i of the table supplies the multiples for window i.
+func (p *c256Point) c256BaseMult(scalar []uint64) {
+	// First window: bits 0..5 of the scalar with an implicit zero bit below.
+	wvalue := (scalar[0] << 1) & 0x7f
+	sel, sign := boothW6(uint(wvalue))
+	c256SelectBase(p.xyz[0:8], c256Precomputed[0][0:], sel)
+	c256NegCond(p.xyz[4:8], sign)
+
+	// (This is one, in the Montgomery domain.)
+	p.xyz[8] = montOne0
+	p.xyz[9] = montOne1
+	p.xyz[10] = montOne2
+	p.xyz[11] = montOne3
+
+	// t0 receives the table entry selected for each following window.
+	var t0 c256Point
+	// (This is one, in the Montgomery domain.)
+	t0.xyz[8] = montOne0
+	t0.xyz[9] = montOne1
+	t0.xyz[10] = montOne2
+	t0.xyz[11] = montOne3
+
+	// 191 = 6*31 + 5
+	index := uint(5)
+	// zero accumulates (by OR) every digit magnitude selected so far and is
+	// handed to c256PointAddAffineAsm as its zero argument.
+	zero := sel
+
+	for i := 1; i < 43; i++ {
+		// Extract the 7 bits starting at bit position index. Below bit 192
+		// the window may straddle two limbs, so bits of the next limb are
+		// merged in.
+		if index < 192 {
+			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
+		} else {
+			wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
+		}
+		index += 6
+		sel, sign = boothW6(uint(wvalue))
+		c256SelectBase(t0.xyz[0:8], c256Precomputed[i][0:], sel)
+		c256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero)
+		zero |= sel
+	}
+}
+
+// c256ScalarMult replaces p with scalar*p. scalar is given as four
+// little-endian 64-bit limbs.
+//
+// A table of the multiples 1*p .. 16*p is built first. The scalar is then
+// scanned from the top in 5-bit windows; each window is recoded into a signed
+// digit by boothW5 and overlaps its lower neighbour by one bit.
+func (p *c256Point) c256ScalarMult(scalar []uint64) {
+	// precomp is a table of precomputed points that stores the multiples of
+	// p from 1*p to 16*p; slot i holds (i+1)*p.
+	var precomp [16 * 4 * 3]uint64
+	var t0, t1, t2, t3 c256Point
+
+	// Prepare the table. The trailing comment on each store is the multiple
+	// of p being stored.
+	p.c256StorePoint(&precomp, 0) // 1
+
+	c256PointDoubleAsm(t0.xyz[:], p.xyz[:])
+	c256PointDoubleAsm(t1.xyz[:], t0.xyz[:])
+	c256PointDoubleAsm(t2.xyz[:], t1.xyz[:])
+	c256PointDoubleAsm(t3.xyz[:], t2.xyz[:])
+	t0.c256StorePoint(&precomp, 1)  // 2
+	t1.c256StorePoint(&precomp, 3)  // 4
+	t2.c256StorePoint(&precomp, 7)  // 8
+	t3.c256StorePoint(&precomp, 15) // 16
+
+	c256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
+	c256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
+	c256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
+	t0.c256StorePoint(&precomp, 2) // 3
+	t1.c256StorePoint(&precomp, 4) // 5
+	t2.c256StorePoint(&precomp, 8) // 9
+
+	c256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
+	c256PointDoubleAsm(t1.xyz[:], t1.xyz[:])
+	t0.c256StorePoint(&precomp, 5) // 6
+	t1.c256StorePoint(&precomp, 9) // 10
+
+	c256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:])
+	c256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
+	t2.c256StorePoint(&precomp, 6)  // 7
+	t1.c256StorePoint(&precomp, 10) // 11
+
+	c256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
+	c256PointDoubleAsm(t2.xyz[:], t2.xyz[:])
+	t0.c256StorePoint(&precomp, 11) // 12
+	t2.c256StorePoint(&precomp, 13) // 14
+
+	c256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
+	c256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
+	t0.c256StorePoint(&precomp, 12) // 13
+	t2.c256StorePoint(&precomp, 14) // 15
+
+	// Start scanning the window from top bit
+	index := uint(254)
+	var sel, sign int
+
+	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
+	sel, _ = boothW5(uint(wvalue))
+
+	c256Select(p.xyz[0:12], precomp[0:], sel)
+	// zero accumulates (by OR) every digit magnitude selected so far; while
+	// it is still 0 the accumulator p holds no contribution yet.
+	zero := sel
+
+	for index > 4 {
+		index -= 5
+		// Shift the accumulator up by one window: p = 32*p.
+		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+
+		// Extract the 6 bits starting at bit position index. Below bit 192
+		// the window may straddle two limbs, so bits of the next limb are
+		// merged in.
+		if index < 192 {
+			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
+		} else {
+			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
+		}
+
+		sel, sign = boothW5(uint(wvalue))
+
+		c256Select(t0.xyz[0:], precomp[0:], sel)
+		c256NegCond(t0.xyz[4:8], sign)
+		c256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
+		// If this digit is zero (sel == 0), keep p instead of the sum.
+		c256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
+		// If no digit has been non-zero before (zero == 0), take the
+		// selected table entry t0 instead.
+		c256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
+		zero |= sel
+	}
+
+	// Last window: bits 0..4 of the scalar with an implicit zero bit below.
+	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+
+	wvalue = (scalar[0] << 1) & 0x3f
+	sel, sign = boothW5(uint(wvalue))
+
+	c256Select(t0.xyz[0:], precomp[0:], sel)
+	c256NegCond(t0.xyz[4:8], sign)
+	c256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
+	c256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
+	c256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
+}