init: v1.0.0

2026-05-27 23:03:00 +08:00
commit 8d97f750eb
466 changed files with 80067 additions and 0 deletions
@@ -0,0 +1,5 @@
+# Go assembler vs. ARM assembler
+## CSEL
+If cond holds, dst = r1; otherwise dst = r2.
+Go: CSEL cond, r1, r2, dst
+ARM: CSEL dst, r1, r2, cond
@@ -0,0 +1,54 @@
+package ec256
+
+import (
+	"crypto/elliptic"
+	"fmt"
+	"math/big"
+
+	"xdx.jelly/xgcl/internal"
+)
+
+// debug enables the call tracing done by printFuncName.
+const debug = false
+
+// printFuncName prints "Calling " followed by the name reported by
+// internal.GetFuncName when debug is true, and does nothing otherwise.
+func printFuncName() {
+	if !debug {
+		return
+	}
+	fmt.Println("Calling " + internal.GetFuncName())
+}
+
+// Compile-time check that SM2CurveParam satisfies elliptic.Curve.
+var _ elliptic.Curve = SM2CurveParam{}
+
+// SM2CurveParam wraps *elliptic.CurveParams, which already implements the
+// elliptic.Curve interface. The extra layer exists so that scalar
+// multiplication and the other point operations can be overridden.
+type SM2CurveParam struct {
+	*elliptic.CurveParams
+}
+
+// combinedMulter is implemented by curves that provide a single entry point
+// taking an affine point (bigX, bigY) and two big-endian scalars: baseScalar
+// for the base point and scalar for the given point.
+type combinedMulter interface {
+	CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int)
+}
+
+// c256 holds the SM2 curve parameters.
+// The field prime is p = 2^256 - 2^224 - 2^96 + 2^64 - 1.
+var c256 = SM2CurveParam{CurveParams: &elliptic.CurveParams{
+	Name:    "Curve SM2",
+	P:       bigFromBase16("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF00000000FFFFFFFFFFFFFFFF"),
+	N:       bigFromBase16("FFFFFFFEFFFFFFFFFFFFFFFFFFFFFFFF7203DF6B21C6052B53BBF40939D54123"),
+	B:       bigFromBase16("28E9FA9E9D9F5E344D5A9E4BCF6509A7F39789F515AB8F92DDBCBD414D940E93"),
+	Gx:      bigFromBase16("32C4AE2C1F1981195F9904466A39C9948FE30BBFF2660BE1715A4589334C74C7"),
+	Gy:      bigFromBase16("BC3736A2F4F6779C59BDCEE36B692153D0A9877CC62A474002DF32E52139F0A0"),
+	BitSize: 256},
+}
+
+// Curve256 is the exported SM2 curve value. It is a copy of c256 and therefore
+// shares the same underlying *elliptic.CurveParams.
+var Curve256 = c256
+
+// EC256 returns the SM2 curve as an elliptic.Curve.
+func EC256() elliptic.Curve {
+	return c256
+}
+
+// CurveSM2 returns the SM2 curve as an elliptic.Curve. It returns the same
+// value as EC256.
+func CurveSM2() elliptic.Curve {
+	return c256
+}
+
+// CombinedMult forwards to the package-level CombinedMult function, so that
+// SM2CurveParam provides the combinedMulter method set.
+func (SM2CurveParam) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
+	return CombinedMult(bigX, bigY, baseScalar, scalar)
+}
@@ -0,0 +1,814 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file contains the Go wrapper for the constant-time, 64-bit assembly
+// implementation of P256. The optimizations performed here are described in
+// detail in:
+// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
+//                          256-bit primes"
+// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
+// https://eprint.iacr.org/2013/816.pdf
+
+//go:build (arm64 || amd64) && !generic && !generic32 && !generic64
+// +build arm64 amd64
+// +build !generic
+// +build !generic32
+// +build !generic64
+
+package ec256
+
+import (
+	"crypto/elliptic"
+	"math/big"
+)
+
+// Precomputed 256-bit constants. Each value is split into four 64-bit limbs;
+// the limb with suffix 0 is the least significant one.
+const (
+	// Montgomery form of one: 1*R mod p.
+	montOne0 = 0x0000000000000001
+	montOne1 = 0x00000000ffffffff
+	montOne2 = 0x0000000000000000
+	montOne3 = 0x0000000100000000
+
+	// Montgomery form of the base point's x coordinate.
+	montBaseX0 = 0x61328990f418029e
+	montBaseX1 = 0x3e7981eddca6c050
+	montBaseX2 = 0xd6a1ed99ac24c3c3
+	montBaseX3 = 0x91167a5ee1c13b05
+
+	// Montgomery form of the base point's y coordinate.
+	montBaseY0 = 0xc1354e593c2d0ddd
+	montBaseY1 = 0xc1f5e5788d3295fa
+	montBaseY2 = 0x8d4cfb066e2a48f8
+	montBaseY3 = 0x63cd65d481d735bd
+
+	// R*R mod n (n is the order of the base point).
+	rrModN0 = 0x901192af7c114f20
+	rrModN1 = 0x3464504ade6fa2fa
+	rrModN2 = 0x620fc84c3affe0d4
+	rrModN3 = 0x1eb5e412a22b3d3b
+
+	// R*R mod p (p is the field prime).
+	rrModP0 = 0x0000000200000003
+	rrModP1 = 0x00000002ffffffff
+	rrModP2 = 0x0000000100000001
+	rrModP3 = 0x0000000400000002
+)
+
+// c256Point is a point in Jacobian coordinates. xyz holds X in limbs 0-3, Y in
+// limbs 4-7 and Z in limbs 8-11, all in the Montgomery domain.
+type c256Point struct {
+	xyz [12]uint64
+}
+
+var (
+	// c256Precomputed is the base-point table filled in by initTable: 43 rows
+	// of 32 affine points, each point stored as 8 limbs (x then y).
+	c256Precomputed *[43][32 * 8]uint64
+)
+
+// init builds the precomputed base-point table at package load time.
+func init() {
+	initTable()
+}
+
+// Params returns the embedded curve parameters.
+func (curve SM2CurveParam) Params() *elliptic.CurveParams {
+	return curve.CurveParams
+}
+
+//go:noescape
+// func c256Add(res, in1, in2 []uint64)
+
+// Functions implemented in c256_asm_*64.s
+//
+// c256Mul sets res to the Montgomery product of in1 and in2 modulo the field
+// prime. The original note names the modulus "P256"; NOTE(review): this
+// package's prime is c256.P (SM2) — confirm against the assembly.
+//
+//go:noescape
+func c256Mul(res, in1, in2 []uint64)
+
+// c256Sqr sets res to the Montgomery square of in modulo the field prime,
+// repeated n times (n >= 1).
+//
+//go:noescape
+func c256Sqr(res, in []uint64, n int)
+
+// c256FromMont sets res to the Montgomery product of in and 1, montMul(in, 1),
+// which takes a value out of the Montgomery domain.
+//
+//go:noescape
+func c256FromMont(res, in []uint64)
+
+// c256NegCond replaces val with -val if and only if cond != 0.
+//
+//go:noescape
+func c256NegCond(val []uint64, cond int)
+
+// c256MovCond sets res to b if cond == 0, and to a otherwise.
+//
+//go:noescape
+func c256MovCond(res, a, b []uint64, cond int)
+
+// c256BigToLittle performs an endianness swap: it converts 32 big-endian bytes
+// into four little-endian uint64 values.
+//
+//go:noescape
+func c256BigToLittle(res []uint64, in []byte)
+
+// c256LittleToBig writes the uint64 limbs in to the byte slice res.
+// NOTE(review): presumably the inverse of c256BigToLittle (four little-endian
+// limbs to 32 big-endian bytes) — confirm against the assembly.
+//
+//go:noescape
+func c256LittleToBig(res []byte, in []uint64)
+
+// c256Select is a constant-time table access.
+// idx = 0 returns infinity; idx = i > 0 returns table[i-1].
+//
+//go:noescape
+func c256Select(point, table []uint64, idx int)
+
+// c256SelectBase selects entry idx from a row of the precomputed base-point
+// table into point. NOTE(review): presumably the affine (8-limb) counterpart
+// of c256Select with the same idx convention — confirm against the assembly.
+//
+//go:noescape
+func c256SelectBase(point, table []uint64, idx int)
+
+// c256OrdMul sets res to the Montgomery product of in1 and in2 modulo Ord(G),
+// the order of the base point.
+//
+//go:noescape
+func c256OrdMul(res, in1, in2 []uint64)
+
+// c256OrdSqr sets res to the Montgomery square of in modulo Ord(G), repeated
+// n times.
+//
+//go:noescape
+func c256OrdSqr(res, in []uint64, n int)
+
+// c256PointAddAffineAsm adds the affine point in2 to in1 and stores the sum
+// in res.
+// If sign == 1, in2 is negated first.
+// If sel == 0, res = in1.
+// If zero == 0, res = in2.
+//
+//go:noescape
+func c256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
+
+// c256PointAddAsm adds in1 and in2 and stores the sum in res. It returns one
+// if the two input points were equal and zero otherwise. (Note that, due to
+// the way that the equations work out, some representations of ∞ are
+// considered equal to everything by this function.)
+//
+//go:noescape
+func c256PointAddAsm(res, in1, in2 []uint64) int
+
+// c256PointDoubleAsm doubles the point in and stores the result in res.
+//
+//go:noescape
+func c256PointDoubleAsm(res, in []uint64)
+
+// c256ToMont converts in into the Montgomery domain by Montgomery-multiplying
+// it with rr (R*R mod p), and stores the result in res.
+func c256ToMont(res, in []uint64) {
+	c256Mul(res, in, rr)
+}
+
+// Inverse returns k^{-1} mod N, where N is the order of the base point.
+//
+// k is first reduced into [0, N). The inverse is then computed as the
+// Montgomery power k^(N-2): k -> k*R -> k^(N-2)*R -> k^(N-2).
+func (curve SM2CurveParam) Inverse(k *big.Int) *big.Int {
+	if k.Sign() < 0 || k.Cmp(c256.N) >= 0 {
+		// This should never happen. Mod is Euclidean, so a negative k is
+		// mapped to its non-negative residue; negating it instead would
+		// produce the inverse of |k|, which is the negative of the wanted
+		// value.
+		k = new(big.Int).Mod(k, c256.N)
+	}
+
+	// table stores the precomputed small powers of k plus three scratch
+	// registers (x, t, s), four limbs each.
+	var table [4 * 9]uint64
+	var (
+		_1     = table[4*0 : 4*1] // 1
+		_11    = table[4*1 : 4*2] // 3
+		_101   = table[4*2 : 4*3] // 5
+		_111   = table[4*3 : 4*4] // 7
+		_1111  = table[4*4 : 4*5] // 15
+		_10101 = table[4*5 : 4*6] // 21
+		x      = table[4*6 : 4*7]
+		t      = table[4*7 : 4*8]
+		s      = table[4*8 : 4*9]
+	)
+
+	fromBig(x[:], k)
+	// This code operates in the Montgomery domain where R = 2^256 mod n and n
+	// is the order of the scalar field. Elements in the Montgomery domain
+	// take the form a×R, and Montgomery multiplication of x and y calculates
+	// (x × y × R^-1) mod n. RR is R×R mod n, thus the Montgomery
+	// multiplication of x and RR gives x×R, i.e. converts x into the
+	// Montgomery domain.
+	// Window values borrowed from https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion
+	RR := []uint64{rrModN0, rrModN1, rrModN2, rrModN3} // SM2 value
+
+	// SM2:
+	// N-2 = 0xfffffffeffffffffffffffffffffffff7203df6b21c6052b53bbf40939d54121
+	//
+	// The names in the comments give the exponent reached so far, in binary
+	// (leading underscore) or hexadecimal.
+	c256OrdMul(_1, x, RR)      // _1
+	c256OrdSqr(x, _1, 1)       // x = _10
+	c256OrdMul(_11, x, _1)     // _11
+	c256OrdMul(_101, x, _11)   // _101
+	c256OrdMul(_111, x, _101)  // _111
+	c256OrdSqr(x, _101, 1)     // x = _1010
+	c256OrdMul(_1111, _101, x) // _1111
+
+	c256OrdSqr(t, x, 1)       // t = _10100
+	c256OrdMul(_10101, t, _1) // _10101
+	c256OrdSqr(x, _10101, 1)  // x = _101010
+	c256OrdMul(x, _10101, x)  // x = _111111
+	c256OrdSqr(s, x, 1)       // s = _1111110
+	c256OrdMul(s, s, _1)      // s = _1111111
+	c256OrdSqr(x, s, 1)       // x = _11111110 = 0xfe
+	c256OrdMul(s, x, _1)      // s = 0xff
+	c256OrdSqr(t, s, 8)       // t = 0xff00
+	c256OrdMul(x, t, x)       // x = 0xfffe
+	c256OrdMul(s, t, s)       // s = 0xffff
+	c256OrdSqr(t, s, 16)      // t = 0xffff0000
+	c256OrdMul(x, t, x)       // x = 0xfffffffe
+	c256OrdMul(t, x, _1)      // t = 0xffffffff
+
+	c256OrdSqr(x, x, 32) // x = 0xfffffffe00000000
+	c256OrdMul(x, x, t)  // x = 0xfffffffeffffffff
+	c256OrdSqr(x, x, 32) // x = 0xfffffffeffffffff00000000
+	c256OrdMul(x, x, t)  // x = 0xfffffffeffffffffffffffff
+	c256OrdSqr(x, x, 32) // x = 0xfffffffeffffffffffffffff00000000
+	c256OrdMul(x, x, t)  // x = 0xfffffffeffffffffffffffffffffffff
+
+	// The low 128 bits of N-2, split into windows. Each window is consumed by
+	// squaring x once per window bit and multiplying by the window's value:
+	// 7203df6b21c6052b53bbf40939d54121 =
+	// 01110010000000111101111101101011001000011100011000000101001010110101001110111011111101000000100100111001110101010100000100100001 =
+	// 0111 001 00000001111 01111 101
+	// 101 011 001 0000111 00011
+	// 000000101 0010101 10101 00111 0111
+	// 011 1111 01 0000001 001
+	// 00111 00111 010101 01 000001
+	// 001 00001
+
+	sqrs := []uint8{
+		4, 3, 11, 5, 3,
+		3, 3, 3, 7, 5,
+		9, 7, 5, 5, 4,
+		3, 4, 2, 7, 3,
+		5, 5, 6, 2, 6,
+		3, 5,
+	}
+	muls := [][]uint64{
+		_111, _1, _1111, _1111, _101,
+		_101, _11, _1, _111, _11,
+		_101, _10101, _10101, _111, _111,
+		_11, _1111, _1, _1, _1,
+		_111, _111, _10101, _1, _1,
+		_1, _1,
+	}
+
+	for i, width := range sqrs {
+		c256OrdSqr(x, x, int(width))
+		c256OrdMul(x, x, muls[i])
+	}
+
+	// Multiplying by one in the Montgomery domain converts a Montgomery
+	// value out of the domain.
+	one := []uint64{1, 0, 0, 0}
+	c256OrdMul(x, x, one)
+
+	xOut := make([]byte, 32)
+	c256LittleToBig(xOut, x)
+	return new(big.Int).SetBytes(xOut)
+}
+
+// fromBig zeroes out and then stores the words of big into it, least
+// significant word first. out must be at least as long as big.Bits().
+func fromBig(out []uint64, big *big.Int) {
+	words := big.Bits()
+
+	for i := 0; i < len(out); i++ {
+		out[i] = 0
+	}
+	for i := 0; i < len(words); i++ {
+		out[i] = uint64(words[i])
+	}
+}
+
+// c256GetScalar reads the big-endian scalar in and writes it to out as
+// little-endian 64-bit limbs. A scalar equal to or greater than the group
+// order is reduced modulo that order first.
+func c256GetScalar(out []uint64, in []byte) {
+	k := new(big.Int).SetBytes(in)
+	if k.Cmp(c256.N) < 0 {
+		fromBig(out, k)
+		return
+	}
+	fromBig(out, k.Mod(k, c256.N))
+}
+
+// c256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
+// underlying field of the curve. Thus rr here is R×R mod p, taken from the
+// rrModP constants. See the comment in Inverse about how this is used.
+var rr = []uint64{rrModP0, rrModP1, rrModP2, rrModP3} // SM2 value
+
+// maybeReduceModP returns in unchanged when it is already below p, and a newly
+// allocated in mod p otherwise. In most calls in < p, so nothing is allocated.
+func maybeReduceModP(in *big.Int) *big.Int {
+	if in.Cmp(c256.P) >= 0 {
+		return new(big.Int).Mod(in, c256.P)
+	}
+	return in
+}
+
+// CombinedMult returns baseScalar*G + scalar*(bigX, bigY) in affine
+// coordinates, where G is the base point. Both scalars are big-endian and are
+// reduced modulo the group order by c256GetScalar.
+//
+// The sum, the doubling of the first product and both individual products are
+// all computed; the result is then picked with CopyConditional instead of
+// branching on the inputs.
+func CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	var r1, r2 c256Point
+	// r1 = baseScalar * G
+	c256GetScalar(scalarReversed, baseScalar)
+	r1IsInfinity := scalarIsZero(scalarReversed)
+	r1.c256BaseMult(scalarReversed)
+
+	// r2 = scalar * (bigX, bigY)
+	c256GetScalar(scalarReversed, scalar)
+	r2IsInfinity := scalarIsZero(scalarReversed)
+	r2.c256PointFromAffine(bigX, bigY)
+	r2.c256ScalarMult(scalarReversed)
+
+	var sum, double c256Point
+	pointsEqual := c256PointAddAsm(sum.xyz[:], r1.xyz[:], r2.xyz[:])
+	c256PointDoubleAsm(double.xyz[:], r1.xyz[:])
+	// Equal inputs need the doubling result; a zero scalar on either side
+	// means the answer is simply the other product.
+	sum.CopyConditional(&double, pointsEqual)
+	sum.CopyConditional(&r1, r2IsInfinity)
+	sum.CopyConditional(&r2, r1IsInfinity)
+	return sum.c256PointToAffine()
+}
+
+// ScalarBaseMult returns scalar*G in affine coordinates, where G is the base
+// point and scalar is a big-endian integer (reduced modulo the group order).
+func (curve SM2CurveParam) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
+	var (
+		k      [4]uint64
+		result c256Point
+	)
+	c256GetScalar(k[:], scalar)
+	result.c256BaseMult(k[:])
+	return result.c256PointToAffine()
+}
+
+// ScalarMult returns scalar*(bigX, bigY) in affine coordinates. scalar is a
+// big-endian integer and is reduced modulo the group order.
+func (curve SM2CurveParam) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
+	scalarReversed := make([]uint64, 4)
+	c256GetScalar(scalarReversed, scalar)
+
+	// Load the point as Montgomery-domain Jacobian coordinates with Z = 1,
+	// using the shared helper instead of repeating its steps here.
+	var r c256Point
+	r.c256PointFromAffine(bigX, bigY)
+
+	r.c256ScalarMult(scalarReversed)
+	return r.c256PointToAffine()
+}
+
+// Add returns the sum of the affine points (x1, y1) and (x2, y2).
+//
+// Following the crypto/elliptic convention, (0, 0) stands for the point at
+// infinity: if either input is (0, 0), a copy of the other input is returned.
+//
+// This function is not constant time: it branches on whether the two inputs
+// are equal. In most cases they are not, and the timing risk is accepted.
+func (curve SM2CurveParam) Add(x1, y1, x2, y2 *big.Int) (x, y *big.Int) {
+	// (0, 0) cannot be loaded as a Jacobian point with Z = 1, so the point at
+	// infinity is handled before converting.
+	if x1.Sign() == 0 && y1.Sign() == 0 {
+		return new(big.Int).Set(x2), new(big.Int).Set(y2)
+	}
+	if x2.Sign() == 0 && y2.Sign() == 0 {
+		return new(big.Int).Set(x1), new(big.Int).Set(y1)
+	}
+
+	var r1, r2 c256Point
+	r1.c256PointFromAffine(x1, y1)
+	r2.c256PointFromAffine(x2, y2)
+	// c256PointAddAsm reports 1 when both inputs are the same point; the
+	// addition result is then unusable and the point is doubled instead.
+	// r2 is still intact at that point, so it serves as the doubling input.
+	if c256PointAddAsm(r1.xyz[:], r1.xyz[:], r2.xyz[:]) == 1 {
+		c256PointDoubleAsm(r1.xyz[:], r2.xyz[:])
+	}
+	return r1.c256PointToAffine()
+}
+
+// Double returns 2*(x1, y1) in affine coordinates.
+//
+// Following the crypto/elliptic convention, (0, 0) stands for the point at
+// infinity, whose double is again (0, 0).
+func (curve SM2CurveParam) Double(x1, y1 *big.Int) (x, y *big.Int) {
+	// (0, 0) cannot be loaded as a Jacobian point with Z = 1, so the point at
+	// infinity is handled before converting.
+	if x1.Sign() == 0 && y1.Sign() == 0 {
+		return new(big.Int), new(big.Int)
+	}
+
+	var r c256Point
+	r.c256PointFromAffine(x1, y1)
+	c256PointDoubleAsm(r.xyz[:], r.xyz[:])
+	return r.c256PointToAffine()
+}
+
+// uint64IsZero returns 1 if x is zero and 0 otherwise, without branching on x.
+func uint64IsZero(x uint64) int {
+	// After inversion every bit is set exactly when x was zero. Folding the
+	// upper half onto the lower half with AND, at halving widths, leaves bit 0
+	// set only if all 64 bits were set.
+	bits := ^x
+	for shift := uint(32); shift > 0; shift >>= 1 {
+		bits &= bits >> shift
+	}
+	return int(bits & 1)
+}
+
+// scalarIsZero returns 1 if the four-limb scalar is zero, and 0 otherwise.
+func scalarIsZero(scalar []uint64) int {
+	merged := scalar[0]
+	merged |= scalar[1]
+	merged |= scalar[2]
+	merged |= scalar[3]
+	return uint64IsZero(merged)
+}
+
+// c256PointFromAffine loads the affine point (x, y) into p as a Jacobian point
+// in the Montgomery domain: X and Y are reduced below the field prime if needed and
+// converted with rr, and Z is set to the Montgomery form of one.
+func (p *c256Point) c256PointFromAffine(x, y *big.Int) {
+	px, py, pz := p.xyz[0:4], p.xyz[4:8], p.xyz[8:12]
+
+	fromBig(px, maybeReduceModP(x))
+	fromBig(py, maybeReduceModP(y))
+	c256Mul(px, px, rr[:])
+	c256Mul(py, py, rr[:])
+
+	pz[0], pz[1], pz[2], pz[3] = montOne0, montOne1, montOne2, montOne3
+}
+
+// c256PointToAffine converts the Jacobian point p to affine coordinates:
+// x = X * Z^-2 and y = Y * Z^-3. Both results are taken out of the Montgomery
+// domain and returned as big integers.
+func (p *c256Point) c256PointToAffine() (x, y *big.Int) {
+	px, py, pz := p.xyz[0:4], p.xyz[4:8], p.xyz[8:12]
+
+	// tmpY runs through Z^-1, Z^-3 and finally y; tmpX through Z^-2 and x.
+	tmpY := make([]uint64, 4)
+	tmpX := make([]uint64, 4)
+	c256Inverse(tmpY, pz)
+	c256Sqr(tmpX, tmpY, 1)
+	c256Mul(tmpY, tmpY, tmpX)
+
+	c256Mul(tmpX, px, tmpX)
+	c256Mul(tmpY, py, tmpY)
+
+	c256FromMont(tmpX, tmpX)
+	c256FromMont(tmpY, tmpY)
+
+	xBytes := make([]byte, 32)
+	yBytes := make([]byte, 32)
+	c256LittleToBig(xBytes, tmpX)
+	c256LittleToBig(yBytes, tmpY)
+
+	x = new(big.Int).SetBytes(xBytes)
+	y = new(big.Int).SetBytes(yBytes)
+	return x, y
+}
+
+// CopyConditional overwrites p with src if v == 1, and leaves p unchanged if
+// v == 0. The choice is made with bit masks rather than a branch.
+func (p *c256Point) CopyConditional(src *c256Point, v int) {
+	keep := uint64(v) - 1 // all ones when v == 0, zero when v == 1
+	take := ^keep
+
+	for i := range p.xyz {
+		p.xyz[i] = (p.xyz[i] & keep) | (src.xyz[i] & take)
+	}
+}
+
+// c256Inverse sets out to in^-1 mod p. Both in and out are in the Montgomery
+// domain: in*R => in^{-1}*R, computed as the Montgomery power (in*R)^(p-2).
+//
+// The exponent is
+//
+//	p-2 = fffffffe ffffffff ffffffff ffffffff ffffffff 00000000 ffffffff fffffffd
+//
+// and is assembled by the addition chain below with 255 squarings and 14
+// multiplications. In the step comments a name such as _111 is the exponent
+// in binary, xN is the exponent made of N one bits, and "a + b" / "a << n"
+// describe the exponent arithmetic performed by a multiplication / by n
+// squarings.
+func c256Inverse(out, in []uint64) {
+	var stack [17 * 4]uint64
+	_10 := stack[4*0 : 4*1]
+	_11 := stack[4*1 : 4*2]
+	_110 := stack[4*2 : 4*3]
+	_111 := stack[4*3 : 4*4]
+	_111000 := stack[4*4 : 4*5]
+	_111111 := stack[4*5 : 4*6]
+	_1111110 := stack[4*6 : 4*7]
+	_1111111 := stack[4*7 : 4*8]
+	x12 := stack[4*8 : 4*9] // _111111111111
+	x24 := stack[4*9 : 4*10]
+	x31 := stack[4*10 : 4*11]
+	i39 := stack[4*11 : 4*12]
+	i68 := stack[4*12 : 4*13]
+	x62 := stack[4*13 : 4*14]
+	i71 := stack[4*14 : 4*15]
+	x64 := stack[4*15 : 4*16]
+	i265 := stack[4*16 : 4*17]
+
+	c256Sqr(_10, in, 1)             // _10 = 2 * 1
+	c256Mul(_11, _10, in)           // _11 = 1 + _10
+	c256Sqr(_110, _11, 1)           // _110 = 2 * _11
+	c256Mul(_111, _110, in)         // _111 = 1 + _110
+	c256Sqr(_111000, _111, 3)       // _111000 = _111 << 3
+	c256Mul(_111111, _111, _111000) // _111111 = _111 + _111000
+	c256Sqr(_1111110, _111111, 1)   // _1111110 = 2 * _111111
+	c256Mul(_1111111, _1111110, in) // _1111111 = 1 + _1111110
+	c256Sqr(x12, _1111110, 5)       // x12 = _1111110<<5 + _111111
+	c256Mul(x12, x12, _111111)
+
+	c256Sqr(x24, x12, 12) // x24 = x12<<12 + x12
+	c256Mul(x24, x24, x12)
+
+	c256Sqr(x31, x24, 7) // x31 = x24<<7 + _1111111
+	c256Mul(x31, x31, _1111111)
+
+	c256Sqr(i39, x31, 2)   // i39 = x31 << 2
+	c256Sqr(i68, i39, 29)  // i68 = i39 << 29
+	c256Mul(x62, x31, i68) // x62 = x31 + i68
+	c256Sqr(i71, i68, 2)   // i71 = i68 << 2
+	c256Mul(x64, i39, i71) // x64 = i39 + i71 + _11
+	c256Mul(x64, x64, _11)
+	c256Sqr(i265, i71, 32) // i265 = ((i71<<32+x64)<<64 + x64) << 94
+	c256Mul(i265, i265, x64)
+	c256Sqr(i265, i265, 64)
+	c256Mul(i265, i265, x64)
+	c256Sqr(i265, i265, 94)
+
+	c256Mul(i265, i265, x62) // result = (x62+i265)<<2 + 1
+	c256Sqr(i265, i265, 2)
+	c256Mul(out, i265, in)
+}
+
+// c256StorePoint copies the 12 limbs of p into slot index of the table r,
+// i.e. starting at offset index*12.
+func (p *c256Point) c256StorePoint(r *[16 * 4 * 3]uint64, index int) {
+	copy(r[index*12:], p.xyz[:])
+}
+
+func boothW5(in uint) (int, int) {
+	var s uint = ^((in >> 5) - 1)
+	var d uint = (1 << 6) - in - 1
+	d = (d & s) | (in & (^s))
+	d = (d >> 1) + (d & 1)
+	return int(d), int(s & 1)
+}
+
+// boothW6 recodes a 7-bit window into a signed Booth digit for window width 6.
+//
+// Only the low 7 bits of in are meaningful, and in is not masked here, so
+// callers must pass a value already reduced to 7 bits. Bit 0 is the top bit
+// of the previous window and bit 6 is the sign bit. It returns the digit's
+// magnitude and its sign (bit 6 of in, 1 for a negative digit), and does not
+// branch on in.
+func boothW6(in uint) (int, int) {
+	// s is all ones when bit 6 of in is set, and zero otherwise.
+	var s uint = ^((in >> 6) - 1)
+	// For a negative digit the window is complemented within 7 bits.
+	var d uint = (1 << 7) - in - 1
+	d = (d & s) | (in & (^s))
+	d = (d >> 1) + (d & 1)
+	return int(d), int(s & 1)
+}
+
+// initTable fills c256Precomputed with multiples of the base point G in affine
+// form, with coordinates kept in the Montgomery domain.
+func initTable() {
+	/*
+		Table layout: row i, entry j (8 limbs: x then y) holds
+
+			c256Precomputed[i][j] = [2^(6*i)] * [(j+1)] G
+
+			        j=0            j=1               j=2         ...  j=31
+			i=0     G              [2]G              [3]G             [32]G
+			i=1     [2^6]G         [2^6][2]G         ...
+			i=2     [2^12]G        [2^12][2]G        ...
+			...
+			i=42    [2^252]G       ...
+
+		That is 43 rows of 32 points each.
+	*/
+	c256Precomputed = new([43][32 * 8]uint64)
+
+	basePoint := []uint64{
+		montBaseX0, montBaseX1, montBaseX2, montBaseX3,
+		montBaseY0, montBaseY1, montBaseY2, montBaseY3,
+		montOne0, montOne1, montOne2, montOne3,
+	}
+	t1 := make([]uint64, 12)
+	t2 := make([]uint64, 12)
+	// t2 holds (j+1)G for the current column j.
+	copy(t2, basePoint)
+
+	zInv := make([]uint64, 4)
+	zInvSq := make([]uint64, 4)
+	for j := 0; j < 32; j++ {
+		// t1 walks down column j: it is multiplied by 2^6 for each row.
+		copy(t1, t2)
+		for i := 0; i < 43; i++ {
+			// The window size is 6 so we need to double 6 times.
+			if i != 0 {
+				for k := 0; k < 6; k++ {
+					c256PointDoubleAsm(t1, t1)
+				}
+			}
+			// Convert the point to affine form. (Its values are
+			// still in Montgomery form however.)
+			c256Inverse(zInv, t1[8:12])
+			c256Sqr(zInvSq, zInv, 1)
+			c256Mul(zInv, zInv, zInvSq)
+
+			c256Mul(t1[:4], t1[:4], zInvSq)
+			c256Mul(t1[4:8], t1[4:8], zInv)
+
+			// Z becomes one (Montgomery form) now that x and y are affine.
+			copy(t1[8:12], basePoint[8:12])
+			// Update the table entry
+			copy(c256Precomputed[i][j*8:], t1[:8])
+		}
+		// Advance t2 to the next multiple of G. The first step doubles,
+		// because it would otherwise add G to itself.
+		if j == 0 {
+			c256PointDoubleAsm(t2, basePoint)
+		} else {
+			c256PointAddAsm(t2, t2, basePoint)
+		}
+	}
+}
+
+// c256SelectBaseOfGo is a plain-Go table lookup with the same signature as
+// c256SelectBase. For idx == 0 it leaves point untouched; for idx > 0 it
+// copies the 8-limb entry idx-1 of table into point. Unlike the assembly
+// routine it branches on idx, so it is not constant time.
+func c256SelectBaseOfGo(point, table []uint64, idx int) {
+	if idx == 0 {
+		return
+	}
+	copy(point[:8], table[8*(idx-1):])
+}
+// c256BaseMult sets p to scalar*G, where G is the base point and scalar is
+// given as four little-endian 64-bit limbs.
+//
+// The scalar is consumed in 43 Booth-recoded windows of width 6. Window i is
+// looked up in row i of c256Precomputed and accumulated with an affine point
+// addition.
+func (p *c256Point) c256BaseMult(scalar []uint64) {
+	// First window: a zero bit is appended below bit 0 of the scalar.
+	wvalue := (scalar[0] << 1) & 0x7f
+	sel, sign := boothW6(uint(wvalue))
+	c256SelectBase(p.xyz[0:8], c256Precomputed[0][0:], sel)
+	c256NegCond(p.xyz[4:8], sign)
+
+	// (This is one, in the Montgomery domain.)
+	p.xyz[8] = montOne0
+	p.xyz[9] = montOne1
+	p.xyz[10] = montOne2
+	p.xyz[11] = montOne3
+
+	var t0 c256Point
+	// (This is one, in the Montgomery domain.)
+	t0.xyz[8] = montOne0
+	t0.xyz[9] = montOne1
+	t0.xyz[10] = montOne2
+	t0.xyz[11] = montOne3
+
+	// 191 = 6*31 + 5
+	index := uint(5)
+	// zero stays 0 for as long as every digit seen so far was zero.
+	zero := sel
+
+	for i := 1; i < 43; i++ {
+		// Extract the 7 bits starting at bit index. Below bit 192 the window
+		// may straddle two limbs; above it only the top limb is read.
+		if index < 192 {
+			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x7f
+		} else {
+			wvalue = (scalar[index/64] >> (index % 64)) & 0x7f
+		}
+		index += 6
+		sel, sign = boothW6(uint(wvalue))
+		c256SelectBase(t0.xyz[0:8], c256Precomputed[i][0:], sel)
+		c256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero)
+		zero |= sel
+	}
+}
+
+// c256ScalarMult replaces p with scalar*p, where scalar is given as four
+// little-endian 64-bit limbs.
+//
+// It first builds a table of the multiples 1*p .. 16*p and then scans the
+// scalar from the top bit down in Booth-recoded windows of width 5, doubling
+// five times between windows.
+func (p *c256Point) c256ScalarMult(scalar []uint64) {
+	// precomp is a table of precomputed points that stores the multiples of
+	// p from 1*p to 16*p; slot k-1 holds k*p.
+	var precomp [16 * 4 * 3]uint64
+	var t0, t1, t2, t3 c256Point
+
+	// Prepare the table. The trailing comments give the multiple stored.
+	p.c256StorePoint(&precomp, 0) // 1
+
+	c256PointDoubleAsm(t0.xyz[:], p.xyz[:])
+	c256PointDoubleAsm(t1.xyz[:], t0.xyz[:])
+	c256PointDoubleAsm(t2.xyz[:], t1.xyz[:])
+	c256PointDoubleAsm(t3.xyz[:], t2.xyz[:])
+	t0.c256StorePoint(&precomp, 1)  // 2
+	t1.c256StorePoint(&precomp, 3)  // 4
+	t2.c256StorePoint(&precomp, 7)  // 8
+	t3.c256StorePoint(&precomp, 15) // 16
+
+	c256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
+	c256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
+	c256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
+	t0.c256StorePoint(&precomp, 2) // 3
+	t1.c256StorePoint(&precomp, 4) // 5
+	t2.c256StorePoint(&precomp, 8) // 9
+
+	c256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
+	c256PointDoubleAsm(t1.xyz[:], t1.xyz[:])
+	t0.c256StorePoint(&precomp, 5) // 6
+	t1.c256StorePoint(&precomp, 9) // 10
+
+	c256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:])
+	c256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
+	t2.c256StorePoint(&precomp, 6)  // 7
+	t1.c256StorePoint(&precomp, 10) // 11
+
+	c256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
+	c256PointDoubleAsm(t2.xyz[:], t2.xyz[:])
+	t0.c256StorePoint(&precomp, 11) // 12
+	t2.c256StorePoint(&precomp, 13) // 14
+
+	c256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
+	c256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
+	t0.c256StorePoint(&precomp, 12) // 13
+	t2.c256StorePoint(&precomp, 14) // 15
+
+	// Start scanning the window from top bit
+	index := uint(254)
+	var sel, sign int
+
+	wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
+	sel, _ = boothW5(uint(wvalue))
+
+	c256Select(p.xyz[0:12], precomp[0:], sel)
+	// zero stays 0 for as long as every digit seen so far was zero.
+	zero := sel
+
+	for index > 4 {
+		index -= 5
+		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+		c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+
+		// Extract the 6 bits starting at bit index. Below bit 192 the window
+		// may straddle two limbs; above it only the top limb is read.
+		if index < 192 {
+			wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
+		} else {
+			wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
+		}
+
+		sel, sign = boothW5(uint(wvalue))
+
+		c256Select(t0.xyz[0:], precomp[0:], sel)
+		c256NegCond(t0.xyz[4:8], sign)
+		c256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
+		// A zero digit (sel == 0) keeps p; otherwise the sum is used.
+		c256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
+		// While no nonzero digit has been seen (zero == 0), p takes the
+		// selected point itself instead of the sum.
+		c256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
+		zero |= sel
+	}
+
+	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+	c256PointDoubleAsm(p.xyz[:], p.xyz[:])
+
+	// Last window: a zero bit is appended below bit 0 of the scalar.
+	wvalue = (scalar[0] << 1) & 0x3f
+	sel, sign = boothW5(uint(wvalue))
+
+	c256Select(t0.xyz[0:], precomp[0:], sel)
+	c256NegCond(t0.xyz[4:8], sign)
+	c256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
+	c256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
+	c256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
+}
@@ -0,0 +1,112 @@
+//go:build (amd64 || arm64) && !generic && !generic32 && !generic64
+// +build amd64 arm64
+// +build !generic
+// +build !generic32
+// +build !generic64
+
+package ec256
+
+import (
+	"fmt"
+	"math/big"
+	"testing"
+	"time"
+)
+
+// pointFromBig returns the affine point (x, y) as a 12-limb Jacobian point in
+// the Montgomery domain, with Z set to the Montgomery form of one.
+func pointFromBig(x, y *big.Int) []uint64 {
+	point := make([]uint64, 12)
+	px, py := point[0:4], point[4:8]
+
+	fromBig(px, maybeReduceModP(x))
+	fromBig(py, maybeReduceModP(y))
+	c256Mul(px, px, rr[:])
+	c256Mul(py, py, rr[:])
+
+	point[8], point[9], point[10], point[11] = montOne0, montOne1, montOne2, montOne3
+	return point
+}
+
+// func TestC256AddSpeed(t *testing.T) {
+// 	a := []uint64{0x715A4589334C74C7, 0x8FE30BBFF2660BE1, 0x5F9904466A39C994, 0x32C4AE2C1F198119}
+// 	b := []uint64{0x715A4589334C74C7, 0x8FE30BBFF2660BE1, 0x5F9904466A39C994, 0x32C4AE2C1F198119}
+// 	res := make([]uint64, 4)
+// 	begin := time.Now()
+// 	total := 1000000000
+// 	for i := 0; i < total; i++ {
+// 		c256Add(res, a, b)
+// 	}
+// 	elaspe := time.Since(begin)
+// 	fmt.Println("time: ", elaspe.Milliseconds(), "ms")
+// 	fmt.Println(int(float64(total) / float64(elaspe.Milliseconds()) * 1000))
+// }
+
+func TestC256SqrSpeed(t *testing.T) {
+	a := []uint64{0x715A4589334C74C7, 0x8FE30BBFF2660BE1, 0x5F9904466A39C994, 0x32C4AE2C1F198119}
+	res := make([]uint64, 4)
+	begin := time.Now()
+	total := 100000000
+	for i := 0; i < total; i++ {
+		c256Sqr(res, a, 1)
+		// c256Sqr(res, res, 1)
+		// c256Sqr(res, res, 1)
+		// c256Sqr(res, res, 1)
+	}
+	elaspe := time.Since(begin)
+	fmt.Println("time: ", elaspe.Milliseconds(), "ms")
+	fmt.Println(int(float64(total) / float64(elaspe.Milliseconds()) * 1000))
+}
+
+func TestC256MulSpeed(t *testing.T) {
+	a := []uint64{0x715A4589334C74C7, 0x8FE30BBFF2660BE1, 0x5F9904466A39C994, 0x32C4AE2C1F198119}
+	b := []uint64{0x715A4589334C74C6, 0x8FE30BBFF2660BE1, 0x5F9904466A39C994, 0x32C4AE2C1F198119}
+	res := make([]uint64, 4)
+	total := 100000000
+	begin := time.Now()
+	for i := 0; i < total; i++ {
+		c256Mul(res, a, b)
+	}
+	elaspe := time.Since(begin)
+	fmt.Println("time: ", elaspe.Milliseconds(), "ms")
+	fmt.Println(int(float64(total) / float64(elaspe.Milliseconds()) * 1000))
+}
+
+// TestC256PointAddAsmSpeed times 10,000,000 additions of G and 2G with
+// c256PointAddAsm and prints the elapsed milliseconds and the derived
+// operations per second. It makes no assertions.
+func TestC256PointAddAsmSpeed(t *testing.T) {
+	const total = 10000000
+	g := pointFromBig(c256.Gx, c256.Gy)
+	twoGx, twoGy := c256.ScalarMult(c256.Gx, c256.Gy, big.NewInt(2).Bytes())
+	twoG := pointFromBig(twoGx, twoGy)
+	var sum [12]uint64
+
+	start := time.Now()
+	for i := 0; i < total; i++ {
+		c256PointAddAsm(sum[:], g, twoG)
+	}
+	elapsed := time.Since(start)
+
+	fmt.Println("time: ", elapsed.Milliseconds(), "ms")
+	fmt.Println(int(float64(total) / float64(elapsed.Milliseconds()) * 1000))
+}
+
+// TestC256PointDoubleAsmSpeed times 10,000,000 doublings of G with
+// c256PointDoubleAsm and prints the elapsed milliseconds and the derived
+// operations per second. It makes no assertions.
+func TestC256PointDoubleAsmSpeed(t *testing.T) {
+	const total = 10000000
+	g := pointFromBig(c256.Gx, c256.Gy)
+	var doubled [12]uint64
+
+	start := time.Now()
+	for i := 0; i < total; i++ {
+		c256PointDoubleAsm(doubled[:], g)
+	}
+	elapsed := time.Since(start)
+
+	fmt.Println("time: ", elapsed.Milliseconds(), "ms")
+	fmt.Println(int(float64(total) / float64(elapsed.Milliseconds()) * 1000))
+}
+
+func TestC256InvSpeed(t *testing.T) {
+	in := []uint64{34235, 23341, 3444, 55555}
+	out := make([]uint64, 4)
+	begin := time.Now()
+	total := 1000000
+	for i := 0; i < total; i++ {
+		c256Inverse(out, in)
+	}
+	elaspe := time.Since(begin)
+	fmt.Println("time: ", elaspe.Milliseconds(), "ms")
+	fmt.Println(float64(total) / float64(elaspe.Milliseconds()) * 1000)
+}
@@ -0,0 +1,941 @@
+//go:build (amd64 || arm64) && !generic && !generic32 && !generic64
+// +build amd64 arm64
+// +build !generic
+// +build !generic32
+// +build !generic64
+
+package ec256
+
+import (
+	"crypto/rand"
+	"encoding/binary"
+	"fmt"
+	"math/big"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+// r^{-1} mod p
+var rModInverseP *big.Int
+var rModInverseN *big.Int
+var p *big.Int
+var n *big.Int
+
+// init precomputes the constants used by the big.Int reference
+// implementations in this file: the inverse of 2^256 modulo c256.P and modulo
+// c256.N, plus private copies of both moduli.
+func init() {
+	r := new(big.Int).Lsh(big.NewInt(1), 256)
+
+	rModInverseP = new(big.Int).ModInverse(r, c256.P)
+	rModInverseN = new(big.Int).ModInverse(r, c256.N)
+
+	p = new(big.Int).Set(c256.P)
+	n = new(big.Int).Set(c256.N)
+}
+
+func randUint64(a []uint64) {
+	buf := make([]byte, 8)
+	for i := range a {
+		rand.Read(buf)
+		a[i] = binary.LittleEndian.Uint64(buf)
+	}
+}
+
+// assertEqual panics unless a and b hold equal values of the same supported
+// type. Supported types are *big.Int (compared with Cmp) and []uint64
+// (compared by length and then element by element).
+//
+// A b whose dynamic type differs from a's, or a []uint64 pair of different
+// lengths, counts as "not equal". Any other type for a panics with
+// "unknown type".
+func assertEqual(a, b interface{}) {
+	switch want := a.(type) {
+	case *big.Int:
+		got, ok := b.(*big.Int)
+		if !ok || want.Cmp(got) != 0 {
+			panic("assert equal failed")
+		}
+	case []uint64:
+		got, ok := b.([]uint64)
+		// Without the length check a longer b would compare equal on its
+		// prefix and a shorter b would fail with an index-out-of-range panic.
+		if !ok || len(want) != len(got) {
+			panic("assert equal failed")
+		}
+		for i := range want {
+			if want[i] != got[i] {
+				panic("assert equal failed")
+			}
+		}
+	default:
+		panic("unknown type")
+	}
+}
+
+func print(a []uint64) {
+	for _, x := range a {
+		fmt.Printf("%016x ", x)
+	}
+	fmt.Println("")
+}
+
+// toBig interprets in as a little-endian sequence of 64-bit limbs (in[0] is
+// the least significant) and returns the value as a new big.Int. An empty
+// slice yields zero.
+func toBig(in []uint64) *big.Int {
+	acc, limb := new(big.Int), new(big.Int)
+	for idx := len(in); idx > 0; idx-- {
+		acc.Lsh(acc, 64)
+		acc.Add(acc, limb.SetUint64(in[idx-1]))
+	}
+	return acc
+}
+
+// The ...OfGo helpers below are math/big reference implementations that the
+// tests compare the optimized routines against.
+
+// c256MulOfGo stores in1 * in2 * rModInverseP mod p into res, i.e. a
+// Montgomery multiplication modulo the field prime with R = 2^256.
+func c256MulOfGo(res, in1, in2 []uint64) {
+	prod := new(big.Int).Mul(toBig(in1), toBig(in2))
+	prod.Mul(prod, rModInverseP)
+	fromBig(res, prod.Mod(prod, p))
+}
+
+// c256SqrOfGo copies in to res and then replaces res with its Montgomery
+// square (via c256MulOfGo) n times. For n <= 0 res is just the copy of in.
+func c256SqrOfGo(res, in []uint64, n int) {
+	copy(res, in)
+	for remaining := n; remaining > 0; remaining-- {
+		c256MulOfGo(res, res, res)
+	}
+}
+
+// c256FromMontOfGo stores in * rModInverseP mod p into res, which is a
+// Montgomery multiplication by 1.
+func c256FromMontOfGo(res, in []uint64) {
+	v := toBig(in)
+	v.Mul(v, rModInverseP)
+	fromBig(res, v.Mod(v, p))
+}
+
+// c256NegCondOfGo replaces val with (p - val) mod p when cond is exactly 1
+// and leaves val untouched for every other cond.
+func c256NegCondOfGo(val []uint64, cond int) {
+	if cond != 1 {
+		return
+	}
+	neg := new(big.Int).Sub(p, toBig(val))
+	fromBig(val, neg.Mod(neg, p))
+}
+
+// c256OrdMulOfGo stores in1 * in2 * rModInverseN mod n into res, i.e. a
+// Montgomery multiplication modulo the group order with R = 2^256.
+func c256OrdMulOfGo(res, in1, in2 []uint64) {
+	prod := new(big.Int).Mul(toBig(in1), toBig(in2))
+	prod.Mul(prod, rModInverseN)
+	fromBig(res, prod.Mod(prod, n))
+}
+
+// c256OrdSqrOfGo copies in to res and then replaces res with its Montgomery
+// square modulo the group order (via c256OrdMulOfGo) n times.
+func c256OrdSqrOfGo(res, in []uint64, n int) {
+	copy(res, in)
+	for remaining := n; remaining > 0; remaining-- {
+		c256OrdMulOfGo(res, res, res)
+	}
+}
+
+// c256MulPOfGo computes the key step of Montgomery multiplication,
+// res = in + p * in[0], without any reduction.
+// The original note gives the sizes as res: 5 limbs, in: 4 limbs.
+func c256MulPOfGo(res, in []uint64) {
+	sum := new(big.Int).SetUint64(in[0])
+	sum.Mul(sum, p)
+	sum.Add(sum, toBig(in))
+	fromBig(res, sum)
+}
+
+// montReduceOfGo stores (a + p * a[0]) >> 64 into res: one word-sized
+// reduction step computed with math/big.
+func montReduceOfGo(res, a []uint64) {
+	acc := new(big.Int).SetUint64(a[0])
+	acc.Mul(acc, p)
+	acc.Add(acc, toBig(a))
+	fromBig(res, acc.Rsh(acc, 64))
+}
+
+// randomPoint returns the affine coordinates of k*G for a scalar k drawn
+// uniformly from [0, c256.N) with crypto/rand.
+//
+// It panics if the random source fails, instead of dereferencing the nil
+// scalar that rand.Int returns alongside an error.
+func randomPoint() (*big.Int, *big.Int) {
+	k, err := rand.Int(rand.Reader, c256.N)
+	if err != nil {
+		panic("randomPoint: generating scalar: " + err.Error())
+	}
+	return c256.ScalarMult(c256.Gx, c256.Gy, k.Bytes())
+}
+
+//	func TestMontReduceOfGo(t *testing.T) {
+//		res1, res2, in1 := make([]uint64, 4), make([]uint64, 4), make([]uint64, 4)
+//		_ = res1
+//		for i := 0; i < 100000000; i++ {
+//			randUint64(in1)
+//			// in1 = []uint64{1, 1, 1, 1}
+//			montReduceOfGo(res1, in1)
+//			// montReduce(res2, in1)
+//			// print(res1)
+//			// print(res2)
+//			// assertEqualUint(res1, res2, "")
+//		}
+//	}
+
+// BenchmarkUint64IsZero measures scalarIsZero on a fixed non-zero 4-limb
+// scalar.
+func BenchmarkUint64IsZero(b *testing.B) {
+	limbs := []uint64{1, 2, 3, 4}
+	for iter := b.N; iter > 0; iter-- {
+		scalarIsZero(limbs)
+	}
+}
+func TestC256Mul(t *testing.T) {
+	for i := 0; i < 1000000; i++ {
+		// for {
+		res1, res2, in1, in2 := make([]uint64, 4), make([]uint64, 4), make([]uint64, 4), make([]uint64, 4)
+		randUint64(in1)
+		randUint64(in2)
+
+		c256MulOfGo(res1, in1, in2)
+		c256Mul(res2, in1, in2)
+		assertEqual(res1, res2)
+	}
+}
+
+// BenchmarkC256Mul measures c256Mul on one random operand pair.
+//
+// Recorded results.
+// Using p256:
+// BenchmarkC256Mul-10    	82318298	        14.51 ns/op	       0 B/op	       0 allocs/op
+// After the change to stop using NIST p256:
+// BenchmarkC256Mul-10    	87902702	        13.60 ns/op	       0 B/op	       0 allocs/op
+func BenchmarkC256Mul(b *testing.B) {
+	out := make([]uint64, 4)
+	x, y := make([]uint64, 4), make([]uint64, 4)
+	randUint64(x)
+	randUint64(y)
+	b.ResetTimer()
+	for iter := b.N; iter > 0; iter-- {
+		c256Mul(out, x, y)
+	}
+}
+
+// TestC256SqrBasic checks c256Sqr (one squaring) on three hand-picked inputs:
+// zero, p+1 and 2^256-1.
+func TestC256SqrBasic(t *testing.T) {
+	// Case 1: the all-zero input must square to zero.
+	res, zero, in := make([]uint64, 4), make([]uint64, 4), make([]uint64, 4)
+	c256Sqr(res, in, 1)
+	assertEqual(res, zero)
+
+	// Case 2: in = p+1 (congruent to 1 mod p); the expected result is
+	// rModInverseP, the inverse of 2^256 modulo p.
+	pplus1 := new(big.Int).Add(p, big.NewInt(1))
+	fromBig(in, pplus1)
+	c256Sqr(res, in, 1)
+	rInv := toBig(res)
+	assertEqual(rInv, rModInverseP)
+
+	// Case 3: in = 2^256-1 (all bits set); the expected value is computed
+	// with math/big as in*in*rModInverseP mod p.
+	f32 := new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 256), big.NewInt(1))
+	fromBig(in, f32)
+	c256Sqr(res, in, 1)
+	f32.Mul(f32, f32)
+	f32.Mul(f32, rModInverseP)
+	f32.Mod(f32, p)
+	res2 := make([]uint64, 4)
+	fromBig(res2, f32)
+	assertEqual(res, res2)
+}
+
+func TestC256Sqr(t *testing.T) {
+	for n := 1; n < 10; n++ {
+		for i := 0; i < 100000; i++ {
+			res1, res2, in := make([]uint64, 4), make([]uint64, 4), make([]uint64, 4)
+			randUint64(in)
+			c256SqrOfGo(res1, in, n)
+			c256Sqr(res2, in, n)
+			assertEqual(res1, res2)
+		}
+	}
+}
+
+// BenchmarkC256Sqr measures a single c256Sqr squaring of one random input.
+//
+// Recorded results.
+// Using p256:
+// BenchmarkC256Sqr-10    	93287706	        12.84 ns/op	       0 B/op	       0 allocs/op
+// After the change to stop using NIST p256:
+// BenchmarkC256Sqr-10    	87514056	        11.72 ns/op	       0 B/op	       0 allocs/op
+func BenchmarkC256Sqr(b *testing.B) {
+	out, src := make([]uint64, 4), make([]uint64, 4)
+	randUint64(src)
+	b.ResetTimer()
+	for iter := b.N; iter > 0; iter-- {
+		c256Sqr(out, src, 1)
+	}
+}
+
+// TestNegCond checks c256NegCond against the math/big reference
+// c256NegCondOfGo on one random value, first with cond=1 and then with cond=0.
+func TestNegCond(t *testing.T) {
+	res1, res2, in1 := make([]uint64, 4), make([]uint64, 4), make([]uint64, 4)
+	randUint64(in1)
+	// Both routines work in place, so each gets its own copy of the input.
+	copy(res1, in1)
+	copy(res2, in1)
+
+	c256NegCondOfGo(res1, 1)
+	c256NegCond(res2, 1)
+	assertEqual(res1, res2)
+	// The cond=0 calls run on the already-negated values from the step above.
+	c256NegCondOfGo(res1, 0)
+	c256NegCond(res2, 0)
+	assertEqual(res1, res2)
+}
+
+// TestMovCond checks the selection behaviour of c256MovCond: the assertions
+// expect res to equal in1 for a non-zero cond (1 and 12345 are tried) and in2
+// for cond == 0.
+func TestMovCond(t *testing.T) {
+	res, in1, in2 := make([]uint64, 4), make([]uint64, 4), make([]uint64, 4)
+	randUint64(in1)
+	randUint64(in2)
+
+	c256MovCond(res, in1, in2, 1)
+	assertEqual(res, in1)
+
+	c256MovCond(res, in1, in2, 0)
+	assertEqual(res, in2)
+
+	// Any non-zero cond, not just 1, is expected to select in1.
+	c256MovCond(res, in1, in2, 12345)
+	assertEqual(res, in1)
+}
+
+func TestFromMont(t *testing.T) {
+	for i := 0; i < 1000000; i++ {
+		res1, res2, in1 := make([]uint64, 4), make([]uint64, 4), make([]uint64, 4)
+		randUint64(in1)
+		c256FromMontOfGo(res1, in1)
+		c256FromMont(res2, in1)
+		assertEqual(res1, res2)
+	}
+}
+
+func TestOrdMul(t *testing.T) {
+	for i := 0; i < 100000; i++ {
+		res1, res2, in1, in2 := make([]uint64, 4), make([]uint64, 4), make([]uint64, 4), make([]uint64, 4)
+		randUint64(in1)
+		randUint64(in2)
+
+		c256OrdMulOfGo(res1, in1, in2)
+		c256OrdMul(res2, in1, in2)
+		assertEqual(res1, res2)
+	}
+
+}
+
+// TestOrdSqr checks c256OrdSqr against the math/big reference c256OrdSqrOfGo
+// for repetition counts 1 through 9, with 10,000 random inputs per count.
+func TestOrdSqr(t *testing.T) {
+	for k := 1; k < 10; k++ {
+		for i := 0; i < 10000; i++ {
+			// Squaring takes a single operand; the second random operand the
+			// test used to allocate and fill was never read.
+			res1, res2, in := make([]uint64, 4), make([]uint64, 4), make([]uint64, 4)
+			randUint64(in)
+
+			c256OrdSqrOfGo(res1, in, k)
+			c256OrdSqr(res2, in, k)
+			assertEqual(res1, res2)
+		}
+	}
+}
+
+// TestOrdInverse checks c256.Inverse against big.Int.ModInverse modulo
+// c256.N on 1,000 random scalars. The two inverses are not compared
+// directly: each is multiplied by k and reduced mod N, and the products are
+// compared.
+func TestOrdInverse(t *testing.T) {
+	for i := 0; i < 1000; i++ {
+		k, _ := rand.Int(rand.Reader, c256.N)
+		res1 := c256.Inverse(k)
+		res2 := new(big.Int)
+		res2.ModInverse(k, c256.N)
+
+		// k * k^-1 mod N, once per implementation.
+		res1.Mul(res1, k)
+		res1.Mod(res1, c256.N)
+		res2.Mul(res2, k)
+		res2.Mod(res2, c256.N)
+
+		assertEqual(res1, res2)
+	}
+}
+
+// TestC256Inverse checks c256Inverse against a math/big reference on 1,000
+// random inputs.
+func TestC256Inverse(t *testing.T) {
+	for i := 0; i < 1000; i++ {
+		res1, res2, in1, in2 := make([]uint64, 4), make([]uint64, 4), make([]uint64, 4), make([]uint64, 4)
+		randUint64(in1)
+		// c256Inverse gets its own copy of the input.
+		copy(in2, in1)
+
+		int1 := toBig(in1)
+
+		// Reference value: in^-1 * 2^512 mod P.
+		// NOTE(review): presumably this is the Montgomery-form inverse of a
+		// Montgomery-form input with R = 2^256 - confirm.
+		int1.ModInverse(int1, c256.P)
+		int1.Lsh(int1, 256*2)
+		int1.Mod(int1, c256.P)
+		fromBig(res1, int1)
+
+		c256Inverse(res2, in2)
+		assertEqual(res1, res2)
+	}
+}
+
+// TestPointAddAffineAsmG checks that c256PointAddAffineAsm computes
+// G + 2G = 3G, using c256.CurveParams.Add as the reference.
+//
+// The previous version converted a point built from the expected (x3, y3)
+// back to affine and compared it with (x3, y3), so the output of
+// c256PointAddAffineAsm was never inspected. The assertions now read the
+// routine's actual result.
+func TestPointAddAffineAsmG(t *testing.T) {
+	var g1, g2, g c256Point
+	x1 := new(big.Int).Set(c256.Gx)
+	y1 := new(big.Int).Set(c256.Gy)
+	x2, y2 := c256.CurveParams.Add(x1, y1, x1, y1) // 2G
+	x3, y3 := c256.CurveParams.Add(x1, y1, x2, y2) // 3G, the expected sum
+
+	g1.c256PointFromAffine(c256.Gx, c256.Gy)
+	g2.c256PointFromAffine(x2, y2)
+
+	// Same (sign, sel, zero) = (0, 1, 1) arguments as the plain-addition case
+	// in TestPointAddAffineAsm.
+	c256PointAddAffineAsm(g.xyz[:], g1.xyz[:], g2.xyz[:], 0, 1, 1)
+	x, y := g.c256PointToAffine()
+	assertEqual(x3, x)
+	assertEqual(y3, y)
+}
+
+// TestPointAddAffineAsm checks c256PointAddAffineAsm against
+// c256.CurveParams.Add in three parts: two cases with a fixed second point
+// given as hex constants, then 10,000 random point pairs.
+//
+// Judging by the expected values, passing sign=1 makes the routine add the
+// negation of the second point: every sign=1 call below is given a point
+// whose y was replaced by p-y and is expected to return the same sum as the
+// un-negated point would with sign=0.
+// NOTE(review): confirm the sign/sel/zero semantics against the assembly.
+func TestPointAddAffineAsm(t *testing.T) {
+	var p1, p2 c256Point
+	// Case 1: fixed second point whose y limbs are overwritten by hand.
+	{
+		x1, y1 := randomPoint()
+		x2 := bigFromBase16("4071bba1f6624b6e9ac69b7109db9cac04e5bba76fdc954ebe375dfb2af6df2a")
+		y2 := bigFromBase16("fffffffb00000005fffffffc00000002fffffffd00000006fffffff900000004")
+		// The reference sum is taken with the negated second point (x2, p-y2).
+		y2.Sub(p, y2)
+		x3, y3 := c256.CurveParams.Add(x1, y1, x2, y2)
+
+		// Undo the negation before loading the point.
+		y2.Sub(p, y2)
+		p1.c256PointFromAffine(x1, y1)
+		p2.c256PointFromAffine(x2, y2)
+		// p2.y = 1, set to p+1
+		// NOTE(review): the four limbs below replace p2's y coordinate
+		// (xyz[4..7]) with a hand-written constant; presumably a non-canonical
+		// encoding of the same value - confirm.
+		p2.xyz[4] = 0
+		p2.xyz[5] = 0xffffffff00000001
+		p2.xyz[6] = 0xffffffffffffffff
+		p2.xyz[7] = 0xfffffffeffffffff
+
+		c256PointAddAffineAsm(p1.xyz[:], p1.xyz[:], p2.xyz[:], 1, 1, 1)
+		x4, y4 := p1.c256PointToAffine()
+
+		assertEqual(x3, x4)
+		assertEqual(y3, y4)
+	}
+
+	// Case 2: same fixed point; the reference uses (x2, y2) while the routine
+	// receives (x2, p-y2) together with sign=1.
+	{
+		x1, y1 := randomPoint()
+		x2 := bigFromBase16("4071bba1f6624b6e9ac69b7109db9cac04e5bba76fdc954ebe375dfb2af6df2a")
+		y2 := bigFromBase16("fffffffb00000005fffffffc00000002fffffffd00000006fffffff900000004")
+		x3, y3 := c256.CurveParams.Add(x1, y1, x2, y2)
+		y2.Sub(p, y2)
+		p1.c256PointFromAffine(x1, y1)
+		p2.c256PointFromAffine(x2, y2)
+
+		c256PointAddAffineAsm(p1.xyz[:], p1.xyz[:], p2.xyz[:], 1, 1, 1)
+		x4, y4 := p1.c256PointToAffine()
+
+		assertEqual(x3, x4)
+		assertEqual(y3, y4)
+	}
+
+	// Case 3: random pairs, each checked with sign=0 and then with the second
+	// point negated and sign=1.
+	for i := 0; i < 10000; i++ {
+		x1, y1 := randomPoint()
+		x2, y2 := randomPoint()
+		x3, y3 := c256.CurveParams.Add(x1, y1, x2, y2)
+
+		p1.c256PointFromAffine(x1, y1)
+		p2.c256PointFromAffine(x2, y2)
+
+		c256PointAddAffineAsm(p1.xyz[:], p1.xyz[:], p2.xyz[:], 0, 1, 1)
+		x4, y4 := p1.c256PointToAffine()
+
+		assertEqual(x3, x4)
+		assertEqual(y3, y4)
+
+		// p1 was overwritten with the sum above, so both points are reloaded.
+		y2.Sub(p, y2)
+		p1.c256PointFromAffine(x1, y1)
+		p2.c256PointFromAffine(x2, y2)
+
+		c256PointAddAffineAsm(p1.xyz[:], p1.xyz[:], p2.xyz[:], 1, 1, 1)
+		x4, y4 = p1.c256PointToAffine()
+
+		assertEqual(x3, x4)
+		assertEqual(y3, y4)
+	}
+
+}
+
+// BenchmarkPointAddAffineAsm measures c256PointAddAffineAsm on two random
+// points with (sign, sel, zero) = (1, 1, 1).
+func BenchmarkPointAddAffineAsm(b *testing.B) {
+	var sum, lhs, rhs c256Point
+	lx, ly := randomPoint()
+	rx, ry := randomPoint()
+
+	lhs.c256PointFromAffine(lx, ly)
+	rhs.c256PointFromAffine(rx, ry)
+	b.ResetTimer()
+	for iter := b.N; iter > 0; iter-- {
+		c256PointAddAffineAsm(sum.xyz[:], lhs.xyz[:], rhs.xyz[:], 1, 1, 1)
+	}
+}
+
+// TestPointAddAffineAsmSpeed times 100,000 c256PointAddAffineAsm calls on two
+// random points and prints the elapsed milliseconds and calls per second.
+func TestPointAddAffineAsmSpeed(t *testing.T) {
+	const total = 100000
+	var sum, lhs, rhs c256Point
+	lx, ly := randomPoint()
+	rx, ry := randomPoint()
+
+	lhs.c256PointFromAffine(lx, ly)
+	rhs.c256PointFromAffine(rx, ry)
+	start := time.Now()
+	for iter := 0; iter < total; iter++ {
+		c256PointAddAffineAsm(sum.xyz[:], lhs.xyz[:], rhs.xyz[:], 1, 1, 1)
+	}
+	elapsed := time.Since(start)
+	fmt.Println("time: ", elapsed.Milliseconds(), "ms")
+	fmt.Println(float64(total) / float64(elapsed.Milliseconds()) * 1000)
+}
+
+// TestPointAddAsm checks c256PointAddAsm in two ways: P + (-P) must convert
+// to the affine pair (0, 0), and 1,000 random sums must match
+// c256.CurveParams.Add.
+func TestPointAddAsm(t *testing.T) {
+	var res, p1, p2 c256Point
+	// (x2, y2) = (x1, p - y1), i.e. the negation of (x1, y1).
+	x1, y1 := randomPoint()
+	x2 := new(big.Int).Set(x1)
+	y2 := new(big.Int).Set(y1)
+	y2.Sub(p, y2)
+
+	p1.c256PointFromAffine(x1, y1)
+	p2.c256PointFromAffine(x2, y2)
+	c256PointAddAsm(res.xyz[:], p1.xyz[:], p2.xyz[:])
+	x, y := res.c256PointToAffine()
+	assertEqual(x, big.NewInt(0))
+	assertEqual(y, big.NewInt(0))
+
+	for i := 0; i < 1000; i++ {
+
+		k1, _ := rand.Int(rand.Reader, c256.N)
+		k2, _ := rand.Int(rand.Reader, c256.N)
+
+		// Reference points and their sum come from the generic CurveParams code.
+		x1, y1 := c256.CurveParams.ScalarMult(c256.Gx, c256.Gy, k1.Bytes())
+		x2, y2 := c256.CurveParams.ScalarMult(c256.Gx, c256.Gy, k2.Bytes())
+		x3, y3 := c256.CurveParams.Add(x1, y1, x2, y2)
+
+		p1.c256PointFromAffine(x1, y1)
+		p2.c256PointFromAffine(x2, y2)
+		c256PointAddAsm(res.xyz[:], p1.xyz[:], p2.xyz[:])
+
+		x4, y4 := res.c256PointToAffine()
+
+		assertEqual(x3, x4)
+		assertEqual(y3, y4)
+	}
+}
+
+// TestPointDoubleAsm checks c256PointDoubleAsm against
+// c256.CurveParams.Double on 1,000 random multiples of G.
+func TestPointDoubleAsm(t *testing.T) {
+	for i := 0; i < 1000; i++ {
+		// Only the input and the result are needed; an extra point built from
+		// the expected coordinates was previously populated but never read.
+		var res, p1 c256Point
+		k1, _ := rand.Int(rand.Reader, c256.N)
+
+		x1, y1 := c256.CurveParams.ScalarMult(c256.Gx, c256.Gy, k1.Bytes())
+		x3, y3 := c256.CurveParams.Double(x1, y1)
+
+		p1.c256PointFromAffine(x1, y1)
+		c256PointDoubleAsm(res.xyz[:], p1.xyz[:])
+
+		x4, y4 := res.c256PointToAffine()
+
+		assertEqual(x3, x4)
+		assertEqual(y3, y4)
+	}
+}
+
+// Tests for the Curve interface.
+// TestIsOnCurve fails unless c256.IsOnCurve accepts the base point.
+func TestIsOnCurve(t *testing.T) {
+	if c256.IsOnCurve(c256.Gx, c256.Gy) {
+		return
+	}
+	t.Fail()
+}
+
+// TestPointAdd checks c256.Add against c256.CurveParams.Add for three kinds
+// of input: a point added to itself, a point added to its negation, and
+// 1,000 random pairs.
+func TestPointAdd(t *testing.T) {
+	// P + P.
+	x1, y1 := randomPoint()
+	x3, y3 := c256.CurveParams.Add(x1, y1, x1, y1)
+	x4, y4 := c256.Add(x1, y1, x1, y1)
+	assertEqual(x3, x4)
+	assertEqual(y3, y4)
+
+	// P + (-P), with -P = (x1, p - y1).
+	x2 := new(big.Int).Set(x1)
+	y2 := new(big.Int).Set(y1)
+	y2.Sub(p, y2)
+	x3, y3 = c256.CurveParams.Add(x1, y1, x2, y2)
+	x4, y4 = c256.Add(x1, y1, x2, y2)
+	assertEqual(x3, x4)
+	assertEqual(y3, y4)
+
+	// Random pairs k1*G + k2*G.
+	for i := 0; i < 1000; i++ {
+		k1, _ := rand.Int(rand.Reader, c256.N)
+		k2, _ := rand.Int(rand.Reader, c256.N)
+		x1, y1 := c256.CurveParams.ScalarMult(c256.Gx, c256.Gy, k1.Bytes())
+		x2, y2 := c256.CurveParams.ScalarMult(c256.Gx, c256.Gy, k2.Bytes())
+		x3, y3 := c256.CurveParams.Add(x1, y1, x2, y2)
+		x4, y4 := c256.Add(x1, y1, x2, y2)
+		assertEqual(x3, x4)
+		assertEqual(y3, y4)
+	}
+}
+
+// BenchmarkPointDouble measures c256.Double on one random point.
+//
+// Recorded results for c256.Double:
+// BenchmarkPointDouble-8   	  278118	      4096 ns/op	     192 B/op	       4 allocs/op
+// and for c256.CurveParams.Double:
+// BenchmarkPointDouble-8   	  186952	      6471 ns/op	    3961 B/op	      52 allocs/op
+func BenchmarkPointDouble(b *testing.B) {
+	px, py := randomPoint()
+	b.ResetTimer()
+	for iter := b.N; iter > 0; iter-- {
+		c256.Double(px, py)
+	}
+}
+// BenchmarkPointAdd measures c256.Add on two random points.
+//
+// Recorded results for c256.Add:
+// BenchmarkPointAdd-8   	  273103	      4229 ns/op	     192 B/op	       4 allocs/op
+// and for c256.CurveParams.Add:
+// BenchmarkPointAdd-8   	  175370	      7210 ns/op	    4881 B/op	      65 allocs/op
+func BenchmarkPointAdd(b *testing.B) {
+	lx, ly := randomPoint()
+	rx, ry := randomPoint()
+	b.ResetTimer()
+	for iter := b.N; iter > 0; iter-- {
+		c256.Add(lx, ly, rx, ry)
+	}
+}
+
+func TestPointDouble(t *testing.T) {
+	for i := 0; i < 10000; i++ {
+		k1, _ := rand.Int(rand.Reader, c256.N)
+
+		x1, y1 := c256.CurveParams.ScalarMult(c256.Gx, c256.Gy, k1.Bytes())
+
+		x3, y3 := c256.CurveParams.Double(x1, y1)
+		x4, y4 := c256.Double(x1, y1)
+
+		assertEqual(x3, x4)
+		assertEqual(y3, y4)
+
+	}
+}
+
+// TestScalarMult checks c256.ScalarMult: multiplying G by the group order N
+// must give the affine pair (0, 0), and 1,000 random scalars must agree with
+// c256.CurveParams.ScalarMult.
+func TestScalarMult(t *testing.T) {
+	// N*G is expected to be reported as (0, 0).
+	k := new(big.Int).Set(c256.N)
+	x, y := c256.ScalarMult(c256.Gx, c256.Gy, k.Bytes())
+	zero := big.NewInt(0)
+	assertEqual(x, zero)
+	assertEqual(y, zero)
+
+	for i := 0; i < 1000; i++ {
+		k, _ := rand.Int(rand.Reader, c256.N)
+
+		x1, y1 := c256.ScalarMult(c256.Gx, c256.Gy, k.Bytes())
+		x2, y2 := c256.CurveParams.ScalarMult(c256.Gx, c256.Gy, k.Bytes())
+
+		assertEqual(x1, x2)
+		assertEqual(y1, y2)
+	}
+}
+
+// TestScalarBaseMult checks c256.ScalarBaseMult: the scalar N+1 must give G
+// itself, and 1,000 random scalars must agree with
+// c256.CurveParams.ScalarBaseMult.
+func TestScalarBaseMult(t *testing.T) {
+	// (N+1)*G is expected to equal G, so the scalar is reduced modulo N.
+	k := new(big.Int).Add(c256.N, big.NewInt(1))
+	x1, y1 := c256.ScalarBaseMult(k.Bytes())
+	assertEqual(x1, c256.Gx)
+	assertEqual(y1, c256.Gy)
+
+	for i := 0; i < 1000; i++ {
+		k, _ := rand.Int(rand.Reader, c256.N)
+		x1, y1 := c256.ScalarBaseMult(k.Bytes())
+		x2, y2 := c256.CurveParams.ScalarBaseMult(k.Bytes())
+
+		assertEqual(x1, x2)
+		assertEqual(y1, y2)
+	}
+}
+
+func TestScalarMultSpeed(t *testing.T) {
+	k, _ := rand.Int(rand.Reader, c256.N)
+
+	x1, y1 := c256.ScalarMult(c256.Gx, c256.Gy, k.Bytes())
+
+	begin := time.Now()
+	total := 100000
+	for i := 0; i < total; i++ {
+		c256.ScalarMult(x1, y1, k.Bytes())
+	}
+	elaspe := time.Since(begin)
+	fmt.Println("time: ", elaspe.Milliseconds(), "ms")
+	fmt.Println(float64(total) / float64(elaspe.Milliseconds()) * 1000)
+}
+
+// BenchmarkScalarMultSpeed measures c256.ScalarMult on a random point with a
+// random scalar; the scalar's byte encoding is produced inside the timed loop.
+func BenchmarkScalarMultSpeed(b *testing.B) {
+	k, _ := rand.Int(rand.Reader, c256.N)
+
+	px, py := c256.ScalarMult(c256.Gx, c256.Gy, k.Bytes())
+	b.ResetTimer()
+	for iter := b.N; iter > 0; iter-- {
+		c256.ScalarMult(px, py, k.Bytes())
+	}
+}
+
+func TestScalarBaseMultSpeed(t *testing.T) {
+	k, _ := rand.Int(rand.Reader, c256.N)
+
+	begin := time.Now()
+	total := 100000
+	for i := 0; i < total; i++ {
+		c256.ScalarBaseMult(k.Bytes())
+	}
+	elaspe := time.Since(begin)
+	fmt.Println("time: ", elaspe.Milliseconds(), "ms")
+	fmt.Println(float64(total) / float64(elaspe.Milliseconds()) * 1000)
+}
+
+// BenchmarkScalarBaseMultSpeed measures c256.ScalarBaseMult with one random
+// scalar; the scalar's byte encoding is produced inside the timed loop.
+func BenchmarkScalarBaseMultSpeed(b *testing.B) {
+	scalar, _ := rand.Int(rand.Reader, c256.N)
+	b.ResetTimer()
+	for iter := b.N; iter > 0; iter-- {
+		c256.ScalarBaseMult(scalar.Bytes())
+	}
+}
+
+// BenchmarkCombineMult measures CombinedMult on a random point with two
+// independent random scalars.
+//
+// Recorded result:
+// BenchmarkCombineMult-8   	   17679	     64513 ns/op	     320 B/op	       6 allocs/op
+func BenchmarkCombineMult(b *testing.B) {
+	px, py := randomPoint()
+	k1, _ := rand.Int(rand.Reader, c256.N)
+	baseScalar := k1.Bytes()
+	k2, _ := rand.Int(rand.Reader, c256.N)
+	scalar := k2.Bytes()
+	b.ResetTimer()
+	for iter := b.N; iter > 0; iter-- {
+		CombinedMult(px, py, baseScalar, scalar)
+	}
+}
+
+// TestBoothW5 prints the (sel, sign) pair returned by boothW5 for every input
+// from 0 to 63. It makes no assertions.
+func TestBoothW5(t *testing.T) {
+	for in := uint(0); in < 64; in++ {
+		sel, sign := boothW5(in)
+		fmt.Println(in, "\t", sel, "\t", sign)
+	}
+}
+
+// TestBoothW6 calls boothW6 for every input from 0 to 127 and discards the
+// results. It makes no assertions.
+func TestBoothW6(t *testing.T) {
+	for in := uint(0); in < 128; in++ {
+		_, _ = boothW6(in)
+	}
+}
+
+// TestSelectBase is a smoke test: it calls c256SelectBase once, selecting
+// index 1 from the first precomputed table into the first 8 limbs of a point,
+// and makes no assertions about the result.
+func TestSelectBase(t *testing.T) {
+	var t0 c256Point
+	c256SelectBase(t0.xyz[0:8], c256Precomputed[0][0:], 1)
+
+}
+
+// TestSelect checks that c256Select with index 0 yields the all-zero point,
+// and that adding that point to G makes c256PointAddAsm return 1.
+func TestSelect(t *testing.T) {
+	var t0 c256Point
+	var precomp [16 * 4 * 3]uint64
+	// The base point: x limbs, y limbs, then z = 1 (little-endian limbs).
+	var p = c256Point{
+		xyz: [12]uint64{0x715A4589334C74C7,
+			0x8FE30BBFF2660BE1,
+			0x5F9904466A39C994,
+			0x32C4AE2C1F198119,
+			0x02DF32E52139F0A0,
+			0xD0A9877CC62A4740,
+			0x59BDCEE36B692153,
+			0xBC3736A2F4F6779C,
+			1,
+			0,
+			0,
+			0},
+	}
+	// NOTE(review): c256ToMont is given the whole 12-limb slice here, whereas
+	// TestBaseTable converts each 4-limb coordinate separately - confirm which
+	// is intended.
+	c256ToMont(p.xyz[:], p.xyz[:])
+	c256Select(t0.xyz[:], precomp[:], 0)
+	assertEqual(t0.xyz[:], make([]uint64, 12))
+	equal := c256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
+	// testify's signature is Equal(t, expected, actual); passing them in that
+	// order keeps the failure message's "expected"/"actual" labels correct.
+	assert.Equal(t, 1, equal)
+}
+
+/*
+TestBoothW5
+0 	 0 	 0
+1 	 1 	 0
+2 	 1 	 0
+3 	 2 	 0
+4 	 2 	 0
+5 	 3 	 0
+6 	 3 	 0
+7 	 4 	 0
+8 	 4 	 0
+9 	 5 	 0
+10 	 5 	 0
+11 	 6 	 0
+12 	 6 	 0
+13 	 7 	 0
+14 	 7 	 0
+15 	 8 	 0
+16 	 8 	 0
+17 	 9 	 0
+18 	 9 	 0
+19 	 10 	 0
+20 	 10 	 0
+21 	 11 	 0
+22 	 11 	 0
+23 	 12 	 0
+24 	 12 	 0
+25 	 13 	 0
+26 	 13 	 0
+27 	 14 	 0
+28 	 14 	 0
+29 	 15 	 0
+30 	 15 	 0
+31 	 16 	 0
+32 	 16 	 1
+33 	 15 	 1
+34 	 15 	 1
+35 	 14 	 1
+36 	 14 	 1
+37 	 13 	 1
+38 	 13 	 1
+39 	 12 	 1
+40 	 12 	 1
+41 	 11 	 1
+42 	 11 	 1
+43 	 10 	 1
+44 	 10 	 1
+45 	 9 	 1
+46 	 9 	 1
+47 	 8 	 1
+48 	 8 	 1
+49 	 7 	 1
+50 	 7 	 1
+51 	 6 	 1
+52 	 6 	 1
+53 	 5 	 1
+54 	 5 	 1
+55 	 4 	 1
+56 	 4 	 1
+57 	 3 	 1
+58 	 3 	 1
+59 	 2 	 1
+60 	 2 	 1
+61 	 1 	 1
+62 	 1 	 1
+63 	 0 	 1
+
+*/
+
+// TestPrintBaseMult is a disabled dump of c256Precomputed: the whole body is
+// guarded by "if false", so the test currently does nothing. When enabled it
+// prints, for each table i and each of its first 32 entries j, a
+// "[64^i * (j+1)]G" header followed by two lines of four hexadecimal limbs
+// (table[8*j .. 8*j+3] and table[8*j+4 .. 8*j+7]).
+func TestPrintBaseMult(t *testing.T) {
+	if false {
+		for i, table := range c256Precomputed {
+			for j := 0; j < 32; j++ {
+				fmt.Printf("\t// [64^%d * %2d]G\n", i, j+1)
+				fmt.Print("\t")
+				// First four limbs of entry j.
+				for k := 0; k < 4; k++ {
+					fmt.Printf("0x%016x", table[8*j+k])
+					if k < 3 {
+						fmt.Print(", ")
+					} else {
+						fmt.Println()
+					}
+				}
+				fmt.Print("\t")
+				// Last four limbs of entry j.
+				for k := 4; k < 8; k++ {
+					fmt.Printf("0x%016x", table[8*j+k])
+					if k < 7 {
+						fmt.Print(", ")
+					} else {
+						fmt.Println()
+					}
+				}
+			}
+			fmt.Println("\t//")
+		}
+	}
+}
+
+// writePoint appends the first eight limbs of p to sb as "0x%016x, " items,
+// with a newline after the fourth limb and another after the eighth. Before
+// printing, p (a local copy) is round-tripped through c256PointToAffine and
+// c256PointFromAffine.
+func writePoint(sb *strings.Builder, p c256Point) {
+	x, y := p.c256PointToAffine()
+	p.c256PointFromAffine(x, y)
+	for idx, limb := range p.xyz[:8] {
+		fmt.Fprintf(sb, "0x%016x, ", limb)
+		if idx == 3 {
+			sb.WriteByte('\n')
+		}
+	}
+	sb.WriteByte('\n')
+}
+
+// TestBaseTable is a generator rather than a check: it builds a text table of
+// multiples of the base point and writes it to a file.
+//
+// With window size N = 8 it runs 256/N rounds. Round i starts from a point P
+// (G in round 0) and emits 1*P, 2*P, ... up to 2^(N-1)*P; the last round
+// continues up to 2^N*P. The next round's P is twice the last point of the
+// current round. Every point is formatted by writePoint.
+//
+// NOTE(review): the output path is a hard-coded absolute path on the original
+// author's machine. A failure to write is logged instead of being silently
+// dropped, so the test still passes elsewhere but reports why no file appeared.
+func TestBaseTable(t *testing.T) {
+	var sb strings.Builder
+
+	const N = 8
+	// The base point: x limbs, y limbs, then z = 1 (little-endian limbs).
+	var G = c256Point{
+		xyz: [12]uint64{0x715A4589334C74C7,
+			0x8FE30BBFF2660BE1,
+			0x5F9904466A39C994,
+			0x32C4AE2C1F198119,
+			0x02DF32E52139F0A0,
+			0xD0A9877CC62A4740,
+			0x59BDCEE36B692153,
+			0xBC3736A2F4F6779C,
+			1,
+			0,
+			0,
+			0},
+	}
+	// Convert each coordinate with c256ToMont.
+	c256ToMont(G.xyz[:4], G.xyz[:4])
+	c256ToMont(G.xyz[4:], G.xyz[4:])
+	c256ToMont(G.xyz[8:], G.xyz[8:])
+
+	var P, Q c256Point
+	P = G
+	for i := 0; i < 256/N; i++ {
+		Q = P
+
+		// P
+		fmt.Fprintf(&sb, "// [%d^%d]G\n", 1<<N, i)
+		writePoint(&sb, Q)
+
+		// 2P
+		c256PointDoubleAsm(Q.xyz[:], Q.xyz[:])
+		fmt.Fprintf(&sb, "// [2 * %d^%d]G\n", 1<<N, i)
+		writePoint(&sb, Q)
+
+		for j := 3; j <= (1 << (N - 1)); j++ {
+			// jP
+			c256PointAddAsm(Q.xyz[:], Q.xyz[:], P.xyz[:])
+
+			fmt.Fprintf(&sb, "// [%d * %d^%d]G\n", j, 1<<N, i)
+			writePoint(&sb, Q)
+		}
+		// the last round
+		if i == 256/N-1 {
+			for j := (1 << (N - 1)) + 1; j <= (1 << N); j++ {
+				c256PointAddAsm(Q.xyz[:], Q.xyz[:], P.xyz[:])
+				fmt.Fprintf(&sb, "// [%d * %d^%d]G\n", j, 1<<N, i)
+				writePoint(&sb, Q)
+			}
+		}
+		// Next round's starting point is twice the last point written.
+		c256PointDoubleAsm(P.xyz[:], Q.xyz[:])
+	}
+
+	if err := os.WriteFile("/Users/fengwd/Files/Codes/go/src/xdx.jelly/xgcl/sm/sm2/tbl.txt", []byte(sb.String()), 0666); err != nil {
+		t.Logf("writing base table: %v", err)
+	}
+}
+
+// TestBaseTable3 prints the first eight limbs of one computed point: the
+// result of c256BaseMult on an all-ones 256-bit scalar, plus G, round-tripped
+// through affine form. It makes no assertions.
+func TestBaseTable3(t *testing.T) {
+	// The base point: x limbs, y limbs, then z = 1 (little-endian limbs).
+	var G = c256Point{
+		xyz: [12]uint64{0x715A4589334C74C7,
+			0x8FE30BBFF2660BE1,
+			0x5F9904466A39C994,
+			0x32C4AE2C1F198119,
+			0x02DF32E52139F0A0,
+			0xD0A9877CC62A4740,
+			0x59BDCEE36B692153,
+			0xBC3736A2F4F6779C,
+			1,
+			0,
+			0,
+			0},
+	}
+	// Convert each coordinate with c256ToMont.
+	c256ToMont(G.xyz[:4], G.xyz[:4])
+	c256ToMont(G.xyz[4:], G.xyz[4:])
+	c256ToMont(G.xyz[8:], G.xyz[8:])
+
+	// Every scalar bit is set.
+	// NOTE(review): presumably this yields (2^256-1)*G, so adding G gives
+	// 2^256*G - confirm against c256BaseMult.
+	scalarReversed := []uint64{0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF}
+	var r, P c256Point
+	P.c256BaseMult(scalarReversed)
+	c256PointAddAsm(r.xyz[:], P.xyz[:], G.xyz[:])
+	// Round-trip through affine form before printing.
+	x, y := r.c256PointToAffine()
+	r.c256PointFromAffine(x, y)
+
+	for k := 0; k < 8; k++ {
+		fmt.Printf("0x%016x, ", r.xyz[k])
+	}
+}
@@ -0,0 +1,44 @@
+// +build generic32
+
+package ec256
+
+import (
+	"fmt"
+	"math/big"
+	"testing"
+
+	"xdx.jelly/xgcl/gmath"
+)
+
+// TestGenCurvePrecompute8 prints precomputed base-point multiples for a
+// window of width 8.
+//
+// For each index i in [0, 256), with bits i[0]..i[7], the scalar is
+//
+//	k = i[0] + i[1]*2^32 + i[2]*2^64 + ... + i[7]*2^224
+//
+// and the x and y coordinates of k*G are appended to the table. Each
+// coordinate is then converted with c256FromBig and printed as one line of
+// nine hexadecimal limbs.
+func TestGenCurvePrecompute8(t *testing.T) {
+	table := make([]*big.Int, 0, 2*256)
+	for i := 0; i < 256; i++ {
+
+		// Horner evaluation from the top bit down: shift first, then add.
+		// Adding before shifting (as the code used to do) applies one shift
+		// too many and scales every term by an extra 2^32, contradicting the
+		// formula above.
+		k := new(big.Int)
+		for j := 7; j >= 0; j-- {
+			k.Lsh(k, 32)
+			if (i>>j)&1 != 0 {
+				k.Add(k, gmath.BigInt1)
+			}
+		}
+
+		x, y := c256.ScalarBaseMult(k.Bytes())
+		table = append(table, x)
+		table = append(table, y)
+
+	}
+
+	for _, x := range table {
+		var out [c256Limbs]uint32
+		c256FromBig(&out, x)
+		fmt.Printf("0x%08x,0x%08x,0x%08x,0x%08x,0x%08x,0x%08x,0x%08x,0x%08x,0x%08x\n",
+			out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7], out[8],
+		)
+	}
+}
@@ -0,0 +1,431 @@
+//go:build (!amd64 && !arm64) || generic32 || generic64
+// +build !amd64,!arm64 generic32 generic64
+
+// build when !amd64 AND !arm64 OR generic32 OR generic64
+
+package ec256
+
+import (
+	crand "crypto/rand"
+	"fmt"
+	"math/big"
+	"math/rand"
+	"testing"
+	"time"
+)
+
+// BenchmarkScalarMultc256 measures c256.ScalarBaseMult with a fixed scalar,
+// run from parallel goroutines via b.RunParallel, and reports allocations.
+func BenchmarkScalarMultc256(b *testing.B) {
+	priv, _ := new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
+	bb := priv.Bytes()
+	b.ReportAllocs()
+	// Reset after the setup so that parsing the scalar is excluded from the
+	// measurement. The timer is already running when the benchmark starts, so
+	// no StartTimer call is needed.
+	b.ResetTimer()
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			// c256.ScalarMult(c256.Gx, c256.Gy, bb)
+			c256.ScalarBaseMult(bb)
+		}
+	})
+}
+
+func TestPointMul(t *testing.T) {
+	priv, _ := new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
+	bb := priv.Bytes()
+	cnt := 5000
+	start := time.Now()
+	for i := 0; i < cnt; i++ {
+		// c256.ScalarMult(c256.Gx, c256.Gy, bb)
+		c256.ScalarBaseMult(bb)
+	}
+	end := time.Now()
+	elapsed := end.Sub(start)
+	fmt.Printf("SM2 Scalar Mul Point: %d PerSec\n", int(float64(cnt)/elapsed.Seconds()))
+}
+
+// TestReduceCarry exercises c256ReduceCarry on random limbs with carry = 5
+// and prints three values for manual inspection: the routine's result, a
+// math/big reference (input + carry*2^257 mod P), and their difference.
+// It makes no assertions, so it cannot fail.
+func TestReduceCarry(t *testing.T) {
+	// fmt.Printf("%08x\n", 1<<29-1-2<<21)
+	var inout [c256Limbs]uint32
+	var temp [c256Limbs]uint32
+	rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
+	// Every limb is masked to 28 bits; temp keeps a copy of the input.
+	for i := 0; i < c256Limbs; i++ {
+		temp[i] = uint32(rnd.Int31()) & 0xFFFFFFF
+		inout[i] = temp[i]
+	}
+	var carry uint32 = 5
+	c256ReduceCarry(&inout, carry)
+	// for _, n := range inout {
+	// 	fmt.Printf("0x%08x, ", n)
+	// }
+	ret := c256ToBig(&inout)
+	fmt.Println(ret.Text(16))
+
+	// Reference: (input + carry * 2^257) mod P.
+	s := c256ToBig(&temp)
+	r := big.NewInt(int64(carry))
+	r.Lsh(r, 257)
+	s.Add(s, r)
+	s.Mod(s, c256.P)
+	// c256FromBig(&inout, s)
+	fmt.Println(s.Text(16))
+	// c256FromBig(&inout, s)
+	// for _, n := range inout {
+	// 	fmt.Printf("0x%08x, ", n)
+	// }
+	// Printed difference between the routine's result and the reference.
+	ret.Sub(ret, s)
+	fmt.Println(ret)
+}
+
+// TestReduceDegree cross-checks c256Mul against math/big on 100,000,000
+// random operand pairs: the product reduced mod P must equal
+// in * temp * c256RInverse mod P.
+//
+// A mismatch now fails the test via t.Fatalf. Previously the mismatch was
+// only printed and the function returned, so the test always passed.
+func TestReduceDegree(t *testing.T) {
+	// One generator for the whole run. Creating and re-seeding a generator
+	// from the clock on every iteration is wasteful and may repeat inputs when
+	// consecutive iterations observe the same clock value.
+	rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
+
+	for j := uint64(0); j < 100000000; j++ {
+		if j%1000000 == 0 {
+			fmt.Println(j/10000, "万次pass")
+		}
+		var in, out, temp [c256Limbs]uint32
+
+		// Even-indexed limbs are masked to 29 bits, odd-indexed ones to 28.
+		for i := 0; i < c256Limbs; i++ {
+			if i%2 == 0 {
+				temp[i] = uint32(rnd.Int()) & 0x1FFFFFFF
+			} else {
+				temp[i] = uint32(rnd.Int()) & 0xFFFFFFF
+			}
+		}
+		for i := 0; i < c256Limbs; i++ {
+			if i%2 == 0 {
+				in[i] = uint32(rnd.Int31()) & 0x1FFFFFFF
+			} else {
+				in[i] = uint32(rnd.Int31()) & 0xFFFFFFF
+			}
+		}
+
+		c256Mul(&out, &in, &temp)
+		got := c256ToBig(&out)
+		got.Mod(got, c256.P)
+
+		// Reference: in * temp * R^-1 mod P.
+		want := c256ToBig(&temp)
+		want.Mul(want, c256ToBig(&in))
+		want.Mul(want, c256RInverse)
+		want.Mod(want, c256.P)
+
+		if got.Cmp(want) != 0 {
+			t.Fatalf("c256Mul mismatch at iteration %d:\n got:  %s\n want: %s\n in:   %v\n temp: %v",
+				j, got.Text(16), want.Text(16), in, temp)
+		}
+	}
+	fmt.Println("test over")
+}
+
+// TestInverse cross-checks c256Invert against math/big on 100,000 random
+// inputs: the result reduced mod P must equal in^-1 * 2^514 mod P.
+//
+// A mismatch now fails the test via t.Fatalf. Previously the mismatch was
+// only printed and the function returned, so the test always passed.
+func TestInverse(t *testing.T) {
+	// One generator for the whole run instead of a freshly clock-seeded one
+	// per iteration.
+	rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
+
+	for i := 0; i < 100000; i++ {
+		if i%10000 == 0 {
+			fmt.Println(i, "pass")
+		}
+		var in, out [c256Limbs]uint32
+
+		// Even-indexed limbs are masked to 29 bits, odd-indexed ones to 28.
+		for limb := 0; limb < c256Limbs; limb++ {
+			if limb%2 == 0 {
+				in[limb] = uint32(rnd.Int()) & 0x1FFFFFFF
+			} else {
+				in[limb] = uint32(rnd.Int()) & 0xFFFFFFF
+			}
+		}
+		c256Invert(&out, &in) // in^(-1)*R
+		got := c256ToBig(&out)
+		got.Mod(got, c256.P)
+
+		// Reference, following the original author's notes (R = 2^257):
+		want := c256ToBig(&in)        // in * R
+		want.ModInverse(want, c256.P) // (in*R)^-1
+		want.Lsh(want, 257+257)       // in^-1 * R
+		want.Mod(want, c256.P)
+
+		if want.Cmp(got) != 0 {
+			t.Fatalf("c256Invert mismatch at iteration %d:\n in:   %v\n got:  %s\n want: %s",
+				i, in, got.Text(16), want.Text(16))
+		}
+	}
+}
+
+// TestGenTable32 prints two 15-entry tables of base-point multiples, each
+// coordinate converted with c256FromBig and printed as one line of
+// hexadecimal limbs (x first, then y).
+//
+// In the first table, entry i (1 <= i <= 15; the all-zero entry 0 is omitted)
+// is the sum of 2**(64*b)*G over every set bit b of i. For example entry 5
+// (binary 0101) is 2**128G + G and entry 15 is 2**192G + 2**128G + 2**64G + G.
+// The second table follows the same layout, but the terms are 2**32G, 2**96G,
+// 2**160G and 2**224G.
+func TestGenTable32(t *testing.T) {
+	for _, base := range []uint{0, 32} {
+		for i := 1; i < 16; i++ {
+			n := new(big.Int)
+			for bit := uint(0); bit < 4; bit++ {
+				if i&(1<<bit) != 0 {
+					n.Add(n, new(big.Int).Lsh(big.NewInt(1), base+64*bit))
+				}
+			}
+			x, y := c256.ScalarMult(c256.Gx, c256.Gy, n.Bytes())
+			var xOut, yOut [c256Limbs]uint32
+			c256FromBig(&xOut, x)
+			c256FromBig(&yOut, y)
+			for _, limb := range xOut {
+				fmt.Printf("0x%x, ", limb)
+			}
+			fmt.Println()
+			for _, limb := range yOut {
+				fmt.Printf("0x%x, ", limb)
+			}
+			fmt.Println()
+		}
+	}
+}
+
+// c256FromBig sets out = R*in.
+func c256FromBig64(out *[5]uint64, in *big.Int) {
+	var bottom51Bits uint64 = 1<<51 - 1
+	var bottom52Bits uint64 = 1<<52 - 1
+	tmp := new(big.Int).Lsh(in, 257)
+	tmp.Mod(tmp, c256.P)
+
+	for i := 0; i < 5; i++ {
+		if bits := tmp.Bits(); len(bits) > 0 {
+			out[i] = uint64(bits[0]) & bottom51Bits
+		} else {
+			out[i] = 0
+		}
+		tmp.Rsh(tmp, 51)
+
+		i++
+		if i == 5 {
+			break
+		}
+
+		if bits := tmp.Bits(); len(bits) > 0 {
+			out[i] = uint64(bits[0]) & bottom52Bits
+		} else {
+			out[i] = 0
+		}
+		tmp.Rsh(tmp, 52)
+	}
+}
+
+// TestGenTable64 prints two 15-entry tables of base-point multiples, each
+// coordinate converted with c256FromBig64 and printed as one line of
+// hexadecimal limbs with an "LLU" suffix (x first, then y).
+//
+// In the first table, entry i (1 <= i <= 15; the all-zero entry 0 is omitted)
+// is the sum of 2**(64*b)*G over every set bit b of i. For example entry 5
+// (binary 0101) is 2**128G + G and entry 15 is 2**192G + 2**128G + 2**64G + G.
+// The second table follows the same layout, but the terms are 2**32G, 2**96G,
+// 2**160G and 2**224G.
+func TestGenTable64(t *testing.T) {
+	for _, base := range []uint{0, 32} {
+		for i := 1; i < 16; i++ {
+			n := new(big.Int)
+			for bit := uint(0); bit < 4; bit++ {
+				if i&(1<<bit) != 0 {
+					n.Add(n, new(big.Int).Lsh(big.NewInt(1), base+64*bit))
+				}
+			}
+			x, y := c256.ScalarMult(c256.Gx, c256.Gy, n.Bytes())
+			var xOut, yOut [5]uint64
+			c256FromBig64(&xOut, x)
+			c256FromBig64(&yOut, y)
+			for _, limb := range xOut {
+				fmt.Printf("0x%xLLU, ", limb)
+			}
+			fmt.Println()
+			for _, limb := range yOut {
+				fmt.Printf("0x%xLLU, ", limb)
+			}
+			fmt.Println()
+		}
+	}
+}
+// TestPointMul2 prints the hexadecimal coordinates of 4*G as computed by
+// c256.ScalarBaseMult. It makes no assertions.
+func TestPointMul2(t *testing.T) {
+	// The random scalar is immediately overwritten with the constant 4 below;
+	// the commented-out lines are alternative scalars used during debugging.
+	n, _ := crand.Int(crand.Reader, c256.N)
+	n.SetInt64(4)
+	//n.Set(c256.N)
+	//n.Sub(n, gmath.BigInt1)
+	//x, y := c256.ScalarBaseMult(n.Bytes())
+	//fmt.Println(x.Text(16), y.Text(16))
+	// n.Set(c256.N)
+	//xx, yy := c256.ScalarMult(c256.Gx, c256.Gy, n.Bytes())
+	xx, yy := c256.ScalarBaseMult(n.Bytes())
+	fmt.Println(xx.Text(16), yy.Text(16))
+	//fmt.Println(xx.Text(16), yy.Text(16))
+
+	// p := c256ToBig(&c256Zero31)
+	// fmt.Println(p.Text(16))
+}
+
+// TestZeroScaleBaseMult checks that c256ScalarBaseMult with a zero scalar
+// leaves every limb of z equal to zero.
+//
+// FIXME c256ScalarBaseMult error when scalar = 0
+func TestZeroScaleBaseMult(t *testing.T) {
+	n := new(big.Int)
+	// Pre-fill the scalar buffer with 0xcc.
+	// NOTE(review): presumably so the test notices if c256GetScalar fails to
+	// overwrite every byte - confirm.
+	var scalarReversed [32]byte
+	for i := 0; i < 32; i++ {
+		scalarReversed[i] = 0xcc
+	}
+
+	// n is zero, so n.Bytes() is an empty slice.
+	c256GetScalar(&scalarReversed, n.Bytes())
+	var x1, y1, z1 [c256Limbs]uint32
+	var tmp [17]uint64
+	// NOTE(review): these two calls run on all-zero inputs before the call
+	// under test; their purpose is not evident from this file.
+	c256PointDouble(&x1, &y1, &z1, &x1, &y1, &z1)
+	c256ReduceDegree(&z1, tmp)
+	c256ScalarBaseMult(&x1, &y1, &z1, &scalarReversed)
+
+	for _, z := range z1 {
+		if z != 0 {
+			t.Fail()
+		}
+	}
+}
+
+// TestReduce runs c256ReduceDegree on a fixed 17-word input and prints the
+// nine resulting limbs, one per line. It makes no assertions.
+func TestReduce(t *testing.T) {
+	wide := [17]uint64{1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1}
+	var reduced [9]uint32
+	c256ReduceDegree(&reduced, wide)
+	for _, limb := range reduced {
+		fmt.Println(limb)
+	}
+}
+
+// TestIssue52075 feeds a 33-byte big-endian scalar equal to 1 (32 zero bytes
+// followed by 0x01) to ScalarBaseMult and ScalarMult and expects both to
+// return the base point.
+func TestIssue52075(t *testing.T) {
+	Gx, Gy := c256.Params().Gx, c256.Params().Gy
+	scalar := make([]byte, 33)
+	scalar[32] = 1
+
+	expectBasePoint := func(x, y *big.Int) {
+		if x.Cmp(Gx) == 0 && y.Cmp(Gy) == 0 {
+			return
+		}
+		t.Errorf("unexpected output (%v,%v)", x, y)
+	}
+	expectBasePoint(c256.ScalarBaseMult(scalar))
+	expectBasePoint(c256.ScalarMult(Gx, Gy, scalar))
+}
@@ -0,0 +1,313 @@
+//go:build ignore
+// +build ignore
+
+///
+/// Copyright (c) 2018 xdx. All rights reserved.
+///
+/// \file:
+///
+/// \brief: generic elliptic curve implementation, adapted from the
+/// Go standard library.
+///
+/// \author: xdx
+///
+
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package ec256
+
+import (
+	"math/big"
+)
+
+// Compile-time check that *CurveParams, the generic big.Int based
+// implementation, satisfies the Curve interface.
+var _ Curve = (*CurveParams)(nil)
+
+// combinedMult is implemented by curves that can compute
+// baseScalar*G + scalar*P in one call (G is the generator, P an arbitrary
+// point). An optimized implementation only needs to convert between affine and
+// Montgomery form once, which can be faster than doing the two multiplications
+// separately.
+type combinedMult interface {
+	CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int)
+}
+
+// curveX groups variants of the multiplication methods that return only the
+// x coordinate of the result. Original author's note: not of much use.
+type curveX interface {
+	CombinedMultX(bigX, bigY *big.Int, baseScalar, scalar []byte) (x *big.Int)
+	ScalarMultX(x1, y1 *big.Int, k []byte) (x *big.Int)
+	ScalarBaseMultX(k []byte) (x *big.Int)
+}
+
+// A Curve represents a short-form Weierstrass curve with a=-3.
+// See https://www.hyperelliptic.org/EFD/g1p/auto-shortw.html
+//
+// Points are passed and returned as affine (x, y) coordinate pairs.
+type Curve interface {
+	// Params returns the parameters for the curve.
+	Params() *CurveParams
+	// IsOnCurve reports whether the given (x,y) lies on the curve.
+	IsOnCurve(x, y *big.Int) bool
+	// Add returns the sum of (x1,y1) and (x2,y2).
+	Add(x1, y1, x2, y2 *big.Int) (x, y *big.Int)
+	// Double returns 2*(x1,y1).
+	Double(x1, y1 *big.Int) (x, y *big.Int)
+	// ScalarMult returns k*(x1,y1) where k is a number in big-endian form.
+	ScalarMult(x1, y1 *big.Int, k []byte) (x, y *big.Int)
+
+	// ScalarBaseMult returns k*G, where G is the base point of the group
+	// and k is an integer in big-endian form.
+	ScalarBaseMult(k []byte) (x, y *big.Int)
+
+	// Added by xdx: every Curve must also provide CombinedMult.
+	combinedMult
+	// curveX (x-only variants) is intentionally left out of the interface.
+	// curveX
+}
+
+// CurveParams contains the parameters of an elliptic curve and also provides
+// a generic, non-constant time implementation of Curve.
+//
+// The methods in this file perform all field arithmetic modulo P and use the
+// curve equation y² = x³ - 3x + B.
+type CurveParams struct {
+	P       *big.Int // the order of the underlying field (the modulus used for all reductions)
+	N       *big.Int // the order of the base point
+	B       *big.Int // the constant of the curve equation
+	Gx, Gy  *big.Int // (x,y) of the base point
+	BitSize int      // the size of the underlying field
+	Name    string   // the canonical name of the curve
+}
+
+// Params returns the receiver itself: a *CurveParams is its own parameter
+// set. The returned pointer is not a copy.
+func (curve *CurveParams) Params() *CurveParams {
+	return curve
+}
+
+// IsOnCurve reports whether the affine point (x, y) satisfies the curve
+// equation y² = x³ - 3x + B (mod P).
+//
+// Coordinates outside the range [0, P) are rejected. They are not canonical
+// field elements, and accepting them would let values such as x and x+P both
+// pass as encodings of the same point.
+func (curve *CurveParams) IsOnCurve(x, y *big.Int) bool {
+	if x.Sign() < 0 || x.Cmp(curve.P) >= 0 ||
+		y.Sign() < 0 || y.Cmp(curve.P) >= 0 {
+		return false
+	}
+
+	// Left-hand side: y² mod P.
+	y2 := new(big.Int).Mul(y, y)
+	y2.Mod(y2, curve.P)
+
+	// Right-hand side: x³ - 3x + B mod P.
+	x3 := new(big.Int).Mul(x, x)
+	x3.Mul(x3, x)
+
+	threeX := new(big.Int).Lsh(x, 1)
+	threeX.Add(threeX, x)
+
+	x3.Sub(x3, threeX)
+	x3.Add(x3, curve.B)
+	x3.Mod(x3, curve.P)
+
+	return x3.Cmp(y2) == 0
+}
+
+// zForAffine chooses the Jacobian Z coordinate for the affine point (x, y).
+// The pair (0, 0) is taken to mean the point at infinity and maps to Z = 0,
+// since (0, 0) is not on any of the curves handled here; every other pair
+// maps to Z = 1. A fresh *big.Int is returned on each call.
+func zForAffine(x, y *big.Int) *big.Int {
+	if x.Sign() == 0 && y.Sign() == 0 {
+		return new(big.Int)
+	}
+	return big.NewInt(1)
+}
+
+// affineFromJacobian converts the Jacobian point (x, y, z) back to affine
+// coordinates: xOut = x / z² mod P and yOut = y / z³ mod P. A point with
+// z == 0 is the point at infinity and is returned as (0, 0).
+func (curve *CurveParams) affineFromJacobian(x, y, z *big.Int) (xOut, yOut *big.Int) {
+	if z.Sign() == 0 {
+		return new(big.Int), new(big.Int)
+	}
+
+	// Powers of z⁻¹; only the inverse itself is reduced modulo P.
+	zInv := new(big.Int).ModInverse(z, curve.P)
+	zInv2 := new(big.Int).Mul(zInv, zInv)
+	zInv3 := new(big.Int).Mul(zInv2, zInv)
+
+	xOut = new(big.Int).Mul(x, zInv2)
+	xOut.Mod(xOut, curve.P)
+
+	yOut = new(big.Int).Mul(y, zInv3)
+	yOut.Mod(yOut, curve.P)
+
+	return xOut, yOut
+}
+
+// Add returns the affine sum (x1,y1) + (x2,y2). Both operands are lifted to
+// Jacobian form via zForAffine, added there, and the result is converted back.
+func (curve *CurveParams) Add(x1, y1, x2, y2 *big.Int) (*big.Int, *big.Int) {
+	jx, jy, jz := curve.addJacobian(
+		x1, y1, zForAffine(x1, y1),
+		x2, y2, zForAffine(x2, y2),
+	)
+	return curve.affineFromJacobian(jx, jy, jz)
+}
+
+// addJacobian takes two points in Jacobian coordinates, (x1, y1, z1) and
+// (x2, y2, z2) and returns their sum, also in Jacobian form.
+//
+// A zero z coordinate marks the point at infinity. The inputs are not
+// modified; the result is returned in newly allocated values.
+func (curve *CurveParams) addJacobian(x1, y1, z1, x2, y2, z2 *big.Int) (*big.Int, *big.Int, *big.Int) {
+	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
+	x3, y3, z3 := new(big.Int), new(big.Int), new(big.Int)
+	// If either operand is the point at infinity, the sum is a copy of the other.
+	if z1.Sign() == 0 {
+		x3.Set(x2)
+		y3.Set(y2)
+		z3.Set(z2)
+		return x3, y3, z3
+	}
+	if z2.Sign() == 0 {
+		x3.Set(x1)
+		y3.Set(y1)
+		z3.Set(z1)
+		return x3, y3, z3
+	}
+
+	// z1z1 = z1², z2z2 = z2² (mod P).
+	z1z1 := new(big.Int).Mul(z1, z1)
+	z1z1.Mod(z1z1, curve.P)
+	z2z2 := new(big.Int).Mul(z2, z2)
+	z2z2.Mod(z2z2, curve.P)
+
+	// u1 = x1*z2², u2 = x2*z1², h = u2 - u1 (mod P).
+	u1 := new(big.Int).Mul(x1, z2z2)
+	u1.Mod(u1, curve.P)
+	u2 := new(big.Int).Mul(x2, z1z1)
+	u2.Mod(u2, curve.P)
+	h := new(big.Int).Sub(u2, u1)
+	// h == 0 means both points share the same affine x coordinate.
+	xEqual := h.Sign() == 0
+	if h.Sign() == -1 {
+		h.Add(h, curve.P)
+	}
+	// i = (2h)², j = h*i.
+	i := new(big.Int).Lsh(h, 1)
+	i.Mul(i, i)
+	j := new(big.Int).Mul(h, i)
+
+	// s1 = y1*z2³, s2 = y2*z1³, r = s2 - s1 (mod P).
+	s1 := new(big.Int).Mul(y1, z2)
+	s1.Mul(s1, z2z2)
+	s1.Mod(s1, curve.P)
+	s2 := new(big.Int).Mul(y2, z1)
+	s2.Mul(s2, z1z1)
+	s2.Mod(s2, curve.P)
+	r := new(big.Int).Sub(s2, s1)
+	if r.Sign() == -1 {
+		r.Add(r, curve.P)
+	}
+	yEqual := r.Sign() == 0
+	// Identical points cannot be handled by the addition formula; double instead.
+	if xEqual && yEqual {
+		return curve.doubleJacobian(x1, y1, z1)
+	}
+	// r = 2*(s2 - s1), v = u1*i.
+	r.Lsh(r, 1)
+	v := new(big.Int).Mul(u1, i)
+
+	// x3 = r² - j - 2v.
+	x3.Set(r)
+	x3.Mul(x3, x3)
+	x3.Sub(x3, j)
+	x3.Sub(x3, v)
+	x3.Sub(x3, v)
+	x3.Mod(x3, curve.P)
+
+	// y3 = r*(v - x3) - 2*s1*j. Note that v and s1 are overwritten here.
+	y3.Set(r)
+	v.Sub(v, x3)
+	y3.Mul(y3, v)
+	s1.Mul(s1, j)
+	s1.Lsh(s1, 1)
+	y3.Sub(y3, s1)
+	y3.Mod(y3, curve.P)
+
+	// z3 = ((z1 + z2)² - z1² - z2²) * h. When h is zero (equal x, different y)
+	// this yields z3 == 0, i.e. the point at infinity.
+	z3.Add(z1, z2)
+	z3.Mul(z3, z3)
+	z3.Sub(z3, z1z1)
+	z3.Sub(z3, z2z2)
+	z3.Mul(z3, h)
+	z3.Mod(z3, curve.P)
+
+	return x3, y3, z3
+}
+
+// Double returns 2*(x1,y1) in affine coordinates. The point is lifted to
+// Jacobian form via zForAffine, doubled there, and converted back.
+func (curve *CurveParams) Double(x1, y1 *big.Int) (*big.Int, *big.Int) {
+	jx, jy, jz := curve.doubleJacobian(x1, y1, zForAffine(x1, y1))
+	return curve.affineFromJacobian(jx, jy, jz)
+}
+
+// doubleJacobian takes a point in Jacobian coordinates, (x, y, z), and
+// returns its double, also in Jacobian form.
+//
+// The inputs are not modified. Several intermediates below share storage
+// (beta reuses alpha2, y3 reuses alpha), so the statement order matters.
+func (curve *CurveParams) doubleJacobian(x, y, z *big.Int) (*big.Int, *big.Int, *big.Int) {
+	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
+	// delta = z², gamma = y² (mod P).
+	delta := new(big.Int).Mul(z, z)
+	delta.Mod(delta, curve.P)
+	gamma := new(big.Int).Mul(y, y)
+	gamma.Mod(gamma, curve.P)
+	// alpha = 3*(x - delta)*(x + delta), built as t + 2t with t = (x-delta)*(x+delta).
+	alpha := new(big.Int).Sub(x, delta)
+	if alpha.Sign() == -1 {
+		alpha.Add(alpha, curve.P)
+	}
+	alpha2 := new(big.Int).Add(x, delta)
+	alpha.Mul(alpha, alpha2)
+	alpha2.Set(alpha)
+	alpha.Lsh(alpha, 1)
+	alpha.Add(alpha, alpha2)
+
+	// beta = x*gamma; the result is stored in (and aliases) alpha2.
+	beta := alpha2.Mul(x, gamma)
+
+	// x3 = alpha² - 8*beta.
+	x3 := new(big.Int).Mul(alpha, alpha)
+	beta8 := new(big.Int).Lsh(beta, 3)
+	beta8.Mod(beta8, curve.P)
+	x3.Sub(x3, beta8)
+	if x3.Sign() == -1 {
+		x3.Add(x3, curve.P)
+	}
+	x3.Mod(x3, curve.P)
+
+	// z3 = (y + z)² - gamma - delta.
+	z3 := new(big.Int).Add(y, z)
+	z3.Mul(z3, z3)
+	z3.Sub(z3, gamma)
+	if z3.Sign() == -1 {
+		z3.Add(z3, curve.P)
+	}
+	z3.Sub(z3, delta)
+	if z3.Sign() == -1 {
+		z3.Add(z3, curve.P)
+	}
+	z3.Mod(z3, curve.P)
+
+	// beta = 4*beta - x3, then y3 = alpha*beta; y3 is stored in (and aliases) alpha.
+	beta.Lsh(beta, 2)
+	beta.Sub(beta, x3)
+	if beta.Sign() == -1 {
+		beta.Add(beta, curve.P)
+	}
+	y3 := alpha.Mul(alpha, beta)
+
+	// gamma = 8*gamma² (mod P); gamma is overwritten in place.
+	gamma.Mul(gamma, gamma)
+	gamma.Lsh(gamma, 3)
+	gamma.Mod(gamma, curve.P)
+
+	// y3 = alpha*(4*beta - x3) - 8*gamma².
+	y3.Sub(y3, gamma)
+	if y3.Sign() == -1 {
+		y3.Add(y3, curve.P)
+	}
+	y3.Mod(y3, curve.P)
+
+	return x3, y3, z3
+}
+
+// ScalarMult returns [k](Bx,By) in affine coordinates, where k is read as a
+// big-endian byte string. It uses plain double-and-add, scanning the bits of
+// k from most to least significant, and is not constant time.
+func (curve *CurveParams) ScalarMult(Bx, By *big.Int, k []byte) (*big.Int, *big.Int) {
+	printFuncName()
+
+	// The base point in Jacobian form has Z = 1.
+	one := big.NewInt(1)
+	// The accumulator starts with Z = 0, which addJacobian treats as infinity.
+	accX, accY, accZ := new(big.Int), new(big.Int), new(big.Int)
+
+	for _, b := range k {
+		for mask := byte(0x80); mask != 0; mask >>= 1 {
+			accX, accY, accZ = curve.doubleJacobian(accX, accY, accZ)
+			if b&mask != 0 {
+				accX, accY, accZ = curve.addJacobian(Bx, By, one, accX, accY, accZ)
+			}
+		}
+	}
+
+	return curve.affineFromJacobian(accX, accY, accZ)
+}
+
+// ScalarBaseMult returns [k](Gx,Gy), the base point multiplied by the
+// big-endian scalar k. It delegates to ScalarMult.
+func (curve *CurveParams) ScalarBaseMult(k []byte) (*big.Int, *big.Int) {
+	printFuncName()
+	x, y := curve.ScalarMult(curve.Gx, curve.Gy, k)
+	return x, y
+}
+
+// CombinedMult returns [baseScalar](Gx,Gy) + [scalar](bigX,bigY). This generic
+// version performs the two multiplications independently and adds the results.
+func (curve *CurveParams) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
+	printFuncName()
+	gx, gy := curve.ScalarBaseMult(baseScalar)
+	px, py := curve.ScalarMult(bigX, bigY, scalar)
+	return curve.Add(gx, gy, px, py)
+}
@@ -0,0 +1,53 @@
+package ec256
+
+import (
+	"crypto/rand"
+	"fmt"
+	"testing"
+	"time"
+)
+
+// TestFuzz feeds random 32-byte scalars to the curve returned by CurveSM2 and
+// to the implementation reachable through its Params(), which serves as the
+// reference, and fails on the first mismatch. It runs for ten seconds and
+// prints a progress line every 100 iterations.
+func TestFuzz(t *testing.T) {
+	ec := CurveSM2()
+	ge := ec.Params()
+
+	var scalar1, scalar2 [32]byte
+
+	timeout := time.NewTimer(10 * time.Second)
+	defer timeout.Stop() // release the timer if the test fails early
+	count := 0
+
+loop:
+	for {
+		select {
+		case <-timeout.C:
+			break loop
+		default:
+		}
+
+		count++
+		if count%100 == 0 {
+			fmt.Println("Tested for", count, "times")
+		}
+
+		// A failed read would silently reuse the previous scalars.
+		if _, err := rand.Read(scalar1[:]); err != nil {
+			t.Fatalf("reading random scalar: %v", err)
+		}
+		if _, err := rand.Read(scalar2[:]); err != nil {
+			t.Fatalf("reading random scalar: %v", err)
+		}
+
+		x, y := ec.ScalarBaseMult(scalar1[:])
+		x2, y2 := ge.ScalarBaseMult(scalar1[:])
+		// Check the base multiplication before its result is used as the
+		// input point of the next step.
+		if x.Cmp(x2) != 0 || y.Cmp(y2) != 0 {
+			t.Fatalf("ScalarBaseMult does not match reference result with scalar: %x, please report this error to security@golang.org", scalar1)
+		}
+
+		xx, yy := ec.ScalarMult(x, y, scalar2[:])
+		xx2, yy2 := ge.ScalarMult(x2, y2, scalar2[:])
+		if xx.Cmp(xx2) != 0 || yy.Cmp(yy2) != 0 {
+			t.Fatalf("ScalarMult does not match reference result with scalars: %x and %x, please report this error to security@golang.org", scalar1, scalar2)
+		}
+	}
+
+	fmt.Printf("Total test %d times\n", count)
+}
@@ -0,0 +1,8 @@
+package ec256
+
+import "math/big"
+
+// bigFromBase16 parses s as a base-16 integer and returns it as a *big.Int.
+//
+// It is meant for hard-coded curve constants, so malformed input is treated
+// as a programming error: the function panics instead of returning a nil
+// *big.Int that would only fail later at the point of use.
+func bigFromBase16(s string) *big.Int {
+	n, ok := new(big.Int).SetString(s, 16)
+	if !ok {
+		panic("ec256: invalid base-16 constant: " + s)
+	}
+	return n
+}