//go:build (!amd64 && !arm64) || generic
// +build !amd64,!arm64 generic

package bn256

/*
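Rules for detecting the carry (or borrow) out of a single-word operation.
Let B = 2^w, where w = 64 or 32.

Suppose a op b = c + carry·B (for Sub: c - carry·B), where c is the w-bit
result and carry is the bit carried out of, or borrowed by, the word.
Note: an incoming carry of 0 or 1 does not change the rules below, so the
tables only have to be derived for an incoming carry of 0. The incoming
carry can change the outgoing carry only when a+b = B-1. In that case one of
a, b (say a) has a 0 in some bit below the top bit, so a' = a+carry has the
same top bit as a, and a'+b is an addition without incoming carry. The top
bits of (a, b, c) therefore obey the same table as those of (a', b, c).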

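# Add
The outgoing carry is determined by the top bits (bit w-1) as follows:

	a[w-1]	0	0	0	1	1	1
	b[w-1]	0	1	1	0	0	1
	c[w-1]	x	0	1	0	1	x
	carry	0	1	0	1	0	1

carry = (a&b | (a|b)&^c) >> 63
carry > 0 <=> bit w-1 of both a and b is 1, or bit w-1 of one of them is 1 while bit w-1 of c is 0.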

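# Sub
The outgoing borrow (called carry here) is determined by the top bits (bit w-1) as follows:

	a[w-1]	0	0	0	1	1	1
	b[w-1]	0	0	1	0	1	1
	c[w-1]	1	0	x	x	0	1
	carry	1	0	1	0	0	1

carry = (b&^a | (b|^a)&c) >> 63
carry > 0 <=> a is 0 and b is 1, or (a, b) != (1, 0) and c is 1 (all taken at bit w-1).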
*/

// gfpCarry reduces the value head·2^256 + a modulo p and stores the result
// in a. The input must be below 2p.
//
// It first computes diff = a - p (mod 2^256) together with the borrow out of
// the top limb, then selects the result with a bit mask instead of a branch:
//
//	borrow      head    result
//	0 (a >= p)  0       diff
//	0           1       diff (the input exceeded 2p, which must not happen;
//	                    in that situation gfpCarry has to be called again)
//	1 (a < p)   0       a
//	1           1       diff
//
// In other words, a is kept only when borrow &^ head == 1.
func gfpCarry(a *gfP, head uint64) {
	var diff gfP

	// diff = a - p, limb by limb; borrow is the borrow out of each limb,
	// recovered from the top bits of the operands and of the result.
	var borrow uint64
	for i := range p2 {
		x, m := a[i], p2[i]
		d := x - m - borrow
		diff[i] = d
		borrow = (m&^x | (m|^x)&d) >> 63
	}

	// keep is all ones when a must be returned unchanged (the subtraction
	// borrowed and there is no head bit), and all zeros otherwise.
	keep := -(borrow &^ head)
	for i := range diff {
		a[i] = (a[i] & keep) | (diff[i] &^ keep)
	}
}

// gfpNeg sets c = -a mod p. The input must satisfy a < p.
//
// It computes p - a limb by limb and then lets gfpCarry fold the result
// back below p (p - 0 = p becomes 0).
func gfpNeg(c, a *gfP) {
	var borrow uint64
	for i := range p2 {
		m, x := p2[i], a[i]
		d := m - x - borrow
		c[i] = d
		borrow = (x&^m | (x|^m)&d) >> 63
	}
	// FIXME: the borrow out of the top limb is discarded here; it is only
	// guaranteed to be zero when a <= p.
	gfpCarry(c, 0)
}

// gfpAdd sets c = a+b, reduced by gfpCarry.
//
// The limbs are added with carry propagation; the carry out of the top limb
// is handed to gfpCarry as the head bit.
// NOTE(review): presumably a, b < p so that the sum stays below 2p as
// gfpCarry requires — confirm against callers.
func gfpAdd(c, a, b *gfP) {
	var carry uint64
	for i := range a {
		x, y := a[i], b[i]
		sum := x + y + carry
		c[i] = sum
		// Carry out of this limb, derived from the top bits of x, y and sum.
		carry = (x&y | (x|y)&^sum) >> 63
	}
	gfpCarry(c, carry)
}

// gfpSub sets c = a-b, computed as a + (p - b) and reduced by gfpCarry.
// NOTE(review): presumably a, b < p so that a + (p - b) stays within the
// range gfpCarry accepts — confirm against callers.
func gfpSub(c, a, b *gfP) {
	// neg = p - b, limb by limb with borrow propagation.
	var neg gfP
	var borrow uint64
	for i := range p2 {
		m, y := p2[i], b[i]
		d := m - y - borrow
		neg[i] = d
		borrow = (y&^m | (y|^m)&d) >> 63
	}

	// c = a + neg, limb by limb with carry propagation.
	var carry uint64
	for i := range a {
		x, y := a[i], neg[i]
		sum := x + y + carry
		c[i] = sum
		carry = (x&y | (x|y)&^sum) >> 63
	}

	// The carry out of the top limb is the head bit of the sum.
	gfpCarry(c, carry)
}

// mul returns the full 512-bit product a*b as eight little-endian 64-bit
// words. There are no restrictions on a and b.
func mul(a, b [4]uint64) [8]uint64 {
	const (
		lo16 uint64 = 0xffff
		lo32 uint64 = 0xffffffff
	)

	// cols[k] collects the partial products of weight 2^(16*k), i.e.
	// a*b = cols[0] + cols[1]*2^16 + ... + cols[31]*2^(16*31).
	// Every partial product is a 16-bit value times a 32-bit value (below
	// 2^48) and at most eight of them land in one column, so the 64-bit
	// accumulators cannot overflow.
	var cols [32]uint64
	for i := 0; i < 4; i++ {
		// Split a[i] into four 16-bit quarters, least significant first.
		var q [4]uint64
		for k := range q {
			q[k] = (a[i] >> (16 * uint(k))) & lo16
		}

		for j := 0; j < 4; j++ {
			// Split b[j] into two 32-bit halves; the high half carries an
			// extra weight of 2^32, i.e. two columns.
			low, high := b[j]&lo32, b[j]>>32

			base := 4 * (i + j)
			for k, qk := range q {
				cols[base+k] += qk * low
				cols[base+k+2] += qk * high
			}
		}
	}

	// Columns 4w, 4w+1, 4w+2, 4w+3 all belong to output word w. For
	// k = 1, 2, 3 add column 4w+k, shifted left by 16*k bits, into column
	// 4w for every w. The bits shifted out of one word (spill) and the carry
	// of the addition are passed on to the next word.
	for k := uint(1); k < 4; k++ {
		sh := 16 * k

		var spill, carry uint64
		for w := uint(0); w < 32; w += 4 {
			col := cols[w+k]

			x := cols[w]
			y := col<<sh + spill
			sum := x + y + carry
			cols[w] = sum
			carry = (x&y | (x|y)&^sum) >> 63

			spill = col >> (64 - sh)
		}
	}

	var out [8]uint64
	for w := range out {
		out[w] = cols[4*w]
	}
	return out
}

// halfMul returns the low 256 bits of a*b, i.e. a*b mod R with R = 2^256,
// as four little-endian 64-bit words.
func halfMul(a, b [4]uint64) [4]uint64 {
	const (
		lo16 uint64 = 0xffff
		lo32 uint64 = 0xffffffff
	)

	// cols[k] collects the partial products of weight 2^(16*k). Only
	// columns 0..15 contribute to the result; columns 16 and 17 merely
	// absorb the overhang of the highest word pairs.
	var cols [18]uint64
	for i := 0; i < 4; i++ {
		// Split a[i] into four 16-bit quarters, least significant first.
		var q [4]uint64
		for k := range q {
			q[k] = (a[i] >> (16 * uint(k))) & lo16
		}

		// Word pairs with i+j > 3 have weight 2^256 or more and vanish
		// modulo R, so they are skipped.
		for j := 0; j < 4-i; j++ {
			// Split b[j] into two 32-bit halves; the high half carries an
			// extra weight of 2^32, i.e. two columns.
			low, high := b[j]&lo32, b[j]>>32

			base := 4 * (i + j)
			for k, qk := range q {
				cols[base+k] += qk * low
				cols[base+k+2] += qk * high
			}
		}
	}

	// For k = 1, 2, 3 add column 4w+k, shifted left by 16*k bits, into
	// column 4w for each of the four output words. The bits shifted out of
	// one word (spill) and the carry of the addition move on to the next
	// word; whatever leaves the top word is dropped (reduction modulo R).
	for k := uint(1); k < 4; k++ {
		sh := 16 * k

		var spill, carry uint64
		for w := uint(0); w < 16; w += 4 {
			col := cols[w+k]

			x := cols[w]
			y := col<<sh + spill
			sum := x + y + carry
			cols[w] = sum
			carry = (x&y | (x|y)&^sum) >> 63

			spill = col >> (64 - sh)
		}
	}

	var out [4]uint64
	for w := range out {
		out[w] = cols[4*w]
	}
	return out
}

// gfpMul sets c to the Montgomery product of a and b, i.e.
// c = a*b*R^{-1} mod p with R = 2^256.
//
// Write T = a*b = T_h*R + T_l and let m = T_l*np mod R. Since
// np*p = -1 mod R, the sum T_l + m*p is 0 mod R, hence T + m*p is a multiple
// of R that is congruent to a*b modulo p. The result is therefore the upper
// half of T + m*p, brought below p by gfpCarry.
func gfpMul(c, a, b *gfP) {
	prod := mul(*a, *b)

	// m = T_l * np mod R, computed from the low four words of the product.
	m := halfMul([4]uint64{prod[0], prod[1], prod[2], prod[3]}, np)
	// mp = m * p as a full 512-bit value.
	mp := mul(m, p2)

	// prod += mp over all eight words. The low four words of the sum are
	// zero by construction; carry ends up as the carry out of bit 511.
	var carry uint64
	for i := range prod {
		x, y := prod[i], mp[i]
		sum := x + y + carry
		prod[i] = sum
		carry = (x&y | (x|y)&^sum) >> 63
	}

	// Dividing by R amounts to keeping the upper four words.
	*c = gfP{prod[4], prod[5], prod[6], prod[7]}
	// TODO: can c >= p? gfpCarry performs the conditional subtraction of p,
	// taking the carry above as the head bit.
	gfpCarry(c, carry)
}