.text

.globl	ct_inverse_mod_256
.hidden	ct_inverse_mod_256
.type	ct_inverse_mod_256, %function
.align	5
ct_inverse_mod_256:
	.inst	0xd503233f
	stp	c29, c30, [csp,#-10*__SIZEOF_POINTER__]!
	add	c29, csp, #0
	stp	c19, c20, [csp,#2*__SIZEOF_POINTER__]
	stp	c21, c22, [csp,#4*__SIZEOF_POINTER__]
	stp	c23, c24, [csp,#6*__SIZEOF_POINTER__]
	stp	c25, c26, [csp,#8*__SIZEOF_POINTER__]
	sub	csp, csp, #1040

	ldp	x4, x5, [c1,#8*0]
	ldp	x6, x7, [c1,#8*2]

#ifdef	__CHERI_PURE_CAPABILITY__
	add	c1,csp,#16+511
	alignd	c1,c1,#9
#else
	add	x1, sp, #16+511		// find closest 512-byte-aligned spot
	and	x1, x1, #-512		// in the frame...
#endif
	str	c0, [csp]		// offload out_ptr

	ldp	x8, x9, [c2,#8*0]
	ldp	x10, x11, [c2,#8*2]

	stp	x4, x5, [c1,#8*0]	// copy input to |a|
	stp	x6, x7, [c1,#8*2]
	stp	x8, x9, [c1,#8*4]	// copy modulus to |b|
	stp	x10, x11, [c1,#8*6]
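
	// Scratch layout, as suggested by the offsets used below: the
	// 512-byte-aligned area holds two 256-byte halves, each with
	// |a| at +8*0, |b| at +8*4, |u| at +8*8 and |v| at +8*13.  Every
	// iteration reads one half and writes the other; eor with #256
	// flip-flops between them.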
	////////////////////////////////////////// first iteration
	bl	.Lab_approximation_31_256_loaded

	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	str	x12,[c0,#8*8]		// initialize |u| with |f0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31
	str	x12, [c0,#8*9]		// initialize |v| with |f1|

	////////////////////////////////////////// second iteration
	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	ldr	x8, [c1,#8*8]		// |u|
	ldr	x9, [c1,#8*13]		// |v|
	madd	x4, x16, x8, xzr	// |u|*|f0|
	madd	x4, x17, x9, x4		// |v|*|g0|
	str	x4, [c0,#8*4]
	asr	x5, x4, #63		// sign extension
	stp	x5, x5, [c0,#8*5]
	stp	x5, x5, [c0,#8*7]

	madd	x4, x12, x8, xzr	// |u|*|f1|
	madd	x4, x13, x9, x4		// |v|*|g1|
	str	x4, [c0,#8*9]
	asr	x5, x4, #63		// sign extension
	stp	x5, x5, [c0,#8*10]
	stp	x5, x5, [c0,#8*12]

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	adc	x22, x22, x23
	stp	x22, x22, [c0,#8*4]
	stp	x22, x22, [c0,#8*6]

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	adc	x22, x22, x23
	stp	x22, x22, [c0,#8*4]
	stp	x22, x22, [c0,#8*6]

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	adc	x22, x22, x23
	stp	x22, x22, [c0,#8*4]
	stp	x22, x22, [c0,#8*6]

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	adc	x22, x22, x23
	stp	x22, x22, [c0,#8*4]
	stp	x22, x22, [c0,#8*6]

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	adc	x22, x22, x23
	stp	x22, x22, [c0,#8*4]
	stp	x22, x22, [c0,#8*6]

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	adc	x22, x22, x23
	stp	x22, x22, [c0,#8*4]
	stp	x22, x22, [c0,#8*6]
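
	// The remaining iterations still fold the top limb of the |u|
	// product with an adc, but finish the |v| product with
	// __smul_512x63_tail, presumably because the accumulators have
	// outgrown the short five-limb path used above.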
	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	bl	__smul_512x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	bl	__smul_512x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	bl	__smul_512x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	bl	__smul_512x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	bl	__smul_512x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	bl	__smul_512x63_tail

	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	bl	__ab_approximation_31_256
	eor	x0, x1, #256		// pointer to dst |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c0,csp,x0
#endif
	bl	__smul_256_n_shift_by_31
	mov	x16, x12		// corrected |f0|
	mov	x17, x13		// corrected |g0|

	mov	x12, x14		// |f1|
	mov	x13, x15		// |g1|
	add	c0,c0,#8*4
	bl	__smul_256_n_shift_by_31

	add	c0,c0,#8*4
	bl	__smul_256x63
	adc	x22, x22, x23
	str	x22, [c0,#8*4]

	mov	x16, x12		// corrected |f1|
	mov	x17, x13		// corrected |g1|
	add	c0,c0,#8*5
	bl	__smul_256x63
	bl	__smul_512x63_tail
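
	// Fifteen 31-bit rounds cover 465 of the 512 bits; the closing pass
	// below merges the last two iterations into a single run of
	// 47 (= 31 + 512%31) steps over the now-exact |a| and |b|.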
	////////////////////////////////////////// two[!] last iterations
	eor	x1, x1, #256		// flip-flop src |a|b|u|v|
#ifdef	__CHERI_PURE_CAPABILITY__
	scvalue	c1,csp,x1
#endif
	mov	x2, #47			// 31 + 512 % 31
	//bl	__ab_approximation_62_256	// |a| and |b| are exact,
	ldr	x7, [c1,#8*0]		// just load
	ldr	x11, [c1,#8*4]
	bl	__inner_loop_62_256

	mov	x16, x14
	mov	x17, x15
	ldr	c0, [csp]		// original out_ptr
	bl	__smul_256x63
	bl	__smul_512x63_tail
	ldr	c30, [c29,#__SIZEOF_POINTER__]

	smulh	x20, x7, x17		// figure out top-most limb
	ldp	x8, x9, [c3,#8*0]
	adc	x23, x23, x25
	ldp	x10, x11, [c3,#8*2]

	add	x20, x20, x23		// x20 is 1, 0 or -1
	asr	x19, x20, #63		// sign as mask

	and	x23, x8, x19		// add mod<<256 conditionally
	and	x24, x9, x19
	adds	x4, x4, x23
	and	x25, x10, x19
	adcs	x5, x5, x24
	and	x26, x11, x19
	adcs	x6, x6, x25
	adcs	x7, x22, x26
	adc	x20, x20, xzr		// x20 is 1, 0 or -1

	neg	x19, x20
	orr	x20, x20, x19		// excess bit or sign as mask
	asr	x19, x19, #63		// excess bit as mask

	and	x8, x8, x20		// mask |mod|
	and	x9, x9, x20
	and	x10, x10, x20
	and	x11, x11, x20

	eor	x8, x8, x19		// conditionally negate |mod|
	eor	x9, x9, x19
	adds	x8, x8, x19, lsr#63
	eor	x10, x10, x19
	adcs	x9, x9, xzr
	eor	x11, x11, x19
	adcs	x10, x10, xzr
	adc	x11, x11, xzr

	adds	x4, x4, x8		// final adjustment for |mod|<<256
	adcs	x5, x5, x9
	adcs	x6, x6, x10
	stp	x4, x5, [c0,#8*4]
	adc	x7, x7, x11
	stp	x6, x7, [c0,#8*6]

	add	csp, csp, #1040
	ldp	c19, c20, [c29,#2*__SIZEOF_POINTER__]
	ldp	c21, c22, [c29,#4*__SIZEOF_POINTER__]
	ldp	c23, c24, [c29,#6*__SIZEOF_POINTER__]
	ldp	c25, c26, [c29,#8*__SIZEOF_POINTER__]
	ldr	c29, [csp],#10*__SIZEOF_POINTER__
	.inst	0xd50323bf
	ret
.size	ct_inverse_mod_256,.-ct_inverse_mod_256

////////////////////////////////////////////////////////////////////////
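// __smul_256x63 appears to multiply the five-limb |u| (at source offset
// 8*8) and |v| (at 8*13) by the signed factors in x16 and x17, store the
// low four limbs of the sum at [c0] and leave the top limbs and carries
// in registers for the caller (or __smul_512x63_tail) to fold in.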
.type	__smul_256x63, %function
.align	5
__smul_256x63:
	ldp	x4, x5, [c1,#8*0+64]	// load |u| (or |v|)
	asr	x14, x16, #63		// |f_|'s sign as mask (or |g_|'s)
	ldp	x6, x7, [c1,#8*2+64]
	eor	x16, x16, x14		// conditionally negate |f_| (or |g_|)
	ldr	x22, [c1,#8*4+64]

	eor	x4, x4, x14		// conditionally negate |u| (or |v|)
	sub	x16, x16, x14
	eor	x5, x5, x14
	adds	x4, x4, x14, lsr#63
	eor	x6, x6, x14
	adcs	x5, x5, xzr
	eor	x7, x7, x14
	adcs	x6, x6, xzr
	eor	x22, x22, x14
	umulh	x19, x4, x16
	adcs	x7, x7, xzr
	umulh	x20, x5, x16
	adcs	x22, x22, xzr
	umulh	x21, x6, x16
	mul	x4, x4, x16
	cmp	x16, #0
	mul	x5, x5, x16
	csel	x22, x22, xzr, ne
	mul	x6, x6, x16
	adds	x5, x5, x19
	mul	x24, x7, x16
	adcs	x6, x6, x20
	adcs	x24, x24, x21
	adc	x26, xzr, xzr

	ldp	x8, x9, [c1,#8*0+104]	// load |u| (or |v|)
	asr	x14, x17, #63		// |f_|'s sign as mask (or |g_|'s)
	ldp	x10, x11, [c1,#8*2+104]
	eor	x17, x17, x14		// conditionally negate |f_| (or |g_|)
	ldr	x23, [c1,#8*4+104]

	eor	x8, x8, x14		// conditionally negate |u| (or |v|)
	sub	x17, x17, x14
	eor	x9, x9, x14
	adds	x8, x8, x14, lsr#63
	eor	x10, x10, x14
	adcs	x9, x9, xzr
	eor	x11, x11, x14
	adcs	x10, x10, xzr
	eor	x23, x23, x14
	umulh	x19, x8, x17
	adcs	x11, x11, xzr
	umulh	x20, x9, x17
	adcs	x23, x23, xzr
	umulh	x21, x10, x17
	adc	x15, xzr, xzr		// used in __smul_512x63_tail
	mul	x8, x8, x17
	cmp	x17, #0
	mul	x9, x9, x17
	csel	x23, x23, xzr, ne
	mul	x10, x10, x17
	adds	x9, x9, x19
	mul	x25, x11, x17
	adcs	x10, x10, x20
	adcs	x25, x25, x21
	adc	x26, x26, xzr

	adds	x4, x4, x8
	adcs	x5, x5, x9
	adcs	x6, x6, x10
	stp	x4, x5, [c0,#8*0]
	adcs	x24, x24, x25
	stp	x6, x24, [c0,#8*2]

	ret
.size	__smul_256x63,.-__smul_256x63

.type	__smul_512x63_tail, %function
.align	5
__smul_512x63_tail:
	umulh	x24, x7, x16
	ldp	x5, x6, [c1,#8*18]	// load rest of |v|
	adc	x26, x26, xzr
	ldr	x7, [c1,#8*20]
	and	x22, x22, x16

	umulh	x11, x11, x17		// resume |v|*|g1| chain

	sub	x24, x24, x22		// tie up |u|*|f1| chain
	asr	x25, x24, #63

	eor	x5, x5, x14		// conditionally negate rest of |v|
	eor	x6, x6, x14
	adds	x5, x5, x15
	eor	x7, x7, x14
	adcs	x6, x6, xzr
	umulh	x19, x23, x17
	adc	x7, x7, xzr
	umulh	x20, x5, x17
	add	x11, x11, x26
	umulh	x21, x6, x17

	mul	x4, x23, x17
	mul	x5, x5, x17
	adds	x4, x4, x11
	mul	x6, x6, x17
	adcs	x5, x5, x19
	mul	x22, x7, x17
	adcs	x6, x6, x20
	adcs	x22, x22, x21
	adc	x23, xzr, xzr		// used in the final step

	adds	x4, x4, x24
	adcs	x5, x5, x25
	adcs	x6, x6, x25
	stp	x4, x5, [c0,#8*4]
	adcs	x22, x22, x25		// carry is used in the final step
	stp	x6, x22, [c0,#8*6]

	ret
.size	__smul_512x63_tail,.-__smul_512x63_tail

.type	__smul_256_n_shift_by_31, %function
.align	5
__smul_256_n_shift_by_31:
	ldp	x4, x5, [c1,#8*0+0]	// load |a| (or |b|)
	asr	x24, x12, #63		// |f0|'s sign as mask (or |g0|'s)
	ldp	x6, x7, [c1,#8*2+0]
	eor	x25, x12, x24		// conditionally negate |f0| (or |g0|)

	eor	x4, x4, x24		// conditionally negate |a| (or |b|)
	sub	x25, x25, x24
	eor	x5, x5, x24
	adds	x4, x4, x24, lsr#63
	eor	x6, x6, x24
	adcs	x5, x5, xzr
	eor	x7, x7, x24
	umulh	x19, x4, x25
	adcs	x6, x6, xzr
	umulh	x20, x5, x25
	adc	x7, x7, xzr
	umulh	x21, x6, x25
	and	x24, x24, x25
	umulh	x22, x7, x25
	neg	x24, x24

	mul	x4, x4, x25
	mul	x5, x5, x25
	mul	x6, x6, x25
	adds	x5, x5, x19
	mul	x7, x7, x25
	adcs	x6, x6, x20
	adcs	x7, x7, x21
	adc	x22, x22, x24

	ldp	x8, x9, [c1,#8*0+32]	// load |a| (or |b|)
	asr	x24, x13, #63		// |f0|'s sign as mask (or |g0|'s)
	ldp	x10, x11, [c1,#8*2+32]
	eor	x25, x13, x24		// conditionally negate |f0| (or |g0|)

	eor	x8, x8, x24		// conditionally negate |a| (or |b|)
	sub	x25, x25, x24
	eor	x9, x9, x24
	adds	x8, x8, x24, lsr#63
	eor	x10, x10, x24
	adcs	x9, x9, xzr
	eor	x11, x11, x24
	umulh	x19, x8, x25
	adcs	x10, x10, xzr
	umulh	x20, x9, x25
	adc	x11, x11, xzr
	umulh	x21, x10, x25
	and	x24, x24, x25
	umulh	x23, x11, x25
	neg	x24, x24

	mul	x8, x8, x25
	mul	x9, x9, x25
	mul	x10, x10, x25
	adds	x9, x9, x19
	mul	x11, x11, x25
	adcs	x10, x10, x20
	adcs	x11, x11, x21
	adc	x23, x23, x24

	adds	x4, x4, x8
	adcs	x5, x5, x9
	adcs	x6, x6, x10
	adcs	x7, x7, x11
	adc	x8, x22, x23

	extr	x4, x5, x4, #31
	extr	x5, x6, x5, #31
	extr	x6, x7, x6, #31
	asr	x23, x8, #63		// result's sign as mask
	extr	x7, x8, x7, #31

	eor	x4, x4, x23		// ensure the result is positive
	eor	x5, x5, x23
	adds	x4, x4, x23, lsr#63
	eor	x6, x6, x23
	adcs	x5, x5, xzr
	eor	x7, x7, x23
	adcs	x6, x6, xzr
	stp	x4, x5, [c0,#8*0]
	adc	x7, x7, xzr
	stp	x6, x7, [c0,#8*2]

	eor	x12, x12, x23		// adjust |f/g| accordingly
	eor	x13, x13, x23
	sub	x12, x12, x23
	sub	x13, x13, x23

	ret
.size	__smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31
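// __ab_approximation_31_256 appears to condense |a| and |b| to 64-bit
// approximations: the upper bits come from the highest non-zero limbs,
// while the low 31 bits are taken verbatim from the bottom limbs, which
// keeps the next 31 division steps exact.  The fall-through
// __inner_loop_31_256 keeps |f0|,|g0| and |f1|,|g1| packed in register
// halves, biased by 0x7FFFFFFF so they can be updated branchlessly; the
// bias is removed before returning.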
.type	__ab_approximation_31_256, %function
.align	4
__ab_approximation_31_256:
	ldp	x6, x7, [c1,#8*2]
	ldp	x10, x11, [c1,#8*6]
	ldp	x4, x5, [c1,#8*0]
	ldp	x8, x9, [c1,#8*4]

.Lab_approximation_31_256_loaded:
	orr	x19, x7, x11		// check top-most limbs, ...
	cmp	x19, #0
	csel	x7, x7, x6, ne
	csel	x11, x11, x10, ne
	csel	x6, x6, x5, ne
	orr	x19, x7, x11		// and ones before top-most, ...
	csel	x10, x10, x9, ne

	cmp	x19, #0
	csel	x7, x7, x6, ne
	csel	x11, x11, x10, ne
	csel	x6, x6, x4, ne
	orr	x19, x7, x11		// and one more, ...
	csel	x10, x10, x8, ne

	clz	x19, x19
	cmp	x19, #64
	csel	x19, x19, xzr, ne
	csel	x7, x7, x6, ne
	csel	x11, x11, x10, ne
	neg	x20, x19

	lslv	x7, x7, x19		// align high limbs to the left
	lslv	x11, x11, x19
	lsrv	x6, x6, x20
	lsrv	x10, x10, x20
	and	x6, x6, x20, asr#6
	and	x10, x10, x20, asr#6
	orr	x7, x7, x6
	orr	x11, x11, x10

	bfxil	x7, x4, #0, #31
	bfxil	x11, x8, #0, #31

	b	__inner_loop_31_256
	ret
.size	__ab_approximation_31_256,.-__ab_approximation_31_256

.type	__inner_loop_31_256, %function
.align	4
__inner_loop_31_256:
	mov	x2, #31
	mov	x13, #0x7FFFFFFF80000000	// |f0|=1, |g0|=0
	mov	x15, #0x800000007FFFFFFF	// |f1|=0, |g1|=1
	mov	x23,#0x7FFFFFFF7FFFFFFF

.Loop_31_256:
	sbfx	x22, x7, #0, #1		// if |a_| is odd, then we'll be subtracting
	sub	x2, x2, #1
	and	x19, x11, x22
	sub	x20, x11, x7		// |b_|-|a_|
	subs	x21, x7, x19		// |a_|-|b_| (or |a_|-0 if |a_| was even)
	mov	x19, x15
	csel	x11, x11, x7, hs	// |b_| = |a_|
	csel	x7, x21, x20, hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	csel	x15, x15, x13, hs	// exchange |fg0| and |fg1|
	csel	x13, x13, x19, hs
	lsr	x7, x7, #1
	and	x19, x15, x22
	and	x20, x23, x22
	sub	x13, x13, x19		// |f0|-=|f1| (or |f0-=0| if |a_| was even)
	add	x15, x15, x15		// |f1|<<=1
	add	x13, x13, x20
	sub	x15, x15, x23
	cbnz	x2, .Loop_31_256

	mov	x23, #0x7FFFFFFF
	ubfx	x12, x13, #0, #32
	ubfx	x13, x13, #32, #32
	ubfx	x14, x15, #0, #32
	ubfx	x15, x15, #32, #32
	sub	x12, x12, x23		// remove bias
	sub	x13, x13, x23
	sub	x14, x14, x23
	sub	x15, x15, x23

	ret
.size	__inner_loop_31_256,.-__inner_loop_31_256

.type	__inner_loop_62_256, %function
.align	4
__inner_loop_62_256:
	mov	x12, #1			// |f0|=1
	mov	x13, #0			// |g0|=0
	mov	x14, #0			// |f1|=0
	mov	x15, #1			// |g1|=1

.Loop_62_256:
	sbfx	x22, x7, #0, #1		// if |a_| is odd, then we'll be subtracting
	sub	x2, x2, #1
	and	x19, x11, x22
	sub	x20, x11, x7		// |b_|-|a_|
	subs	x21, x7, x19		// |a_|-|b_| (or |a_|-0 if |a_| was even)
	mov	x19, x12
	csel	x11, x11, x7, hs	// |b_| = |a_|
	csel	x7, x21, x20, hs	// borrow means |a_|<|b_|, replace with |b_|-|a_|
	mov	x20, x13
	csel	x12, x12, x14, hs	// exchange |f0| and |f1|
	csel	x14, x14, x19, hs
	csel	x13, x13, x15, hs	// exchange |g0| and |g1|
	csel	x15, x15, x20, hs
	lsr	x7, x7, #1
	and	x19, x14, x22
	and	x20, x15, x22
	add	x14, x14, x14		// |f1|<<=1
	add	x15, x15, x15		// |g1|<<=1
	sub	x12, x12, x19		// |f0|-=|f1| (or |f0-=0| if |a_| was even)
	sub	x13, x13, x20		// |g0|-=|g1| (or |g0-=0| ...)
	cbnz	x2, .Loop_62_256

	ret
.size	__inner_loop_62_256,.-__inner_loop_62_256