/*********************************************************************************** * FourQlib: a high-performance crypto library based on the elliptic curve FourQ * * Copyright (c) Microsoft Corporation. All rights reserved. * * Abstract: portable modular arithmetic and other low-level operations ************************************************************************************/ #ifndef __FP_H__ #define __FP_H__ // For C++ #ifdef __cplusplus extern "C" { #endif #include "../FourQ_params.h" #include "../table_lookup.h" const digit_t mask_7fff = (digit_t)(-1) >> 1; const digit_t prime1271_0 = (digit_t)(-1); #define prime1271_1 mask_7fff void digit_x_digit(digit_t a, digit_t b, digit_t *c) { // Digit multiplication, digit * digit -> 2-digit result register digit_t al, ah, bl, bh, temp; digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t) * 4), mask_high = (digit_t)(-1) << (sizeof(digit_t) * 4); al = a & mask_low; // Low part ah = a >> (sizeof(digit_t) * 4); // High part bl = b & mask_low; bh = b >> (sizeof(digit_t) * 4); albl = al * bl; albh = al * bh; ahbl = ah * bl; ahbh = ah * bh; c[0] = albl & mask_low; // C00 res1 = albl >> (sizeof(digit_t) * 4); res2 = ahbl & mask_low; res3 = albh & mask_low; temp = res1 + res2 + res3; carry = temp >> (sizeof(digit_t) * 4); c[0] ^= temp << (sizeof(digit_t) * 4); // C01 res1 = ahbl >> (sizeof(digit_t) * 4); res2 = albh >> (sizeof(digit_t) * 4); res3 = ahbh & mask_low; temp = res1 + res2 + res3 + carry; c[1] = temp & mask_low; // C10 carry = temp & mask_high; c[1] ^= (ahbh & mask_high) + carry; // C11 } void fpcopy1271(felm_t a, felm_t c) { // Copy of a field element, c = a unsigned int i; for (i = 0; i < NWORDS_FIELD; i++) c[i] = a[i]; } static void fpzero1271(felm_t a) { // Zeroing a field element, a = 0 unsigned int i; for (i = 0; i < NWORDS_FIELD; i++) a[i] = 0; } void fpadd1271(felm_t a, felm_t b, felm_t c) { // Field addition, c = a+b mod p unsigned int i; unsigned int carry = 0; for (i = 0; i < NWORDS_FIELD; i++) { ADDC(carry, a[i], b[i], carry, c[i]); } carry = (unsigned int)(c[NWORDS_FIELD - 1] >> (RADIX - 1)); c[NWORDS_FIELD - 1] &= mask_7fff; for (i = 0; i < NWORDS_FIELD; i++) { ADDC(carry, c[i], 0, carry, c[i]); } } void fpsub1271(felm_t a, felm_t b, felm_t c) { // Field subtraction, c = a-b mod p unsigned int i; unsigned int borrow = 0; for (i = 0; i < NWORDS_FIELD; i++) { SUBC(borrow, a[i], b[i], borrow, c[i]); } borrow = (unsigned int)(c[NWORDS_FIELD - 1] >> (RADIX - 1)); c[NWORDS_FIELD - 1] &= mask_7fff; for (i = 0; i < NWORDS_FIELD; i++) { SUBC(borrow, c[i], 0, borrow, c[i]); } } void fpneg1271(felm_t a) { // Field negation, a = -a mod p unsigned int i; unsigned int borrow = 0; for (i = 0; i < (NWORDS_FIELD - 1); i++) { SUBC(borrow, prime1271_0, a[i], borrow, a[i]); } a[NWORDS_FIELD - 1] = prime1271_1 - a[NWORDS_FIELD - 1]; } void fpmul1271(felm_t a, felm_t b, felm_t c) { // Field multiplication using schoolbook method, c = a*b mod p unsigned int i, j; digit_t u, v, UV[2], temp, bit_mask; digit_t t[2 * NWORDS_FIELD] = { 0 }; unsigned int carry = 0; for (i = 0; i < NWORDS_FIELD; i++) { u = 0; for (j = 0; j < NWORDS_FIELD; j++) { MUL(a[i], b[j], UV + 1, UV[0]); ADDC(0, UV[0], u, carry, v); u = UV[1] + carry; ADDC(0, t[i + j], v, carry, v); u = u + carry; t[i + j] = v; } t[NWORDS_FIELD + i] = u; } bit_mask = (t[NWORDS_FIELD - 1] >> (RADIX - 1)); t[NWORDS_FIELD - 1] &= mask_7fff; carry = 0; for (i = 0; i < NWORDS_FIELD; i++) { temp = (t[NWORDS_FIELD + i] >> (RADIX - 1)); t[NWORDS_FIELD + i] = (t[NWORDS_FIELD + i] << 1) + bit_mask; bit_mask = temp; ADDC(carry, t[i], t[NWORDS_FIELD + i], carry, t[i]); } carry = (unsigned int)(t[NWORDS_FIELD - 1] >> (RADIX - 1)); t[NWORDS_FIELD - 1] &= mask_7fff; for (i = 0; i < NWORDS_FIELD; i++) { ADDC(carry, t[i], 0, carry, c[i]); } } void fpsqr1271(felm_t a, felm_t c) { // Field squaring using schoolbook method, c = a^2 mod p fpmul1271(a, a, c); } void mod1271(felm_t a) { // Modular correction, a = a mod (2^127-1) digit_t mask; unsigned int i; unsigned int borrow = 0; for (i = 0; i < (NWORDS_FIELD - 1); i++) { SUBC(borrow, a[i], prime1271_0, borrow, a[i]); } SUBC(borrow, a[NWORDS_FIELD - 1], prime1271_1, borrow, a[NWORDS_FIELD - 1]); mask = 0 - (digit_t)borrow; // If result < 0 then mask = 0xFF...F else sign = 0x00...0 borrow = 0; for (i = 0; i < (NWORDS_FIELD - 1); i++) { ADDC(borrow, a[i], mask, borrow, a[i]); } ADDC(borrow, a[NWORDS_FIELD - 1], (mask >> 1), borrow, a[NWORDS_FIELD - 1]); } void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) { // Schoolbook multiprecision multiply, c = a*b unsigned int i, j; digit_t u, v, UV[2]; unsigned int carry = 0; for (i = 0; i < (2 * nwords); i++) c[i] = 0; for (i = 0; i < nwords; i++) { u = 0; for (j = 0; j < nwords; j++) { MUL(a[i], b[j], UV + 1, UV[0]); ADDC(0, UV[0], u, carry, v); u = UV[1] + carry; ADDC(0, c[i + j], v, carry, v); u = u + carry; c[i + j] = v; } c[nwords + i] = u; } } unsigned int mp_add(digit_t *a, digit_t *b, digit_t *c, unsigned int nwords) { // Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit unsigned int i, carry = 0; for (i = 0; i < nwords; i++) { ADDC(carry, a[i], b[i], carry, c[i]); } return carry; } void fpexp1251(felm_t a, felm_t af) { // Exponentiation over GF(p), af = a^(125-1) int i; felm_t t1, t2, t3, t4, t5; fpsqr1271(a, t2); fpmul1271(a, t2, t2); fpsqr1271(t2, t3); fpsqr1271(t3, t3); fpmul1271(t2, t3, t3); fpsqr1271(t3, t4); fpsqr1271(t4, t4); fpsqr1271(t4, t4); fpsqr1271(t4, t4); fpmul1271(t3, t4, t4); fpsqr1271(t4, t5); for (i = 0; i < 7; i++) fpsqr1271(t5, t5); fpmul1271(t4, t5, t5); fpsqr1271(t5, t2); for (i = 0; i < 15; i++) fpsqr1271(t2, t2); fpmul1271(t5, t2, t2); fpsqr1271(t2, t1); for (i = 0; i < 31; i++) fpsqr1271(t1, t1); fpmul1271(t2, t1, t1); for (i = 0; i < 32; i++) fpsqr1271(t1, t1); fpmul1271(t1, t2, t1); for (i = 0; i < 16; i++) fpsqr1271(t1, t1); fpmul1271(t5, t1, t1); for (i = 0; i < 8; i++) fpsqr1271(t1, t1); fpmul1271(t4, t1, t1); for (i = 0; i < 4; i++) fpsqr1271(t1, t1); fpmul1271(t3, t1, t1); fpsqr1271(t1, t1); fpmul1271(a, t1, af); } void fpinv1271(felm_t a) { // Field inversion, af = a^-1 = a^(p-2) mod p // Hardcoded for p = 2^127-1 felm_t t; fpexp1251(a, t); fpsqr1271(t, t); fpsqr1271(t, t); fpmul1271(a, t, a); } static void multiply(const digit_t *a, const digit_t *b, digit_t *c) { // Schoolbook multiprecision multiply, c = a*b mp_mul(a, b, c, NWORDS_ORDER); } static unsigned int add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) { // Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit return mp_add((digit_t *)a, (digit_t *)b, c, (unsigned int)nwords); } unsigned int subtract(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) { // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit unsigned int i; unsigned int borrow = 0; for (i = 0; i < nwords; i++) { SUBC(borrow, a[i], b[i], borrow, c[i]); } return borrow; } void subtract_mod_order(const digit_t *a, const digit_t *b, digit_t *c) { // Subtraction modulo the curve order, c = a-b mod order digit_t mask, carry = 0; digit_t *order = (digit_t *)curve_order; unsigned int i, bout; bout = subtract(a, b, c, NWORDS_ORDER); // (bout, c) = a - b mask = 0 - (digit_t)bout; // if bout = 0 then mask = 0x00..0, else if bout = 1 then mask = 0xFF..F for (i = 0; i < NWORDS_ORDER; i++) { // c = c + (mask & order) ADDC(carry, c[i], mask & order[i], carry, c[i]); } } void add_mod_order(const digit_t *a, const digit_t *b, digit_t *c) { // Addition modulo the curve order, c = a+b mod order add(a, b, c, NWORDS_ORDER); // c = a + b subtract_mod_order(c, (digit_t *)&curve_order, c); // if c >= order then c = c - order } void Montgomery_multiply_mod_order(const digit_t *ma, const digit_t *mb, digit_t *mc) { // 256-bit Montgomery multiplication modulo the curve order, mc = ma*mb*r' mod order, where // ma,mb,mc in [0, order-1] ma, mb and mc are assumed to be in Montgomery representation The // Montgomery constant r' = -r^(-1) mod 2^(log_2(r)) is the global value "Montgomery_rprime", // where r is the order unsigned int i; digit_t mask, P[2 * NWORDS_ORDER], Q[2 * NWORDS_ORDER], temp[2 * NWORDS_ORDER]; digit_t *order = (digit_t *)curve_order; unsigned int cout = 0, bout = 0; multiply(ma, mb, P); // P = ma * mb multiply(P, (digit_t *)&Montgomery_rprime, Q); // Q = P * r' mod 2^(log_2(r)) multiply(Q, (digit_t *)&curve_order, temp); // temp = Q * r cout = add(P, temp, temp, 2 * NWORDS_ORDER); // (cout, temp) = P + Q * r for (i = 0; i < NWORDS_ORDER; i++) { // (cout, mc) = (P + Q * r)/2^(log_2(r)) mc[i] = temp[NWORDS_ORDER + i]; } // Final, constant-time subtraction bout = subtract(mc, (digit_t *)&curve_order, mc, NWORDS_ORDER); // (cout, mc) = (cout, mc) - r mask = (digit_t)cout - (digit_t)bout; // if (cout, mc) >= 0 then mask = 0x00..0, else if (cout, // mc) < 0 then mask = 0xFF..F for (i = 0; i < NWORDS_ORDER; i++) { // temp = mask & r temp[i] = (order[i] & mask); } add(mc, temp, mc, NWORDS_ORDER); // mc = mc + (mask & r) return; } void modulo_order(digit_t *a, digit_t *c) { // Reduction modulo the order using Montgomery arithmetic // ma = a*Montgomery_Rprime mod r, where a,ma in [0, r-1], a,ma,r < 2^256 // c = ma*1*Montgomery_Rprime^(-1) mod r, where ma,c in [0, r-1], ma,c,r < 2^256 digit_t ma[NWORDS_ORDER], one[NWORDS_ORDER] = { 0 }; one[0] = 1; Montgomery_multiply_mod_order(a, (digit_t *)&Montgomery_Rprime, ma); Montgomery_multiply_mod_order(ma, one, c); } void conversion_to_odd(digit_t *k, digit_t *k_odd) { // Convert scalar to odd if even using the prime subgroup order r digit_t i, mask; digit_t *order = (digit_t *)curve_order; unsigned int carry = 0; mask = ~(0 - (k[0] & 1)); for (i = 0; i < NWORDS_ORDER; i++) { // If (k is odd) then k_odd = k else k_odd = k + r ADDC(carry, order[i] & mask, k[i], carry, k_odd[i]); } } void fpdiv1271(felm_t a) { // Field division by two, c = a/2 mod p digit_t mask; unsigned int carry = 0; unsigned int i; mask = 0 - (a[0] & 1); // if a is odd then mask = 0xFF...FF, else mask = 0 for (i = 0; i < (NWORDS_FIELD - 1); i++) { ADDC(carry, mask, a[i], carry, a[i]); } ADDC(carry, (mask >> 1), a[NWORDS_FIELD - 1], carry, a[NWORDS_FIELD - 1]); for (i = 0; i < (NWORDS_FIELD - 1); i++) { SHIFTR(a[i + 1], a[i], 1, a[i], RADIX); } a[NWORDS_FIELD - 1] = (a[NWORDS_FIELD - 1] >> 1); } void fp2div1271(f2elm_t a) { // GF(p^2) division by two c = a/2 mod p fpdiv1271(a[0]); fpdiv1271(a[1]); } #ifdef __cplusplus } #endif #endif