/* Copyright (c) 2020, Google Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ // An implementation of the NIST P-256 elliptic curve point multiplication. // 256-bit Montgomery form for 64 and 32-bit. Field operations are generated by // Fiat, which lives in //third_party/fiat. #include #include "../../limbs/limbs.h" #include "../../limbs/limbs.inl" #include "p256_shared.h" #include "../../internal.h" #include "./util.h" #if !defined(OPENSSL_USE_NISTZ256) #if defined(_MSC_VER) && !defined(__clang__) // '=': conversion from 'int64_t' to 'int32_t', possible loss of data #pragma warning(disable: 4242) // '=': conversion from 'int32_t' to 'uint8_t', possible loss of data #pragma warning(disable: 4244) // 'initializing': conversion from 'size_t' to 'fiat_p256_limb_t' #pragma warning(disable: 4267) #endif #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wconversion" #pragma GCC diagnostic ignored "-Wsign-conversion" #endif // MSVC does not implement uint128_t, and crashes with intrinsics #if defined(BORINGSSL_HAS_UINT128) #if defined(__GNUC__) #pragma GCC diagnostic ignored "-Wpedantic" #endif #define BORINGSSL_NISTP256_64BIT 1 #include "../../../third_party/fiat/p256_64.h" #else #include "../../../third_party/fiat/p256_32.h" #endif // utility functions, handwritten #if 
defined(BORINGSSL_NISTP256_64BIT) #define FIAT_P256_NLIMBS 4 typedef uint64_t fiat_p256_limb_t; typedef uint64_t fiat_p256_felem[FIAT_P256_NLIMBS]; static const fiat_p256_felem fiat_p256_one = {0x1, 0xffffffff00000000, 0xffffffffffffffff, 0xfffffffe}; #else // 64BIT; else 32BIT #define FIAT_P256_NLIMBS 8 typedef uint32_t fiat_p256_limb_t; typedef uint32_t fiat_p256_felem[FIAT_P256_NLIMBS]; static const fiat_p256_felem fiat_p256_one = { 0x1, 0x0, 0x0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0x0}; #endif // 64BIT static fiat_p256_limb_t fiat_p256_nz( const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) { fiat_p256_limb_t ret; fiat_p256_nonzero(&ret, in1); return ret; } static void fiat_p256_copy(fiat_p256_limb_t out[FIAT_P256_NLIMBS], const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) { for (size_t i = 0; i < FIAT_P256_NLIMBS; i++) { out[i] = in1[i]; } } static void fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS], fiat_p256_limb_t t, const fiat_p256_limb_t z[FIAT_P256_NLIMBS], const fiat_p256_limb_t nz[FIAT_P256_NLIMBS]) { fiat_p256_selectznz(out, !!t, z, nz); } // Group operations // ---------------- // // Building on top of the field operations we have the operations on the // elliptic curve group itself. Points on the curve are represented in Jacobian // coordinates. // // Both operations were transcribed to Coq and proven to correspond to naive // implementations using Affine coordinates, for all suitable fields. In the // Coq proofs, issues of constant-time execution and memory layout (aliasing) // conventions were not considered. Specification of affine coordinates: // // As a sanity check, a proof that these points form a commutative group: // // fiat_p256_point_double calculates 2*(x_in, y_in, z_in) // // The method is taken from: // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b // // Coq transcription and correctness proof: // // // // Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. 
// while x_out == y_in is not (maybe this works, but it's not tested). static void fiat_p256_point_double(fiat_p256_felem x_out, fiat_p256_felem y_out, fiat_p256_felem z_out, const fiat_p256_felem x_in, const fiat_p256_felem y_in, const fiat_p256_felem z_in) { fiat_p256_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta; // delta = z^2 fiat_p256_square(delta, z_in); // gamma = y^2 fiat_p256_square(gamma, y_in); // beta = x*gamma fiat_p256_mul(beta, x_in, gamma); // alpha = 3*(x-delta)*(x+delta) fiat_p256_sub(ftmp, x_in, delta); fiat_p256_add(ftmp2, x_in, delta); fiat_p256_add(tmptmp, ftmp2, ftmp2); fiat_p256_add(ftmp2, ftmp2, tmptmp); fiat_p256_mul(alpha, ftmp, ftmp2); // x' = alpha^2 - 8*beta fiat_p256_square(x_out, alpha); fiat_p256_add(fourbeta, beta, beta); fiat_p256_add(fourbeta, fourbeta, fourbeta); fiat_p256_add(tmptmp, fourbeta, fourbeta); fiat_p256_sub(x_out, x_out, tmptmp); // z' = (y + z)^2 - gamma - delta fiat_p256_add(delta, gamma, delta); fiat_p256_add(ftmp, y_in, z_in); fiat_p256_square(z_out, ftmp); fiat_p256_sub(z_out, z_out, delta); // y' = alpha*(4*beta - x') - 8*gamma^2 fiat_p256_sub(y_out, fourbeta, x_out); fiat_p256_add(gamma, gamma, gamma); fiat_p256_square(gamma, gamma); fiat_p256_mul(y_out, alpha, y_out); fiat_p256_add(gamma, gamma, gamma); fiat_p256_sub(y_out, y_out, gamma); } // fiat_p256_point_add calculates (x1, y1, z1) + (x2, y2, z2) // // The method is taken from: // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, // adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). // // Coq transcription and correctness proof: // // // // This function includes a branch for checking whether the two input points // are equal, (while not equal to the point at infinity). This case never // happens during single point multiplication, so there is no timing leak for // ECDH or ECDSA signing. 
// Parameters:
//   x3, y3, z3: output point, in Jacobian coordinates.
//   x1, y1, z1: first input point, in Jacobian coordinates.
//   mixed:      nonzero when the second input is affine, i.e. z2 is assumed
//               to be 1 (or 0 for the point at infinity).
//   x2, y2, z2: second input point.
//
// The result is accumulated in the locals x_out/y_out/z_out and only moved to
// x3/y3/z3 by the final conditional moves (or by the doubling branch).
static void fiat_p256_point_add(fiat_p256_felem x3, fiat_p256_felem y3,
                                fiat_p256_felem z3, const fiat_p256_felem x1,
                                const fiat_p256_felem y1,
                                const fiat_p256_felem z1, const int mixed,
                                const fiat_p256_felem x2,
                                const fiat_p256_felem y2,
                                const fiat_p256_felem z2) {
  fiat_p256_felem x_out, y_out, z_out;
  // Remember whether each input has a nonzero z; a zero z marks the point at
  // infinity and is resolved by the conditional moves at the end.
  fiat_p256_limb_t z1nz = fiat_p256_nz(z1);
  fiat_p256_limb_t z2nz = fiat_p256_nz(z2);

  // z1z1 = z1**2
  fiat_p256_felem z1z1;
  fiat_p256_square(z1z1, z1);

  fiat_p256_felem u1, s1, two_z1z2;
  if (!mixed) {
    // z2z2 = z2**2
    fiat_p256_felem z2z2;
    fiat_p256_square(z2z2, z2);

    // u1 = x1*z2z2
    fiat_p256_mul(u1, x1, z2z2);

    // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2
    fiat_p256_add(two_z1z2, z1, z2);
    fiat_p256_square(two_z1z2, two_z1z2);
    fiat_p256_sub(two_z1z2, two_z1z2, z1z1);
    fiat_p256_sub(two_z1z2, two_z1z2, z2z2);

    // s1 = y1 * z2**3
    fiat_p256_mul(s1, z2, z2z2);
    fiat_p256_mul(s1, s1, y1);
  } else {
    // We'll assume z2 = 1 (special case z2 = 0 is handled later).

    // u1 = x1*z2z2 (a plain copy, since z2z2 = 1)
    fiat_p256_copy(u1, x1);
    // two_z1z2 = 2z1z2 (= 2*z1, since z2 = 1)
    fiat_p256_add(two_z1z2, z1, z1);
    // s1 = y1 * z2**3 (a plain copy, since z2 = 1)
    fiat_p256_copy(s1, y1);
  }

  // u2 = x2*z1z1
  fiat_p256_felem u2;
  fiat_p256_mul(u2, x2, z1z1);

  // h = u2 - u1
  fiat_p256_felem h;
  fiat_p256_sub(h, u2, u1);

  // xneq is the nonzero-ness of h = u2 - u1.
  fiat_p256_limb_t xneq = fiat_p256_nz(h);

  // z_out = two_z1z2 * h
  fiat_p256_mul(z_out, h, two_z1z2);

  // z1z1z1 = z1 * z1z1
  fiat_p256_felem z1z1z1;
  fiat_p256_mul(z1z1z1, z1, z1z1);

  // s2 = y2 * z1**3
  fiat_p256_felem s2;
  fiat_p256_mul(s2, y2, z1z1z1);

  // r = (s2 - s1)*2
  fiat_p256_felem r;
  fiat_p256_sub(r, s2, s1);
  fiat_p256_add(r, r, r);

  // yneq is the nonzero-ness of r = 2*(s2 - s1).
  fiat_p256_limb_t yneq = fiat_p256_nz(r);

  // Nonzero only when h and r are both zero while z1 and z2 are both nonzero,
  // i.e. the inputs are the same finite point. NOTE(review): this reading
  // assumes constant_time_is_zero_w (defined elsewhere) yields an all-ones
  // mask for a zero argument and zero otherwise -- confirm.
  fiat_p256_limb_t is_nontrivial_double = constant_time_is_zero_w(xneq | yneq) &
                                          ~constant_time_is_zero_w(z1nz) &
                                          ~constant_time_is_zero_w(z2nz);
  if (is_nontrivial_double) {
    // Delegate to the doubling routine; this is the branch referred to in the
    // function's header comment.
    fiat_p256_point_double(x3, y3, z3, x1, y1, z1);
    return;
  }

  // I = (2h)**2
  fiat_p256_felem i;
  fiat_p256_add(i, h, h);
  fiat_p256_square(i, i);

  // J = h * I
  fiat_p256_felem j;
  fiat_p256_mul(j, h, i);

  // V = U1 * I
  fiat_p256_felem v;
  fiat_p256_mul(v, u1, i);

  // x_out = r**2 - J - 2V
  fiat_p256_square(x_out, r);
  fiat_p256_sub(x_out, x_out, j);
  fiat_p256_sub(x_out, x_out, v);
  fiat_p256_sub(x_out, x_out, v);

  // y_out = r(V-x_out) - 2 * s1 * J
  fiat_p256_sub(y_out, v, x_out);
  fiat_p256_mul(y_out, y_out, r);
  fiat_p256_felem s1j;
  fiat_p256_mul(s1j, s1, j);
  fiat_p256_sub(y_out, y_out, s1j);
  fiat_p256_sub(y_out, y_out, s1j);

  // Resolve the point-at-infinity cases without branching. For each
  // coordinate: first replace the computed value by the second input's
  // coordinate if z1 is zero, then write the first input's coordinate to the
  // output if z2 is zero, and the (possibly replaced) value otherwise.
  fiat_p256_cmovznz(x_out, z1nz, x2, x_out);
  fiat_p256_cmovznz(x3, z2nz, x1, x_out);
  fiat_p256_cmovznz(y_out, z1nz, y2, y_out);
  fiat_p256_cmovznz(y3, z2nz, y1, y_out);
  fiat_p256_cmovznz(z_out, z1nz, z2, z_out);
  fiat_p256_cmovznz(z3, z2nz, z1, z_out);
}

// Provides fiat_p256_g_pre_comp, the table used by p256_point_mul_base.
#include "./p256_table.h"

// fiat_p256_select_point_affine selects the |idx-1|th point from a
// precomputation table and copies it to out. If |idx| is zero, the output is
// the point at infinity.
//
// Every one of the |size| table entries is visited and passed through a
// conditional move, so the access pattern does not depend on |idx|. The
// entries hold two coordinates each; out[2] is set to fiat_p256_one when
// |idx| is nonzero and left all-zero otherwise.
static void fiat_p256_select_point_affine(
    const fiat_p256_limb_t idx, size_t size,
    const fiat_p256_felem pre_comp[/*size*/][2], fiat_p256_felem out[3]) {
  // Start from all-zero coordinates; this is what remains when |idx| is zero,
  // because then no table index equals idx - 1.
  OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3);
  for (size_t i = 0; i < size; i++) {
    // mismatch is zero exactly when i == idx - 1.
    fiat_p256_limb_t mismatch = i ^ (idx - 1);
    fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]);
    fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]);
  }
  // Z = one for a real table entry, Z = 0 (unchanged) when idx == 0.
  fiat_p256_cmovznz(out[2], idx, out[2], fiat_p256_one);
}

// fiat_p256_select_point selects the |idx|th point from a precomputation table
// and copies it to out.
static void fiat_p256_select_point(const fiat_p256_limb_t idx, size_t size, const fiat_p256_felem pre_comp[/*size*/][3], fiat_p256_felem out[3]) { OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3); for (size_t i = 0; i < size; i++) { fiat_p256_limb_t mismatch = i ^ idx; fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]); fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]); fiat_p256_cmovznz(out[2], mismatch, pre_comp[i][2], out[2]); } } // fiat_p256_get_bit returns the |i|th bit in |in| static crypto_word fiat_p256_get_bit(const uint8_t *in, int i) { if (i < 0 || i >= 256) { return 0; } return (in[i >> 3] >> (i & 7)) & 1; } void p256_point_mul(P256_POINT *r, const Limb scalar[P256_LIMBS], const Limb p_x[P256_LIMBS], const Limb p_y[P256_LIMBS]) { debug_assert_nonsecret(r != NULL); debug_assert_nonsecret(scalar != NULL); debug_assert_nonsecret(p_x != NULL); debug_assert_nonsecret(p_y != NULL); P256_SCALAR_BYTES scalar_bytes; p256_scalar_bytes_from_limbs(scalar_bytes, scalar); fiat_p256_felem p_pre_comp[17][3]; OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp)); // Precompute multiples. limbs_copy(&p_pre_comp[1][0][0], p_x, P256_LIMBS); limbs_copy(&p_pre_comp[1][1][0], p_y, P256_LIMBS); limbs_copy(&p_pre_comp[1][2][0], fiat_p256_one, P256_LIMBS); for (size_t j = 2; j <= 16; ++j) { if (j & 1) { fiat_p256_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2], p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2], 0, p_pre_comp[j - 1][0], p_pre_comp[j - 1][1], p_pre_comp[j - 1][2]); } else { fiat_p256_point_double(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2], p_pre_comp[j / 2][0], p_pre_comp[j / 2][1], p_pre_comp[j / 2][2]); } } // Set nq to the point at infinity. fiat_p256_felem nq[3] = {{0}, {0}, {0}}, ftmp, tmp[3]; // Loop over |scalar| msb-to-lsb, incorporating |p_pre_comp| every 5th round. int skip = 1; // Save two point operations in the first round. 
for (size_t i = 255; i < 256; i--) { // double if (!skip) { fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); } // do other additions every 5 doublings if (i % 5 == 0) { crypto_word bits = fiat_p256_get_bit(scalar_bytes, i + 4) << 5; bits |= fiat_p256_get_bit(scalar_bytes, i + 3) << 4; bits |= fiat_p256_get_bit(scalar_bytes, i + 2) << 3; bits |= fiat_p256_get_bit(scalar_bytes, i + 1) << 2; bits |= fiat_p256_get_bit(scalar_bytes, i) << 1; bits |= fiat_p256_get_bit(scalar_bytes, i - 1); crypto_word sign, digit; recode_scalar_bits(&sign, &digit, bits); // select the point to add or subtract, in constant time. fiat_p256_select_point(digit, 17, RING_CORE_POINTLESS_ARRAY_CONST_CAST((const fiat_p256_felem(*)[3]))p_pre_comp, tmp); fiat_p256_opp(ftmp, tmp[1]); // (X, -Y, Z) is the negative point. fiat_p256_cmovznz(tmp[1], sign, tmp[1], ftmp); if (!skip) { fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* mixed */, tmp[0], tmp[1], tmp[2]); } else { fiat_p256_copy(nq[0], tmp[0]); fiat_p256_copy(nq[1], tmp[1]); fiat_p256_copy(nq[2], tmp[2]); skip = 0; } } } limbs_copy(r->X, nq[0], P256_LIMBS); limbs_copy(r->Y, nq[1], P256_LIMBS); limbs_copy(r->Z, nq[2], P256_LIMBS); } void p256_point_mul_base(P256_POINT *r, const Limb scalar[P256_LIMBS]) { P256_SCALAR_BYTES scalar_bytes; p256_scalar_bytes_from_limbs(scalar_bytes, scalar); // Set nq to the point at infinity. fiat_p256_felem nq[3] = {{0}, {0}, {0}}, tmp[3]; int skip = 1; // Save two point operations in the first round. for (size_t i = 31; i < 32; i--) { if (!skip) { fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); } // First, look 32 bits upwards. crypto_word bits = fiat_p256_get_bit(scalar_bytes, i + 224) << 3; bits |= fiat_p256_get_bit(scalar_bytes, i + 160) << 2; bits |= fiat_p256_get_bit(scalar_bytes, i + 96) << 1; bits |= fiat_p256_get_bit(scalar_bytes, i + 32); // Select the point to add, in constant time. 
fiat_p256_select_point_affine(bits, 15, fiat_p256_g_pre_comp[1], tmp); if (!skip) { fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, tmp[0], tmp[1], tmp[2]); } else { fiat_p256_copy(nq[0], tmp[0]); fiat_p256_copy(nq[1], tmp[1]); fiat_p256_copy(nq[2], tmp[2]); skip = 0; } // Second, look at the current position. bits = fiat_p256_get_bit(scalar_bytes, i + 192) << 3; bits |= fiat_p256_get_bit(scalar_bytes, i + 128) << 2; bits |= fiat_p256_get_bit(scalar_bytes, i + 64) << 1; bits |= fiat_p256_get_bit(scalar_bytes, i); // Select the point to add, in constant time. fiat_p256_select_point_affine(bits, 15, fiat_p256_g_pre_comp[0], tmp); fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, tmp[0], tmp[1], tmp[2]); } limbs_copy(r->X, nq[0], P256_LIMBS); limbs_copy(r->Y, nq[1], P256_LIMBS); limbs_copy(r->Z, nq[2], P256_LIMBS); } void p256_mul_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS], const Limb b[P256_LIMBS]) { fiat_p256_mul(r, a, b); } void p256_sqr_mont(Limb r[P256_LIMBS], const Limb a[P256_LIMBS]) { fiat_p256_square(r, a); } void p256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT *b) { fiat_p256_point_add(r->X, r->Y, r->Z, a->X, a->Y, a->Z, 0, b->X, b->Y, b->Z); } void p256_point_double(P256_POINT *r, const P256_POINT *a) { fiat_p256_point_double(r->X, r->Y, r->Z, a->X, a->Y, a->Z); } // For testing only. void p256_point_add_affine(P256_POINT *r, const P256_POINT *a, const BN_ULONG b[P256_LIMBS * 2]) { const Limb *b_x = &b[0]; const Limb *b_y = &b[P256_LIMBS]; fiat_p256_felem b_z = {0}; crypto_word b_is_inf = constant_time_select_w( LIMBS_are_zero(b_x, P256_LIMBS), LIMBS_are_zero(b_y, P256_LIMBS), 0); fiat_p256_cmovznz(b_z, constant_time_is_zero_w(b_is_inf), b_z, fiat_p256_one); fiat_p256_point_add(r->X, r->Y, r->Z, a->X, a->Y, a->Z, 1, b_x, b_y, b_z); } #undef BORINGSSL_NISTP256_64BIT #endif /* !defined(OPENSSL_USE_NISTZ256) */