/*
 * Copyright (c) 2017-2021 Emil Lenngren
 * Copyright (c) 2021 Shortcut Labs AB
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#include "p256-cortex-m4-config.h"
#include "p256-cortex-m4.h"

typedef const uint32_t (*constarr)[8];

struct FGInteger {
    // To get the value this struct represents,
    // interpret signed_value as a two's complement 288-bit little endian integer,
    // and negate if flip_sign is -1
    int flip_sign; // 0 or -1
    uint32_t signed_value[9]; // of 288 bits, 257 are useful (top 31 bits are sign-extended from bit 256)
};

struct XYInteger {
    // To get the value this struct represents,
    // interpret value as an unsigned 256-bit little endian integer,
    // and negate if flip_sign is -1
    int flip_sign; // 0 or -1
    uint32_t value[8]; // unsigned value, 0 <= value < P256_order
};

int P256_divsteps2_31(int delta, uint32_t f, uint32_t g, uint32_t res_matrix[4]);
void P256_matrix_mul_fg_9(uint32_t a, uint32_t b, const struct FGInteger fg[2], struct FGInteger *res);
void P256_matrix_mul_mod_n(uint32_t a, uint32_t b, const struct XYInteger xy[2], struct XYInteger *res);

void P256_to_montgomery(uint32_t aR[8], const uint32_t a[8]);
void P256_from_montgomery(uint32_t a[8], const uint32_t aR[8]);
bool P256_check_range_p(const uint32_t a[8]);
bool P256_check_range_n(const uint32_t a[8]);
void P256_mul_mod_n(uint32_t res[8], const uint32_t a[8], const uint32_t b[8]);
void P256_add_mod_n(uint32_t res[8], const uint32_t a[8], const uint32_t b[8]);
void P256_mod_n_inv_vartime(uint32_t res[8], const uint32_t a[8]);
void P256_reduce_mod_n_32bytes(uint32_t res[8], const uint32_t a[8]);
void P256_select_point(uint32_t (*output)[8], uint32_t* table, uint32_t num_coordinates, uint32_t index);
void P256_jacobian_to_affine(uint32_t affine_mont_x[8], uint32_t affine_mont_y[8], const uint32_t jacobian_mont[3][8]);
bool P256_point_is_on_curve(const uint32_t x_mont[8], const uint32_t y_mont[8]);
bool P256_decompress_point(uint32_t y[8], const uint32_t x[8], uint32_t y_parity);
void P256_double_j(uint32_t jacobian_point_out[3][8], const uint32_t jacobian_point_in[3][8]);
void P256_add_sub_j(uint32_t jacobian_point1[3][8], const uint32_t (*point2)[8], bool is_sub, bool p2_is_affine);
bool P256_verify_last_step(const uint32_t r[8], const uint32_t jacobian_point[3][8]);
void P256_negate_mod_p_if(uint32_t out[8], const uint32_t in[8], uint32_t should_negate);
void P256_negate_mod_n_if(uint32_t out[8], const uint32_t in[8], uint32_t should_negate);

extern uint32_t P256_order[9];
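// Worked example of the representations above (illustrative, not part of the
// original source): an FGInteger with flip_sign == 0 and
// signed_value == {5, 0, 0, 0, 0, 0, 0, 0, 0} represents +5, and the same
// signed_value with flip_sign == -1 represents -5. Since signed_value is itself
// two's complement, -5 can equivalently be stored with flip_sign == 0 and
// signed_value == {0xfffffffb} followed by eight 0xffffffff limbs. Both
// encodings are valid, which presumably lets the assembly routines defer sign
// handling to a single conditional negation at the end.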
#if include_p256_mult
static const uint32_t one_montgomery[8] = {1, 0, 0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0};
#endif

#if include_p256_verify
// This table contains 1G, 3G, 5G, ... 15G in affine coordinates in montgomery form
static const uint32_t p256_basepoint_precomp[8][2][8] = {
    {{0x18a9143c, 0x79e730d4, 0x5fedb601, 0x75ba95fc, 0x77622510, 0x79fb732b, 0xa53755c6, 0x18905f76},
     {0xce95560a, 0xddf25357, 0xba19e45c, 0x8b4ab8e4, 0xdd21f325, 0xd2e88688, 0x25885d85, 0x8571ff18}},
    {{0x4eebc127, 0xffac3f90, 0x87d81fb, 0xb027f84a, 0x87cbbc98, 0x66ad77dd, 0xb6ff747e, 0x26936a3f},
     {0xc983a7eb, 0xb04c5c1f, 0x861fe1a, 0x583e47ad, 0x1a2ee98e, 0x78820831, 0xe587cc07, 0xd5f06a29}},
    {{0xc45c61f5, 0xbe1b8aae, 0x94b9537d, 0x90ec649a, 0xd076c20c, 0x941cb5aa, 0x890523c8, 0xc9079605},
     {0xe7ba4f10, 0xeb309b4a, 0xe5eb882b, 0x73c568ef, 0x7e7a1f68, 0x3540a987, 0x2dd1e916, 0x73a076bb}},
    {{0xa0173b4f, 0x746354e, 0xd23c00f7, 0x2bd20213, 0xc23bb08, 0xf43eaab5, 0xc3123e03, 0x13ba5119},
     {0x3f5b9d4d, 0x2847d030, 0x5da67bdd, 0x6742f2f2, 0x77c94195, 0xef933bdc, 0x6e240867, 0xeaedd915}},
    {{0x264e20e8, 0x75c96e8f, 0x59a7a841, 0xabe6bfed, 0x44c8eb00, 0x2cc09c04, 0xf0c4e16b, 0xe05b3080},
     {0xa45f3314, 0x1eb7777a, 0xce5d45e3, 0x56af7bed, 0x88b12f1a, 0x2b6e019a, 0xfd835f9b, 0x86659cd}},
    {{0x6245e404, 0xea7d260a, 0x6e7fdfe0, 0x9de40795, 0x8dac1ab5, 0x1ff3a415, 0x649c9073, 0x3e7090f1},
     {0x2b944e88, 0x1a768561, 0xe57f61c8, 0x250f939e, 0x1ead643d, 0xc0daa89, 0xe125b88e, 0x68930023}},
    {{0x4b2ed709, 0xccc42563, 0x856fd30d, 0xe356769, 0x559e9811, 0xbcbcd43f, 0x5395b759, 0x738477ac},
     {0xc00ee17f, 0x35752b90, 0x742ed2e3, 0x68748390, 0xbd1f5bc1, 0x7cd06422, 0xc9e7b797, 0xfbc08769}},
    {{0xbc60055b, 0x72bcd8b7, 0x56e27e4b, 0x3cc23ee, 0xe4819370, 0xee337424, 0xad3da09, 0xe2aa0e43},
     {0x6383c45d, 0x40b8524f, 0x42a41b25, 0xd7663554, 0x778a4797, 0x64efa6de, 0x7079adf4, 0x2042170a}}
};
#endif
#if include_fast_p256_basemult
// This contains two tables, 8 points each in affine coordinates in montgomery form
// The first table contains these points:
// (2^192 - 2^128 - 2^64 - 1)G
// (2^192 - 2^128 - 2^64 + 1)G
// (2^192 - 2^128 + 2^64 - 1)G
// (2^192 - 2^128 + 2^64 + 1)G
// (2^192 + 2^128 - 2^64 - 1)G
// (2^192 + 2^128 - 2^64 + 1)G
// (2^192 + 2^128 + 2^64 - 1)G
// (2^192 + 2^128 + 2^64 + 1)G
// The second table contains the same points multiplied by 2^32
static const uint32_t p256_basepoint_precomp2[2][8][2][8] = {
    {
        {{0x670844e0, 0x52d8a7c9, 0xef68a29d, 0xe33bdc, 0x4bdb7361, 0xf3d2848, 0x91c5304d, 0x5222c821},
         {0xdf73fc25, 0xea6d2944, 0x255c81b, 0xa04c0f55, 0xefe488a8, 0x29acdc97, 0x80a560de, 0xbe2e158f}},
        {{0x2b13e673, 0xfc8511ee, 0xd103ed24, 0xffc58dee, 0xea7e99b8, 0x1022523a, 0x4afc8a17, 0x8f43ea39},
         {0xc5f33d0b, 0x8f4e2dbc, 0xd0aa1681, 0x3bc099fa, 0x79ff9df1, 0xffbb7b41, 0xd58b57c4, 0x180de09d}},
        {{0x8bd1cda5, 0x56430752, 0x8e05eda5, 0x1807577f, 0x956896e9, 0x99c699b, 0xf1f0efb5, 0x83d6093d},
         {0xed97061c, 0xef5af17e, 0x30d4c3c, 0x35b977b8, 0x49229439, 0x81fa75a2, 0xa0b6d35d, 0xf5a22070}},
        {{0x74f81cf1, 0x814c5365, 0x120065b, 0xe30baff7, 0x15132621, 0x80ae1256, 0x36a80788, 0x16d2b8cb},
         {0xecc50bca, 0x33d14697, 0x17aedd21, 0x19a9dfb0, 0xedc3f766, 0x523fbcc7, 0xb2cf5afd, 0x9c4de6dd}},
        {{0xcf0d9f6d, 0x5305a9e6, 0x81a9b021, 0x5839172f, 0x75c687cf, 0xcca7a4dd, 0x844be22f, 0x36d59b3e},
         {0x111a53e9, 0xcace7e62, 0xf063f3a1, 0x91c843d4, 0xda812da, 0xbf77e5f0, 0x437f3176, 0xe64af9c}},
        {{0xcf07517d, 0xdbd568bb, 0xba6830b9, 0x2f1afba2, 0xe6c4c2a6, 0x15b6807c, 0xe4966aef, 0x91c7eabc},
         {0xd6b2b6e6, 0x716dea1b, 0x19f85b4b, 0x248c43d1, 0x4a315e2a, 0x16dcfd60, 0xc72b3d0b, 0x15fdd303}},
        {{0x42b7dfd5, 0xe40bf9f4, 0x2d934f2a, 0x673689f3, 0x30a6f50b, 0x8314beb4, 0x976ec64e, 0xd17af2bc},
         {0x1ee7ddf1, 0x39f66c4f, 0x68ea373c, 0x7f68e18b, 0x53d0b186, 0x5166c1f2, 0x7be58f14, 0x95dda601}},
        {{0x42913074, 0xd5ae356, 0x48a542b1, 0x55491b27, 0xb310732a, 0x469ca665, 0x5f1a4cc1, 0x29591d52},
         {0xb84f983f, 0xe76f5b6b, 0x9f5f84e1, 0xbe7eef41, 0x80baa189, 0x1200d496, 0x18ef332c, 0x6376551f}}
    },
    {
        {{0x7c4e54f5, 0xb9e5cbc0, 0xe1410e34, 0xc53a1a17, 0xec454425, 0x3e199130, 0x1700902e, 0xb029c97e},
         {0x786423b6, 0x2de66e11, 0xb41a95be, 0x262dc914, 0x451b683, 0x51766abd, 0x85bb6fb1, 0x55ad5f34}},
        {{0x9066cb79, 0x74f4f1c, 0x30c8b94e, 0x1ab31bd6, 0xd74275b3, 0x6d3f012f, 0x9ddcce40, 0xa214d0b1},
         {0xd165050a, 0x24aedf74, 0xe0e5dc3e, 0x95f17ece, 0xd9224456, 0x6ada9cda, 0x2dd60eea, 0x1fadb2d1}},
        {{0xe20cfb9b, 0xa3d83091, 0xba76e0cb, 0xae79c975, 0xc8858a6e, 0xa5f2a588, 0x874a3168, 0xe897a5f4},
         {0x7d48f096, 0xf6c1ef40, 0xc35b132c, 0x1f9c516b, 0x53c479fd, 0xe1040f91, 0x9df06743, 0x60e881f}},
        {{0x52a90e51, 0x9e0ad72, 0x38c50a96, 0xb7e66ea3, 0x7d997770, 0xab32ad05, 0x445671cb, 0xceaffe2},
         {0x5d37cc99, 0xdfbe753c, 0xe0fea2d5, 0x95d068cc, 0x4dd77cb6, 0x1e37cdda, 0x55530688, 0x88c5a4bb}},
        {{0xc7744f1, 0x3413f033, 0xbc816702, 0x23c05c89, 0x1192b5ac, 0x2322ee9a, 0x373180bb, 0xc1636a0},
         {0xbdde0207, 0xfe2f3d4, 0xc23578d8, 0xe1a093a, 0xc888ead, 0x6e5f0d1, 0x52a2b660, 0x9ca285a5}},
        {{0xce923964, 0xdae76995, 0xa34c7993, 0xcc96493a, 0xea73d9e7, 0xd19b5144, 0x311e6e34, 0x4a5c263},
         {0xd9a2a443, 0x7db5b32b, 0x2cfd960c, 0x3754bd33, 0xa430f15, 0xc5bcc98, 0xd9a94574, 0x5651201f}},
        {{0xfc0418fe, 0xebdd8921, 0x34e20036, 0x37015b39, 0xdf03a353, 0xcf4fcd8f, 0xf12cab16, 0xdc2de6e1},
         {0xd071df14, 0x9c17cc1a, 0x63415530, 0xd7c5e6a3, 0x68f3fb1e, 0xb5301660, 0x18269301, 0xb5f70bc9}},
        {{0x79ec1a0f, 0x2d8daefd, 0xceb39c97, 0x3bbcd6fd, 0x58f61a95, 0xf5575ffc, 0xadf7b420, 0xdbd986c4},
         {0x15f39eb7, 0x81aa8814, 0xb98d976c, 0x6ee2fcf5, 0xcf2f717d, 0x5465475d, 0x6860bbd0, 0x8e24d3c4}}
    }
};
#endif

#if include_p256_verify || include_p256_sign
// Takes the leftmost 256 bits in hash (treated as big endian),
// and converts to little endian integer z.
static void hash_to_z(uint32_t z[8], const uint8_t* hash, uint32_t hashlen) {
    if (hashlen > 32) {
        hashlen = 32;
    }
    for (uint32_t i = 0; i < hashlen; i++) {
        ((uint8_t*)z)[i] = hash[hashlen - 1 - i];
    }
    for (uint32_t i = hashlen; i < 32; i++) {
        ((uint8_t*)z)[i] = 0;
    }
}
#endif

#if include_p256_verify
// Creates a representation of a (little endian integer),
// so that r[0] + 2*r[1] + 2^2*r[2] + 2^3*r[3] + ... = a,
// where each r[i] is -15, -13, ..., 11, 13, 15 or 0.
// Only around 1/5.5 of the r[i] will be non-zero.
static void slide_257(signed char r[257], const uint8_t a[32]) {
    for (int i = 0; i < 256; ++i) {
        r[i] = 1 & (a[i >> 3] >> (i & 7));
    }
    r[256] = 0;
    for (int i = 0; i < 256; i++) {
        if (r[i] != 0) {
            for (int b = 1; b <= 4 && i + b < 256; b++) {
                if (r[i + b] != 0) {
                    if (r[i] + (r[i + b] << b) <= 15) {
                        r[i] += r[i + b] << b;
                        r[i + b] = 0;
                    } else if (r[i] - (r[i + b] << b) >= -15) {
                        r[i] -= r[i + b] << b;
                        for (;;) {
                            r[i + b] = 0;
                            b++;
                            if (!r[i + b]) {
                                r[i + b] = 1;
                                b--; // Will be added back after loop footer b++
                                break;
                            }
                        }
                    } else {
                        break;
                    }
                }
            }
        }
    }
}
#endif
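// Worked example of slide_257 (illustrative): for a = 31 = 0b11111, the initial
// digits are r = {1, 1, 1, 1, 1, 0, ...}. Processing i = 0:
//   b = 1: 1 + (1 << 1) = 3 <= 15, so r[0] = 3, r[1] = 0
//   b = 2: 3 + (1 << 2) = 7 <= 15, so r[0] = 7, r[2] = 0
//   b = 3: 7 + (1 << 3) = 15 <= 15, so r[0] = 15, r[3] = 0
//   b = 4: 15 + (1 << 4) = 31 > 15, but 15 - (1 << 4) = -1 >= -15, so r[0] = -1
//          and a carry propagates upwards: r[4] = 0, r[5] = 1
// yielding r = {-1, 0, 0, 0, 0, 1, 0, ...}, and indeed -1 + 2^5 = 31.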
#if include_p256_sign
void P256_mod_n_inv(uint32_t out[8], const uint32_t in[8]) {
    // This function follows the algorithm in section 12.1 of https://gcd.cr.yp.to/safegcd-20190413.pdf.
    // It has been altered in the following ways:
    // 1. Due to the 32-bit cpu, we use 24 * 31 iterations instead of 12 * 62.
    // 2. The P-256 group order is used as modulus instead of 2^255-19.
    //    744 iterations are still enough, slightly more than the required 741 (floor((49*256+57)/17)).
    // 3. Step 5 has been corrected to go back to step 2 instead of step 3.
    // 4. The order of the matrix multiplications in step 6 has been changed to (T24*(T23*(T22*(...*(T1*[0, 1]))))),
    //    where [0, 1] is a column vector, to make it possible to extract the "top-right corner", v, of T24*T23*...*T1.
    //    The result v will then be contained in the first element of the resulting column vector.

    struct {
        struct FGInteger fg[2]; // f and g
        struct XYInteger xy[2]; // x and y
    } state[2]; // "current" and "next"

    state[0].fg[0].flip_sign = 0; // non-negative f
    memcpy(&state[0].fg[0].signed_value, P256_order, 36); // f
    state[0].fg[1].flip_sign = 0; // non-negative g
    memcpy(&state[0].fg[1].signed_value, in, 32); // g
    state[0].fg[1].signed_value[8] = 0; // upper bits of g are 0
    memset(&state[0].xy, 0, sizeof(state[0].xy));
    // We later need a factor 2^-744. The montgomery multiplication gives 2^(24*-32)=2^-768, so multiply the init value (1) by 2^24 here.
    state[0].xy[1].value[0] = 1U << 24;

    int delta = 1;
    for (int i = 0; i < 24; i++) {
        // Scaled translation matrix Ti
        uint32_t matrix[4]; // element range: [-2^30, 2^31] (negative numbers are stored in two's complement form)

        // Decode f and g into two's complement representation and use the lowest 32 bits in the P256_divsteps2_31 calculation
        uint32_t negate_f = state[i % 2].fg[0].flip_sign;
        uint32_t negate_g = state[i % 2].fg[1].flip_sign;
        delta = P256_divsteps2_31(delta, (state[i % 2].fg[0].signed_value[0] ^ negate_f) - negate_f,
                                         (state[i % 2].fg[1].signed_value[0] ^ negate_g) - negate_g, matrix);

        // "Jump step", calculates the new f and g values that apply after 31 divstep2 iterations
        P256_matrix_mul_fg_9(matrix[0], matrix[1], state[i % 2].fg, &state[(i + 1) % 2].fg[0]);
        P256_matrix_mul_fg_9(matrix[2], matrix[3], state[i % 2].fg, &state[(i + 1) % 2].fg[1]);

        // Iterate the result vector
        // Due to montgomery multiplication inside this function, each step also adds a 2^-32 factor
        P256_matrix_mul_mod_n(matrix[0], matrix[1], state[i % 2].xy, &state[(i + 1) % 2].xy[0]);
        P256_matrix_mul_mod_n(matrix[2], matrix[3], state[i % 2].xy, &state[(i + 1) % 2].xy[1]);
    }
    // Calculates val^-1 = sgn(f) * v * 2^-744, where v is the "top-right corner" of the resulting T24*T23*...*T1 matrix.
    // In this implementation, at this point x contains v * 2^-744.
    P256_negate_mod_n_if(out, &state[0].xy[0].value[0],
                         (state[0].xy[0].flip_sign ^ state[0].fg[0].flip_sign ^ state[0].fg[0].signed_value[8]) & 1);
}
#endif
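/*
 * For reference, a sketch of a single unscaled divstep from the safegcd paper
 * (https://gcd.cr.yp.to/safegcd-20190413.pdf). P256_divsteps2_31 performs 31
 * such steps on the low limbs of f and g while accumulating the scaled 2x2
 * transition matrix. The variant below operates on plain signed integers, is
 * not constant time, and is compiled out; it only documents the recurrence.
 */
#if 0
static int divstep_ref(int delta, int64_t *f, int64_t *g) {
    // Precondition: *f is odd.
    if (delta > 0 && (*g & 1)) {
        // (delta, f, g) -> (1 - delta, g, (g - f) / 2)
        int64_t t = *f;
        *f = *g;
        *g = (*g - t) / 2; // f and g are both odd here, so g - f is even
        return 1 - delta;
    }
    // (delta, f, g) -> (1 + delta, f, (g + (g mod 2)*f) / 2)
    *g = (*g + (*g & 1) * *f) / 2; // g + f is even whenever g is odd
    return 1 + delta;
}
#endif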
#if include_p256_varmult || (include_p256_basemult && !use_fast_p256_basemult)
// Constant time abs
static inline uint32_t abs_int(int8_t a) {
    uint32_t a_u = (uint32_t)(int32_t)a;
    uint32_t mask = a_u >> 31;
    mask |= mask << 1;
    mask |= mask << 2;
    uint32_t result = (-a) & mask;
    result |= a & (mask ^ 0xf);
    return result;
}

// Calculates scalar*P in constant time (except for the scalars 2 and n-2, for which the results take a few extra cycles to compute)
static void scalarmult_variable_base(uint32_t output_mont_x[8], uint32_t output_mont_y[8],
                                     const uint32_t input_mont_x[8], const uint32_t input_mont_y[8],
                                     const uint32_t scalar[8]) {
    // Based on https://eprint.iacr.org/2014/130.pdf, Algorithm 1.

    uint32_t scalar2[8];
    int8_t e[64];

    // The algorithm used requires the scalar to be odd. If even, negate the scalar modulo n to make it odd, and later negate the end result.
    bool even = (scalar[0] & 1) ^ 1;
    P256_negate_mod_n_if(scalar2, scalar, even);

    // Rewrite the scalar as e[0] + 2^4*e[1] + 2^8*e[2] + ... + 2^252*e[63], where each e[i] is an odd number and -15 <= e[i] <= 15.
    e[0] = scalar2[0] & 0xf;
    for (int i = 1; i < 64; i++) {
        // Extract 4 bits
        e[i] = (scalar2[i / 8] >> ((i % 8) * 4)) & 0xf;
        // If even, subtract 2^4 from e[i - 1] and add 1 to e[i]
        e[i - 1] -= ((e[i] & 1) ^ 1) << 4;
        e[i] |= 1;
    }

    // Create a table of P, 3P, 5P, ... 15P.
    uint32_t table[8][3][8];
    memcpy(table[0][0], input_mont_x, 32);
    memcpy(table[0][1], input_mont_y, 32);
    memcpy(table[0][2], one_montgomery, 32);
    P256_double_j(table[7], (constarr)table[0]);
    for (int i = 1; i < 8; i++) {
        memcpy(table[i], table[7], 96);
        P256_add_sub_j(table[i], (constarr)table[i - 1], 0, 0);
    }

    // Calculate the result as (((((((((e[63]*P)*2^4)+e[62])*2^4)+e[61])*2^4)...)+e[1])*2^4)+e[0] = (2^252*e[63] + 2^248*e[62] + ... + e[0])*P.
    uint32_t current_point[3][8];
    // e[63] is never negative
#if has_d_cache
    P256_select_point(current_point, (uint32_t*)table, 3, e[63] >> 1);
#else
    memcpy(current_point, table[e[63] >> 1], 96);
#endif
    for (uint32_t i = 63; i --> 0;) {
        for (int j = 3; j >= 0; j--) {
            P256_double_j(current_point, (constarr)current_point);
        }
        uint32_t selected_point[3][8];
#if has_d_cache
        P256_select_point(selected_point, (uint32_t*)table, 3, abs_int(e[i]) >> 1);
#else
        memcpy(selected_point, table[abs_int(e[i]) >> 1], 96);
#endif
        P256_negate_mod_p_if(selected_point[1], selected_point[1], (uint8_t)e[i] >> 7);
        // There is (only) one odd input scalar that leads to an exception when i == 0: n-2,
        // in which case current_point will be equal to selected_point and hence a doubling
        // will occur instead. We don't bother making that case constant time, since the
        // probability that a random scalar hits it is around 1/2^255, and an attacker
        // could easily test this case anyway.
        P256_add_sub_j(current_point, (constarr)selected_point, false, false);
    }
    P256_jacobian_to_affine(output_mont_x, output_mont_y, (constarr)current_point);

    // If the scalar was initially even, we now negate the result to get the correct result, since -(scalar*P) = (-scalar)*P.
    // This is done by negating y, since -(x,y) = (x,-y).
    P256_negate_mod_p_if(output_mont_y, output_mont_y, even);
}
#endif
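// Worked example of the window recoding above (illustrative), looking only at
// the lowest byte 0x25 of an odd scalar: the raw 4-bit windows are e[0] = 5 and
// e[1] = 2. Since e[1] is even, 2^4 is subtracted from e[0] and e[1] is made
// odd, giving e[0] = 5 - 16 = -11 and e[1] = 2 | 1 = 3. The value is preserved,
// since -11 + 3*2^4 = 5 + 2*2^4 = 37, and the same adjustment repeats at every
// window, leaving all 64 digits odd.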
#define get_bit(arr, i) ((arr[(i) / 32] >> ((i) % 32)) & 1)

#if include_p256_basemult
#if include_fast_p256_basemult
// Calculates scalar*G in constant time
static void scalarmult_fixed_base(uint32_t output_mont_x[8], uint32_t output_mont_y[8], const uint32_t scalar[8]) {
    uint32_t scalar2[8];

    // Just as with the algorithm used in variable base scalar multiplication, this algorithm requires the scalar to be odd.
    bool even = (scalar[0] & 1) ^ 1;
    P256_negate_mod_n_if(scalar2, scalar, even);

    // This algorithm conceptually rewrites the odd scalar as s[0] + 2^1*s[1] + 2^2*s[2] + ... + 2^255*s[255], where each s[i] is -1 or 1.
    // By initially setting s[i] to the corresponding bit S[i] in the original odd scalar S, we go from lsb to msb, and whenever a value s[i] is 0,
    // increase s[i] by 1 and decrease s[i-1] by 2.
    // This results in s[i] = (S[i+1] == 1 ? 1 : -1) for i < 255, and s[255] = 1.
    //
    // We then form the scalars abs(s[j] + s[j+64]*2^64 + s[j+128]*2^128 + s[j+192]*2^192) * 2^(32 * floor(j / 32)) for each 0 <= j < 64.
    // Each such scalar times G has already been precomputed in p256_basepoint_precomp2.
    // That way we only need 31 point doublings and 63 point additions.

    uint32_t current_point[3][8];
    uint32_t selected_point[2][8];

#if !has_d_cache
    // Load the table into RAM, for example if the table lies in external memory mapped flash, which can easily be intercepted.
    uint32_t precomp[2][8][2][8];
    memcpy(precomp, p256_basepoint_precomp2, sizeof(p256_basepoint_precomp2));
#endif

    for (uint32_t i = 32; i --> 0;) {
        {
            uint32_t mask = get_bit(scalar2, i + 32 + 1) | (get_bit(scalar2, i + 64 + 32 + 1) << 1) | (get_bit(scalar2, i + 2 * 64 + 32 + 1) << 2);
            if (i == 31) {
#if has_d_cache
                P256_select_point(current_point, (uint32_t*)p256_basepoint_precomp2[1], 2, mask);
#else
                memcpy(current_point, precomp[1][mask], 64);
#endif
                memcpy(current_point[2], one_montgomery, 32);
            } else {
                P256_double_j(current_point, (constarr)current_point);

                uint32_t sign = get_bit(scalar2, i + 3 * 64 + 32 + 1) - 1; // positive: 0, negative: -1
                mask = (mask ^ sign) & 7;
#if has_d_cache
                P256_select_point(selected_point, (uint32_t*)p256_basepoint_precomp2[1], 2, mask);
#else
                memcpy(selected_point, precomp[1][mask], 64);
#endif
                P256_negate_mod_p_if(selected_point[1], selected_point[1], sign & 1);
                P256_add_sub_j(current_point, (constarr)selected_point, false, true);
            }
        }
        {
            uint32_t mask = get_bit(scalar2, i + 1) | (get_bit(scalar2, i + 64 + 1) << 1) | (get_bit(scalar2, i + 2 * 64 + 1) << 2);
            uint32_t sign = get_bit(scalar2, i + 3 * 64 + 1) - 1; // positive: 0, negative: -1
            mask = (mask ^ sign) & 7;
#if has_d_cache
            P256_select_point(selected_point, (uint32_t*)p256_basepoint_precomp2[0], 2, mask);
#else
            memcpy(selected_point, precomp[0][mask], 64);
#endif
            P256_negate_mod_p_if(selected_point[1], selected_point[1], sign & 1);
            P256_add_sub_j(current_point, (constarr)selected_point, false, true);
        }
    }
    P256_jacobian_to_affine(output_mont_x, output_mont_y, (constarr)current_point);

    // Negate the final result if the scalar was initially even.
    P256_negate_mod_p_if(output_mont_y, output_mont_y, even);
}
#else
static void scalarmult_fixed_base(uint32_t output_mont_x[8], uint32_t output_mont_y[8], const uint32_t scalar[8]) {
#if !include_p256_verify
    static const uint32_t p[2][8] = {{0x18a9143c, 0x79e730d4, 0x5fedb601, 0x75ba95fc, 0x77622510, 0x79fb732b, 0xa53755c6, 0x18905f76},
                                     {0xce95560a, 0xddf25357, 0xba19e45c, 0x8b4ab8e4, 0xdd21f325, 0xd2e88688, 0x25885d85, 0x8571ff18}};
    scalarmult_variable_base(output_mont_x, output_mont_y, p[0], p[1], scalar);
#else
    scalarmult_variable_base(output_mont_x, output_mont_y, p256_basepoint_precomp[0][0], p256_basepoint_precomp[0][1], scalar);
#endif
}
#endif
#endif

void p256_convert_endianness(void* output, const void* input, size_t byte_len) {
    for (size_t i = 0; i < byte_len / 2; i++) {
        uint8_t t = ((uint8_t*)input)[byte_len - 1 - i];
        ((uint8_t*)output)[byte_len - 1 - i] = ((uint8_t*)input)[i];
        ((uint8_t*)output)[i] = t;
    }
}
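/*
 * Usage sketch for p256_convert_endianness (hypothetical caller code, compiled
 * out): the library's uint32_t[8] integers are little endian, with index 0
 * holding the least significant limb, while wire formats such as SEC1 carry
 * coordinates as 32-byte big endian strings. The same helper converts in both
 * directions, and output may alias input since it swaps bytes pairwise.
 */
#if 0
static void example_endianness(void) {
    uint8_t wire_x[32] = {0}; // a big endian coordinate, e.g. from a SEC1 point
    uint32_t x[8];
    p256_convert_endianness(x, wire_x, 32);  // wire format -> internal limbs
    p256_convert_endianness(wire_x, x, 32);  // internal limbs -> wire format
}
#endif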
#if include_p256_verify
bool p256_verify(const uint32_t public_key_x[8], const uint32_t public_key_y[8], const uint8_t* hash, uint32_t hashlen_in_bytes,
                 const uint32_t r[8], const uint32_t s[8]) {
    if (!P256_check_range_n(r) || !P256_check_range_n(s)) {
        return false;
    }
    if (!P256_check_range_p(public_key_x) || !P256_check_range_p(public_key_y)) {
        return false;
    }

    uint32_t pk_table[8][3][8];
    P256_to_montgomery(pk_table[0][0], public_key_x);
    P256_to_montgomery(pk_table[0][1], public_key_y);
    memcpy(pk_table[0][2], one_montgomery, 32);

    if (!P256_point_is_on_curve(pk_table[0][0], pk_table[0][1])) {
        return false;
    }

    // Create a table of P, 3P, 5P, ..., 15P, where P is the public key.
    P256_double_j(pk_table[7], (constarr)pk_table[0]);
    for (int i = 1; i < 8; i++) {
        memcpy(pk_table[i], pk_table[7], 96);
        P256_add_sub_j(pk_table[i], (constarr)pk_table[i - 1], 0, 0);
    }

    uint32_t z[8], w[8], u1[8], u2[8];
    hash_to_z(z, hash, hashlen_in_bytes);
#if include_p256_sign
    P256_mod_n_inv(w, s);
#else
    // Use the smaller implementation since we don't need the constant time version here
    P256_mod_n_inv_vartime(w, s);
#endif
    P256_mul_mod_n(u1, z, w);
    P256_mul_mod_n(u2, r, w);

    // Each value in these arrays will be an odd integer v with -15 <= v <= 15, or 0.
    // Around 1/5.5 of them will be non-zero.
    signed char slide_bp[257], slide_pk[257];
    slide_257(slide_bp, (uint8_t*)u1);
    slide_257(slide_pk, (uint8_t*)u2);

    uint32_t cp[3][8] = {0};

    for (int i = 256; i >= 0; i--) {
        P256_double_j(cp, (constarr)cp);
        if (slide_bp[i] > 0) {
            P256_add_sub_j(cp, p256_basepoint_precomp[slide_bp[i] / 2], 0, 1);
        } else if (slide_bp[i] < 0) {
            P256_add_sub_j(cp, p256_basepoint_precomp[(-slide_bp[i]) / 2], 1, 1);
        }
        if (slide_pk[i] > 0) {
            P256_add_sub_j(cp, (constarr)pk_table[slide_pk[i] / 2], 0, 0);
        } else if (slide_pk[i] < 0) {
            P256_add_sub_j(cp, (constarr)pk_table[(-slide_pk[i]) / 2], 1, 0);
        }
    }
    return P256_verify_last_step(r, (constarr)cp);
}
#endif

#if include_p256_sign
bool p256_sign_step1(struct SignPrecomp *result, const uint32_t k[8]) {
    do {
        uint32_t point_res[2][8];
        if (!P256_check_range_n(k)) {
            break;
        }
        scalarmult_fixed_base(point_res[0], point_res[1], k);
        P256_mod_n_inv(result->k_inv, k);
        P256_from_montgomery(result->r, point_res[0]);
        P256_reduce_mod_n_32bytes(result->r, result->r);

        uint32_t r_sum = 0;
        for (int i = 0; i < 8; i++) {
            r_sum |= result->r[i];
        }
        if (r_sum == 0) {
            break;
        }
        return true;
    } while (false);

    memset(result, 0, sizeof(struct SignPrecomp));
    return false;
}

bool p256_sign_step2(uint32_t r[8], uint32_t s[8], const uint8_t* hash, uint32_t hashlen_in_bytes,
                     const uint32_t private_key[8], struct SignPrecomp *sign_precomp) {
    do {
        // Just make sure the user did not input an obviously invalid precomp
        if (!P256_check_range_n(sign_precomp->k_inv) || !P256_check_range_n(sign_precomp->r)) {
            break;
        }
        uint32_t *const z = r;
        hash_to_z(z, hash, hashlen_in_bytes);
        P256_mul_mod_n(s, sign_precomp->r, private_key);
        P256_add_mod_n(s, z, s);
        P256_mul_mod_n(s, sign_precomp->k_inv, s);
        memcpy(r, sign_precomp->r, 32);

        uint32_t s_sum = 0;
        for (int i = 0; i < 8; i++) {
            s_sum |= s[i];
        }
        if (s_sum == 0) {
            break;
        }
        memset(sign_precomp, 0, sizeof(*sign_precomp));
        return true;
    } while (false);

    memset(r, 0, 32);
    memset(s, 0, 32);
    return false;
}

bool p256_sign(uint32_t r[8], uint32_t s[8], const uint8_t* hash, uint32_t hashlen_in_bytes, const uint32_t private_key[8], const uint32_t k[8]) {
    struct SignPrecomp t;
    if (!p256_sign_step1(&t, k)) {
        memset(r, 0, 32);
        memset(s, 0, 32);
        return false;
    }
    return p256_sign_step2(r, s, hash, hashlen_in_bytes, private_key, &t);
}
#endif
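/*
 * Usage sketch for the signing API (hypothetical caller code, compiled out).
 * Step 1 computes the standard ECDSA values r = (k*G).x mod n and k^-1, which
 * depend only on the ephemeral key k, and step 2 finishes with
 * s = k^-1 * (z + r*d) mod n, where d is the private key and z is derived from
 * the message hash. k must be a fresh, uniformly random scalar for every
 * signature; on a false return the caller should retry with a new k.
 */
#if 0
static bool example_sign(uint32_t r[8], uint32_t s[8], const uint8_t hash[32],
                         const uint32_t private_key[8], const uint32_t k[8]) {
    return p256_sign(r, s, hash, 32, private_key, k);
}
#endif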
#if include_p256_keygen || include_p256_raw_scalarmult_base
bool p256_scalarmult_base(uint32_t result_x[8], uint32_t result_y[8], const uint32_t scalar[8]) {
    if (!P256_check_range_n(scalar)) {
        return false;
    }
    scalarmult_fixed_base(result_x, result_y, scalar);
    P256_from_montgomery(result_x, result_x);
    P256_from_montgomery(result_y, result_y);
    return true;
}

#if include_p256_keygen
bool p256_keygen(uint32_t public_key_x[8], uint32_t public_key_y[8], const uint32_t private_key[8]) {
    return p256_scalarmult_base(public_key_x, public_key_y, private_key);
}
#endif
#endif

#if include_p256_varmult
static bool p256_scalarmult_generic_no_scalar_check(uint32_t output_mont_x[8], uint32_t output_mont_y[8],
                                                    const uint32_t scalar[8], const uint32_t in_x[8], const uint32_t in_y[8]) {
    if (!P256_check_range_p(in_x) || !P256_check_range_p(in_y)) {
        return false;
    }
    P256_to_montgomery(output_mont_x, in_x);
    P256_to_montgomery(output_mont_y, in_y);
    if (!P256_point_is_on_curve(output_mont_x, output_mont_y)) {
        return false;
    }
    scalarmult_variable_base(output_mont_x, output_mont_y, output_mont_x, output_mont_y, scalar);
    return true;
}

#if include_p256_raw_scalarmult_generic
bool p256_scalarmult_generic(uint32_t result_x[8], uint32_t result_y[8], const uint32_t scalar[8], const uint32_t in_x[8], const uint32_t in_y[8]) {
    if (!P256_check_range_n(scalar) || !p256_scalarmult_generic_no_scalar_check(result_x, result_y, scalar, in_x, in_y)) {
        return false;
    }
    P256_from_montgomery(result_x, result_x);
    P256_from_montgomery(result_y, result_y);
    return true;
}
#endif

#if include_p256_ecdh
bool p256_ecdh_calc_shared_secret(uint8_t shared_secret[32], const uint32_t private_key[8],
                                  const uint32_t others_public_key_x[8], const uint32_t others_public_key_y[8]) {
    uint32_t result_x[8], result_y[8];
    if (!p256_scalarmult_generic_no_scalar_check(result_x, result_y, private_key, others_public_key_x, others_public_key_y)) {
        return false;
    }
    P256_from_montgomery(result_x, result_x);
    p256_convert_endianness(shared_secret, result_x, 32);
    return true;
}
#endif
#endif

#if include_p256_to_octet_string_uncompressed
void p256_point_to_octet_string_uncompressed(uint8_t out[65], const uint32_t x[8], const uint32_t y[8]) {
    out[0] = 4;
    p256_convert_endianness(out + 1, x, 32);
    p256_convert_endianness(out + 33, y, 32);
}
#endif

#if include_p256_to_octet_string_compressed
void p256_point_to_octet_string_compressed(uint8_t out[33], const uint32_t x[8], const uint32_t y[8]) {
    out[0] = 2 + (y[0] & 1);
    p256_convert_endianness(out + 1, x, 32);
}
#endif

#if include_p256_to_octet_string_hybrid
void p256_point_to_octet_string_hybrid(uint8_t out[65], const uint32_t x[8], const uint32_t y[8]) {
    out[0] = 6 + (y[0] & 1);
    p256_convert_endianness(out + 1, x, 32);
    p256_convert_endianness(out + 33, y, 32);
}
#endif

#if include_p256_decode_point || include_p256_decompress_point
bool p256_octet_string_to_point(uint32_t x[8], uint32_t y[8], const uint8_t* input, uint32_t input_len_in_bytes) {
    if (input_len_in_bytes < 33) return false;
    p256_convert_endianness(x, input + 1, 32);
    if (!P256_check_range_p(x)) {
        return false;
    }
#if include_p256_decode_point
    if ((input[0] == 4 || (input[0] >> 1) == 3) && input_len_in_bytes == 65) {
        p256_convert_endianness(y, input + 33, 32);
        if (!P256_check_range_p(y)) {
            return false;
        }
        if ((input[0] >> 1) == 3 && (input[0] & 1) != (y[0] & 1)) {
            return false;
        }
        uint32_t x_mont[8], y_mont[8];
        P256_to_montgomery(x_mont, x);
        P256_to_montgomery(y_mont, y);
        return P256_point_is_on_curve(x_mont, y_mont);
    }
#endif
#if include_p256_decompress_point
    if ((input[0] >> 1) == 1 && input_len_in_bytes == 33) {
        return P256_decompress_point(y, x, input[0] & 1);
    }
#endif
    return false;
}
#endif
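/*
 * End-to-end ECDH usage sketch (hypothetical caller code, compiled out):
 * decode the peer's public key from a SEC1 octet string, which also validates
 * that the point lies on the curve, then derive the x coordinate of
 * private_key * peer_public_key as the 32-byte big endian shared secret.
 */
#if 0
static bool example_ecdh(uint8_t shared_secret[32], const uint32_t private_key[8],
                         const uint8_t *peer_point, uint32_t peer_point_len) {
    uint32_t px[8], py[8];
    if (!p256_octet_string_to_point(px, py, peer_point, peer_point_len)) {
        return false; // invalid encoding, out of range coordinate, or point not on the curve
    }
    return p256_ecdh_calc_shared_secret(shared_secret, private_key, px, py);
}
#endif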