// Authors: Bjoern Haase, Niels Samwel // // License: CC0 1.0 (http://creativecommons.org/publicdomain/zero/1.0/legalcode) #include "../include/sc25519.h" #include "../include/montgomery_reduction.h" #define BARRET_REDUCTION 1 #define MONTGOMERY_REDUCTION 2 #define REDUCTION_TYPE BARRET_REDUCTION #if (REDUCTION_TYPE == BARRET_REDUCTION) // Fixme: This code won't work on big endian targets. static const UN_288bitValue sc25519_scalar = {{ 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x10 }}; #define uint32_scalar sc25519_scalar.as_uint32_t #else static const UNMontgomeryConstants256 sc25519_scalarMontgomeryPrecalc = {{ 0x01, 0x0f, 0x9c, 0x44, 0xe3, 0x11, 0x06, 0xa4, 0x47, 0x93, 0x85, 0x68, 0xa7, 0x1b, 0x0e, 0xd0, 0x65, 0xbe, 0xf5, 0x17, 0xd2, 0x73, 0xec, 0xce, 0x3d, 0x9a, 0x30, 0x7c, 0x1b, 0x41, 0x99, 0x03, 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x1b, 0x7e, 0x54, 0x12 }}; #define uint32_scalar sc25519_scalarMontgomeryPrecalc.constants.prime.as_uint32_t #endif static int32_t bigint_sub( uint32_t* dest, const uint32_t* sub, uint32_t numberOfWords ) { unsigned ctr; int64_t accu = 0; for (ctr = 0; ctr < numberOfWords; ctr ++) { accu += dest[ctr]; accu -= sub[ctr]; dest[ctr] = (uint32_t)accu; accu >>= 32; } return (int32_t)accu; } static int32_t bigint_add( uint32_t* dest, const uint32_t* addend, uint32_t numberOfWords ) { unsigned ctr; int64_t accu = 0; for (ctr = 0; ctr < numberOfWords; ctr ++) { accu += dest[ctr]; accu += addend[ctr]; dest[ctr] = (uint32_t)accu; accu >>= 32; } return (int32_t)accu; } #if 1 /* substracts the prime in case that the result is non-negative. gets a pointer to a temp buffer in order to reduce stack consumption */ static void sc25519_reduceAddSub( sc25519* valueToReduce, sc25519* tempBufferSuppliedByCaller ) { int32_t didUnderflowHappen; cpy_256bitvalue(tempBufferSuppliedByCaller, valueToReduce); didUnderflowHappen = bigint_sub(tempBufferSuppliedByCaller->as_uint32_t, uint32_scalar, 8); conditionalMove_256bitValue(valueToReduce, tempBufferSuppliedByCaller, (uint8_t) (~didUnderflowHappen) & 1); } #else /* substracts the prime in case that the result is non-negative. gets a pointer to a temp buffer in order to reduce stack consumption */ static void sc25519_reduceAddSub( sc25519* valueToReduce, sc25519* tempBufferSuppliedByCaller ) { int32_t didUnderflowHappen; didUnderflowHappen = montgomery_mnac_U16 (valueToReduce->as_uint32_t, 1, uint32_scalar); montgomery_mac_lowU16 (valueToReduce->as_uint32_t, didUnderflowHappen * didUnderflowHappen, uint32_scalar); } #endif void sc25519_from32bytes( sc25519* r, const uint8_t x[32] ) { UN_512bitValue temp; uint8_t ctr; for (ctr = 0; ctr < 32; ctr++) { // Fixme: This code won't work on big endian targets. temp.as_uint8_t[ctr] = x[ctr]; } for (ctr = 8; ctr < 16; ctr++) { temp.as_uint32_t[ctr] = 0; } sc25519_reduce(&temp); for (ctr = 0; ctr < 8; ctr++) { r->as_uint32_t[ctr] = temp.as_uint32_t[ctr]; } } void sc25519_from64bytes( sc25519* r, const uint8_t x[64] ) { UN_512bitValue temp; uint8_t ctr; for (ctr = 0; ctr < 64; ctr++) { // Fixme: This code won't work on big endian targets. temp.as_uint8_t[ctr] = x[ctr]; } sc25519_reduce(&temp); for (ctr = 0; ctr < 8; ctr++) { r->as_uint32_t[ctr] = temp.as_uint32_t[ctr]; } } void sc25519_to32bytes( uint8_t r[32], const sc25519* x ) { uint8_t ctr; for (ctr = 0; ctr < 32; ctr++) { // Fixme: This code won't work on big endian targets. r[ctr] = x->as_uint8_t[ctr]; } } void sc25519_add( sc25519* result, const sc25519* addend1, const sc25519* addend2 ) { UN_256bitValue tmp; cpy_256bitvalue(result, addend1); bigint_add(result->as_uint32_t, addend2->as_uint32_t, 8); sc25519_reduceAddSub(result, &tmp); } void sc25519_sub( sc25519* result, const sc25519* addend, const sc25519* valueToSubstract ) { UN_256bitValue tmp; int didUnderflowHappen; cpy_256bitvalue(result, addend); didUnderflowHappen = bigint_sub( result->as_uint32_t, valueToSubstract->as_uint32_t, 8); { int i; for (i = 0; i < 8; i ++) { tmp.as_uint32_t[i] = uint32_scalar[i]; } } bigint_add (tmp.as_uint32_t, result->as_uint32_t,8); conditionalMove_256bitValue((UN_256bitValue *) result, &tmp, (uint8_t) (didUnderflowHappen & 1)); } #if (REDUCTION_TYPE == BARRET_REDUCTION) /* Calculates the barret reduction modulo the scalar prime. returns the reduced result in the lower 256 bits. */ void sc25519_reduce( UN_512bitValue* valueToReduce ) { /* use Barret reduction. Multiply with 288 bits so that all intermediate results are also aligned to four-byte boundaries. */ static const uint32_t u32_mu[9] = { 0x0a2c131bUL, 0xed9ce5a3UL, 0x086329a7UL, 0x2106215dUL, 0xffffffebUL, 0xffffffffUL, 0xffffffffUL, 0xffffffffUL, 0xfUL }; UN_288bitValue* valueForBarretMpy = (UN_288bitValue*)&valueToReduce->as_uint32_t[7]; UN_288bitValue* u288_mu = (UN_288bitValue*)u32_mu; const UN_288bitValue* u288_pr252 = &sc25519_scalar; UN_576bitValue intermResultBarret; multiply288x288(&intermResultBarret, u288_mu, valueForBarretMpy); { UN_288bitValue* approximateResultOfDivision = (UN_288bitValue*)&intermResultBarret.as_uint32_t[9]; UN_576bitValue valueToSubstract; multiply288x288(&valueToSubstract, approximateResultOfDivision, u288_pr252); bigint_sub(valueToReduce->as_uint32_t, valueToSubstract.as_uint32_t, 16); /* due to rounding the true result of the division may be wrong by one (or two?). we need to conditionally substract the prime if the result of the substraction is nonnegative. */ } sc25519_reduceAddSub(&valueToReduce->as_256_bitValue[0], &valueToReduce->as_256_bitValue[1]); sc25519_reduceAddSub(&valueToReduce->as_256_bitValue[0], &valueToReduce->as_256_bitValue[1]); } #else void sc25519_reduce( UN_512bitValue* valueToReduce ) { UN_256bitValue tmp; montgomery_reduce (&tmp, valueToReduce, &sc25519_scalarMontgomeryPrecalc.constants); multiply256x256 (valueToReduce, &tmp, &sc25519_scalarMontgomeryPrecalc.constants.kSquare); montgomery_reduce (&valueToReduce->as_256_bitValue [0], valueToReduce, &sc25519_scalarMontgomeryPrecalc.constants); } #endif #if (REDUCTION_TYPE == BARRET_REDUCTION) /* Multiplies and calculates the barret reduction modulo the scalar prime. */ void sc25519_mul( sc25519* r, const sc25519* x, const sc25519* y ) { UN_512bitValue temp; multiply256x256(&temp, x, y); sc25519_reduce(&temp); cpy_256bitvalue(r, &temp.as_256_bitValue[0]); } void sc25519_sqr( sc25519* r, const sc25519* x ) { UN_512bitValue temp; square256(&temp, x); sc25519_reduce(&temp); cpy_256bitvalue(r, &temp.as_256_bitValue[0]); } #else void sc25519_mul( sc25519* r, const sc25519* x, const sc25519* y ) { UN_512bitValue tmp; multiply256x256 (&tmp, x,y); montgomery_reduce (r,&tmp, &sc25519_scalarMontgomeryPrecalc.constants); multiply256x256 (&tmp, r, &sc25519_scalarMontgomeryPrecalc.constants.kSquare); montgomery_reduce (r,&tmp, &sc25519_scalarMontgomeryPrecalc.constants); } void sc25519_sqr( sc25519* r, const sc25519* x ) { UN_512bitValue tmp; square256(&tmp, x); montgomery_reduce(r, &tmp, &sc25519_scalarMontgomeryPrecalc.constants); multiply256x256(&tmp, r, &sc25519_scalarMontgomeryPrecalc.constants.kSquare); montgomery_reduce(r, &tmp, &sc25519_scalarMontgomeryPrecalc.constants); } #endif /// convert the scalar s to a representation of 64 or 72 signed chars containing /// 4 bits each. (NAF-Form with values -8 .. + 8). /// This is done for speedup of the fixed-window scalar multiplication /// later on, that processes 4 bits in each step. void sc25519_window4( signed char r[SC25519_WINDOW4_SIZE], const sc25519* s ) { char carry; uint8_t i; for (i = 0; i < 32; i++) { r[2 * i] = s->as_uint8_t[i] & 15; r[2 * i + 1] = s->as_uint8_t[i] >> 4; } /* Making the result signed and limited to the range -8 ... +8 for the signed NAF form. */ carry = 0; for (i = 0; i < (SC25519_WINDOW4_SIZE - 1); i++) { r[i] += carry; r[i + 1] += r[i] >> 4; r[i] &= 15; carry = r[i] >> 3; r[i] -= carry << 4; } r[SC25519_WINDOW4_SIZE - 1] += carry; } // Returns 0 if x>y, returns 1 otherwise int greaterThan(const UN_256bitValue* x, const UN_256bitValue* y) { // int i; // for(i=7;i>=0;i--) { // if(x->as_uint32_t[i] > y->as_uint32_t[i]) // return 1; // else if(x->as_uint32_t[i] < y->as_uint32_t[i]) // return 0; // } // return 0; UN_256bitValue tmp; cpy_256bitvalue(&tmp, x); bigint_sub(tmp.as_uint32_t, y->as_uint32_t, 8); return tmp.as_uint32_t[7] >> 31; } // binary extended gcd algorithm based on Alg. 14.61 in the Handbook of Applied Cryptography // void sc25519_binary_extended_gcd(UN_256bitValue *R, const UN_256bitValue *X, const UN_256bitValue *Y) { UN_256bitValue B, D, v, u, g, x, y, zero; cpy_256bitvalue(&x, X); cpy_256bitvalue(&y, Y); setone_256bitvalue(&g); while(((x.as_uint8_t[0] & 1) == 0) && ((y.as_uint8_t[0] & 1) == 0)) { shiftRightOne(&x); shiftRightOne(&y); shiftLeftOne(&g); } cpy_256bitvalue(&u, &x); cpy_256bitvalue(&v, &y); setzero_256bitvalue(&B); setone_256bitvalue(&D); setzero_256bitvalue(&zero); while(isEqual_256bitvalue(&u, &zero) > 0) { while((u.as_uint8_t[0] & 1) == 0) { shiftRightOne(&u); if((B.as_uint8_t[0] & 1) == 0) { shiftRightOne(&B); } else { bigint_sub(B.as_uint32_t, x.as_uint32_t, 8); shiftRightOne(&B); } } while((v.as_uint8_t[0] & 1) == 0) { shiftRightOne(&v); if((D.as_uint8_t[0] & 1) == 0) { shiftRightOne(&D); } else { bigint_sub(D.as_uint32_t, x.as_uint32_t, 8); shiftRightOne(&D); } } if(!greaterThan(&u, &v) || (isEqual_256bitvalue(&u, &v) == 0)) { bigint_sub(u.as_uint32_t, v.as_uint32_t, 8); bigint_sub(B.as_uint32_t, D.as_uint32_t, 8); } else { bigint_sub(v.as_uint32_t, u.as_uint32_t, 8); bigint_sub(D.as_uint32_t, B.as_uint32_t, 8); } } cpy_256bitvalue(R, &D); } // Attention: Variable time execution due to the extended GCD algorithm! void sc25519_inverse(UN_256bitValue *R, const UN_256bitValue *X) { UN_256bitValue order = {{ 0xed, 0xd3, 0xf5, 0x5c, 0x1a, 0x63, 0x12, 0x58, 0xd6, 0x9c, 0xf7, 0xa2, 0xde, 0xf9, 0xde, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10 }}; sc25519_binary_extended_gcd(R, &order, X); // In case R < 0, add order until positive while((R->as_uint32_t[7] & 0x80000000) != 0) bigint_add(R->as_uint32_t, order.as_uint32_t, 8); }