//***********************************************************************************
// FourQlib: a high-performance crypto library based on the elliptic curve FourQ
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
// Abstract: arithmetic over GF(p^2) using x64 assembly for Linux
//
// Syntax:   GNU as with Intel operand order (.intel_syntax noprefix). The file is
//           run through the C preprocessor (see the #define / #if lines below).
// ABI:      the reg_p* aliases are the first four integer argument registers of
//           the System V AMD64 calling convention (rdi, rsi, rdx, rcx).
// Layout:   every GF(p^2) operand x is read/written as four 64-bit words:
//           x0 at offsets +0/+8 and x1 at offsets +16/+24, with x = x0 + x1*i.
//           The carry handling assumes each 128-bit component is below 2^127
//           (bit 127 clear) on input - TODO confirm against the C callers.
// Reduction: p = 2^127-1, so 2^127 = 1 (mod p). Every reduction step below clears
//           bit 127 with BTR and adds (or, after a subtraction, subtracts) the
//           bit that was cleared through CF.
//***********************************************************************************

.intel_syntax noprefix

// Registers that are used for parameter passing:
#define reg_p1  rdi
#define reg_p2  rsi
#define reg_p3  rdx
#define reg_p4  rcx

// Mark the object as not needing an executable stack. Without this note GNU ld
// assumes that a hand-written assembly object requires one.
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",@progbits
#endif

.text

//**************************************************************************
//  Quadratic extension field multiplication using lazy reduction
//  Based on schoolbook method
//  Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] in GF(p^2), p = 2^127-1
//  NOTE: only a=c is allowed for fp2mul1271_a(a, b, c)
//        (c0 is stored before the last loads from b, so b=c would read
//        already overwritten data)
//
//  In:    reg_p1 = a, reg_p2 = b, reg_p3 = c (pointers, 4 x 64-bit words each)
//  Out:   c0 = a0*b0 - a1*b1 at [c], [c+8];  c1 = a0*b1 + a1*b0 at [c+16], [c+24]
//  Clobb: rax, rcx, rdx, rsi, r8-r11, flags
//  Saved: r12-r15 (pushed and popped here, 32 bytes of stack)
//
//  PUSH_SET: when defined, all register saves are done up front; otherwise
//  the saves of r12, r14, r13 are interleaved with the first multiplication.
//  Both variants push in the order r15, r12, r14, r13, which is what the
//  pops at the end undo. PUSH does not touch the flags, so it is safe in
//  the middle of an add/adc chain.
//**************************************************************************
.global fp2mul1271_a
fp2mul1271_a:
  push   r15
#if defined(PUSH_SET)
  push   r12
  push   r14
  push   r13
#endif
  mov    rcx, reg_p3                    // keep c: rdx is overwritten by every mul

  // T0 = a0 * b0, (r11, r10, r9, r8) <- [reg_p1_0-8] * [reg_p2_0-8]
  mov    rax, [reg_p1]
  mov    r11, [reg_p2]
  mul    r11
#if !defined(PUSH_SET)
  push   r12
#endif
  xor    r10, r10
  mov    r8, rax
  mov    r9, rdx

  mov    r12, [reg_p2+8]
  mov    rax, [reg_p1]
  mul    r12
  add    r9, rax
#if !defined(PUSH_SET)
  push   r14
#endif
  adc    r10, rdx

  mov    rax, [reg_p1+8]
  mul    r11
  add    r9, rax
#if !defined(PUSH_SET)
  push   r13
#endif
  adc    r10, rdx

  mov    rax, [reg_p1+8]
  mul    r12
  add    r10, rax
  mov    r11, 0                         // mov, not xor: CF of the add above is still needed
  adc    r11, rdx

  // T1 = a1 * b1, (r15, r14, r13, r12) <- [reg_p1_16-24] * [reg_p2_16-24]
  xor    r14, r14
  mov    rax, [reg_p1+16]
  mov    r15, [reg_p2+16]
  mul    r15
  mov    r12, rax
  mov    rax, [reg_p2+24]
  mov    r13, rdx

  mov    rdx, [reg_p1+16]
  mul    rdx
  add    r13, rax
  mov    rax, [reg_p1+24]
  adc    r14, rdx

  mul    r15
  add    r13, rax
  adc    r14, rdx

  mov    r15, [reg_p2+24]
  mov    rax, [reg_p1+24]
  mul    r15
  mov    r15, 0
  add    r14, rax
  adc    r15, rdx

  // c0 = T0 - T1 = a0*b0 - a1*b1
  // 256-bit subtraction modulo 2^256. The borrow is not kept in a register:
  // for inputs below 2^127 both products are below 2^254, so a negative
  // difference has bit 254 set, and that bit is tested further down.
  sub    r8, r12
  sbb    r9, r13
  sbb    r10, r14
  sbb    r11, r15

  // Split at bit 127: (r9, r8) = low 127 bits, (r11, r10) = bits 127 and up
  shld   r11, r10, 1
  shld   r10, r9, 1
  mov    r15, [reg_p2+16]
  mov    rax, [reg_p1]
  btr    r9, 63

  // T0 = a0 * b1, (r15, r14, r13, r12) <- [reg_p1_0-8] * [reg_p2_16-24]
  // (the remaining c0 reduction steps are interleaved with this product)
  mul    r15
  btr    r11, 63                        // Add prime if borrow=1 (CF = bit 254 of T0-T1)
  sbb    r10, 0
  sbb    r11, 0
  xor    r14, r14
  mov    r12, rax
  mov    rax, [reg_p2+24]
  mov    r13, rdx

  mov    rdx, [reg_p1]
  mul    rdx
  add    r13, rax
  mov    rax, [reg_p1+8]
  adc    r14, rdx

  mul    r15
  xor    r15, r15
  add    r13, rax
  mov    rax, [reg_p1+8]
  adc    r14, rdx

  mul    qword ptr [reg_p2+24]
  add    r8, r10                        // c0: low part + high part (2^127 = 1 mod p)
  adc    r9, r11
  add    r14, rax
  adc    r15, rdx

  // Reducing and storing c0
  btr    r9, 63
  adc    r8, 0
  mov    r11, [reg_p2]
  adc    r9, 0

  // T1 = a1 * b0, (r12, r11, r10, r9) <- [reg_p1_16-24] * [reg_p2_0-8]
  mov    rax, [reg_p1+16]
  mul    r11
  mov    [rcx], r8                      // a0 is no longer read after this point
  mov    [rcx+8], r9
  mov    r8, rax
  mov    r9, rdx

  mov    rax, [reg_p1+16]
  mov    rsi, [reg_p2+8]                // last use of the b pointer: rsi now holds the high word of b0
  mul    rsi
  xor    r10, r10
  add    r9, rax
  adc    r10, rdx

  mov    rax, [reg_p1+24]
  mul    r11
  add    r9, rax
  adc    r10, rdx

  xor    r11, r11
  mov    rax, [reg_p1+24]
  mul    rsi
  add    r10, rax
  adc    r11, rdx

  // c1 = T0 + T1 = a0*b1 + a1*b0
  // (register restores are interleaved; POP leaves the carry chain intact)
  add    r8, r12
  adc    r9, r13
  pop    r13
  adc    r10, r14
  pop    r14
  pop    r12
  adc    r11, r15
  pop    r15

  // Reducing and storing c1
  shld   r11, r10, 1
  shld   r10, r9, 1
  btr    r9, 63
  btr    r11, 63                        // CF = bit 254 of the sum; 2^254 = 1 mod p
  adc    r8, r10                        // so it enters the addition as carry-in
  adc    r9, r11
  btr    r9, 63
  adc    r8, 0
  adc    r9, 0
  mov    [rcx+16], r8
  mov    [rcx+24], r9
  ret


//***********************************************************************
//  Quadratic extension field squaring
//  Operation: c [reg_p2] = a^2 [reg_p1] in GF(p^2), p = 2^127-1
//  NOTE: a=c is not allowed for fp2sqr1271_a(a, c)
//
//  In:    reg_p1 = a, reg_p2 = c (pointers, 4 x 64-bit words each)
//  Out:   c0 = (a0+a1)*(a0-a1) at [c], [c+8];  c1 = 2*a0*a1 at [c+16], [c+24]
//  Clobb: rax, rcx, rdx, r8-r11, flags
//  Saved: r12-r14 (pushed and popped here, 24 bytes of stack)
//***********************************************************************
.global fp2sqr1271_a
fp2sqr1271_a:
  push   r14

  // t0 = (r9, r8) = a0 + a1, (rcx, r14) <- a1
  mov    r8, [reg_p1]
  mov    r14, [reg_p1+16]
  add    r8, r14
  mov    r9, [reg_p1+8]
  mov    rcx, [reg_p1+24]
  adc    r9, rcx
  btr    r9, 63
  push   r12
  adc    r8, 0
  adc    r9, 0

  // t1 = (r11, r10) = a0 - a1
  mov    r10, [reg_p1]
  sub    r10, r14
  mov    r11, [reg_p1+8]
  sbb    r11, rcx
  btr    r11, 63                        // CF = 1 when a0 < a1: clearing bit 127 and subtracting 1 adds p
  sbb    r10, 0
  push   r13
  sbb    r11, 0

  // c0 = t0 * t1 = (a0 + a1)*(a0 - a1), (rcx, r14, r13, r12) <- (r9, r8) * (r11, r10)
  xor    r14, r14
  mov    rax, r8
  mul    r10
  mov    r12, rax
  mov    rax, r11
  mov    r13, rdx

  mul    r8
  xor    rcx, rcx
  add    r13, rax
  adc    r14, rdx

  mov    rax, r9
  mul    r10
  mov    r8, [reg_p1]                   // t0 low word is dead: start reloading a0 for t2
  add    r13, rax
  adc    r14, rdx

  mov    rax, r9
  mul    r11
  mov    r9, [reg_p1+8]
  add    r14, rax
  adc    rcx, rdx

  // t2 = (r9, r8) = 2*a0
  add    r8, r8
  adc    r9, r9
  btr    r9, 63
  adc    r8, 0
  adc    r9, 0

  // Reducing and storing c0
  shld   rcx, r14, 1
  shld   r14, r13, 1
  btr    r13, 63
  add    r12, r14
  adc    r13, rcx
  btr    r13, 63
  adc    r12, 0
  adc    r13, 0
  mov    [reg_p2], r12
  mov    [reg_p2+8], r13

  // c1 = 2a0 * a1, (rcx, r14, r11, r10) <- (r9, r8) * [reg_p1_16-24]
  mov    rcx, [reg_p1+16]
  mov    rax, r8
  mul    rcx
  mov    r10, rax
  mov    r11, rdx

  mov    rax, [reg_p1+24]
  xor    r14, r14
  mul    r8
  add    r11, rax
  adc    r14, rdx

  mov    rax, rcx
  mul    r9
  add    r11, rax
  adc    r14, rdx

  mov    rax, [reg_p1+24]
  mul    r9
  xor    rcx, rcx
  add    r14, rax
  pop    r13
  adc    rcx, rdx

  // Reducing and storing c1
  shld   rcx, r14, 1
  shld   r14, r11, 1
  btr    r11, 63
  add    r10, r14
  pop    r12
  adc    r11, rcx
  btr    r11, 63
  adc    r10, 0
  pop    r14
  adc    r11, 0
  mov    [reg_p2+16], r10
  mov    [reg_p2+24], r11
  ret


//***************************************************************************
//  Quadratic extension field addition/subtraction
//  Operation: c [reg_p3] = 2*a [reg_p1] - b [reg_p2] in GF(p^2), p = 2^127-1
//
//  In:    reg_p1 = a, reg_p2 = b, reg_p3 = c (pointers, 4 x 64-bit words each)
//  Out:   c0 = 2*a0 - b0 at [c], [c+8];  c1 = 2*a1 - b1 at [c+16], [c+24]
//  Clobb: r8, r9, r10, flags
//  Stack: not used
//***************************************************************************
.global fp2addsub1271_a
fp2addsub1271_a:
  // c0 = 2*a0 - b0
  mov    r8, [reg_p1]
  mov    r9, [reg_p1+8]
  add    r8, r8
  adc    r9, r9
  btr    r9, 63                         // fold bit 127 of the doubling back in
  adc    r8, 0
  adc    r9, 0

  mov    r10, [reg_p2]
  sub    r8, r10
  mov    r10, [reg_p2+8]
  sbb    r9, r10
  btr    r9, 63                         // CF = 1 when the difference went negative
  sbb    r8, 0
  mov    [reg_p3], r8
  sbb    r9, 0
  mov    [reg_p3+8], r9

  // c1 = 2*a1 - b1
  mov    r8, [reg_p1+16]
  mov    r9, [reg_p1+24]
  add    r8, r8
  adc    r9, r9
  btr    r9, 63
  adc    r8, 0
  adc    r9, 0

  mov    r10, [reg_p2+16]
  sub    r8, r10
  mov    r10, [reg_p2+24]
  sbb    r9, r10
  btr    r9, 63
  sbb    r8, 0
  mov    [reg_p3+16], r8
  sbb    r9, 0
  mov    [reg_p3+24], r9
  ret