// 256-bit Montgomery multiplication, x86-64, GNU as AT&T syntax.
// The file is expected to pass through the C preprocessor (see the
// __APPLE__ test further down).
// Uses MULX (BMI2) and ADCX/ADOX (ADX); the CPU must support both.

// A*B
// Schoolbook multiplication of four 64b limbs
// result in r8 - r15
//
// a, b : memory operands of the form (reg); limb i is read at offset 8*i,
//        limb 0 being the least significant one.
// Out  : r8 (lowest limb) .. r15 (highest limb) = 512-bit product.
// Clobbers rax (left at 0), rbx, rbp, rdx and the flags.
//
// Each row multiplies one limb of a by all of b. MULX leaves the flags
// alone, so a row can run two independent carry chains at once:
// ADCX carries through CF (low product halves) and ADOX carries through
// OF (high product halves). The xor in front of a row clears both flags.
.macro mul_256 a b
    // row 0: a[0] * b, only the CF chain is needed
    xor     %rax, %rax
    mov     0x00\a, %rdx
    mulx    0x00\b, %r8, %r9
    mulx    0x08\b, %rbx, %r10
    adcx    %rbx, %r9
    mulx    0x10\b, %rbx, %r11
    adcx    %rbx, %r10
    mulx    0x18\b, %rbx, %r12
    adcx    %rbx, %r11
    adcx    %rax, %r12

    // row 1: a[1] * b, accumulated into r9..r13
    xor     %rax, %rax
    mov     0x08\a, %rdx
    mulx    0x00\b, %rbp, %rbx
    adcx    %rbp, %r9
    adox    %rbx, %r10
    mulx    0x08\b, %rbp, %rbx
    adcx    %rbp, %r10
    adox    %rbx, %r11
    mulx    0x10\b, %rbp, %rbx
    adcx    %rbp, %r11
    adox    %rbx, %r12
    mulx    0x18\b, %rbp, %r13
    adcx    %rbp, %r12
    adox    %rax, %r13
    adcx    %rax, %r13

    // row 2: a[2] * b, accumulated into r10..r14
    xor     %rax, %rax
    mov     0x10\a, %rdx
    mulx    0x00\b, %rbp, %rbx
    adcx    %rbp, %r10
    adox    %rbx, %r11
    mulx    0x08\b, %rbp, %rbx
    adcx    %rbp, %r11
    adox    %rbx, %r12
    mulx    0x10\b, %rbp, %rbx
    adcx    %rbp, %r12
    adox    %rbx, %r13
    mulx    0x18\b, %rbp, %r14
    adcx    %rbp, %r13
    adox    %rax, %r14
    adcx    %rax, %r14

    // row 3: a[3] * b, accumulated into r11..r15
    xor     %rax, %rax
    mov     0x18\a, %rdx
    mulx    0x00\b, %rbp, %rbx
    adcx    %rbp, %r11
    adox    %rbx, %r12
    mulx    0x08\b, %rbp, %rbx
    adcx    %rbp, %r12
    adox    %rbx, %r13
    mulx    0x10\b, %rbp, %rbx
    adcx    %rbp, %r13
    adox    %rbx, %r14
    mulx    0x18\b, %rbp, %r15
    adcx    %rbp, %r14
    adox    %rax, %r15
    adcx    %rax, %r15
.endm

// Montgomery reduction
// expects multiplication result in r8 - r15
// See algo 14.32 from Handbook of Applied Cryptography
//
// res  : memory operand of the form (reg) receiving the four result limbs.
//        It must not be based on rax, rbx, rbp, rdx or rsi, which are
//        overwritten before the store.
// name : accepted for compatibility with existing invocations; unused,
//        because the final selection no longer needs a label.
//
// The modulus limbs and the Montgomery constant are read from .LM.
// rsi is saved and restored around the macro; rax is left at 0;
// rbx, rbp, rdx, r8-r15 and the flags are clobbered.
//
// Every round computes u = t[i] * (constant at .LM+0x20) mod 2^64 and adds
// u * modulus at limb position i, which turns limb i into zero.
// NOTE(review): the flags are cleared only once, before round 0. Later
// rounds rely on no carry leaving r15 in the previous round. That holds
// when both multiplication inputs are below the modulus - confirm that
// callers guarantee it.
.macro red_256 res name
    push    %rsi
    lea     .LM(%rip), %rsi
    xor     %rax, %rax

    // round 0: cancel r8
    mov     0x20(%rsi), %rdx
    mulx    %r8, %rdx, %rbp
    mulx    0x00(%rsi), %rbp, %rbx
    adox    %rbp, %r8
    adcx    %rbx, %r9
    mulx    0x08(%rsi), %rbp, %rbx
    adox    %rbp, %r9
    adcx    %rbx, %r10
    mulx    0x10(%rsi), %rbp, %rbx
    adox    %rbp, %r10
    adcx    %rbx, %r11
    mulx    0x18(%rsi), %rbp, %rbx
    adox    %rbp, %r11
    adcx    %rbx, %r12
    adox    %rax, %r12
    adcx    %rax, %r13
    adox    %rax, %r13
    adcx    %rax, %r14
    adox    %rax, %r14
    adcx    %rax, %r15
    adox    %rax, %r15

    // round 1: cancel r9
    mov     0x20(%rsi), %rdx
    mulx    %r9, %rdx, %rbp
    mulx    0x00(%rsi), %rbp, %rbx
    adox    %rbp, %r9
    adcx    %rbx, %r10
    mulx    0x08(%rsi), %rbp, %rbx
    adox    %rbp, %r10
    adcx    %rbx, %r11
    mulx    0x10(%rsi), %rbp, %rbx
    adox    %rbp, %r11
    adcx    %rbx, %r12
    mulx    0x18(%rsi), %rbp, %rbx
    adox    %rbp, %r12
    adcx    %rbx, %r13
    adox    %rax, %r13
    adcx    %rax, %r14
    adox    %rax, %r14
    adcx    %rax, %r15
    adox    %rax, %r15

    // round 2: cancel r10
    mov     0x20(%rsi), %rdx
    mulx    %r10, %rdx, %rbp
    mulx    0x00(%rsi), %rbp, %rbx
    adox    %rbp, %r10
    adcx    %rbx, %r11
    mulx    0x08(%rsi), %rbp, %rbx
    adox    %rbp, %r11
    adcx    %rbx, %r12
    mulx    0x10(%rsi), %rbp, %rbx
    adox    %rbp, %r12
    adcx    %rbx, %r13
    mulx    0x18(%rsi), %rbp, %rbx
    adox    %rbp, %r13
    adcx    %rbx, %r14
    adox    %rax, %r14
    adcx    %rax, %r15
    adox    %rax, %r15

    // round 3: cancel r11. r8-r11 are no longer needed as accumulators,
    // so the modulus limbs are loaded into them on the way and reused
    // by the final subtraction.
    mov     0x20(%rsi), %rdx
    mulx    %r11, %rdx, %rbp
    mov     0x00(%rsi), %r8
    mulx    %r8, %rbp, %rbx
    adox    %rbp, %r11
    adcx    %rbx, %r12
    mov     0x08(%rsi), %r9
    mulx    %r9, %rbp, %rbx
    adox    %rbp, %r12
    adcx    %rbx, %r13
    mov     0x10(%rsi), %r10
    mulx    %r10, %rbp, %rbx
    adox    %rbp, %r13
    adcx    %rbx, %r14
    mov     0x18(%rsi), %r11
    mulx    %r11, %rbp, %rbx
    adox    %rbp, %r14
    adcx    %rbx, %r15
    adox    %rax, %r15

    // Final step: t = r12..r15, result = (t >= modulus) ? t - modulus : t.
    // Done with conditional moves and a single store so that neither the
    // instruction stream nor the store pattern depends on the operands.
    // rsi is free here: the modulus is already in r8-r11 and the saved
    // value is popped below. rax is deliberately left untouched (0).
    mov     %r12, %rbx
    mov     %r13, %rbp
    mov     %r14, %rdx
    mov     %r15, %rsi
    sub     %r8, %r12
    sbb     %r9, %r13
    sbb     %r10, %r14
    sbb     %r11, %r15
    // CF set means the subtraction borrowed, i.e. t < modulus: keep t.
    cmovc   %rbx, %r12
    cmovc   %rbp, %r13
    cmovc   %rdx, %r14
    cmovc   %rsi, %r15
    mov     %r12, 0x00\res
    mov     %r13, 0x08\res
    mov     %r14, 0x10\res
    mov     %r15, 0x18\res

    pop     %rsi
.endm

// Full Montgomery multiplication: 4x4 limb product of a and b followed by
// the reduction above; the four result limbs are written to res.
// name is forwarded to red_256 unchanged.
.macro mod_mul_256 a b res name
    mul_256 \a, \b
    red_256 \res, \name
.endm

// BLS12-381 G1 order r used as modulus
// Montgomery constant -m^-1 mod b
//
// Layout: four modulus limbs, least significant first (offsets 0x00-0x18),
// then the Montgomery constant at offset 0x20.
.LM:
    .quad 0xffffffff00000001
    .quad 0x53bda402fffe5bfe
    .quad 0x3339d80809a1d805
    .quad 0x73eda753299d7d48
    .quad 0xfffffffeffffffff

// mod_mul_4w: Montgomery product of two 4-limb values modulo the modulus
// stored at .LM.
//
// In : rdi = x, rsi = y, rdx = result; each points to four 64-bit limbs,
//      least significant first (the first three System V AMD64 integer
//      argument registers).
// Out: result[0..3] = x * y * 2^-256 mod m, as computed by the reduction
//      from HAC algorithm 14.32. result is written only after x and y
//      have been fully read.
// NOTE(review): the operands are presumably expected to be below m already;
//      verify against the callers.
// Saves and restores rbp, rbx, r12-r15. rax is 0 on return; rcx, rdx,
// r8-r11 and the flags are clobbered.
#ifdef __APPLE__
.global _mod_mul_4w
_mod_mul_4w:
#else
.global mod_mul_4w
mod_mul_4w:
#endif
    // x = rdi
    // y = rsi
    // result = rdx
    push    %rbp
    push    %rbx
    push    %r12
    push    %r13
    push    %r14
    push    %r15

    // rdx is the implicit MULX source operand, so the result pointer
    // has to live elsewhere.
    mov     %rdx, %rcx          // rcx = result

    // x * y
    mod_mul_256 (%rdi), (%rsi), (%rcx), mm

    pop     %r15
    pop     %r14
    pop     %r13
    pop     %r12
    pop     %rbx
    pop     %rbp
    ret