# mp_limb_t mulredc(mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y,
#                   const mp_limb_t *m, mp_size_t n, mp_limb_t inv_m)
#
# Compute z := x*y mod m, in Montgomery representation, where x, y < m
# and m is n limbs wide.  inv_m is the least significant limb of
# -1/m modulo 2^(n*GMP_LIMB_BITS), i.e. -1/m mod 2^GMP_LIMB_BITS
# (the negated inverse is what makes the low limb cancel in REDC).
#
# The result might be unreduced (larger than m) but becomes reduced
# after subtracting m.  The calling function should take care of that.
#
# We use temporary space for the unreduced product on the stack.
# Therefore, this cannot be used for large integers (the algorithm is
# quadratic anyway).
#
# WARNING: z is only n limbs, but since the result might be unreduced,
# there can be a carry that does not fit in z.  This carry is returned.

	.text
	.globl	mulredc
	.type	mulredc,@function
mulredc:
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15		# save the caller's callee-saved registers
	movq	%r8, %rax
	shlq	$4, %rax
	addq	$8, %rax
	subq	%rax, %rsp	# allocate TMP on stack: 2n+1 limbs
	movq	%rsp, %r10	# r10 <- TMP
	pushq	%r10		# pointer to TMP
	pushq	%rdi		# push parameters
	pushq	%rsi
	pushq	%rdx
	pushq	%rcx
# Stack:				Other parameters:
#    (%rsp) : m				%r8 : n
#   8(%rsp) : y				%r9 : inv_m
#  16(%rsp) : x
#  24(%rsp) : z
#  32(%rsp) : TMP (pointer to)
#  40(%rsp) : TMP (first limb)

### Init: put 0 in TMP[0..2n] #############################
	movq	%r8, %rcx
	shlq	$1, %rcx
	incq	%rcx		# rcx <- 2n+1
	movq	32(%rsp), %rdi
InitLoop:
	movq	$0, (%rdi)
	addq	$8, %rdi
	decq	%rcx
	jnz	InitLoop

### Outer: ################################################
### for (i = 0; i < n; ++i) {
###   u = (TMP[0] + x[i]*y[0]) * inv_m  mod 2^GMP_LIMB_BITS;
###   TMP[0..n+1] += x[i]*y + u*m;	// this is "Inner"
###   TMP++;
### }
###########################################################
	movq	%r8, %r15	# r15 <- outer loop counter
OuterLoop:
	## compute u
	movq	16(%rsp), %rbx	# rbx <- x (current position)
	movq	(%rbx), %rax	# rax <- x[i]
	movq	8(%rsp), %rbx	# rbx <- y
	mulq	(%rbx)		# rdx:rax <- x[i]*y[0]
	movq	32(%rsp), %rbx	# rbx <- TMP (current position)
	addq	(%rbx), %rax	# rax <- TMP[0] + x[i]*y[0] (low limb only)
	mulq	%r9
	movq	%rax, %r10	# %r10 gets the multiplier u

	######################
	### Inner: TMP[0..n+1] += x[i]*y + u*m
	###
	### ax: ...		r8 : n (for outer)
	### bx: m		r9 : inv_m (for outer)
	### cx: inner cpt	r10: u
	### dx: ...		r11: x[i]
	### di: TMP		r12, r13, r14: ...
	### si: y		r15: outer cpt
	######################
	movq	(%rsp), %rbx	# rbx <- m
	movq	32(%rsp), %rdi	# rdi <- TMP (current position)
	movq	8(%rsp), %rsi	# rsi <- y
	movq	16(%rsp), %rdx
	movq	(%rdx), %r11	# r11 <- x[i]
	movq	%r8, %rcx	# rcx <- inner loop counter
	xorq	%r12, %r12
	xorq	%r13, %r13
InnerLoop:
	### r12: carry lo, r13: carry hi (can be at most 2)
	movq	(%rsi), %rax	# rax <- y[j]
	mulq	%r11		# rdx:rax <- x[i]*y[j]
	addq	%rax, %r12
	adcq	$0, %rdx
	addq	%r12, (%rdi)	# TMP[j] += low(x[i]*y[j]) + carry lo
	adcq	$0, %rdx	## carry flag is clean: rdx cannot overflow here
	movq	%rdx, %r14	# r14 <- high word so far
	movq	(%rbx), %rax	# rax <- m[j]
	mulq	%r10		# rdx:rax <- u*m[j]
	movq	%r13, %r12	## carry hi becomes carry lo
	xorq	%r13, %r13
	addq	%rax, (%rdi)	# TMP[j] += low(u*m[j])
	adcq	%rdx, %r12
	adcq	$0, %r13
	addq	%r14, %r12
	adcq	$0, %r13

	addq	$8, %rdi
	addq	$8, %rsi
	addq	$8, %rbx
	decq	%rcx
	jnz	InnerLoop
	addq	%r12, (%rdi)	# propagate the remaining carry
	adcq	%r13, 8(%rdi)	#   into TMP[n] and TMP[n+1]
	######################

	## advance TMP and x
	movq	32(%rsp), %rdx
	addq	$8, %rdx
	movq	%rdx, 32(%rsp)
	movq	16(%rsp), %rdx
	addq	$8, %rdx
	movq	%rdx, 16(%rsp)
	decq	%r15
	jnz	OuterLoop

### Finish: ###############################################
### Copy TMP[n..2n-1] into z (TMP pointer was advanced n times)
###########################################################
	movq	%r8, %rcx
	movq	32(%rsp), %rsi
	movq	24(%rsp), %rdi
FinishLoop:
	movq	(%rsi), %rax
	movq	%rax, (%rdi)
	addq	$8, %rdi
	addq	$8, %rsi
	decq	%rcx
	jnz	FinishLoop
	movq	(%rsi), %rax	## returned value: the top carry limb TMP[2n].
				## Load it *before* freeing the stack space,
				## or we would read below %rsp after the pops.
	addq	$40, %rsp	# clean parameters
	movq	%r8, %rcx	# use %rcx here: %rax holds the return value
	shlq	$4, %rcx
	addq	$8, %rcx
	addq	%rcx, %rsp	# free TMP on stack
	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
	.size	mulredc,.-mulredc
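
# ---------------------------------------------------------------------
# For reference, a minimal C sketch of the same interleaved Montgomery
# multiplication, usable as a test oracle for this routine.  This is an
# illustration only, not part of this file's build and not GMP API: the
# name mulredc_ref, the use of unsigned __int128 (a GCC/Clang extension)
# and the hard-coded 64-bit limb size are assumptions.
#
#   #include <gmp.h>
#
#   /* z := x*y/2^(64n) mod m, possibly unreduced; returns the top
#      carry limb.  Assumes GMP_LIMB_BITS == 64, m odd, x, y < m,
#      n >= 1, and inv_m == -1/m mod 2^64. */
#   mp_limb_t
#   mulredc_ref (mp_limb_t *z, const mp_limb_t *x, const mp_limb_t *y,
#                const mp_limb_t *m, mp_size_t n, mp_limb_t inv_m)
#   {
#     mp_limb_t tmp[2*n+1];               /* TMP[0..2n], as above */
#     mp_size_t i, j;
#
#     for (i = 0; i < 2*n+1; i++)         /* InitLoop */
#       tmp[i] = 0;
#     for (i = 0; i < n; i++)             /* one OuterLoop pass */
#       {
#         mp_limb_t u = (tmp[i] + x[i] * y[0]) * inv_m;
#         mp_limb_t cylo = 0, cyhi = 0;   /* r12 and r13 above */
#         for (j = 0; j < n; j++)         /* InnerLoop */
#           {
#             /* two stages, as in the asm, so nothing overflows:
#                each 128-bit sum stays below 2^128 */
#             unsigned __int128 t = (unsigned __int128) x[i] * y[j]
#                                   + tmp[i+j] + cylo;
#             unsigned __int128 s = (unsigned __int128) u * m[j]
#                                   + (mp_limb_t) t;
#             tmp[i+j] = (mp_limb_t) s;   /* zero when j == 0 */
#             t = (t >> 64) + (s >> 64) + cyhi;
#             cylo = (mp_limb_t) t;
#             cyhi = (mp_limb_t) (t >> 64);   /* at most 2 */
#           }
#         /* propagate into TMP[i+n] and TMP[i+n+1] */
#         unsigned __int128 c = (unsigned __int128) tmp[i+n] + cylo;
#         tmp[i+n] = (mp_limb_t) c;
#         tmp[i+n+1] += cyhi + (mp_limb_t) (c >> 64);
#       }
#     for (i = 0; i < n; i++)             /* FinishLoop */
#       z[i] = tmp[n+i];
#     return tmp[2*n];                    /* the carry limb */
#   }
#
# Compiled with gcc -O2 on a 64-bit target, this should agree with
# mulredc limb for limb on random inputs, which makes it handy for
# unit-testing the assembly.
# ---------------------------------------------------------------------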
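
# The inv_m argument itself can be obtained by Hensel lifting (Newton's
# iteration modulo 2^k), needing no division.  A sketch, again assuming
# 64-bit limbs; the name compute_inv_m is hypothetical:
#
#   /* returns -1/m0 mod 2^64 for odd m0 (least significant limb of m) */
#   mp_limb_t
#   compute_inv_m (mp_limb_t m0)
#   {
#     mp_limb_t inv = m0;        /* valid to 3 bits: m0*m0 == 1 mod 8 */
#     int k;
#     for (k = 0; k < 5; k++)    /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
#       inv *= 2 - m0 * inv;     /* each Hensel step doubles precision */
#     return -inv;               /* negate: inv_m = -1/m mod 2^64 */
#   }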