include "../../../../lib/util/Vale.Lib.Basic.vaf" include "../../../../arch/x64/Vale.X64.InsBasic.vaf" include "../../../../arch/x64/Vale.X64.InsMem.vaf" include "../../../../arch/x64/Vale.X64.InsStack.vaf" include{:fstar}{:open} "Vale.Curve25519.Fast_defs" include{:fstar}{:open} "Vale.Curve25519.Fast_lemmas_external" include{:fstar}{:open} "Vale.Curve25519.FastMul_helpers" include{:fstar}{:open} "Vale.X64.CPU_Features_s" //include{:fstar}{:open} "Vale.Def.Prop_s" module Vale.Curve25519.X64.FastMul #verbatim{:interface} open Vale.Def.Types_s open Vale.Arch.Types open Vale.X64.Machine_s open Vale.X64.Memory open Vale.X64.Stack_i open Vale.X64.State open Vale.X64.Decls open Vale.X64.InsBasic open Vale.X64.InsMem open Vale.X64.InsStack open Vale.X64.QuickCode open Vale.X64.QuickCodes open Vale.Curve25519.Fast_defs open Vale.X64.CPU_Features_s #endverbatim #verbatim{:implementation} open Vale.Def.Types_s open Vale.Arch.Types open Vale.X64.Machine_s open Vale.X64.Memory open Vale.X64.Stack_i open Vale.X64.State open Vale.X64.Decls open Vale.X64.InsBasic open Vale.X64.InsMem open Vale.X64.InsStack open Vale.X64.QuickCode open Vale.X64.QuickCodes open FStar.Tactics open Vale.Curve25519.Fast_defs open Vale.Curve25519.Fast_lemmas_external open Vale.Curve25519.FastMul_helpers open Vale.X64.CPU_Features_s #reset-options "--z3rlimit 60" #endverbatim const int_canon:tactic extern; procedure Fast_multiply_a0b( inline offset:nat, ghost dst_b:buffer64, ghost inA_b:buffer64, ghost inB_b:buffer64) {:quick} {:options z3rlimit(200)} lets dst_ptr @= rdi; inA_ptr @= rsi; inB_ptr @= rcx; a0 := buffer64_read(inA_b, 0 + offset, heap0); a1 := buffer64_read(inA_b, 1 + offset, heap0); a2 := buffer64_read(inA_b, 2 + offset, heap0); a3 := buffer64_read(inA_b, 3 + offset, heap0); b0 := buffer64_read(inB_b, 0 + offset, heap0); b1 := buffer64_read(inB_b, 1 + offset, heap0); b2 := buffer64_read(inB_b, 2 + offset, heap0); b3 := buffer64_read(inB_b, 3 + offset, heap0); a := a0 + pow2_64 * a1 + pow2_128 * a2 + pow2_192 * a3; b := b0 + pow2_64 * b1 + pow2_128 * b2 + pow2_192 * b3; reads dst_ptr; inA_ptr; inB_ptr; memLayout; modifies rax; rdx; r8; r9; r10; r11; rbx; r13; r14; heap0; efl; requires adx_enabled && bmi2_enabled; offset == 0 || offset == 4; buffers_disjoint(dst_b, inA_b); buffers_disjoint(dst_b, inB_b); validDstAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret); validSrcAddrs64(heap0, inA_ptr, inA_b, 4 + offset, memLayout, Secret); validSrcAddrs64(heap0, inB_ptr, inB_b, 4 + offset, memLayout, Secret); ensures let d0 := buffer64_read(dst_b, 0 + offset*2, heap0); let d1 := buffer64_read(dst_b, 1 + offset*2, heap0); let a0b := d0 + pow2_64 * d1 + pow2_128 * rbx + pow2_192 * r14 + pow2_256 * rax; a0b == a0 * b; rax < pow2_64 - 1; validSrcAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret); modifies_buffer_specific(dst_b, old(heap0), heap0, 0 + offset*2, 8 + offset*2 - 1); { xor_lemmas(); // "movq (%1), %%rdx; " /* A[0] */ // "mulx (B), %%r8, %%r9; " /* A[0]*B[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, (dst) ;" // "mulx 8(B), %%r10, %%r11; " /* A[0]*B[1] */ "adox %%r9, %%r10 ;" "movq %%r10, 8(dst) ;" // "mulx 16(B), %%rbx, %%r13; " /* A[0]*B[2] */ "adox %%r11, %%rbx ;" // "mulx 24(B), %%r14, %%rdx; " /* A[0]*B[3] */ "adox %%r13, %%r14 ;" "movq $0, %%rax ;" // /*******************************************/ "adox %%rdx, %%rax ;" Comment("Compute src1[0] * src2"); Load64_buffer(heap0, rdx, inA_ptr, 0 + 8 * offset, Secret, inA_b, 0 + offset); /* A[0] */ // The Xo64 clears the flags used for carry bits; NOTE: Original code uses xorl with r10d, maybe produces smaller code? NoNewline(); Mulx64(r9, r8, Mem64(heap0, inB_ptr, 0 + offset * 8, inB_b, 0 + offset, Secret)); /* A[0]*B[0] */ Space(3); Xor64(r10, r10); lemma_prod_bounds(r9, r8, a0, b0); Space(1); Store64_buffer(heap0, dst_ptr, r8, 0 + offset*16, Secret, dst_b, 0 + offset*2); Newline(); NoNewline(); Mulx64(r11, r10, Mem64(heap0, inB_ptr, 8 + offset * 8, inB_b, 1 + offset, Secret)); /* A[0]*B[1] */ let a0b1_lo := r10; lemma_prod_bounds(r11, r10, a0, b1); Space(1); Adox64Wrap(r10, r9); Space(1); Store64_buffer(heap0, dst_ptr, r10, 8 + offset * 16, Secret, dst_b, 1 + offset*2); let d0 := buffer64_read(dst_b, 0 + offset*2, heap0); let d1 := buffer64_read(dst_b, 1 + offset*2, heap0); let overflow0 := overflow(efl); ghost var carry := if overflow(efl) then 1 else 0; let a0b0b1 := d0 + pow2_64 * d1 + pow2_128 * (r11 + carry); // simple_helper(a0, b0, b1, d0, r9, a0b1_lo, r11, r10, overflow(efl)); // assert a0b0b1 == a0 * (b0 + pow2_64 * b1); // PASSES in both modes using lemma above // These work: // assert_by_tactic(a0 * (b0 + pow2_64 * b1) == a0 * b0 + pow2_64 * (a0 * b1), int_canon); // assert a0b0b1 == a0 * (b0 + pow2_64 * b1); //assert_by_tactic(a0b0b1 == a0 * (b0 + pow2_64 * b1), int_canon); // FAILS Newline(); NoNewline(); Mulx64(r13, rbx, Mem64(heap0, inB_ptr, 16 + offset * 8, inB_b, 2 + offset, Secret)); /* A[0]*B[2] */ let a0b2_lo := rbx; lemma_prod_bounds(r13, rbx, a0, b2); Adox64Wrap(rbx, r11); let overflow1 := overflow(efl); NoNewline(); Mulx64(rdx, r14, Mem64(heap0, inB_ptr, 24 + offset * 8, inB_b, 3 + offset, Secret)); /* A[0]*B[3] */ let a0b3_lo := r14; lemma_prod_bounds(rdx, r14, a0, b3); NoNewline(); Adox64Wrap(r14, r13); Mov64(rax, 0); let overflow2 := overflow(efl); /*******************************************/ Space(35); Adox64Wrap(rax, rdx); carry := if overflow2 then 1 else 0; let carry_new := if overflow(efl) then 1 else 0; //let a0b := d0 + pow2_64 * d1 + pow2_128 * rbx + pow2_192 * r14 + pow2_256 * (rdx + carry); let a0b := d0 + pow2_64 * d1 + pow2_128 * rbx + pow2_192 * r14 + pow2_256 * rax + pow2_320 * carry_new; // a0b_helper(a0, b0, b1, b2, b3, // d0, r9, // a0b1_lo, r11, // a0b2_lo, r13, // a0b3_lo, rdx, // d1, rbx, r14, // overflow0, overflow1, overflow2); // assert a0 * b == a0b; // PASSES assert_by_tactic(a0 * b == a0*b0 + pow2_64 * (a0*b1) + pow2_128 * (a0*b2) + pow2_192 * (a0*b3), int_canon); // PASSES assert a0b == a0 * b; // PASSES (with the assert_by_tactic above) //assert_by_tactic(a0b == a0 * b, int_canon); // FAILS assert carry_new == 0; assert rax < pow2_64 - 1; } procedure Fast_multiply_a1b( inline offset:nat, ghost dst_b:buffer64, ghost inA_b:buffer64, ghost inB_b:buffer64) {:quick} {:options z3rlimit(2000)} lets dst_ptr @= rdi; inA_ptr @= rsi; inB_ptr @= rcx; a0 := buffer64_read(inA_b, 0 + offset, heap0); a1 := buffer64_read(inA_b, 1 + offset, heap0); a2 := buffer64_read(inA_b, 2 + offset, heap0); a3 := buffer64_read(inA_b, 3 + offset, heap0); b0 := buffer64_read(inB_b, 0 + offset, heap0); b1 := buffer64_read(inB_b, 1 + offset, heap0); b2 := buffer64_read(inB_b, 2 + offset, heap0); b3 := buffer64_read(inB_b, 3 + offset, heap0); a := a0 + pow2_64 * a1 + pow2_128 * a2 + pow2_192 * a3; b := b0 + pow2_64 * b1 + pow2_128 * b2 + pow2_192 * b3; reads dst_ptr; inA_ptr; inB_ptr; memLayout; modifies rax; rdx; r8; r9; r10; r11; rbx; r13; r14; heap0; efl; requires adx_enabled && bmi2_enabled; offset == 0 || offset == 4; buffers_disjoint(dst_b, inA_b); buffers_disjoint(dst_b, inB_b); validDstAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret); validSrcAddrs64(heap0, inA_ptr, inA_b, 4 + offset, memLayout, Secret); validSrcAddrs64(heap0, inB_ptr, inB_b, 4 + offset, memLayout, Secret); rax < pow2_64 - 1; let d0 := buffer64_read(dst_b, 0 + offset*2, heap0); let d1 := buffer64_read(dst_b, 1 + offset*2, heap0); let a0b := d0 + pow2_64 * d1 + pow2_128 * rbx + pow2_192 * r14 + pow2_256 * rax; a0b == a0 * b; ensures let d0 := buffer64_read(dst_b, 0 + offset*2, heap0); let d1 := buffer64_read(dst_b, 1 + offset*2, heap0); let d2 := buffer64_read(dst_b, 2 + offset*2, heap0); pow2_two(a0, a1) * b == pow2_six(d0, d1, d2, rbx, r14, rax); validSrcAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret); modifies_buffer_specific(dst_b, old(heap0), heap0, 0 + offset*2, 8 + offset*2 - 1); { Comment("Compute src1[1] * src2"); xor_lemmas(); let a0b_0 := buffer64_read(dst_b, 0 + offset*2, heap0); let a0b_1 := buffer64_read(dst_b, 1 + offset*2, heap0); let a0b_2 := rbx; let a0b_3 := r14; let a0b_4 := rax; let a0b := pow2_five(a0b_0, a0b_1, a0b_2, a0b_3, a0b_4); // // "movq 8(A), %%rdx; " /* A[1] */ // "mulx (B), %%r8, %%r9; " /* A[1]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 8(dst), %%r8 ;" "movq %%r8, 8(dst) ;" // "mulx 8(B), %%r10, %%r11; " /* A[1]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%rbx, %%r10 ;" "movq %%r10, 16(dst) ;" // "mulx 16(B), %%rbx, %%r13; " /* A[1]*B[2] */ "adox %%r11, %%rbx ;" "adcx %%r14, %%rbx ;" "movq $0, %%r8 ;" // "mulx 24(B), %%r14, %%rdx; " /* A[1]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;" // /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;" Load64_buffer(heap0, rdx, inA_ptr, 8 + offset*8, Secret, inA_b, 1 + offset); /* A[1] */ NoNewline(); Mulx64(r9, r8, Mem64(heap0, inB_ptr, 0 + offset * 8, inB_b, 0 + offset, Secret)); /* A[1]*B[0] */ lemma_prod_bounds(r9, r8, a1, b0); Space(3); Xor64(r10, r10); let a1b_0 := r8; Space(1); Adcx64Wrap(r8, Mem64(heap0, dst_ptr, 8 + offset * 16, dst_b, 1 + offset*2, Secret)); Store64_buffer(heap0, dst_ptr, r8, 8 + offset * 16, Secret, dst_b, 1 + offset*2); // REVIEW: Why not combine the Adcx with the Store? NoNewline(); Mulx64(r11, r10, Mem64(heap0, inB_ptr, 8 + offset * 8, inB_b, 1 + offset, Secret)); /* A[1]*B[1] */ lemma_prod_bounds(r11, r10, a1, b1); lemma_overflow(r11, r10, r9, bool_bit(overflow(efl))); assert valid_of(efl); // OBSERVE Space(1); Adox64Wrap(r10, r9); let a1b_1 := r10; // At this point, overflow = 0, r9 < 2^64 - 1, and (r10 is <= 1 (so no new overflow), or r11 < 2^64-2) Space(1); Adcx64Wrap(r10, rbx); Store64_buffer(heap0, dst_ptr, r10, 16 + offset*16, Secret, dst_b, 2 + offset*2); NoNewline(); Mulx64(r13, rbx, Mem64(heap0, inB_ptr, 16 + offset * 8, inB_b, 2 + offset, Secret)); /* A[1]*B[2] */ lemma_prod_bounds(r13, rbx, a1, b2); lemma_overflow(r13, rbx, r11, bool_bit(overflow(efl))); NoNewline(); Adox64Wrap(rbx, r11); let a1b_2 := rbx; // At this point, either r13 < 2^64-2 or (rbx <= 1, and (overflow = 0 or r11 < 2^64 - 2)). RHS of the first or means no overflow NoNewline(); Adcx64Wrap(rbx, r14); Mov64( r8, 0); NoNewline(); Mulx64(rdx, r14, Mem64(heap0, inB_ptr, 24 + offset * 8, inB_b, 3 + offset, Secret)); /* A[1]*B[3] */ lemma_prod_bounds(rdx, r14, a1, b3); lemma_overflow(rdx, r14, r13, bool_bit(overflow(efl))); let old_carry := bool_bit(cf(efl)); let old_overflow := bool_bit(overflow(efl)); NoNewline(); Adox64Wrap(r14, r13); let a1b_3 := r14; // If rdx = 2^64 - 1, then we know r14 <= 1. If r13 < 2^64-2, then no new overflow. Else there wasn't any previous overflow, so r13 can absorb the 1 from r14 without overflowing. NoNewline(); Adcx64Wrap(r14, rax); Mov64(rax, 0); /*******************************************/ Space(35); Adox64Wrap(rax, rdx); let a1b_4 := rax; let overflow_bit := bool_bit(overflow(efl)); Adcx64Wrap(rax, r8); assert overflow_bit == 0; let carry_bit := bool_bit(cf(efl)); assert carry_bit == 0; let d0 := buffer64_read(dst_b, 0 + offset*2, heap0); let d1 := buffer64_read(dst_b, 1 + offset*2, heap0); let d2 := buffer64_read(dst_b, 2 + offset*2, heap0); let a1b := pow2_five(a1b_0, a1b_1, a1b_2, a1b_3, a1b_4); let a0a1b := pow2_seven(d0, d1, d2, rbx, r14, rax, carry_bit); assert_by_tactic(a1 * b == 0 + pow2_four(a1 * b0, a1 * b1, a1 * b2, a1 * b3), int_canon); // PASSES assert a1b == a1 * b; // PASSES lemma_sum_a1b( a0, a1, a0b, a0b_0, a0b_1, a0b_2, a0b_3, a0b_4, a1b, a1b_0, a1b_1, a1b_2, a1b_3, a1b_4, b, b0, b1, b2, b3, d1, d2, rbx, r14, rax, carry_bit); assert pow2_two(a0, a1) * b == a0a1b; // Conclusion from the lemma } procedure Fast_multiply_a2b( inline offset:nat, ghost dst_b:buffer64, ghost inA_b:buffer64, ghost inB_b:buffer64) {:quick} {:options z3rlimit(800)} lets dst_ptr @= rdi; inA_ptr @= rsi; inB_ptr @= rcx; a0 := buffer64_read(inA_b, 0 + offset, heap0); a1 := buffer64_read(inA_b, 1 + offset, heap0); a2 := buffer64_read(inA_b, 2 + offset, heap0); a3 := buffer64_read(inA_b, 3 + offset, heap0); b0 := buffer64_read(inB_b, 0 + offset, heap0); b1 := buffer64_read(inB_b, 1 + offset, heap0); b2 := buffer64_read(inB_b, 2 + offset, heap0); b3 := buffer64_read(inB_b, 3 + offset, heap0); a := a0 + pow2_64 * a1 + pow2_128 * a2 + pow2_192 * a3; b := b0 + pow2_64 * b1 + pow2_128 * b2 + pow2_192 * b3; reads dst_ptr; inA_ptr; inB_ptr; memLayout; modifies rax; rdx; r8; r9; r10; r11; rbx; r13; r14; heap0; efl; requires adx_enabled && bmi2_enabled; offset == 0 || offset == 4; buffers_disjoint(dst_b, inA_b); buffers_disjoint(dst_b, inB_b); validDstAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret); validSrcAddrs64(heap0, inA_ptr, inA_b, 4 + offset, memLayout, Secret); validSrcAddrs64(heap0, inB_ptr, inB_b, 4 + offset, memLayout, Secret); let d0 := buffer64_read(dst_b, 0 + offset*2, heap0); let d1 := buffer64_read(dst_b, 1 + offset*2, heap0); let d2 := buffer64_read(dst_b, 2 + offset*2, heap0); mul_nats(pow2_two(a0, a1), b) == pow2_six(d0, d1, d2, rbx, r14, rax); ensures let d0 := buffer64_read(dst_b, 0 + offset*2, heap0); let d1 := buffer64_read(dst_b, 1 + offset*2, heap0); let d2 := buffer64_read(dst_b, 2 + offset*2, heap0); let d3 := buffer64_read(dst_b, 3 + offset*2, heap0); pow2_three(a0, a1, a2) * b == pow2_seven(d0, d1, d2, d3, rbx, r14, rax); validSrcAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret); modifies_buffer_specific(dst_b, old(heap0), heap0, 0 + offset*2, 8 + offset*2 - 1); { Comment("Compute src1[2] * src2"); xor_lemmas(); let a0a1b_0 := buffer64_read(dst_b, 0 + offset*2, heap0); let a0a1b_1 := buffer64_read(dst_b, 1 + offset*2, heap0); let a0a1b_2 := buffer64_read(dst_b, 2 + offset*2, heap0); let a0a1b_3 := rbx; let a0a1b_4 := r14; let a0a1b_5 := rax; let a0a1b := pow2_six(a0a1b_0, a0a1b_1, a0a1b_2, a0a1b_3, a0a1b_4, a0a1b_5); // // "movq 16(A), %%rdx; " /* A[2] */ // "mulx (B), %%r8, %%r9; " /* A[2]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 16(dst), %%r8 ;" "movq %%r8, 16(dst) ;" // "mulx 8(B), %%r10, %%r11; " /* A[2]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%rbx, %%r10 ;" "movq %%r10, 24(dst) ;" // "mulx 16(B), %%rbx, %%r13; " /* A[2]*B[2] */ "adox %%r11, %%rbx ;" "adcx %%r14, %%rbx ;" "movq $0, %%r8 ;" // "mulx 24(B), %%r14, %%rdx; " /* A[2]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;" // /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;" Load64_buffer(heap0, rdx, inA_ptr, 16 + offset*8, Secret, inA_b, 2 + offset); /* A[2] */ NoNewline(); Mulx64(r9, r8, Mem64(heap0, inB_ptr, 0 + offset * 8, inB_b, 0 + offset, Secret)); /* A[2]*B[0] */ lemma_prod_bounds(r9, r8, a2, b0); Space(3); Xor64(r10, r10); let a2b_0 := r8; NoNewline(); Adcx64Wrap(r8, Mem64(heap0, dst_ptr, 16 + offset * 16, dst_b, 2 + offset * 2, Secret)); Store64_buffer(heap0, dst_ptr, r8, 16 + offset*16, Secret, dst_b, 2 + offset*2); // REVIEW: Why not combine the Adcx with the Store? NoNewline(); Mulx64(r11, r10, Mem64(heap0, inB_ptr, 8 + offset * 8, inB_b, 1 + offset, Secret)); /* A[2]*B[1] */ lemma_prod_bounds(r11, r10, a2, b1); lemma_overflow(r11, r10, r9, bool_bit(overflow(efl))); Space(1); Adox64Wrap(r10, r9); let a2b_1 := r10; Space(1); Adcx64Wrap(r10, rbx); Store64_buffer(heap0, dst_ptr, r10, 24 + offset*16, Secret, dst_b, 3 + offset*2); NoNewline(); Mulx64(r13, rbx, Mem64(heap0, inB_ptr, 16 + offset * 8, inB_b, 2 + offset, Secret)); /* A[2]*B[2] */ lemma_prod_bounds(r13, rbx, a2, b2); lemma_overflow(r13, rbx, r11, bool_bit(overflow(efl))); NoNewline(); Adox64Wrap(rbx, r11); let a2b_2 := rbx; NoNewline(); Adcx64Wrap(rbx, r14); Mov64( r8, 0); NoNewline(); Mulx64(rdx, r14, Mem64(heap0, inB_ptr, 24 + offset * 8, inB_b, 3 + offset, Secret)); /* A[2]*B[3] */ lemma_prod_bounds(rdx, r14, a2, b3); lemma_overflow(rdx, r14, r13, bool_bit(overflow(efl))); let old_carry := bool_bit(cf(efl)); let old_overflow := bool_bit(overflow(efl)); NoNewline(); Adox64Wrap(r14, r13); let a2b_3 := r14; NoNewline(); Adcx64Wrap(r14, rax); Mov64(rax, 0); /*******************************************/ Space(35); Adox64Wrap(rax, rdx); let a2b_4 := rax; Adcx64Wrap(rax, r8); let carry_bit := bool_bit(cf(efl)); let overflow_bit := bool_bit(overflow(efl)); assert overflow_bit == 0; assert carry_bit == 0; let d0 := buffer64_read(dst_b, 0 + offset*2, heap0); let d1 := buffer64_read(dst_b, 1 + offset*2, heap0); let d2 := buffer64_read(dst_b, 2 + offset*2, heap0); let d3 := buffer64_read(dst_b, 3 + offset*2, heap0); let a2b := pow2_five(a2b_0, a2b_1, a2b_2, a2b_3, a2b_4); let a0a1a2b := pow2_seven(d0, d1, d2, d3, rbx, r14, rax); assert_by_tactic(a2 * b == 0 + pow2_four(a2 * b0, a2 * b1, a2 * b2, a2 * b3), int_canon); // PASSES //assert a2b == a2 * b; // PASSES lemma_sum_a2b(a0, a1, a2, a0a1b, a0a1b_0, a0a1b_1, a0a1b_2, a0a1b_3, a0a1b_4, a0a1b_5, a2b, a2b_0, a2b_1, a2b_2, a2b_3, a2b_4, b, b0, b1, b2, b3, d2, d3, rbx, r14, rax); //assert pow2_three(a0, a1, a2) * b == a0a1a2b; // Conclusion from the lemma. // PASSES // assert d0 == a0a1b_0; // PASSES // assert d1 == a0a1b_1; // PASSES // assert pow2_three(a0, a1, a2) * b == pow2_seven(d0, d1, d2, d3, rbx, r14, rax); } procedure Fast_multiply_a3b( inline offset:nat, ghost dst_b:buffer64, ghost inA_b:buffer64, ghost inB_b:buffer64) {:quick} {:options z3rlimit(800)} lets dst_ptr @= rdi; inA_ptr @= rsi; inB_ptr @= rcx; a0 := buffer64_read(inA_b, 0 + offset, heap0); a1 := buffer64_read(inA_b, 1 + offset, heap0); a2 := buffer64_read(inA_b, 2 + offset, heap0); a3 := buffer64_read(inA_b, 3 + offset, heap0); b0 := buffer64_read(inB_b, 0 + offset, heap0); b1 := buffer64_read(inB_b, 1 + offset, heap0); b2 := buffer64_read(inB_b, 2 + offset, heap0); b3 := buffer64_read(inB_b, 3 + offset, heap0); a := a0 + pow2_64 * a1 + pow2_128 * a2 + pow2_192 * a3; b := b0 + pow2_64 * b1 + pow2_128 * b2 + pow2_192 * b3; reads dst_ptr; inA_ptr; inB_ptr; memLayout; modifies rax; rdx; r8; r9; r10; r11; rbx; r13; r14; heap0; efl; requires adx_enabled && bmi2_enabled; offset == 0 || offset == 4; buffers_disjoint(dst_b, inA_b); buffers_disjoint(dst_b, inB_b); validDstAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret); validSrcAddrs64(heap0, inA_ptr, inA_b, 4 + offset, memLayout, Secret); validSrcAddrs64(heap0, inB_ptr, inB_b, 4 + offset, memLayout, Secret); let d0 := buffer64_read(dst_b, 0 + offset*2, heap0); let d1 := buffer64_read(dst_b, 1 + offset*2, heap0); let d2 := buffer64_read(dst_b, 2 + offset*2, heap0); let d3 := buffer64_read(dst_b, 3 + offset*2, heap0); pow2_three(a0, a1, a2) * b == pow2_seven(d0, d1, d2, d3, rbx, r14, rax); ensures let d0 := buffer64_read(dst_b, 0 + offset*2, heap0); let d1 := buffer64_read(dst_b, 1 + offset*2, heap0); let d2 := buffer64_read(dst_b, 2 + offset*2, heap0); let d3 := buffer64_read(dst_b, 3 + offset*2, heap0); let d4 := buffer64_read(dst_b, 4 + offset*2, heap0); let d5 := buffer64_read(dst_b, 5 + offset*2, heap0); let d6 := buffer64_read(dst_b, 6 + offset*2, heap0); let d7 := buffer64_read(dst_b, 7 + offset*2, heap0); pow2_four(a0, a1, a2, a3) * b == pow2_eight(d0, d1, d2, d3, d4, d5, d6, d7); validSrcAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret); modifies_buffer_specific(dst_b, old(heap0), heap0, 0 + offset*2, 8 + offset*2 - 1); { Comment("Compute src1[3] * src2"); xor_lemmas(); let a0a1a2b_0 := buffer64_read(dst_b, 0 + offset*2, heap0); let a0a1a2b_1 := buffer64_read(dst_b, 1 + offset*2, heap0); let a0a1a2b_2 := buffer64_read(dst_b, 2 + offset*2, heap0); let a0a1a2b_3 := buffer64_read(dst_b, 3 + offset*2, heap0); let a0a1a2b_4 := rbx; let a0a1a2b_5 := r14; let a0a1a2b_6 := rax; let a0a1a2b := pow2_seven(a0a1a2b_0, a0a1a2b_1, a0a1a2b_2, a0a1a2b_3, a0a1a2b_4, a0a1a2b_5, a0a1a2b_6); // // "movq 24(A), %%rdx; " /* A[3] */ // "mulx (B), %%r8, %%r9; " /* A[3]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 24(dst), %%r8 ;" "movq %%r8, 24(dst) ;" // "mulx 8(B), %%r10, %%r11; " /* A[3]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%rbx, %%r10 ;" "movq %%r10, 32(dst) ;" // "mulx 16(B), %%rbx, %%r13; " /* A[3]*B[2] */ "adox %%r11, %%rbx ;" "adcx %%r14, %%rbx ;" "movq %%rbx, 40(dst) ;" "movq $0, %%r8 ;" // "mulx 24(B), %%r14, %%rdx; " /* A[3]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq %%r14, 48(dst) ;" "movq $0, %%rax ;" // /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;" "movq %%rax, 56(dst) ;" Load64_buffer(heap0, rdx, inA_ptr, 24 + offset*8, Secret, inA_b, 3 + offset); /* A[3] */ NoNewline(); Mulx64(r9, r8, Mem64(heap0, inB_ptr, 0 + offset * 8, inB_b, 0 + offset, Secret)); /* A[3]*B[0] */ lemma_prod_bounds(r9, r8, a3, b0); Space(3); Xor64(r10, r10); let a3b_0 := r8; NoNewline(); Adcx64Wrap(r8, Mem64(heap0, dst_ptr, 24 + offset * 16, dst_b, 3 + offset * 2, Secret)); Store64_buffer(heap0, dst_ptr, r8, 24 + offset * 16, Secret, dst_b, 3 + offset * 2); // REVIEW: Why not combine the Adcx with the Store? NoNewline(); Mulx64(r11, r10, Mem64(heap0, inB_ptr, 8 + offset * 8, inB_b, 1 + offset, Secret)); /* A[3]*B[1] */ lemma_prod_bounds(r11, r10, a3, b1); lemma_overflow(r11, r10, r9, bool_bit(overflow(efl))); Space(1); Adox64Wrap(r10, r9); let a3b_1 := r10; Space(1); Adcx64Wrap(r10, rbx); Store64_buffer(heap0, dst_ptr, r10, 32 + offset*16, Secret, dst_b, 4 + offset*2); NoNewline(); Mulx64(r13, rbx, Mem64(heap0, inB_ptr, 16 + offset * 8, inB_b, 2 + offset, Secret)); /* A[3]*B[2] */ lemma_prod_bounds(r13, rbx, a3, b2); lemma_overflow(r13, rbx, r11, bool_bit(overflow(efl))); NoNewline(); Adox64Wrap(rbx, r11); let a3b_2 := rbx; NoNewline(); Adcx64Wrap(rbx, r14); NoNewline(); Store64_buffer(heap0, dst_ptr, rbx, 40 + offset * 16, Secret, dst_b, 5 + offset * 2); Mov64( r8, 0); NoNewline(); Mulx64(rdx, r14, Mem64(heap0, inB_ptr, 24 + offset * 8, inB_b, 3 + offset, Secret)); /* A[3]*B[3] */ lemma_prod_bounds(rdx, r14, a3, b3); lemma_overflow(rdx, r14, r13, bool_bit(overflow(efl))); let old_carry := bool_bit(cf(efl)); let old_overflow := bool_bit(overflow(efl)); NoNewline(); Adox64Wrap(r14, r13); let a3b_3 := r14; NoNewline(); Adcx64Wrap(r14, rax); NoNewline(); Store64_buffer(heap0, dst_ptr, r14, 48 + offset * 16, Secret, dst_b, 6 + offset * 2); Mov64(rax, 0); /*******************************************/ Space(35); Adox64Wrap(rax, rdx); NoNewline(); let a3b_4 := rax; Adcx64Wrap(rax, r8); Space(1); Store64_buffer(heap0, dst_ptr, rax, 56 + offset * 16, Secret, dst_b, 7 + offset * 2); let carry_bit := bool_bit(cf(efl)); let overflow_bit := bool_bit(overflow(efl)); assert overflow_bit == 0; assert carry_bit == 0; let d0 := buffer64_read(dst_b, 0 + offset*2, heap0); let d1 := buffer64_read(dst_b, 1 + offset*2, heap0); let d2 := buffer64_read(dst_b, 2 + offset*2, heap0); let d3 := buffer64_read(dst_b, 3 + offset*2, heap0); let d4 := buffer64_read(dst_b, 4 + offset*2, heap0); let d5 := buffer64_read(dst_b, 5 + offset*2, heap0); let d6 := buffer64_read(dst_b, 6 + offset*2, heap0); let d7 := buffer64_read(dst_b, 7 + offset*2, heap0); let a3b := pow2_five(a3b_0, a3b_1, a3b_2, a3b_3, a3b_4); let a0a1a2a3b := pow2_eight(d0, d1, d2, d3, d4, d5, d6, d7); assert_by_tactic(a3 * b == 0 + pow2_four(a3 * b0, a3 * b1, a3 * b2, a3 * b3), int_canon); // PASSES assert a3b == a3 * b; // PASSES lemma_sum_a3b(a0, a1, a2, a3, a0a1a2b, a0a1a2b_0, a0a1a2b_1, a0a1a2b_2, a0a1a2b_3, a0a1a2b_4, a0a1a2b_5, a0a1a2b_6, a3b, a3b_0, a3b_1, a3b_2, a3b_3, a3b_4, b, b0, b1, b2, b3, d3, d4, d5, d6, d7); // assert pow2_four(a0, a1, a2, a3) * b == a0a1a2a3b; // Conclusion from the lemma. // PASSES } procedure Fast_multiply( inline offset:nat, ghost dst_b:buffer64, ghost inA_b:buffer64, ghost inB_b:buffer64) {:public} {:quick} lets dst_ptr @= rdi; inA_ptr @= rsi; inB_ptr @= rcx; a0 := buffer64_read(inA_b, 0 + offset, heap0); a1 := buffer64_read(inA_b, 1 + offset, heap0); a2 := buffer64_read(inA_b, 2 + offset, heap0); a3 := buffer64_read(inA_b, 3 + offset, heap0); b0 := buffer64_read(inB_b, 0 + offset, heap0); b1 := buffer64_read(inB_b, 1 + offset, heap0); b2 := buffer64_read(inB_b, 2 + offset, heap0); b3 := buffer64_read(inB_b, 3 + offset, heap0); a := pow2_four(a0, a1, a2, a3); b := pow2_four(b0, b1, b2, b3); reads dst_ptr; inA_ptr; inB_ptr; memLayout; modifies rax; rdx; r8; r9; r10; r11; rbx; r13; r14; heap0; efl; requires adx_enabled && bmi2_enabled; offset == 0 || offset == 4; buffers_disjoint(dst_b, inA_b); buffers_disjoint(dst_b, inB_b); validDstAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret); validSrcAddrs64(heap0, inA_ptr, inA_b, 4 + offset, memLayout, Secret); validSrcAddrs64(heap0, inB_ptr, inB_b, 4 + offset, memLayout, Secret); ensures let d0 := buffer64_read(dst_b, 0 + offset * 2, heap0); let d1 := buffer64_read(dst_b, 1 + offset * 2, heap0); let d2 := buffer64_read(dst_b, 2 + offset * 2, heap0); let d3 := buffer64_read(dst_b, 3 + offset * 2, heap0); let d4 := buffer64_read(dst_b, 4 + offset * 2, heap0); let d5 := buffer64_read(dst_b, 5 + offset * 2, heap0); let d6 := buffer64_read(dst_b, 6 + offset * 2, heap0); let d7 := buffer64_read(dst_b, 7 + offset * 2, heap0); let d := pow2_eight(d0, d1, d2, d3, d4, d5, d6, d7); d == a * b; validSrcAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret); modifies_buffer_specific(dst_b, old(heap0), heap0, 0 + offset*2, 8 + offset*2 - 1); { Fast_multiply_a0b(offset, dst_b, inA_b, inB_b); Newline(); Newline(); Fast_multiply_a1b(offset, dst_b, inA_b, inB_b); Newline(); Newline(); Fast_multiply_a2b(offset, dst_b, inA_b, inB_b); Newline(); Newline(); Fast_multiply_a3b(offset, dst_b, inA_b, inB_b); Newline(); } procedure Fast_mul_stdcall( inline win:bool, ghost dst_b:buffer64, ghost inA_b:buffer64, ghost inB_b:buffer64) {:public} {:quick} lets dst_ptr @= rdi; inA_ptr @= rsi; inB_ptr @= rcx; dst_in := (if win then rcx else rdi); inA_in := (if win then rdx else rsi); inB_in := (if win then r8 else rdx); reads memLayout; modifies rax; rbx; rcx; rdx; rdi; rsi; r8; r9; r10; r11; r13; r14; rsp; efl; heap0; stack; stackTaint; requires rsp == init_rsp(stack); adx_enabled && bmi2_enabled; buffers_disjoint(dst_b, inA_b); buffers_disjoint(dst_b, inB_b); validDstAddrs64(heap0, dst_in, dst_b, 8, memLayout, Secret); validSrcAddrs64(heap0, inA_in, inA_b, 4, memLayout, Secret); validSrcAddrs64(heap0, inB_in, inB_b, 4, memLayout, Secret); ensures let a0 := old(buffer64_read(inA_b, 0, heap0)); let a1 := old(buffer64_read(inA_b, 1, heap0)); let a2 := old(buffer64_read(inA_b, 2, heap0)); let a3 := old(buffer64_read(inA_b, 3, heap0)); let b0 := old(buffer64_read(inB_b, 0, heap0)); let b1 := old(buffer64_read(inB_b, 1, heap0)); let b2 := old(buffer64_read(inB_b, 2, heap0)); let b3 := old(buffer64_read(inB_b, 3, heap0)); let d0 := buffer64_read(dst_b, 0, heap0); let d1 := buffer64_read(dst_b, 1, heap0); let d2 := buffer64_read(dst_b, 2, heap0); let d3 := buffer64_read(dst_b, 3, heap0); let d4 := buffer64_read(dst_b, 4, heap0); let d5 := buffer64_read(dst_b, 5, heap0); let d6 := buffer64_read(dst_b, 6, heap0); let d7 := buffer64_read(dst_b, 7, heap0); let a := a0 + pow2_64 * a1 + pow2_128 * a2 + pow2_192 * a3; let b := b0 + pow2_64 * b1 + pow2_128 * b2 + pow2_192 * b3; let d := d0 + pow2_64 * d1 + pow2_128 * d2 + pow2_192 * d3 + pow2_256 * d4 + pow2_320 * d5 + pow2_384 * d6 + pow2_448 * d7; d == a * b; ////////////////////////////////////// // Framing ////////////////////////////////////// modifies_buffer(dst_b, old(heap0), heap0); validSrcAddrs64(heap0, dst_in, dst_b, 8, memLayout, Secret); rsi == old(rsi); r13 == old(r13); r14 == old(r14); rsp == old(rsp); { // Store callee-save registers Push(rsi); Push(r13); Push(r14); // Line up the rest of the arguments inline if (win) { Mov64(dst_ptr, rcx); Mov64(inA_ptr, rdx); Mov64(inB_ptr, r8); } else { Mov64(inB_ptr, rdx); } Fast_multiply(0, dst_b, inA_b, inB_b); Pop(r14); Pop(r13); Pop(rsi); } procedure Fast_mul2_stdcall( inline win:bool, ghost dst_b:buffer64, ghost inA_b:buffer64, ghost inB_b:buffer64) {:public} {:quick} lets dst_ptr @= rdi; inA_ptr @= rsi; inB_ptr @= rcx; dst_in := (if win then rcx else rdi); inA_in := (if win then rdx else rsi); inB_in := (if win then r8 else rdx); reads memLayout; modifies rax; rbx; rcx; rdx; rdi; rsi; r8; r9; r10; r11; r13; r14; rsp; efl; heap0; stack; stackTaint; requires rsp == init_rsp(stack); adx_enabled && bmi2_enabled; buffers_disjoint(dst_b, inA_b); buffers_disjoint(dst_b, inB_b); validDstAddrs64(heap0, dst_in, dst_b, 16, memLayout, Secret); validSrcAddrs64(heap0, inA_in, inA_b, 8, memLayout, Secret); validSrcAddrs64(heap0, inB_in, inB_b, 8, memLayout, Secret); ensures let a0 := old(buffer64_read(inA_b, 0, heap0)); let a1 := old(buffer64_read(inA_b, 1, heap0)); let a2 := old(buffer64_read(inA_b, 2, heap0)); let a3 := old(buffer64_read(inA_b, 3, heap0)); let b0 := old(buffer64_read(inB_b, 0, heap0)); let b1 := old(buffer64_read(inB_b, 1, heap0)); let b2 := old(buffer64_read(inB_b, 2, heap0)); let b3 := old(buffer64_read(inB_b, 3, heap0)); let a := pow2_four(a0, a1, a2, a3); let b := pow2_four(b0, b1, b2, b3); let a0' := old(buffer64_read(inA_b, 0 + 4, heap0)); let a1' := old(buffer64_read(inA_b, 1 + 4, heap0)); let a2' := old(buffer64_read(inA_b, 2 + 4, heap0)); let a3' := old(buffer64_read(inA_b, 3 + 4, heap0)); let b0' := old(buffer64_read(inB_b, 0 + 4, heap0)); let b1' := old(buffer64_read(inB_b, 1 + 4, heap0)); let b2' := old(buffer64_read(inB_b, 2 + 4, heap0)); let b3' := old(buffer64_read(inB_b, 3 + 4, heap0)); let a' := pow2_four(a0', a1', a2', a3'); let b' := pow2_four(b0', b1', b2', b3'); let d0 := buffer64_read(dst_b, 0, heap0); let d1 := buffer64_read(dst_b, 1, heap0); let d2 := buffer64_read(dst_b, 2, heap0); let d3 := buffer64_read(dst_b, 3, heap0); let d4 := buffer64_read(dst_b, 4, heap0); let d5 := buffer64_read(dst_b, 5, heap0); let d6 := buffer64_read(dst_b, 6, heap0); let d7 := buffer64_read(dst_b, 7, heap0); let d := pow2_eight(d0, d1, d2, d3, d4, d5, d6, d7); let d0' := buffer64_read(dst_b, 0 + 8, heap0); let d1' := buffer64_read(dst_b, 1 + 8, heap0); let d2' := buffer64_read(dst_b, 2 + 8, heap0); let d3' := buffer64_read(dst_b, 3 + 8, heap0); let d4' := buffer64_read(dst_b, 4 + 8, heap0); let d5' := buffer64_read(dst_b, 5 + 8, heap0); let d6' := buffer64_read(dst_b, 6 + 8, heap0); let d7' := buffer64_read(dst_b, 7 + 8, heap0); let d' := pow2_eight(d0', d1', d2', d3', d4', d5', d6', d7'); d == a * b; d' == a' * b'; ////////////////////////////////////// // Framing ////////////////////////////////////// modifies_buffer(dst_b, old(heap0), heap0); validSrcAddrs64(heap0, dst_in, dst_b, 16, memLayout, Secret); rsi == old(rsi); r13 == old(r13); r14 == old(r14); rsp == old(rsp); { // Store callee-save registers Push(r13); Push(r14); // Line up the rest of the arguments inline if (win) { Push(rsi); Mov64(dst_ptr, rcx); Mov64(inA_ptr, rdx); Mov64(inB_ptr, r8); } else { Mov64(inB_ptr, rdx); } Fast_multiply(0, dst_b, inA_b, inB_b); Fast_multiply(4, dst_b, inA_b, inB_b); inline if (win) { Pop(rsi); } Pop(r14); Pop(r13); }