include "../../../../lib/util/Vale.Lib.Basic.vaf"
include "../../../../arch/x64/Vale.X64.InsBasic.vaf"
include "../../../../arch/x64/Vale.X64.InsMem.vaf"
include "../../../../arch/x64/Vale.X64.InsStack.vaf"
include{:fstar}{:open} "Vale.Curve25519.Fast_defs"
include{:fstar}{:open} "Vale.Curve25519.Fast_lemmas_external"
include{:fstar}{:open} "Vale.Curve25519.FastMul_helpers"
include{:fstar}{:open} "Vale.X64.CPU_Features_s"

//include{:fstar}{:open} "Vale.Def.Prop_s"
module Vale.Curve25519.X64.FastMul

#verbatim{:interface}
open Vale.Def.Types_s
open Vale.Arch.Types
open Vale.X64.Machine_s
open Vale.X64.Memory
open Vale.X64.Stack_i
open Vale.X64.State
open Vale.X64.Decls
open Vale.X64.InsBasic
open Vale.X64.InsMem
open Vale.X64.InsStack
open Vale.X64.QuickCode
open Vale.X64.QuickCodes
open Vale.Curve25519.Fast_defs
open Vale.X64.CPU_Features_s
#endverbatim

#verbatim{:implementation}
open Vale.Def.Types_s
open Vale.Arch.Types
open Vale.X64.Machine_s
open Vale.X64.Memory
open Vale.X64.Stack_i
open Vale.X64.State
open Vale.X64.Decls
open Vale.X64.InsBasic
open Vale.X64.InsMem
open Vale.X64.InsStack
open Vale.X64.QuickCode
open Vale.X64.QuickCodes
open FStar.Tactics
open Vale.Curve25519.Fast_defs
open Vale.Curve25519.Fast_lemmas_external
open Vale.Curve25519.FastMul_helpers
open Vale.X64.CPU_Features_s
#reset-options "--z3rlimit 60"
#endverbatim

const int_canon:tactic extern;

procedure Fast_multiply_a0b(
        inline offset:nat,
        ghost dst_b:buffer64,
        ghost inA_b:buffer64,
        ghost inB_b:buffer64)
    {:quick}
    {:options z3rlimit(200)}
    lets
        dst_ptr @= rdi;
        inA_ptr @= rsi;
        inB_ptr @= rcx;

        a0 := buffer64_read(inA_b, 0 + offset, heap0);
        a1 := buffer64_read(inA_b, 1 + offset, heap0);
        a2 := buffer64_read(inA_b, 2 + offset, heap0);
        a3 := buffer64_read(inA_b, 3 + offset, heap0);

        b0 := buffer64_read(inB_b, 0 + offset, heap0);
        b1 := buffer64_read(inB_b, 1 + offset, heap0);
        b2 := buffer64_read(inB_b, 2 + offset, heap0);
        b3 := buffer64_read(inB_b, 3 + offset, heap0);

        a := a0 + pow2_64 * a1 + pow2_128 * a2 + pow2_192 * a3;
        b := b0 + pow2_64 * b1 + pow2_128 * b2 + pow2_192 * b3;
    reads
        dst_ptr; inA_ptr; inB_ptr; memLayout;

    modifies
        rax; rdx; r8; r9; r10; r11; rbx; r13; r14;
        heap0; efl;

    requires
        adx_enabled && bmi2_enabled;
        offset == 0 || offset == 4;
        buffers_disjoint(dst_b, inA_b);
        buffers_disjoint(dst_b, inB_b);

        validDstAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret);
        validSrcAddrs64(heap0, inA_ptr, inA_b, 4 + offset, memLayout, Secret);
        validSrcAddrs64(heap0, inB_ptr, inB_b, 4 + offset, memLayout, Secret);

    ensures
        let d0 := buffer64_read(dst_b, 0 + offset*2, heap0);
        let d1 := buffer64_read(dst_b, 1 + offset*2, heap0);
        let a0b := d0 + pow2_64 * d1 + pow2_128 * rbx + pow2_192 * r14 + pow2_256 * rax;
        a0b == a0 * b;

        rax < pow2_64 - 1;

        validSrcAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret);

        modifies_buffer_specific(dst_b, old(heap0), heap0, 0 + offset*2, 8 + offset*2 - 1);
{
    xor_lemmas();

//    "movq   (%1), %%rdx; " /* A[0] */
//    "mulx   (B),  %%r8,  %%r9; " /* A[0]*B[0] */    "xorl %%r10d, %%r10d ;"                           "movq  %%r8,  (dst) ;"
//    "mulx  8(B), %%r10, %%r11; " /* A[0]*B[1] */    "adox  %%r9, %%r10 ;"                             "movq %%r10, 8(dst) ;"
//    "mulx 16(B), %%rbx, %%r13; " /* A[0]*B[2] */    "adox %%r11, %%rbx ;"
//    "mulx 24(B), %%r14, %%rdx; " /* A[0]*B[3] */    "adox %%r13, %%r14 ;"                                                       "movq $0, %%rax ;"
//    /*******************************************/   "adox %%rdx, %%rax ;"

    Comment("Compute src1[0] * src2");
    Load64_buffer(heap0, rdx, inA_ptr, 0 + 8 * offset, Secret, inA_b, 0 + offset);     /* A[0] */
    // The Xo64 clears the flags used for carry bits; NOTE: Original code uses xorl with r10d, maybe produces smaller code?
    NoNewline();
    Mulx64(r9, r8, Mem64(heap0, inB_ptr, 0 + offset * 8, inB_b, 0 + offset, Secret));  /* A[0]*B[0] */ 
    Space(3);
    Xor64(r10, r10);     
    lemma_prod_bounds(r9, r8, a0, b0);   
    Space(1);
    Store64_buffer(heap0, dst_ptr, r8, 0 + offset*16, Secret, dst_b, 0 + offset*2);

    Newline();
    NoNewline();
    Mulx64(r11, r10, Mem64(heap0, inB_ptr, 8 + offset * 8, inB_b, 1 + offset, Secret));  /* A[0]*B[1] */ 
    let a0b1_lo := r10; lemma_prod_bounds(r11, r10, a0, b1);
    Space(1);
    Adox64Wrap(r10,  r9);
    Space(1);
    Store64_buffer(heap0, dst_ptr, r10, 8 + offset * 16, Secret, dst_b, 1 + offset*2);

    let d0 := buffer64_read(dst_b, 0 + offset*2, heap0);
    let d1 := buffer64_read(dst_b, 1 + offset*2, heap0);
    let overflow0 := overflow(efl);
    ghost var carry := if overflow(efl) then 1 else 0;
    let a0b0b1 := d0 + pow2_64 * d1 + pow2_128 * (r11 + carry);
//    simple_helper(a0, b0, b1, d0, r9, a0b1_lo, r11, r10, overflow(efl));
//    assert a0b0b1 == a0 * (b0 + pow2_64 * b1);   // PASSES in both modes using lemma above

// These work:
//    assert_by_tactic(a0 * (b0 + pow2_64 * b1) == a0 * b0 + pow2_64 * (a0 * b1), int_canon);
//    assert a0b0b1 == a0 * (b0 + pow2_64 * b1);

    //assert_by_tactic(a0b0b1 == a0 * (b0 + pow2_64 * b1), int_canon);   // FAILS
    Newline();
    NoNewline();    
    Mulx64(r13, rbx, Mem64(heap0, inB_ptr, 16 + offset * 8, inB_b, 2 + offset, Secret));  /* A[0]*B[2] */ 
    let a0b2_lo := rbx; lemma_prod_bounds(r13, rbx, a0, b2); 
    Adox64Wrap(rbx, r11);

    let overflow1 := overflow(efl);
    NoNewline();
    Mulx64(rdx, r14, Mem64(heap0, inB_ptr, 24 + offset * 8, inB_b, 3 + offset, Secret));  /* A[0]*B[3] */ 
    let a0b3_lo := r14; lemma_prod_bounds(rdx, r14, a0, b3); 
    NoNewline();
    Adox64Wrap(r14, r13);
    Mov64(rax, 0);
    let overflow2 := overflow(efl);


    /*******************************************/  Space(35);  Adox64Wrap(rax, rdx);

    carry := if overflow2 then 1 else 0;
    let carry_new := if overflow(efl) then 1 else 0;
    //let a0b := d0 + pow2_64 * d1 + pow2_128 * rbx + pow2_192 * r14 + pow2_256 * (rdx + carry);
    let a0b := d0 + pow2_64 * d1 + pow2_128 * rbx + pow2_192 * r14 + pow2_256 * rax + pow2_320 * carry_new;

//    a0b_helper(a0, b0, b1, b2, b3,
//               d0,      r9,
//               a0b1_lo, r11,
//               a0b2_lo, r13,
//               a0b3_lo, rdx,
//               d1, rbx, r14,
//               overflow0, overflow1, overflow2);
//    assert a0 * b == a0b;   // PASSES

    assert_by_tactic(a0 * b == a0*b0 + pow2_64 * (a0*b1) + pow2_128 * (a0*b2) + pow2_192 * (a0*b3), int_canon);   // PASSES
    assert a0b == a0 * b;   // PASSES (with the assert_by_tactic above)
    //assert_by_tactic(a0b == a0 * b, int_canon);   // FAILS

    assert carry_new == 0;
    assert rax < pow2_64 - 1;
}

procedure Fast_multiply_a1b(
        inline offset:nat,
        ghost dst_b:buffer64,
        ghost inA_b:buffer64,
        ghost inB_b:buffer64)
    {:quick}
    {:options z3rlimit(2000)}
    lets
        dst_ptr @= rdi;
        inA_ptr @= rsi;
        inB_ptr @= rcx;

        a0 := buffer64_read(inA_b, 0 + offset, heap0);
        a1 := buffer64_read(inA_b, 1 + offset, heap0);
        a2 := buffer64_read(inA_b, 2 + offset, heap0);
        a3 := buffer64_read(inA_b, 3 + offset, heap0);

        b0 := buffer64_read(inB_b, 0 + offset, heap0);
        b1 := buffer64_read(inB_b, 1 + offset, heap0);
        b2 := buffer64_read(inB_b, 2 + offset, heap0);
        b3 := buffer64_read(inB_b, 3 + offset, heap0);

        a := a0 + pow2_64 * a1 + pow2_128 * a2 + pow2_192 * a3;
        b := b0 + pow2_64 * b1 + pow2_128 * b2 + pow2_192 * b3;
    reads
        dst_ptr; inA_ptr; inB_ptr; memLayout;

    modifies
        rax; rdx; r8; r9; r10; r11; rbx; r13; r14;
        heap0; efl;

    requires
        adx_enabled && bmi2_enabled;
        offset == 0 || offset == 4;
        buffers_disjoint(dst_b, inA_b);
        buffers_disjoint(dst_b, inB_b);

        validDstAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret);
        validSrcAddrs64(heap0, inA_ptr, inA_b, 4 + offset, memLayout, Secret);
        validSrcAddrs64(heap0, inB_ptr, inB_b, 4 + offset, memLayout, Secret);

        rax < pow2_64 - 1;

        let d0 := buffer64_read(dst_b, 0 + offset*2, heap0);
        let d1 := buffer64_read(dst_b, 1 + offset*2, heap0);
        let a0b := d0 + pow2_64 * d1 + pow2_128 * rbx + pow2_192 * r14 + pow2_256 * rax;
        a0b == a0 * b;

    ensures
        let d0 := buffer64_read(dst_b, 0 + offset*2, heap0);
        let d1 := buffer64_read(dst_b, 1 + offset*2, heap0);
        let d2 := buffer64_read(dst_b, 2 + offset*2, heap0);
        pow2_two(a0, a1) * b == pow2_six(d0, d1, d2, rbx, r14, rax);

        validSrcAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret);
        modifies_buffer_specific(dst_b, old(heap0), heap0, 0 + offset*2, 8 + offset*2 - 1);
{
    Comment("Compute src1[1] * src2");
    xor_lemmas();

    let a0b_0 := buffer64_read(dst_b, 0 + offset*2, heap0);
    let a0b_1 := buffer64_read(dst_b, 1 + offset*2, heap0);
    let a0b_2 := rbx;
    let a0b_3 := r14;
    let a0b_4 := rax;
    let a0b := pow2_five(a0b_0, a0b_1, a0b_2, a0b_3, a0b_4);
//
//    "movq  8(A), %%rdx; " /* A[1] */
//    "mulx   (B),  %%r8,  %%r9; " /* A[1]*B[0] */    "xorl %%r10d, %%r10d ;"  "adcx 8(dst),  %%r8 ;"    "movq  %%r8,  8(dst) ;"
//    "mulx  8(B), %%r10, %%r11; " /* A[1]*B[1] */    "adox  %%r9, %%r10 ;"    "adcx %%rbx, %%r10 ;"    "movq %%r10, 16(dst) ;"
//    "mulx 16(B), %%rbx, %%r13; " /* A[1]*B[2] */    "adox %%r11, %%rbx ;"    "adcx %%r14, %%rbx ;"                              "movq $0, %%r8  ;"
//    "mulx 24(B), %%r14, %%rdx; " /* A[1]*B[3] */    "adox %%r13, %%r14 ;"    "adcx %%rax, %%r14 ;"                              "movq $0, %%rax ;"
//    /*******************************************/   "adox %%rdx, %%rax ;"    "adcx  %%r8, %%rax ;"

    Load64_buffer(heap0, rdx, inA_ptr,  8 + offset*8, Secret, inA_b, 1 + offset);     /* A[1] */

    NoNewline();
    Mulx64(r9, r8, Mem64(heap0, inB_ptr, 0 + offset * 8, inB_b, 0 + offset, Secret));  /* A[1]*B[0] */ lemma_prod_bounds(r9, r8, a1, b0);
    Space(3);
    Xor64(r10, r10);  let a1b_0 := r8;
    Space(1);
    Adcx64Wrap(r8, Mem64(heap0, dst_ptr, 8 + offset * 16, dst_b, 1 + offset*2, Secret));
    Store64_buffer(heap0, dst_ptr, r8, 8 + offset * 16, Secret, dst_b, 1 + offset*2);  // REVIEW: Why not combine the Adcx with the Store?

    NoNewline();
    Mulx64(r11, r10, Mem64(heap0, inB_ptr, 8 + offset * 8, inB_b, 1 + offset, Secret));  /* A[1]*B[1] */ lemma_prod_bounds(r11, r10, a1, b1);  lemma_overflow(r11, r10, r9, bool_bit(overflow(efl)));
    assert valid_of(efl); // OBSERVE
    Space(1);
    Adox64Wrap(r10,  r9);  let a1b_1 := r10;      // At this point, overflow = 0, r9 < 2^64 - 1, and (r10 is <= 1 (so no new overflow), or r11 < 2^64-2)
    Space(1);
    Adcx64Wrap(r10, rbx);
    Store64_buffer(heap0, dst_ptr, r10, 16 + offset*16, Secret, dst_b, 2 + offset*2);

    NoNewline();
    Mulx64(r13, rbx, Mem64(heap0, inB_ptr, 16 + offset * 8, inB_b, 2 + offset, Secret));  /* A[1]*B[2] */ lemma_prod_bounds(r13, rbx, a1, b2);  lemma_overflow(r13, rbx, r11, bool_bit(overflow(efl)));
    NoNewline();
    Adox64Wrap(rbx, r11);  let a1b_2 := rbx;      // At this point, either r13 < 2^64-2 or (rbx <= 1, and (overflow = 0 or r11 < 2^64 - 2)).  RHS of the first or means no overflow
    NoNewline();
    Adcx64Wrap(rbx, r14);
    Mov64( r8, 0);

    NoNewline();
    Mulx64(rdx, r14, Mem64(heap0, inB_ptr, 24 + offset * 8, inB_b, 3 + offset, Secret));  /* A[1]*B[3] */ lemma_prod_bounds(rdx, r14, a1, b3);  lemma_overflow(rdx, r14, r13, bool_bit(overflow(efl)));
    let old_carry    := bool_bit(cf(efl));
    let old_overflow := bool_bit(overflow(efl));
    NoNewline();
    Adox64Wrap(r14, r13);  let a1b_3 := r14;      // If rdx = 2^64 - 1, then we know r14 <= 1.  If r13 < 2^64-2, then no new overflow.  Else there wasn't any previous overflow, so r13 can absorb the 1 from r14 without overflowing.
    NoNewline();
    Adcx64Wrap(r14, rax);
    Mov64(rax, 0);

    /*******************************************/  Space(35); Adox64Wrap(rax, rdx);  let a1b_4 := rax;  let overflow_bit := bool_bit(overflow(efl));  Adcx64Wrap(rax, r8);

    assert overflow_bit == 0;

    let carry_bit := bool_bit(cf(efl));
    assert carry_bit == 0;

    let d0 := buffer64_read(dst_b, 0 + offset*2, heap0);
    let d1 := buffer64_read(dst_b, 1 + offset*2, heap0);
    let d2 := buffer64_read(dst_b, 2 + offset*2, heap0);

    let a1b := pow2_five(a1b_0, a1b_1, a1b_2, a1b_3, a1b_4);
    let a0a1b := pow2_seven(d0, d1, d2, rbx, r14, rax, carry_bit);

    assert_by_tactic(a1 * b == 0 + pow2_four(a1 * b0, a1 * b1, a1 * b2, a1 * b3), int_canon);   // PASSES
    assert a1b == a1 * b;  // PASSES

    lemma_sum_a1b(
              a0, a1,
              a0b, a0b_0, a0b_1, a0b_2, a0b_3, a0b_4,
              a1b, a1b_0, a1b_1, a1b_2, a1b_3, a1b_4,
              b, b0, b1, b2, b3,
              d1, d2, rbx, r14, rax,
              carry_bit);
    assert pow2_two(a0, a1) * b == a0a1b;   // Conclusion from the lemma
}

procedure Fast_multiply_a2b(
        inline offset:nat,
        ghost dst_b:buffer64,
        ghost inA_b:buffer64,
        ghost inB_b:buffer64)
    {:quick}
    {:options z3rlimit(800)}
    lets
        dst_ptr @= rdi;
        inA_ptr @= rsi;
        inB_ptr @= rcx;

        a0 := buffer64_read(inA_b, 0 + offset, heap0);
        a1 := buffer64_read(inA_b, 1 + offset, heap0);
        a2 := buffer64_read(inA_b, 2 + offset, heap0);
        a3 := buffer64_read(inA_b, 3 + offset, heap0);

        b0 := buffer64_read(inB_b, 0 + offset, heap0);
        b1 := buffer64_read(inB_b, 1 + offset, heap0);
        b2 := buffer64_read(inB_b, 2 + offset, heap0);
        b3 := buffer64_read(inB_b, 3 + offset, heap0);

        a := a0 + pow2_64 * a1 + pow2_128 * a2 + pow2_192 * a3;
        b := b0 + pow2_64 * b1 + pow2_128 * b2 + pow2_192 * b3;
    reads
        dst_ptr; inA_ptr; inB_ptr; memLayout;

    modifies
        rax; rdx; r8; r9; r10; r11; rbx; r13; r14;
        heap0; efl;

    requires
        adx_enabled && bmi2_enabled;
        offset == 0 || offset == 4;
        buffers_disjoint(dst_b, inA_b);
        buffers_disjoint(dst_b, inB_b);

        validDstAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret);
        validSrcAddrs64(heap0, inA_ptr, inA_b, 4 + offset, memLayout, Secret);
        validSrcAddrs64(heap0, inB_ptr, inB_b, 4 + offset, memLayout, Secret);

        let d0 := buffer64_read(dst_b, 0 + offset*2, heap0);
        let d1 := buffer64_read(dst_b, 1 + offset*2, heap0);
        let d2 := buffer64_read(dst_b, 2 + offset*2, heap0);
        mul_nats(pow2_two(a0, a1), b) == pow2_six(d0, d1, d2, rbx, r14, rax);
    ensures
        let d0 := buffer64_read(dst_b, 0 + offset*2, heap0);
        let d1 := buffer64_read(dst_b, 1 + offset*2, heap0);
        let d2 := buffer64_read(dst_b, 2 + offset*2, heap0);
        let d3 := buffer64_read(dst_b, 3 + offset*2, heap0);
        pow2_three(a0, a1, a2) * b == pow2_seven(d0, d1, d2, d3, rbx, r14, rax);

        validSrcAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret);
        modifies_buffer_specific(dst_b, old(heap0), heap0, 0 + offset*2, 8 + offset*2 - 1);
{
    Comment("Compute src1[2] * src2");
    xor_lemmas();

    let a0a1b_0 := buffer64_read(dst_b, 0 + offset*2, heap0);
    let a0a1b_1 := buffer64_read(dst_b, 1 + offset*2, heap0);
    let a0a1b_2 := buffer64_read(dst_b, 2 + offset*2, heap0);
    let a0a1b_3 := rbx;
    let a0a1b_4 := r14;
    let a0a1b_5 := rax;
    let a0a1b := pow2_six(a0a1b_0, a0a1b_1, a0a1b_2, a0a1b_3, a0a1b_4, a0a1b_5);

//
//    "movq 16(A), %%rdx; " /* A[2] */
//    "mulx   (B),  %%r8,  %%r9; " /* A[2]*B[0] */    "xorl %%r10d, %%r10d ;"  "adcx 16(dst), %%r8 ;"    "movq  %%r8, 16(dst) ;"
//    "mulx  8(B), %%r10, %%r11; " /* A[2]*B[1] */    "adox  %%r9, %%r10 ;"    "adcx %%rbx, %%r10 ;"    "movq %%r10, 24(dst) ;"
//    "mulx 16(B), %%rbx, %%r13; " /* A[2]*B[2] */    "adox %%r11, %%rbx ;"    "adcx %%r14, %%rbx ;"                              "movq $0, %%r8  ;"
//    "mulx 24(B), %%r14, %%rdx; " /* A[2]*B[3] */    "adox %%r13, %%r14 ;"    "adcx %%rax, %%r14 ;"                              "movq $0, %%rax ;"
//    /*******************************************/   "adox %%rdx, %%rax ;"    "adcx  %%r8, %%rax ;"

    Load64_buffer(heap0, rdx, inA_ptr,  16 + offset*8, Secret, inA_b, 2 + offset);     /* A[2] */

    NoNewline();
    Mulx64(r9, r8, Mem64(heap0, inB_ptr, 0 + offset * 8, inB_b, 0 + offset, Secret));  /* A[2]*B[0] */ lemma_prod_bounds(r9, r8, a2, b0);
    Space(3);
    Xor64(r10, r10);  let a2b_0 := r8;
    NoNewline();
    Adcx64Wrap(r8, Mem64(heap0, dst_ptr, 16 + offset * 16, dst_b, 2 + offset * 2, Secret));
    Store64_buffer(heap0, dst_ptr, r8,  16 + offset*16, Secret, dst_b, 2 + offset*2);  // REVIEW: Why not combine the Adcx with the Store?

    NoNewline();
    Mulx64(r11, r10, Mem64(heap0, inB_ptr, 8 + offset * 8, inB_b, 1 + offset, Secret));  /* A[2]*B[1] */ lemma_prod_bounds(r11, r10, a2, b1);  lemma_overflow(r11, r10, r9, bool_bit(overflow(efl)));
    Space(1);
    Adox64Wrap(r10,  r9);  let a2b_1 := r10;
    Space(1);
    Adcx64Wrap(r10, rbx);
    Store64_buffer(heap0, dst_ptr, r10, 24 + offset*16, Secret, dst_b, 3 + offset*2);

    NoNewline();
    Mulx64(r13, rbx, Mem64(heap0, inB_ptr, 16 + offset * 8, inB_b, 2 + offset, Secret));  /* A[2]*B[2] */ lemma_prod_bounds(r13, rbx, a2, b2);  lemma_overflow(r13, rbx, r11, bool_bit(overflow(efl)));
    NoNewline();
    Adox64Wrap(rbx, r11);  let a2b_2 := rbx;
    NoNewline();
    Adcx64Wrap(rbx, r14);
    Mov64( r8, 0);

    NoNewline();
    Mulx64(rdx, r14, Mem64(heap0, inB_ptr, 24 + offset * 8, inB_b, 3 + offset, Secret));  /* A[2]*B[3] */ lemma_prod_bounds(rdx, r14, a2, b3);  lemma_overflow(rdx, r14, r13, bool_bit(overflow(efl)));
    let old_carry    := bool_bit(cf(efl));
    let old_overflow := bool_bit(overflow(efl));
    NoNewline();
    Adox64Wrap(r14, r13);  let a2b_3 := r14;
    NoNewline();
    Adcx64Wrap(r14, rax);
    Mov64(rax, 0);

    /*******************************************/ Space(35);  Adox64Wrap(rax, rdx);  
    let a2b_4 := rax;    Adcx64Wrap(rax, r8);

    let carry_bit    := bool_bit(cf(efl));
    let overflow_bit := bool_bit(overflow(efl));

    assert overflow_bit == 0;
    assert carry_bit == 0;

    let d0 := buffer64_read(dst_b, 0 + offset*2, heap0);
    let d1 := buffer64_read(dst_b, 1 + offset*2, heap0);
    let d2 := buffer64_read(dst_b, 2 + offset*2, heap0);
    let d3 := buffer64_read(dst_b, 3 + offset*2, heap0);

    let a2b := pow2_five(a2b_0, a2b_1, a2b_2, a2b_3, a2b_4);
    let a0a1a2b := pow2_seven(d0, d1, d2, d3, rbx, r14, rax);

    assert_by_tactic(a2 * b == 0 + pow2_four(a2 * b0, a2 * b1, a2 * b2, a2 * b3), int_canon);   // PASSES
    //assert a2b == a2 * b;  // PASSES

    lemma_sum_a2b(a0, a1, a2,
              a0a1b, a0a1b_0, a0a1b_1, a0a1b_2, a0a1b_3, a0a1b_4, a0a1b_5,
              a2b, a2b_0, a2b_1, a2b_2, a2b_3, a2b_4,
              b, b0, b1, b2, b3,
              d2, d3, rbx, r14, rax);
    //assert pow2_three(a0, a1, a2) * b == a0a1a2b;   // Conclusion from the lemma.  // PASSES
//    assert d0 == a0a1b_0;       // PASSES
//    assert d1 == a0a1b_1;       // PASSES
//    assert pow2_three(a0, a1, a2) * b == pow2_seven(d0, d1, d2, d3, rbx, r14, rax);
}

procedure Fast_multiply_a3b(
        inline offset:nat,
        ghost dst_b:buffer64,
        ghost inA_b:buffer64,
        ghost inB_b:buffer64)
    {:quick}
    {:options z3rlimit(800)}
    lets
        dst_ptr @= rdi;
        inA_ptr @= rsi;
        inB_ptr @= rcx;

        a0 := buffer64_read(inA_b, 0 + offset, heap0);
        a1 := buffer64_read(inA_b, 1 + offset, heap0);
        a2 := buffer64_read(inA_b, 2 + offset, heap0);
        a3 := buffer64_read(inA_b, 3 + offset, heap0);

        b0 := buffer64_read(inB_b, 0 + offset, heap0);
        b1 := buffer64_read(inB_b, 1 + offset, heap0);
        b2 := buffer64_read(inB_b, 2 + offset, heap0);
        b3 := buffer64_read(inB_b, 3 + offset, heap0);

        a := a0 + pow2_64 * a1 + pow2_128 * a2 + pow2_192 * a3;
        b := b0 + pow2_64 * b1 + pow2_128 * b2 + pow2_192 * b3;
    reads
        dst_ptr; inA_ptr; inB_ptr; memLayout;

    modifies
        rax; rdx; r8; r9; r10; r11; rbx; r13; r14;
        heap0; efl;

    requires
        adx_enabled && bmi2_enabled;
        offset == 0 || offset == 4;
        buffers_disjoint(dst_b, inA_b);
        buffers_disjoint(dst_b, inB_b);

        validDstAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret);
        validSrcAddrs64(heap0, inA_ptr, inA_b, 4 + offset, memLayout, Secret);
        validSrcAddrs64(heap0, inB_ptr, inB_b, 4 + offset, memLayout, Secret);

        let d0 := buffer64_read(dst_b, 0 + offset*2, heap0);
        let d1 := buffer64_read(dst_b, 1 + offset*2, heap0);
        let d2 := buffer64_read(dst_b, 2 + offset*2, heap0);
        let d3 := buffer64_read(dst_b, 3 + offset*2, heap0);
        pow2_three(a0, a1, a2) * b == pow2_seven(d0, d1, d2, d3, rbx, r14, rax);
    ensures
        let d0 := buffer64_read(dst_b, 0 + offset*2, heap0);
        let d1 := buffer64_read(dst_b, 1 + offset*2, heap0);
        let d2 := buffer64_read(dst_b, 2 + offset*2, heap0);
        let d3 := buffer64_read(dst_b, 3 + offset*2, heap0);
        let d4 := buffer64_read(dst_b, 4 + offset*2, heap0);
        let d5 := buffer64_read(dst_b, 5 + offset*2, heap0);
        let d6 := buffer64_read(dst_b, 6 + offset*2, heap0);
        let d7 := buffer64_read(dst_b, 7 + offset*2, heap0);
        pow2_four(a0, a1, a2, a3) * b == pow2_eight(d0, d1, d2, d3, d4, d5, d6, d7);

        validSrcAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret);
        modifies_buffer_specific(dst_b, old(heap0), heap0, 0 + offset*2, 8 + offset*2 - 1);
{
    Comment("Compute src1[3] * src2");
    xor_lemmas();

    let a0a1a2b_0 := buffer64_read(dst_b, 0 + offset*2, heap0);
    let a0a1a2b_1 := buffer64_read(dst_b, 1 + offset*2, heap0);
    let a0a1a2b_2 := buffer64_read(dst_b, 2 + offset*2, heap0);
    let a0a1a2b_3 := buffer64_read(dst_b, 3 + offset*2, heap0);
    let a0a1a2b_4 := rbx;
    let a0a1a2b_5 := r14;
    let a0a1a2b_6 := rax;
    let a0a1a2b := pow2_seven(a0a1a2b_0, a0a1a2b_1, a0a1a2b_2, a0a1a2b_3, a0a1a2b_4, a0a1a2b_5, a0a1a2b_6);

//
//    "movq 24(A), %%rdx; " /* A[3] */
//    "mulx   (B),  %%r8,  %%r9; " /* A[3]*B[0] */    "xorl %%r10d, %%r10d ;"  "adcx 24(dst), %%r8 ;"   "movq  %%r8, 24(dst) ;"
//    "mulx  8(B), %%r10, %%r11; " /* A[3]*B[1] */    "adox  %%r9, %%r10 ;"    "adcx %%rbx, %%r10 ;"    "movq %%r10, 32(dst) ;"
//    "mulx 16(B), %%rbx, %%r13; " /* A[3]*B[2] */    "adox %%r11, %%rbx ;"    "adcx %%r14, %%rbx ;"    "movq %%rbx, 40(dst) ;"    "movq $0, %%r8  ;"
//    "mulx 24(B), %%r14, %%rdx; " /* A[3]*B[3] */    "adox %%r13, %%r14 ;"    "adcx %%rax, %%r14 ;"    "movq %%r14, 48(dst) ;"    "movq $0, %%rax ;"
//    /*******************************************/   "adox %%rdx, %%rax ;"    "adcx  %%r8, %%rax ;"    "movq %%rax, 56(dst) ;"

    Load64_buffer(heap0, rdx, inA_ptr,  24 + offset*8, Secret, inA_b, 3 + offset);     /* A[3] */

    NoNewline();
    Mulx64(r9, r8, Mem64(heap0, inB_ptr, 0 + offset * 8, inB_b, 0 + offset, Secret));  /* A[3]*B[0] */ lemma_prod_bounds(r9, r8, a3, b0);
    Space(3);
    Xor64(r10, r10);  let a3b_0 := r8;
    NoNewline();
    Adcx64Wrap(r8, Mem64(heap0, dst_ptr, 24 + offset * 16, dst_b, 3 + offset * 2, Secret));
    Store64_buffer(heap0, dst_ptr, r8, 24 + offset * 16, Secret, dst_b, 3 + offset * 2);  // REVIEW: Why not combine the Adcx with the Store?

    NoNewline();
    Mulx64(r11, r10, Mem64(heap0, inB_ptr, 8 + offset * 8, inB_b, 1 + offset, Secret));  /* A[3]*B[1] */ lemma_prod_bounds(r11, r10, a3, b1);  lemma_overflow(r11, r10, r9, bool_bit(overflow(efl)));
    Space(1);
    Adox64Wrap(r10,  r9);  let a3b_1 := r10;
    Space(1);
    Adcx64Wrap(r10, rbx);
    Store64_buffer(heap0, dst_ptr, r10, 32 + offset*16, Secret, dst_b, 4 + offset*2);

    NoNewline(); 
    Mulx64(r13, rbx, Mem64(heap0, inB_ptr, 16 + offset * 8, inB_b, 2 + offset, Secret));  /* A[3]*B[2] */ lemma_prod_bounds(r13, rbx, a3, b2);  lemma_overflow(r13, rbx, r11, bool_bit(overflow(efl)));
    NoNewline();
    Adox64Wrap(rbx, r11);  let a3b_2 := rbx;
    NoNewline();
    Adcx64Wrap(rbx, r14);
    NoNewline();
    Store64_buffer(heap0, dst_ptr, rbx, 40 + offset * 16, Secret, dst_b, 5 + offset * 2);
    Mov64( r8, 0);

    NoNewline();
    Mulx64(rdx, r14, Mem64(heap0, inB_ptr, 24 + offset * 8, inB_b, 3 + offset, Secret));  /* A[3]*B[3] */ lemma_prod_bounds(rdx, r14, a3, b3);  lemma_overflow(rdx, r14, r13, bool_bit(overflow(efl)));
    let old_carry    := bool_bit(cf(efl));
    let old_overflow := bool_bit(overflow(efl));
    NoNewline();
    Adox64Wrap(r14, r13);  let a3b_3 := r14;
    NoNewline();
    Adcx64Wrap(r14, rax);
    NoNewline();
    Store64_buffer(heap0, dst_ptr, r14, 48 + offset * 16, Secret, dst_b, 6 + offset * 2);
    Mov64(rax, 0);

    /*******************************************/  Space(35); Adox64Wrap(rax, rdx);  
    NoNewline();
    let a3b_4 := rax;    Adcx64Wrap(rax, r8);
    Space(1);
    Store64_buffer(heap0, dst_ptr, rax, 56 + offset * 16, Secret, dst_b, 7 + offset * 2);

    let carry_bit    := bool_bit(cf(efl));
    let overflow_bit := bool_bit(overflow(efl));

    assert overflow_bit == 0;
    assert carry_bit == 0;

    let d0 := buffer64_read(dst_b, 0 + offset*2, heap0);
    let d1 := buffer64_read(dst_b, 1 + offset*2, heap0);
    let d2 := buffer64_read(dst_b, 2 + offset*2, heap0);
    let d3 := buffer64_read(dst_b, 3 + offset*2, heap0);
    let d4 := buffer64_read(dst_b, 4 + offset*2, heap0);
    let d5 := buffer64_read(dst_b, 5 + offset*2, heap0);
    let d6 := buffer64_read(dst_b, 6 + offset*2, heap0);
    let d7 := buffer64_read(dst_b, 7 + offset*2, heap0);

    let a3b := pow2_five(a3b_0, a3b_1, a3b_2, a3b_3, a3b_4);
    let a0a1a2a3b := pow2_eight(d0, d1, d2, d3, d4, d5, d6, d7);

    assert_by_tactic(a3 * b == 0 + pow2_four(a3 * b0, a3 * b1, a3 * b2, a3 * b3), int_canon);   // PASSES
    assert a3b == a3 * b;  // PASSES

    lemma_sum_a3b(a0, a1, a2, a3,
              a0a1a2b, a0a1a2b_0, a0a1a2b_1, a0a1a2b_2, a0a1a2b_3, a0a1a2b_4, a0a1a2b_5, a0a1a2b_6,
              a3b, a3b_0, a3b_1, a3b_2, a3b_3, a3b_4,
              b, b0, b1, b2, b3,
              d3, d4, d5, d6, d7);
//    assert pow2_four(a0, a1, a2, a3) * b == a0a1a2a3b;   // Conclusion from the lemma.  // PASSES
}

procedure Fast_multiply(
        inline offset:nat,
        ghost dst_b:buffer64,
        ghost inA_b:buffer64,
        ghost inB_b:buffer64)
    {:public}
    {:quick}
    lets
        dst_ptr @= rdi;
        inA_ptr @= rsi;
        inB_ptr @= rcx;

        a0 := buffer64_read(inA_b, 0 + offset, heap0);
        a1 := buffer64_read(inA_b, 1 + offset, heap0);
        a2 := buffer64_read(inA_b, 2 + offset, heap0);
        a3 := buffer64_read(inA_b, 3 + offset, heap0);

        b0 := buffer64_read(inB_b, 0 + offset, heap0);
        b1 := buffer64_read(inB_b, 1 + offset, heap0);
        b2 := buffer64_read(inB_b, 2 + offset, heap0);
        b3 := buffer64_read(inB_b, 3 + offset, heap0);

        a := pow2_four(a0, a1, a2, a3);
        b := pow2_four(b0, b1, b2, b3);
    reads
        dst_ptr; inA_ptr; inB_ptr; memLayout;

    modifies
        rax; rdx; r8; r9; r10; r11; rbx; r13; r14;
        heap0; efl;

    requires
        adx_enabled && bmi2_enabled;
        offset == 0 || offset == 4;
        buffers_disjoint(dst_b, inA_b);
        buffers_disjoint(dst_b, inB_b);

        validDstAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret);
        validSrcAddrs64(heap0, inA_ptr, inA_b, 4 + offset, memLayout, Secret);
        validSrcAddrs64(heap0, inB_ptr, inB_b, 4 + offset, memLayout, Secret);

    ensures
        let d0 := buffer64_read(dst_b, 0 + offset * 2, heap0);
        let d1 := buffer64_read(dst_b, 1 + offset * 2, heap0);
        let d2 := buffer64_read(dst_b, 2 + offset * 2, heap0);
        let d3 := buffer64_read(dst_b, 3 + offset * 2, heap0);
        let d4 := buffer64_read(dst_b, 4 + offset * 2, heap0);
        let d5 := buffer64_read(dst_b, 5 + offset * 2, heap0);
        let d6 := buffer64_read(dst_b, 6 + offset * 2, heap0);
        let d7 := buffer64_read(dst_b, 7 + offset * 2, heap0);
        let d := pow2_eight(d0, d1, d2, d3, d4, d5, d6, d7);
        d == a * b;

        validSrcAddrs64(heap0, dst_ptr, dst_b, 8 + offset*2, memLayout, Secret);
        modifies_buffer_specific(dst_b, old(heap0), heap0, 0 + offset*2, 8 + offset*2 - 1);
{
    Fast_multiply_a0b(offset, dst_b, inA_b, inB_b);
    Newline();
    Newline();
    Fast_multiply_a1b(offset, dst_b, inA_b, inB_b);
    Newline();
    Newline();
    Fast_multiply_a2b(offset, dst_b, inA_b, inB_b);
    Newline();
    Newline();
    Fast_multiply_a3b(offset, dst_b, inA_b, inB_b);
    Newline();
}

procedure Fast_mul_stdcall(
        inline win:bool,
        ghost dst_b:buffer64,
        ghost inA_b:buffer64,
        ghost inB_b:buffer64)
    {:public}
    {:quick}
    lets
        dst_ptr @= rdi; inA_ptr @= rsi; inB_ptr @= rcx;
        dst_in := (if win then rcx else rdi);
        inA_in := (if win then rdx else rsi);
        inB_in := (if win then r8 else rdx);
    reads memLayout;
    modifies
        rax; rbx; rcx; rdx; rdi; rsi; r8; r9; r10; r11; r13; r14;
        rsp; efl; heap0; stack; stackTaint;
    requires
        rsp == init_rsp(stack);
        adx_enabled && bmi2_enabled;
        buffers_disjoint(dst_b, inA_b);
        buffers_disjoint(dst_b, inB_b);

        validDstAddrs64(heap0, dst_in, dst_b, 8, memLayout, Secret);
        validSrcAddrs64(heap0, inA_in, inA_b, 4, memLayout, Secret);
        validSrcAddrs64(heap0, inB_in, inB_b, 4, memLayout, Secret);
    ensures
        let a0 := old(buffer64_read(inA_b, 0, heap0));
        let a1 := old(buffer64_read(inA_b, 1, heap0));
        let a2 := old(buffer64_read(inA_b, 2, heap0));
        let a3 := old(buffer64_read(inA_b, 3, heap0));

        let b0 := old(buffer64_read(inB_b, 0, heap0));
        let b1 := old(buffer64_read(inB_b, 1, heap0));
        let b2 := old(buffer64_read(inB_b, 2, heap0));
        let b3 := old(buffer64_read(inB_b, 3, heap0));

        let d0 := buffer64_read(dst_b, 0, heap0);
        let d1 := buffer64_read(dst_b, 1, heap0);
        let d2 := buffer64_read(dst_b, 2, heap0);
        let d3 := buffer64_read(dst_b, 3, heap0);
        let d4 := buffer64_read(dst_b, 4, heap0);
        let d5 := buffer64_read(dst_b, 5, heap0);
        let d6 := buffer64_read(dst_b, 6, heap0);
        let d7 := buffer64_read(dst_b, 7, heap0);

        let a := a0 + pow2_64 * a1 + pow2_128 * a2 + pow2_192 * a3;
        let b := b0 + pow2_64 * b1 + pow2_128 * b2 + pow2_192 * b3;

        let d := d0 + pow2_64 * d1 + pow2_128 * d2 + pow2_192 * d3 +
                 pow2_256 * d4 + pow2_320 * d5 + pow2_384 * d6 + pow2_448 * d7;
        d == a * b;

        //////////////////////////////////////
        //   Framing
        //////////////////////////////////////

        modifies_buffer(dst_b, old(heap0), heap0);
        validSrcAddrs64(heap0, dst_in, dst_b, 8, memLayout, Secret);

        rsi == old(rsi);
        r13 == old(r13);
        r14 == old(r14);
        rsp == old(rsp);
{
    // Store callee-save registers
    Push(rsi);
    Push(r13);
    Push(r14);

    // Line up the rest of the arguments
    inline if (win)
    {
        Mov64(dst_ptr, rcx);
        Mov64(inA_ptr, rdx);
        Mov64(inB_ptr, r8);
    }
    else
    {
        Mov64(inB_ptr, rdx);
    }

    Fast_multiply(0, dst_b, inA_b, inB_b);

    Pop(r14);
    Pop(r13);
    Pop(rsi);
}

procedure Fast_mul2_stdcall(
        inline win:bool,
        ghost dst_b:buffer64,
        ghost inA_b:buffer64,
        ghost inB_b:buffer64)
    {:public}
    {:quick}
    lets
        dst_ptr @= rdi; inA_ptr @= rsi; inB_ptr @= rcx;
        dst_in := (if win then rcx else rdi);
        inA_in := (if win then rdx else rsi);
        inB_in := (if win then r8 else rdx);
    reads memLayout;
    modifies
        rax; rbx; rcx; rdx; rdi; rsi; r8; r9; r10; r11; r13; r14;
        rsp; efl; heap0; stack; stackTaint;
    requires
        rsp == init_rsp(stack);
        adx_enabled && bmi2_enabled;
        buffers_disjoint(dst_b, inA_b);
        buffers_disjoint(dst_b, inB_b);

        validDstAddrs64(heap0, dst_in, dst_b, 16, memLayout, Secret);
        validSrcAddrs64(heap0, inA_in, inA_b, 8, memLayout, Secret);
        validSrcAddrs64(heap0, inB_in, inB_b, 8, memLayout, Secret);
    ensures

        let a0 := old(buffer64_read(inA_b, 0, heap0));
        let a1 := old(buffer64_read(inA_b, 1, heap0));
        let a2 := old(buffer64_read(inA_b, 2, heap0));
        let a3 := old(buffer64_read(inA_b, 3, heap0));

        let b0 := old(buffer64_read(inB_b, 0, heap0));
        let b1 := old(buffer64_read(inB_b, 1, heap0));
        let b2 := old(buffer64_read(inB_b, 2, heap0));
        let b3 := old(buffer64_read(inB_b, 3, heap0));

        let a := pow2_four(a0, a1, a2, a3);
        let b := pow2_four(b0, b1, b2, b3);

        let a0' := old(buffer64_read(inA_b, 0 + 4, heap0));
        let a1' := old(buffer64_read(inA_b, 1 + 4, heap0));
        let a2' := old(buffer64_read(inA_b, 2 + 4, heap0));
        let a3' := old(buffer64_read(inA_b, 3 + 4, heap0));

        let b0' := old(buffer64_read(inB_b, 0 + 4, heap0));
        let b1' := old(buffer64_read(inB_b, 1 + 4, heap0));
        let b2' := old(buffer64_read(inB_b, 2 + 4, heap0));
        let b3' := old(buffer64_read(inB_b, 3 + 4, heap0));

        let a' := pow2_four(a0', a1', a2', a3');
        let b' := pow2_four(b0', b1', b2', b3');

        let d0 := buffer64_read(dst_b, 0, heap0);
        let d1 := buffer64_read(dst_b, 1, heap0);
        let d2 := buffer64_read(dst_b, 2, heap0);
        let d3 := buffer64_read(dst_b, 3, heap0);
        let d4 := buffer64_read(dst_b, 4, heap0);
        let d5 := buffer64_read(dst_b, 5, heap0);
        let d6 := buffer64_read(dst_b, 6, heap0);
        let d7 := buffer64_read(dst_b, 7, heap0);

        let d := pow2_eight(d0, d1, d2, d3, d4, d5, d6, d7);

        let d0' := buffer64_read(dst_b, 0 + 8, heap0);
        let d1' := buffer64_read(dst_b, 1 + 8, heap0);
        let d2' := buffer64_read(dst_b, 2 + 8, heap0);
        let d3' := buffer64_read(dst_b, 3 + 8, heap0);
        let d4' := buffer64_read(dst_b, 4 + 8, heap0);
        let d5' := buffer64_read(dst_b, 5 + 8, heap0);
        let d6' := buffer64_read(dst_b, 6 + 8, heap0);
        let d7' := buffer64_read(dst_b, 7 + 8, heap0);

        let d' := pow2_eight(d0', d1', d2', d3', d4', d5', d6', d7');

        d == a * b;
        d' == a' * b';

        //////////////////////////////////////
        //   Framing
        //////////////////////////////////////

        modifies_buffer(dst_b, old(heap0), heap0);
        validSrcAddrs64(heap0, dst_in, dst_b, 16, memLayout, Secret);

        rsi == old(rsi);
        r13 == old(r13);
        r14 == old(r14);
        rsp == old(rsp);
{
    // Store callee-save registers
    Push(r13);
    Push(r14);

    // Line up the rest of the arguments
    inline if (win)
    {
        Push(rsi);
        Mov64(dst_ptr, rcx);
        Mov64(inA_ptr, rdx);
        Mov64(inB_ptr, r8);
    }
    else
    {
        Mov64(inB_ptr, rdx);
    }

    Fast_multiply(0, dst_b, inA_b, inB_b);
    Fast_multiply(4, dst_b, inA_b, inB_b);

    inline if (win)
    {
        Pop(rsi);
    }
    Pop(r14);
    Pop(r13);
}