include "../../../arch/x64/Vale.X64.InsBasic.vaf" include "../../../arch/x64/Vale.X64.InsMem.vaf" include "../../../arch/x64/Vale.X64.InsVector.vaf" include "../../../arch/x64/Vale.X64.InsAes.vaf" include "../../../crypto/aes/x64/Vale.AES.X64.PolyOps.vaf" include{:fstar}{:open} "Vale.Def.Prop_s" include{:fstar}{:open} "Vale.Def.Opaque_s" include{:fstar}{:open} "Vale.Def.Words_s" include{:fstar}{:open} "Vale.Def.Types_s" include{:fstar}{:open} "FStar.Seq.Base" include{:fstar}{:open} "Vale.AES.AES_s" include{:fstar}{:open} "Vale.X64.Machine_s" include{:fstar}{:open} "Vale.X64.Memory" include{:fstar}{:open} "Vale.X64.State" include{:fstar}{:open} "Vale.X64.Decls" include{:fstar}{:open} "Vale.X64.QuickCode" include{:fstar}{:open} "Vale.X64.QuickCodes" include{:fstar}{:open} "Vale.Arch.Types" include{:fstar}{:open} "Vale.AES.AES_helpers" include{:fstar}{:open} "Vale.Poly1305.Math" include{:fstar}{:open} "Vale.AES.GCM_helpers" include{:fstar}{:open} "Vale.AES.GCTR_s" include{:fstar}{:open} "Vale.AES.GCTR" include{:fstar}{:open} "Vale.Arch.TypesNative" include{:fstar}{:open} "Vale.X64.CPU_Features_s" include{:fstar}{:open} "Vale.Math.Poly2_s" include{:fstar}{:open} "Vale.Math.Poly2" include{:fstar}{:open} "Vale.Math.Poly2.Bits_s" include{:fstar}{:open} "Vale.Math.Poly2.Bits" include{:fstar}{:open} "Vale.Math.Poly2.Words" include{:fstar}{:open} "Vale.Math.Poly2.Lemmas" include{:fstar}{:open} "Vale.AES.GF128_s" include{:fstar}{:open} "Vale.AES.GF128" include{:fstar}{:open} "Vale.AES.GHash" module Vale.AES.X64.AESopt2 #verbatim{:interface}{:implementation} open Vale.Def.Prop_s open Vale.Def.Opaque_s open Vale.Def.Words_s open Vale.Def.Types_s open FStar.Seq open Vale.Arch.Types open Vale.Arch.HeapImpl open Vale.AES.AES_s open Vale.X64.Machine_s open Vale.X64.Memory open Vale.X64.State open Vale.X64.Decls open Vale.X64.InsBasic open Vale.X64.InsMem open Vale.X64.InsVector open Vale.X64.InsAes open Vale.X64.QuickCode open Vale.X64.QuickCodes open Vale.AES.AES_helpers open Vale.Poly1305.Math // For lemma_poly_bits64() open Vale.AES.GCM_helpers open Vale.AES.GCTR_s open Vale.AES.GCTR open Vale.Arch.TypesNative open Vale.X64.CPU_Features_s open Vale.AES.X64.PolyOps open Vale.Math.Poly2_s open Vale.Math.Poly2 open Vale.Math.Poly2.Bits_s open Vale.Math.Poly2.Bits open Vale.Math.Poly2.Lemmas open Vale.AES.GF128_s open Vale.AES.GF128 open Vale.AES.GHash #reset-options "--z3rlimit 50" #endverbatim #token +. precedence + #token *. precedence * #token %. precedence * #token ~~ precedence ! function operator(+.) (a:poly, b:poly):poly := add; function operator(*.) (a:poly, b:poly):poly := mul; function operator(%.) 
#token +. precedence +
#token *. precedence *
#token %. precedence *
#token ~~ precedence !
function operator(+.) (a:poly, b:poly):poly := add;
function operator(*.) (a:poly, b:poly):poly := mul;
function operator(%.) (a:poly, b:poly):poly := mod;
function operator(~~) (a:quad32):poly := of_quad32;
function operator([]) #[a:Type(0)](s:FStar.Seq.Base.seq(a), i:int):a extern;

#verbatim{:interface}
unfold let va_subscript_FStar__Seq__Base__seq = Seq.index

let hkeys_b_powers (hkeys_b:buffer128) (heap0:vale_heap) (layout:vale_heap_layout) (ptr:int) (h:poly) =
  validSrcAddrs128 heap0 ptr hkeys_b 8 layout Secret /\
  of_quad32 (buffer128_read hkeys_b 0 heap0) == gf128_power h 1 /\
  of_quad32 (buffer128_read hkeys_b 1 heap0) == gf128_power h 2 /\
  of_quad32 (buffer128_read hkeys_b 3 heap0) == gf128_power h 3 /\
  of_quad32 (buffer128_read hkeys_b 4 heap0) == gf128_power h 4 /\
  of_quad32 (buffer128_read hkeys_b 6 heap0) == gf128_power h 5 /\
  of_quad32 (buffer128_read hkeys_b 7 heap0) == gf128_power h 6

let quad32_opt_rev (b:bool) (q:quad32) : quad32 =
  if b then reverse_bytes_quad32 q else q

let index_opt_rev (b:bool) (len n i:int) : int =
  if b then len - 1 - i else len - n + i

let scratch_b_blocks (rev_bytes:bool) (rev_blocks:bool) (scratch_in_b:buffer128) (scratch_len count:int) (heap_s:vale_heap) (data:seq quad32) =
  (forall (i:nat).{:pattern (index data i)}
    i < count /\ i < length data ==>
    buffer128_read scratch_in_b (index_opt_rev rev_blocks scratch_len (length data) i) heap_s ==
      quad32_opt_rev rev_bytes (index data i))

let scratch_b_data (rev_bytes:bool) (rev_blocks:bool) (scratch_in_b:buffer128) (scratch_len count:int) (heap_s:vale_heap) (layout:vale_heap_layout) (ptr:int) (data:seq quad32) =
  validSrcAddrs128 heap_s ptr scratch_in_b scratch_len layout Secret /\
  scratch_b_blocks rev_bytes rev_blocks scratch_in_b scratch_len count heap_s data
#endverbatim

function hkeys_b_powers(hkeys_b:buffer128, heap0:vale_heap, layout:vale_heap_layout, ptr:int, h:poly):prop extern;
function index_opt_rev(b:bool, len:int, n:int, i:int):int extern;
function quad32_opt_rev(b:bool, q:quad32):quad32 extern;
function scratch_b_data(rev_bytes:bool, rev_blocks:bool, scratch_in_b:buffer128, scratch_len:int, count:int, heap_s:vale_heap, layout:vale_heap_layout, ptr:int, data:seq(quad32)):prop extern;
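// MulAdd_step is one step of the six-step GHASH multiply-accumulate. Step m forms the
// 256-bit carryless product of block data[5 - m] (already byte-reversed, with prev
// folded in at m == 5) and gf128_power(h, m + 1) via four VPolyMul half-products, and
// adds it into the accumulator spread across (z0, Z2, Z3) so that afterwards
//   of_quad32(z0) +. shift(of_quad32(Z2), 64) +. shift(of_quad32(Z3), 128)
//     == ghash_unroll_back(h, prev, pdata, 0, 6, m).
// The xmm register playing each role rotates from step to step (see the table in the
// requires below); steps with m < 4 also prefetch the next block into Ii.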
procedure MulAdd_step(
        inline m:nat, inline power_index:int,
        inout a0:xmm, inout a1:xmm, inout a2:xmm, inout a3:xmm, inout b:xmm, inout c:xmm,
        ghost hkeys_b:buffer128,
        ghost scratch_b:buffer128,
        ghost h:poly,
        ghost prev:poly,
        ghost data:seq(quad32))
    {:public}
    {:quick exportOnly}
    lets
        Xip @= r9;
        Ii @= xmm0; Z0 @= xmm4; Z2 @= xmm6; Z3 @= xmm7; Xi @= xmm8;
        pdata := fun_seq_quad32_LE_poly128(data);
    reads
        heap0; heap3; memLayout; Xip; rbp;
    modifies
        efl; Ii; Z0; Z2; Z3; Xi;
    requires
        pclmulqdq_enabled && avx_enabled && sse_enabled;
        m < 6;
        length(data) >= 6;
        // a0   a1   a2   a3   b    c
        //-------------------------------
        // T1   Z1   Z2   Z3   Z3   Hkey
        // Z1   T1   T2   Hkey Ii   Hkey
        // T1   T2   Hkey Z1   Ii   Z1
        // T2   Hkey Z1   T1   Ii   T1
        // Hkey Z1   T1   T2   Ii   T2
        // T2   Z1   T1   Xi   Xi   Hkey
        m == 0 ==> @a0 == 1 && @a1 == 5 && @a2 == 6 && @a3 == 7 && @b == 7 && @c == 3;
        m == 1 ==> @a0 == 5 && @a1 == 1 && @a2 == 2 && @a3 == 3 && @b == 0 && @c == 3;
        m == 2 ==> @a0 == 1 && @a1 == 2 && @a2 == 3 && @a3 == 5 && @b == 0 && @c == 5;
        m == 3 ==> @a0 == 2 && @a1 == 3 && @a2 == 5 && @a3 == 1 && @b == 0 && @c == 1;
        m == 4 ==> @a0 == 3 && @a1 == 5 && @a2 == 1 && @a3 == 2 && @b == 0 && @c == 2;
        m == 5 ==> @a0 == 2 && @a1 == 5 && @a2 == 1 && @a3 == 8 && @b == 8 && @c == 3;
        validSrcAddrs128(heap0, Xip - 0x20, hkeys_b, 8, memLayout, Secret);
        validSrcAddrs128(heap3, rbp, scratch_b, 8, memLayout, Secret);
        // rbp + 0x10 * 8 <= pow2_64;
        0 <= power_index < 8;
        let z0 := if m = 1 then a1 else Z0;
        m > 0 ==> of_quad32(z0) +. shift(of_quad32(Z2), 64) +. shift(of_quad32(Z3), 128) ==
            ghash_unroll_back(h, prev, pdata, 0, 6, #nat(m - 1));
        m < 5 ==> b == reverse_bytes_quad32(data[#nat(5 - m)]);
        m == 5 ==> of_quad32(b) == prev +. of_quad32(reverse_bytes_quad32(data[#nat(5 - m)]));
        m == 0 ==> of_quad32(c) == gf128_power(h, m + 1);
        of_quad32(buffer128_read(hkeys_b, power_index, heap0)) == gf128_power(h, m + 1);
    ensures
        m < 4 ==> Ii == buffer128_read(scratch_b, 3 + m, heap3);
        let z0 := if m = 0 then a0 else Z0;
        of_quad32(z0) +. shift(of_quad32(Z2), 64) +. shift(of_quad32(Z3), 128) ==
            ghash_unroll_back(h, prev, pdata, 0, 6, m);
        m == 0 ==> of_quad32(Xi) == old(of_quad32(Xi) +. of_quad32(Z0) +. of_quad32(buffer128_read(scratch_b, 1, heap3)));
        m == 4 ==> of_quad32(Xi) == old(of_quad32(Xi) +. of_quad32(buffer128_read(scratch_b, 7, heap3)));
        0 < m < 4 ==> Xi == old(Xi);
{
    let data_i := #poly(pdata(5 - m));
    let z0 := if m = 1 then a1 else Z0;
    lemma_Mul128_accum(~~z0, ~~Z2, ~~Z3, (if m = 5 then prev +. data_i else data_i), gf128_power(h, m + 1));
    inline if (m = 0)
    {
        VPolyMul(a0, b, c, false, false);
        VPolyMul(a1, b, c, false, true);
        VPolyMul(a2, b, c, true, false);
        VPolyMul(b, b, c, true, true);
        lemma_Mul128(data_i, gf128_power(h, 1));
        lemma_add_commute(~~a1, ~~a2);
        VPolyAdd(Z2, a2, a1);
        VPolyAdd(Xi, Xi, Z0);
        VPolyAdd(Xi, Xi, Mem128(heap3, rbp, 0x10, Secret, scratch_b, 1));
    }
    else
    {
        Load128_buffer(heap0, c, Xip, 0x10 * power_index - 0x20, Secret, hkeys_b, power_index);
        VPolyMul(a0, b, c, false, false);
        inline if (m = 1)
        {
            VPolyAdd(Z0, a1, a0);
        }
        else
        {
            VPolyAdd(Z0, Z0, a0);
        }
        VPolyMul(a1, b, c, false, true);
        VPolyMul(a2, b, c, true, false);
        VPolyMul(a3, b, c, true, true);
        VPolyAdd(Z2, Z2, a1);
        VPolyAdd(Z2, Z2, a2);
        VPolyAdd(Z3, Z3, a3);
        inline if (m = 4)
        {
            VPolyAdd(Xi, Xi, Mem128(heap3, rbp, 0x70, Secret, scratch_b, 7));
        }
    }
    inline if (m < 4)
    {
        Load128_buffer(heap3, Ii, rbp, 0x30 + 0x10 * m, Secret, scratch_b, 3 + m);
    }
}
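// MulAdd_unroll_i is the parameterized form of the same step, shared by the
// byte-reversed and block-reversed variants: iteration m of n consumes block
// data[n - m - 1] and gf128_power(h, m + 1) (held in Z1) and accumulates into
// (Z0, Z2, Z3). Unless this is the last step, it also advances scratch_ptr, loads the
// next block into Ii (byte-swapping through rev_mask when !rev_bytes), and loads the
// next power of h from hkeys_b into Z1.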
procedure MulAdd_unroll_i(
        inline rev_bytes:bool,
        inline rev_blocks:bool,
        inline is_first:bool,
        inline is_last:bool,
        inline power_index:int,
        ghost scratch_len:nat64,
        ghost n:nat,
        ghost m:int,
        ghost hkeys_b:buffer128,
        ghost scratch_in_b:buffer128,
        ghost h:poly,
        ghost prev:poly,
        ghost data:seq(quad32))
    {:quick}
    lets
        Xip @= r9; scratch_ptr @= r11; in_ptr @= rdi;
        Ii @= xmm0; T1 @= xmm1; T2 @= xmm2; Hkey @= xmm3; Z0 @= xmm4; Z1 @= xmm5; Z2 @= xmm6; Z3 @= xmm7; Xi @= xmm8; rev_mask @= xmm9;
        pdata := fun_seq_quad32_LE_poly128(data);
        scratch_index := index_opt_rev(rev_blocks, scratch_len, n, n - (m + 2)); // index of m + 1 iteration
    reads
        heap0; heap1; memLayout; Xip; in_ptr; Xi; rev_mask;
    modifies
        efl; scratch_ptr; Ii; T1; T2; Hkey; Z0; Z1; Z2; Z3;
    requires
        pclmulqdq_enabled && avx_enabled && sse_enabled;
        length(data) >= n;
        0 <= m < n <= scratch_len;
        is_first <==> m == 0;
        is_last <==> m == n - 1;
        validSrcAddrs128(heap0, Xip - 0x20, hkeys_b, 8, memLayout, Secret);
        validSrcAddrs128(heap1, in_ptr, scratch_in_b, scratch_len, memLayout, Secret);
        in_ptr + 0x10 * scratch_len <= pow2_64;
        of_quad32(Xi) == prev;
        scratch_ptr == in_ptr + 0x10 * (scratch_index + (if rev_blocks then (-1) else 1));
        0 <= power_index < 8;
        !is_first ==> of_quad32(Z0) +. shift(of_quad32(Z2), 64) +. shift(of_quad32(Z3), 128) ==
            ghash_unroll_back(h, prev, pdata, 0, n, #nat(m - 1));
        Ii == reverse_bytes_quad32(data[#nat(n - m - 1)]);
        of_quad32(Z1) == gf128_power(h, #nat(m + 1));
        !rev_bytes ==> rev_mask == Mkfour(0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203);
    ensures
        !is_last ==> Ii == quad32_opt_rev(!rev_bytes, buffer128_read(scratch_in_b, scratch_index, heap1));
        !is_last ==> Z1 == buffer128_read(hkeys_b, power_index, heap0);
        !is_last ==> scratch_ptr == old(scratch_ptr) + (if rev_blocks then 0x10 else (-0x10));
        is_last ==> scratch_ptr == old(scratch_ptr);
        of_quad32(Z0) +. shift(of_quad32(Z2), 64) +. shift(of_quad32(Z3), 128) ==
            ghash_unroll_back(h, prev, pdata, 0, n, #nat(m));
{
    let data_i := #poly(pdata(n - m - 1));
    inline if (is_last)
    {
        VPolyAdd(Ii, Xi, Ii);
    }
    else if (rev_blocks)
    {
        Add64(scratch_ptr, 0x10);
    }
    else
    {
        Sub64(scratch_ptr, 0x10);
    }
    VPolyMul(T1, Ii, Z1, false, false);
    VPolyMul(T2, Ii, Z1, false, true);
    VPolyMul(Hkey, Ii, Z1, true, false);
    VPolyMul(Z1, Ii, Z1, true, true);
    lemma_Mul128_accum(~~Z0, ~~Z2, ~~Z3, (if n = m + 1 then prev +. data_i else data_i), gf128_power(h, #nat(m + 1)));
    inline if (!is_last)
    {
        Load128_buffer(heap1, Ii, scratch_ptr, 0, Secret, scratch_in_b, scratch_index);
        inline if (!rev_bytes)
        {
            Pshufb(Ii, rev_mask);
        }
    }
    inline if (is_first)
    {
        Mov128(Z0, T1);
    }
    else
    {
        VPolyAdd(Z0, Z0, T1);
    }
    inline if (!is_last)
    {
        Load128_buffer(heap0, T1, Xip, 0x10 * power_index - 0x20, Secret, hkeys_b, power_index);
    }
    inline if (is_first)
    {
        lemma_Mul128((if n = m + 1 then prev +. data_i else data_i), gf128_power(h, 1));
        VPolyAdd(Z2, T2, Hkey);
        Mov128(Z3, Z1);
    }
    else
    {
        VPolyAdd(Z2, Z2, T2);
        VPolyAdd(Z2, Z2, Hkey);
        VPolyAdd(Z3, Z3, Z1);
    }
    inline if (!is_last)
    {
        Mov128(Z1, T1);
    }
}
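// MulAdd_unroll_n chains MulAdd_unroll_i over all n = length(data) blocks. With
// exactly6 the six steps are inlined unconditionally; otherwise run-time comparisons
// on n_reg (which must equal n) decide how many middle steps execute. The power
// indices 1, 3, 4, 6, 7 follow the hkeys_b layout assumed by hkeys_b_powers, which
// skips slots 2 and 5.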
procedure MulAdd_unroll_n(
        inline rev_bytes:bool,
        inline rev_blocks:bool,
        inline exactly6:bool,
        ghost scratch_len:nat64,
        ghost hkeys_b:buffer128,
        ghost scratch_in_b:buffer128,
        ghost h:poly,
        ghost prev:poly,
        ghost data:seq(quad32))
    {:quick}
    lets
        Xip @= r9; n_reg @= rdx; scratch_ptr @= r11; in_ptr @= rdi;
        Ii @= xmm0; T1 @= xmm1; T2 @= xmm2; Hkey @= xmm3; Z0 @= xmm4; Z1 @= xmm5; Z2 @= xmm6; Z3 @= xmm7; Xi @= xmm8; rev_mask @= xmm9;
        pdata := fun_seq_quad32_LE_poly128(data);
        n := length(data);
        scratch_index := index_opt_rev(rev_blocks, scratch_len, n, n - 1);
    reads
        heap0; heap1; memLayout; Xip; in_ptr; n_reg; rev_mask;
    modifies
        efl; scratch_ptr; Ii; T1; T2; Hkey; Z0; Z1; Z2; Z3; Xi;
    requires
        pclmulqdq_enabled && avx_enabled && sse_enabled;
        exactly6 ==> n == 6;
        !exactly6 ==> 1 <= n < 6;
        n <= scratch_len;
        hkeys_b_powers(hkeys_b, heap0, memLayout, Xip - 0x20, h);
        scratch_b_data(rev_bytes, rev_blocks, scratch_in_b, scratch_len, 6, heap1, memLayout, in_ptr, data);
        in_ptr + 0x10 * scratch_len <= pow2_64;
        of_quad32(Xi) == prev;
        !exactly6 ==> n_reg == n;
        scratch_ptr == in_ptr + 0x10 * scratch_index;
        !rev_bytes ==> rev_mask == Mkfour(0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203);
    ensures
        of_quad32(Z0) +. shift(of_quad32(Z2), 64) +. shift(of_quad32(Z3), 128) ==
            ghash_unroll_back(h, prev, pdata, 0, n, #nat(n - 1));
        exactly6 /\ !rev_blocks ==> scratch_ptr == old(scratch_ptr) - 0x50;
{
    Load128_buffer(heap0, Z1, Xip, (-0x20), Secret, hkeys_b, 0);
    // assert of_quad32(Z1) == gf128_power(h, 1);
    Load128_buffer(heap1, Ii, scratch_ptr, 0, Secret, scratch_in_b, scratch_index);
    inline if (!rev_bytes)
    {
        Pshufb(Ii, rev_mask);
    }
    // assert Ii == reverse_bytes_quad32(index(data, #nat(n - 1)));
    inline if (exactly6)
    {
        MulAdd_unroll_i(rev_bytes, rev_blocks, true, false, 1, scratch_len, n, 0, hkeys_b, scratch_in_b, h, prev, data);
        MulAdd_unroll_i(rev_bytes, rev_blocks, false, false, 3, scratch_len, n, 1, hkeys_b, scratch_in_b, h, prev, data);
        MulAdd_unroll_i(rev_bytes, rev_blocks, false, false, 4, scratch_len, n, 2, hkeys_b, scratch_in_b, h, prev, data);
        MulAdd_unroll_i(rev_bytes, rev_blocks, false, false, 6, scratch_len, n, 3, hkeys_b, scratch_in_b, h, prev, data);
        MulAdd_unroll_i(rev_bytes, rev_blocks, false, false, 7, scratch_len, n, 4, hkeys_b, scratch_in_b, h, prev, data);
        MulAdd_unroll_i(rev_bytes, rev_blocks, false, true, 0, scratch_len, n, 5, hkeys_b, scratch_in_b, h, prev, data);
    }
    else
    {
        if (n_reg == 1)
        {
            MulAdd_unroll_i(rev_bytes, rev_blocks, true, true, 1, scratch_len, n, 0, hkeys_b, scratch_in_b, h, prev, data);
        }
        else
        {
            MulAdd_unroll_i(rev_bytes, rev_blocks, true, false, 1, scratch_len, n, 0, hkeys_b, scratch_in_b, h, prev, data);
            if (n_reg != 2)
            {
                MulAdd_unroll_i(rev_bytes, rev_blocks, false, false, 3, scratch_len, n, 1, hkeys_b, scratch_in_b, h, prev, data);
                if (n_reg != 3)
                {
                    MulAdd_unroll_i(rev_bytes, rev_blocks, false, false, 4, scratch_len, n, 2, hkeys_b, scratch_in_b, h, prev, data);
                    if (n_reg != 4)
                    {
                        MulAdd_unroll_i(rev_bytes, rev_blocks, false, false, 6, scratch_len, n, 3, hkeys_b, scratch_in_b, h, prev, data);
                    }
                }
            }
            MulAdd_unroll_i(rev_bytes, rev_blocks, false, true, 0, scratch_len, n, n - 1, hkeys_b, scratch_in_b, h, prev, data);
        }
    }
}
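// MulAdd_unroll is the hand-scheduled six-block accumulation used by the AES-GCM inner
// loop: data[5] arrives byte-reversed in Z3, data[4]..data[0] sit byte-reversed in
// scratch_b slots 3..7, loads of h-powers from hkeys_b are interleaved with the
// multiplies, and the ghost variable z tracks ghash_unroll_back(h, prev, pdata, 0, 6, m)
// after each numbered group. It ends by loading the reduction constant into Hkey for
// the Reduce that follows.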
procedure MulAdd_unroll(
        ghost hkeys_b:buffer128,
        ghost scratch_b:buffer128,
        ghost h:poly,
        ghost prev:poly,
        ghost data:seq(quad32))
    {:quick}
    lets
        Xip @= r9;
        Ii @= xmm0; T1 @= xmm1; T2 @= xmm2; Hkey @= xmm3; Z0 @= xmm4; Z1 @= xmm5; Z2 @= xmm6; Z3 @= xmm7; Xi @= xmm8;
        pdata := fun_seq_quad32_LE_poly128(data);
    reads
        heap0; heap3; memLayout; Xip; rbp;
    modifies
        efl; rax; Ii; T1; T2; Hkey; Z0; Z1; Z2; Z3; Xi;
    requires
        pclmulqdq_enabled && avx_enabled && sse_enabled;
        length(data) == 6;
        hkeys_b_powers(hkeys_b, heap0, memLayout, Xip - 0x20, h);
        scratch_b_data(true, true, scratch_b, 8, 5, heap3, memLayout, rbp, data);
        Z3 == reverse_bytes_quad32(data[5]);
        of_quad32(Xi) +. of_quad32(Z0) +. of_quad32(buffer128_read(scratch_b, 1, heap3)) == prev;
    ensures
        of_quad32(Z0) +. shift(of_quad32(Z2), 64) +. shift(of_quad32(Z3), 128) ==
            ghash_unroll_back(h, prev, pdata, 0, 6, 5);
        // old(buffer128_read(scratch_b, 2, heap3)) == buffer128_read(scratch_b, 2, heap3);
        Hkey == Mkfour(0, 0, 0, 0xc2000000);
{
    let data0 := #poly(pdata(0));
    let data1 := #poly(pdata(1));
    let data2 := #poly(pdata(2));
    let data3 := #poly(pdata(3));
    let data4 := #poly(pdata(4));
    let data5 := #poly(pdata(5));

    // 1
    Load128_buffer(heap0, Hkey, Xip, (-0x20), Secret, hkeys_b, 0);
    VPolyMul(T1, Z3, Hkey, false, false);
    VPolyMul(Z1, Z3, Hkey, false, true);
    Load128_buffer(heap3, Ii, rbp, 0x30, Secret, scratch_b, 3);
    VPolyMul(Z2, Z3, Hkey, true, false);
    VPolyMul(Z3, Z3, Hkey, true, true);
    lemma_Mul128(data5, gf128_power(h, 1));
    ghost var z := data5 *. gf128_power(h, 1);
    assert z == ghash_unroll_back(h, prev, pdata, 0, 6, 0);
    // assert z == ~~T1 +. shift(~~Z1 +. ~~Z2, 64) +. shift(~~Z3, 128);
    lemma_add_commute(~~Z1, ~~Z2);
    // assert z == ~~T1 +. shift(~~Z2 +. ~~Z1, 64) +. shift(~~Z3, 128);
    Load128_buffer(heap0, Hkey, Xip, (-0x10), Secret, hkeys_b, 1);
    VPolyAdd(Z2, Z2, Z1);
    let z0 := ~~T1;
    // assert z == z0 +. shift(~~Z2, 64) +. shift(~~Z3, 128);

    // 2
    VPolyMul(Z1, Ii, Hkey, false, false);
    VPolyAdd(Xi, Xi, Z0);
    VPolyAdd(Z0, T1, Z1);
    VPolyMul(T1, Ii, Hkey, false, true);
    VPolyMul(T2, Ii, Hkey, true, false);
    VPolyAdd(Xi, Xi, Mem128(heap3, rbp, 0x10, Secret, scratch_b, 1));
    // assert ~~Xi == prev;
    VPolyMul(Hkey, Ii, Hkey, true, true);
    lemma_Mul128_accum(z0, ~~Z2, ~~Z3, data4, gf128_power(h, 2));
    z := z +. data4 *. gf128_power(h, 2);
    assert z == ghash_unroll_back(h, prev, pdata, 0, 6, 1);
    // assert z == ~~Z0 +. shift(~~Z2 +. ~~T1 +. ~~T2, 64) +. shift(~~Z3 +. ~~Hkey, 128);
    Load128_buffer(heap3, Ii, rbp, 0x40, Secret, scratch_b, 4);
    Load128_buffer(heap0, Z1, Xip, 0x10, Secret, hkeys_b, 3);
    VPolyAdd(Z2, Z2, T1);
    // assert z == ~~Z0 +. shift(~~Z2 +. ~~T2, 64) +. shift(~~Z3 +. ~~Hkey, 128);

    // 3
    VPolyMul(T1, Ii, Z1, false, false);
    VPolyAdd(Z2, Z2, T2);
    // assert z == ~~Z0 +. shift(~~Z2, 64) +. shift(~~Z3 +. ~~Hkey, 128);
    VPolyMul(T2, Ii, Z1, false, true);
    VPolyAdd(Z3, Z3, Hkey);
    // assert z == ~~Z0 +. shift(~~Z2, 64) +. shift(~~Z3, 128);
    VPolyMul(Hkey, Ii, Z1, true, false);
    VPolyMul(Z1, Ii, Z1, true, true);
    lemma_Mul128_accum(~~Z0, ~~Z2, ~~Z3, data3, gf128_power(h, 3));
    z := z +. data3 *. gf128_power(h, 3);
    assert z == ghash_unroll_back(h, prev, pdata, 0, 6, 2);
    // assert z == (~~Z0 +. ~~T1) +. shift(~~Z2 +. ~~T2 +. ~~Hkey, 64) +. shift(~~Z3 +. ~~Z1, 128);
    Load128_buffer(heap3, Ii, rbp, 0x50, Secret, scratch_b, 5);
    VPolyAdd(Z0, Z0, T1);
    // assert z == ~~Z0 +. shift(~~Z2 +. ~~T2 +. ~~Hkey, 64) +. shift(~~Z3 +. ~~Z1, 128);
    Load128_buffer(heap0, T1, Xip, 0x20, Secret, hkeys_b, 4);
    VPolyAdd(Z2, Z2, T2);
    // assert z == ~~Z0 +. shift(~~Z2 +. ~~Hkey, 64) +. shift(~~Z3 +. ~~Z1, 128);

    // 4
    VPolyMul(T2, Ii, T1, false, false);
    VPolyAdd(Z2, Z2, Hkey);
    // assert z == ~~Z0 +. shift(~~Z2, 64) +. shift(~~Z3 +. ~~Z1, 128);
    VPolyMul(Hkey, Ii, T1, false, true);
    VPolyAdd(Z3, Z3, Z1);
    // assert z == ~~Z0 +. shift(~~Z2, 64) +. shift(~~Z3, 128);
    VPolyMul(Z1, Ii, T1, true, false);
    VPolyMul(T1, Ii, T1, true, true);
    lemma_Mul128_accum(~~Z0, ~~Z2, ~~Z3, data2, gf128_power(h, 4));
    z := z +. data2 *. gf128_power(h, 4);
    assert z == ghash_unroll_back(h, prev, pdata, 0, 6, 3);
    // assert z == (~~Z0 +. ~~T2) +. shift(~~Z2 +. ~~Hkey +. ~~Z1, 64) +. shift(~~Z3 +. ~~T1, 128);
    Load128_buffer(heap3, Ii, rbp, 0x60, Secret, scratch_b, 6);
    VPolyAdd(Z0, Z0, T2);
    // assert z == ~~Z0 +. shift(~~Z2 +. ~~Hkey +. ~~Z1, 64) +. shift(~~Z3 +. ~~T1, 128);
    Load128_buffer(heap0, T2, Xip, 0x40, Secret, hkeys_b, 6);
    VPolyAdd(Z2, Z2, Hkey);
    // assert z == ~~Z0 +. shift(~~Z2 +. ~~Z1, 64) +. shift(~~Z3 +. ~~T1, 128);

    // 5
    VPolyMul(Hkey, Ii, T2, false, false);
    VPolyAdd(Z2, Z2, Z1);
    // assert z == ~~Z0 +. shift(~~Z2, 64) +. shift(~~Z3 +. ~~T1, 128);
    VPolyMul(Z1, Ii, T2, false, true);
    VPolyAdd(Z3, Z3, T1);
    // assert z == ~~Z0 +. shift(~~Z2, 64) +. shift(~~Z3, 128);
    VPolyMul(T1, Ii, T2, true, false);
    VPolyAdd(Xi, Xi, Mem128(heap3, rbp, 0x70, Secret, scratch_b, 7));
    VPolyMul(T2, Ii, T2, true, true);
    lemma_Mul128_accum(~~Z0, ~~Z2, ~~Z3, data1, gf128_power(h, 5));
    z := z +. data1 *. gf128_power(h, 5);
    assert z == ghash_unroll_back(h, prev, pdata, 0, 6, 4);
    // assert z == (~~Z0 +. ~~Hkey) +. shift(~~Z2 +. ~~Z1 +. ~~T1, 64) +. shift(~~Z3 +. ~~T2, 128);
    VPolyAdd(Z0, Z0, Hkey);
    // assert z == ~~Z0 +. shift(~~Z2 +. ~~Z1 +. ~~T1, 64) +. shift(~~Z3 +. ~~T2, 128);
    Load128_buffer(heap0, Hkey, Xip, 0x50, Secret, hkeys_b, 7);
    VPolyAdd(Z2, Z2, Z1);
    // assert z == ~~Z0 +. shift(~~Z2 +. ~~T1, 64) +. shift(~~Z3 +. ~~T2, 128);

    // 6
    VPolyMul(Z1, Xi, Hkey, false, true);
    VPolyAdd(Z2, Z2, T1);
    // assert z == ~~Z0 +. shift(~~Z2, 64) +. shift(~~Z3 +. ~~T2, 128);
    VPolyMul(T1, Xi, Hkey, true, false);
    VPolyAdd(Z3, Z3, T2);
    // assert z == ~~Z0 +. shift(~~Z2, 64) +. shift(~~Z3, 128);
    VPolyMul(T2, Xi, Hkey, false, false);
    VPolyMul(Xi, Xi, Hkey, true, true);
    lemma_Mul128_accum(~~Z0, ~~Z2, ~~Z3, prev +. data0, gf128_power(h, 6));
    z := z +. (prev +. data0) *. gf128_power(h, 6);
    assert z == ghash_unroll_back(h, prev, pdata, 0, 6, 5);
    // assert z == (~~Z0 +. ~~T2) +. shift(~~Z2 +. ~~Z1 +. ~~T1, 64) +. shift(~~Z3 +. ~~Xi, 128);
    VPolyAdd(Z2, Z2, Z1);
    // assert z == (~~Z0 +. ~~T2) +. shift(~~Z2 +. ~~T1, 64) +. shift(~~Z3 +. ~~Xi, 128);
    VPolyAdd(Z2, Z2, T1);
    // assert z == (~~Z0 +. ~~T2) +. shift(~~Z2, 64) +. shift(~~Z3 +. ~~Xi, 128);
    // vpslldq in Reduce
    VPolyAdd(Z0, Z0, T2);
    // assert z == ~~Z0 +. shift(~~Z2, 64) +. shift(~~Z3 +. ~~Xi, 128);
    ZeroXmm(Hkey);
    PinsrdImm(Hkey, 0xc2000000, 3, rax); // REVIEW: vmovdqu into Hkey
    VPolyAdd(Z3, Z3, Xi);
    // assert z == ~~Z0 +. shift(~~Z2, 64) +. shift(~~Z3, 128);
}
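// Reduce folds the 256-bit accumulator a = a0 +. shift(a1, 64) +. shift(a2, 128) held
// in (Z0, Z2, Z3) modulo g = monomial(128) +. f, leaving the bit-reversed remainder
// reverse(reverse(a, 255) %. g, 127) in Xi (or split across Xi, Z3, Z0 when
// !last_adds, so a caller can defer the final adds). Hkey's upper half must hold the
// folded constant c = reverse(shift(f, -1), 63); the two VSwap/VPolyMul pairs perform
// the two 64-bit folding steps justified by lemma_reduce_rev.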
procedure Reduce(inline last_adds:bool, ghost f:poly)
    {:quick}
    lets
        Ii @= xmm0; Hkey @= xmm3; Z0 @= xmm4; Z1 @= xmm5; Z2 @= xmm6; Z3 @= xmm7; Xi @= xmm8;
        g := monomial(128) +. f;
        c := reverse(shift(f, (-1)), 63);
        a0 := of_quad32(Z0);
        a1 := of_quad32(Z2);
        a2 := of_quad32(Z3);
        a := a0 +. shift(a1, 64) +. shift(a2, 128);
    modifies
        efl; Ii; Hkey; Z0; Z1; Z2; Z3; Xi;
    requires
        pclmulqdq_enabled && avx_enabled;
        shift(of_quad32(Hkey), (-64)) == c;
        degree(f) < 64;
        degree(g) == 128;
        poly_index(f, 0);
    ensures
        let xi := if last_adds then of_quad32(Xi) else of_quad32(Xi) +. of_quad32(Z3) +. of_quad32(Z0);
        xi == reverse(reverse(a, 255) %. g, 127);
{
    VLow64ToHigh(Z1, Z2);
    VPolyAdd(Z0, Z0, Z1);
    VSwap(Ii, Z0);
    VPolyMul(Z0, Z0, Hkey, false, true);
    VHigh64ToLow(Z2, Z2);
    VPolyAdd(Z3, Z3, Z2);
    lemma_add_commute(of_quad32(Z0), of_quad32(Ii));
    VPolyAdd(Z0, Z0, Ii);
    // TODO: save Z3 in memory
    VSwap(Xi, Z0);
    VPolyMul(Z0, Z0, Hkey, false, true);
    inline if (last_adds)
    {
        VPolyAdd(Xi, Xi, Z3);
        VPolyAdd(Xi, Xi, Z0);
    }
    lemma_reduce_rev(a0, a1, a2, f, 64);
}
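// ReduceLast instantiates Reduce at the GHASH modulus (f = gf128_modulus_low_terms):
// starting from ghash_unroll_back(h, prev, pdata, 0, n, n - 1) in (Z0, Z2, Z3), it
// chains the unroll, ghash_poly, and incremental-GHASH lemmas to conclude that Xi
// holds reverse_bytes_quad32(ghash_incremental(h_LE, y_prev, data)).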
procedure ReduceLast(
        inline last_adds:bool,
        ghost h_LE:quad32,
        ghost y_prev:quad32,
        ghost data:seq(quad32))
    {:public}
    {:quick}
    lets
        Ii @= xmm0; Hkey @= xmm3; Z0 @= xmm4; Z1 @= xmm5; Z2 @= xmm6; Z3 @= xmm7; Xi @= xmm8;
        h := of_quad32(reverse_bytes_quad32(h_LE));
        prev := of_quad32(reverse_bytes_quad32(y_prev));
        pdata := fun_seq_quad32_LE_poly128(data);
        n := length(data);
    modifies
        efl; Ii; Hkey; Z0; Z1; Z2; Z3; Xi;
    requires
        pclmulqdq_enabled && avx_enabled;
        Hkey == Mkfour(0, 0, 0, 0xc2000000);
        n > 0;
        of_quad32(Z0) +. shift(of_quad32(Z2), 64) +. shift(of_quad32(Z3), 128) ==
            ghash_unroll_back(h, prev, pdata, 0, n, #nat(n - 1));
    ensures
        let xi := if last_adds then of_quad32(Xi) else of_quad32(Xi) +. of_quad32(Z3) +. of_quad32(Z0);
        to_quad32(xi) == reverse_bytes_quad32(ghash_incremental(h_LE, y_prev, data));
        xi == of_quad32(to_quad32(xi));
{
    lemma_gf128_low_shift();
    lemma_gf128_degree();
    Reduce(last_adds, gf128_modulus_low_terms);
    // assert of_quad32(Xi) == mod_rev(128, ghash_unroll_back(h, prev, pdata, 0, n, #nat(n - 1)), gf128_modulus);
    lemma_ghash_unroll_back_forward(h, prev, pdata, 0, #nat(n - 1));
    // assert of_quad32(Xi) == mod_rev(128, ghash_unroll(h, prev, pdata, 0, #nat(n - 1), 0), gf128_modulus);
    lemma_ghash_poly_of_unroll(h, prev, pdata, 0, #nat(n - 1));
    // assert of_quad32(Xi) == ghash_poly(h, prev, pdata, 0, n);
    lemma_ghash_incremental_poly(h_LE, y_prev, data);
    // assert of_quad32(Xi) == of_quad32(reverse_bytes_quad32(ghash_incremental(h_LE, y_prev, data)));
    lemma_to_of_quad32(reverse_bytes_quad32(ghash_incremental(h_LE, y_prev, data)));
}
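// GhashUnroll_n = MulAdd_unroll_n followed by ReduceLast: it hashes the n blocks of
// data into Xi, loading the GF(2^128) reduction constant 0xc2000000 into Hkey's top
// dword in between (clobbering scratch_reg as the staging register for PinsrdImm).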
procedure GhashUnroll_n(
        inline rev_bytes:bool,
        inline rev_blocks:bool,
        inline exactly6:bool,
        ghost scratch_len:nat64,
        ghost hkeys_b:buffer128,
        ghost scratch_in_b:buffer128,
        ghost h_LE:quad32,
        ghost y_prev:quad32,
        ghost data:seq(quad32))
    {:public}
    {:quick}
    lets
        Xip @= r9; n_reg @= rdx; scratch_ptr @= r11; in_ptr @= rdi; scratch_reg @= r10;
        Ii @= xmm0; T1 @= xmm1; T2 @= xmm2; Hkey @= xmm3; Z0 @= xmm4; Z1 @= xmm5; Z2 @= xmm6; Z3 @= xmm7; Xi @= xmm8; rev_mask @= xmm9;
        h := of_quad32(reverse_bytes_quad32(h_LE));
        prev := of_quad32(reverse_bytes_quad32(y_prev));
        pdata := fun_seq_quad32_LE_poly128(data);
        n := length(data);
        scratch_index := index_opt_rev(rev_blocks, scratch_len, n, n - 1);
    reads
        heap0; heap1; memLayout; Xip; n_reg; in_ptr; rev_mask;
    modifies
        efl; scratch_reg; scratch_ptr; Ii; T1; T2; Hkey; Z0; Z1; Z2; Z3; Xi;
    requires
        pclmulqdq_enabled && avx_enabled && sse_enabled;
        exactly6 ==> n == 6;
        !exactly6 ==> 1 <= n < 6;
        n <= scratch_len;
        hkeys_b_powers(hkeys_b, heap0, memLayout, Xip - 0x20, h);
        scratch_b_data(rev_bytes, rev_blocks, scratch_in_b, scratch_len, 6, heap1, memLayout, in_ptr, data);
        in_ptr + 0x10 * scratch_len <= pow2_64;
        of_quad32(Xi) == prev;
        !exactly6 ==> n_reg == n;
        scratch_ptr == in_ptr + 0x10 * scratch_index;
        !rev_bytes ==> rev_mask == Mkfour(0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203);
    ensures
        Xi == reverse_bytes_quad32(ghash_incremental(h_LE, y_prev, data));
        exactly6 /\ !rev_blocks ==> scratch_ptr == old(scratch_ptr) - 0x50;
{
    MulAdd_unroll_n(rev_bytes, rev_blocks, exactly6, scratch_len, hkeys_b, scratch_in_b, h, prev, data);
    ZeroXmm(Hkey);
    PinsrdImm(Hkey, 0xc2000000, 3, scratch_reg);
    ReduceLast(true, h_LE, y_prev, data);
}

// TODO: Pair this up with a version of gcm_make_length_quad that doesn't reverse its output,
// since that reversal will cancel with this one
// Expects the input to be hashed to reside in Ii
procedure Ghash_register(
        ghost hkeys_b:buffer128,
        ghost h_LE:quad32,
        ghost y_prev:quad32)
    {:public}
    {:quick}
    lets
        Xip @= r9; scratch_ptr @= r11;
        Ii @= xmm0; T1 @= xmm1; T2 @= xmm2; Hkey @= xmm3; Z0 @= xmm4; Z1 @= xmm5; Z2 @= xmm6; Z3 @= xmm7; Xi @= xmm8;
        h := of_quad32(reverse_bytes_quad32(h_LE));
        data := create(1, reverse_bytes_quad32(Ii));
        prev := of_quad32(reverse_bytes_quad32(y_prev));
        pdata := fun_seq_quad32_LE_poly128(data);
    reads
        heap0; memLayout; Xip;
    modifies
        efl; scratch_ptr; Ii; T1; T2; Hkey; Z0; Z1; Z2; Z3; Xi;
    requires
        pclmulqdq_enabled && avx_enabled && sse_enabled;
        hkeys_b_powers(hkeys_b, heap0, memLayout, Xip - 0x20, h);
        of_quad32(Xi) == prev;
    ensures
        Xi == reverse_bytes_quad32(ghash_incremental(h_LE, y_prev, old(data)));
{
    Load128_buffer(heap0, Z1, Xip, (-0x20), Secret, hkeys_b, 0);
    assert of_quad32(Z1) == gf128_power(h, 1);
    assert Ii == reverse_bytes_quad32(index(data, #nat(0)));
    let data_i := #poly(pdata(0));
    VPolyAdd(Ii, Xi, Ii);
    VPolyMul(T1, Ii, Z1, false, false);
    VPolyMul(T2, Ii, Z1, false, true);
    VPolyMul(Hkey, Ii, Z1, true, false);
    VPolyMul(Z1, Ii, Z1, true, true);
    lemma_Mul128_accum(~~Z0, ~~Z2, ~~Z3, prev +. data_i, gf128_power(h, 1));
    Mov128(Z0, T1);
    lemma_Mul128(prev +. data_i, gf128_power(h, 1));
    VPolyAdd(Z2, T2, Hkey);
    Mov128(Z3, Z1);
    ZeroXmm(Hkey);
    PinsrdImm(Hkey, 0xc2000000, 3, scratch_ptr);
    lemma_gf128_low_shift();
    lemma_gf128_degree();
    Reduce(true, gf128_modulus_low_terms);
    assert of_quad32(Xi) == mod_rev(128, ghash_unroll_back(h, prev, pdata, 0, 1, 0), gf128_modulus);
    lemma_ghash_unroll_back_forward(h, prev, pdata, 0, 0);
    assert of_quad32(Xi) == mod_rev(128, ghash_unroll(h, prev, pdata, 0, 0, 0), gf128_modulus);
    lemma_ghash_poly_of_unroll(h, prev, pdata, 0, 0);
    assert of_quad32(Xi) == ghash_poly(h, prev, pdata, 0, 1);
    lemma_ghash_incremental_poly(h_LE, y_prev, data);
    assert of_quad32(Xi) == of_quad32(reverse_bytes_quad32(ghash_incremental(h_LE, y_prev, data)));
    lemma_to_of_quad32(reverse_bytes_quad32(ghash_incremental(h_LE, y_prev, data)));
}

procedure Ghash_buffer_loop_body(
        ghost hkeys_b:buffer128,
        ghost in_b:buffer128,
        ghost h_LE:quad32,
        ghost y_prev:quad32,
        ghost old_len:nat64,
        ghost index:nat)
    //{:public}
    {:quick}
    lets
        Xip @= r9; len @= rdx; scratch_ptr @= r11; in_ptr @= rdi; scratch_reg @= r10;
        Ii @= xmm0; T1 @= xmm1; T2 @= xmm2; Hkey @= xmm3; Z0 @= xmm4; Z1 @= xmm5; Z2 @= xmm6; Z3 @= xmm7; Xi @= xmm8; rev_mask @= xmm9;
        h := of_quad32(reverse_bytes_quad32(h_LE));
        prev := of_quad32(reverse_bytes_quad32(y_prev));
    reads
        Xip; in_ptr; rev_mask; heap0; heap1; memLayout;
    modifies
        len; scratch_ptr; scratch_reg; efl;
        Ii; T1; T2; Hkey; Z0; Z1; Z2; Z3; Xi;
    requires
        len >= 6; // From the while loop condition
        // Loop invariants
        index + len == old_len;
        scratch_ptr == in_ptr + 0x10 * index;
        // Copy over preconditions
        pclmulqdq_enabled && avx_enabled && sse_enabled;
        hkeys_b_powers(hkeys_b, heap0, memLayout, Xip - 0x20, h);
        validSrcAddrs128(heap1, in_ptr, in_b, len, memLayout, Secret);
        buffer_length(in_b) == old_len;
        in_ptr + 0x10 * old_len < pow2_64;
        Xi == reverse_bytes_quad32(ghash_incremental0(h_LE, y_prev, slice(s128(heap1, in_b), 0, index)));
        rev_mask == Mkfour(0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203);
    ensures
        let index' := index + 6;
        // Loop invariants
        index' + len == old_len;
        scratch_ptr == in_ptr + 0x10 * index';
        // Copy over preconditions
        pclmulqdq_enabled && avx_enabled;
        hkeys_b_powers(hkeys_b, heap0, memLayout, Xip - 0x20, h);
        validSrcAddrs128(heap1, in_ptr, in_b, len, memLayout, Secret);
        buffer_length(in_b) == old_len;
        in_ptr + 0x10 * old_len < pow2_64;
        Xi == reverse_bytes_quad32(ghash_incremental0(h_LE, y_prev, slice(s128(heap1, in_b), 0, index')));
        rev_mask == Mkfour(0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203);
        // Loop updates
        scratch_ptr == old(scratch_ptr) + 0x60;
        len == old(len) - 6;
{
    let data := Seq.slice(s128(heap1, in_b), index, index + 6);
    Add64(scratch_ptr, 0x50);
    GhashUnroll_n(false, false, true, #nat64(index + 6), hkeys_b, in_b, h_LE,
        ghash_incremental0(h_LE, y_prev, slice(s128(heap1, in_b), 0, index)), data);
    lemma_ghash_incremental0_append(h_LE, y_prev, reverse_bytes_quad32(old(Xi)), reverse_bytes_quad32(Xi),
        slice(s128(heap1, in_b), 0, index), data);
    assert equal(append(slice(s128(heap1, in_b), 0, index), data), slice(s128(heap1, in_b), 0, index + 6));
    Add64(scratch_ptr, 0x60);
    Sub64(len, 6);
}
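// Ghash_buffer is the top-level entry point: it consumes in_b six blocks at a time via
// Ghash_buffer_loop_body, then hashes the remaining 1..5 blocks (if any) with a final
// non-exactly6 GhashUnroll_n, and appends the two slices with
// lemma_ghash_incremental0_append to justify the overall postcondition.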
procedure Ghash_buffer(
        ghost hkeys_b:buffer128,
        ghost in_b:buffer128,
        ghost h_LE:quad32,
        ghost y_prev:quad32)
    {:public}
    {:quick}
    lets
        Xip @= r9; len @= rdx; scratch_ptr @= r11; in_ptr @= rdi; scratch_reg @= r10;
        Ii @= xmm0; T1 @= xmm1; T2 @= xmm2; Hkey @= xmm3; Z0 @= xmm4; Z1 @= xmm5; Z2 @= xmm6; Z3 @= xmm7; Xi @= xmm8; rev_mask @= xmm9;
        h:poly := of_quad32(reverse_bytes_quad32(h_LE));
    reads
        Xip; in_ptr; rev_mask; heap0; heap1; memLayout;
    modifies
        len; scratch_ptr; scratch_reg; efl;
        Ii; T1; T2; Hkey; Z0; Z1; Z2; Z3; Xi;
    requires
        pclmulqdq_enabled && avx_enabled && sse_enabled;
        hkeys_b_powers(hkeys_b, heap0, memLayout, Xip - 0x20, h);
        validSrcAddrs128(heap1, in_ptr, in_b, len, memLayout, Secret);
        buffer_length(in_b) == len;
        in_ptr + 0x10 * len < pow2_64;
        Xi == reverse_bytes_quad32(y_prev);
        rev_mask == Mkfour(0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203);
    ensures
        Xi == reverse_bytes_quad32(ghash_incremental0(h_LE, y_prev, s128(heap1, in_b)));
        old(len) == 0 ==> Xi == old(Xi);
{
    Mov64(scratch_ptr, in_ptr);
    ghost var index:nat := 0;
    while (len >= 6)
        invariant
            // Loop invariants
            index + len == old(len);
            scratch_ptr == in_ptr + 0x10 * index;
            // Copy over preconditions
            pclmulqdq_enabled && avx_enabled && sse_enabled;
            hkeys_b_powers(hkeys_b, heap0, memLayout, Xip - 0x20, h);
            validSrcAddrs128(heap1, in_ptr, in_b, len, memLayout, Secret);
            buffer_length(in_b) == old(len);
            in_ptr + 0x10 * old(len) < pow2_64;
            Xi == reverse_bytes_quad32(ghash_incremental0(h_LE, y_prev, slice(s128(heap1, in_b), 0, index)));
            rev_mask == Mkfour(0x0C0D0E0F, 0x08090A0B, 0x04050607, 0x00010203);
            old(len) == 0 ==> Xi == old(Xi);
            h == of_quad32(reverse_bytes_quad32(h_LE));
        decreases
            len;
    {
        Ghash_buffer_loop_body(hkeys_b, in_b, h_LE, y_prev, old(len), index);
        index := index + 6;
    }

    if (len > 0)
    {
        let data := Seq.slice(s128(heap1, in_b), index, old(len));
        let y_loop := reverse_bytes_quad32(Xi);
        Mov64(scratch_reg, len);
        Sub64(scratch_reg, 1);
        // assert scratch_reg < old(len);
        // assert old(len) * 0x10 < pow2_64;
        assert scratch_reg * 0x10 < pow2_64; // OBSERVE -- Unclear why this is needed, other than general NL-arith flakiness
        IMul64(scratch_reg, 0x10);
        Add64(scratch_ptr, scratch_reg);
        GhashUnroll_n(false, false, false, old(len), hkeys_b, in_b, h_LE, y_loop, data);
        lemma_ghash_incremental0_append(h_LE, y_prev, y_loop, reverse_bytes_quad32(Xi),
            slice(s128(heap1, in_b), 0, index), data);
        assert equal(append(slice(s128(heap1, in_b), 0, index), data), s128(heap1, in_b)); // OBSERVE
    }
}

procedure GhashUnroll6x(
        ghost hkeys_b:buffer128,
        ghost scratch_b:buffer128,
        ghost h_LE:quad32,
        ghost y_prev:quad32,
        ghost data:seq(quad32))
    {:public}
    {:quick}
    lets
        Xip @= r9;
        Ii @= xmm0; T1 @= xmm1; T2 @= xmm2; Hkey @= xmm3; Z0 @= xmm4; Z1 @= xmm5; Z2 @= xmm6; Z3 @= xmm7; Xi @= xmm8;
        h := of_quad32(reverse_bytes_quad32(h_LE));
        prev := of_quad32(reverse_bytes_quad32(y_prev));
        pdata := fun_seq_quad32_LE_poly128(data);
    reads
        heap0; heap3; memLayout; Xip; rbp;
    modifies
        efl; rax; Ii; T1; T2; Hkey; Z0; Z1; Z2; Z3; Xi;
    requires
        pclmulqdq_enabled && avx_enabled && sse_enabled;
        length(data) == 6;
        hkeys_b_powers(hkeys_b, heap0, memLayout, Xip - 0x20, h);
        scratch_b_data(true, true, scratch_b, 8, 5, heap3, memLayout, rbp, data);
        Z3 == reverse_bytes_quad32(data[5]);
        of_quad32(Xi) +. of_quad32(Z0) +. of_quad32(buffer128_read(scratch_b, 1, heap3)) == prev;
    ensures
        Xi == reverse_bytes_quad32(ghash_incremental(h_LE, y_prev, data));
{
    MulAdd_unroll(hkeys_b, scratch_b, h, prev, data);
    ReduceLast(true, h_LE, y_prev, data);
}