// This file is generated from a similarly-named Perl script in the BoringSSL // source tree. Do not edit by hand. #include #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) #include "openssl/arm_arch.h" #if __ARM_MAX_ARCH__>=8 .text .arch armv8.2-a+crypto .globl aesv8_gcm_8x_enc_128 .hidden aesv8_gcm_8x_enc_128 .type aesv8_gcm_8x_enc_128,%function .align 4 aesv8_gcm_8x_enc_128: #ifdef BORINGSSL_DISPATCH_TEST adrp x9,BORINGSSL_function_hit add x9, x9, :lo12:BORINGSSL_function_hit mov w10, #1 strb w10, [x9,#7] // kFlag_aesv8_gcm_8x_enc_128 #endif AARCH64_VALID_CALL_TARGET cbz x1, .L128_enc_ret stp d8, d9, [sp, #-80]! lsr x9, x1, #3 mov x16, x4 mov x11, x5 stp d10, d11, [sp, #16] stp d12, d13, [sp, #32] stp d14, d15, [sp, #48] mov x5, #0xc200000000000000 stp x5, xzr, [sp, #64] add x10, sp, #64 mov x15, #0x100000000 //set up counter increment movi v31.16b, #0x0 mov v31.d[1], x15 mov x5, x9 ld1 { v0.16b}, [x16] //CTR block 0 sub x5, x5, #1 //byte_len - 1 and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) rev32 v30.16b, v0.16b //set up reversed counter add v30.4s, v30.4s, v31.4s //CTR block 0 rev32 v1.16b, v30.16b //CTR block 1 add v30.4s, v30.4s, v31.4s //CTR block 1 rev32 v2.16b, v30.16b //CTR block 2 add v30.4s, v30.4s, v31.4s //CTR block 2 rev32 v3.16b, v30.16b //CTR block 3 add v30.4s, v30.4s, v31.4s //CTR block 3 rev32 v4.16b, v30.16b //CTR block 4 add v30.4s, v30.4s, v31.4s //CTR block 4 rev32 v5.16b, v30.16b //CTR block 5 add v30.4s, v30.4s, v31.4s //CTR block 5 ldp q26, q27, [x11, #0] //load rk0, rk1 rev32 v6.16b, v30.16b //CTR block 6 add v30.4s, v30.4s, v31.4s //CTR block 6 rev32 v7.16b, v30.16b //CTR block 7 add v30.4s, v30.4s, v31.4s //CTR block 7 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 1 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 1 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 2 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 2 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 2 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 ldp q27, q28, [x11, #64] //load rk4, rk5 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 3 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 3 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 3 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 4 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 4 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 4 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 5 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 5 ldp q26, q27, [x11, #96] //load rk6, rk7 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 5 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 5 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 5 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 5 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 6 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 6 ldp q28, q26, [x11, #128] //load rk8, rk9 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 7 ld1 { v19.16b}, [x3] ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 7 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 7 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 7 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 7 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 7 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 ldr q27, [x11, #160] //load rk10 aese v3.16b, v26.16b //AES block 8k+11 - round 9 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 aese v2.16b, v26.16b //AES block 8k+10 - round 9 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 aese v6.16b, v26.16b //AES block 8k+14 - round 9 aese v4.16b, v26.16b //AES block 8k+12 - round 9 add x5, x5, x0 aese v0.16b, v26.16b //AES block 8k+8 - round 9 aese v7.16b, v26.16b //AES block 8k+15 - round 9 aese v5.16b, v26.16b //AES block 8k+13 - round 9 aese v1.16b, v26.16b //AES block 8k+9 - round 9 add x4, x0, x1, lsr #3 //end_input_ptr cmp x0, x5 //check if we have <= 8 blocks b.ge .L128_enc_tail //handle tail ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext cmp x0, x5 //check if we have <= 8 blocks .inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result rev32 v0.16b, v30.16b //CTR block 8 add v30.4s, v30.4s, v31.4s //CTR block 8 .inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result stp q8, q9, [x2], #32 //AES block 0, 1 - store result rev32 v1.16b, v30.16b //CTR block 9 .inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result add v30.4s, v30.4s, v31.4s //CTR block 9 .inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result .inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result .inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result rev32 v2.16b, v30.16b //CTR block 10 add v30.4s, v30.4s, v31.4s //CTR block 10 .inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result .inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b,v27.16b //AES block 7 - result stp q10, q11, [x2], #32 //AES block 2, 3 - store result rev32 v3.16b, v30.16b //CTR block 11 add v30.4s, v30.4s, v31.4s //CTR block 11 stp q12, q13, [x2], #32 //AES block 4, 5 - store result stp q14, q15, [x2], #32 //AES block 6, 7 - store result rev32 v4.16b, v30.16b //CTR block 12 add v30.4s, v30.4s, v31.4s //CTR block 12 b.ge .L128_enc_prepretail //do prepretail .L128_enc_main_loop: //main loop start rev32 v5.16b, v30.16b //CTR block 8k+13 ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h add v30.4s, v30.4s, v31.4s //CTR block 8k+13 rev64 v9.16b, v9.16b //GHASH block 8k+1 rev64 v8.16b, v8.16b //GHASH block 8k ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h8k | h7k rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) rev64 v11.16b, v11.16b //GHASH block 8k+3 ldp q26, q27, [x11, #0] //load rk0, rk1 eor v8.16b, v8.16b, v19.16b //PRE 1 rev32 v7.16b, v30.16b //CTR block 8k+15 rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free) pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high rev64 v10.16b, v10.16b //GHASH block 8k+2 pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h3l | h3h aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high add v30.4s, v30.4s, v31.4s //CTR block 8k+15 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b,v9.16b //GHASH block 8k+2, 8k+3 - high trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid ldp q28, q26, [x11, #32] //load rk2, rk3 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) ldp q27, q28, [x11, #64] //load rk4, rk5 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h1l | h1h pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 ldp q26, q27, [x11, #96] //load rk6, rk7 trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid ldr d16, [x10] //MODULO - load modulo constant pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 rev32 v20.16b, v30.16b //CTR block 8k+16 add v30.4s, v30.4s, v31.4s //CTR block 8k+16 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid ldp q28, q26, [x11, #128] //load rk8, rk9 .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 rev32 v22.16b, v30.16b //CTR block 8k+17 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext add v30.4s, v30.4s, v31.4s //CTR block 8k+17 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up ldr q27, [x11, #160] //load rk10 ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment rev32 v23.16b, v30.16b //CTR block 8k+18 add v30.4s, v30.4s, v31.4s //CTR block 8k+18 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 aese v2.16b, v26.16b //AES block 8k+10 - round 9 aese v4.16b, v26.16b //AES block 8k+12 - round 9 aese v1.16b, v26.16b //AES block 8k+9 - round 9 ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext rev32 v25.16b, v30.16b //CTR block 8k+19 add v30.4s, v30.4s, v31.4s //CTR block 8k+19 cmp x0, x5 //.LOOP CONTROL .inst 0xce046d8c //eor3 v12.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result aese v7.16b, v26.16b //AES block 8k+15 - round 9 aese v6.16b, v26.16b //AES block 8k+14 - round 9 aese v3.16b, v26.16b //AES block 8k+11 - round 9 .inst 0xce026d4a //eor3 v10.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result mov v2.16b, v23.16b //CTR block 8k+18 aese v0.16b, v26.16b //AES block 8k+8 - round 9 rev32 v4.16b, v30.16b //CTR block 8k+20 add v30.4s, v30.4s, v31.4s //CTR block 8k+20 .inst 0xce076def //eor3 v15.16b, v15.16b, v7.16b, v27.16b //AES block 7 - result aese v5.16b, v26.16b //AES block 8k+13 - round 9 pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low .inst 0xce016d29 //eor3 v9.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result .inst 0xce036d6b //eor3 v11.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result mov v3.16b, v25.16b //CTR block 8k+19 ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment .inst 0xce056dad //eor3 v13.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result mov v1.16b, v22.16b //CTR block 8k+17 .inst 0xce006d08 //eor3 v8.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result mov v0.16b, v20.16b //CTR block 8k+16 stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result .inst 0xce066dce //eor3 v14.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result .inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result b.lt .L128_enc_main_loop .L128_enc_prepretail: //PREPRETAIL rev32 v5.16b, v30.16b //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h rev64 v8.16b, v8.16b //GHASH block 8k rev64 v9.16b, v9.16b //GHASH block 8k+1 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h6k | h5k add v30.4s, v30.4s, v31.4s //CTR block 8k+13 rev64 v11.16b, v11.16b //GHASH block 8k+3 rev64 v10.16b, v10.16b //GHASH block 8k+2 eor v8.16b, v8.16b, v19.16b //PRE 1 rev32 v6.16b, v30.16b //CTR block 8k+14 pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid ldp q26, q27, [x11, #0] //load rk0, rk1 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free) eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid rev32 v7.16b, v30.16b //CTR block 8k+15 rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h4l | h4h ldp q28, q26, [x11, #32] //load rk2, rk3 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 ldp q27, q28, [x11, #64] //load rk4, rk5 ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h1l | h1h trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 add v30.4s, v30.4s, v31.4s //CTR block 8k+15 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high ldp q26, q27, [x11, #96] //load rk6, rk7 pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 ldr d16, [x10] //MODULO - load modulo constant aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up ldp q28, q26, [x11, #128] //load rk8, rk9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 .inst 0xce114a73 //eor3 v19.16b, v19.16b, v17.16b, v18.16b //MODULO - fold into low aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 ldr q27, [x11, #160] //load rk10 aese v6.16b, v26.16b //AES block 8k+14 - round 9 aese v2.16b, v26.16b //AES block 8k+10 - round 9 aese v0.16b, v26.16b //AES block 8k+8 - round 9 aese v1.16b, v26.16b //AES block 8k+9 - round 9 aese v3.16b, v26.16b //AES block 8k+11 - round 9 aese v5.16b, v26.16b //AES block 8k+13 - round 9 aese v4.16b, v26.16b //AES block 8k+12 - round 9 aese v7.16b, v26.16b //AES block 8k+15 - round 9 .L128_enc_tail: //TAIL sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldr q8, [x0], #16 //AES block 8k+8 - load plaintext mov v29.16b, v27.16b ldp q20, q21, [x6, #96] //load h5l | h5h .inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag ldp q22, q23, [x6, #128] //load h6l | h6h ldp q24, q25, [x6, #160] //load h8k | h7k cmp x5, #112 b.gt .L128_enc_blocks_more_than_7 mov v7.16b, v6.16b mov v6.16b, v5.16b movi v17.8b, #0 cmp x5, #96 sub v30.4s, v30.4s, v31.4s mov v5.16b, v4.16b mov v4.16b, v3.16b mov v3.16b, v2.16b mov v2.16b, v1.16b movi v19.8b, #0 movi v18.8b, #0 b.gt .L128_enc_blocks_more_than_6 mov v7.16b, v6.16b cmp x5, #80 sub v30.4s, v30.4s, v31.4s mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v3.16b mov v3.16b, v1.16b b.gt .L128_enc_blocks_more_than_5 cmp x5, #64 sub v30.4s, v30.4s, v31.4s mov v7.16b, v6.16b mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v1.16b b.gt .L128_enc_blocks_more_than_4 mov v7.16b, v6.16b sub v30.4s, v30.4s, v31.4s mov v6.16b, v5.16b mov v5.16b, v1.16b cmp x5, #48 b.gt .L128_enc_blocks_more_than_3 sub v30.4s, v30.4s, v31.4s mov v7.16b, v6.16b mov v6.16b, v1.16b cmp x5, #32 ldr q24, [x6, #64] //load h4k | h3k b.gt .L128_enc_blocks_more_than_2 cmp x5, #16 sub v30.4s, v30.4s, v31.4s mov v7.16b, v1.16b b.gt .L128_enc_blocks_more_than_1 ldr q21, [x6, #16] //load h2k | h1k sub v30.4s, v30.4s, v31.4s b .L128_enc_blocks_less_than_1 .L128_enc_blocks_more_than_7: //blocks left > 7 st1 { v9.16b}, [x2], #16 //AES final-7 block - store result rev64 v8.16b, v9.16b //GHASH final-7 block ldr q9, [x0], #16 //AES final-6 block - load plaintext eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-7 block - mid pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high ins v18.d[0], v24.d[1] //GHASH final-7 block - mid eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid movi v16.8b, #0 //supress further partial tag feed in .inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low .L128_enc_blocks_more_than_6: //blocks left > 6 st1 { v9.16b}, [x2], #16 //AES final-6 block - store result rev64 v8.16b, v9.16b //GHASH final-6 block ldr q9, [x0], #16 //AES final-5 block - load plaintext eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-6 block - mid .inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid movi v16.8b, #0 //supress further partial tag feed in pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high .L128_enc_blocks_more_than_5: //blocks left > 5 st1 { v9.16b}, [x2], #16 //AES final-5 block - store result rev64 v8.16b, v9.16b //GHASH final-5 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-5 block - mid ldr q9, [x0], #16 //AES final-4 block - load plaintext pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid ins v27.d[1], v27.d[0] //GHASH final-5 block - mid .inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low movi v16.8b, #0 //supress further partial tag feed in pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid .L128_enc_blocks_more_than_4: //blocks left > 4 st1 { v9.16b}, [x2], #16 //AES final-4 block - store result rev64 v8.16b, v9.16b //GHASH final-4 block ldr q9, [x0], #16 //AES final-3 block - load plaintext eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-4 block - mid movi v16.8b, #0 //supress further partial tag feed in pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low .inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid .L128_enc_blocks_more_than_3: //blocks left > 3 st1 { v9.16b}, [x2], #16 //AES final-3 block - store result ldr q25, [x6, #80] //load h4l | h4h rev64 v8.16b, v9.16b //GHASH final-3 block eor v8.16b, v8.16b, v16.16b //feed in partial tag movi v16.8b, #0 //supress further partial tag feed in ins v27.d[0], v8.d[1] //GHASH final-3 block - mid ldr q24, [x6, #64] //load h4k | h3k pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low ldr q9, [x0], #16 //AES final-2 block - load plaintext eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid ins v27.d[1], v27.d[0] //GHASH final-3 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low .inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high .L128_enc_blocks_more_than_2: //blocks left > 2 st1 { v9.16b}, [x2], #16 //AES final-2 block - store result rev64 v8.16b, v9.16b //GHASH final-2 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ldr q9, [x0], #16 //AES final-1 block - load plaintext ins v27.d[0], v8.d[1] //GHASH final-2 block - mid ldr q23, [x6, #48] //load h3l | h3h movi v16.8b, #0 //supress further partial tag feed in eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid .inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low .L128_enc_blocks_more_than_1: //blocks left > 1 st1 { v9.16b}, [x2], #16 //AES final-1 block - store result ldr q22, [x6, #32] //load h2l | h2h rev64 v8.16b, v9.16b //GHASH final-1 block ldr q9, [x0], #16 //AES final block - load plaintext eor v8.16b, v8.16b, v16.16b //feed in partial tag movi v16.8b, #0 //supress further partial tag feed in ins v27.d[0], v8.d[1] //GHASH final-1 block - mid .inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid ldr q21, [x6, #16] //load h2k | h1k ins v27.d[1], v27.d[0] //GHASH final-1 block - mid pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low .L128_enc_blocks_less_than_1: //blocks left <= 1 rev32 v30.16b, v30.16b str q30, [x16] //store the updated counter and x1, x1, #127 //bit_length %= 128 sub x1, x1, #128 //bit_length -= 128 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) mvn x7, xzr //temp0_x = 0xffffffffffffffff ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored and x1, x1, #127 //bit_length %= 128 lsr x7, x7, x1 //temp0_x is mask for top 64b of last block mvn x8, xzr //temp1_x = 0xffffffffffffffff cmp x1, #64 csel x13, x8, x7, lt csel x14, x7, xzr, lt mov v0.d[1], x14 mov v0.d[0], x13 //ctr0b is mask for last block and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits rev64 v8.16b, v9.16b //GHASH final block bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing st1 { v9.16b}, [x2] //store all 16B eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v16.d[0], v8.d[1] //GHASH final block - mid eor v16.8b, v16.8b, v8.8b //GHASH final block - mid ldr q20, [x6] //load h1l | h1h pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high eor v18.16b, v18.16b, v16.16b //GHASH final block - mid ldr d16, [x10] //MODULO - load modulo constant pmull v26.1q, v8.1d, v20.1d //GHASH final block - low eor v17.16b, v17.16b, v28.16b //GHASH final block - high eor v19.16b, v19.16b, v26.16b //GHASH final block - low ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment .inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b st1 { v19.16b }, [x3] mov x0, x9 ldp d10, d11, [sp, #16] ldp d12, d13, [sp, #32] ldp d14, d15, [sp, #48] ldp d8, d9, [sp], #80 ret .L128_enc_ret: mov w0, #0x0 ret .size aesv8_gcm_8x_enc_128,.-aesv8_gcm_8x_enc_128 .globl aesv8_gcm_8x_dec_128 .hidden aesv8_gcm_8x_dec_128 .type aesv8_gcm_8x_dec_128,%function .align 4 aesv8_gcm_8x_dec_128: AARCH64_VALID_CALL_TARGET cbz x1, .L128_dec_ret stp d8, d9, [sp, #-80]! lsr x9, x1, #3 mov x16, x4 mov x11, x5 stp d10, d11, [sp, #16] stp d12, d13, [sp, #32] stp d14, d15, [sp, #48] mov x5, #0xc200000000000000 stp x5, xzr, [sp, #64] add x10, sp, #64 mov x5, x9 ld1 { v0.16b}, [x16] //CTR block 0 ldp q26, q27, [x11, #0] //load rk0, rk1 sub x5, x5, #1 //byte_len - 1 mov x15, #0x100000000 //set up counter increment movi v31.16b, #0x0 mov v31.d[1], x15 ld1 { v19.16b}, [x3] ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b rev32 v30.16b, v0.16b //set up reversed counter aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 add v30.4s, v30.4s, v31.4s //CTR block 0 rev32 v1.16b, v30.16b //CTR block 1 add v30.4s, v30.4s, v31.4s //CTR block 1 and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) rev32 v2.16b, v30.16b //CTR block 2 add v30.4s, v30.4s, v31.4s //CTR block 2 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 rev32 v3.16b, v30.16b //CTR block 3 add v30.4s, v30.4s, v31.4s //CTR block 3 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 rev32 v4.16b, v30.16b //CTR block 4 add v30.4s, v30.4s, v31.4s //CTR block 4 rev32 v5.16b, v30.16b //CTR block 5 add v30.4s, v30.4s, v31.4s //CTR block 5 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 rev32 v6.16b, v30.16b //CTR block 6 add v30.4s, v30.4s, v31.4s //CTR block 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 0 rev32 v7.16b, v30.16b //CTR block 7 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 0 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 1 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 2 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 3 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 ldp q27, q28, [x11, #64] //load rk4, rk5 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 3 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 4 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 3 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 4 ldp q26, q27, [x11, #96] //load rk6, rk7 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 5 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 5 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 5 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 5 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 5 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 6 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 6 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 7 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 7 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 7 ldp q28, q26, [x11, #128] //load rk8, rk9 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 7 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 7 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 7 add x5, x5, x0 add v30.4s, v30.4s, v31.4s //CTR block 7 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 8 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 8 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 8 aese v0.16b, v26.16b //AES block 0 - round 9 aese v1.16b, v26.16b //AES block 1 - round 9 aese v6.16b, v26.16b //AES block 6 - round 9 ldr q27, [x11, #160] //load rk10 aese v4.16b, v26.16b //AES block 4 - round 9 aese v3.16b, v26.16b //AES block 3 - round 9 aese v2.16b, v26.16b //AES block 2 - round 9 aese v5.16b, v26.16b //AES block 5 - round 9 aese v7.16b, v26.16b //AES block 7 - round 9 add x4, x0, x1, lsr #3 //end_input_ptr cmp x0, x5 //check if we have <= 8 blocks b.ge .L128_dec_tail //handle tail ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext .inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 0 - result .inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 1 - result stp q0, q1, [x2], #32 //AES block 0, 1 - store result rev32 v0.16b, v30.16b //CTR block 8 add v30.4s, v30.4s, v31.4s //CTR block 8 ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext rev32 v1.16b, v30.16b //CTR block 9 add v30.4s, v30.4s, v31.4s //CTR block 9 ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext .inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 3 - result .inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 2 - result stp q2, q3, [x2], #32 //AES block 2, 3 - store result rev32 v2.16b, v30.16b //CTR block 10 add v30.4s, v30.4s, v31.4s //CTR block 10 .inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 6 - result rev32 v3.16b, v30.16b //CTR block 11 add v30.4s, v30.4s, v31.4s //CTR block 11 .inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 4 - result .inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 5 - result stp q4, q5, [x2], #32 //AES block 4, 5 - store result .inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 7 - result stp q6, q7, [x2], #32 //AES block 6, 7 - store result rev32 v4.16b, v30.16b //CTR block 12 cmp x0, x5 //check if we have <= 8 blocks add v30.4s, v30.4s, v31.4s //CTR block 12 b.ge .L128_dec_prepretail //do prepretail .L128_dec_main_loop: //main loop start ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h rev64 v9.16b, v9.16b //GHASH block 8k+1 rev64 v8.16b, v8.16b //GHASH block 8k ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 rev64 v14.16b, v14.16b //GHASH block 8k+6 ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h eor v8.16b, v8.16b, v19.16b //PRE 1 rev32 v5.16b, v30.16b //CTR block 8k+13 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 rev64 v10.16b, v10.16b //GHASH block 8k+2 rev64 v12.16b, v12.16b //GHASH block 8k+4 ldp q26, q27, [x11, #0] //load rk0, rk1 rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h8k | h7k pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high rev64 v11.16b, v11.16b //GHASH block 8k+3 rev32 v7.16b, v30.16b //CTR block 8k+15 trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid rev64 v13.16b, v13.16b //GHASH block 8k+5 pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high ldp q28, q26, [x11, #32] //load rk2, rk3 trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h4l | h4h pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h2l | h2h eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 rev64 v15.16b, v15.16b //GHASH block 8k+7 pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high ldp q27, q28, [x11, #64] //load rk4, rk5 pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid ldp q26, q27, [x11, #96] //load rk6, rk7 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 add v30.4s, v30.4s, v31.4s //CTR block 8k+15 .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 ldp q28, q26, [x11, #128] //load rk8, rk9 ldr d16, [x10] //MODULO - load modulo constant .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 rev32 v20.16b, v30.16b //CTR block 8k+16 .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid add v30.4s, v30.4s, v31.4s //CTR block 8k+16 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 rev32 v22.16b, v30.16b //CTR block 8k+17 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 add v30.4s, v30.4s, v31.4s //CTR block 8k+17 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 rev32 v23.16b, v30.16b //CTR block 8k+18 ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 add v30.4s, v30.4s, v31.4s //CTR block 8k+18 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 aese v0.16b, v26.16b //AES block 8k+8 - round 9 aese v1.16b, v26.16b //AES block 8k+9 - round 9 ldr q27, [x11, #160] //load rk10 aese v6.16b, v26.16b //AES block 8k+14 - round 9 pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low aese v2.16b, v26.16b //AES block 8k+10 - round 9 aese v7.16b, v26.16b //AES block 8k+15 - round 9 aese v4.16b, v26.16b //AES block 8k+12 - round 9 ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment rev32 v25.16b, v30.16b //CTR block 8k+19 add v30.4s, v30.4s, v31.4s //CTR block 8k+19 aese v3.16b, v26.16b //AES block 8k+11 - round 9 aese v5.16b, v26.16b //AES block 8k+13 - round 9 .inst 0xce016d21 //eor3 v1.16b, v9.16b, v1.16b, v27.16b //AES block 8k+9 - result .inst 0xce006d00 //eor3 v0.16b, v8.16b, v0.16b, v27.16b //AES block 8k+8 - result .inst 0xce076de7 //eor3 v7.16b, v15.16b, v7.16b, v27.16b //AES block 8k+15 - result .inst 0xce066dc6 //eor3 v6.16b, v14.16b, v6.16b, v27.16b //AES block 8k+14 - result .inst 0xce026d42 //eor3 v2.16b, v10.16b, v2.16b, v27.16b //AES block 8k+10 - result stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result mov v1.16b, v22.16b //CTR block 8k+17 .inst 0xce046d84 //eor3 v4.16b, v12.16b, v4.16b, v27.16b //AES block 8k+12 - result .inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low mov v0.16b, v20.16b //CTR block 8k+16 .inst 0xce036d63 //eor3 v3.16b, v11.16b, v3.16b, v27.16b //AES block 8k+11 - result cmp x0, x5 //.LOOP CONTROL stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result .inst 0xce056da5 //eor3 v5.16b, v13.16b, v5.16b, v27.16b //AES block 8k+13 - result mov v2.16b, v23.16b //CTR block 8k+18 stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result rev32 v4.16b, v30.16b //CTR block 8k+20 add v30.4s, v30.4s, v31.4s //CTR block 8k+20 stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result mov v3.16b, v25.16b //CTR block 8k+19 b.lt .L128_dec_main_loop .L128_dec_prepretail: //PREPRETAIL rev64 v11.16b, v11.16b //GHASH block 8k+3 ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 rev64 v8.16b, v8.16b //GHASH block 8k rev64 v10.16b, v10.16b //GHASH block 8k+2 rev32 v5.16b, v30.16b //CTR block 8k+13 ldp q26, q27, [x11, #0] //load rk0, rk1 ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h eor v8.16b, v8.16b, v19.16b //PRE 1 rev64 v9.16b, v9.16b //GHASH block 8k+1 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h rev64 v13.16b, v13.16b //GHASH block 8k+5 rev64 v12.16b, v12.16b //GHASH block 8k+4 rev64 v14.16b, v14.16b //GHASH block 8k+6 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h8k | h7k rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low rev32 v7.16b, v30.16b //CTR block 8k+15 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 ldp q28, q26, [x11, #32] //load rk2, rk3 .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h4l | h4h aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h2l | h2h .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid ldp q27, q28, [x11, #64] //load rk4, rk5 rev64 v15.16b, v15.16b //GHASH block 8k+7 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid ldp q26, q27, [x11, #96] //load rk6, rk7 .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 ldr d16, [x10] //MODULO - load modulo constant pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up ldp q28, q26, [x11, #128] //load rk8, rk9 pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid ldr q27, [x11, #160] //load rk10 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 aese v6.16b, v26.16b //AES block 8k+14 - round 9 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 .inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low add v30.4s, v30.4s, v31.4s //CTR block 8k+15 aese v2.16b, v26.16b //AES block 8k+10 - round 9 aese v3.16b, v26.16b //AES block 8k+11 - round 9 aese v5.16b, v26.16b //AES block 8k+13 - round 9 aese v0.16b, v26.16b //AES block 8k+8 - round 9 aese v4.16b, v26.16b //AES block 8k+12 - round 9 aese v1.16b, v26.16b //AES block 8k+9 - round 9 aese v7.16b, v26.16b //AES block 8k+15 - round 9 .L128_dec_tail: //TAIL mov v29.16b, v27.16b sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process cmp x5, #112 ldp q24, q25, [x6, #160] //load h8k | h7k ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext ldp q20, q21, [x6, #96] //load h5l | h5h ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag ldp q22, q23, [x6, #128] //load h6l | h6h .inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result b.gt .L128_dec_blocks_more_than_7 cmp x5, #96 mov v7.16b, v6.16b movi v19.8b, #0 movi v17.8b, #0 mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v3.16b mov v3.16b, v2.16b mov v2.16b, v1.16b movi v18.8b, #0 sub v30.4s, v30.4s, v31.4s b.gt .L128_dec_blocks_more_than_6 cmp x5, #80 sub v30.4s, v30.4s, v31.4s mov v7.16b, v6.16b mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v3.16b mov v3.16b, v1.16b b.gt .L128_dec_blocks_more_than_5 cmp x5, #64 mov v7.16b, v6.16b mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v1.16b sub v30.4s, v30.4s, v31.4s b.gt .L128_dec_blocks_more_than_4 sub v30.4s, v30.4s, v31.4s mov v7.16b, v6.16b mov v6.16b, v5.16b mov v5.16b, v1.16b cmp x5, #48 b.gt .L128_dec_blocks_more_than_3 sub v30.4s, v30.4s, v31.4s mov v7.16b, v6.16b cmp x5, #32 ldr q24, [x6, #64] //load h4k | h3k mov v6.16b, v1.16b b.gt .L128_dec_blocks_more_than_2 cmp x5, #16 mov v7.16b, v1.16b sub v30.4s, v30.4s, v31.4s b.gt .L128_dec_blocks_more_than_1 sub v30.4s, v30.4s, v31.4s ldr q21, [x6, #16] //load h2k | h1k b .L128_dec_blocks_less_than_1 .L128_dec_blocks_more_than_7: //blocks left > 7 rev64 v8.16b, v9.16b //GHASH final-7 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v18.d[0], v24.d[1] //GHASH final-7 block - mid pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low ins v27.d[0], v8.d[1] //GHASH final-7 block - mid movi v16.8b, #0 //supress further partial tag feed in ldr q9, [x0], #16 //AES final-6 block - load ciphertext eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high st1 { v12.16b}, [x2], #16 //AES final-7 block - store result .inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid .L128_dec_blocks_more_than_6: //blocks left > 6 rev64 v8.16b, v9.16b //GHASH final-6 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-6 block - mid eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low ldr q9, [x0], #16 //AES final-5 block - load ciphertext movi v16.8b, #0 //supress further partial tag feed in pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid st1 { v12.16b}, [x2], #16 //AES final-6 block - store result pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid .inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result .L128_dec_blocks_more_than_5: //blocks left > 5 rev64 v8.16b, v9.16b //GHASH final-5 block ldr q9, [x0], #16 //AES final-4 block - load ciphertext st1 { v12.16b}, [x2], #16 //AES final-5 block - store result eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-5 block - mid .inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid ins v27.d[1], v27.d[0] //GHASH final-5 block - mid pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low movi v16.8b, #0 //supress further partial tag feed in pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high .L128_dec_blocks_more_than_4: //blocks left > 4 rev64 v8.16b, v9.16b //GHASH final-4 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ldr q9, [x0], #16 //AES final-3 block - load ciphertext ins v27.d[0], v8.d[1] //GHASH final-4 block - mid movi v16.8b, #0 //supress further partial tag feed in pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high st1 { v12.16b}, [x2], #16 //AES final-4 block - store result eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid .inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid .L128_dec_blocks_more_than_3: //blocks left > 3 st1 { v12.16b}, [x2], #16 //AES final-3 block - store result rev64 v8.16b, v9.16b //GHASH final-3 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-3 block - mid ldr q25, [x6, #80] //load h4l | h4h ldr q24, [x6, #64] //load h4k | h3k eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid ldr q9, [x0], #16 //AES final-2 block - load ciphertext ins v27.d[1], v27.d[0] //GHASH final-3 block - mid pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high movi v16.8b, #0 //supress further partial tag feed in .inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid .L128_dec_blocks_more_than_2: //blocks left > 2 rev64 v8.16b, v9.16b //GHASH final-2 block st1 { v12.16b}, [x2], #16 //AES final-2 block - store result eor v8.16b, v8.16b, v16.16b //feed in partial tag ldr q23, [x6, #48] //load h3l | h3h movi v16.8b, #0 //supress further partial tag feed in ins v27.d[0], v8.d[1] //GHASH final-2 block - mid eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid ldr q9, [x0], #16 //AES final-1 block - load ciphertext eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low .inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high .L128_dec_blocks_more_than_1: //blocks left > 1 st1 { v12.16b}, [x2], #16 //AES final-1 block - store result rev64 v8.16b, v9.16b //GHASH final-1 block ldr q22, [x6, #32] //load h2l | h2h eor v8.16b, v8.16b, v16.16b //feed in partial tag movi v16.8b, #0 //supress further partial tag feed in ins v27.d[0], v8.d[1] //GHASH final-1 block - mid ldr q9, [x0], #16 //AES final block - load ciphertext pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high ldr q21, [x6, #16] //load h2k | h1k ins v27.d[1], v27.d[0] //GHASH final-1 block - mid .inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid .L128_dec_blocks_less_than_1: //blocks left <= 1 and x1, x1, #127 //bit_length %= 128 sub x1, x1, #128 //bit_length -= 128 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) mvn x7, xzr //temp0_x = 0xffffffffffffffff and x1, x1, #127 //bit_length %= 128 lsr x7, x7, x1 //temp0_x is mask for top 64b of last block cmp x1, #64 mvn x8, xzr //temp1_x = 0xffffffffffffffff csel x13, x8, x7, lt csel x14, x7, xzr, lt mov v0.d[1], x14 mov v0.d[0], x13 //ctr0b is mask for last block ldr q20, [x6] //load h1l | h1h ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits rev64 v8.16b, v9.16b //GHASH final block eor v8.16b, v8.16b, v16.16b //feed in partial tag pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high ins v16.d[0], v8.d[1] //GHASH final block - mid eor v17.16b, v17.16b, v28.16b //GHASH final block - high eor v16.8b, v16.8b, v8.8b //GHASH final block - mid bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid st1 { v12.16b}, [x2] //store all 16B pmull v26.1q, v8.1d, v20.1d //GHASH final block - low eor v18.16b, v18.16b, v16.16b //GHASH final block - mid ldr d16, [x10] //MODULO - load modulo constant eor v19.16b, v19.16b, v26.16b //GHASH final block - low eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up .inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment .inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b st1 { v19.16b }, [x3] rev32 v30.16b, v30.16b str q30, [x16] //store the updated counter mov x0, x9 ldp d10, d11, [sp, #16] ldp d12, d13, [sp, #32] ldp d14, d15, [sp, #48] ldp d8, d9, [sp], #80 ret .L128_dec_ret: mov w0, #0x0 ret .size aesv8_gcm_8x_dec_128,.-aesv8_gcm_8x_dec_128 .globl aesv8_gcm_8x_enc_192 .hidden aesv8_gcm_8x_enc_192 .type aesv8_gcm_8x_enc_192,%function .align 4 aesv8_gcm_8x_enc_192: AARCH64_VALID_CALL_TARGET cbz x1, .L192_enc_ret stp d8, d9, [sp, #-80]! lsr x9, x1, #3 mov x16, x4 mov x11, x5 stp d10, d11, [sp, #16] stp d12, d13, [sp, #32] stp d14, d15, [sp, #48] mov x5, #0xc200000000000000 stp x5, xzr, [sp, #64] add x10, sp, #64 mov x5, x9 ld1 { v0.16b}, [x16] //CTR block 0 mov x15, #0x100000000 //set up counter increment movi v31.16b, #0x0 mov v31.d[1], x15 rev32 v30.16b, v0.16b //set up reversed counter add v30.4s, v30.4s, v31.4s //CTR block 0 rev32 v1.16b, v30.16b //CTR block 1 add v30.4s, v30.4s, v31.4s //CTR block 1 rev32 v2.16b, v30.16b //CTR block 2 add v30.4s, v30.4s, v31.4s //CTR block 2 rev32 v3.16b, v30.16b //CTR block 3 add v30.4s, v30.4s, v31.4s //CTR block 3 rev32 v4.16b, v30.16b //CTR block 4 add v30.4s, v30.4s, v31.4s //CTR block 4 sub x5, x5, #1 //byte_len - 1 and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) rev32 v5.16b, v30.16b //CTR block 5 add v30.4s, v30.4s, v31.4s //CTR block 5 ldp q26, q27, [x11, #0] //load rk0, rk1 add x5, x5, x0 rev32 v6.16b, v30.16b //CTR block 6 add v30.4s, v30.4s, v31.4s //CTR block 6 rev32 v7.16b, v30.16b //CTR block 7 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 1 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 1 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 2 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 1 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 2 ldp q27, q28, [x11, #64] //load rk4, rk5 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 3 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 3 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 3 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 3 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 4 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 ldp q26, q27, [x11, #96] //load rk6, rk7 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 5 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 5 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 5 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 5 add v30.4s, v30.4s, v31.4s //CTR block 7 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 6 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 6 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 6 ldp q28, q26, [x11, #128] //load rk8, rk9 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 7 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 7 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 7 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 7 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 7 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 7 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 8 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 8 add x4, x0, x1, lsr #3 //end_input_ptr cmp x0, x5 //check if we have <= 8 blocks aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 9 ld1 { v19.16b}, [x3] ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b ldp q27, q28, [x11, #160] //load rk10, rk11 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 9 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 9 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 9 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 9 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 9 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 9 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 14 - round 10 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 11 - round 10 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 9 - round 10 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 13 - round 10 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 12 - round 10 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8 - round 10 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 10 - round 10 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 15 - round 10 aese v6.16b, v28.16b //AES block 14 - round 11 aese v3.16b, v28.16b //AES block 11 - round 11 aese v4.16b, v28.16b //AES block 12 - round 11 aese v7.16b, v28.16b //AES block 15 - round 11 ldr q26, [x11, #192] //load rk12 aese v1.16b, v28.16b //AES block 9 - round 11 aese v5.16b, v28.16b //AES block 13 - round 11 aese v2.16b, v28.16b //AES block 10 - round 11 aese v0.16b, v28.16b //AES block 8 - round 11 b.ge .L192_enc_tail //handle tail ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext .inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result rev32 v0.16b, v30.16b //CTR block 8 add v30.4s, v30.4s, v31.4s //CTR block 8 .inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result .inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result rev32 v1.16b, v30.16b //CTR block 9 add v30.4s, v30.4s, v31.4s //CTR block 9 .inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result .inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result .inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result stp q8, q9, [x2], #32 //AES block 0, 1 - store result .inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result rev32 v2.16b, v30.16b //CTR block 10 add v30.4s, v30.4s, v31.4s //CTR block 10 stp q10, q11, [x2], #32 //AES block 2, 3 - store result cmp x0, x5 //check if we have <= 8 blocks rev32 v3.16b, v30.16b //CTR block 11 add v30.4s, v30.4s, v31.4s //CTR block 11 .inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result stp q12, q13, [x2], #32 //AES block 4, 5 - store result rev32 v4.16b, v30.16b //CTR block 12 stp q14, q15, [x2], #32 //AES block 6, 7 - store result add v30.4s, v30.4s, v31.4s //CTR block 12 b.ge .L192_enc_prepretail //do prepretail .L192_enc_main_loop: //main loop start rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) ldp q26, q27, [x11, #0] //load rk0, rk1 rev64 v10.16b, v10.16b //GHASH block 8k+2 rev32 v5.16b, v30.16b //CTR block 8k+13 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 rev64 v8.16b, v8.16b //GHASH block 8k ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h rev64 v9.16b, v9.16b //GHASH block 8k+1 rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 eor v8.16b, v8.16b, v19.16b //PRE 1 rev64 v11.16b, v11.16b //GHASH block 8k+3 rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 rev32 v7.16b, v30.16b //CTR block 8k+15 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h8k | h7k pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 ldp q27, q28, [x11, #64] //load rk4, rk5 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h4l | h4h pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h2l | h2h ldp q26, q27, [x11, #96] //load rk6, rk7 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free) rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid ldp q28, q26, [x11, #128] //load rk8, rk9 pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 add v30.4s, v30.4s, v31.4s //CTR block 8k+15 ldr d16, [x10] //MODULO - load modulo constant .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 ldp q27, q28, [x11, #160] //load rk10, rk11 .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low rev32 v20.16b, v30.16b //CTR block 8k+16 add v30.4s, v30.4s, v31.4s //CTR block 8k+16 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid rev32 v22.16b, v30.16b //CTR block 8k+17 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 add v30.4s, v30.4s, v31.4s //CTR block 8k+17 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 ldr q26, [x11, #192] //load rk12 ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext aese v4.16b, v28.16b //AES block 8k+12 - round 11 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load plaintext ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load plaintext aese v2.16b, v28.16b //AES block 8k+10 - round 11 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 rev32 v23.16b, v30.16b //CTR block 8k+18 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 aese v5.16b, v28.16b //AES block 8k+13 - round 11 add v30.4s, v30.4s, v31.4s //CTR block 8k+18 aese v7.16b, v28.16b //AES block 8k+15 - round 11 aese v0.16b, v28.16b //AES block 8k+8 - round 11 .inst 0xce04698c //eor3 v12.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result aese v6.16b, v28.16b //AES block 8k+14 - round 11 aese v3.16b, v28.16b //AES block 8k+11 - round 11 aese v1.16b, v28.16b //AES block 8k+9 - round 11 rev32 v25.16b, v30.16b //CTR block 8k+19 add v30.4s, v30.4s, v31.4s //CTR block 8k+19 .inst 0xce0769ef //eor3 v15.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result .inst 0xce02694a //eor3 v10.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result .inst 0xce006908 //eor3 v8.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result mov v2.16b, v23.16b //CTR block 8k+18 .inst 0xce016929 //eor3 v9.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result mov v1.16b, v22.16b //CTR block 8k+17 stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment .inst 0xce0669ce //eor3 v14.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result mov v0.16b, v20.16b //CTR block 8k+16 rev32 v4.16b, v30.16b //CTR block 8k+20 add v30.4s, v30.4s, v31.4s //CTR block 8k+20 .inst 0xce0569ad //eor3 v13.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result .inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low .inst 0xce03696b //eor3 v11.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result mov v3.16b, v25.16b //CTR block 8k+19 stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result stp q12, q13, [x2], #32 //AES block 8k+12, 8k+13 - store result cmp x0, x5 //.LOOP CONTROL stp q14, q15, [x2], #32 //AES block 8k+14, 8k+15 - store result b.lt .L192_enc_main_loop .L192_enc_prepretail: //PREPRETAIL rev32 v5.16b, v30.16b //CTR block 8k+13 ldp q26, q27, [x11, #0] //load rk0, rk1 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h rev64 v8.16b, v8.16b //GHASH block 8k ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h8k | h7k rev64 v11.16b, v11.16b //GHASH block 8k+3 rev64 v10.16b, v10.16b //GHASH block 8k+2 ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h eor v8.16b, v8.16b, v19.16b //PRE 1 rev32 v7.16b, v30.16b //CTR block 8k+15 rev64 v9.16b, v9.16b //GHASH block 8k+1 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 rev64 v13.16b, v13.16b //GHASH block 8k+5 (t0, t1, t2 and t3 free) rev64 v14.16b, v14.16b //GHASH block 8k+6 (t0, t1, and t2 free) aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid ldp q27, q28, [x11, #64] //load rk4, rk5 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h4l | h4h aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h2l | h2h aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 rev64 v12.16b, v12.16b //GHASH block 8k+4 (t0, t1, and t2 free) aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 rev64 v15.16b, v15.16b //GHASH block 8k+7 (t0, t1, t2 and t3 free) ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 ldp q26, q27, [x11, #96] //load rk6, rk7 pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 ldr d16, [x10] //MODULO - load modulo constant aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 ldp q28, q26, [x11, #128] //load rk8, rk9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up ext v29.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 ldp q27, q28, [x11, #160] //load rk10, rk11 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low ldr q26, [x11, #192] //load rk12 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 .inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 aese v1.16b, v28.16b //AES block 8k+9 - round 11 aese v7.16b, v28.16b //AES block 8k+15 - round 11 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 aese v3.16b, v28.16b //AES block 8k+11 - round 11 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 add v30.4s, v30.4s, v31.4s //CTR block 8k+15 aese v2.16b, v28.16b //AES block 8k+10 - round 11 aese v0.16b, v28.16b //AES block 8k+8 - round 11 aese v6.16b, v28.16b //AES block 8k+14 - round 11 aese v4.16b, v28.16b //AES block 8k+12 - round 11 aese v5.16b, v28.16b //AES block 8k+13 - round 11 .L192_enc_tail: //TAIL ldp q20, q21, [x6, #96] //load h5l | h5h sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldr q8, [x0], #16 //AES block 8k+8 - l3ad plaintext ldp q24, q25, [x6, #160] //load h8k | h7k mov v29.16b, v26.16b ldp q22, q23, [x6, #128] //load h6l | h6h cmp x5, #112 .inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag b.gt .L192_enc_blocks_more_than_7 cmp x5, #96 mov v7.16b, v6.16b movi v17.8b, #0 mov v6.16b, v5.16b movi v19.8b, #0 sub v30.4s, v30.4s, v31.4s mov v5.16b, v4.16b mov v4.16b, v3.16b mov v3.16b, v2.16b mov v2.16b, v1.16b movi v18.8b, #0 b.gt .L192_enc_blocks_more_than_6 mov v7.16b, v6.16b cmp x5, #80 mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v3.16b mov v3.16b, v1.16b sub v30.4s, v30.4s, v31.4s b.gt .L192_enc_blocks_more_than_5 cmp x5, #64 sub v30.4s, v30.4s, v31.4s mov v7.16b, v6.16b mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v1.16b b.gt .L192_enc_blocks_more_than_4 mov v7.16b, v6.16b mov v6.16b, v5.16b mov v5.16b, v1.16b sub v30.4s, v30.4s, v31.4s cmp x5, #48 b.gt .L192_enc_blocks_more_than_3 mov v7.16b, v6.16b mov v6.16b, v1.16b sub v30.4s, v30.4s, v31.4s ldr q24, [x6, #64] //load h4k | h3k cmp x5, #32 b.gt .L192_enc_blocks_more_than_2 sub v30.4s, v30.4s, v31.4s cmp x5, #16 mov v7.16b, v1.16b b.gt .L192_enc_blocks_more_than_1 sub v30.4s, v30.4s, v31.4s ldr q21, [x6, #16] //load h2k | h1k b .L192_enc_blocks_less_than_1 .L192_enc_blocks_more_than_7: //blocks left > 7 st1 { v9.16b}, [x2], #16 //AES final-7 block - store result rev64 v8.16b, v9.16b //GHASH final-7 block ins v18.d[0], v24.d[1] //GHASH final-7 block - mid eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-7 block - mid ldr q9, [x0], #16 //AES final-6 block - load plaintext eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid movi v16.8b, #0 //supress further partial tag feed in pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid .inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result .L192_enc_blocks_more_than_6: //blocks left > 6 st1 { v9.16b}, [x2], #16 //AES final-6 block - store result rev64 v8.16b, v9.16b //GHASH final-6 block ldr q9, [x0], #16 //AES final-5 block - load plaintext eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-6 block - mid pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low .inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result movi v16.8b, #0 //supress further partial tag feed in pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid .L192_enc_blocks_more_than_5: //blocks left > 5 st1 { v9.16b}, [x2], #16 //AES final-5 block - store result rev64 v8.16b, v9.16b //GHASH final-5 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-5 block - mid ldr q9, [x0], #16 //AES final-4 block - load plaintext pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high ins v27.d[1], v27.d[0] //GHASH final-5 block - mid pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid .inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result movi v16.8b, #0 //supress further partial tag feed in eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid .L192_enc_blocks_more_than_4: //blocks left > 4 st1 { v9.16b}, [x2], #16 //AES final-4 block - store result rev64 v8.16b, v9.16b //GHASH final-4 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ldr q9, [x0], #16 //AES final-3 block - load plaintext pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high ins v27.d[0], v8.d[1] //GHASH final-4 block - mid pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid movi v16.8b, #0 //supress further partial tag feed in eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid .inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result .L192_enc_blocks_more_than_3: //blocks left > 3 ldr q24, [x6, #64] //load h4k | h3k st1 { v9.16b}, [x2], #16 //AES final-3 block - store result rev64 v8.16b, v9.16b //GHASH final-3 block eor v8.16b, v8.16b, v16.16b //feed in partial tag movi v16.8b, #0 //supress further partial tag feed in ldr q9, [x0], #16 //AES final-2 block - load plaintext ldr q25, [x6, #80] //load h4l | h4h ins v27.d[0], v8.d[1] //GHASH final-3 block - mid .inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid ins v27.d[1], v27.d[0] //GHASH final-3 block - mid pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high .L192_enc_blocks_more_than_2: //blocks left > 2 st1 { v9.16b}, [x2], #16 //AES final-2 block - store result rev64 v8.16b, v9.16b //GHASH final-2 block ldr q23, [x6, #48] //load h3l | h3h eor v8.16b, v8.16b, v16.16b //feed in partial tag ldr q9, [x0], #16 //AES final-1 block - load plaintext ins v27.d[0], v8.d[1] //GHASH final-2 block - mid eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high movi v16.8b, #0 //supress further partial tag feed in pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid .inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result .L192_enc_blocks_more_than_1: //blocks left > 1 ldr q22, [x6, #32] //load h1l | h1h st1 { v9.16b}, [x2], #16 //AES final-1 block - store result rev64 v8.16b, v9.16b //GHASH final-1 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-1 block - mid pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid ldr q9, [x0], #16 //AES final block - load plaintext ldr q21, [x6, #16] //load h2k | h1k ins v27.d[1], v27.d[0] //GHASH final-1 block - mid .inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid movi v16.8b, #0 //supress further partial tag feed in eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high .L192_enc_blocks_less_than_1: //blocks left <= 1 mvn x7, xzr //temp0_x = 0xffffffffffffffff and x1, x1, #127 //bit_length %= 128 sub x1, x1, #128 //bit_length -= 128 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) and x1, x1, #127 //bit_length %= 128 lsr x7, x7, x1 //temp0_x is mask for top 64b of last block cmp x1, #64 mvn x8, xzr //temp1_x = 0xffffffffffffffff csel x13, x8, x7, lt csel x14, x7, xzr, lt mov v0.d[1], x14 ldr q20, [x6] //load h1l | h1h ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored mov v0.d[0], x13 //ctr0b is mask for last block and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits rev64 v8.16b, v9.16b //GHASH final block bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing st1 { v9.16b}, [x2] //store all 16B eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v16.d[0], v8.d[1] //GHASH final block - mid pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high eor v17.16b, v17.16b, v28.16b //GHASH final block - high pmull v26.1q, v8.1d, v20.1d //GHASH final block - low eor v16.8b, v16.8b, v8.8b //GHASH final block - mid pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid eor v18.16b, v18.16b, v16.16b //GHASH final block - mid ldr d16, [x10] //MODULO - load modulo constant eor v19.16b, v19.16b, v26.16b //GHASH final block - low ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment rev32 v30.16b, v30.16b str q30, [x16] //store the updated counter .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment .inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b st1 { v19.16b }, [x3] mov x0, x9 //return sizes ldp d10, d11, [sp, #16] ldp d12, d13, [sp, #32] ldp d14, d15, [sp, #48] ldp d8, d9, [sp], #80 ret .L192_enc_ret: mov w0, #0x0 ret .size aesv8_gcm_8x_enc_192,.-aesv8_gcm_8x_enc_192 .globl aesv8_gcm_8x_dec_192 .hidden aesv8_gcm_8x_dec_192 .type aesv8_gcm_8x_dec_192,%function .align 4 aesv8_gcm_8x_dec_192: AARCH64_VALID_CALL_TARGET cbz x1, .L192_dec_ret stp d8, d9, [sp, #-80]! lsr x9, x1, #3 mov x16, x4 mov x11, x5 stp d10, d11, [sp, #16] stp d12, d13, [sp, #32] stp d14, d15, [sp, #48] mov x5, #0xc200000000000000 stp x5, xzr, [sp, #64] add x10, sp, #64 mov x5, x9 ld1 { v0.16b}, [x16] //CTR block 0 ld1 { v19.16b}, [x3] mov x15, #0x100000000 //set up counter increment movi v31.16b, #0x0 mov v31.d[1], x15 rev32 v30.16b, v0.16b //set up reversed counter add v30.4s, v30.4s, v31.4s //CTR block 0 rev32 v1.16b, v30.16b //CTR block 1 add v30.4s, v30.4s, v31.4s //CTR block 1 rev32 v2.16b, v30.16b //CTR block 2 add v30.4s, v30.4s, v31.4s //CTR block 2 rev32 v3.16b, v30.16b //CTR block 3 add v30.4s, v30.4s, v31.4s //CTR block 3 rev32 v4.16b, v30.16b //CTR block 4 add v30.4s, v30.4s, v31.4s //CTR block 4 rev32 v5.16b, v30.16b //CTR block 5 add v30.4s, v30.4s, v31.4s //CTR block 5 ldp q26, q27, [x11, #0] //load rk0, rk1 rev32 v6.16b, v30.16b //CTR block 6 add v30.4s, v30.4s, v31.4s //CTR block 6 rev32 v7.16b, v30.16b //CTR block 7 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 0 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 1 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 1 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 1 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 2 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 3 ldp q27, q28, [x11, #64] //load rk4, rk5 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 3 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 3 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 5 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 4 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 5 ldp q26, q27, [x11, #96] //load rk6, rk7 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 5 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 5 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 5 sub x5, x5, #1 //byte_len - 1 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 6 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 ldp q28, q26, [x11, #128] //load rk8, rk9 add v30.4s, v30.4s, v31.4s //CTR block 7 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 7 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 7 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 7 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 7 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 7 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 8 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 8 and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 8 add x4, x0, x1, lsr #3 //end_input_ptr aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 9 ld1 { v19.16b}, [x3] ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b ldp q27, q28, [x11, #160] //load rk10, rk11 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 9 add x5, x5, x0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 9 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 9 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 9 cmp x0, x5 //check if we have <= 8 blocks aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 9 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 9 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 10 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 10 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 10 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 10 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 10 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 10 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 10 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 10 ldr q26, [x11, #192] //load rk12 aese v0.16b, v28.16b //AES block 0 - round 11 aese v1.16b, v28.16b //AES block 1 - round 11 aese v4.16b, v28.16b //AES block 4 - round 11 aese v6.16b, v28.16b //AES block 6 - round 11 aese v5.16b, v28.16b //AES block 5 - round 11 aese v7.16b, v28.16b //AES block 7 - round 11 aese v2.16b, v28.16b //AES block 2 - round 11 aese v3.16b, v28.16b //AES block 3 - round 11 b.ge .L192_dec_tail //handle tail ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext .inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 1 - result .inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 0 - result stp q0, q1, [x2], #32 //AES block 0, 1 - store result rev32 v0.16b, v30.16b //CTR block 8 add v30.4s, v30.4s, v31.4s //CTR block 8 rev32 v1.16b, v30.16b //CTR block 9 add v30.4s, v30.4s, v31.4s //CTR block 9 .inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 3 - result .inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 2 - result stp q2, q3, [x2], #32 //AES block 2, 3 - store result ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext rev32 v2.16b, v30.16b //CTR block 10 add v30.4s, v30.4s, v31.4s //CTR block 10 .inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 4 - result rev32 v3.16b, v30.16b //CTR block 11 add v30.4s, v30.4s, v31.4s //CTR block 11 .inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 5 - result stp q4, q5, [x2], #32 //AES block 4, 5 - store result cmp x0, x5 //check if we have <= 8 blocks .inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 6 - result .inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 7 - result rev32 v4.16b, v30.16b //CTR block 12 add v30.4s, v30.4s, v31.4s //CTR block 12 stp q6, q7, [x2], #32 //AES block 6, 7 - store result b.ge .L192_dec_prepretail //do prepretail .L192_dec_main_loop: //main loop start rev64 v9.16b, v9.16b //GHASH block 8k+1 ldp q26, q27, [x11, #0] //load rk0, rk1 ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 rev64 v8.16b, v8.16b //GHASH block 8k rev32 v5.16b, v30.16b //CTR block 8k+13 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h rev64 v12.16b, v12.16b //GHASH block 8k+4 rev64 v11.16b, v11.16b //GHASH block 8k+3 eor v8.16b, v8.16b, v19.16b //PRE 1 rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 rev64 v13.16b, v13.16b //GHASH block 8k+5 rev32 v7.16b, v30.16b //CTR block 8k+15 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high ldp q28, q26, [x11, #32] //load rk2, rk3 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid rev64 v10.16b, v10.16b //GHASH block 8k+2 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h8k | h7k trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h4l | h4h aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 ldp q27, q28, [x11, #64] //load rk4, rk5 eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid add v30.4s, v30.4s, v31.4s //CTR block 8k+15 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h2l | h2h aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 ldp q26, q27, [x11, #96] //load rk6, rk7 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 rev64 v15.16b, v15.16b //GHASH block 8k+7 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 rev64 v14.16b, v14.16b //GHASH block 8k+6 ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 ldp q28, q26, [x11, #128] //load rk8, rk9 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid ldr d16, [x10] //MODULO - load modulo constant pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high rev32 v20.16b, v30.16b //CTR block 8k+16 add v30.4s, v30.4s, v31.4s //CTR block 8k+16 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 ldp q27, q28, [x11, #160] //load rk10, rk11 .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext rev32 v22.16b, v30.16b //CTR block 8k+17 pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid add v30.4s, v30.4s, v31.4s //CTR block 8k+17 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext rev32 v23.16b, v30.16b //CTR block 8k+18 add v30.4s, v30.4s, v31.4s //CTR block 8k+18 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 ldr q26, [x11, #192] //load rk12 ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 aese v0.16b, v28.16b //AES block 8k+8 - round 11 ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment aese v1.16b, v28.16b //AES block 8k+9 - round 11 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 aese v6.16b, v28.16b //AES block 8k+14 - round 11 aese v3.16b, v28.16b //AES block 8k+11 - round 11 .inst 0xce006900 //eor3 v0.16b, v8.16b, v0.16b, v26.16b //AES block 8k+8 - result rev32 v25.16b, v30.16b //CTR block 8k+19 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 aese v4.16b, v28.16b //AES block 8k+12 - round 11 aese v2.16b, v28.16b //AES block 8k+10 - round 11 add v30.4s, v30.4s, v31.4s //CTR block 8k+19 aese v7.16b, v28.16b //AES block 8k+15 - round 11 aese v5.16b, v28.16b //AES block 8k+13 - round 11 pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low .inst 0xce016921 //eor3 v1.16b, v9.16b, v1.16b, v26.16b //AES block 8k+9 - result stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result .inst 0xce036963 //eor3 v3.16b, v11.16b, v3.16b, v26.16b //AES block 8k+11 - result .inst 0xce026942 //eor3 v2.16b, v10.16b, v2.16b, v26.16b //AES block 8k+10 - result .inst 0xce0769e7 //eor3 v7.16b, v15.16b, v7.16b, v26.16b //AES block 8k+15 - result stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result .inst 0xce0569a5 //eor3 v5.16b, v13.16b, v5.16b, v26.16b //AES block 8k+13 - result .inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low mov v3.16b, v25.16b //CTR block 8k+19 .inst 0xce046984 //eor3 v4.16b, v12.16b, v4.16b, v26.16b //AES block 8k+12 - result stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result cmp x0, x5 //.LOOP CONTROL .inst 0xce0669c6 //eor3 v6.16b, v14.16b, v6.16b, v26.16b //AES block 8k+14 - result stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result mov v0.16b, v20.16b //CTR block 8k+16 mov v1.16b, v22.16b //CTR block 8k+17 mov v2.16b, v23.16b //CTR block 8k+18 rev32 v4.16b, v30.16b //CTR block 8k+20 add v30.4s, v30.4s, v31.4s //CTR block 8k+20 b.lt .L192_dec_main_loop .L192_dec_prepretail: //PREPRETAIL ldp q26, q27, [x11, #0] //load rk0, rk1 rev32 v5.16b, v30.16b //CTR block 8k+13 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h rev64 v8.16b, v8.16b //GHASH block 8k ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 rev64 v11.16b, v11.16b //GHASH block 8k+3 rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 eor v8.16b, v8.16b, v19.16b //PRE 1 rev64 v10.16b, v10.16b //GHASH block 8k+2 rev64 v9.16b, v9.16b //GHASH block 8k+1 ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h rev32 v7.16b, v30.16b //CTR block 8k+15 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h8k | h7k aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 rev64 v13.16b, v13.16b //GHASH block 8k+5 pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low ldp q27, q28, [x11, #64] //load rk4, rk5 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h4l | h4h pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h2l | h2h eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 rev64 v15.16b, v15.16b //GHASH block 8k+7 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid rev64 v12.16b, v12.16b //GHASH block 8k+4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 rev64 v14.16b, v14.16b //GHASH block 8k+6 ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 ldp q26, q27, [x11, #96] //load rk6, rk7 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 ldp q28, q26, [x11, #128] //load rk8, rk9 pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 ldr d16, [x10] //MODULO - load modulo constant .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 ldp q27, q28, [x11, #160] //load rk10, rk11 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low ldr q26, [x11, #192] //load rk12 ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 aese v0.16b, v28.16b //AES block 8k+8 - round 11 .inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low aese v5.16b, v28.16b //AES block 8k+13 - round 11 aese v2.16b, v28.16b //AES block 8k+10 - round 11 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 aese v6.16b, v28.16b //AES block 8k+14 - round 11 aese v4.16b, v28.16b //AES block 8k+12 - round 11 add v30.4s, v30.4s, v31.4s //CTR block 8k+15 aese v3.16b, v28.16b //AES block 8k+11 - round 11 aese v1.16b, v28.16b //AES block 8k+9 - round 11 aese v7.16b, v28.16b //AES block 8k+15 - round 11 .L192_dec_tail: //TAIL sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldp q20, q21, [x6, #96] //load h5l | h5h ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext ldp q24, q25, [x6, #160] //load h8k | h7k mov v29.16b, v26.16b ldp q22, q23, [x6, #128] //load h6l | h6h ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag .inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result cmp x5, #112 b.gt .L192_dec_blocks_more_than_7 mov v7.16b, v6.16b movi v17.8b, #0 sub v30.4s, v30.4s, v31.4s mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v3.16b cmp x5, #96 movi v19.8b, #0 mov v3.16b, v2.16b mov v2.16b, v1.16b movi v18.8b, #0 b.gt .L192_dec_blocks_more_than_6 mov v7.16b, v6.16b mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v3.16b mov v3.16b, v1.16b sub v30.4s, v30.4s, v31.4s cmp x5, #80 b.gt .L192_dec_blocks_more_than_5 mov v7.16b, v6.16b mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v1.16b cmp x5, #64 sub v30.4s, v30.4s, v31.4s b.gt .L192_dec_blocks_more_than_4 sub v30.4s, v30.4s, v31.4s mov v7.16b, v6.16b mov v6.16b, v5.16b mov v5.16b, v1.16b cmp x5, #48 b.gt .L192_dec_blocks_more_than_3 sub v30.4s, v30.4s, v31.4s mov v7.16b, v6.16b cmp x5, #32 mov v6.16b, v1.16b ldr q24, [x6, #64] //load h4k | h3k b.gt .L192_dec_blocks_more_than_2 sub v30.4s, v30.4s, v31.4s mov v7.16b, v1.16b cmp x5, #16 b.gt .L192_dec_blocks_more_than_1 sub v30.4s, v30.4s, v31.4s ldr q21, [x6, #16] //load h2k | h1k b .L192_dec_blocks_less_than_1 .L192_dec_blocks_more_than_7: //blocks left > 7 rev64 v8.16b, v9.16b //GHASH final-7 block ins v18.d[0], v24.d[1] //GHASH final-7 block - mid eor v8.16b, v8.16b, v16.16b //feed in partial tag pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high ins v27.d[0], v8.d[1] //GHASH final-7 block - mid ldr q9, [x0], #16 //AES final-6 block - load ciphertext pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid st1 { v12.16b}, [x2], #16 //AES final-7 block - store result .inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid movi v16.8b, #0 //supress further partial tag feed in .L192_dec_blocks_more_than_6: //blocks left > 6 rev64 v8.16b, v9.16b //GHASH final-6 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ldr q9, [x0], #16 //AES final-5 block - load ciphertext ins v27.d[0], v8.d[1] //GHASH final-6 block - mid eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid movi v16.8b, #0 //supress further partial tag feed in pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high st1 { v12.16b}, [x2], #16 //AES final-6 block - store result .inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low .L192_dec_blocks_more_than_5: //blocks left > 5 rev64 v8.16b, v9.16b //GHASH final-5 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-5 block - mid eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid ins v27.d[1], v27.d[0] //GHASH final-5 block - mid pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high ldr q9, [x0], #16 //AES final-4 block - load ciphertext eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low movi v16.8b, #0 //supress further partial tag feed in st1 { v12.16b}, [x2], #16 //AES final-5 block - store result eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid .inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result .L192_dec_blocks_more_than_4: //blocks left > 4 rev64 v8.16b, v9.16b //GHASH final-4 block eor v8.16b, v8.16b, v16.16b //feed in partial tag movi v16.8b, #0 //supress further partial tag feed in ldr q9, [x0], #16 //AES final-3 block - load ciphertext ins v27.d[0], v8.d[1] //GHASH final-4 block - mid pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid st1 { v12.16b}, [x2], #16 //AES final-4 block - store result pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high .inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high .L192_dec_blocks_more_than_3: //blocks left > 3 ldr q25, [x6, #80] //load h4l | h4h rev64 v8.16b, v9.16b //GHASH final-3 block ldr q9, [x0], #16 //AES final-2 block - load ciphertext eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-3 block - mid pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high movi v16.8b, #0 //supress further partial tag feed in pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low st1 { v12.16b}, [x2], #16 //AES final-3 block - store result eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid .inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low ldr q24, [x6, #64] //load h4k | h3k ins v27.d[1], v27.d[0] //GHASH final-3 block - mid pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid .L192_dec_blocks_more_than_2: //blocks left > 2 rev64 v8.16b, v9.16b //GHASH final-2 block ldr q23, [x6, #48] //load h3l | h3h eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-2 block - mid ldr q9, [x0], #16 //AES final-1 block - load ciphertext pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid movi v16.8b, #0 //supress further partial tag feed in eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low st1 { v12.16b}, [x2], #16 //AES final-2 block - store result eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid .inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result .L192_dec_blocks_more_than_1: //blocks left > 1 rev64 v8.16b, v9.16b //GHASH final-1 block ldr q9, [x0], #16 //AES final block - load ciphertext ldr q22, [x6, #32] //load h1l | h1h eor v8.16b, v8.16b, v16.16b //feed in partial tag movi v16.8b, #0 //supress further partial tag feed in ldr q21, [x6, #16] //load h2k | h1k pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low ins v27.d[0], v8.d[1] //GHASH final-1 block - mid st1 { v12.16b}, [x2], #16 //AES final-1 block - store result pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high .inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid ins v27.d[1], v27.d[0] //GHASH final-1 block - mid pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high .L192_dec_blocks_less_than_1: //blocks left <= 1 rev32 v30.16b, v30.16b and x1, x1, #127 //bit_length %= 128 sub x1, x1, #128 //bit_length -= 128 str q30, [x16] //store the updated counter neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) mvn x7, xzr //temp0_x = 0xffffffffffffffff and x1, x1, #127 //bit_length %= 128 mvn x8, xzr //temp1_x = 0xffffffffffffffff lsr x7, x7, x1 //temp0_x is mask for top 64b of last block cmp x1, #64 csel x13, x8, x7, lt csel x14, x7, xzr, lt ldr q20, [x6] //load h1l | h1h mov v0.d[1], x14 ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored mov v0.d[0], x13 //ctr0b is mask for last block and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing rev64 v8.16b, v9.16b //GHASH final block st1 { v12.16b}, [x2] //store all 16B eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v16.d[0], v8.d[1] //GHASH final block - mid pmull v26.1q, v8.1d, v20.1d //GHASH final block - low eor v16.8b, v16.8b, v8.8b //GHASH final block - mid pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high eor v19.16b, v19.16b, v26.16b //GHASH final block - low pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid eor v17.16b, v17.16b, v28.16b //GHASH final block - high eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up eor v18.16b, v18.16b, v16.16b //GHASH final block - mid ldr d16, [x10] //MODULO - load modulo constant pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up .inst 0xce115652 //eor3 v18.16b, v18.16b, v17.16b, v21.16b //MODULO - fold into mid pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment .inst 0xce124673 //eor3 v19.16b, v19.16b, v18.16b, v17.16b //MODULO - fold into low ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b st1 { v19.16b }, [x3] mov x0, x9 ldp d10, d11, [sp, #16] ldp d12, d13, [sp, #32] ldp d14, d15, [sp, #48] ldp d8, d9, [sp], #80 ret .L192_dec_ret: mov w0, #0x0 ret .size aesv8_gcm_8x_dec_192,.-aesv8_gcm_8x_dec_192 .globl aesv8_gcm_8x_enc_256 .hidden aesv8_gcm_8x_enc_256 .type aesv8_gcm_8x_enc_256,%function .align 4 aesv8_gcm_8x_enc_256: AARCH64_VALID_CALL_TARGET cbz x1, .L256_enc_ret stp d8, d9, [sp, #-80]! lsr x9, x1, #3 mov x16, x4 mov x11, x5 stp d10, d11, [sp, #16] stp d12, d13, [sp, #32] stp d14, d15, [sp, #48] mov x5, #0xc200000000000000 stp x5, xzr, [sp, #64] add x10, sp, #64 ld1 { v0.16b}, [x16] //CTR block 0 mov x5, x9 mov x15, #0x100000000 //set up counter increment movi v31.16b, #0x0 mov v31.d[1], x15 sub x5, x5, #1 //byte_len - 1 and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) add x5, x5, x0 rev32 v30.16b, v0.16b //set up reversed counter add v30.4s, v30.4s, v31.4s //CTR block 0 rev32 v1.16b, v30.16b //CTR block 1 add v30.4s, v30.4s, v31.4s //CTR block 1 rev32 v2.16b, v30.16b //CTR block 2 add v30.4s, v30.4s, v31.4s //CTR block 2 rev32 v3.16b, v30.16b //CTR block 3 add v30.4s, v30.4s, v31.4s //CTR block 3 rev32 v4.16b, v30.16b //CTR block 4 add v30.4s, v30.4s, v31.4s //CTR block 4 rev32 v5.16b, v30.16b //CTR block 5 add v30.4s, v30.4s, v31.4s //CTR block 5 ldp q26, q27, [x11, #0] //load rk0, rk1 rev32 v6.16b, v30.16b //CTR block 6 add v30.4s, v30.4s, v31.4s //CTR block 6 rev32 v7.16b, v30.16b //CTR block 7 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 1 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 1 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 1 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 2 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 2 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 3 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 ldp q27, q28, [x11, #64] //load rk4, rk5 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 3 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 3 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 3 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 4 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 5 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 5 ldp q26, q27, [x11, #96] //load rk6, rk7 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 5 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 5 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 5 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 6 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 6 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 6 ldp q28, q26, [x11, #128] //load rk8, rk9 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 7 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 7 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 7 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 7 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 7 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 8 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 8 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 8 ld1 { v19.16b}, [x3] ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b ldp q27, q28, [x11, #160] //load rk10, rk11 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 9 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 9 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 9 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 9 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 9 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 9 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 9 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 10 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 10 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 9 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 10 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 10 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 10 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 10 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 10 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 10 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 11 ldp q26, q27, [x11, #192] //load rk12, rk13 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 11 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 11 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 11 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 11 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 11 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 11 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 11 add v30.4s, v30.4s, v31.4s //CTR block 7 ldr q28, [x11, #224] //load rk14 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 12 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 12 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 12 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 12 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 12 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 12 aese v2.16b, v27.16b //AES block 2 - round 13 aese v1.16b, v27.16b //AES block 1 - round 13 aese v4.16b, v27.16b //AES block 4 - round 13 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 12 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 12 aese v0.16b, v27.16b //AES block 0 - round 13 aese v5.16b, v27.16b //AES block 5 - round 13 aese v6.16b, v27.16b //AES block 6 - round 13 aese v7.16b, v27.16b //AES block 7 - round 13 aese v3.16b, v27.16b //AES block 3 - round 13 add x4, x0, x1, lsr #3 //end_input_ptr cmp x0, x5 //check if we have <= 8 blocks b.ge .L256_enc_tail //handle tail ldp q8, q9, [x0], #32 //AES block 0, 1 - load plaintext ldp q10, q11, [x0], #32 //AES block 2, 3 - load plaintext .inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result rev32 v0.16b, v30.16b //CTR block 8 add v30.4s, v30.4s, v31.4s //CTR block 8 .inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result .inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result rev32 v1.16b, v30.16b //CTR block 9 add v30.4s, v30.4s, v31.4s //CTR block 9 ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext .inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result cmp x0, x5 //check if we have <= 8 blocks rev32 v2.16b, v30.16b //CTR block 10 add v30.4s, v30.4s, v31.4s //CTR block 10 stp q8, q9, [x2], #32 //AES block 0, 1 - store result stp q10, q11, [x2], #32 //AES block 2, 3 - store result rev32 v3.16b, v30.16b //CTR block 11 add v30.4s, v30.4s, v31.4s //CTR block 11 .inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result .inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result .inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result .inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result stp q12, q13, [x2], #32 //AES block 4, 5 - store result rev32 v4.16b, v30.16b //CTR block 12 stp q14, q15, [x2], #32 //AES block 6, 7 - store result add v30.4s, v30.4s, v31.4s //CTR block 12 b.ge .L256_enc_prepretail //do prepretail .L256_enc_main_loop: //main loop start ldp q26, q27, [x11, #0] //load rk0, rk1 rev32 v5.16b, v30.16b //CTR block 8k+13 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h8k | h7k rev64 v11.16b, v11.16b //GHASH block 8k+3 ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h rev64 v9.16b, v9.16b //GHASH block 8k+1 rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 rev64 v8.16b, v8.16b //GHASH block 8k rev64 v12.16b, v12.16b //GHASH block 8k+4 ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 rev32 v7.16b, v30.16b //CTR block 8k+15 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 eor v8.16b, v8.16b, v19.16b //PRE 1 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 rev64 v14.16b, v14.16b //GHASH block 8k+6 pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 ldp q27, q28, [x11, #64] //load rk4, rk5 rev64 v10.16b, v10.16b //GHASH block 8k+2 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high rev64 v13.16b, v13.16b //GHASH block 8k+5 pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h4l | h4h trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid ldp q26, q27, [x11, #96] //load rk6, rk7 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 rev64 v15.16b, v15.16b //GHASH block 8k+7 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 ldp q28, q26, [x11, #128] //load rk8, rk9 pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h2l | h2h aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 ldp q27, q28, [x11, #160] //load rk10, rk11 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low ldr d16, [x10] //MODULO - load modulo constant pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 add v30.4s, v30.4s, v31.4s //CTR block 8k+15 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high ldp q26, q27, [x11, #192] //load rk12, rk13 rev32 v20.16b, v30.16b //CTR block 8k+16 ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load plaintext aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 add v30.4s, v30.4s, v31.4s //CTR block 8k+16 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 12 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 rev32 v22.16b, v30.16b //CTR block 8k+17 add v30.4s, v30.4s, v31.4s //CTR block 8k+17 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 ldr q28, [x11, #224] //load rk14 aese v7.16b, v27.16b //AES block 8k+15 - round 13 ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load plaintext aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 ldp q12, q13, [x0], #32 //AES block 4, 5 - load plaintext ldp q14, q15, [x0], #32 //AES block 6, 7 - load plaintext aese v2.16b, v27.16b //AES block 8k+10 - round 13 aese v4.16b, v27.16b //AES block 8k+12 - round 13 rev32 v23.16b, v30.16b //CTR block 8k+18 add v30.4s, v30.4s, v31.4s //CTR block 8k+18 aese v5.16b, v27.16b //AES block 8k+13 - round 13 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 aese v3.16b, v27.16b //AES block 8k+11 - round 13 cmp x0, x5 //.LOOP CONTROL .inst 0xce02714a //eor3 v10.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result rev32 v25.16b, v30.16b //CTR block 8k+19 add v30.4s, v30.4s, v31.4s //CTR block 8k+19 aese v0.16b, v27.16b //AES block 8k+8 - round 13 aese v6.16b, v27.16b //AES block 8k+14 - round 13 .inst 0xce0571ad //eor3 v13.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low aese v1.16b, v27.16b //AES block 8k+9 - round 13 .inst 0xce04718c //eor3 v12.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result rev32 v4.16b, v30.16b //CTR block 8k+20 .inst 0xce03716b //eor3 v11.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result mov v3.16b, v25.16b //CTR block 8k+19 .inst 0xce017129 //eor3 v9.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result .inst 0xce007108 //eor3 v8.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result add v30.4s, v30.4s, v31.4s //CTR block 8k+20 stp q8, q9, [x2], #32 //AES block 8k+8, 8k+9 - store result mov v2.16b, v23.16b //CTR block 8k+18 .inst 0xce0771ef //eor3 v15.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result .inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low stp q10, q11, [x2], #32 //AES block 8k+10, 8k+11 - store result .inst 0xce0671ce //eor3 v14.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result mov v1.16b, v22.16b //CTR block 8k+17 stp q12, q13, [x2], #32 //AES block 4, 5 - store result stp q14, q15, [x2], #32 //AES block 6, 7 - store result mov v0.16b, v20.16b //CTR block 8k+16 b.lt .L256_enc_main_loop .L256_enc_prepretail: //PREPRETAIL rev32 v5.16b, v30.16b //CTR block 8k+13 ldp q26, q27, [x11, #0] //load rk0, rk1 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 rev64 v10.16b, v10.16b //GHASH block 8k+2 rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 rev64 v13.16b, v13.16b //GHASH block 8k+5 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h8k | h7k rev32 v7.16b, v30.16b //CTR block 8k+15 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 rev64 v8.16b, v8.16b //GHASH block 8k aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 rev64 v9.16b, v9.16b //GHASH block 8k+1 ldp q28, q26, [x11, #32] //load rk2, rk3 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 eor v8.16b, v8.16b, v19.16b //PRE 1 rev64 v11.16b, v11.16b //GHASH block 8k+3 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 ldp q27, q28, [x11, #64] //load rk4, rk5 trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high rev64 v14.16b, v14.16b //GHASH block 8k+6 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid rev64 v12.16b, v12.16b //GHASH block 8k+4 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 ldp q26, q27, [x11, #96] //load rk6, rk7 ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h4l | h4h pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 rev64 v15.16b, v15.16b //GHASH block 8k+7 trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h2l | h2h ldp q28, q26, [x11, #128] //load rk8, rk9 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high ldp q27, q28, [x11, #160] //load rk10, rk11 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid ldr d16, [x10] //MODULO - load modulo constant .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 ldp q26, q27, [x11, #192] //load rk12, rk13 ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 ldr q28, [x11, #224] //load rk14 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 add v30.4s, v30.4s, v31.4s //CTR block 8k+15 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 12 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 aese v0.16b, v27.16b //AES block 8k+8 - round 13 .inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low aese v5.16b, v27.16b //AES block 8k+13 - round 13 aese v1.16b, v27.16b //AES block 8k+9 - round 13 aese v3.16b, v27.16b //AES block 8k+11 - round 13 aese v4.16b, v27.16b //AES block 8k+12 - round 13 aese v7.16b, v27.16b //AES block 8k+15 - round 13 aese v2.16b, v27.16b //AES block 8k+10 - round 13 aese v6.16b, v27.16b //AES block 8k+14 - round 13 .L256_enc_tail: //TAIL ldp q24, q25, [x6, #160] //load h8l | h8h sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process ldr q8, [x0], #16 //AES block 8k+8 - load plaintext ldp q20, q21, [x6, #96] //load h5l | h5h ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag ldp q22, q23, [x6, #128] //load h6l | h6h mov v29.16b, v28.16b cmp x5, #112 .inst 0xce007509 //eor3 v9.16b, v8.16b, v0.16b, v29.16b //AES block 8k+8 - result b.gt .L256_enc_blocks_more_than_7 movi v19.8b, #0 mov v7.16b, v6.16b movi v17.8b, #0 mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v3.16b mov v3.16b, v2.16b sub v30.4s, v30.4s, v31.4s mov v2.16b, v1.16b movi v18.8b, #0 cmp x5, #96 b.gt .L256_enc_blocks_more_than_6 mov v7.16b, v6.16b mov v6.16b, v5.16b cmp x5, #80 mov v5.16b, v4.16b mov v4.16b, v3.16b mov v3.16b, v1.16b sub v30.4s, v30.4s, v31.4s b.gt .L256_enc_blocks_more_than_5 mov v7.16b, v6.16b sub v30.4s, v30.4s, v31.4s mov v6.16b, v5.16b mov v5.16b, v4.16b cmp x5, #64 mov v4.16b, v1.16b b.gt .L256_enc_blocks_more_than_4 cmp x5, #48 mov v7.16b, v6.16b mov v6.16b, v5.16b mov v5.16b, v1.16b sub v30.4s, v30.4s, v31.4s b.gt .L256_enc_blocks_more_than_3 cmp x5, #32 mov v7.16b, v6.16b ldr q24, [x6, #64] //load h4k | h3k mov v6.16b, v1.16b sub v30.4s, v30.4s, v31.4s b.gt .L256_enc_blocks_more_than_2 mov v7.16b, v1.16b sub v30.4s, v30.4s, v31.4s cmp x5, #16 b.gt .L256_enc_blocks_more_than_1 sub v30.4s, v30.4s, v31.4s ldr q21, [x6, #16] //load h2k | h1k b .L256_enc_blocks_less_than_1 .L256_enc_blocks_more_than_7: //blocks left > 7 st1 { v9.16b}, [x2], #16 //AES final-7 block - store result rev64 v8.16b, v9.16b //GHASH final-7 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ldr q9, [x0], #16 //AES final-6 block - load plaintext pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high ins v27.d[0], v8.d[1] //GHASH final-7 block - mid ins v18.d[0], v24.d[1] //GHASH final-7 block - mid movi v16.8b, #0 //supress further partial tag feed in eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid .inst 0xce017529 //eor3 v9.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low .L256_enc_blocks_more_than_6: //blocks left > 6 st1 { v9.16b}, [x2], #16 //AES final-6 block - store result rev64 v8.16b, v9.16b //GHASH final-6 block eor v8.16b, v8.16b, v16.16b //feed in partial tag pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low ins v27.d[0], v8.d[1] //GHASH final-6 block - mid pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high ldr q9, [x0], #16 //AES final-5 block - load plaintext eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid .inst 0xce027529 //eor3 v9.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result movi v16.8b, #0 //supress further partial tag feed in eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high .L256_enc_blocks_more_than_5: //blocks left > 5 st1 { v9.16b}, [x2], #16 //AES final-5 block - store result rev64 v8.16b, v9.16b //GHASH final-5 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-5 block - mid pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid ins v27.d[1], v27.d[0] //GHASH final-5 block - mid ldr q9, [x0], #16 //AES final-4 block - load plaintext pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid movi v16.8b, #0 //supress further partial tag feed in eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid .inst 0xce037529 //eor3 v9.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result .L256_enc_blocks_more_than_4: //blocks left > 4 st1 { v9.16b}, [x2], #16 //AES final-4 block - store result rev64 v8.16b, v9.16b //GHASH final-4 block ldr q9, [x0], #16 //AES final-3 block - load plaintext eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-4 block - mid pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high .inst 0xce047529 //eor3 v9.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid movi v16.8b, #0 //supress further partial tag feed in eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high .L256_enc_blocks_more_than_3: //blocks left > 3 st1 { v9.16b}, [x2], #16 //AES final-3 block - store result ldr q25, [x6, #80] //load h4l | h4h rev64 v8.16b, v9.16b //GHASH final-3 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-3 block - mid pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid ldr q24, [x6, #64] //load h4k | h3k ins v27.d[1], v27.d[0] //GHASH final-3 block - mid ldr q9, [x0], #16 //AES final-2 block - load plaintext pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low .inst 0xce057529 //eor3 v9.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result movi v16.8b, #0 //supress further partial tag feed in eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low .L256_enc_blocks_more_than_2: //blocks left > 2 ldr q23, [x6, #48] //load h3l | h3h st1 { v9.16b}, [x2], #16 //AES final-2 block - store result rev64 v8.16b, v9.16b //GHASH final-2 block ldr q9, [x0], #16 //AES final-1 block - load plaintext eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-2 block - mid movi v16.8b, #0 //supress further partial tag feed in pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high .inst 0xce067529 //eor3 v9.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low .L256_enc_blocks_more_than_1: //blocks left > 1 st1 { v9.16b}, [x2], #16 //AES final-1 block - store result ldr q22, [x6, #32] //load h2l | h2h rev64 v8.16b, v9.16b //GHASH final-1 block ldr q9, [x0], #16 //AES final block - load plaintext eor v8.16b, v8.16b, v16.16b //feed in partial tag movi v16.8b, #0 //supress further partial tag feed in ins v27.d[0], v8.d[1] //GHASH final-1 block - mid pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high .inst 0xce077529 //eor3 v9.16b, v9.16b, v7.16b, v29.16b //AES final block - result eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid ldr q21, [x6, #16] //load h2k | h1k eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low ins v27.d[1], v27.d[0] //GHASH final-1 block - mid pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid .L256_enc_blocks_less_than_1: //blocks left <= 1 and x1, x1, #127 //bit_length %= 128 sub x1, x1, #128 //bit_length -= 128 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) mvn x7, xzr //temp0_x = 0xffffffffffffffff and x1, x1, #127 //bit_length %= 128 lsr x7, x7, x1 //temp0_x is mask for top 64b of last block cmp x1, #64 mvn x8, xzr //temp1_x = 0xffffffffffffffff csel x14, x7, xzr, lt csel x13, x8, x7, lt mov v0.d[0], x13 //ctr0b is mask for last block ldr q20, [x6] //load h1l | h1h ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored mov v0.d[1], x14 and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits rev64 v8.16b, v9.16b //GHASH final block rev32 v30.16b, v30.16b bif v9.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing str q30, [x16] //store the updated counter eor v8.16b, v8.16b, v16.16b //feed in partial tag st1 { v9.16b}, [x2] //store all 16B ins v16.d[0], v8.d[1] //GHASH final block - mid pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high pmull v26.1q, v8.1d, v20.1d //GHASH final block - low eor v17.16b, v17.16b, v28.16b //GHASH final block - high eor v19.16b, v19.16b, v26.16b //GHASH final block - low eor v16.8b, v16.8b, v8.8b //GHASH final block - mid pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid eor v18.16b, v18.16b, v16.16b //GHASH final block - mid ldr d16, [x10] //MODULO - load modulo constant ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment .inst 0xce115673 //eor3 v19.16b, v19.16b, v17.16b, v21.16b //MODULO - fold into low ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b st1 { v19.16b }, [x3] mov x0, x9 //return sizes ldp d10, d11, [sp, #16] ldp d12, d13, [sp, #32] ldp d14, d15, [sp, #48] ldp d8, d9, [sp], #80 ret .L256_enc_ret: mov w0, #0x0 ret .size aesv8_gcm_8x_enc_256,.-aesv8_gcm_8x_enc_256 .globl aesv8_gcm_8x_dec_256 .hidden aesv8_gcm_8x_dec_256 .type aesv8_gcm_8x_dec_256,%function .align 4 aesv8_gcm_8x_dec_256: AARCH64_VALID_CALL_TARGET cbz x1, .L256_dec_ret stp d8, d9, [sp, #-80]! lsr x9, x1, #3 mov x16, x4 mov x11, x5 stp d10, d11, [sp, #16] stp d12, d13, [sp, #32] stp d14, d15, [sp, #48] mov x5, #0xc200000000000000 stp x5, xzr, [sp, #64] add x10, sp, #64 ld1 { v0.16b}, [x16] //CTR block 0 mov x15, #0x100000000 //set up counter increment movi v31.16b, #0x0 mov v31.d[1], x15 mov x5, x9 sub x5, x5, #1 //byte_len - 1 rev32 v30.16b, v0.16b //set up reversed counter add v30.4s, v30.4s, v31.4s //CTR block 0 rev32 v1.16b, v30.16b //CTR block 1 add v30.4s, v30.4s, v31.4s //CTR block 1 rev32 v2.16b, v30.16b //CTR block 2 add v30.4s, v30.4s, v31.4s //CTR block 2 ldp q26, q27, [x11, #0] //load rk0, rk1 rev32 v3.16b, v30.16b //CTR block 3 add v30.4s, v30.4s, v31.4s //CTR block 3 rev32 v4.16b, v30.16b //CTR block 4 add v30.4s, v30.4s, v31.4s //CTR block 4 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 rev32 v5.16b, v30.16b //CTR block 5 add v30.4s, v30.4s, v31.4s //CTR block 5 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 rev32 v6.16b, v30.16b //CTR block 6 add v30.4s, v30.4s, v31.4s //CTR block 6 rev32 v7.16b, v30.16b //CTR block 7 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 1 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 1 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 2 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 2 ldp q27, q28, [x11, #64] //load rk4, rk5 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 3 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 3 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 3 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 4 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 4 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 5 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 5 ldp q26, q27, [x11, #96] //load rk6, rk7 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 5 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 5 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 5 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 6 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 6 ldp q28, q26, [x11, #128] //load rk8, rk9 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 7 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 7 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 7 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 7 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 7 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 7 and x5, x5, #0xffffffffffffff80 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 8 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 8 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 9 ld1 { v19.16b}, [x3] ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b ldp q27, q28, [x11, #160] //load rk10, rk11 add x4, x0, x1, lsr #3 //end_input_ptr add x5, x5, x0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 9 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 9 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 9 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 9 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 9 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 9 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 9 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 4 - round 10 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 7 - round 10 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 5 - round 10 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 10 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 2 - round 10 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 10 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 6 - round 10 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 3 - round 10 ldp q26, q27, [x11, #192] //load rk12, rk13 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 0 - round 11 add v30.4s, v30.4s, v31.4s //CTR block 7 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 7 - round 11 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 3 - round 11 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 1 - round 11 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 5 - round 11 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 4 - round 11 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 2 - round 11 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 6 - round 11 ldr q28, [x11, #224] //load rk14 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 1 - round 12 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 4 - round 12 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 5 - round 12 cmp x0, x5 //check if we have <= 8 blocks aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 3 - round 12 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 12 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 6 - round 12 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 0 - round 12 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 7 - round 12 aese v5.16b, v27.16b //AES block 5 - round 13 aese v1.16b, v27.16b //AES block 1 - round 13 aese v2.16b, v27.16b //AES block 2 - round 13 aese v0.16b, v27.16b //AES block 0 - round 13 aese v4.16b, v27.16b //AES block 4 - round 13 aese v6.16b, v27.16b //AES block 6 - round 13 aese v3.16b, v27.16b //AES block 3 - round 13 aese v7.16b, v27.16b //AES block 7 - round 13 b.ge .L256_dec_tail //handle tail ldp q8, q9, [x0], #32 //AES block 0, 1 - load ciphertext ldp q10, q11, [x0], #32 //AES block 2, 3 - load ciphertext ldp q12, q13, [x0], #32 //AES block 4, 5 - load ciphertext ldp q14, q15, [x0], #32 //AES block 6, 7 - load ciphertext cmp x0, x5 //check if we have <= 8 blocks .inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 1 - result .inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 0 - result stp q0, q1, [x2], #32 //AES block 0, 1 - store result rev32 v0.16b, v30.16b //CTR block 8 add v30.4s, v30.4s, v31.4s //CTR block 8 .inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 3 - result .inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 5 - result .inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 4 - result rev32 v1.16b, v30.16b //CTR block 9 add v30.4s, v30.4s, v31.4s //CTR block 9 .inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 2 - result stp q2, q3, [x2], #32 //AES block 2, 3 - store result rev32 v2.16b, v30.16b //CTR block 10 add v30.4s, v30.4s, v31.4s //CTR block 10 .inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 6 - result rev32 v3.16b, v30.16b //CTR block 11 add v30.4s, v30.4s, v31.4s //CTR block 11 stp q4, q5, [x2], #32 //AES block 4, 5 - store result .inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 7 - result stp q6, q7, [x2], #32 //AES block 6, 7 - store result rev32 v4.16b, v30.16b //CTR block 12 add v30.4s, v30.4s, v31.4s //CTR block 12 b.ge .L256_dec_prepretail //do prepretail .L256_dec_main_loop: //main loop start rev32 v5.16b, v30.16b //CTR block 8k+13 ldp q26, q27, [x11, #0] //load rk0, rk1 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 rev64 v9.16b, v9.16b //GHASH block 8k+1 ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h rev32 v6.16b, v30.16b //CTR block 8k+14 add v30.4s, v30.4s, v31.4s //CTR block 8k+14 rev64 v8.16b, v8.16b //GHASH block 8k ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 rev64 v12.16b, v12.16b //GHASH block 8k+4 rev64 v11.16b, v11.16b //GHASH block 8k+3 rev32 v7.16b, v30.16b //CTR block 8k+15 rev64 v15.16b, v15.16b //GHASH block 8k+7 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 eor v8.16b, v8.16b, v19.16b //PRE 1 ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 rev64 v10.16b, v10.16b //GHASH block 8k+2 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 ldp q27, q28, [x11, #64] //load rk4, rk5 pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h8k | h7k eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low ldp q26, q27, [x11, #96] //load rk6, rk7 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid rev64 v13.16b, v13.16b //GHASH block 8k+5 pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h4l | h4h rev64 v14.16b, v14.16b //GHASH block 8k+6 eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 ldp q28, q26, [x11, #128] //load rk8, rk9 ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h2l | h2h .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 ldp q27, q28, [x11, #160] //load rk10, rk11 pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid add v30.4s, v30.4s, v31.4s //CTR block 8k+15 .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 ldp q8, q9, [x0], #32 //AES block 8k+8, 8k+9 - load ciphertext eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low rev32 v20.16b, v30.16b //CTR block 8k+16 ldr d16, [x10] //MODULO - load modulo constant add v30.4s, v30.4s, v31.4s //CTR block 8k+16 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 ldp q26, q27, [x11, #192] //load rk12, rk13 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid rev32 v22.16b, v30.16b //CTR block 8k+17 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 ldp q10, q11, [x0], #32 //AES block 8k+10, 8k+11 - load ciphertext aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 add v30.4s, v30.4s, v31.4s //CTR block 8k+17 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 rev32 v23.16b, v30.16b //CTR block 8k+18 add v30.4s, v30.4s, v31.4s //CTR block 8k+18 pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 ldr q28, [x11, #224] //load rk14 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 12 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 ldp q12, q13, [x0], #32 //AES block 8k+12, 8k+13 - load ciphertext aese v1.16b, v27.16b //AES block 8k+9 - round 13 aese v2.16b, v27.16b //AES block 8k+10 - round 13 ldp q14, q15, [x0], #32 //AES block 8k+14, 8k+15 - load ciphertext aese v0.16b, v27.16b //AES block 8k+8 - round 13 aese v5.16b, v27.16b //AES block 8k+13 - round 13 rev32 v25.16b, v30.16b //CTR block 8k+19 .inst 0xce027142 //eor3 v2.16b, v10.16b, v2.16b, v28.16b //AES block 8k+10 - result .inst 0xce017121 //eor3 v1.16b, v9.16b, v1.16b, v28.16b //AES block 8k+9 - result ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment aese v7.16b, v27.16b //AES block 8k+15 - round 13 add v30.4s, v30.4s, v31.4s //CTR block 8k+19 pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low aese v4.16b, v27.16b //AES block 8k+12 - round 13 .inst 0xce0571a5 //eor3 v5.16b, v13.16b, v5.16b, v28.16b //AES block 8k+13 - result .inst 0xce007100 //eor3 v0.16b, v8.16b, v0.16b, v28.16b //AES block 8k+8 - result aese v3.16b, v27.16b //AES block 8k+11 - round 13 stp q0, q1, [x2], #32 //AES block 8k+8, 8k+9 - store result mov v0.16b, v20.16b //CTR block 8k+16 .inst 0xce047184 //eor3 v4.16b, v12.16b, v4.16b, v28.16b //AES block 8k+12 - result .inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low .inst 0xce037163 //eor3 v3.16b, v11.16b, v3.16b, v28.16b //AES block 8k+11 - result stp q2, q3, [x2], #32 //AES block 8k+10, 8k+11 - store result mov v3.16b, v25.16b //CTR block 8k+19 mov v2.16b, v23.16b //CTR block 8k+18 aese v6.16b, v27.16b //AES block 8k+14 - round 13 mov v1.16b, v22.16b //CTR block 8k+17 stp q4, q5, [x2], #32 //AES block 8k+12, 8k+13 - store result .inst 0xce0771e7 //eor3 v7.16b, v15.16b, v7.16b, v28.16b //AES block 8k+15 - result .inst 0xce0671c6 //eor3 v6.16b, v14.16b, v6.16b, v28.16b //AES block 8k+14 - result rev32 v4.16b, v30.16b //CTR block 8k+20 add v30.4s, v30.4s, v31.4s //CTR block 8k+20 cmp x0, x5 //.LOOP CONTROL stp q6, q7, [x2], #32 //AES block 8k+14, 8k+15 - store result b.lt .L256_dec_main_loop .L256_dec_prepretail: //PREPRETAIL ldp q26, q27, [x11, #0] //load rk0, rk1 rev32 v5.16b, v30.16b //CTR block 8k+13 add v30.4s, v30.4s, v31.4s //CTR block 8k+13 rev64 v12.16b, v12.16b //GHASH block 8k+4 ldr q21, [x6, #112] //load h6k | h5k ldr q24, [x6, #160] //load h8k | h7k rev32 v6.16b, v30.16b //CTR block 8k+14 rev64 v8.16b, v8.16b //GHASH block 8k add v30.4s, v30.4s, v31.4s //CTR block 8k+14 ext v19.16b, v19.16b, v19.16b, #8 //PRE 0 ldr q23, [x6, #144] //load h7l | h7h ldr q25, [x6, #176] //load h8l | h8h rev64 v9.16b, v9.16b //GHASH block 8k+1 rev32 v7.16b, v30.16b //CTR block 8k+15 rev64 v10.16b, v10.16b //GHASH block 8k+2 ldr q20, [x6, #96] //load h5l | h5h ldr q22, [x6, #128] //load h6l | h6h aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 0 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 0 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 0 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 0 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 0 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 0 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 1 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 0 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 0 ldp q28, q26, [x11, #32] //load rk2, rk3 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 1 eor v8.16b, v8.16b, v19.16b //PRE 1 aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 1 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 1 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 1 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 1 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 1 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 1 pmull2 v16.1q, v9.2d, v23.2d //GHASH block 8k+1 - high trn1 v18.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid pmull v19.1q, v8.1d, v25.1d //GHASH block 8k - low rev64 v11.16b, v11.16b //GHASH block 8k+3 pmull v23.1q, v9.1d, v23.1d //GHASH block 8k+1 - low aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 2 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 2 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 2 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 2 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 2 pmull2 v17.1q, v8.2d, v25.2d //GHASH block 8k - high aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 2 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 3 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 3 rev64 v14.16b, v14.16b //GHASH block 8k+6 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 3 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 2 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 3 pmull2 v29.1q, v10.2d, v22.2d //GHASH block 8k+2 - high trn2 v8.2d, v9.2d, v8.2d //GHASH block 8k, 8k+1 - mid aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 2 ldp q27, q28, [x11, #64] //load rk4, rk5 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 3 pmull2 v9.1q, v11.2d, v20.2d //GHASH block 8k+3 - high aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 3 eor v17.16b, v17.16b, v16.16b //GHASH block 8k+1 - high eor v8.16b, v8.16b, v18.16b //GHASH block 8k, 8k+1 - mid aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 3 pmull v22.1q, v10.1d, v22.1d //GHASH block 8k+2 - low aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 3 .inst 0xce1d2631 //eor3 v17.16b, v17.16b, v29.16b, v9.16b //GHASH block 8k+2, 8k+3 - high trn1 v29.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid trn2 v10.2d, v11.2d, v10.2d //GHASH block 8k+2, 8k+3 - mid pmull2 v18.1q, v8.2d, v24.2d //GHASH block 8k - mid pmull v20.1q, v11.1d, v20.1d //GHASH block 8k+3 - low eor v19.16b, v19.16b, v23.16b //GHASH block 8k+1 - low pmull v24.1q, v8.1d, v24.1d //GHASH block 8k+1 - mid aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 4 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 4 .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+2, 8k+3 - low ldr q20, [x6] //load h1l | h1h ldr q22, [x6, #32] //load h2l | h2h aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 4 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 4 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 4 eor v18.16b, v18.16b, v24.16b //GHASH block 8k+1 - mid eor v10.16b, v10.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 5 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 4 aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 5 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 4 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 4 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 5 pmull2 v29.1q, v10.2d, v21.2d //GHASH block 8k+2 - mid aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 5 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 5 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 5 pmull v21.1q, v10.1d, v21.1d //GHASH block 8k+3 - mid aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 5 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 5 ldp q26, q27, [x11, #96] //load rk6, rk7 ldr q23, [x6, #48] //load h3l | h3h ldr q25, [x6, #80] //load h4l | h4h rev64 v15.16b, v15.16b //GHASH block 8k+7 rev64 v13.16b, v13.16b //GHASH block 8k+5 .inst 0xce157652 //eor3 v18.16b, v18.16b, v21.16b, v29.16b //GHASH block 8k+2, 8k+3 - mid trn1 v16.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 6 ldr q21, [x6, #16] //load h2k | h1k ldr q24, [x6, #64] //load h4k | h3k aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 6 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 6 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 6 pmull2 v8.1q, v12.2d, v25.2d //GHASH block 8k+4 - high pmull2 v10.1q, v13.2d, v23.2d //GHASH block 8k+5 - high pmull v25.1q, v12.1d, v25.1d //GHASH block 8k+4 - low trn2 v12.2d, v13.2d, v12.2d //GHASH block 8k+4, 8k+5 - mid pmull v23.1q, v13.1d, v23.1d //GHASH block 8k+5 - low trn1 v13.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 7 pmull2 v11.1q, v14.2d, v22.2d //GHASH block 8k+6 - high aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 6 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 6 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 6 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 6 ldp q28, q26, [x11, #128] //load rk8, rk9 pmull v22.1q, v14.1d, v22.1d //GHASH block 8k+6 - low aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 7 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 7 aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 7 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 7 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 7 .inst 0xce082a31 //eor3 v17.16b, v17.16b, v8.16b, v10.16b //GHASH block 8k+4, 8k+5 - high aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 7 trn2 v14.2d, v15.2d, v14.2d //GHASH block 8k+6, 8k+7 - mid aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 7 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 8 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 8 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 8 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 8 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 8 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 8 aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 8 aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 9 eor v12.16b, v12.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 9 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 9 eor v14.16b, v14.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 9 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 9 pmull2 v16.1q, v12.2d, v24.2d //GHASH block 8k+4 - mid aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 8 pmull v24.1q, v12.1d, v24.1d //GHASH block 8k+5 - mid pmull2 v12.1q, v15.2d, v20.2d //GHASH block 8k+7 - high pmull2 v13.1q, v14.2d, v21.2d //GHASH block 8k+6 - mid pmull v21.1q, v14.1d, v21.1d //GHASH block 8k+7 - mid pmull v20.1q, v15.1d, v20.1d //GHASH block 8k+7 - low ldp q27, q28, [x11, #160] //load rk10, rk11 .inst 0xce195e73 //eor3 v19.16b, v19.16b, v25.16b, v23.16b //GHASH block 8k+4, 8k+5 - low .inst 0xce184252 //eor3 v18.16b, v18.16b, v24.16b, v16.16b //GHASH block 8k+4, 8k+5 - mid aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 9 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 9 aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 9 .inst 0xce0b3231 //eor3 v17.16b, v17.16b, v11.16b, v12.16b //GHASH block 8k+6, 8k+7 - high .inst 0xce165273 //eor3 v19.16b, v19.16b, v22.16b, v20.16b //GHASH block 8k+6, 8k+7 - low ldr d16, [x10] //MODULO - load modulo constant .inst 0xce153652 //eor3 v18.16b, v18.16b, v21.16b, v13.16b //GHASH block 8k+6, 8k+7 - mid aese v4.16b, v27.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 10 aese v6.16b, v27.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 10 aese v5.16b, v27.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 10 aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 10 aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 10 aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 10 .inst 0xce114e52 //eor3 v18.16b, v18.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up aese v7.16b, v27.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 10 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 10 ldp q26, q27, [x11, #192] //load rk12, rk13 ext v21.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 11 aese v1.16b, v28.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 11 aese v0.16b, v28.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 11 pmull v29.1q, v17.1d, v16.1d //MODULO - top 64b align with mid aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 11 aese v7.16b, v28.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 11 aese v6.16b, v28.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 11 aese v4.16b, v28.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 11 aese v5.16b, v28.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 11 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 8k+11 - round 12 .inst 0xce1d5652 //eor3 v18.16b, v18.16b, v29.16b, v21.16b //MODULO - fold into mid aese v3.16b, v27.16b //AES block 8k+11 - round 13 aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 8k+10 - round 12 aese v6.16b, v26.16b aesmc v6.16b, v6.16b //AES block 8k+14 - round 12 pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low aese v4.16b, v26.16b aesmc v4.16b, v4.16b //AES block 8k+12 - round 12 aese v7.16b, v26.16b aesmc v7.16b, v7.16b //AES block 8k+15 - round 12 aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 8k+8 - round 12 ldr q28, [x11, #224] //load rk14 aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 8k+9 - round 12 aese v4.16b, v27.16b //AES block 8k+12 - round 13 ext v21.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment aese v5.16b, v26.16b aesmc v5.16b, v5.16b //AES block 8k+13 - round 12 aese v6.16b, v27.16b //AES block 8k+14 - round 13 aese v2.16b, v27.16b //AES block 8k+10 - round 13 aese v1.16b, v27.16b //AES block 8k+9 - round 13 aese v5.16b, v27.16b //AES block 8k+13 - round 13 .inst 0xce154673 //eor3 v19.16b, v19.16b, v21.16b, v17.16b //MODULO - fold into low add v30.4s, v30.4s, v31.4s //CTR block 8k+15 aese v7.16b, v27.16b //AES block 8k+15 - round 13 aese v0.16b, v27.16b //AES block 8k+8 - round 13 .L256_dec_tail: //TAIL ext v16.16b, v19.16b, v19.16b, #8 //prepare final partial tag sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process cmp x5, #112 ldr q9, [x0], #16 //AES block 8k+8 - load ciphertext ldp q24, q25, [x6, #160] //load h8k | h7k mov v29.16b, v28.16b ldp q20, q21, [x6, #96] //load h5l | h5h .inst 0xce00752c //eor3 v12.16b, v9.16b, v0.16b, v29.16b //AES block 8k+8 - result ldp q22, q23, [x6, #128] //load h6l | h6h b.gt .L256_dec_blocks_more_than_7 mov v7.16b, v6.16b sub v30.4s, v30.4s, v31.4s mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v3.16b movi v19.8b, #0 movi v17.8b, #0 movi v18.8b, #0 mov v3.16b, v2.16b cmp x5, #96 mov v2.16b, v1.16b b.gt .L256_dec_blocks_more_than_6 mov v7.16b, v6.16b mov v6.16b, v5.16b mov v5.16b, v4.16b cmp x5, #80 sub v30.4s, v30.4s, v31.4s mov v4.16b, v3.16b mov v3.16b, v1.16b b.gt .L256_dec_blocks_more_than_5 cmp x5, #64 mov v7.16b, v6.16b sub v30.4s, v30.4s, v31.4s mov v6.16b, v5.16b mov v5.16b, v4.16b mov v4.16b, v1.16b b.gt .L256_dec_blocks_more_than_4 sub v30.4s, v30.4s, v31.4s mov v7.16b, v6.16b cmp x5, #48 mov v6.16b, v5.16b mov v5.16b, v1.16b b.gt .L256_dec_blocks_more_than_3 ldr q24, [x6, #64] //load h4k | h3k sub v30.4s, v30.4s, v31.4s mov v7.16b, v6.16b cmp x5, #32 mov v6.16b, v1.16b b.gt .L256_dec_blocks_more_than_2 sub v30.4s, v30.4s, v31.4s mov v7.16b, v1.16b cmp x5, #16 b.gt .L256_dec_blocks_more_than_1 sub v30.4s, v30.4s, v31.4s ldr q21, [x6, #16] //load h2k | h1k b .L256_dec_blocks_less_than_1 .L256_dec_blocks_more_than_7: //blocks left > 7 rev64 v8.16b, v9.16b //GHASH final-7 block ldr q9, [x0], #16 //AES final-6 block - load ciphertext st1 { v12.16b}, [x2], #16 //AES final-7 block - store result ins v18.d[0], v24.d[1] //GHASH final-7 block - mid eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-7 block - mid .inst 0xce01752c //eor3 v12.16b, v9.16b, v1.16b, v29.16b //AES final-6 block - result pmull2 v17.1q, v8.2d, v25.2d //GHASH final-7 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-7 block - mid movi v16.8b, #0 //supress further partial tag feed in pmull v19.1q, v8.1d, v25.1d //GHASH final-7 block - low pmull v18.1q, v27.1d, v18.1d //GHASH final-7 block - mid .L256_dec_blocks_more_than_6: //blocks left > 6 rev64 v8.16b, v9.16b //GHASH final-6 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ldr q9, [x0], #16 //AES final-5 block - load ciphertext movi v16.8b, #0 //supress further partial tag feed in ins v27.d[0], v8.d[1] //GHASH final-6 block - mid st1 { v12.16b}, [x2], #16 //AES final-6 block - store result pmull2 v28.1q, v8.2d, v23.2d //GHASH final-6 block - high pmull v26.1q, v8.1d, v23.1d //GHASH final-6 block - low .inst 0xce02752c //eor3 v12.16b, v9.16b, v2.16b, v29.16b //AES final-5 block - result eor v19.16b, v19.16b, v26.16b //GHASH final-6 block - low eor v27.8b, v27.8b, v8.8b //GHASH final-6 block - mid pmull v27.1q, v27.1d, v24.1d //GHASH final-6 block - mid eor v18.16b, v18.16b, v27.16b //GHASH final-6 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-6 block - high .L256_dec_blocks_more_than_5: //blocks left > 5 rev64 v8.16b, v9.16b //GHASH final-5 block eor v8.16b, v8.16b, v16.16b //feed in partial tag pmull2 v28.1q, v8.2d, v22.2d //GHASH final-5 block - high ins v27.d[0], v8.d[1] //GHASH final-5 block - mid ldr q9, [x0], #16 //AES final-4 block - load ciphertext eor v27.8b, v27.8b, v8.8b //GHASH final-5 block - mid st1 { v12.16b}, [x2], #16 //AES final-5 block - store result pmull v26.1q, v8.1d, v22.1d //GHASH final-5 block - low ins v27.d[1], v27.d[0] //GHASH final-5 block - mid pmull2 v27.1q, v27.2d, v21.2d //GHASH final-5 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-5 block - high .inst 0xce03752c //eor3 v12.16b, v9.16b, v3.16b, v29.16b //AES final-4 block - result eor v19.16b, v19.16b, v26.16b //GHASH final-5 block - low eor v18.16b, v18.16b, v27.16b //GHASH final-5 block - mid movi v16.8b, #0 //supress further partial tag feed in .L256_dec_blocks_more_than_4: //blocks left > 4 rev64 v8.16b, v9.16b //GHASH final-4 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-4 block - mid ldr q9, [x0], #16 //AES final-3 block - load ciphertext movi v16.8b, #0 //supress further partial tag feed in pmull v26.1q, v8.1d, v20.1d //GHASH final-4 block - low pmull2 v28.1q, v8.2d, v20.2d //GHASH final-4 block - high eor v27.8b, v27.8b, v8.8b //GHASH final-4 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-4 block - high pmull v27.1q, v27.1d, v21.1d //GHASH final-4 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-4 block - low st1 { v12.16b}, [x2], #16 //AES final-4 block - store result eor v18.16b, v18.16b, v27.16b //GHASH final-4 block - mid .inst 0xce04752c //eor3 v12.16b, v9.16b, v4.16b, v29.16b //AES final-3 block - result .L256_dec_blocks_more_than_3: //blocks left > 3 ldr q25, [x6, #80] //load h4l | h4h rev64 v8.16b, v9.16b //GHASH final-3 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ldr q9, [x0], #16 //AES final-2 block - load ciphertext ldr q24, [x6, #64] //load h4k | h3k ins v27.d[0], v8.d[1] //GHASH final-3 block - mid st1 { v12.16b}, [x2], #16 //AES final-3 block - store result .inst 0xce05752c //eor3 v12.16b, v9.16b, v5.16b, v29.16b //AES final-2 block - result eor v27.8b, v27.8b, v8.8b //GHASH final-3 block - mid ins v27.d[1], v27.d[0] //GHASH final-3 block - mid pmull v26.1q, v8.1d, v25.1d //GHASH final-3 block - low pmull2 v28.1q, v8.2d, v25.2d //GHASH final-3 block - high movi v16.8b, #0 //supress further partial tag feed in pmull2 v27.1q, v27.2d, v24.2d //GHASH final-3 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-3 block - low eor v17.16b, v17.16b, v28.16b //GHASH final-3 block - high eor v18.16b, v18.16b, v27.16b //GHASH final-3 block - mid .L256_dec_blocks_more_than_2: //blocks left > 2 rev64 v8.16b, v9.16b //GHASH final-2 block ldr q23, [x6, #48] //load h3l | h3h ldr q9, [x0], #16 //AES final-1 block - load ciphertext eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-2 block - mid pmull v26.1q, v8.1d, v23.1d //GHASH final-2 block - low st1 { v12.16b}, [x2], #16 //AES final-2 block - store result .inst 0xce06752c //eor3 v12.16b, v9.16b, v6.16b, v29.16b //AES final-1 block - result eor v27.8b, v27.8b, v8.8b //GHASH final-2 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-2 block - low movi v16.8b, #0 //supress further partial tag feed in pmull v27.1q, v27.1d, v24.1d //GHASH final-2 block - mid pmull2 v28.1q, v8.2d, v23.2d //GHASH final-2 block - high eor v18.16b, v18.16b, v27.16b //GHASH final-2 block - mid eor v17.16b, v17.16b, v28.16b //GHASH final-2 block - high .L256_dec_blocks_more_than_1: //blocks left > 1 rev64 v8.16b, v9.16b //GHASH final-1 block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v27.d[0], v8.d[1] //GHASH final-1 block - mid ldr q22, [x6, #32] //load h2l | h2h eor v27.8b, v27.8b, v8.8b //GHASH final-1 block - mid ldr q9, [x0], #16 //AES final block - load ciphertext st1 { v12.16b}, [x2], #16 //AES final-1 block - store result ldr q21, [x6, #16] //load h2k | h1k pmull v26.1q, v8.1d, v22.1d //GHASH final-1 block - low ins v27.d[1], v27.d[0] //GHASH final-1 block - mid eor v19.16b, v19.16b, v26.16b //GHASH final-1 block - low .inst 0xce07752c //eor3 v12.16b, v9.16b, v7.16b, v29.16b //AES final block - result pmull2 v28.1q, v8.2d, v22.2d //GHASH final-1 block - high pmull2 v27.1q, v27.2d, v21.2d //GHASH final-1 block - mid movi v16.8b, #0 //supress further partial tag feed in eor v17.16b, v17.16b, v28.16b //GHASH final-1 block - high eor v18.16b, v18.16b, v27.16b //GHASH final-1 block - mid .L256_dec_blocks_less_than_1: //blocks left <= 1 ld1 { v26.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored mvn x7, xzr //temp0_x = 0xffffffffffffffff and x1, x1, #127 //bit_length %= 128 sub x1, x1, #128 //bit_length -= 128 rev32 v30.16b, v30.16b str q30, [x16] //store the updated counter neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) and x1, x1, #127 //bit_length %= 128 lsr x7, x7, x1 //temp0_x is mask for top 64b of last block cmp x1, #64 mvn x8, xzr //temp1_x = 0xffffffffffffffff csel x14, x7, xzr, lt csel x13, x8, x7, lt mov v0.d[0], x13 //ctr0b is mask for last block mov v0.d[1], x14 and v9.16b, v9.16b, v0.16b //possibly partial last block has zeroes in highest bits ldr q20, [x6] //load h1l | h1h bif v12.16b, v26.16b, v0.16b //insert existing bytes in top end of result before storing rev64 v8.16b, v9.16b //GHASH final block eor v8.16b, v8.16b, v16.16b //feed in partial tag ins v16.d[0], v8.d[1] //GHASH final block - mid pmull2 v28.1q, v8.2d, v20.2d //GHASH final block - high eor v16.8b, v16.8b, v8.8b //GHASH final block - mid pmull v26.1q, v8.1d, v20.1d //GHASH final block - low eor v17.16b, v17.16b, v28.16b //GHASH final block - high pmull v16.1q, v16.1d, v21.1d //GHASH final block - mid eor v18.16b, v18.16b, v16.16b //GHASH final block - mid ldr d16, [x10] //MODULO - load modulo constant eor v19.16b, v19.16b, v26.16b //GHASH final block - low pmull v21.1q, v17.1d, v16.1d //MODULO - top 64b align with mid eor v14.16b, v17.16b, v19.16b //MODULO - karatsuba tidy up ext v17.16b, v17.16b, v17.16b, #8 //MODULO - other top alignment st1 { v12.16b}, [x2] //store all 16B eor v18.16b, v18.16b, v14.16b //MODULO - karatsuba tidy up eor v21.16b, v17.16b, v21.16b //MODULO - fold into mid eor v18.16b, v18.16b, v21.16b //MODULO - fold into mid pmull v17.1q, v18.1d, v16.1d //MODULO - mid 64b align with low ext v18.16b, v18.16b, v18.16b, #8 //MODULO - other mid alignment eor v19.16b, v19.16b, v17.16b //MODULO - fold into low eor v19.16b, v19.16b, v18.16b //MODULO - fold into low ext v19.16b, v19.16b, v19.16b, #8 rev64 v19.16b, v19.16b st1 { v19.16b }, [x3] mov x0, x9 ldp d10, d11, [sp, #16] ldp d12, d13, [sp, #32] ldp d14, d15, [sp, #48] ldp d8, d9, [sp], #80 ret .L256_dec_ret: mov w0, #0x0 ret .size aesv8_gcm_8x_dec_256,.-aesv8_gcm_8x_dec_256 .byte 65,69,83,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,65,82,77,118,56,44,32,83,80,68,88,32,66,83,68,45,51,45,67,108,97,117,115,101,32,98,121,32,60,120,105,97,111,107,97,110,103,46,113,105,97,110,64,97,114,109,46,99,111,109,62,0 .align 2 .align 2 #endif #endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__)