/*
 * AArch64 (A64 + Advanced SIMD) polynomial helper routines.
 *
 * Syntax : GNU/Clang assembler, run through the C preprocessor.
 * Symbols: every entry point is exported twice, with and without a leading
 *          underscore.
 * Size   : every routine below processes exactly 256 32-bit lanes per
 *          polynomial (1024 bytes); the loop counters encode that size.
 * Return : all routines return through x30 with ret.
 */

#include "macros.inc"
#include "params.h"

/*
 * asm_10_to_32
 *   x0 = destination, 256 32-bit words are written
 *   x1 = source, 80 32-bit words are read
 * Splits the source bit stream into consecutive 10-bit fields, taken from
 * the least significant bits of each word upwards, and stores each field
 * zero-extended as one 32-bit word. Five source words give 16 fields.
 * Clobbers: x0, x1, w2-w6, x7.
 */
.align 2
.global PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32
.global _PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32
#ifndef __clang__
.type PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32, %function
#endif
PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32:
_PQCLEAN_DILITHIUM3_AARCH64_asm_10_to_32:

    mov     x7, #16                     /* 16 iterations x 16 fields = 256 */
_10_to_32_loop:

    /* word 0: three whole fields, 2 bits left over in w6 */
    ldr     w2, [x1], #4
    ubfx    w3, w2, #0, #10
    str     w3, [x0], #4
    ubfx    w4, w2, #10, #10
    str     w4, [x0], #4
    ubfx    w5, w2, #20, #10
    str     w5, [x0], #4
    lsr     w6, w2, #30

    /* word 1: 8 bits complete the carried field, 4 bits left over */
    ldr     w2, [x1], #4
    ubfx    w3, w2, #0, #8
    lsl     w3, w3, #2
    orr     w3, w3, w6
    str     w3, [x0], #4
    ubfx    w4, w2, #8, #10
    str     w4, [x0], #4
    ubfx    w5, w2, #18, #10
    str     w5, [x0], #4
    lsr     w6, w2, #28

    /* word 2: 6 bits complete the carried field, 6 bits left over */
    ldr     w2, [x1], #4
    ubfx    w3, w2, #0, #6
    lsl     w3, w3, #4
    orr     w3, w3, w6
    str     w3, [x0], #4
    ubfx    w4, w2, #6, #10
    str     w4, [x0], #4
    ubfx    w5, w2, #16, #10
    str     w5, [x0], #4
    lsr     w6, w2, #26

    /* word 3: 4 bits complete the carried field, 8 bits left over */
    ldr     w2, [x1], #4
    ubfx    w3, w2, #0, #4
    lsl     w3, w3, #6
    orr     w3, w3, w6
    str     w3, [x0], #4
    ubfx    w4, w2, #4, #10
    str     w4, [x0], #4
    ubfx    w5, w2, #14, #10
    str     w5, [x0], #4
    lsr     w6, w2, #24

    /* word 4: 2 bits complete the carried field, then three whole fields */
    ldr     w2, [x1], #4
    ubfx    w3, w2, #0, #2
    lsl     w3, w3, #8
    orr     w3, w3, w6
    str     w3, [x0], #4
    ubfx    w4, w2, #2, #10
    str     w4, [x0], #4
    ubfx    w5, w2, #12, #10
    str     w5, [x0], #4
    ubfx    w6, w2, #22, #10
    str     w6, [x0], #4

    sub     x7, x7, #1
    cbnz    x7, _10_to_32_loop

    ret

/*
 * asm_poly_reduce
 *   x0 = array of 256 signed 32-bit lanes, updated in place
 *   x1 = pointer to one 32-bit constant c (presumably the modulus Q;
 *        confirm against the caller)
 * Per lane: a = a - ((a + 2^22) >> 23) * c.
 * x1 is reused as the read pointer once c has been loaded; loads run one
 * batch (8 vectors) ahead of the stores.
 * Clobbers: x0, x1, w4, x16, v0-v7, v16-v24.
 */
.align 2
.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce
.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce
#ifndef __clang__
.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce, %function
#endif
PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce:
_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_reduce:

    ldr     w4, [x1]
    dup     v24.4S, w4                  /* v24 = c in every lane */

    add     x1, x0, #0                  /* x1 = read pointer, x0 = write pointer */

    ld1     {v0.4S}, [x1], #16
    ld1     {v1.4S}, [x1], #16
    ld1     {v2.4S}, [x1], #16
    ld1     {v3.4S}, [x1], #16

    ld1     {v4.4S}, [x1], #16
    srshr   v16.4S, v0.4S, #23
    ld1     {v5.4S}, [x1], #16
    srshr   v17.4S, v1.4S, #23
    ld1     {v6.4S}, [x1], #16
    srshr   v18.4S, v2.4S, #23
    ld1     {v7.4S}, [x1], #16
    srshr   v19.4S, v3.4S, #23

    srshr   v20.4S, v4.4S, #23
    mls     v0.4S, v16.4S, v24.4S
    srshr   v21.4S, v5.4S, #23
    mls     v1.4S, v17.4S, v24.4S
    srshr   v22.4S, v6.4S, #23
    mls     v2.4S, v18.4S, v24.4S
    srshr   v23.4S, v7.4S, #23
    mls     v3.4S, v19.4S, v24.4S

    mls     v4.4S, v20.4S, v24.4S
    st1     {v0.4S}, [x0], #16
    mls     v5.4S, v21.4S, v24.4S
    st1     {v1.4S}, [x0], #16
    mls     v6.4S, v22.4S, v24.4S
    st1     {v2.4S}, [x0], #16
    mls     v7.4S, v23.4S, v24.4S
    st1     {v3.4S}, [x0], #16

    mov     x16, #7                     /* 1 batch above + 7 here = 8 x 32 lanes */
_poly_reduce_loop:

    st1     {v4.4S}, [x0], #16
    ld1     {v0.4S}, [x1], #16
    st1     {v5.4S}, [x0], #16
    ld1     {v1.4S}, [x1], #16
    st1     {v6.4S}, [x0], #16
    ld1     {v2.4S}, [x1], #16
    st1     {v7.4S}, [x0], #16
    ld1     {v3.4S}, [x1], #16

    ld1     {v4.4S}, [x1], #16
    srshr   v16.4S, v0.4S, #23
    ld1     {v5.4S}, [x1], #16
    srshr   v17.4S, v1.4S, #23
    ld1     {v6.4S}, [x1], #16
    srshr   v18.4S, v2.4S, #23
    ld1     {v7.4S}, [x1], #16
    srshr   v19.4S, v3.4S, #23

    srshr   v20.4S, v4.4S, #23
    mls     v0.4S, v16.4S, v24.4S
    srshr   v21.4S, v5.4S, #23
    mls     v1.4S, v17.4S, v24.4S
    srshr   v22.4S, v6.4S, #23
    mls     v2.4S, v18.4S, v24.4S
    srshr   v23.4S, v7.4S, #23
    mls     v3.4S, v19.4S, v24.4S

    mls     v4.4S, v20.4S, v24.4S
    st1     {v0.4S}, [x0], #16
    mls     v5.4S, v21.4S, v24.4S
    st1     {v1.4S}, [x0], #16
    mls     v6.4S, v22.4S, v24.4S
    st1     {v2.4S}, [x0], #16
    mls     v7.4S, v23.4S, v24.4S
    st1     {v3.4S}, [x0], #16

    sub     x16, x16, #1
    cbnz    x16, _poly_reduce_loop

    st1     {v4.4S}, [x0], #16
    st1     {v5.4S}, [x0], #16
    st1     {v6.4S}, [x0], #16
    st1     {v7.4S}, [x0], #16

    ret

/*
 * asm_poly_caddq
 *   x0 = array of 256 signed 32-bit lanes, updated in place
 *   x1 = pointer to one 32-bit constant c (presumably the modulus Q;
 *        confirm against the caller)
 * Per lane: a = a - (a >> 31) * c, with an arithmetic shift, so c is added
 * to negative lanes and non-negative lanes are left unchanged.
 * Clobbers: x0, x1, w4, x16, v0-v7, v16-v24.
 */
.align 2
.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq
.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq
#ifndef __clang__
.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq, %function
#endif
PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq:
_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_caddq:

    ldr     w4, [x1]
    dup     v24.4S, w4                  /* v24 = c in every lane */

    add     x1, x0, #0                  /* x1 = read pointer, x0 = write pointer */

    ld1     {v0.4S}, [x1], #16
    ld1     {v1.4S}, [x1], #16
    ld1     {v2.4S}, [x1], #16
    ld1     {v3.4S}, [x1], #16

    ld1     {v4.4S}, [x1], #16
    sshr    v16.4S, v0.4S, #31
    ld1     {v5.4S}, [x1], #16
    sshr    v17.4S, v1.4S, #31
    ld1     {v6.4S}, [x1], #16
    sshr    v18.4S, v2.4S, #31
    ld1     {v7.4S}, [x1], #16
    sshr    v19.4S, v3.4S, #31

    sshr    v20.4S, v4.4S, #31
    mls     v0.4S, v16.4S, v24.4S
    sshr    v21.4S, v5.4S, #31
    mls     v1.4S, v17.4S, v24.4S
    sshr    v22.4S, v6.4S, #31
    mls     v2.4S, v18.4S, v24.4S
    sshr    v23.4S, v7.4S, #31
    mls     v3.4S, v19.4S, v24.4S

    mls     v4.4S, v20.4S, v24.4S
    st1     {v0.4S}, [x0], #16
    mls     v5.4S, v21.4S, v24.4S
    st1     {v1.4S}, [x0], #16
    mls     v6.4S, v22.4S, v24.4S
    st1     {v2.4S}, [x0], #16
    mls     v7.4S, v23.4S, v24.4S
    st1     {v3.4S}, [x0], #16

    mov     x16, #7                     /* 1 batch above + 7 here = 8 x 32 lanes */
_poly_caddq_loop:

    st1     {v4.4S}, [x0], #16
    ld1     {v0.4S}, [x1], #16
    st1     {v5.4S}, [x0], #16
    ld1     {v1.4S}, [x1], #16
    st1     {v6.4S}, [x0], #16
    ld1     {v2.4S}, [x1], #16
    st1     {v7.4S}, [x0], #16
    ld1     {v3.4S}, [x1], #16

    ld1     {v4.4S}, [x1], #16
    sshr    v16.4S, v0.4S, #31
    ld1     {v5.4S}, [x1], #16
    sshr    v17.4S, v1.4S, #31
    ld1     {v6.4S}, [x1], #16
    sshr    v18.4S, v2.4S, #31
    ld1     {v7.4S}, [x1], #16
    sshr    v19.4S, v3.4S, #31

    sshr    v20.4S, v4.4S, #31
    mls     v0.4S, v16.4S, v24.4S
    sshr    v21.4S, v5.4S, #31
    mls     v1.4S, v17.4S, v24.4S
    sshr    v22.4S, v6.4S, #31
    mls     v2.4S, v18.4S, v24.4S
    sshr    v23.4S, v7.4S, #31
    mls     v3.4S, v19.4S, v24.4S

    mls     v4.4S, v20.4S, v24.4S
    st1     {v0.4S}, [x0], #16
    mls     v5.4S, v21.4S, v24.4S
    st1     {v1.4S}, [x0], #16
    mls     v6.4S, v22.4S, v24.4S
    st1     {v2.4S}, [x0], #16
    mls     v7.4S, v23.4S, v24.4S
    st1     {v3.4S}, [x0], #16

    sub     x16, x16, #1
    cbnz    x16, _poly_caddq_loop

    st1     {v4.4S}, [x0], #16
    st1     {v5.4S}, [x0], #16
    st1     {v6.4S}, [x0], #16
    st1     {v7.4S}, [x0], #16

    ret

/*
 * asm_poly_freeze
 *   x0 = array of 256 signed 32-bit lanes, updated in place
 *   x1 = pointer to one 32-bit constant c (presumably the modulus Q;
 *        confirm against the caller)
 * Per lane, two steps back to back:
 *   a = a - ((a + 2^22) >> 23) * c
 *   a = a - (a >> 31) * c
 * Clobbers: x0, x1, w4, x16, v0-v7, v16-v24.
 */
.align 2
.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze
.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze
#ifndef __clang__
.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze, %function
#endif
PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze:
_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_freeze:

    ldr     w4, [x1]
    dup     v24.4S, w4                  /* v24 = c in every lane */

    add     x1, x0, #0                  /* x1 = read pointer, x0 = write pointer */

    ld1     {v0.4S}, [x1], #16
    ld1     {v1.4S}, [x1], #16
    ld1     {v2.4S}, [x1], #16
    ld1     {v3.4S}, [x1], #16

    ld1     {v4.4S}, [x1], #16
    srshr   v16.4S, v0.4S, #23
    ld1     {v5.4S}, [x1], #16
    srshr   v17.4S, v1.4S, #23
    ld1     {v6.4S}, [x1], #16
    srshr   v18.4S, v2.4S, #23
    ld1     {v7.4S}, [x1], #16
    srshr   v19.4S, v3.4S, #23

    srshr   v20.4S, v4.4S, #23
    mls     v0.4S, v16.4S, v24.4S
    srshr   v21.4S, v5.4S, #23
    mls     v1.4S, v17.4S, v24.4S
    srshr   v22.4S, v6.4S, #23
    mls     v2.4S, v18.4S, v24.4S
    srshr   v23.4S, v7.4S, #23
    mls     v3.4S, v19.4S, v24.4S

    mls     v4.4S, v20.4S, v24.4S
    sshr    v16.4S, v0.4S, #31
    mls     v5.4S, v21.4S, v24.4S
    sshr    v17.4S, v1.4S, #31
    mls     v6.4S, v22.4S, v24.4S
    sshr    v18.4S, v2.4S, #31
    mls     v7.4S, v23.4S, v24.4S
    sshr    v19.4S, v3.4S, #31

    sshr    v20.4S, v4.4S, #31
    mls     v0.4S, v16.4S, v24.4S
    sshr    v21.4S, v5.4S, #31
    mls     v1.4S, v17.4S, v24.4S
    sshr    v22.4S, v6.4S, #31
    mls     v2.4S, v18.4S, v24.4S
    sshr    v23.4S, v7.4S, #31
    mls     v3.4S, v19.4S, v24.4S

    mls     v4.4S, v20.4S, v24.4S
    st1     {v0.4S}, [x0], #16
    mls     v5.4S, v21.4S, v24.4S
    st1     {v1.4S}, [x0], #16
    mls     v6.4S, v22.4S, v24.4S
    st1     {v2.4S}, [x0], #16
    mls     v7.4S, v23.4S, v24.4S
    st1     {v3.4S}, [x0], #16

    /*
     * 1 batch above + 7 here = 8 batches x 32 lanes = 256 lanes, the same
     * size every other routine in this file uses. A count of 8 would run a
     * ninth batch and touch 128 bytes past the end of the array.
     */
    mov     x16, #7
_poly_freeze_loop:

    st1     {v4.4S}, [x0], #16
    ld1     {v0.4S}, [x1], #16
    st1     {v5.4S}, [x0], #16
    ld1     {v1.4S}, [x1], #16
    st1     {v6.4S}, [x0], #16
    ld1     {v2.4S}, [x1], #16
    st1     {v7.4S}, [x0], #16
    ld1     {v3.4S}, [x1], #16

    ld1     {v4.4S}, [x1], #16
    srshr   v16.4S, v0.4S, #23
    ld1     {v5.4S}, [x1], #16
    srshr   v17.4S, v1.4S, #23
    ld1     {v6.4S}, [x1], #16
    srshr   v18.4S, v2.4S, #23
    ld1     {v7.4S}, [x1], #16
    srshr   v19.4S, v3.4S, #23

    srshr   v20.4S, v4.4S, #23
    mls     v0.4S, v16.4S, v24.4S
    srshr   v21.4S, v5.4S, #23
    mls     v1.4S, v17.4S, v24.4S
    srshr   v22.4S, v6.4S, #23
    mls     v2.4S, v18.4S, v24.4S
    srshr   v23.4S, v7.4S, #23
    mls     v3.4S, v19.4S, v24.4S

    mls     v4.4S, v20.4S, v24.4S
    sshr    v16.4S, v0.4S, #31
    mls     v5.4S, v21.4S, v24.4S
    sshr    v17.4S, v1.4S, #31
    mls     v6.4S, v22.4S, v24.4S
    sshr    v18.4S, v2.4S, #31
    mls     v7.4S, v23.4S, v24.4S
    sshr    v19.4S, v3.4S, #31

    sshr    v20.4S, v4.4S, #31
    mls     v0.4S, v16.4S, v24.4S
    sshr    v21.4S, v5.4S, #31
    mls     v1.4S, v17.4S, v24.4S
    sshr    v22.4S, v6.4S, #31
    mls     v2.4S, v18.4S, v24.4S
    sshr    v23.4S, v7.4S, #31
    mls     v3.4S, v19.4S, v24.4S

    mls     v4.4S, v20.4S, v24.4S
    st1     {v0.4S}, [x0], #16
    mls     v5.4S, v21.4S, v24.4S
    st1     {v1.4S}, [x0], #16
    mls     v6.4S, v22.4S, v24.4S
    st1     {v2.4S}, [x0], #16
    mls     v7.4S, v23.4S, v24.4S
    st1     {v3.4S}, [x0], #16

    sub     x16, x16, #1
    cbnz    x16, _poly_freeze_loop

    st1     {v4.4S}, [x0], #16
    st1     {v5.4S}, [x0], #16
    st1     {v6.4S}, [x0], #16
    st1     {v7.4S}, [x0], #16

    ret

/*
 * asm_poly_power2round
 *   x0 = output, 256 lanes: a1 = (a - 1 + 2^12) >> 13
 *   x1 = output, 256 lanes: a0 = a - (a1 << 13)
 *   x2 = input,  256 signed 32-bit lanes a
 * v28 holds the constant 1 while the subtractions run and is then reused
 * as a result register, so the loop rebuilds it from w4 on every pass.
 * Clobbers: x0, x1, x2, w4, x16, v0-v7, v16-v31.
 */
.align 2
.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round
.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round
#ifndef __clang__
.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round, %function
#endif
PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round:
_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_power2round:

    mov     w4, #1
    dup     v28.4S, w4

    ld1     {v0.4S}, [x2], #16
    ld1     {v1.4S}, [x2], #16
    ld1     {v2.4S}, [x2], #16
    ld1     {v3.4S}, [x2], #16

    ld1     {v20.4S}, [x2], #16
    sub     v4.4S, v0.4S, v28.4S
    ld1     {v21.4S}, [x2], #16
    sub     v5.4S, v1.4S, v28.4S
    ld1     {v22.4S}, [x2], #16
    sub     v6.4S, v2.4S, v28.4S
    ld1     {v23.4S}, [x2], #16
    sub     v7.4S, v3.4S, v28.4S

    sub     v24.4S, v20.4S, v28.4S
    srshr   v16.4S, v4.4S, #13
    sub     v25.4S, v21.4S, v28.4S
    srshr   v17.4S, v5.4S, #13
    sub     v26.4S, v22.4S, v28.4S
    srshr   v18.4S, v6.4S, #13
    sub     v27.4S, v23.4S, v28.4S
    srshr   v19.4S, v7.4S, #13

    srshr   v28.4S, v24.4S, #13         /* last use of the constant is above */
    st1     {v16.4S}, [x0], #16
    srshr   v29.4S, v25.4S, #13
    st1     {v17.4S}, [x0], #16
    srshr   v30.4S, v26.4S, #13
    st1     {v18.4S}, [x0], #16
    srshr   v31.4S, v27.4S, #13
    st1     {v19.4S}, [x0], #16

    st1     {v28.4S}, [x0], #16
    shl     v4.4S, v16.4S, #13
    st1     {v29.4S}, [x0], #16
    shl     v5.4S, v17.4S, #13
    st1     {v30.4S}, [x0], #16
    shl     v6.4S, v18.4S, #13
    st1     {v31.4S}, [x0], #16
    shl     v7.4S, v19.4S, #13

    shl     v24.4S, v28.4S, #13
    sub     v16.4S, v0.4S, v4.4S
    shl     v25.4S, v29.4S, #13
    sub     v17.4S, v1.4S, v5.4S
    shl     v26.4S, v30.4S, #13
    sub     v18.4S, v2.4S, v6.4S
    shl     v27.4S, v31.4S, #13
    sub     v19.4S, v3.4S, v7.4S

    sub     v28.4S, v20.4S, v24.4S
    st1     {v16.4S}, [x1], #16
    sub     v29.4S, v21.4S, v25.4S
    st1     {v17.4S}, [x1], #16
    sub     v30.4S, v22.4S, v26.4S
    st1     {v18.4S}, [x1], #16
    sub     v31.4S, v23.4S, v27.4S
    st1     {v19.4S}, [x1], #16

    mov     x16, #7                     /* 1 batch above + 7 here = 8 x 32 lanes */
_poly_power2round_loop:

    st1     {v28.4S}, [x1], #16
    dup     v28.4S, w4                  /* rebuild the constant 1 */

    ld1     {v0.4S}, [x2], #16
    st1     {v29.4S}, [x1], #16
    ld1     {v1.4S}, [x2], #16
    st1     {v30.4S}, [x1], #16
    ld1     {v2.4S}, [x2], #16
    st1     {v31.4S}, [x1], #16
    ld1     {v3.4S}, [x2], #16

    ld1     {v20.4S}, [x2], #16
    sub     v4.4S, v0.4S, v28.4S
    ld1     {v21.4S}, [x2], #16
    sub     v5.4S, v1.4S, v28.4S
    ld1     {v22.4S}, [x2], #16
    sub     v6.4S, v2.4S, v28.4S
    ld1     {v23.4S}, [x2], #16
    sub     v7.4S, v3.4S, v28.4S

    sub     v24.4S, v20.4S, v28.4S
    srshr   v16.4S, v4.4S, #13
    sub     v25.4S, v21.4S, v28.4S
    srshr   v17.4S, v5.4S, #13
    sub     v26.4S, v22.4S, v28.4S
    srshr   v18.4S, v6.4S, #13
    sub     v27.4S, v23.4S, v28.4S
    srshr   v19.4S, v7.4S, #13

    srshr   v28.4S, v24.4S, #13
    st1     {v16.4S}, [x0], #16
    srshr   v29.4S, v25.4S, #13
    st1     {v17.4S}, [x0], #16
    srshr   v30.4S, v26.4S, #13
    st1     {v18.4S}, [x0], #16
    srshr   v31.4S, v27.4S, #13
    st1     {v19.4S}, [x0], #16

    st1     {v28.4S}, [x0], #16
    shl     v4.4S, v16.4S, #13
    st1     {v29.4S}, [x0], #16
    shl     v5.4S, v17.4S, #13
    st1     {v30.4S}, [x0], #16
    shl     v6.4S, v18.4S, #13
    st1     {v31.4S}, [x0], #16
    shl     v7.4S, v19.4S, #13

    shl     v24.4S, v28.4S, #13
    sub     v16.4S, v0.4S, v4.4S
    shl     v25.4S, v29.4S, #13
    sub     v17.4S, v1.4S, v5.4S
    shl     v26.4S, v30.4S, #13
    sub     v18.4S, v2.4S, v6.4S
    shl     v27.4S, v31.4S, #13
    sub     v19.4S, v3.4S, v7.4S

    sub     v28.4S, v20.4S, v24.4S
    st1     {v16.4S}, [x1], #16
    sub     v29.4S, v21.4S, v25.4S
    st1     {v17.4S}, [x1], #16
    sub     v30.4S, v22.4S, v26.4S
    st1     {v18.4S}, [x1], #16
    sub     v31.4S, v23.4S, v27.4S
    st1     {v19.4S}, [x1], #16

    sub     x16, x16, #1
    cbnz    x16, _poly_power2round_loop

    st1     {v28.4S}, [x1], #16
    st1     {v29.4S}, [x1], #16
    st1     {v30.4S}, [x1], #16
    st1     {v31.4S}, [x1], #16

    ret

/*
 * asm_poly_add
 *   x0 = output, 256 32-bit lanes
 *   x1 = first input, x2 = second input
 * Per lane: out = in1 + in2 (wrapping 32-bit add).
 * Clobbers: x0, x1, x2, x16, v0-v7, v16-v19.
 */
.align 2
.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add
.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add
#ifndef __clang__
.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add, %function
#endif
PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add:
_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_add:

    ld1     {v0.4S}, [x1], #16
    ld1     {v4.4S}, [x2], #16
    add     v16.4S, v0.4S, v4.4S
    ld1     {v1.4S}, [x1], #16
    ld1     {v5.4S}, [x2], #16
    add     v17.4S, v1.4S, v5.4S
    ld1     {v2.4S}, [x1], #16
    ld1     {v6.4S}, [x2], #16
    add     v18.4S, v2.4S, v6.4S
    ld1     {v3.4S}, [x1], #16
    ld1     {v7.4S}, [x2], #16
    add     v19.4S, v3.4S, v7.4S

    mov     x16, #15                    /* 1 batch above + 15 here = 16 x 16 lanes */
_poly_add_loop:

    st1     {v16.4S}, [x0], #16
    ld1     {v0.4S}, [x1], #16
    ld1     {v4.4S}, [x2], #16
    add     v16.4S, v0.4S, v4.4S
    st1     {v17.4S}, [x0], #16
    ld1     {v1.4S}, [x1], #16
    ld1     {v5.4S}, [x2], #16
    add     v17.4S, v1.4S, v5.4S
    st1     {v18.4S}, [x0], #16
    ld1     {v2.4S}, [x1], #16
    ld1     {v6.4S}, [x2], #16
    add     v18.4S, v2.4S, v6.4S
    st1     {v19.4S}, [x0], #16
    ld1     {v3.4S}, [x1], #16
    ld1     {v7.4S}, [x2], #16
    add     v19.4S, v3.4S, v7.4S

    sub     x16, x16, #1
    cbnz    x16, _poly_add_loop

    st1     {v16.4S}, [x0], #16
    st1     {v17.4S}, [x0], #16
    st1     {v18.4S}, [x0], #16
    st1     {v19.4S}, [x0], #16

    ret

/*
 * asm_poly_sub
 *   x0 = output, 256 32-bit lanes
 *   x1 = minuend input, x2 = subtrahend input
 * Per lane: out = in1 - in2 (wrapping 32-bit subtract).
 * Clobbers: x0, x1, x2, x16, v0-v7, v16-v19.
 */
.align 2
.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub
.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub
#ifndef __clang__
.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub, %function
#endif
PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub:
_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_sub:

    ld1     {v0.4S}, [x1], #16
    ld1     {v4.4S}, [x2], #16
    sub     v16.4S, v0.4S, v4.4S
    ld1     {v1.4S}, [x1], #16
    ld1     {v5.4S}, [x2], #16
    sub     v17.4S, v1.4S, v5.4S
    ld1     {v2.4S}, [x1], #16
    ld1     {v6.4S}, [x2], #16
    sub     v18.4S, v2.4S, v6.4S
    ld1     {v3.4S}, [x1], #16
    ld1     {v7.4S}, [x2], #16
    sub     v19.4S, v3.4S, v7.4S

    mov     x16, #15                    /* 1 batch above + 15 here = 16 x 16 lanes */
_poly_sub_loop:

    st1     {v16.4S}, [x0], #16
    ld1     {v0.4S}, [x1], #16
    ld1     {v4.4S}, [x2], #16
    sub     v16.4S, v0.4S, v4.4S
    st1     {v17.4S}, [x0], #16
    ld1     {v1.4S}, [x1], #16
    ld1     {v5.4S}, [x2], #16
    sub     v17.4S, v1.4S, v5.4S
    st1     {v18.4S}, [x0], #16
    ld1     {v2.4S}, [x1], #16
    ld1     {v6.4S}, [x2], #16
    sub     v18.4S, v2.4S, v6.4S
    st1     {v19.4S}, [x0], #16
    ld1     {v3.4S}, [x1], #16
    ld1     {v7.4S}, [x2], #16
    sub     v19.4S, v3.4S, v7.4S

    sub     x16, x16, #1
    cbnz    x16, _poly_sub_loop

    st1     {v16.4S}, [x0], #16
    st1     {v17.4S}, [x0], #16
    st1     {v18.4S}, [x0], #16
    st1     {v19.4S}, [x0], #16

    ret

/*
 * asm_poly_shiftl
 *   x0 = array of 256 32-bit lanes, updated in place
 * Per lane: a = a << 13.
 * Any incoming value in x1 is ignored; x1 is overwritten and used as the
 * read pointer.
 * Clobbers: x0, x1, x16, v0-v7, v16-v23.
 */
.align 2
.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl
.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl
#ifndef __clang__
.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl, %function
#endif
PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl:
_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_shiftl:

    add     x1, x0, #0                  /* x1 = read pointer, x0 = write pointer */

    ld1     {v0.4S}, [x1], #16
    shl     v16.4S, v0.4S, #13
    ld1     {v1.4S}, [x1], #16
    shl     v17.4S, v1.4S, #13
    ld1     {v2.4S}, [x1], #16
    shl     v18.4S, v2.4S, #13
    ld1     {v3.4S}, [x1], #16
    shl     v19.4S, v3.4S, #13
    ld1     {v4.4S}, [x1], #16
    shl     v20.4S, v4.4S, #13
    ld1     {v5.4S}, [x1], #16
    shl     v21.4S, v5.4S, #13
    ld1     {v6.4S}, [x1], #16
    shl     v22.4S, v6.4S, #13
    ld1     {v7.4S}, [x1], #16
    shl     v23.4S, v7.4S, #13

    mov     x16, #7                     /* 1 batch above + 7 here = 8 x 32 lanes */
_poly_shiftl_loop:

    st1     {v16.4S}, [x0], #16
    ld1     {v0.4S}, [x1], #16
    shl     v16.4S, v0.4S, #13
    st1     {v17.4S}, [x0], #16
    ld1     {v1.4S}, [x1], #16
    shl     v17.4S, v1.4S, #13
    st1     {v18.4S}, [x0], #16
    ld1     {v2.4S}, [x1], #16
    shl     v18.4S, v2.4S, #13
    st1     {v19.4S}, [x0], #16
    ld1     {v3.4S}, [x1], #16
    shl     v19.4S, v3.4S, #13
    st1     {v20.4S}, [x0], #16
    ld1     {v4.4S}, [x1], #16
    shl     v20.4S, v4.4S, #13
    st1     {v21.4S}, [x0], #16
    ld1     {v5.4S}, [x1], #16
    shl     v21.4S, v5.4S, #13
    st1     {v22.4S}, [x0], #16
    ld1     {v6.4S}, [x1], #16
    shl     v22.4S, v6.4S, #13
    st1     {v23.4S}, [x0], #16
    ld1     {v7.4S}, [x1], #16
    shl     v23.4S, v7.4S, #13

    sub     x16, x16, #1
    cbnz    x16, _poly_shiftl_loop

    st1     {v16.4S}, [x0], #16
    st1     {v17.4S}, [x0], #16
    st1     {v18.4S}, [x0], #16
    st1     {v19.4S}, [x0], #16
    st1     {v20.4S}, [x0], #16
    st1     {v21.4S}, [x0], #16
    st1     {v22.4S}, [x0], #16
    st1     {v23.4S}, [x0], #16

    ret

/*
 * asm_poly_pointwise_montgomery
 *   x0 = output, 256 32-bit lanes
 *   x1 = first input, x2 = second input
 *   x3 = pointer to two 32-bit constants: c0 at offset 0, c1 at offset 4
 *        (presumably Q and its inverse modulo 2^32; confirm against caller)
 * Per lane, with p = a * b as a signed 64-bit product:
 *   t   = low32(p) * c1            (32-bit wrapping multiply)
 *   out = high32(p - t * c0)
 * push_all / pop_all come from macros.inc; presumably they save and restore
 * the callee-saved registers touched here (w20, w21, v12-v15) - confirm.
 * Clobbers: x0, x1, x2, x16, v0-v7, v12-v27, v30, v31.
 */
.align 2
.global PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery
.global _PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery
#ifndef __clang__
.type PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery, %function
#endif
PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery:
_PQCLEAN_DILITHIUM3_AARCH64_asm_poly_pointwise_montgomery:

    push_all

    ldr     w20, [x3, #0]
    ldr     w21, [x3, #4]

    dup     v30.4S, w20                 /* v30 = c0 */
    dup     v31.4S, w21                 /* v31 = c1 */

    ld1     {v0.4S}, [x1], #16
    ld1     {v1.4S}, [x1], #16
    ld1     {v2.4S}, [x1], #16
    ld1     {v3.4S}, [x1], #16

    ld1     {v4.4S}, [x2], #16
    ld1     {v5.4S}, [x2], #16
    ld1     {v6.4S}, [x2], #16
    ld1     {v7.4S}, [x2], #16

    smull   v12.2D, v0.2S, v4.2S
    smull2  v16.2D, v0.4S, v4.4S
    smull   v13.2D, v1.2S, v5.2S
    smull2  v17.2D, v1.4S, v5.4S
    smull   v14.2D, v2.2S, v6.2S
    smull2  v18.2D, v2.4S, v6.4S
    smull   v15.2D, v3.2S, v7.2S
    smull2  v19.2D, v3.4S, v7.4S

    uzp1    v20.4S, v12.4S, v16.4S      /* low halves of the 64-bit products */
    uzp1    v21.4S, v13.4S, v17.4S
    uzp1    v22.4S, v14.4S, v18.4S
    uzp1    v23.4S, v15.4S, v19.4S

    mul     v24.4S, v20.4S, v31.4S
    mul     v25.4S, v21.4S, v31.4S
    mul     v26.4S, v22.4S, v31.4S
    mul     v27.4S, v23.4S, v31.4S

    smlsl   v12.2D, v24.2S, v30.2S
    smlsl2  v16.2D, v24.4S, v30.4S
    smlsl   v13.2D, v25.2S, v30.2S
    smlsl2  v17.2D, v25.4S, v30.4S
    smlsl   v14.2D, v26.2S, v30.2S
    smlsl2  v18.2D, v26.4S, v30.4S
    smlsl   v15.2D, v27.2S, v30.2S
    smlsl2  v19.2D, v27.4S, v30.4S

    uzp2    v24.4S, v12.4S, v16.4S      /* high halves = reduced results */
    uzp2    v25.4S, v13.4S, v17.4S
    uzp2    v26.4S, v14.4S, v18.4S
    uzp2    v27.4S, v15.4S, v19.4S

    mov     x16, #15                    /* 1 batch above + 15 here = 16 x 16 lanes */
_poly_pointwise_montgomery_loop:

    st1     {v24.4S}, [x0], #16
    ld1     {v0.4S}, [x1], #16
    st1     {v25.4S}, [x0], #16
    ld1     {v1.4S}, [x1], #16
    st1     {v26.4S}, [x0], #16
    ld1     {v2.4S}, [x1], #16
    st1     {v27.4S}, [x0], #16
    ld1     {v3.4S}, [x1], #16

    ld1     {v4.4S}, [x2], #16
    ld1     {v5.4S}, [x2], #16
    ld1     {v6.4S}, [x2], #16
    ld1     {v7.4S}, [x2], #16

    smull   v12.2D, v0.2S, v4.2S
    smull2  v16.2D, v0.4S, v4.4S
    smull   v13.2D, v1.2S, v5.2S
    smull2  v17.2D, v1.4S, v5.4S
    smull   v14.2D, v2.2S, v6.2S
    smull2  v18.2D, v2.4S, v6.4S
    smull   v15.2D, v3.2S, v7.2S
    smull2  v19.2D, v3.4S, v7.4S

    uzp1    v20.4S, v12.4S, v16.4S
    uzp1    v21.4S, v13.4S, v17.4S
    uzp1    v22.4S, v14.4S, v18.4S
    uzp1    v23.4S, v15.4S, v19.4S

    mul     v24.4S, v20.4S, v31.4S
    mul     v25.4S, v21.4S, v31.4S
    mul     v26.4S, v22.4S, v31.4S
    mul     v27.4S, v23.4S, v31.4S

    smlsl   v12.2D, v24.2S, v30.2S
    smlsl2  v16.2D, v24.4S, v30.4S
    smlsl   v13.2D, v25.2S, v30.2S
    smlsl2  v17.2D, v25.4S, v30.4S
    smlsl   v14.2D, v26.2S, v30.2S
    smlsl2  v18.2D, v26.4S, v30.4S
    smlsl   v15.2D, v27.2S, v30.2S
    smlsl2  v19.2D, v27.4S, v30.4S

    uzp2    v24.4S, v12.4S, v16.4S
    uzp2    v25.4S, v13.4S, v17.4S
    uzp2    v26.4S, v14.4S, v18.4S
    uzp2    v27.4S, v15.4S, v19.4S

    sub     x16, x16, #1
    cbnz    x16, _poly_pointwise_montgomery_loop

    st1     {v24.4S}, [x0], #16
    st1     {v25.4S}, [x0], #16
    st1     {v26.4S}, [x0], #16
    st1     {v27.4S}, [x0], #16

    pop_all

    ret

/*
 * asm_polyvecl_pointwise_acc_montgomery
 *   x0 = output, 256 32-bit lanes
 *   x1 = first input: consecutive polynomials, 1024 bytes apart
 *   x2 = second input: same layout
 *   x3 = pointer to two 32-bit constants: c0 at offset 0, c1 at offset 4
 *        (presumably Q and its inverse modulo 2^32; confirm against caller)
 * Polynomials combined: 4 by default, 5 when L > 4, 7 when L > 5 (L is
 * expected to come from params.h).
 * Per lane: p = sum over the polynomials of a_i * b_i in 64 bits, then
 *   t   = low32(p) * c1            (32-bit wrapping multiply)
 *   out = high32(p - t * c0)
 * The products for the last polynomial of each batch are accumulated at
 * the top of the next loop pass (or in the tail after the loop).
 * push_all / pop_all come from macros.inc; presumably they save and restore
 * the callee-saved registers touched here (x19-x21, v12-v15) - confirm.
 * Clobbers: x0-x2, x5-x16, v0-v7, v12-v27, v30, v31.
 */
.align 2
.global PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery
.global _PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery
#ifndef __clang__
.type PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery, %function
#endif
PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery:
_PQCLEAN_DILITHIUM3_AARCH64_asm_polyvecl_pointwise_acc_montgomery:

    push_all

    ldr     w20, [x3, #0]
    ldr     w21, [x3, #4]

    /* one read pointer per polynomial of each input vector */
    add     x5, x1, #1024*1
    add     x6, x2, #1024*1
    add     x7, x1, #1024*2
    add     x8, x2, #1024*2
    add     x9, x1, #1024*3
    add     x10, x2, #1024*3

#if L > 4
    add     x11, x1, #1024*4
    add     x12, x2, #1024*4
#endif

#if L > 5
    add     x13, x11, #1024*1
    add     x14, x12, #1024*1
    add     x15, x11, #1024*2
    add     x19, x12, #1024*2
#endif

    dup     v30.4S, w20                 /* v30 = c0 */
    dup     v31.4S, w21                 /* v31 = c1 */

    ld1     {v0.4S}, [x1], #16
    ld1     {v1.4S}, [x1], #16
    ld1     {v2.4S}, [x1], #16
    ld1     {v3.4S}, [x1], #16

    ld1     {v4.4S}, [x2], #16
    ld1     {v5.4S}, [x2], #16
    ld1     {v6.4S}, [x2], #16
    ld1     {v7.4S}, [x2], #16

    /* polynomial 0 products; fetch polynomial 1 */
    smull   v12.2D, v0.2S, v4.2S
    smull2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x5], #16
    ld1     {v4.4S}, [x6], #16
    smull   v13.2D, v1.2S, v5.2S
    smull2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x5], #16
    ld1     {v5.4S}, [x6], #16
    smull   v14.2D, v2.2S, v6.2S
    smull2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x5], #16
    ld1     {v6.4S}, [x6], #16
    smull   v15.2D, v3.2S, v7.2S
    smull2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x5], #16
    ld1     {v7.4S}, [x6], #16

    /* accumulate polynomial 1; fetch polynomial 2 */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x7], #16
    ld1     {v4.4S}, [x8], #16
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x7], #16
    ld1     {v5.4S}, [x8], #16
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x7], #16
    ld1     {v6.4S}, [x8], #16
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x7], #16
    ld1     {v7.4S}, [x8], #16

    /* accumulate polynomial 2; fetch polynomial 3 */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x9], #16
    ld1     {v4.4S}, [x10], #16
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x9], #16
    ld1     {v5.4S}, [x10], #16
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x9], #16
    ld1     {v6.4S}, [x10], #16
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x9], #16
    ld1     {v7.4S}, [x10], #16

#if L > 4
    /* accumulate polynomial 3; fetch polynomial 4 */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x11], #16
    ld1     {v4.4S}, [x12], #16
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x11], #16
    ld1     {v5.4S}, [x12], #16
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x11], #16
    ld1     {v6.4S}, [x12], #16
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x11], #16
    ld1     {v7.4S}, [x12], #16
#endif

#if L > 5
    /* accumulate polynomial 4; fetch polynomial 5 */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x13], #16
    ld1     {v4.4S}, [x14], #16
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x13], #16
    ld1     {v5.4S}, [x14], #16
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x13], #16
    ld1     {v6.4S}, [x14], #16
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x13], #16
    ld1     {v7.4S}, [x14], #16

    /* accumulate polynomial 5; fetch polynomial 6 */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x15], #16
    ld1     {v4.4S}, [x19], #16
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x15], #16
    ld1     {v5.4S}, [x19], #16
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x15], #16
    ld1     {v6.4S}, [x19], #16
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x15], #16
    ld1     {v7.4S}, [x19], #16
#endif

    mov     x16, #15                    /* 1 batch above + 15 here = 16 x 16 lanes */
_polyvecl_pointwise_acc_montgomery_loop:

    /* accumulate the last polynomial fetched for the current batch */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S

    /* reduce the current batch while fetching polynomial 0 of the next */
    uzp1    v20.4S, v12.4S, v16.4S
    ld1     {v0.4S}, [x1], #16
    uzp1    v21.4S, v13.4S, v17.4S
    ld1     {v1.4S}, [x1], #16
    uzp1    v22.4S, v14.4S, v18.4S
    ld1     {v2.4S}, [x1], #16
    uzp1    v23.4S, v15.4S, v19.4S
    ld1     {v3.4S}, [x1], #16

    mul     v24.4S, v20.4S, v31.4S
    ld1     {v4.4S}, [x2], #16
    mul     v25.4S, v21.4S, v31.4S
    ld1     {v5.4S}, [x2], #16
    mul     v26.4S, v22.4S, v31.4S
    ld1     {v6.4S}, [x2], #16
    mul     v27.4S, v23.4S, v31.4S
    ld1     {v7.4S}, [x2], #16

    smlsl   v12.2D, v24.2S, v30.2S
    smlsl2  v16.2D, v24.4S, v30.4S
    smlsl   v13.2D, v25.2S, v30.2S
    smlsl2  v17.2D, v25.4S, v30.4S
    smlsl   v14.2D, v26.2S, v30.2S
    smlsl2  v18.2D, v26.4S, v30.4S
    smlsl   v15.2D, v27.2S, v30.2S
    smlsl2  v19.2D, v27.4S, v30.4S

    uzp2    v24.4S, v12.4S, v16.4S
    uzp2    v25.4S, v13.4S, v17.4S
    uzp2    v26.4S, v14.4S, v18.4S
    uzp2    v27.4S, v15.4S, v19.4S

    /* next batch: polynomial 0 products; store results; fetch polynomial 1 */
    smull   v12.2D, v0.2S, v4.2S
    smull2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x5], #16
    st1     {v24.4S}, [x0], #16
    ld1     {v4.4S}, [x6], #16
    smull   v13.2D, v1.2S, v5.2S
    smull2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x5], #16
    st1     {v25.4S}, [x0], #16
    ld1     {v5.4S}, [x6], #16
    smull   v14.2D, v2.2S, v6.2S
    smull2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x5], #16
    st1     {v26.4S}, [x0], #16
    ld1     {v6.4S}, [x6], #16
    smull   v15.2D, v3.2S, v7.2S
    smull2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x5], #16
    st1     {v27.4S}, [x0], #16
    ld1     {v7.4S}, [x6], #16

    /* accumulate polynomial 1; fetch polynomial 2 */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x7], #16
    ld1     {v4.4S}, [x8], #16
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x7], #16
    ld1     {v5.4S}, [x8], #16
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x7], #16
    ld1     {v6.4S}, [x8], #16
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x7], #16
    ld1     {v7.4S}, [x8], #16

    /* accumulate polynomial 2; fetch polynomial 3 */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x9], #16
    ld1     {v4.4S}, [x10], #16
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x9], #16
    ld1     {v5.4S}, [x10], #16
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x9], #16
    ld1     {v6.4S}, [x10], #16
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x9], #16
    ld1     {v7.4S}, [x10], #16

#if L > 4
    /* accumulate polynomial 3; fetch polynomial 4 */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x11], #16
    ld1     {v4.4S}, [x12], #16
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x11], #16
    ld1     {v5.4S}, [x12], #16
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x11], #16
    ld1     {v6.4S}, [x12], #16
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x11], #16
    ld1     {v7.4S}, [x12], #16
#endif

#if L > 5
    /* accumulate polynomial 4; fetch polynomial 5 */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x13], #16
    ld1     {v4.4S}, [x14], #16
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x13], #16
    ld1     {v5.4S}, [x14], #16
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x13], #16
    ld1     {v6.4S}, [x14], #16
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x13], #16
    ld1     {v7.4S}, [x14], #16

    /* accumulate polynomial 5; fetch polynomial 6 */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    ld1     {v0.4S}, [x15], #16
    ld1     {v4.4S}, [x19], #16
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    ld1     {v1.4S}, [x15], #16
    ld1     {v5.4S}, [x19], #16
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    ld1     {v2.4S}, [x15], #16
    ld1     {v6.4S}, [x19], #16
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S
    ld1     {v3.4S}, [x15], #16
    ld1     {v7.4S}, [x19], #16
#endif

    sub     x16, x16, #1
    cbnz    x16, _polyvecl_pointwise_acc_montgomery_loop

    /* tail: finish and reduce the final batch */
    smlal   v12.2D, v0.2S, v4.2S
    smlal2  v16.2D, v0.4S, v4.4S
    smlal   v13.2D, v1.2S, v5.2S
    smlal2  v17.2D, v1.4S, v5.4S
    smlal   v14.2D, v2.2S, v6.2S
    smlal2  v18.2D, v2.4S, v6.4S
    smlal   v15.2D, v3.2S, v7.2S
    smlal2  v19.2D, v3.4S, v7.4S

    uzp1    v20.4S, v12.4S, v16.4S
    uzp1    v21.4S, v13.4S, v17.4S
    uzp1    v22.4S, v14.4S, v18.4S
    uzp1    v23.4S, v15.4S, v19.4S

    mul     v24.4S, v20.4S, v31.4S
    mul     v25.4S, v21.4S, v31.4S
    mul     v26.4S, v22.4S, v31.4S
    mul     v27.4S, v23.4S, v31.4S

    smlsl   v12.2D, v24.2S, v30.2S
    smlsl2  v16.2D, v24.4S, v30.4S
    smlsl   v13.2D, v25.2S, v30.2S
    smlsl2  v17.2D, v25.4S, v30.4S
    smlsl   v14.2D, v26.2S, v30.2S
    smlsl2  v18.2D, v26.4S, v30.4S
    smlsl   v15.2D, v27.2S, v30.2S
    smlsl2  v19.2D, v27.4S, v30.4S

    uzp2    v24.4S, v12.4S, v16.4S
    uzp2    v25.4S, v13.4S, v17.4S
    uzp2    v26.4S, v14.4S, v18.4S
    uzp2    v27.4S, v15.4S, v19.4S

    st1     {v24.4S}, [x0], #16
    st1     {v25.4S}, [x0], #16
    st1     {v26.4S}, [x0], #16
    st1     {v27.4S}, [x0], #16

    pop_all

    ret