// AArch64 (GNU syntax, run through the C preprocessor) NEON routines.
// push_all / pop_all come from macros.inc, which is not visible here;
// presumably they save and restore the callee-saved registers used below
// (x19-x28, v8-v15) -- confirm against macros.inc.

#include "macros.inc"
#include "params.h"

// ---------------------------------------------------------------------------
// PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended
//
// In:   x0 = des     output pointer, advanced 64 bytes per loop iteration
//       x1 = src1    input pointer, advanced 128 bytes per loop iteration
//       x2 = src2ex  input pointer, advanced 128 bytes per loop iteration
//       x3 = pointer to a constant table; the signed halfword at offset 0
//            is loaded as Q and broadcast to every lane of v20
// Out:  4 iterations x 4 vectors x 8 halfwords written to des
// Uses: x19, w20, v0-v15, v20
//
// Each ld2 de-interleaves 16 halfwords into an even-index and an odd-index
// vector. Only the odd-index vectors of src1 (v1, v3, v5, v7) are used as
// inputs; the even-index ones (v0, v2, v4, v6) are overwritten as scratch.
// For each odd-index vector a and the two de-interleaved src2ex streams
// (first stream b_first, second stream b_second) the code stores
//     a * b_second  -  sqrdmulh(a, b_first) * Q      (16-bit lanes)
// NOTE(review): this looks like a modular multiplication by a precomputed
// constant pair -- confirm the layout of src2ex with the caller.
// ---------------------------------------------------------------------------
.align 2
.global PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended
.global _PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended
#ifndef __clang__
.type PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended, %function
#endif
PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended:
_PQCLEAN_KYBER1024_AARCH64_asm_point_mul_extended:

    push_all

    Q         .req w20
    des       .req x0
    src1      .req x1
    src2ex    .req x2
    counter   .req x19

    ldrsh Q, [x3]
    dup v20.8H, Q

    // TODO: unroll this, currently we are using only 16 SIMD registers
    mov counter, #4
_point_mul_extended_loop:

    // Load and de-interleave 4 x 16 halfwords from each input.
    ld2 { v0.8H,  v1.8H}, [src1], #32
    ld2 { v2.8H,  v3.8H}, [src1], #32
    ld2 { v4.8H,  v5.8H}, [src1], #32
    ld2 { v6.8H,  v7.8H}, [src1], #32

    ld2 { v8.8H,  v9.8H}, [src2ex], #32
    ld2 {v10.8H, v11.8H}, [src2ex], #32
    ld2 {v12.8H, v13.8H}, [src2ex], #32
    ld2 {v14.8H, v15.8H}, [src2ex], #32

    // High part: rounding doubling multiply-high with the first stream.
    sqrdmulh  v0.8H,  v1.8H,  v8.8H
    sqrdmulh  v2.8H,  v3.8H, v10.8H
    sqrdmulh  v4.8H,  v5.8H, v12.8H
    sqrdmulh  v6.8H,  v7.8H, v14.8H

    // Low part: wrapping 16-bit product with the second stream.
    mul       v1.8H,  v1.8H,  v9.8H
    mul       v3.8H,  v3.8H, v11.8H
    mul       v5.8H,  v5.8H, v13.8H
    mul       v7.8H,  v7.8H, v15.8H

    // result = low - high * Q
    mls       v1.8H,  v0.8H, v20.8H
    mls       v3.8H,  v2.8H, v20.8H
    mls       v5.8H,  v4.8H, v20.8H
    mls       v7.8H,  v6.8H, v20.8H

    st1 { v1.8H}, [des], #16
    st1 { v3.8H}, [des], #16
    st1 { v5.8H}, [des], #16
    st1 { v7.8H}, [des], #16

    sub counter, counter, #1
    cbnz counter, _point_mul_extended_loop

    .unreq    Q
    .unreq    des
    .unreq    src1
    .unreq    src2ex
    .unreq    counter
    pop_all

    // Return through x30 with ret (not an indirect br) so the branch
    // carries the subroutine-return hint.
    ret

// ---------------------------------------------------------------------------
// PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul
//
// In:   x0 = src1     base of the first operand; slice k starts at +512*k
//       x1 = src2     base of the second operand; slice k starts at +512*k
//       x2 = src2asy  base of the auxiliary operand; slice k starts at +256*k
//       x3 = pointer to a constant table of signed halfwords:
//            offset 0 -> Q (broadcast in v28), offset 2 -> Qprime2 (v29)
//       x4 = des      output pointer (copied to x11 before x4 is reused)
// Out:  16 iterations x 32 bytes written to des
// Uses: x4-x6, x8-x14, x19, w27, w28, v0-v3, v5, v16, v17, v20, v21,
//       v24, v25, v28, v29
//
// Two slices are always processed; slices 2 and 3 are compiled in when
// KYBER_K > 2 and KYBER_K > 3 respectively. Per slice and per iteration,
// with (a0, a1) de-interleaved from src1, (b0, b1) de-interleaved from src2
// and c loaded from src2asy, the 32-bit accumulators receive
//     v16/v20 (low/high four lanes) += a0*b0 + a1*c
//     v17/v21 (low/high four lanes) += a0*b1 + a1*b0
// After all slices the accumulators are reduced to 16 bits: the low halfword
// of each lane is multiplied by Qprime2, that product times Q is added to
// the accumulator, and the high halfword is kept.
// NOTE(review): this has the shape of a Montgomery reduction -- confirm the
// sign convention of Qprime2 against the constant table.
// ---------------------------------------------------------------------------
.align 2
.global PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul
.global _PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul
#ifndef __clang__
.type PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul, %function
#endif
PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul:
_PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul:

    push_all

    Q         .req w28
    Qprime2   .req w27
    des       .req x11
    src1_0    .req x0
    src2_0    .req x1
    src2asy_0 .req x2
    src1_1    .req x4
    src2_1    .req x5
    src2asy_1 .req x6
    src1_2    .req x8
    src2_2    .req x9
    src2asy_2 .req x10
    src1_3    .req x12
    src2_3    .req x13
    src2asy_3 .req x14
    counter   .req x19

    ldrsh Q,       [x3, #0]
    ldrsh Qprime2, [x3, #2]

    // The output pointer arrives in x4, which is also src1_1: copy it out
    // before src1_1 is written below.
    add des, x4, #0

    add    src1_1,    src1_0, #512*1
    add    src2_1,    src2_0, #512*1
    add src2asy_1, src2asy_0, #256*1

#if KYBER_K > 2
    add    src1_2,    src1_0, #512*2
    add    src2_2,    src2_0, #512*2
    add src2asy_2, src2asy_0, #256*2
#endif

#if KYBER_K > 3
    add    src1_3,    src1_0, #512*3
    add    src2_3,    src2_0, #512*3
    add src2asy_3, src2asy_0, #256*3
#endif

    dup v28.8H, Q
    dup v29.8H, Qprime2

    // TODO: interleaving
    mov counter, #16
_asymmetric_mul_loop:

    // Slice 0 initialises the accumulators (smull), later slices add to them.
    ld2 { v0.8H,  v1.8H}, [   src1_0], #32
    ld2 { v2.8H,  v3.8H}, [   src2_0], #32
    ld1 { v5.8H},         [src2asy_0], #16

    smull  v16.4S,  v0.4H,  v2.4H
    smull2 v20.4S,  v0.8H,  v2.8H
    smull  v17.4S,  v0.4H,  v3.4H
    smull2 v21.4S,  v0.8H,  v3.8H

    smlal  v16.4S,  v1.4H,  v5.4H
    smlal2 v20.4S,  v1.8H,  v5.8H
    smlal  v17.4S,  v1.4H,  v2.4H
    smlal2 v21.4S,  v1.8H,  v2.8H

    // Slice 1
    ld2 { v0.8H,  v1.8H}, [   src1_1], #32
    ld2 { v2.8H,  v3.8H}, [   src2_1], #32
    ld1 { v5.8H},         [src2asy_1], #16

    smlal  v16.4S,  v0.4H,  v2.4H
    smlal2 v20.4S,  v0.8H,  v2.8H
    smlal  v17.4S,  v0.4H,  v3.4H
    smlal2 v21.4S,  v0.8H,  v3.8H

    smlal  v16.4S,  v1.4H,  v5.4H
    smlal2 v20.4S,  v1.8H,  v5.8H
    smlal  v17.4S,  v1.4H,  v2.4H
    smlal2 v21.4S,  v1.8H,  v2.8H

#if KYBER_K > 2
    // Slice 2
    ld2 { v0.8H,  v1.8H}, [   src1_2], #32
    ld2 { v2.8H,  v3.8H}, [   src2_2], #32
    ld1 { v5.8H},         [src2asy_2], #16

    smlal  v16.4S,  v0.4H,  v2.4H
    smlal2 v20.4S,  v0.8H,  v2.8H
    smlal  v17.4S,  v0.4H,  v3.4H
    smlal2 v21.4S,  v0.8H,  v3.8H

    smlal  v16.4S,  v1.4H,  v5.4H
    smlal2 v20.4S,  v1.8H,  v5.8H
    smlal  v17.4S,  v1.4H,  v2.4H
    smlal2 v21.4S,  v1.8H,  v2.8H
#endif

#if KYBER_K > 3
    // Slice 3
    ld2 { v0.8H,  v1.8H}, [   src1_3], #32
    ld2 { v2.8H,  v3.8H}, [   src2_3], #32
    ld1 { v5.8H},         [src2asy_3], #16

    smlal  v16.4S,  v0.4H,  v2.4H
    smlal2 v20.4S,  v0.8H,  v2.8H
    smlal  v17.4S,  v0.4H,  v3.4H
    smlal2 v21.4S,  v0.8H,  v3.8H

    smlal  v16.4S,  v1.4H,  v5.4H
    smlal2 v20.4S,  v1.8H,  v5.8H
    smlal  v17.4S,  v1.4H,  v2.4H
    smlal2 v21.4S,  v1.8H,  v2.8H
#endif

    // Reduce 32-bit accumulators to 16 bits.
    // uzp1 gathers the low halfword of every 32-bit lane.
    uzp1    v0.8H, v16.8H, v20.8H
    uzp1    v1.8H, v17.8H, v21.8H

    mul     v0.8H,  v0.8H, v29.8H
    mul     v1.8H,  v1.8H, v29.8H

    smlal  v16.4S,  v0.4H, v28.4H
    smlal2 v20.4S,  v0.8H, v28.8H
    smlal  v17.4S,  v1.4H, v28.4H
    smlal2 v21.4S,  v1.8H, v28.8H

    // uzp2 gathers the high halfword of every 32-bit lane.
    uzp2   v24.8H, v16.8H, v20.8H
    uzp2   v25.8H, v17.8H, v21.8H

    // st2 re-interleaves the two result vectors on the way out.
    st2 {v24.8H, v25.8H}, [des], #32

    sub counter, counter, #1
    cbnz counter, _asymmetric_mul_loop

    .unreq    Q
    .unreq    Qprime2
    .unreq    des
    .unreq    src1_0
    .unreq    src2_0
    .unreq    src2asy_0
    .unreq    src1_1
    .unreq    src2_1
    .unreq    src2asy_1
    .unreq    src1_2
    .unreq    src2_2
    .unreq    src2asy_2
    .unreq    src1_3
    .unreq    src2_3
    .unreq    src2asy_3
    .unreq    counter
    pop_all

    // Return through x30 with ret (not an indirect br) so the branch
    // carries the subroutine-return hint.
    ret

// ---------------------------------------------------------------------------
// PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery
//
// In:   x0 = src1     base of the first operand; slice k starts at +512*k
//       x1 = src2     base of the second operand; slice k starts at +512*k
//       x2 = src2asy  base of the auxiliary operand; slice k starts at +256*k
//       x3 = pointer to a constant table of signed halfwords:
//            offset 0  -> Q       (broadcast in v28)
//            offset 2  -> Qprime2 (v29)
//            offset 8  -> R3      (v26)
//            offset 10 -> R3p     (v27)
//       x4 = des      output pointer (copied to x11 before x4 is reused)
// Out:  16 iterations x 32 bytes written to des
// Uses: x4-x6, x8-x14, x19, w25-w28, v0-v3, v5, v16, v17, v20, v21,
//       v24-v29
//
// The accumulation and the first reduction match the instruction sequence
// of the asymmetric_mul routine in this file. Each reduced vector r is then
// passed through one extra constant multiplication before the store:
//     r * R3p  -  sqrdmulh(r, R3) * Q               (16-bit lanes)
// NOTE(review): presumably this applies an extra Montgomery factor to the
// result -- confirm the meaning of R3 / R3p against the constant table.
// ---------------------------------------------------------------------------
.align 2
.global PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery
.global _PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery
#ifndef __clang__
.type PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery, %function
#endif
PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery:
_PQCLEAN_KYBER1024_AARCH64_asm_asymmetric_mul_montgomery:

    push_all

    Q         .req w28
    Qprime2   .req w27
    R3        .req w26
    R3p       .req w25
    des       .req x11
    src1_0    .req x0
    src2_0    .req x1
    src2asy_0 .req x2
    src1_1    .req x4
    src2_1    .req x5
    src2asy_1 .req x6
    src1_2    .req x8
    src2_2    .req x9
    src2asy_2 .req x10
    src1_3    .req x12
    src2_3    .req x13
    src2asy_3 .req x14
    counter   .req x19

    ldrsh Q,       [x3, #0]
    ldrsh Qprime2, [x3, #2]
    ldrsh R3,      [x3, #8]
    ldrsh R3p,     [x3, #10]

    // The output pointer arrives in x4, which is also src1_1: copy it out
    // before src1_1 is written below.
    add des, x4, #0

    add    src1_1,    src1_0, #512*1
    add    src2_1,    src2_0, #512*1
    add src2asy_1, src2asy_0, #256*1

#if KYBER_K > 2
    add    src1_2,    src1_0, #512*2
    add    src2_2,    src2_0, #512*2
    add src2asy_2, src2asy_0, #256*2
#endif

#if KYBER_K > 3
    add    src1_3,    src1_0, #512*3
    add    src2_3,    src2_0, #512*3
    add src2asy_3, src2asy_0, #256*3
#endif

    dup v26.8H, R3
    dup v27.8H, R3p
    dup v28.8H, Q
    dup v29.8H, Qprime2

    // TODO: interleaving
    mov counter, #16
_asymmetric_mul_montgomery_loop:

    // Slice 0 initialises the accumulators (smull), later slices add to them.
    ld2 { v0.8H,  v1.8H}, [   src1_0], #32
    ld2 { v2.8H,  v3.8H}, [   src2_0], #32
    ld1 { v5.8H},         [src2asy_0], #16

    smull  v16.4S,  v0.4H,  v2.4H
    smull2 v20.4S,  v0.8H,  v2.8H
    smull  v17.4S,  v0.4H,  v3.4H
    smull2 v21.4S,  v0.8H,  v3.8H

    smlal  v16.4S,  v1.4H,  v5.4H
    smlal2 v20.4S,  v1.8H,  v5.8H
    smlal  v17.4S,  v1.4H,  v2.4H
    smlal2 v21.4S,  v1.8H,  v2.8H

    // Slice 1
    ld2 { v0.8H,  v1.8H}, [   src1_1], #32
    ld2 { v2.8H,  v3.8H}, [   src2_1], #32
    ld1 { v5.8H},         [src2asy_1], #16

    smlal  v16.4S,  v0.4H,  v2.4H
    smlal2 v20.4S,  v0.8H,  v2.8H
    smlal  v17.4S,  v0.4H,  v3.4H
    smlal2 v21.4S,  v0.8H,  v3.8H

    smlal  v16.4S,  v1.4H,  v5.4H
    smlal2 v20.4S,  v1.8H,  v5.8H
    smlal  v17.4S,  v1.4H,  v2.4H
    smlal2 v21.4S,  v1.8H,  v2.8H

#if KYBER_K > 2
    // Slice 2
    ld2 { v0.8H,  v1.8H}, [   src1_2], #32
    ld2 { v2.8H,  v3.8H}, [   src2_2], #32
    ld1 { v5.8H},         [src2asy_2], #16

    smlal  v16.4S,  v0.4H,  v2.4H
    smlal2 v20.4S,  v0.8H,  v2.8H
    smlal  v17.4S,  v0.4H,  v3.4H
    smlal2 v21.4S,  v0.8H,  v3.8H

    smlal  v16.4S,  v1.4H,  v5.4H
    smlal2 v20.4S,  v1.8H,  v5.8H
    smlal  v17.4S,  v1.4H,  v2.4H
    smlal2 v21.4S,  v1.8H,  v2.8H
#endif

#if KYBER_K > 3
    // Slice 3
    ld2 { v0.8H,  v1.8H}, [   src1_3], #32
    ld2 { v2.8H,  v3.8H}, [   src2_3], #32
    ld1 { v5.8H},         [src2asy_3], #16

    smlal  v16.4S,  v0.4H,  v2.4H
    smlal2 v20.4S,  v0.8H,  v2.8H
    smlal  v17.4S,  v0.4H,  v3.4H
    smlal2 v21.4S,  v0.8H,  v3.8H

    smlal  v16.4S,  v1.4H,  v5.4H
    smlal2 v20.4S,  v1.8H,  v5.8H
    smlal  v17.4S,  v1.4H,  v2.4H
    smlal2 v21.4S,  v1.8H,  v2.8H
#endif

    // Reduce 32-bit accumulators to 16 bits.
    // uzp1 gathers the low halfword of every 32-bit lane.
    uzp1    v0.8H, v16.8H, v20.8H
    uzp1    v1.8H, v17.8H, v21.8H

    mul     v0.8H,  v0.8H, v29.8H
    mul     v1.8H,  v1.8H, v29.8H

    smlal  v16.4S,  v0.4H, v28.4H
    smlal2 v20.4S,  v0.8H, v28.8H
    smlal  v17.4S,  v1.4H, v28.4H
    smlal2 v21.4S,  v1.8H, v28.8H

    // uzp2 gathers the high halfword of every 32-bit lane.
    uzp2   v24.8H, v16.8H, v20.8H
    uzp2   v25.8H, v17.8H, v21.8H

    // Extra constant multiplication: r * R3p - sqrdmulh(r, R3) * Q.
    // v16 / v17 are free again here and serve as scratch.
    sqrdmulh v16.8H, v24.8H, v26.8H
    sqrdmulh v17.8H, v25.8H, v26.8H

    mul      v24.8H, v24.8H, v27.8H
    mul      v25.8H, v25.8H, v27.8H

    mls      v24.8H, v16.8H, v28.8H
    mls      v25.8H, v17.8H, v28.8H

    // st2 re-interleaves the two result vectors on the way out.
    st2 {v24.8H, v25.8H}, [des], #32

    sub counter, counter, #1
    cbnz counter, _asymmetric_mul_montgomery_loop

    .unreq    Q
    .unreq    Qprime2
    .unreq    R3
    .unreq    R3p
    .unreq    des
    .unreq    src1_0
    .unreq    src2_0
    .unreq    src2asy_0
    .unreq    src1_1
    .unreq    src2_1
    .unreq    src2asy_1
    .unreq    src1_2
    .unreq    src2_2
    .unreq    src2asy_2
    .unreq    src1_3
    .unreq    src2_3
    .unreq    src2asy_3
    .unreq    counter
    pop_all

    // Return through x30 with ret (not an indirect br) so the branch
    // carries the subroutine-return hint.
    ret