/*
 * Loop driver for the ZGEMM micro-kernel.
 *
 * Structure visible in this fragment:
 *   - an "L2" pass executed J = N >> 1 times, followed by an "L1" pass
 *     executed when N & 1 is non-zero;
 *   - inside each pass, M is split into I = M >> 3 iterations of the x8
 *     tile, then at most one x4, one x2 and one x1 tile selected by the
 *     bits M & 4, M & 2 and M & 1;
 *   - inside each tile, K is split into L = K >> 3 groups of eight kernel
 *     macro invocations plus a K & 7 tail.
 *
 * LOAD*, KERNEL* and SAVE* are macros defined outside this fragment, as are
 * the symbolic names J, N, I, M, L, K, T1, T2, A, B, C, AO, BO, CO, LDC and
 * PRE.  NOTE(review): presumably each KERNEL* invocation consumes one k-step
 * of A and B and SAVE* writes the accumulated tile to CO -- confirm against
 * the macro definitions.
 *
 * Every "ble" that follows an "andi." is effectively "branch if zero": the
 * masked result is never negative, so "less than or equal" can only mean
 * "equal to zero".
 */

/* ===== L2 pass ========================================================= */

	srawi. J, N, 1			/* J = N >> 1 (arithmetic), CR0 set from the result */
	ble .LZGEMM_L2_END		/* no iterations of the L2 pass when J <= 0 */

.LZGEMM_L2_BEGIN:

	mr CO, C			/* CO = C */
	mr AO, A			/* AO = A (restarted for every J iteration) */
	slwi T1, LDC , 1		/* T1 = LDC << 1 (32-bit shift) */
	add C, C, T1			/* C += 2 * LDC for the next J iteration */
	srawi. I, M, 3			/* I = M >> 3 : number of x8 tiles */
	ble .LZGEMM_L2x8_END		/* skip the x8 tiles when I <= 0 */

/* ----- 2x8 tile, repeated I times -------------------------------------- */

.LZGEMM_L2x8_BEGIN:

	mr BO, B			/* BO = B (restarted for every tile) */
	srawi. L, K, 3			/* L = K >> 3 : groups of eight kernel steps */
	ble .LZGEMM_L2x8_SUB0		/* L <= 0 : only the K & 7 tail is run */
	cmpwi cr0, L, 1
	ble .LZGEMM_L2x8_SUB4		/* L == 1 here (L >= 1) : single group of eight */

	/*
	 * L >= 2.  LOOP_START issues the load macro and the first eight kernel
	 * invocations, LOOP issues eight per iteration for L - 2 iterations, and
	 * LOOP_END issues the last eight, ending with the _E2 variant: 8 * L
	 * kernel invocations in total.  Each step in these three sections,
	 * except the closing _E2, is preceded by a dcbt (data cache block
	 * touch hint) on the address formed from AO and PRE.
	 */
.LZGEMM_L2x8_LOOP_START:

	dcbt AO, PRE
	LOAD2x8_1
	dcbt AO, PRE
	KERNEL2x8_I1
	dcbt AO, PRE
	KERNEL2x8_2
	dcbt AO, PRE
	KERNEL2x8_1
	dcbt AO, PRE
	KERNEL2x8_2
	dcbt AO, PRE
	KERNEL2x8_1
	dcbt AO, PRE
	KERNEL2x8_2
	dcbt AO, PRE
	KERNEL2x8_1
	dcbt AO, PRE
	KERNEL2x8_2

	addic. L, L, -2			/* account for the first and the last group */
	ble .LZGEMM_L2x8_LOOP_END	/* L was exactly 2 : no middle groups */

	.align 5			/* alignment directive ahead of the loop label */

.LZGEMM_L2x8_LOOP:

	dcbt AO, PRE
	KERNEL2x8_1
	dcbt AO, PRE
	KERNEL2x8_2
	dcbt AO, PRE
	KERNEL2x8_1
	dcbt AO, PRE
	KERNEL2x8_2
	dcbt AO, PRE
	KERNEL2x8_1
	dcbt AO, PRE
	KERNEL2x8_2
	dcbt AO, PRE
	KERNEL2x8_1
	dcbt AO, PRE
	KERNEL2x8_2

	addic. L, L, -1			/* one middle group done */
	bgt .LZGEMM_L2x8_LOOP

.LZGEMM_L2x8_LOOP_END:

	dcbt AO, PRE
	KERNEL2x8_1
	dcbt AO, PRE
	KERNEL2x8_2
	dcbt AO, PRE
	KERNEL2x8_1
	dcbt AO, PRE
	KERNEL2x8_2
	dcbt AO, PRE
	KERNEL2x8_1
	dcbt AO, PRE
	KERNEL2x8_2
	dcbt AO, PRE
	KERNEL2x8_1
	KERNEL2x8_E2			/* closing variant, not preceded by a dcbt */

	b .LZGEMM_L2x8_SUB1		/* continue with the K & 7 tail */

	/* L == 1 : one _SUBI1 followed by seven _SUB1; only the first four
	 * invocations are preceded by a dcbt. */
.LZGEMM_L2x8_SUB4:

	dcbt AO, PRE
	KERNEL2x8_SUBI1
	dcbt AO, PRE
	KERNEL2x8_SUB1
	dcbt AO, PRE
	KERNEL2x8_SUB1
	dcbt AO, PRE
	KERNEL2x8_SUB1

	KERNEL2x8_SUB1
	KERNEL2x8_SUB1
	KERNEL2x8_SUB1
	KERNEL2x8_SUB1

	b .LZGEMM_L2x8_SUB1

	/*
	 * K >> 3 <= 0 : no full group was run, so the first tail step uses the
	 * _SUBI1 variant and the remaining (K & 7) - 1 steps use _SUB1.
	 * NOTE(review): _SUBI1 is invoked before the remainder is tested, so it
	 * runs once even when K & 7 is 0 (e.g. K == 0) -- presumably callers
	 * never pass K <= 0; confirm.
	 */
.LZGEMM_L2x8_SUB0:

	andi. L, K, 7			/* L = K & 7 */

	KERNEL2x8_SUBI1

	addic. L, L, -1
	ble .LZGEMM_L2x8_SAVE		/* nothing left after the first step */
	b .LZGEMM_L2x8_SUB2

.LZGEMM_L2x8_SUB1:

	andi. L, K, 7			/* L = K & 7 : steps left after the full groups */
	ble .LZGEMM_L2x8_SAVE		/* remainder is zero */

.LZGEMM_L2x8_SUB2:

	KERNEL2x8_SUB1

	addic. L, L, -1
	bgt .LZGEMM_L2x8_SUB2		/* one _SUB1 per remaining step */

.LZGEMM_L2x8_SAVE:

	SAVE2x8

	addic. I, I, -1
	bgt .LZGEMM_L2x8_BEGIN		/* next x8 tile */

.LZGEMM_L2x8_END:

/* ----- 2x4 tile, run when M & 4 is set --------------------------------- */

.LZGEMM_L2x4_BEGIN:

	andi. T2, M, 7
	ble .LZGEMM_L2x1_END		/* M & 7 == 0 : no x4/x2/x1 remainder tiles at all */

	andi. T1, M, 4
	ble .LZGEMM_L2x4_END		/* bit 2 of M clear : skip the x4 tile */
	mr BO, B			/* BO = B */
	srawi. L, K, 3			/* L = K >> 3 */
	ble .LZGEMM_L2x4_SUB0		/* only the K & 7 tail */
	cmpwi cr0, L, 1
	ble .LZGEMM_L2x4_SUB4		/* single group of eight */

	/* Same K-loop shape as the 2x8 tile, without the dcbt hints. */
.LZGEMM_L2x4_LOOP_START:

	LOAD2x4_1
	KERNEL2x4_I1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_2

	KERNEL2x4_1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_2

	addic. L, L, -2			/* account for the first and the last group */
	ble .LZGEMM_L2x4_LOOP_END	/* L was exactly 2 */

	.align 5

.LZGEMM_L2x4_LOOP:

	KERNEL2x4_1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_2

	KERNEL2x4_1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_2

	addic. L, L, -1
	bgt .LZGEMM_L2x4_LOOP

.LZGEMM_L2x4_LOOP_END:

	KERNEL2x4_1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_2

	KERNEL2x4_1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_E2			/* closing variant of the last group */

	b .LZGEMM_L2x4_SUB1

	/* L == 1 : one _SUBI1 followed by seven _SUB1. */
.LZGEMM_L2x4_SUB4:

	KERNEL2x4_SUBI1
	KERNEL2x4_SUB1
	KERNEL2x4_SUB1
	KERNEL2x4_SUB1

	KERNEL2x4_SUB1
	KERNEL2x4_SUB1
	KERNEL2x4_SUB1
	KERNEL2x4_SUB1

	b .LZGEMM_L2x4_SUB1

	/* K >> 3 <= 0 : _SUBI1 runs before the remainder is tested. */
.LZGEMM_L2x4_SUB0:

	andi. L, K, 7			/* L = K & 7 */

	KERNEL2x4_SUBI1

	addic. L, L, -1
	ble .LZGEMM_L2x4_SAVE
	b .LZGEMM_L2x4_SUB2

.LZGEMM_L2x4_SUB1:

	andi. L, K, 7			/* L = K & 7 */
	ble .LZGEMM_L2x4_SAVE		/* remainder is zero */

.LZGEMM_L2x4_SUB2:

	KERNEL2x4_SUB1

	addic. L, L, -1
	bgt .LZGEMM_L2x4_SUB2

.LZGEMM_L2x4_SAVE:

	SAVE2x4

.LZGEMM_L2x4_END:

/* ----- 2x2 tile, run when M & 2 is set --------------------------------- */

.LZGEMM_L2x2_BEGIN:

	andi. T1, M, 2
	ble .LZGEMM_L2x2_END		/* bit 1 of M clear : skip the x2 tile */
	mr BO, B			/* BO = B */
	srawi. L, K, 3			/* L = K >> 3 */
	ble .LZGEMM_L2x2_SUB0		/* only the K & 7 tail */
	cmpwi cr0, L, 1
	ble .LZGEMM_L2x2_SUB4		/* single group of eight */

.LZGEMM_L2x2_LOOP_START:

	LOAD2x2_1
	KERNEL2x2_I1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_2

	KERNEL2x2_1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_2

	addic. L, L, -2			/* account for the first and the last group */
	ble .LZGEMM_L2x2_LOOP_END	/* L was exactly 2 */

	.align 5

.LZGEMM_L2x2_LOOP:

	KERNEL2x2_1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_2

	KERNEL2x2_1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_2

	addic. L, L, -1
	bgt .LZGEMM_L2x2_LOOP

.LZGEMM_L2x2_LOOP_END:

	KERNEL2x2_1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_2

	KERNEL2x2_1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_E2			/* closing variant of the last group */

	b .LZGEMM_L2x2_SUB1

	/* L == 1 : one _SUBI1 followed by seven _SUB1. */
.LZGEMM_L2x2_SUB4:

	KERNEL2x2_SUBI1
	KERNEL2x2_SUB1
	KERNEL2x2_SUB1
	KERNEL2x2_SUB1

	KERNEL2x2_SUB1
	KERNEL2x2_SUB1
	KERNEL2x2_SUB1
	KERNEL2x2_SUB1

	b .LZGEMM_L2x2_SUB1

	/* K >> 3 <= 0 : _SUBI1 runs before the remainder is tested. */
.LZGEMM_L2x2_SUB0:

	andi. L, K, 7			/* L = K & 7 */

	KERNEL2x2_SUBI1

	addic. L, L, -1
	ble .LZGEMM_L2x2_SAVE
	b .LZGEMM_L2x2_SUB2

.LZGEMM_L2x2_SUB1:

	andi. L, K, 7			/* L = K & 7 */
	ble .LZGEMM_L2x2_SAVE		/* remainder is zero */

.LZGEMM_L2x2_SUB2:

	KERNEL2x2_SUB1

	addic. L, L, -1
	bgt .LZGEMM_L2x2_SUB2

.LZGEMM_L2x2_SAVE:

	SAVE2x2

.LZGEMM_L2x2_END:

/* ----- 2x1 tile, run when M & 1 is set --------------------------------- */

.LZGEMM_L2x1_BEGIN:

	andi. T1, M, 1
	ble .LZGEMM_L2x1_END		/* bit 0 of M clear : skip the x1 tile */
	mr BO, B			/* BO = B */
	srawi. L, K, 3			/* L = K >> 3 */
	ble .LZGEMM_L2x1_SUB0		/* only the K & 7 tail */
	cmpwi cr0, L, 1
	ble .LZGEMM_L2x1_SUB4		/* single group of eight */

.LZGEMM_L2x1_LOOP_START:

	LOAD2x1_1
	KERNEL2x1_I1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_2

	KERNEL2x1_1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_2

	addic. L, L, -2			/* account for the first and the last group */
	ble .LZGEMM_L2x1_LOOP_END	/* L was exactly 2 */

	.align 5

.LZGEMM_L2x1_LOOP:

	KERNEL2x1_1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_2

	KERNEL2x1_1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_2

	addic. L, L, -1
	bgt .LZGEMM_L2x1_LOOP

.LZGEMM_L2x1_LOOP_END:

	KERNEL2x1_1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_2

	KERNEL2x1_1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_E2			/* closing variant of the last group */

	b .LZGEMM_L2x1_SUB1

	/* L == 1 : one _SUBI1 followed by seven _SUB1. */
.LZGEMM_L2x1_SUB4:

	KERNEL2x1_SUBI1
	KERNEL2x1_SUB1
	KERNEL2x1_SUB1
	KERNEL2x1_SUB1

	KERNEL2x1_SUB1
	KERNEL2x1_SUB1
	KERNEL2x1_SUB1
	KERNEL2x1_SUB1

	b .LZGEMM_L2x1_SUB1

	/* K >> 3 <= 0 : _SUBI1 runs before the remainder is tested. */
.LZGEMM_L2x1_SUB0:

	andi. L, K, 7			/* L = K & 7 */

	KERNEL2x1_SUBI1

	addic. L, L, -1
	ble .LZGEMM_L2x1_SAVE
	b .LZGEMM_L2x1_SUB2

.LZGEMM_L2x1_SUB1:

	andi. L, K, 7			/* L = K & 7 */
	ble .LZGEMM_L2x1_SAVE		/* remainder is zero */

.LZGEMM_L2x1_SUB2:

	KERNEL2x1_SUB1

	addic. L, L, -1
	bgt .LZGEMM_L2x1_SUB2

.LZGEMM_L2x1_SAVE:

	SAVE2x1

.LZGEMM_L2x1_END:

	/* End of one J iteration.
	 * NOTE(review): K << 5 presumably spans two columns of K complex
	 * doubles in the packed B panel -- confirm against the packing code. */
	slwi T1, K, 5			/* T1 = K << 5 (32-bit shift) */
	add B, B, T1			/* B += K << 5 */

	addic. J, J, -1
	bgt .LZGEMM_L2_BEGIN		/* next J iteration */

	andi. T2, N, 1
	ble .L999			/* N even : nothing left; .L999 is defined outside this fragment */

.LZGEMM_L2_END:

	b .LZGEMM_L1_BEGIN		/* L1_BEGIN tests N & 1 itself */

.L999_H1:				/* not referenced anywhere in this fragment */

	b .L999

/* ===== L1 pass, run when N & 1 is set ================================== */

.LZGEMM_L1_BEGIN:

	andi. T1, N, 1
	ble .LZGEMM_L1_END		/* N even : skip the L1 pass */
	mr CO, C			/* CO = C (C is not advanced in this pass) */
	mr AO, A			/* AO = A */
	srawi. I, M, 3			/* I = M >> 3 : number of x8 tiles */
	ble .LZGEMM_L1x8_END		/* skip the x8 tiles when I <= 0 */

/* ----- 1x8 tile, repeated I times -------------------------------------- */

.LZGEMM_L1x8_BEGIN:

	mr BO, B			/* BO = B (restarted for every tile) */
	srawi. L, K, 3			/* L = K >> 3 : groups of eight kernel steps */
	ble .LZGEMM_L1x8_SUB0		/* L <= 0 : only the K & 7 tail is run */
	cmpwi cr0, L, 1
	ble .LZGEMM_L1x8_SUB4		/* L == 1 here (L >= 1) : single group of eight */

	/* L >= 2 : LOOP_START (first eight), LOOP (eight per iteration, L - 2
	 * iterations) and LOOP_END (last eight, ending with _E2); each step
	 * except the closing _E2 is preceded by a dcbt on AO and PRE. */
.LZGEMM_L1x8_LOOP_START:

	dcbt AO, PRE
	LOAD1x8_1
	dcbt AO, PRE
	KERNEL1x8_I1
	dcbt AO, PRE
	KERNEL1x8_2
	dcbt AO, PRE
	KERNEL1x8_1
	dcbt AO, PRE
	KERNEL1x8_2
	dcbt AO, PRE
	KERNEL1x8_1
	dcbt AO, PRE
	KERNEL1x8_2
	dcbt AO, PRE
	KERNEL1x8_1
	dcbt AO, PRE
	KERNEL1x8_2

	addic. L, L, -2			/* account for the first and the last group */
	ble .LZGEMM_L1x8_LOOP_END	/* L was exactly 2 : no middle groups */

	.align 5			/* alignment directive ahead of the loop label */

.LZGEMM_L1x8_LOOP:

	dcbt AO, PRE
	KERNEL1x8_1
	dcbt AO, PRE
	KERNEL1x8_2
	dcbt AO, PRE
	KERNEL1x8_1
	dcbt AO, PRE
	KERNEL1x8_2
	dcbt AO, PRE
	KERNEL1x8_1
	dcbt AO, PRE
	KERNEL1x8_2
	dcbt AO, PRE
	KERNEL1x8_1
	dcbt AO, PRE
	KERNEL1x8_2

	addic. L, L, -1			/* one middle group done */
	bgt .LZGEMM_L1x8_LOOP

.LZGEMM_L1x8_LOOP_END:

	dcbt AO, PRE
	KERNEL1x8_1
	dcbt AO, PRE
	KERNEL1x8_2
	dcbt AO, PRE
	KERNEL1x8_1
	dcbt AO, PRE
	KERNEL1x8_2
	dcbt AO, PRE
	KERNEL1x8_1
	dcbt AO, PRE
	KERNEL1x8_2
	dcbt AO, PRE
	KERNEL1x8_1
	KERNEL1x8_E2			/* closing variant, not preceded by a dcbt */

	b .LZGEMM_L1x8_SUB1		/* continue with the K & 7 tail */

	/* L == 1 : one _SUBI1 followed by seven _SUB1; only the first four
	 * invocations are preceded by a dcbt. */
.LZGEMM_L1x8_SUB4:

	dcbt AO, PRE
	KERNEL1x8_SUBI1
	dcbt AO, PRE
	KERNEL1x8_SUB1
	dcbt AO, PRE
	KERNEL1x8_SUB1
	dcbt AO, PRE
	KERNEL1x8_SUB1

	KERNEL1x8_SUB1
	KERNEL1x8_SUB1
	KERNEL1x8_SUB1
	KERNEL1x8_SUB1

	b .LZGEMM_L1x8_SUB1

	/* K >> 3 <= 0 : _SUBI1 runs before the remainder is tested.
	 * NOTE(review): it therefore runs once even when K & 7 is 0 --
	 * presumably callers never pass K <= 0; confirm. */
.LZGEMM_L1x8_SUB0:

	andi. L, K, 7			/* L = K & 7 */

	KERNEL1x8_SUBI1

	addic. L, L, -1
	ble .LZGEMM_L1x8_SAVE		/* nothing left after the first step */
	b .LZGEMM_L1x8_SUB2

.LZGEMM_L1x8_SUB1:

	andi. L, K, 7			/* L = K & 7 : steps left after the full groups */
	ble .LZGEMM_L1x8_SAVE		/* remainder is zero */

.LZGEMM_L1x8_SUB2:

	KERNEL1x8_SUB1

	addic. L, L, -1
	bgt .LZGEMM_L1x8_SUB2		/* one _SUB1 per remaining step */

.LZGEMM_L1x8_SAVE:

	SAVE1x8

	addic. I, I, -1
	bgt .LZGEMM_L1x8_BEGIN		/* next x8 tile */

.LZGEMM_L1x8_END:

/* ----- 1x4 tile, run when M & 4 is set --------------------------------- */

.LZGEMM_L1x4_BEGIN:

	andi. T2, M, 7
	ble .LZGEMM_L1x1_END		/* M & 7 == 0 : no x4/x2/x1 remainder tiles at all */

	andi. T1, M, 4
	ble .LZGEMM_L1x4_END		/* bit 2 of M clear : skip the x4 tile */
	mr BO, B			/* BO = B */
	srawi. L, K, 3			/* L = K >> 3 */
	ble .LZGEMM_L1x4_SUB0		/* only the K & 7 tail */
	cmpwi cr0, L, 1
	ble .LZGEMM_L1x4_SUB4		/* single group of eight */

	/* Same K-loop shape as the 1x8 tile, without the dcbt hints. */
.LZGEMM_L1x4_LOOP_START:

	LOAD1x4_1
	KERNEL1x4_I1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_2

	KERNEL1x4_1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_2

	addic. L, L, -2			/* account for the first and the last group */
	ble .LZGEMM_L1x4_LOOP_END	/* L was exactly 2 */

	.align 5

.LZGEMM_L1x4_LOOP:

	KERNEL1x4_1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_2

	KERNEL1x4_1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_2

	addic. L, L, -1
	bgt .LZGEMM_L1x4_LOOP

.LZGEMM_L1x4_LOOP_END:

	KERNEL1x4_1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_2

	KERNEL1x4_1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_E2			/* closing variant of the last group */

	b .LZGEMM_L1x4_SUB1

	/* L == 1 : one _SUBI1 followed by seven _SUB1. */
.LZGEMM_L1x4_SUB4:

	KERNEL1x4_SUBI1
	KERNEL1x4_SUB1
	KERNEL1x4_SUB1
	KERNEL1x4_SUB1

	KERNEL1x4_SUB1
	KERNEL1x4_SUB1
	KERNEL1x4_SUB1
	KERNEL1x4_SUB1

	b .LZGEMM_L1x4_SUB1

	/* K >> 3 <= 0 : _SUBI1 runs before the remainder is tested. */
.LZGEMM_L1x4_SUB0:

	andi. L, K, 7			/* L = K & 7 */

	KERNEL1x4_SUBI1

	addic. L, L, -1
	ble .LZGEMM_L1x4_SAVE
	b .LZGEMM_L1x4_SUB2

.LZGEMM_L1x4_SUB1:

	andi. L, K, 7			/* L = K & 7 */
	ble .LZGEMM_L1x4_SAVE		/* remainder is zero */

.LZGEMM_L1x4_SUB2:

	KERNEL1x4_SUB1

	addic. L, L, -1
	bgt .LZGEMM_L1x4_SUB2

.LZGEMM_L1x4_SAVE:

	SAVE1x4

.LZGEMM_L1x4_END:

/* ----- 1x2 tile, run when M & 2 is set --------------------------------- */

.LZGEMM_L1x2_BEGIN:

	andi. T1, M, 2
	ble .LZGEMM_L1x2_END		/* bit 1 of M clear : skip the x2 tile */
	mr BO, B			/* BO = B */
	srawi. L, K, 3			/* L = K >> 3 */
	ble .LZGEMM_L1x2_SUB0		/* only the K & 7 tail */
	cmpwi cr0, L, 1
	ble .LZGEMM_L1x2_SUB4		/* single group of eight */

.LZGEMM_L1x2_LOOP_START:

	LOAD1x2_1
	KERNEL1x2_I1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_2

	KERNEL1x2_1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_2

	addic. L, L, -2			/* account for the first and the last group */
	ble .LZGEMM_L1x2_LOOP_END	/* L was exactly 2 */

	.align 5

.LZGEMM_L1x2_LOOP:

	KERNEL1x2_1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_2

	KERNEL1x2_1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_2

	addic. L, L, -1
	bgt .LZGEMM_L1x2_LOOP

.LZGEMM_L1x2_LOOP_END:

	KERNEL1x2_1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_2

	KERNEL1x2_1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_E2			/* closing variant of the last group */

	b .LZGEMM_L1x2_SUB1

	/* L == 1 : one _SUBI1 followed by seven _SUB1. */
.LZGEMM_L1x2_SUB4:

	KERNEL1x2_SUBI1
	KERNEL1x2_SUB1
	KERNEL1x2_SUB1
	KERNEL1x2_SUB1

	KERNEL1x2_SUB1
	KERNEL1x2_SUB1
	KERNEL1x2_SUB1
	KERNEL1x2_SUB1

	b .LZGEMM_L1x2_SUB1

	/* K >> 3 <= 0 : _SUBI1 runs before the remainder is tested. */
.LZGEMM_L1x2_SUB0:

	andi. L, K, 7			/* L = K & 7 */

	KERNEL1x2_SUBI1

	addic. L, L, -1
	ble .LZGEMM_L1x2_SAVE
	b .LZGEMM_L1x2_SUB2

.LZGEMM_L1x2_SUB1:

	andi. L, K, 7			/* L = K & 7 */
	ble .LZGEMM_L1x2_SAVE		/* remainder is zero */

.LZGEMM_L1x2_SUB2:

	KERNEL1x2_SUB1

	addic. L, L, -1
	bgt .LZGEMM_L1x2_SUB2

.LZGEMM_L1x2_SAVE:

	SAVE1x2

.LZGEMM_L1x2_END:

/* ----- 1x1 tile, run when M & 1 is set --------------------------------- */

.LZGEMM_L1x1_BEGIN:

	andi. T1, M, 1
	ble .LZGEMM_L1x1_END		/* bit 0 of M clear : skip the x1 tile */
	mr BO, B			/* BO = B */
	srawi. L, K, 3			/* L = K >> 3 */
	ble .LZGEMM_L1x1_SUB0		/* only the K & 7 tail */
	cmpwi cr0, L, 1
	ble .LZGEMM_L1x1_SUB4		/* single group of eight */

.LZGEMM_L1x1_LOOP_START:

	LOAD1x1_1
	KERNEL1x1_I1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_2

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_2

	addic. L, L, -2			/* account for the first and the last group */
	ble .LZGEMM_L1x1_LOOP_END	/* L was exactly 2 */

	.align 5

.LZGEMM_L1x1_LOOP:

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_2

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_2

	addic. L, L, -1
	bgt .LZGEMM_L1x1_LOOP

.LZGEMM_L1x1_LOOP_END:

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_2

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_E2			/* closing variant of the last group */

	b .LZGEMM_L1x1_SUB1

	/* L == 1 : one _SUBI1 followed by seven _SUB1. */
.LZGEMM_L1x1_SUB4:

	KERNEL1x1_SUBI1
	KERNEL1x1_SUB1
	KERNEL1x1_SUB1
	KERNEL1x1_SUB1

	KERNEL1x1_SUB1
	KERNEL1x1_SUB1
	KERNEL1x1_SUB1
	KERNEL1x1_SUB1

	b .LZGEMM_L1x1_SUB1

	/* K >> 3 <= 0 : _SUBI1 runs before the remainder is tested. */
.LZGEMM_L1x1_SUB0:

	andi. L, K, 7			/* L = K & 7 */

	KERNEL1x1_SUBI1

	addic. L, L, -1
	ble .LZGEMM_L1x1_SAVE
	b .LZGEMM_L1x1_SUB2

.LZGEMM_L1x1_SUB1:

	andi. L, K, 7			/* L = K & 7 */
	ble .LZGEMM_L1x1_SAVE		/* remainder is zero */

.LZGEMM_L1x1_SUB2:

	KERNEL1x1_SUB1

	addic. L, L, -1
	bgt .LZGEMM_L1x1_SUB2

.LZGEMM_L1x1_SAVE:

	SAVE1x1

.LZGEMM_L1x1_END:

.LZGEMM_L1_END: