// https://stackoverflow.com/questions/22396214/understanding-this-part-arm-assembly-code .arch armv7e-m .syntax unified .thumb .global fe25519_add_asm .type fe25519_add_asm, %function fe25519_add_asm: // fe25519 add for Cortex-M4 // output partially reduced // r0 = result ptr, r1,r2 = operand ptr. push {r4-r7} // constant 1, lets us sum three words via UMAAL mov r7, #1 ldr r4, [r1, 7*4] ldr r3, [r2, 7*4] mov r5, r3 umaal r4, r5, r4, r7 // (r5, r4) = r4 + r5 + r4*r7 = left[7] + right[7] + left[7] = 2*left[7] + right[7] umlal r4, r5, r3, r7 // (r5, r4) = (r5, r4) + r3*r7 = ... = 2*left[7] + 2*right[7] mov r3, #19 mul r3, r5 // r3 = 19*bits 255 and higher, add to output[0] // for the remainder, r4 stores `output[7] << 1`, // we don't touch/use it ldr r5, [r1, 0*4] ldr r6, [r2, 0*4] umaal r5, r6, r7, r3 // r5 = lo(r5 + r6 + r7*r3) str r5, [r0, 0*4] ldr r3, [r1, 1*4] ldr r5, [r2, 1*4] umaal r5, r6, r7, r3 str r5, [r0, 1*4] ldr r3, [r1, 2*4] ldr r5, [r2, 2*4] umaal r5, r6, r7, r3 str r5, [r0, 2*4] ldr r3, [r1, 3*4] ldr r5, [r2, 3*4] umaal r5, r6, r7, r3 str r5, [r0, 3*4] ldr r3, [r1, 4*4] ldr r5, [r2, 4*4] umaal r5, r6, r7, r3 str r5, [r0, 4*4] ldr r3, [r1, 5*4] ldr r5, [r2, 5*4] umaal r5, r6, r7, r3 str r5, [r0, 5*4] ldr r3, [r1, 6*4] ldr r5, [r2, 6*4] umaal r5, r6, r7, r3 str r5, [r0, 6*4] add r6, r6, r4, LSR #1 str r6, [r0, 7*4] pop {r4-r7} // don't forget this or you'll get weird values ;) bx lr /* .size fe25519_add_asm, .-fe25519_add_asm */