// mul.S
#include "macros.S"
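// macros.S is assumed to provide the ldrstr4 macro used below: it appears to
// issue the given load/store instruction four times, once per listed register,
// from the same base register at the four given immediate offsets.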
.syntax unified
.cpu cortex-m4
.align 2
.global __asm_base_mul
.type __asm_base_mul, %function
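// __asm_base_mul: base-case multiplication, apparently the pointwise step of
// an NTT-based polynomial multiplication. The operands are processed as
// degree-4 chunks of packed 32-bit coefficients, and each chunk pair is
// multiplied modulo x^4 -/+ zeta with signed Montgomery reductions.
// Register convention (inferred from the code below, not from a spec):
//   r0      - first operand, also the destination (results written in place)
//   r1      - pointer to the per-chunk twiddle factors (zetas)
//   r2, r3  - Montgomery constants, assumed to satisfy r2*r3 == -1 mod 2^32
//             (e.g. r2 = q^{-1} mod 2^32 and r3 = -q, or the negated pair)
//   the second operand is assumed to start 8192 bytes after the first
//   (2048 packed int32 coefficients).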
__asm_base_mul:
push.w {r4-r12, lr}
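// With lr saved it is free as a scratch register: form the pointer to the
// second operand, assumed to sit 8192 bytes (2048 int32 coefficients) after
// the first. Both pointers are parked in caller-saved FP registers s0/s1 to
// keep all core registers available for the multiply-accumulate chains.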
add.w r14, r0, #4096
add.w r14, r14, #4096
vmov.w s0, s1, r0, r14
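// r1 points at the twiddle factors (zetas); keep that pointer in s2.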
vmov.w s2, r1
#ifdef LOOP
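// LOOP build: stash the end pointer (r0 + 8192, i.e. the start of the second
// operand) in s3 for the loop-termination check at the bottom.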
vmov.w s3, r14
_mul_loop:
#else
.rept 256
#endif
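// Each iteration multiplies two degree-4 chunks (8 coefficients of each
// operand) and writes the 8 result coefficients in place over the first
// operand; 256 iterations (or .rept copies) cover the full 8192 bytes.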
vmov.w r0, r14, s0, s1
ldrstr4 ldr.w, r0, r4, r5, r6, r7, #0, #4, #8, #12
ldrstr4 ldr.w, r14, r8, r9, r10, r11, #0, #4, #8, #12
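// Fetch the next twiddle factor zeta into r1, post-incrementing the zeta
// pointer kept in s2.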
vmov.w r12, s2
ldr.w r1, [r12], #4
vmov.w s2, r12
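// Register map for this chunk: r4-r7 = a0..a3, r8-r11 = b0..b3, r1 = zeta.
// Each coefficient is accumulated as a 64-bit sum with smull/smlal and then
// Montgomery-reduced by the pair "mul r0, r12, r2; smlal r12, r14, r0, r3",
// which leaves the reduced value in the high word r14 (this assumes
// r2*r3 == -1 mod 2^32). Up to the Montgomery factors of 2^-32:
// c0 = a0*b0 + zeta*(a1*b3 + a2*b2 + a3*b1)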
smull r12, r14, r5, r11
smlal r12, r14, r6, r10
smlal r12, r14, r7, r9
mul r0, r12, r2
smlal r12, r14, r0, r3
smull r12, r14, r14, r1
smlal r12, r14, r4, r8
mul r0, r12, r2
smlal r12, r14, r0, r3
vmov.w r0, s0
str.w r14, [r0, #0]
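// c1 = a0*b1 + a1*b0 + zeta*(a2*b3 + a3*b2)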
smull r12, r14, r6, r11
smlal r12, r14, r7, r10
mul r0, r12, r2
smlal r12, r14, r0, r3
smull r12, r14, r14, r1
smlal r12, r14, r4, r9
smlal r12, r14, r5, r8
mul r0, r12, r2
smlal r12, r14, r0, r3
vmov.w r0, s0
str.w r14, [r0, #4]
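// c2 = a0*b2 + a1*b1 + a2*b0 + zeta*(a3*b3)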
smull r12, r14, r7, r11
mul r0, r12, r2
smlal r12, r14, r0, r3
smull r12, r14, r14, r1
smlal r12, r14, r4, r10
smlal r12, r14, r5, r9
smlal r12, r14, r6, r8
mul r0, r12, r2
smlal r12, r14, r0, r3
vmov.w r0, s0
str.w r14, [r0, #8]
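// c3 = a0*b3 + a1*b2 + a2*b1 + a3*b0 (no zeta term)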
smull r12, r14, r4, r11
smlal r12, r14, r5, r10
smlal r12, r14, r6, r9
smlal r12, r14, r7, r8
mul r0, r12, r2
smlal r12, r14, r0, r3
vmov.w r0, s0
str.w r14, [r0, #12]
// ================================
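// Second degree-4 chunk: coefficients at offsets #16..#28 of both operands.
// The second-operand pointer in s1 is advanced by 32 bytes for the next
// iteration, and zeta is negated, so the two chunks appear to be multiplied
// modulo x^4 - zeta and x^4 + zeta (in some order fixed by the zetas table).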
vmov.w r14, s1
ldrstr4 ldr.w, r0, r4, r5, r6, r7, #16, #20, #24, #28
ldrstr4 ldr.w, r14, r8, r9, r10, r11, #16, #20, #24, #28
add.w r14, r14, #32
vmov.w s1, r14
neg.w r1, r1
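// Same multiply/reduce sequence as above, now with the negated twiddle in r1;
// the results for this chunk are stored at offsets #16..#28.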
smull r12, r14, r5, r11
smlal r12, r14, r6, r10
smlal r12, r14, r7, r9
mul r0, r12, r2
smlal r12, r14, r0, r3
smull r12, r14, r14, r1
smlal r12, r14, r4, r8
mul r0, r12, r2
smlal r12, r14, r0, r3
vmov.w r0, s0
str.w r14, [r0, #16]
smull r12, r14, r6, r11
smlal r12, r14, r7, r10
mul r0, r12, r2
smlal r12, r14, r0, r3
smull r12, r14, r14, r1
smlal r12, r14, r4, r9
smlal r12, r14, r5, r8
mul r0, r12, r2
smlal r12, r14, r0, r3
vmov.w r0, s0
str.w r14, [r0, #20]
smull r12, r14, r7, r11
mul r0, r12, r2
smlal r12, r14, r0, r3
smull r12, r14, r14, r1
smlal r12, r14, r4, r10
smlal r12, r14, r5, r9
smlal r12, r14, r6, r8
mul r0, r12, r2
smlal r12, r14, r0, r3
vmov.w r0, s0
str.w r14, [r0, #24]
smull r12, r14, r4, r11
smlal r12, r14, r5, r10
smlal r12, r14, r6, r9
smlal r12, r14, r7, r8
mul r0, r12, r2
smlal r12, r14, r0, r3
vmov.w r0, s0
str.w r14, [r0, #28]
add.w r0, r0, #32
vmov.w s0, r0
#ifdef LOOP
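// Keep iterating until the output pointer r0 reaches the saved end pointer
// in s3 (the start of the second operand).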
vmov.w r12, s3
cmp.w r0, r12
bne.w _mul_loop
#else
.endr
#endif
pop.w {r4-r12, pc}