#include "macros.S" .syntax unified .cpu cortex-m4 .align 2 .global __asm_intt .type __asm_intt, %function __asm_intt: push.w {r4-r12, lr} // RmodM movw r4, #35407 movt r4, #46 neg.w r4, r4 vmov.w s15, r4 add.w r1, r1, #252 vldm.w r1!, {s4-s10} vmov.w s1, r1 #ifdef LOOP add.w r14, r0, #16 vmov.w s3, r14 _8_7_6_light: #else .rept 2 #endif .rept 2 ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #0, #16, #32, #48, #64, #80, #96, #112 add_sub4 r4, r5, r6, r7, r8, r9, r10, r11 vmov.w r1, s9 montgomery_mul r9, r1, r12, r9, r2, r3, r14 vmov.w r1, s10 montgomery_mul r11, r1, r12, r11, r2, r3, r14 montgomery_mul_vec4 r4, r6, r8, r10, s15, r1, r2, r3, r12, r14 montgomery_mul r5, r1, r12, r5, r2, r3, r14 vmov.w r1, s8 montgomery_mul r7, r1, r12, r7, r2, r3, r14 add_sub4 r4, r6, r5, r7, r8, r10, r9, r11 montgomery_mul r10, r1, r12, r10, r2, r3, r14 montgomery_mul r11, r1, r12, r11, r2, r3, r14 add_sub4 r4, r8, r5, r9, r6, r10, r7, r11 ldrstrvecjump str.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #16, #32, #48, #64, #80, #96, #112, #4 .endr #ifdef LOOP vmov.w r14, s3 cmp.w r0, r14 bne.w _8_7_6_light #else .endr #endif add.w r0, r0, #112 #ifdef LOOP add.w r12, r0, #3968 add.w r12, r12, #4096 vmov.w s2, r12 _8_7_6: #else .rept 63 #endif vmov.w r1, s1 vldm.w r1!, {s4-s10} vmov.w s1, r1 #ifdef LOOP add.w r14, r0, #16 vmov.w s3, r14 _8_7_6_inner: #else .rept 2 #endif .rept 2 ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #0, #16, #32, #48, #64, #80, #96, #112 add_sub4 r4, r5, r6, r7, r8, r9, r10, r11 vmov.w r1, s7 montgomery_mul r5, r1, r12, r5, r2, r3, r14 vmov.w r1, s8 montgomery_mul r7, r1, r12, r7, r2, r3, r14 vmov.w r1, s9 montgomery_mul r9, r1, r12, r9, r2, r3, r14 vmov.w r1, s10 montgomery_mul r11, r1, r12, r11, r2, r3, r14 montgomery_mul_vec4 r4, r6, r8, r10, s15, r1, r2, r3, r12, r14 add_sub4 r4, r6, r5, r7, r8, r10, r9, r11 vmov.w r1, s5 montgomery_mul r6, r1, r12, r6, r2, r3, r14 montgomery_mul r7, r1, r12, r7, r2, r3, r14 vmov.w r1, s6 montgomery_mul r10, r1, r12, r10, r2, r3, r14 montgomery_mul r11, r1, r12, r11, r2, r3, r14 add_sub4 r4, r8, r5, r9, r6, r10, r7, r11 montgomery_mul_vec4 r8, r9, r10, r11, s4, r1, r2, r3, r12, r14 ldrstrvecjump str.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #16, #32, #48, #64, #80, #96, #112, #4 .endr #ifdef LOOP vmov.w r14, s3 cmp.w r0, r14 bne.w _8_7_6_inner #else .endr #endif add.w r0, r0, #112 #ifdef LOOP vmov.w r12, s2 cmp.w r0, r12 bne.w _8_7_6 #else .endr #endif sub.w r0, r0, #4096 sub.w r0, r0, #4096 vmov.w r1, s1 sub.w r1, r1, #2016 vldm.w r1!, {s4-s10} vmov.w s1, r1 #ifdef LOOP add.w r14, r0, #128 vmov.w s3, r14 _5_4_3_light: #else .rept 16 #endif .rept 2 ldrstr4 ldr.w, r0, r8, r5, r10, r7, #512, #640, #768, #896 _3_layer_GS_butterfly_light_fast_first r4, r5, r6, r7, r8, r9, r10, r11, s4, s5, s6, s7, s8, s9, s10, r1, r2, r3, r12, r14 ldrstr4 ldr.w, r0, r4, r5, r6, r7, #0, #128, #256, #384 _3_layer_GS_butterfly_light_fast_second r4, r5, r6, r7, r8, r9, r10, r11, s4, s5, s6, s7, s8, s9, s10, r1, r2, r3, r12, r14 ldrstrvecjump str.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #128, #256, #384, #512, #640, #768, #896, #4 .endr #ifdef LOOP vmov.w r14, s3 cmp.w r0, r14 bne.w _5_4_3_light #else .endr #endif add.w r0, r0, #896 #ifdef LOOP add.w r12, r0, #3072 add.w r12, r12, #4096 vmov.w s2, r12 _5_4_3: #else .rept 7 #endif vmov.w r1, s1 vldm.w r1!, {s4-s10} vmov.w s1, r1 #ifdef LOOP add.w r14, r0, #128 vmov.w s3, r14 _5_4_3_inner: #else .rept 16 #endif .rept 2 ldrstrvec ldr.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #0, #128, #256, #384, #512, #640, #768, #896 _3_layer_GS_butterfly r4, r5, r6, r7, r8, r9, r10, r11, s4, s5, s6, s7, s8, s9, s10, r1, r2, r3, r12, r14 ldrstrvecjump str.w, r0, r4, r5, r6, r7, r8, r9, r10, r11, #128, #256, #384, #512, #640, #768, #896, #4 .endr #ifdef LOOP vmov.w r14, s3 cmp.w r0, r14 bne.w _5_4_3_inner #else .endr #endif add.w r0, r0, #896 #ifdef LOOP vmov.w r12, s2 cmp.w r0, r12 bne.w _5_4_3 #else .endr #endif vmov.w r1, s1 sub.w r1, r1, #252 vldm.w r1, {s4-s10} sub.w r1, r0, #4096 sub.w r0, r1, #4096 vmov.w s0, r1 #ifdef LOOP add.w r14, r0, #1024 vmov.w s3, r14 _2_1_0: #else .rept 64 #endif .rept 4 vmov.w r1, s0 ldrstr4 ldr.w, r1, r8, r5, r10, r7, #0, #1024, #2048, #3072 _3_layer_GS_butterfly_light_fast_first r4, r5, r6, r7, r8, r9, r10, r11, s4, s5, s6, s7, s8, s9, s10, r1, r2, r3, r12, r14 ldrstr4 ldr.w, r0, r4, r5, r6, r7, #0, #1024, #2048, #3072 _3_layer_GS_butterfly_light_fast_second r4, r5, r6, r7, r8, r9, r10, r11, s4, s5, s6, s7, s8, s9, s10, r1, r2, r3, r12, r14 vmov.w r1, s0 ldrstr4jump str.w, r0, r4, r5, r6, r7, #1024, #2048, #3072, #4 ldrstr4jump str.w, r1, r8, r9, r10, r11, #1024, #2048, #3072, #4 vmov.w s0, r1 .endr #ifdef LOOP vmov.w r14, s3 cmp.w r0, r14 bne.w _2_1_0 #else .endr #endif pop.w {r4-r12, pc}