/****************************************************************************** * ARM assembly implemetnations of the AES-128 and AES-256 key schedule to * match fixslicing. * Note that those implementations are fully bitsliced and do not rely on any * Look-Up Table (LUT). * * See the paper at https://eprint.iacr.org/2020/1123.pdf for more details. * * @author Alexandre Adomnicai, Nanyang Technological University, Singapore * alexandre.adomnicai@ntu.edu.sg * * @date October 2020 ******************************************************************************/ .syntax unified .thumb /****************************************************************************** * Macro to compute the SWAPMOVE technique: swap the bits in 'in1' masked by 'm' * by the bits in 'in0' masked by 'm << n' and put the results in 'out0', 'out1' ******************************************************************************/ .macro swpmv out0, out1, in0, in1, m, n, tmp eor \tmp, \in1, \in0, lsr \n and \tmp, \m eor \out1, \in1, \tmp eor \out0, \in0, \tmp, lsl \n .endm /****************************************************************************** * Packing routine. Note that it is the same as the one used in the encryption * function so some code size could be saved by merging the two files. ******************************************************************************/ .align 2 packing: movw r3, #0x0f0f movt r3, #0x0f0f // r3 <- 0x0f0f0f0f (mask for SWAPMOVE) eor r2, r3, r3, lsl #2 // r2 <- 0x33333333 (mask for SWAPMOVE) eor r1, r2, r2, lsl #1 // r1 <- 0x55555555 (mask for SWAPMOVE) swpmv r8, r4, r8, r4, r1, #1, r12 swpmv r9, r5, r9, r5, r1, #1, r12 swpmv r10, r6, r10, r6, r1, #1, r12 swpmv r11, r7, r11, r7, r1, #1, r12 swpmv r0, r4, r5, r4, r2, #2, r12 swpmv r9, r5, r9, r8, r2, #2, r12 swpmv r7, r8, r7, r6, r2, #2, r12 swpmv r11, r2, r11, r10, r2, #2, r12 swpmv r8, r4, r8, r4, r3, #4, r12 swpmv r10, r6, r7, r0, r3, #4, r12 swpmv r11, r7, r11, r9, r3, #4, r12 swpmv r9, r5, r2, r5, r3, #4, r12 bx lr /****************************************************************************** * Subroutine that computes S-box. Note that the same code is used in the * encryption function, so some code size could be saved by merging the 2 files. * Credits to https://github.com/Ko-/aes-armcortexm. ******************************************************************************/ .align 2 sbox: str r14, [sp, #52] eor r1, r7, r9 //Exec y14 = U3 ^ U5; into r1 eor r3, r4, r10 //Exec y13 = U0 ^ U6; into r3 eor r2, r3, r1 //Exec y12 = y13 ^ y14; into r2 eor r0, r8, r2 //Exec t1 = U4 ^ y12; into r0 eor r14, r0, r9 //Exec y15 = t1 ^ U5; into r14 and r12, r2, r14 //Exec t2 = y12 & y15; into r12 eor r8, r14, r11 //Exec y6 = y15 ^ U7; into r8 eor r0, r0, r5 //Exec y20 = t1 ^ U1; into r0 str.w r2, [sp, #44] //Store r2/y12 on stack eor r2, r4, r7 //Exec y9 = U0 ^ U3; into r2 str r0, [sp, #40] //Store r0/y20 on stack eor r0, r0, r2 //Exec y11 = y20 ^ y9; into r0 str r2, [sp, #36] //Store r2/y9 on stack and r2, r2, r0 //Exec t12 = y9 & y11; into r2 str r8, [sp, #32] //Store r8/y6 on stack eor r8, r11, r0 //Exec y7 = U7 ^ y11; into r8 eor r9, r4, r9 //Exec y8 = U0 ^ U5; into r9 eor r6, r5, r6 //Exec t0 = U1 ^ U2; into r6 eor r5, r14, r6 //Exec y10 = y15 ^ t0; into r5 str r14, [sp, #28] //Store r14/y15 on stack eor r14, r5, r0 //Exec y17 = y10 ^ y11; into r14 str.w r1, [sp, #24] //Store r1/y14 on stack and r1, r1, r14 //Exec t13 = y14 & y17; into r1 eor r1, r1, r2 //Exec t14 = t13 ^ t12; into r1 str r14, [sp, #20] //Store r14/y17 on stack eor r14, r5, r9 //Exec y19 = y10 ^ y8; into r14 str.w r5, [sp, #16] //Store r5/y10 on stack and r5, r9, r5 //Exec t15 = y8 & y10; into r5 eor r2, r5, r2 //Exec t16 = t15 ^ t12; into r2 eor r5, r6, r0 //Exec y16 = t0 ^ y11; into r5 str.w r0, [sp, #12] //Store r0/y11 on stack eor r0, r3, r5 //Exec y21 = y13 ^ y16; into r0 str r3, [sp, #8] //Store r3/y13 on stack and r3, r3, r5 //Exec t7 = y13 & y16; into r3 str r5, [sp, #4] //Store r5/y16 on stack str r11, [sp, #0] //Store r11/U7 on stack eor r5, r4, r5 //Exec y18 = U0 ^ y16; into r5 eor r6, r6, r11 //Exec y1 = t0 ^ U7; into r6 eor r7, r6, r7 //Exec y4 = y1 ^ U3; into r7 and r11, r7, r11 //Exec t5 = y4 & U7; into r11 eor r11, r11, r12 //Exec t6 = t5 ^ t2; into r11 eor r11, r11, r2 //Exec t18 = t6 ^ t16; into r11 eor r14, r11, r14 //Exec t22 = t18 ^ y19; into r14 eor r4, r6, r4 //Exec y2 = y1 ^ U0; into r4 and r11, r4, r8 //Exec t10 = y2 & y7; into r11 eor r11, r11, r3 //Exec t11 = t10 ^ t7; into r11 eor r2, r11, r2 //Exec t20 = t11 ^ t16; into r2 eor r2, r2, r5 //Exec t24 = t20 ^ y18; into r2 eor r10, r6, r10 //Exec y5 = y1 ^ U6; into r10 and r11, r10, r6 //Exec t8 = y5 & y1; into r11 eor r3, r11, r3 //Exec t9 = t8 ^ t7; into r3 eor r3, r3, r1 //Exec t19 = t9 ^ t14; into r3 eor r3, r3, r0 //Exec t23 = t19 ^ y21; into r3 eor r0, r10, r9 //Exec y3 = y5 ^ y8; into r0 ldr r11, [sp, #32] //Load y6 into r11 and r5, r0, r11 //Exec t3 = y3 & y6; into r5 eor r12, r5, r12 //Exec t4 = t3 ^ t2; into r12 ldr r5, [sp, #40] //Load y20 into r5 str r7, [sp, #32] //Store r7/y4 on stack eor r12, r12, r5 //Exec t17 = t4 ^ y20; into r12 eor r1, r12, r1 //Exec t21 = t17 ^ t14; into r1 and r12, r1, r3 //Exec t26 = t21 & t23; into r12 eor r5, r2, r12 //Exec t27 = t24 ^ t26; into r5 eor r12, r14, r12 //Exec t31 = t22 ^ t26; into r12 eor r1, r1, r14 //Exec t25 = t21 ^ t22; into r1 and r7, r1, r5 //Exec t28 = t25 & t27; into r7 eor r14, r7, r14 //Exec t29 = t28 ^ t22; into r14 and r4, r14, r4 //Exec z14 = t29 & y2; into r4 and r8, r14, r8 //Exec z5 = t29 & y7; into r8 eor r7, r3, r2 //Exec t30 = t23 ^ t24; into r7 and r12, r12, r7 //Exec t32 = t31 & t30; into r12 eor r12, r12, r2 //Exec t33 = t32 ^ t24; into r12 eor r7, r5, r12 //Exec t35 = t27 ^ t33; into r7 and r2, r2, r7 //Exec t36 = t24 & t35; into r2 eor r5, r5, r2 //Exec t38 = t27 ^ t36; into r5 and r5, r14, r5 //Exec t39 = t29 & t38; into r5 eor r1, r1, r5 //Exec t40 = t25 ^ t39; into r1 eor r5, r14, r1 //Exec t43 = t29 ^ t40; into r5 ldr.w r7, [sp, #4] //Load y16 into r7 and r7, r5, r7 //Exec z3 = t43 & y16; into r7 eor r8, r7, r8 //Exec tc12 = z3 ^ z5; into r8 str r8, [sp, #40] //Store r8/tc12 on stack ldr r8, [sp, #8] //Load y13 into r8 and r8, r5, r8 //Exec z12 = t43 & y13; into r8 and r10, r1, r10 //Exec z13 = t40 & y5; into r10 and r6, r1, r6 //Exec z4 = t40 & y1; into r6 eor r6, r7, r6 //Exec tc6 = z3 ^ z4; into r6 eor r3, r3, r12 //Exec t34 = t23 ^ t33; into r3 eor r3, r2, r3 //Exec t37 = t36 ^ t34; into r3 eor r1, r1, r3 //Exec t41 = t40 ^ t37; into r1 ldr.w r5, [sp, #16] //Load y10 into r5 and r2, r1, r5 //Exec z8 = t41 & y10; into r2 and r9, r1, r9 //Exec z17 = t41 & y8; into r9 str r9, [sp, #16] //Store r9/z17 on stack eor r5, r12, r3 //Exec t44 = t33 ^ t37; into r5 ldr r9, [sp, #28] //Load y15 into r9 ldr.w r7, [sp, #44] //Load y12 into r7 and r9, r5, r9 //Exec z0 = t44 & y15; into r9 and r7, r5, r7 //Exec z9 = t44 & y12; into r7 and r0, r3, r0 //Exec z10 = t37 & y3; into r0 and r3, r3, r11 //Exec z1 = t37 & y6; into r3 eor r3, r3, r9 //Exec tc5 = z1 ^ z0; into r3 eor r3, r6, r3 //Exec tc11 = tc6 ^ tc5; into r3 ldr r11, [sp, #32] //Load y4 into r11 ldr.w r5, [sp, #20] //Load y17 into r5 and r11, r12, r11 //Exec z11 = t33 & y4; into r11 eor r14, r14, r12 //Exec t42 = t29 ^ t33; into r14 eor r1, r14, r1 //Exec t45 = t42 ^ t41; into r1 and r5, r1, r5 //Exec z7 = t45 & y17; into r5 eor r6, r5, r6 //Exec tc8 = z7 ^ tc6; into r6 ldr r5, [sp, #24] //Load y14 into r5 str r4, [sp, #32] //Store r4/z14 on stack and r1, r1, r5 //Exec z16 = t45 & y14; into r1 ldr r5, [sp, #12] //Load y11 into r5 ldr r4, [sp, #36] //Load y9 into r4 and r5, r14, r5 //Exec z6 = t42 & y11; into r5 eor r5, r5, r6 //Exec tc16 = z6 ^ tc8; into r5 and r4, r14, r4 //Exec z15 = t42 & y9; into r4 eor r14, r4, r5 //Exec tc20 = z15 ^ tc16; into r14 eor r4, r4, r1 //Exec tc1 = z15 ^ z16; into r4 eor r1, r0, r4 //Exec tc2 = z10 ^ tc1; into r1 eor r0, r1, r11 //Exec tc21 = tc2 ^ z11; into r0 eor r7, r7, r1 //Exec tc3 = z9 ^ tc2; into r7 eor r1, r7, r5 //Exec S0 = tc3 ^ tc16; into r1 eor r7, r7, r3 //Exec S3 = tc3 ^ tc11; into r7 eor r3, r7, r5 //Exec S1 = S3 ^ tc16 ^ 1; into r3 eor r11, r10, r4 //Exec tc13 = z13 ^ tc1; into r11 ldr.w r4, [sp, #0] //Load U7 into r4 and r12, r12, r4 //Exec z2 = t33 & U7; into r12 eor r9, r9, r12 //Exec tc4 = z0 ^ z2; into r9 eor r12, r8, r9 //Exec tc7 = z12 ^ tc4; into r12 eor r2, r2, r12 //Exec tc9 = z8 ^ tc7; into r2 eor r2, r6, r2 //Exec tc10 = tc8 ^ tc9; into r2 ldr.w r4, [sp, #32] //Load z14 into r4 eor r12, r4, r2 //Exec tc17 = z14 ^ tc10; into r12 eor r0, r0, r12 //Exec S5 = tc21 ^ tc17; into r0 eor r6, r12, r14 //Exec tc26 = tc17 ^ tc20; into r6 ldr.w r4, [sp, #16] //Load z17 into r4 ldr r12, [sp, #40] //Load tc12 into r12 eor r6, r6, r4 //Exec S2 = tc26 ^ z17 ^ 1; into r6 eor r12, r9, r12 //Exec tc14 = tc4 ^ tc12; into r12 eor r14, r11, r12 //Exec tc18 = tc13 ^ tc14; into r14 eor r2, r2, r14 //Exec S6 = tc10 ^ tc18 ^ 1; into r2 eor r11, r8, r14 //Exec S7 = z12 ^ tc18 ^ 1; into r11 ldr r14, [sp, #52] // restore link register eor r8, r12, r7 //Exec S4 = tc14 ^ S3; into r8 bx lr // [('r0', 'S5'), ('r1', 'S0'), ('r2', 'S6'), ('r3', 'S1'), // ('r6', 'S2'),('r7', 'S3'), ('r8', 'S4'), ('r11', 'S7')] /****************************************************************************** * Subroutine that XORs the columns after the S-box during the AES-128 key * schedule round function, for rounds i such that (i % 4) == 0. * Note that the code size could be reduced at the cost of some instructions * since some redundant code is applied on different registers. ******************************************************************************/ .align 2 aes128_xorcolumns_rotword: ldr r12, [sp, #56] // restore 'rkeys' address ldr.w r5, [r12, #28] // load rkey word of rkey from prev round movw r4, #0xc0c0 movt r4, #0xc0c0 // r4 <- 0xc0c0c0c0 eor r11, r5, r11, ror #2 // r11<- r5 ^ (r11 >>> 2) bic r11, r4, r11 // r11<- ~r11 & 0xc0c0c0c0 (NOT omitted in sbox) eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 orr r11, r11, r9 // r11<- r11 | r9 eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c orr r11, r11, r9 // r11<- r11 | r9 eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 orr r11, r11, r9 // r11<- r11 | r9 mvn r9, r5 // NOT omitted in sbox ldr.w r5, [r12, #24] // load rkey word of rkey from prev round str r9, [r12, #28] // store new rkey word after NOT str r11, [r12, #60] // store new rkey word in 'rkeys' eor r10, r5, r2, ror #2 // r10<- r5 ^ (r2 >>> 2) bic r10, r4, r10 // r10<- ~r10 & 0xc0c0c0c0 (NOT omitted in sbox) eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 orr r10, r10, r9 // r10<- r10 | r9 eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c orr r10, r10, r9 // r10<- r10 | r9 eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 orr r10, r10, r9 // r10<- r10 | r9 mvn r9, r5 // NOT omitted in sbox ldr.w r2, [r12, #20] // load rkey word of rkey from prev round str r9, [r12, #24] // store new rkey word after NOT str r10, [r12, #56] // store new rkey word in 'rkeys' eor r9, r2, r0, ror #2 // r9 <- r2 ^ (r9 >>> 2) and r9, r4, r9 // r9 <- r9 & 0xc0c0c0c0 eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r9, r9, r0 // r9 <- r9 | r0 eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r9, r9, r0 // r9 <- r9 | r0 eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r9, r9, r0 // r9 <- r9 | r0 ldr.w r2, [r12, #16] // load rkey word of rkey from prev round str.w r9, [r12, #52] // store new rkey word in 'rkeys' eor r8, r2, r8, ror #2 // r8 <- r2 ^ (r8 >>> 2) and r8, r4, r8 // r8 <- r8 & 0xc0c0c0c0 eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r8, r8, r0 // r8 <- r8 | r0 eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r8, r8, r0 // r8 <- r8 | r0 eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r8, r8, r0 // r8 <- r8 | r0 ldr.w r2, [r12, #12] // load rkey word of rkey from prev round str.w r8, [r12, #48] // store new rkey word in 'rkeys' eor r7, r2, r7, ror #2 // r7 <- r2 ^ (r7 >>> 2) and r7, r4, r7 // r7 <- r7 & 0xc0c0c0c0 eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r7, r7, r0 // r7 <- r7 | r0 eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r7, r7, r0 // r7 <- r7 | r0 eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r7, r7, r0 // r7 <- r7 | r0 ldr.w r2, [r12, #8] // load rkey word of rkey from prev round str.w r7, [r12, #44] // store new rkey word in 'rkeys' eor r6, r2, r6, ror #2 // r6 <- r2 ^ (r6 >>> 2) bic r6, r4, r6 // r6 <- ~r6 & 0xc0c0c0c0 (NOT omitted in sbox) eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r6, r6, r0 // r6 <- r6 | r0 eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r6, r6, r0 // r6 <- r6 | r0 eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r6, r6, r0 // r6 <- r6 | r0 mvn r0, r2 // NOT omitted in sbox ldr.w r2, [r12, #4] // load rkey word of rkey from prev round str.w r0, [r12, #8] // store new rkey word after NOT str.w r6, [r12, #40] // store new rkey word in 'rkeys' eor r5, r2, r3, ror #2 // r5 <- r2 ^ (r3 >>> 2) bic r5, r4, r5 // r5 <- ~r5 & 0xc0c0c0c0 (NOT omitted in sbox) eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r5, r5, r0 // r5 <- r5 | r0 eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r5, r5, r0 // r5 <- r5 | r0 eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r5, r5, r0 // r5 <- r5 | r0 mvn r0, r2 // NOT omitted in sbox ldr.w r2, [r12], #32 // load rkey word of rkey from prev round str.w r0, [r12, #-28] // store new rkey word after NOT str.w r5, [r12, #4] // store new rkey word in 'rkeys' eor r3, r2, r1, ror #2 // r3 <- r2 ^ (r1 >>> 2) and r3, r4, r3 // r3 <- r3 & 0xc0c0c0c0 eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r3, r3, r0 // r3 <- r3 | r0 eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r3, r3, r0 // r3 <- r3 | r0 eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r4, r3, r0 // r4 <- r3 | r0 str.w r4, [r12] str.w r12, [sp, #56] // store the new rkeys address on the stack bx lr /****************************************************************************** * Subroutine that XORs the columns after the S-box during the AES-256 key * schedule round function, for rounds i such that (i % 4) == 0. * Differs from 'aes128_xorcolumns_rotword' by the rkeys' indexes to be involved * in XORs. ******************************************************************************/ .align 2 aes256_xorcolumns_rotword: ldr r12, [sp, #56] // restore 'rkeys' address ldr.w r5, [r12, #28] // load rkey word of rkey from prev round movw r4, #0xc0c0 movt r4, #0xc0c0 // r4 <- 0xc0c0c0c0 eor r11, r5, r11, ror #2 // r11<- r5 ^ (r11 >>> 2) bic r11, r4, r11 // r11<- ~r11 & 0xc0c0c0c0 (NOT omitted in sbox) eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 orr r11, r11, r9 // r11<- r11 | r9 eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c orr r11, r11, r9 // r11<- r11 | r9 eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 orr r11, r11, r9 // r11<- r11 | r9 mvn r9, r5 // NOT omitted in sbox ldr.w r5, [r12, #24] // load rkey word of rkey from prev round str r9, [r12, #28] // store new rkey word after NOT str r11, [r12, #92] // store new rkey word in 'rkeys' eor r10, r5, r2, ror #2 // r10<- r5 ^ (r2 >>> 2) bic r10, r4, r10 // r10<- ~r10 & 0xc0c0c0c0 (NOT omitted in sbox) eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 orr r10, r10, r9 // r10<- r10 | r9 eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c orr r10, r10, r9 // r10<- r10 | r9 eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 orr r10, r10, r9 // r10<- r10 | r9 mvn r9, r5 // NOT omitted in sbox ldr.w r2, [r12, #20] // load rkey word of rkey from prev round str r9, [r12, #24] // store new rkey word after NOT str r10, [r12, #88] // store new rkey word in 'rkeys' eor r9, r2, r0, ror #2 // r9 <- r2 ^ (r9 >>> 2) and r9, r4, r9 // r9 <- r9 & 0xc0c0c0c0 eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r9, r9, r0 // r9 <- r9 | r0 eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r9, r9, r0 // r9 <- r9 | r0 eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r9, r9, r0 // r9 <- r9 | r0 ldr.w r2, [r12, #16] // load rkey word of rkey from prev round str.w r9, [r12, #84] // store new rkey word in 'rkeys' eor r8, r2, r8, ror #2 // r8 <- r2 ^ (r8 >>> 2) and r8, r4, r8 // r8 <- r8 & 0xc0c0c0c0 eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r8, r8, r0 // r8 <- r8 | r0 eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r8, r8, r0 // r8 <- r8 | r0 eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r8, r8, r0 // r8 <- r8 | r0 ldr.w r2, [r12, #12] // load rkey word of rkey from prev round str.w r8, [r12, #80] // store new rkey word in 'rkeys' eor r7, r2, r7, ror #2 // r7 <- r2 ^ (r7 >>> 2) and r7, r4, r7 // r7 <- r7 & 0xc0c0c0c0 eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r7, r7, r0 // r7 <- r7 | r0 eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r7, r7, r0 // r7 <- r7 | r0 eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r7, r7, r0 // r7 <- r7 | r0 ldr.w r2, [r12, #8] // load rkey word of rkey from prev round str.w r7, [r12, #76] // store new rkey word in 'rkeys' eor r6, r2, r6, ror #2 // r6 <- r2 ^ (r6 >>> 2) bic r6, r4, r6 // r6 <- ~r6 & 0xc0c0c0c0 (NOT omitted in sbox) eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r6, r6, r0 // r6 <- r6 | r0 eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r6, r6, r0 // r6 <- r6 | r0 eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r6, r6, r0 // r6 <- r6 | r0 mvn r0, r2 // NOT omitted in sbox ldr.w r2, [r12, #4] // load rkey word of rkey from prev round str.w r0, [r12, #8] // store new rkey word after NOT str.w r6, [r12, #72] // store new rkey word in 'rkeys' eor r5, r2, r3, ror #2 // r5 <- r2 ^ (r3 >>> 2) bic r5, r4, r5 // r5 <- ~r5 & 0xc0c0c0c0 (NOT omitted in sbox) eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r5, r5, r0 // r5 <- r5 | r0 eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r5, r5, r0 // r5 <- r5 | r0 eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r5, r5, r0 // r5 <- r5 | r0 mvn r0, r2 // NOT omitted in sbox ldr.w r2, [r12], #32 // load rkey word of rkey from prev round str.w r0, [r12, #-28] // store new rkey word after NOT str.w r5, [r12, #36] // store new rkey word in 'rkeys' eor r3, r2, r1, ror #2 // r3 <- r2 ^ (r1 >>> 2) and r3, r4, r3 // r3 <- r3 & 0xc0c0c0c0 eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r3, r3, r0 // r3 <- r3 | r0 eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r3, r3, r0 // r3 <- r3 | r0 eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r4, r3, r0 // r4 <- r3 | r0 str.w r4, [r12, #32] str.w r12, [sp, #56] // store the new rkeys address on the stack bx lr /****************************************************************************** * Subroutine that XORs the columns after the S-box during the AES-256 key * schedule round function, for rounds i such that (i % 4) == 0. * It differs from 'aes256_xorcolumns_rotword' by the omission of the rotword * operation (i.e. 'ror #26' instead of 'ror #2'). ******************************************************************************/ .align 2 aes256_xorcolumns: ldr r12, [sp, #56] // restore 'rkeys' address ldr.w r5, [r12, #28] // load rkey word of rkey from prev round movw r4, #0xc0c0 movt r4, #0xc0c0 // r4 <- 0xc0c0c0c0 eor r11, r5, r11, ror #26 // r11<- r5 ^ (r11 >>> 26) bic r11, r4, r11 // r11<- ~r11 & 0xc0c0c0c0 (NOT omitted in sbox) eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 orr r11, r11, r9 // r11<- r11 | r9 eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c orr r11, r11, r9 // r11<- r11 | r9 eor r9, r5, r11, ror #2 // r9 <- r5 ^ (r11 >>> 2) and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 orr r11, r11, r9 // r11<- r11 | r9 mvn r9, r5 // NOT omitted in sbox ldr.w r5, [r12, #24] // load rkey word of rkey from prev round str r9, [r12, #28] // store new rkey word after NOT str r11, [r12, #92] // store new rkey word in 'rkeys' eor r10, r5, r2, ror #26 // r10<- r5 ^ (r2 >>> 2) bic r10, r4, r10 // r10<- ~r10 & 0xc0c0c0c0 (NOT omitted in sbox) eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) and r9, r9, r4, ror #2 // r9 <- r9 & 0x30303030 orr r10, r10, r9 // r10<- r10 | r9 eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) and r9, r9, r4, ror #4 // r9 <- r9 & 0x0c0c0c0c orr r10, r10, r9 // r10<- r10 | r9 eor r9, r5, r10, ror #2 // r9 <- r5 ^ (r10 >>> 2) and r9, r9, r4, ror #6 // r9 <- r9 & 0x03030303 orr r10, r10, r9 // r10<- r10 | r9 mvn r9, r5 // NOT omitted in sbox ldr.w r2, [r12, #20] // load rkey word of rkey from prev round str r9, [r12, #24] // store new rkey word after NOT str r10, [r12, #88] // store new rkey word in 'rkeys' eor r9, r2, r0, ror #26 // r9 <- r2 ^ (r9 >>> 26) and r9, r4, r9 // r9 <- r9 & 0xc0c0c0c0 eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r9, r9, r0 // r9 <- r9 | r0 eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r9, r9, r0 // r9 <- r9 | r0 eor r0, r2, r9, ror #2 // r0 <- r2 ^ (r9 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r9, r9, r0 // r9 <- r9 | r0 ldr.w r2, [r12, #16] // load rkey word of rkey from prev round str.w r9, [r12, #84] // store new rkey word in 'rkeys' eor r8, r2, r8, ror #26 // r8 <- r2 ^ (r8 >>> 26) and r8, r4, r8 // r8 <- r8 & 0xc0c0c0c0 eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r8, r8, r0 // r8 <- r8 | r0 eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r8, r8, r0 // r8 <- r8 | r0 eor r0, r2, r8, ror #2 // r0 <- r2 ^ (r8 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r8, r8, r0 // r8 <- r8 | r0 ldr.w r2, [r12, #12] // load rkey word of rkey from prev round str.w r8, [r12, #80] // store new rkey word in 'rkeys' eor r7, r2, r7, ror #26 // r7 <- r2 ^ (r7 >>> 26) and r7, r4, r7 // r7 <- r7 & 0xc0c0c0c0 eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r7, r7, r0 // r7 <- r7 | r0 eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r7, r7, r0 // r7 <- r7 | r0 eor r0, r2, r7, ror #2 // r0 <- r2 ^ (r7 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r7, r7, r0 // r7 <- r7 | r0 ldr.w r2, [r12, #8] // load rkey word of rkey from prev round str.w r7, [r12, #76] // store new rkey word in 'rkeys' eor r6, r2, r6, ror #26 // r6 <- r2 ^ (r6 >>> 26) bic r6, r4, r6 // r6 <- ~r6 & 0xc0c0c0c0 (NOT omitted in sbox) eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r6, r6, r0 // r6 <- r6 | r0 eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r6, r6, r0 // r6 <- r6 | r0 eor r0, r2, r6, ror #2 // r0 <- r2 ^ (r6 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r6, r6, r0 // r6 <- r6 | r0 mvn r0, r2 // NOT omitted in sbox ldr.w r2, [r12, #4] // load rkey word of rkey from prev round str.w r0, [r12, #8] // store new rkey word after NOT str.w r6, [r12, #72] // store new rkey word in 'rkeys' eor r5, r2, r3, ror #26 // r5 <- r2 ^ (r3 >>> 26) bic r5, r4, r5 // r5 <- ~r5 & 0xc0c0c0c0 (NOT omitted in sbox) eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r5, r5, r0 // r5 <- r5 | r0 eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r5, r5, r0 // r5 <- r5 | r0 eor r0, r2, r5, ror #2 // r0 <- r2 ^ (r5 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r5, r5, r0 // r5 <- r5 | r0 mvn r0, r2 // NOT omitted in sbox ldr.w r2, [r12], #32 // load rkey word of rkey from prev round str.w r0, [r12, #-28] // store new rkey word after NOT str.w r5, [r12, #36] // store new rkey word in 'rkeys' eor r3, r2, r1, ror #26 // r3 <- r2 ^ (r1 >>> 26) and r3, r4, r3 // r3 <- r3 & 0xc0c0c0c0 eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) and r0, r0, r4, ror #2 // r0 <- r0 & 0x30303030 orr r3, r3, r0 // r3 <- r3 | r0 eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) and r0, r0, r4, ror #4 // r0 <- r0 & 0x0c0c0c0c orr r3, r3, r0 // r3 <- r3 | r0 eor r0, r2, r3, ror #2 // r0 <- r2 ^ (r3 >>> 2) and r0, r0, r4, ror #6 // r0 <- r0 & 0x03030303 orr r4, r3, r0 // r4 <- r3 | r0 str.w r4, [r12, #32] str.w r12, [sp, #56] // store the new rkeys address on the stack bx lr /****************************************************************************** * Applies ShiftRows^(-1) on a round key to match the fixsliced representation. ******************************************************************************/ .align 2 inv_shiftrows_1: ldr.w r2, [r12, #-32]! str r14, [sp, #52] // store link register movw r1, #8 movw r14, #0x0300 movt r14, #0x0c0f // r14<- 0x0c0f0300 for ShiftRows^[-1] loop_inv_sr_1: movw r3, #0x3300 movt r3, #0x3300 // r3 <- 0x33003300 for ShiftRows^[-1] swpmv r2, r2, r2, r2, r14, 4, r0 eor r0, r2, r2, lsr #2 and r0, r3 eor r2, r2, r0 eor r3, r2, r0, lsl #2 ldr.w r2, [r12, #4]! str.w r3, [r12, #-4] subs r1, #1 bne loop_inv_sr_1 ldr r14, [sp, #52] // restore link register bx lr /****************************************************************************** * Applies ShiftRows^(-2) on a round key to match the fixsliced representation. * Only needed for the fully-fixsliced (ffs) representation. ******************************************************************************/ .align 2 inv_shiftrows_2: ldr.w r2, [r12, #-32]! str r14, [sp, #52] // store link register movw r1, #8 movw r14, #0x0f00 movt r14, #0x0f00 // r14<- 0x0f000f00 for ShiftRows^[-2] loop_inv_sr_2: eor r0, r2, r2, lsr #4 and r0, r14 eor r2, r2, r0 eor r3, r2, r0, lsl #4 ldr.w r2, [r12, #4]! str.w r3, [r12, #-4] subs r1, #1 bne loop_inv_sr_2 ldr r14, [sp, #52] // restore link register bx lr /****************************************************************************** * Applies ShiftRows^(-3) on a round key to match the fixsliced representation. * Only needed for the fully-fixsliced (ffs) representation. ******************************************************************************/ .align 2 inv_shiftrows_3: ldr.w r2, [r12, #-32]! str r14, [sp, #52] // store link register movw r1, #8 movw r14, #0x0c00 movt r14, #0x030f // r14<- 0x030f0c00 for ShiftRows^[-3] loop_inv_sr_3: movw r3, #0x3300 movt r3, #0x3300 // r3 <- 0x33003300 for ShiftRows^[-3] swpmv r2, r2, r2, r2, r14, 4, r0 eor r0, r2, r2, lsr #2 and r0, r3 eor r2, r2, r0 eor r3, r2, r0, lsl #2 ldr.w r2, [r12, #4]! str.w r3, [r12, #-4] subs r1, #1 bne loop_inv_sr_3 ldr r14, [sp, #52] // restore link register bx lr /****************************************************************************** * Fully bitsliced AES-128 key schedule to match the fully-fixsliced (ffs) * representation. Note that it is possible to pass two different keys as input * parameters if one wants to encrypt 2 blocks in with two different keys. ******************************************************************************/ @ void aes128_keyschedule_ffs(u32* rkeys, const u8* key); .global aes128_keyschedule_ffs .type aes128_keyschedule_ffs,%function .align 2 aes128_keyschedule_ffs: push {r0-r12,r14} sub.w sp, #56 // allow space on the stack for tmp var ldr.w r4, [r1] // load the 128-bit key in r4-r7 ldr r5, [r1, #4] ldr r6, [r1, #8] ldr r7, [r1, #12] ldr.w r8, [r1] // load the 128-bit key in r8-r11 ldr r9, [r1, #4] ldr r10,[r1, #8] ldr r11,[r1, #12] bl packing // pack the master key ldr.w r0, [sp, #56] // restore 'rkeys' address stm r0, {r4-r11} // store the packed master key in 'rkeys' bl sbox // apply the sbox to the master key eor r11, r11, #0x00000300 // add the 1st rconst bl aes128_xorcolumns_rotword bl sbox // apply the sbox to the master key eor r2, r2, #0x00000300 // add the 2nd rconst bl aes128_xorcolumns_rotword bl inv_shiftrows_1 bl sbox // apply the sbox to the master key eor r0, r0, #0x00000300 // add the 3rd rconst bl aes128_xorcolumns_rotword bl inv_shiftrows_2 bl sbox // apply the sbox to the master key eor r8, r8, #0x00000300 // add the 4th rconst bl aes128_xorcolumns_rotword bl inv_shiftrows_3 bl sbox // apply the sbox to the master key eor r7, r7, #0x00000300 // add the 5th rconst bl aes128_xorcolumns_rotword bl sbox // apply the sbox to the master key eor r6, r6, #0x00000300 // add the 6th rconst bl aes128_xorcolumns_rotword bl inv_shiftrows_1 bl sbox // apply the sbox to the master key eor r3, r3, #0x00000300 // add the 7th rconst bl aes128_xorcolumns_rotword bl inv_shiftrows_2 bl sbox // apply the sbox to the master key eor r1, r1, #0x00000300 // add the 8th rconst bl aes128_xorcolumns_rotword bl inv_shiftrows_3 bl sbox // apply the sbox to the master key eor r11, r11, #0x00000300 // add the 9th rconst eor r2, r2, #0x00000300 // add the 9th rconst eor r8, r8, #0x00000300 // add the 9th rconst eor r7, r7, #0x00000300 // add the 9th rconst bl aes128_xorcolumns_rotword bl sbox // apply the sbox to the master key eor r2, r2, #0x00000300 // add the 10th rconst eor r0, r0, #0x00000300 // add the 10th rconst eor r7, r7, #0x00000300 // add the 10th rconst eor r6, r6, #0x00000300 // add the 10th rconst bl aes128_xorcolumns_rotword bl inv_shiftrows_1 mvn r5, r5 // add the NOT for the last rkey mvn r6, r6 // add the NOT for the last rkey mvn r10, r10 // add the NOT for the last rkey mvn r11, r11 // add the NOT for the last rkey strd r5, r6, [r12, #4] strd r10, r11, [r12, #24] ldrd r0, r1, [r12, #-316] ldrd r2, r3, [r12, #-296] mvn r0, r0 // remove the NOT for the key whitening mvn r1, r1 // remove the NOT for the key whitening mvn r2, r2 // remove the NOT for the key whitening mvn r3, r3 // remove the NOT for the key whitening strd r0, r1, [r12, #-316] strd r2, r3, [r12, #-296] add.w sp, #56 // restore stack pop {r0-r12, r14} // restore context bx lr /****************************************************************************** * Fully bitsliced AES-256 key schedule to match the fully-fixsliced (ffs) * representation. Note that it is possible to pass 2 different keys as input * parameters if one wants to encrypt 2 blocks in with 2 different keys. ******************************************************************************/ @ void aes256_keyschedule_ffs(u32* rkeys, const u8* key); .global aes256_keyschedule_ffs .type aes256_keyschedule_ffs,%function .align 2 aes256_keyschedule_ffs: push {r0-r12,r14} sub.w sp, #56 // allow space on the stack for tmp var ldr.w r4, [r1] // load the 128 first key bits in r4-r7 ldr r5, [r1, #4] ldr r6, [r1, #8] ldr r7, [r1, #12] ldr.w r8, [r1] // load the 128 first key bits in r8-r11 ldr r9, [r1, #4] ldr r10,[r1, #8] ldr r11,[r1, #12] bl packing // pack the master key ldrd r0,r1, [sp, #56] // restore 'rkeys' and 'key' addresses stm r0, {r4-r11} // store the packed master key in 'rkeys' add.w r1, #16 // points to the 128 last bits of the key ldr.w r4, [r1] // load the 128 first key bits in r4-r7 ldr r5, [r1, #4] ldr r6, [r1, #8] ldr r7, [r1, #12] ldr.w r8, [r1] // load the 128 first key bits in r8-r11 ldr r9, [r1, #4] ldr r10,[r1, #8] ldr r11,[r1, #12] bl packing // pack the master key ldr.w r0, [sp, #56] // restore 'rkeys' address add.w r0, #32 // points to the 128 last bits of the key stm r0, {r4-r11} // store the packed master key in 'rkeys' bl sbox // apply the sbox to the master key eor r11, r11, #0x00000300 // add the 1st rconst bl aes256_xorcolumns_rotword bl sbox // apply the sbox to the master key bl aes256_xorcolumns bl inv_shiftrows_1 bl sbox // apply the sbox to the master key eor r2, r2, #0x00000300 // add the 2nd rconst bl aes256_xorcolumns_rotword bl inv_shiftrows_2 bl sbox // apply the sbox to the master key bl aes256_xorcolumns bl inv_shiftrows_3 bl sbox // apply the sbox to the master key eor r0, r0, #0x00000300 // add the 3rd rconst bl aes256_xorcolumns_rotword bl sbox // apply the sbox to the master key bl aes256_xorcolumns bl inv_shiftrows_1 bl sbox // apply the sbox to the master key eor r8, r8, #0x00000300 // add the 4th rconst bl aes256_xorcolumns_rotword bl inv_shiftrows_2 bl sbox // apply the sbox to the master key bl aes256_xorcolumns bl inv_shiftrows_3 bl sbox // apply the sbox to the master key eor r7, r7, #0x00000300 // add the 5th rconst bl aes256_xorcolumns_rotword bl sbox // apply the sbox to the master key bl aes256_xorcolumns bl inv_shiftrows_1 bl sbox // apply the sbox to the master key eor r6, r6, #0x00000300 // add the 6th rconst bl aes256_xorcolumns_rotword bl inv_shiftrows_2 bl sbox // apply the sbox to the master key bl aes256_xorcolumns bl inv_shiftrows_3 bl sbox // apply the sbox to the master key eor r3, r3, #0x00000300 // add the 6th rconst bl aes256_xorcolumns_rotword add r12, #32 bl inv_shiftrows_1 mvn r5, r5 // add the NOT for the last rkey mvn r6, r6 // add the NOT for the last rkey mvn r10, r10 // add the NOT for the last rkey mvn r11, r11 // add the NOT for the last rkey ldrd r0, r1, [r12, #-28] ldrd r2, r3, [r12, #-8] strd r5, r6, [r12, #4] strd r10, r11, [r12, #24] mvn r0, r0 // add the NOT for the penultimate rkey mvn r1, r1 // add the NOT for the penultimate rkey mvn r2, r2 // add the NOT for the penultimate rkey mvn r3, r3 // add the NOT for the penultimate rkey ldrd r5, r6, [r12, #-444] ldrd r10, r11, [r12, #-424] strd r0, r1, [r12, #-28] strd r2, r3, [r12, #-8] mvn r5, r5 // remove the NOT for the key whitening mvn r6, r6 // remove the NOT for the key whitening mvn r10, r10 // remove the NOT for the key whitening mvn r11, r11 // remove the NOT for the key whitening strd r5, r6, [r12, #-444] strd r10, r11, [r12, #-424] add.w sp, #56 // restore stack pop {r0-r12, r14} // restore context bx lr