/******************************************************************************
* Assembly fixsliced implementation of AES-128 and AES-256 (encryption only).
*
* The fully-fixsliced implementation runs faster than the semi-fixsliced
* variant, at the cost of a larger code size.
*
* See the paper at https://eprint.iacr.org/2020/1123.pdf for more details.
*
* @author   Alexandre Adomnicai, Nanyang Technological University, Singapore
*           alexandre.adomnicai@ntu.edu.sg
*
* @date     October 2020
******************************************************************************/

.syntax unified
.thumb

/******************************************************************************
* Macro implementing the SWAPMOVE technique: swap the bits of 'in1' selected by
* 'm' with the bits of 'in0' selected by 'm << n', and write the results to
* 'out1' and 'out0'.
******************************************************************************/
.macro swpmv out0, out1, in0, in1, m, n, tmp
    eor     \tmp, \in1, \in0, lsr \n
    and     \tmp, \m
    eor     \out1, \in1, \tmp
    eor     \out0, \in0, \tmp, lsl \n
.endm
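
/******************************************************************************
* For reference, a C sketch of the same SWAPMOVE operation (illustrative only,
* not part of the build; assumes uint32_t words):
*
*   #define SWAPMOVE(a, b, mask, n) do {                \
*       uint32_t tmp = ((b) ^ ((a) >> (n))) & (mask);   \
*       (b) ^= tmp;                                     \
*       (a) ^= tmp << (n);                              \
*   } while (0)
*
* Here 'a' plays the role of 'in0'/'out0' and 'b' the role of 'in1'/'out1'.
******************************************************************************/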

/******************************************************************************
* Rotate all bytes in 'in' by 'n0' bits to the right and put the result in
* 'out'. 'm' refers to the appropriate bitmask and 'n1' = 8-'n0'.
******************************************************************************/
.macro byteror  out, in, m, n0, n1, tmp
    and     \out, \m, \in, lsr \n0
    bic     \tmp, \in, \m, ror \n1
    orr     \out, \out, \tmp, lsl \n1
.endm
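
/******************************************************************************
* For reference, a C sketch of the same byte rotation (illustrative only).
* E.g., with m = 0x03030303, n0 = 6, n1 = 2, every byte is rotated right by 6:
*
*   #define BYTE_ROR_6(x) \
*       ((((x) >> 6) & 0x03030303) | (((x) & 0x3f3f3f3f) << 2))
******************************************************************************/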

/******************************************************************************
* Compute the MixColumns for rounds i s.t. (i%4) == 0 or 2.
* The two versions differ only in the masks and shifts passed to 'byteror'.
******************************************************************************/
.macro mc_0_2   m, n0, n1, n2, n3
    byteror r14, r1, \m, \n0, \n1, r9   // r14 <- BYTE_ROR_n0(S0)
    eor     r4, r1, r14, ror #8         // r4 <- S0 ^ (BYTE_ROR_n0(S0) >>> 8)
    movw    r1, #0x0f0f
    movt    r1, #0x0f0f                 // r1 <- 0x0f0f0f0f (for BYTE_ROR)
    byteror r5, r11, \m, \n0, \n1, r9   // r5 <- BYTE_ROR_n0(S7)
    eor     r10, r11, r5, ror #8        // r10<- S7 ^ (BYTE_ROR_n0(S7) >>> 8)
    byteror r11, r10, r1, 4, 4, r9      // r11<- BYTE_ROR_4(r10)
    eor     r11, r4, r11, ror #16       // r11<- r4 ^ (BYTE_ROR_4(r10) >>> 16)
    eor     r11, r11, r5, ror #8        // r11<- S'7
    byteror r5, r2, \m, \n0, \n1, r9    // r5 <- BYTE_ROR_n0(S6)
    eor     r2, r2, r5, ror #8          // r2 <- S6 ^ (BYTE_ROR_n0(S6) >>> 8)
    eor     r10, r10, r5, ror #8        // r10<- r10 ^ (BYTE_ROR_n0(S6) >>> 8)
    byteror r5, r2, r1, 4, 4, r9        // r5 <- BYTE_ROR_4(r2)
    eor     r10, r10, r5, ror #16       // r10<- r10 ^ (r5 >>> 16)
    eor     r10, r10, r4                // r10<- S'6
    byteror r5, r0, \m, \n0, \n1, r9    // r5 <- BYTE_ROR_n0(S5)
    eor     r0, r0, r5, ror #8          // r0 <- S5 ^ (BYTE_ROR_n0(S5) >>> 8)
    eor     r9, r2, r5, ror #8          // r9 <- r2 ^ (BYTE_ROR_n0(S5) >>> 8)
    byteror r5, r0, r1, 4, 4, r2        // r5 <- BYTE_ROR_4(r0)
    eor     r9, r9, r5, ror #16         // r9 <- S'5
    byteror r5, r8, \m, \n0, \n1, r2    // r5 <- BYTE_ROR_n0(S4)
    eor     r2, r8, r5, ror #8          // r2 <- S4 ^ (BYTE_ROR_n0(S4) >>> 8)
    eor     r8, r0, r5, ror #8          // r8 <- r0 ^ (BYTE_ROR_n0(S4) >>> 8)
    byteror r5, r2, r1, 4, 4, r0        // r5 <- BYTE_ROR_4(r2)
    eor     r8, r8, r5, ror #16         // r8 <- r8 ^ (r5 >>> 16)
    eor     r8, r8, r4                  // r8 <- S'4
    byteror r5, r7, \m, \n0, \n1, r0    // r5 <- BYTE_ROR_n0(S3)
    eor     r0, r7, r5, ror #8          // r0 <- S3 ^ (BYTE_ROR_n0(S3) >>> 8)
    eor     r7, r2, r5, ror #8          // r7 <- r2 ^ (BYTE_ROR_n0(S3) >>> 8)
    byteror r5, r0, r1, 4, 4, r2        // r5 <- BYTE_ROR_4(r0)
    eor     r7, r7, r5, ror #16         // r7 <- r7 ^ (r5 >>> 16)
    eor     r7, r7, r4                  // r7 <- S'3
    byteror r5, r6, \m, \n0, \n1, r2    // r5 <- BYTE_ROR_n0(S2)
    eor     r2, r6, r5, ror #8          // r2 <- S2 ^ (BYTE_ROR_n0(S2) >>> 8)
    eor     r6, r0, r5, ror #8          // r6 <- r0 ^ (BYTE_ROR_n0(S2) >>> 8)
    byteror r5, r2, r1, 4, 4, r0        // r5 <- BYTE_ROR_4(r2)
    eor     r6, r6, r5, ror #16         // r6 <- S'2
    byteror r5, r3, \m, \n0, \n1, r0    // r5 <- BYTE_ROR_n0(S1)
    eor     r0, r3, r5, ror #8          // r0 <- S1 ^ (BYTE_ROR_n0(S1) >>> 8)
    eor     r3, r2, r5, ror #8          // r3 <- r2 ^ (BYTE_ROR_n0(S1) >>> 8)
    byteror r5, r0, r1, 4, 4, r2        // r5 <- BYTE_ROR_4(r0)
    eor     r5, r3, r5, ror #16         // r5 <- S'1
    eor     r14, r0, r14, ror #8        // r14<- r0 ^ (BYTE_ROR_n0(S0) >>> 8)
    byteror r0, r4, r1, 4, 4, r2        // r0 <- BYTE_ROR_4(r4)
    eor     r4, r14, r0, ror #16        // r4 <- S'0
.endm
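
/******************************************************************************
* For reference, a C sketch of the per-word pattern computed above, shown for
* S'7 (illustrative only; ror() denotes a 32-bit right rotation, and
* BYTE_ROR_n0/BYTE_ROR_4 are the byte rotations sketched earlier):
*
*   t0 = s0 ^ ror(BYTE_ROR_n0(s0), 8);
*   t7 = s7 ^ ror(BYTE_ROR_n0(s7), 8);
*   s7_out = t0 ^ ror(BYTE_ROR_4(t7), 16) ^ ror(BYTE_ROR_n0(s7), 8);
*
* The remaining words follow the same pattern, per the register comments.
******************************************************************************/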

/******************************************************************************
* Packs two 128-bit input blocks stored in r4-r7 and r8-r11, respectively, into
* the 256-bit internal state where the bits are packed as follows:
* r4 = b_24 b_56 b_88 b_120 || ... || b_0 b_32 b_64 b_96
* r5 = b_25 b_57 b_89 b_121 || ... || b_1 b_33 b_65 b_97
* r6 = b_26 b_58 b_90 b_122 || ... || b_2 b_34 b_66 b_98
* r7 = b_27 b_59 b_91 b_123 || ... || b_3 b_35 b_67 b_99
* r8 = b_28 b_60 b_92 b_124 || ... || b_4 b_36 b_68 b_100
* r9 = b_29 b_61 b_93 b_125 || ... || b_5 b_37 b_69 b_101
* r10 = b_30 b_62 b_94 b_126 || ... || b_6 b_38 b_70 b_102
* r11 = b_31 b_63 b_95 b_127 || ... || b_7 b_39 b_71 b_103
******************************************************************************/
.align 2
packing:
    movw    r3, #0x0f0f
    movt    r3, #0x0f0f             // r3 <- 0x0f0f0f0f (mask for SWAPMOVE)
    eor     r2, r3, r3, lsl #2      // r2 <- 0x33333333 (mask for SWAPMOVE)
    eor     r1, r2, r2, lsl #1      // r1 <- 0x55555555 (mask for SWAPMOVE)
    swpmv   r8, r4, r8, r4, r1, #1, r12
    swpmv   r9, r5, r9, r5, r1, #1, r12
    swpmv   r10, r6, r10, r6, r1, #1, r12
    swpmv   r11, r7, r11, r7, r1, #1, r12
    swpmv   r0, r4, r5, r4, r2, #2, r12
    swpmv   r9, r5, r9, r8, r2, #2, r12
    swpmv   r7, r8, r7, r6, r2, #2, r12
    swpmv   r11, r2, r11, r10, r2, #2, r12
    swpmv   r8, r4, r8, r4, r3, #4, r12
    swpmv   r10, r6, r7, r0, r3, #4, r12
    swpmv   r11, r7, r11, r9, r3, #4, r12
    swpmv   r9, r5, r2, r5, r3, #4, r12
    bx      lr
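
/******************************************************************************
* For reference, a C sketch of the same SWAPMOVE network with the register
* renaming unrolled (illustrative only; w[0..3] and w[4..7] hold the two input
* blocks, and SWAPMOVE is the macro sketched earlier):
*
*   for (int i = 0; i < 4; i++)               // bit-level layer
*       SWAPMOVE(w[i+4], w[i], 0x55555555, 1);
*   SWAPMOVE(w[1], w[0], 0x33333333, 2);      // 2-bit layer
*   SWAPMOVE(w[5], w[4], 0x33333333, 2);
*   SWAPMOVE(w[3], w[2], 0x33333333, 2);
*   SWAPMOVE(w[7], w[6], 0x33333333, 2);
*   SWAPMOVE(w[2], w[0], 0x0f0f0f0f, 4);      // nibble layer
*   SWAPMOVE(w[3], w[1], 0x0f0f0f0f, 4);
*   SWAPMOVE(w[7], w[5], 0x0f0f0f0f, 4);
*   SWAPMOVE(w[6], w[4], 0x0f0f0f0f, 4);
*
* The final slices r4..r11 then correspond to w[0], w[4], w[1], w[5], w[2],
* w[6], w[3], w[7], in that order.
******************************************************************************/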

/******************************************************************************
* Unpacks the 256-bit internal state into two 128-bit blocks.
******************************************************************************/
.align 2
unpacking:
    movw    r3, #0x0f0f
    movt    r3, #0x0f0f                 // r3 <- 0x0f0f0f0f (mask for SWAPMOVE)
    swpmv   r2, r5, r9, r5, r3, #4, r12
    swpmv   r11, r9, r11, r7, r3, #4, r12
    swpmv   r7, r1, r10, r6, r3, #4, r12
    swpmv   r8, r4, r8, r4, r3, #4, r12
    eor     r3, r3, r3, lsl #2          // r3 <- 0x33333333 (mask for SWAPMOVE)
    swpmv   r11, r10, r11, r2, r3, #2, r12
    swpmv   r7, r6, r7, r8, r3, #2, r12
    swpmv   r9, r8, r9, r5, r3, #2, r12
    swpmv   r5, r4, r1, r4, r3, #2, r12
    eor     r1, r3, r3, lsl #1          // r1 <- 0x55555555 (mask for SWAPMOVE)
    swpmv   r8, r4, r8, r4, r1, #1, r12
    swpmv   r9, r5, r9, r5, r1, #1, r12
    swpmv   r10, r6, r10, r6, r1, #1, r12
    swpmv   r11, r7, r11, r7, r1, #1, r12
    bx      lr

/******************************************************************************
* Subroutine that computes the AddRoundKey and the S-box.
* Credits to https://github.com/Ko-/aes-armcortexm for the S-box implementation
******************************************************************************/
.align 2
ark_sbox:
    // add round key
    ldr.w   r1, [sp, #48]
    ldmia   r1!, {r0,r2,r3,r12}
    eor     r4, r0
    eor     r5, r2
    eor     r6, r3
    eor     r7, r12
    ldmia   r1!, {r0,r2,r3,r12}
    eor     r8, r0
    eor     r9, r2
    eor     r10, r3
    eor     r11, r12
    str.w   r1, [sp, #48]
    str     r14, [sp, #52]
    // sbox: credits to https://github.com/Ko-/aes-armcortexm
    eor     r1, r7, r9              //Exec y14 = U3 ^ U5; into r1
    eor     r3, r4, r10             //Exec y13 = U0 ^ U6; into r3
    eor     r2, r3, r1              //Exec y12 = y13 ^ y14; into r2
    eor     r0, r8, r2              //Exec t1 = U4 ^ y12; into r0
    eor     r14, r0, r9             //Exec y15 = t1 ^ U5; into r14
    and     r12, r2, r14            //Exec t2 = y12 & y15; into r12
    eor     r8, r14, r11            //Exec y6 = y15 ^ U7; into r8
    eor     r0, r0, r5              //Exec y20 = t1 ^ U1; into r0
    str.w   r2, [sp, #44]           //Store r2/y12 on stack
    eor     r2, r4, r7              //Exec y9 = U0 ^ U3; into r2
    str     r0, [sp, #40]           //Store r0/y20 on stack
    eor     r0, r0, r2              //Exec y11 = y20 ^ y9; into r0
    str     r2, [sp, #36]           //Store r2/y9 on stack
    and     r2, r2, r0              //Exec t12 = y9 & y11; into r2
    str     r8, [sp, #32]           //Store r8/y6 on stack
    eor     r8, r11, r0             //Exec y7 = U7 ^ y11; into r8
    eor     r9, r4, r9              //Exec y8 = U0 ^ U5; into r9
    eor     r6, r5, r6              //Exec t0 = U1 ^ U2; into r6
    eor     r5, r14, r6             //Exec y10 = y15 ^ t0; into r5
    str     r14, [sp, #28]          //Store r14/y15 on stack
    eor     r14, r5, r0             //Exec y17 = y10 ^ y11; into r14
    str.w   r1, [sp, #24]           //Store r1/y14 on stack
    and     r1, r1, r14             //Exec t13 = y14 & y17; into r1
    eor     r1, r1, r2              //Exec t14 = t13 ^ t12; into r1
    str     r14, [sp, #20]          //Store r14/y17 on stack
    eor     r14, r5, r9             //Exec y19 = y10 ^ y8; into r14
    str.w   r5, [sp, #16]           //Store r5/y10 on stack
    and     r5, r9, r5              //Exec t15 = y8 & y10; into r5
    eor     r2, r5, r2              //Exec t16 = t15 ^ t12; into r2
    eor     r5, r6, r0              //Exec y16 = t0 ^ y11; into r5
    str.w   r0, [sp, #12]           //Store r0/y11 on stack
    eor     r0, r3, r5              //Exec y21 = y13 ^ y16; into r0
    str     r3, [sp, #8]            //Store r3/y13 on stack
    and     r3, r3, r5              //Exec t7 = y13 & y16; into r3
    str     r5, [sp, #4]            //Store r5/y16 on stack
    str     r11, [sp, #0]           //Store r11/U7 on stack
    eor     r5, r4, r5              //Exec y18 = U0 ^ y16; into r5
    eor     r6, r6, r11             //Exec y1 = t0 ^ U7; into r6
    eor     r7, r6, r7              //Exec y4 = y1 ^ U3; into r7
    and     r11, r7, r11            //Exec t5 = y4 & U7; into r11
    eor     r11, r11, r12           //Exec t6 = t5 ^ t2; into r11
    eor     r11, r11, r2            //Exec t18 = t6 ^ t16; into r11
    eor     r14, r11, r14           //Exec t22 = t18 ^ y19; into r14
    eor     r4, r6, r4              //Exec y2 = y1 ^ U0; into r4
    and     r11, r4, r8             //Exec t10 = y2 & y7; into r11
    eor     r11, r11, r3            //Exec t11 = t10 ^ t7; into r11
    eor     r2, r11, r2             //Exec t20 = t11 ^ t16; into r2
    eor     r2, r2, r5              //Exec t24 = t20 ^ y18; into r2
    eor     r10, r6, r10            //Exec y5 = y1 ^ U6; into r10
    and     r11, r10, r6            //Exec t8 = y5 & y1; into r11
    eor     r3, r11, r3             //Exec t9 = t8 ^ t7; into r3
    eor     r3, r3, r1              //Exec t19 = t9 ^ t14; into r3
    eor     r3, r3, r0              //Exec t23 = t19 ^ y21; into r3
    eor     r0, r10, r9             //Exec y3 = y5 ^ y8; into r0
    ldr     r11, [sp, #32]          //Load y6 into r11
    and     r5, r0, r11             //Exec t3 = y3 & y6; into r5
    eor     r12, r5, r12            //Exec t4 = t3 ^ t2; into r12
    ldr     r5, [sp, #40]           //Load y20 into r5
    str     r7, [sp, #32]           //Store r7/y4 on stack
    eor     r12, r12, r5            //Exec t17 = t4 ^ y20; into r12
    eor     r1, r12, r1             //Exec t21 = t17 ^ t14; into r1
    and     r12, r1, r3             //Exec t26 = t21 & t23; into r12
    eor     r5, r2, r12             //Exec t27 = t24 ^ t26; into r5
    eor     r12, r14, r12           //Exec t31 = t22 ^ t26; into r12
    eor     r1, r1, r14             //Exec t25 = t21 ^ t22; into r1
    and     r7, r1, r5              //Exec t28 = t25 & t27; into r7
    eor     r14, r7, r14            //Exec t29 = t28 ^ t22; into r14
    and     r4, r14, r4             //Exec z14 = t29 & y2; into r4
    and     r8, r14, r8             //Exec z5 = t29 & y7; into r8
    eor     r7, r3, r2              //Exec t30 = t23 ^ t24; into r7
    and     r12, r12, r7            //Exec t32 = t31 & t30; into r12
    eor     r12, r12, r2            //Exec t33 = t32 ^ t24; into r12
    eor     r7, r5, r12             //Exec t35 = t27 ^ t33; into r7
    and     r2, r2, r7              //Exec t36 = t24 & t35; into r2
    eor     r5, r5, r2              //Exec t38 = t27 ^ t36; into r5
    and     r5, r14, r5             //Exec t39 = t29 & t38; into r5
    eor     r1, r1, r5              //Exec t40 = t25 ^ t39; into r1
    eor     r5, r14, r1             //Exec t43 = t29 ^ t40; into r5
    ldr.w   r7, [sp, #4]            //Load y16 into r7
    and     r7, r5, r7              //Exec z3 = t43 & y16; into r7
    eor     r8, r7, r8              //Exec tc12 = z3 ^ z5; into r8
    str     r8, [sp, #40]           //Store r8/tc12 on stack
    ldr     r8, [sp, #8]            //Load y13 into r8
    and     r8, r5, r8              //Exec z12 = t43 & y13; into r8
    and     r10, r1, r10            //Exec z13 = t40 & y5; into r10
    and     r6, r1, r6              //Exec z4 = t40 & y1; into r6
    eor     r6, r7, r6              //Exec tc6 = z3 ^ z4; into r6
    eor     r3, r3, r12             //Exec t34 = t23 ^ t33; into r3
    eor     r3, r2, r3              //Exec t37 = t36 ^ t34; into r3
    eor     r1, r1, r3              //Exec t41 = t40 ^ t37; into r1
    ldr.w   r5, [sp, #16]           //Load y10 into r5
    and     r2, r1, r5              //Exec z8 = t41 & y10; into r2
    and     r9, r1, r9              //Exec z17 = t41 & y8; into r9
    str     r9, [sp, #16]           //Store r9/z17 on stack
    eor     r5, r12, r3             //Exec t44 = t33 ^ t37; into r5
    ldr     r9, [sp, #28]           //Load y15 into r9
    ldr.w   r7, [sp, #44]           //Load y12 into r7
    and     r9, r5, r9              //Exec z0 = t44 & y15; into r9
    and     r7, r5, r7              //Exec z9 = t44 & y12; into r7
    and     r0, r3, r0              //Exec z10 = t37 & y3; into r0
    and     r3, r3, r11             //Exec z1 = t37 & y6; into r3
    eor     r3, r3, r9              //Exec tc5 = z1 ^ z0; into r3
    eor     r3, r6, r3              //Exec tc11 = tc6 ^ tc5; into r3
    ldr     r11, [sp, #32]          //Load y4 into r11
    ldr.w   r5, [sp, #20]           //Load y17 into r5
    and     r11, r12, r11           //Exec z11 = t33 & y4; into r11
    eor     r14, r14, r12           //Exec t42 = t29 ^ t33; into r14
    eor     r1, r14, r1             //Exec t45 = t42 ^ t41; into r1
    and     r5, r1, r5              //Exec z7 = t45 & y17; into r5
    eor     r6, r5, r6              //Exec tc8 = z7 ^ tc6; into r6
    ldr     r5, [sp, #24]           //Load y14 into r5
    str     r4, [sp, #32]           //Store r4/z14 on stack
    and     r1, r1, r5              //Exec z16 = t45 & y14; into r1
    ldr     r5, [sp, #12]           //Load y11 into r5
    ldr     r4, [sp, #36]           //Load y9 into r4
    and     r5, r14, r5             //Exec z6 = t42 & y11; into r5
    eor     r5, r5, r6              //Exec tc16 = z6 ^ tc8; into r5
    and     r4, r14, r4             //Exec z15 = t42 & y9; into r4
    eor     r14, r4, r5             //Exec tc20 = z15 ^ tc16; into r14
    eor     r4, r4, r1              //Exec tc1 = z15 ^ z16; into r4
    eor     r1, r0, r4              //Exec tc2 = z10 ^ tc1; into r1
    eor     r0, r1, r11             //Exec tc21 = tc2 ^ z11; into r0
    eor     r7, r7, r1              //Exec tc3 = z9 ^ tc2; into r7
    eor     r1, r7, r5              //Exec S0 = tc3 ^ tc16; into r1
    eor     r7, r7, r3              //Exec S3 = tc3 ^ tc11; into r7
    eor     r3, r7, r5              //Exec S1 = S3 ^ tc16 ^ 1; into r3
    eor     r11, r10, r4            //Exec tc13 = z13 ^ tc1; into r11
    ldr.w   r4, [sp, #0]            //Load U7 into r4
    and     r12, r12, r4            //Exec z2 = t33 & U7; into r12
    eor     r9, r9, r12             //Exec tc4 = z0 ^ z2; into r9
    eor     r12, r8, r9             //Exec tc7 = z12 ^ tc4; into r12
    eor     r2, r2, r12             //Exec tc9 = z8 ^ tc7; into r2
    eor     r2, r6, r2              //Exec tc10 = tc8 ^ tc9; into r2
    ldr.w   r4, [sp, #32]           //Load z14 into r4
    eor     r12, r4, r2             //Exec tc17 = z14 ^ tc10; into r12
    eor     r0, r0, r12             //Exec S5 = tc21 ^ tc17; into r0
    eor     r6, r12, r14            //Exec tc26 = tc17 ^ tc20; into r6
    ldr.w   r4, [sp, #16]           //Load z17 into r4
    ldr     r12, [sp, #40]          //Load tc12 into r12
    eor     r6, r6, r4              //Exec S2 = tc26 ^ z17 ^ 1; into r6
    eor     r12, r9, r12            //Exec tc14 = tc4 ^ tc12; into r12
    eor     r14, r11, r12           //Exec tc18 = tc13 ^ tc14; into r14
    eor     r2, r2, r14             //Exec S6 = tc10 ^ tc18 ^ 1; into r2
    eor     r11, r8, r14            //Exec S7 = z12 ^ tc18 ^ 1; into r11
    ldr     r14, [sp, #52]          // restore link register
    eor     r8, r12, r7             //Exec S4 = tc14 ^ S3; into r8
    bx      lr
    // [('r0', 'S5'), ('r1', 'S0'), ('r2', 'S6'), ('r3', 'S1'),
    // ('r6', 'S2'),('r7', 'S3'), ('r8', 'S4'), ('r11', 'S7')]
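
/******************************************************************************
* For reference, a C sketch of the AddRoundKey step above (illustrative only;
* the packed state is in s[0..7], and 'rkey' advances by 8 words per call, as
* done via the pointer spilled at [sp, #48]):
*
*   for (int i = 0; i < 8; i++)
*       s[i] ^= rkey[i];
*   rkey += 8;
******************************************************************************/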

/******************************************************************************
* Computation of the MixColumns transformation in the fixsliced representation.
* For fully-fixsliced implementations, it is used for rounds i s.t. (i%4) == 0.
* For semi-fixsliced implementations, it is used for rounds i s.t. (i%2) == 0.
******************************************************************************/
.align 2
mixcolumns_0:
    str     r14, [sp, #52]          // store link register
    movw    r12, #0x0303
    movt    r12, #0x0303
    mc_0_2  r12, 6, 2, 26, 18
    ldr     r14, [sp, #52]          // restore link register
    bx      lr

/******************************************************************************
* Computation of the MixColumns transformation in the fixsliced representation.
* For fully-fixsliced implementations only, for rounds i s.t. (i%4) == 1.
******************************************************************************/
.align 2
mixcolumns_1:
    str     r14, [sp, #52]          // store link register
    movw    r14, #0x0f0f
    movt    r14, #0x0f0f            // r14<- 0x0f0f0f0f (mask for BYTE_ROR_4)
    and     r5, r14, r1, lsr #4     // r5 <- (S0 >> 4) & 0x0f0f0f0f
    and     r9, r14, r1             // r9 <- S0 & 0x0f0f0f0f
    orr     r5, r5, r9, lsl #4      // r5 <- BYTE_ROR_4(S0)
    eor     r4, r1, r5, ror #8      // r4 <- S0 ^ (BYTE_ROR_4(S0) >>> 8)
    mov.w   r1, r5, ror #8          // r1 <- (BYTE_ROR_4(S0) >>> 8)
    and     r5, r14, r11, lsr #4    // r5 <- (S7 >> 4) & 0x0f0f0f0f
    and     r9, r14, r11            // r9 <- S7 & 0x0f0f0f0f
    orr     r5, r5, r9, lsl #4      // r5 <- BYTE_ROR_4(S7)
    eor     r12, r11, r5, ror #8    // r12<- S7 ^ (BYTE_ROR_4(S7) >>> 8)
    eor     r10, r4, r12            // r10<- r4 ^ r12
    eor     r11, r10                // r11<- S7 ^ r4 ^ r12
    eor     r11, r11, r12, ror #16  // r11<- r11 ^ (r12 >>> 16)
    and     r5, r14, r2, lsr #4     // r5 <- (S6 >> 4) & 0x0f0f0f0f
    and     r9, r14, r2             // r9 <- S6 & 0x0f0f0f0f
    orr     r5, r5, r9, lsl #4      // r5 <- BYTE_ROR_4(S6)
    eor     r10, r10, r5, ror #8    // r10<- r10 ^ (BYTE_ROR_4(S6) >>> 8)
    eor     r12, r2, r5, ror #8     // r12<- S6 ^ (BYTE_ROR_4(S6) >>> 8)
    eor     r10, r10, r12, ror #16  // r10<- r10 ^ (r12 >>> 16)
    and     r5, r14, r0, lsr #4     // r5 <- (S5 >> 4) & 0x0f0f0f0f
    and     r9, r14, r0             // r9 <- S5 & 0x0f0f0f0f
    orr     r5, r5, r9, lsl #4      // r5 <- BYTE_ROR_4(S5)
    eor     r9, r12, r5, ror #8     // r9 <- r12 ^ (BYTE_ROR_4(S5) >>> 8)
    eor     r12, r0, r5, ror #8     // r12<- S5 ^ (BYTE_ROR_4(S5) >>> 8)
    eor     r9, r9, r12, ror #16    // r9 <- r9 ^ (r12 >>> 16)
    eor     r0, r4, r12             // r0 <- r12 ^ S0 ^ (BYTE_ROR_4(S0) >>> 8)
    and     r5, r14, r8, lsr #4     // r5 <- (S4 >> 4) & 0x0f0f0f0f
    and     r2, r14, r8             // r2 <- S4 & 0x0f0f0f0f
    orr     r2, r5, r2, lsl #4      // r2 <- BYTE_ROR_4(S4)
    eor     r0, r0, r2, ror #8      // r0 <- r0 ^ (BYTE_ROR_4(S4) >>> 8)
    eor     r2, r8, r2, ror #8      // r2 <- S4 ^ (BYTE_ROR_4(S4) >>> 8)
    eor     r8, r0, r2, ror #16     // r8 <- r0 ^ (r2 >>> 16)
    eor     r2, r4                  // r2 <- r2 ^ S0 ^ (BYTE_ROR_4(S0) >>> 8)
    and     r5, r14, r7, lsr #4     // r5 <- (S3 >> 4) & 0x0f0f0f0f
    and     r0, r14, r7             // r0 <- S3 & 0x0f0f0f0f
    orr     r0, r5, r0, lsl #4      // r0 <- BYTE_ROR_4(S3)
    eor     r2, r2, r0, ror #8      // r2 <- r2 ^ (BYTE_ROR_4(S3) >>> 8)
    eor     r0, r7, r0, ror #8      // r0 <- S3 ^ (BYTE_ROR_4(S3) >>> 8)
    eor     r7, r2, r0, ror #16     // r7 <- r2 ^ (r0 >>> 16)
    and     r5, r14, r6, lsr #4     // r5 <- (S2 >> 4) & 0x0f0f0f0f
    and     r2, r14, r6             // r2 <- S2 & 0x0f0f0f0f
    orr     r2, r5, r2, lsl #4      // r2 <- BYTE_ROR_4(S2)
    eor     r0, r0, r2, ror #8      // r0 <- r0 ^ (BYTE_ROR_4(S2) >>> 8)
    eor     r2, r6, r2, ror #8      // r2 <- S2 ^ (BYTE_ROR_4(S2) >>> 8)
    eor     r6, r0, r2, ror #16     // r6 <- r0 ^ (r2 >>> 16)
    and     r5, r14, r3, lsr #4     // r5 <- (S1 >> 4) & 0x0f0f0f0f
    and     r0, r14, r3             // r0 <- S1 & 0x0f0f0f0f
    orr     r0, r5, r0, lsl #4      // r0 <- BYTE_ROR_4(S1)
    ldr     r14, [sp, #52]          // restore link register
    eor     r2, r2, r0, ror #8      // r2 <- r2 ^ (BYTE_ROR_4(S1) >>> 8)
    eor     r0, r3, r0, ror #8      // r0 <- S1 ^ (BYTE_ROR_4(S1) >>> 8)
    eor     r5, r2, r0, ror #16     // r5 <- r2 ^ (r0 >>> 16)
    eor     r1, r0, r1              // r1 <- r0 ^ (BYTE_ROR_4(S0) >>> 8)
    eor     r4, r1, r4, ror #16     // r4 <- r1 ^ (r4 >>> 16)
    bx      lr
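
/******************************************************************************
* For reference, a C sketch of the per-word pattern computed above, shown for
* S'7 (illustrative only; BYTE_ROR_4(x) rotates every byte of x right by 4):
*
*   t0 = s0 ^ ror(BYTE_ROR_4(s0), 8);
*   t7 = s7 ^ ror(BYTE_ROR_4(s7), 8);
*   s7_out = s7 ^ t0 ^ t7 ^ ror(t7, 16);
*
* The remaining words follow the same pattern, per the register comments.
******************************************************************************/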

/******************************************************************************
* Computation of the MixColumns transformation in the fixsliced representation.
* For fully-fixsliced implementations only, for rounds i s.t. (i%4) == 2.
******************************************************************************/
.align 2
mixcolumns_2:
    str     r14, [sp, #52]          // store link register
    movw    r12, #0x3f3f
    movt    r12, #0x3f3f
    mc_0_2  r12, 2, 6, 30, 22
    ldr     r14, [sp, #52]          // restore link register
    bx      lr

/******************************************************************************
* Computation of the MixColumns transformation in the fixsliced representation.
* For fully-fixsliced implementations, it is used for rounds i s.t. (i%4) == 3.
* For semi-fixsliced implementations, it is used for rounds i s.t. (i%2) == 1.
* Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm.
******************************************************************************/
.align 2
mixcolumns_3:
    eor     r12, r11, r11, ror #8   // r12<- S7 ^ (S7 >>> 8)
    eor     r4, r1, r1, ror #8      // r4 <- S0 ^ (S0 >>> 8)
    eor     r11, r4, r11, ror #8    // r11<- S0 ^ (S0 >>> 8) ^ (S7 >>> 8)
    eor     r11, r11, r12, ror #16  // r11<- r11 ^ (S7 >>> 16) ^ (S7 >>> 24)
    eor     r10, r12, r2, ror #8    // r10<- S7 ^ (S7 >>> 8) ^ (S6 >>> 8)
    eor     r12, r2, r2, ror #8     // r12<- S6 ^ (S6 >>> 8)
    eor     r10, r10, r12, ror #16  // r10<- r10 ^ (S6 >>> 16) ^ (S6 >>> 24)
    eor     r10, r4                 // r10<- r10 ^ S0 ^ (S0 >>> 8)
    eor     r9, r12, r0, ror #8     // r9 <- S6 ^ (S6 >>> 8) ^ (S5 >>> 8)
    eor     r12, r0, r0, ror #8     // r12<- S5 ^ (S5 >>> 8)
    eor     r9, r9, r12, ror #16    // r9 <- r9 ^ (S5 >>> 16) ^ (S5 >>> 24)
    eor     r2, r8, r8, ror #8      // r2 <- S4 ^ (S4 >>> 8)
    eor     r8, r12, r8, ror #8     // r8 <- S5 ^ (S5 >>> 8) ^ (S4 >>> 8)
    eor     r8, r4                  // r8 <- r8 ^ S0 ^ (S0 >>> 8)
    eor     r8, r8, r2, ror #16     // r8 <- r8 ^ (S4 >>> 16) ^ (S4 >>> 24)
    eor     r12, r7, r7, ror #8     // r12<- S3 ^ (S3 >>> 8)
    eor     r7, r2, r7, ror #8      // r7 <- S4 ^ (S4 >>> 8) ^ (S3 >>> 8)
    eor     r7, r4                  // r7 <- r7 ^ S0 ^ (S0 >>> 8)
    eor     r7, r7, r12, ror #16    // r7 <- r7 ^ (S3 >>> 16) ^ (S3 >>> 24)
    eor     r2, r6, r6, ror #8      // r2 <- S2 ^ (S2 >>> 8)
    eor     r6, r12, r6, ror #8     // r6 <- S3 ^ (S3 >>> 8) ^ (S2 >>> 8)
    eor     r6, r6, r2, ror #16     // r6 <- r6 ^ (S2 >>> 16) ^ (S2 >>> 24)
    eor     r12, r3, r3, ror #8     // r12<- S1 ^ (S1 >>> 8)
    eor     r5, r2, r3, ror #8      // r5 <- S2 ^ (S2 >>> 8) ^ (S1 >>> 8)
    eor     r5, r5, r12, ror #16    // r5 <- r5 ^ (S1 >>> 16) ^ (S1 >>> 24)
    eor     r4, r12, r4, ror #16    // r4 <- S1 ^ (S1 >>> 8) ^ (r4 >>> 16)
    eor     r4, r4, r1, ror #8      // r4 <- r4 ^ (S0 >>> 8)
    bx      lr
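
/******************************************************************************
* For reference, a C sketch of the per-word pattern computed above, shown for
* S'7 and S'6 (illustrative only; ror() is a 32-bit right rotation):
*
*   t0 = s0 ^ ror(s0, 8);
*   t6 = s6 ^ ror(s6, 8);
*   t7 = s7 ^ ror(s7, 8);
*   s7_out = t0 ^ ror(s7, 8) ^ ror(t7, 16);
*   s6_out = t0 ^ t7 ^ ror(s6, 8) ^ ror(t6, 16);
*
* The remaining words follow the same pattern, per the register comments.
******************************************************************************/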

/******************************************************************************
* Applies the ShiftRows transformation twice (i.e. SR^2) on the internal state.
******************************************************************************/
.align 2
double_shiftrows:
    movw    r10, #0x0f00
    movt    r10, #0x0f00            // r10<- 0x0f000f00 (mask)
    swpmv   r0, r0, r0, r0, r10, #4, r12
    swpmv   r1, r1, r1, r1, r10, #4, r12
    swpmv   r2, r2, r2, r2, r10, #4, r12
    swpmv   r3, r3, r3, r3, r10, #4, r12
    swpmv   r6, r6, r6, r6, r10, #4, r12
    swpmv   r7, r7, r7, r7, r10, #4, r12
    swpmv   r8, r8, r8, r8, r10, #4, r12
    swpmv   r11, r11, r11, r11, r10, #4, r12
    bx      lr
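
/******************************************************************************
* For reference, a C sketch of the SR^2 step above (illustrative only): each
* slice is updated in place with a single SWAPMOVE, exchanging the nibbles
* selected by 0x0f000f00 with those selected by 0xf000f000:
*
*   for (int i = 0; i < 8; i++)
*       SWAPMOVE(s[i], s[i], 0x0f000f00, 4);
******************************************************************************/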

/******************************************************************************
* Fully-fixsliced implementation of AES-128.
*
* Two blocks are encrypted in parallel, without any mode of operation.
*
* Note that an additional 4 bytes are allocated on the stack, as the function
* takes 5 arguments as input.
******************************************************************************/
@ void aes128_encrypt_ffs(u8* ctext, u8* ctext_bis, const u8* ptext,
@                   const u8* ptext_bis, const u32* rkey);
.global aes128_encrypt_ffs
.type   aes128_encrypt_ffs,%function
.align 2
aes128_encrypt_ffs:
    push    {r0-r12,r14}
    sub.w   sp, #56                 // allow space on the stack for tmp var
    ldr.w   r4, [r2]                // load the 1st 128-bit blocks in r4-r7
    ldr     r5, [r2, #4]
    ldr     r6, [r2, #8]
    ldr     r7, [r2, #12]
    ldr.w   r8, [r3]                // load the 2nd 128-bit blocks in r8-r11
    ldr     r9, [r3, #4]
    ldr     r10,[r3, #8]
    ldr     r11,[r3, #12]
    ldr.w   r1, [sp, #112]          // load 'rkey' argument from the stack
    str.w   r1, [sp, #48]           // store it there for 'add_round_key'
    bl      packing                 // pack the 2 input blocks
    bl      ark_sbox                // ark + sbox (round 0)
    bl      mixcolumns_0            // mixcolumns (round 0)
    bl      ark_sbox                // ark + sbox (round 1)
    bl      mixcolumns_1            // mixcolumns (round 1)
    bl      ark_sbox                // ark + sbox (round 2)
    bl      mixcolumns_2            // mixcolumns (round 2)
    bl      ark_sbox                // ark + sbox (round 3)
    bl      mixcolumns_3            // mixcolumns (round 3)
    bl      ark_sbox                // ark + sbox (round 4)
    bl      mixcolumns_0            // mixcolumns (round 4)
    bl      ark_sbox                // ark + sbox (round 5)
    bl      mixcolumns_1            // mixcolumns (round 5)
    bl      ark_sbox                // ark + sbox (round 6)
    bl      mixcolumns_2            // mixcolumns (round 6)
    bl      ark_sbox                // ark + sbox (round 7)
    bl      mixcolumns_3            // mixcolumns (round 7)
    bl      ark_sbox                // ark + sbox (round 8)
    bl      mixcolumns_0            // mixcolumns (round 8)
    bl      ark_sbox                // ark + sbox (round 9)
    bl      double_shiftrows        // resynchronize with the classical representation
    ldr     r14, [sp, #48]          // ---------------------------------------
    ldmia   r14!, {r4,r5,r10,r12}   //
    eor     r4, r1                  //
    eor     r5, r3                  //
    eor     r6, r10                 //
    eor     r7, r12                 //  Last add_round_key
    ldmia   r14!, {r1,r3,r10,r12}   //
    eor     r8, r1                  //
    eor     r9, r0, r3              //
    eor     r10, r2                 //
    eor     r11, r12                // ---------------------------------------
    bl      unpacking               // unpack the internal state
    ldrd    r0, r1, [sp, #56]       // restore the addr to store the ciphertext
    add.w   sp, #64                 // restore the stack pointer
    str.w   r4, [r0]                // store the ciphertext
    str     r5, [r0, #4]
    str     r6, [r0, #8]
    str     r7, [r0, #12]
    str.w   r8, [r1]                // store the ciphertext
    str     r9, [r1, #4]
    str     r10,[r1, #8]
    str     r11,[r1, #12]
    pop     {r2-r12, r14}           // restore context
    bx      lr
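
/******************************************************************************
* For reference, a hedged C usage sketch (illustrative only; it assumes the
* round keys were expanded into the fixsliced representation beforehand by a
* companion key-schedule routine, here hypothetically named
* aes128_keyschedule_ffs, as in the fixslicing reference code):
*
*   unsigned char key[16], p0[16], p1[16];   // filled by the caller
*   unsigned char c0[16], c1[16];
*   uint32_t rkeys[88];                      // 11 round keys x 8 packed words
*   aes128_keyschedule_ffs(rkeys, key);      // hypothetical companion routine
*   aes128_encrypt_ffs(c0, c1, p0, p1, rkeys);
*
* aes256_encrypt_ffs below is called analogously, with 15 round keys
* (120 words).
******************************************************************************/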

/******************************************************************************
* Fully-fixsliced implementation of AES-256.
*
* Two blocks are encrypted in parallel, without any mode of operation.
*
* Note that an additional 4 bytes are allocated on the stack, as the function
* takes 5 arguments as input.
******************************************************************************/
@ void aes256_encrypt_ffs(u8* ctext, u8* ctext_bis, const u8* ptext,
@                   const u8* ptext_bis, const u32* rkey);
.global aes256_encrypt_ffs
.type   aes256_encrypt_ffs,%function
.align 2
aes256_encrypt_ffs:
    push    {r0-r12,r14}
    sub.w   sp, #56                 // allow space on the stack for tmp var
    ldr.w   r4, [r2]                // load the 1st 128-bit blocks in r4-r7
    ldr     r5, [r2, #4]
    ldr     r6, [r2, #8]
    ldr     r7, [r2, #12]
    ldr.w   r8, [r3]                // load the 2nd 128-bit blocks in r8-r11
    ldr     r9, [r3, #4]
    ldr     r10,[r3, #8]
    ldr     r11,[r3, #12]
    ldr.w   r1, [sp, #112]          // load 'rkey' argument from the stack
    str.w   r1, [sp, #48]           // store it there for 'add_round_key'
    bl      packing                 // pack the 2 input blocks
    bl      ark_sbox                // ark + sbox (round 0)
    bl      mixcolumns_0            // mixcolumns (round 0)
    bl      ark_sbox                // ark + sbox (round 1)
    bl      mixcolumns_1            // mixcolumns (round 1)
    bl      ark_sbox                // ark + sbox (round 2)
    bl      mixcolumns_2            // mixcolumns (round 2)
    bl      ark_sbox                // ark + sbox (round 3)
    bl      mixcolumns_3            // mixcolumns (round 3)
    bl      ark_sbox                // ark + sbox (round 4)
    bl      mixcolumns_0            // mixcolumns (round 4)
    bl      ark_sbox                // ark + sbox (round 5)
    bl      mixcolumns_1            // mixcolumns (round 5)
    bl      ark_sbox                // ark + sbox (round 6)
    bl      mixcolumns_2            // mixcolumns (round 6)
    bl      ark_sbox                // ark + sbox (round 7)
    bl      mixcolumns_3            // mixcolumns (round 7)
    bl      ark_sbox                // ark + sbox (round 8)
    bl      mixcolumns_0            // mixcolumns (round 8)
    bl      ark_sbox                // ark + sbox (round 9)
    bl      mixcolumns_1            // mixcolumns (round 9)
    bl      ark_sbox                // ark + sbox (round 10)
    bl      mixcolumns_2            // mixcolumns (round 10)
    bl      ark_sbox                // ark + sbox (round 11)
    bl      mixcolumns_3            // mixcolumns (round 11)
    bl      ark_sbox                // ark + sbox (round 12)
    bl      mixcolumns_0            // mixcolumns (round 12)
    bl      ark_sbox                // ark + sbox (round 13)
    bl      double_shiftrows        // resynchronize with the classical representation
    ldr     r14, [sp, #48]          // ---------------------------------------
    ldmia   r14!, {r4,r5,r10,r12}   //
    eor     r4, r1                  //
    eor     r5, r3                  //
    eor     r6, r10                 //
    eor     r7, r12                 //  Last add_round_key
    ldmia   r14!, {r1,r3,r10,r12}   //
    eor     r8, r1                  //
    eor     r9, r0, r3              //
    eor     r10, r2                 //
    eor     r11, r12                // ---------------------------------------
    bl      unpacking               // unpack the internal state
    ldrd    r0, r1, [sp, #56]       // restore the addr to store the ciphertext
    add.w   sp, #64                 // restore the stack pointer
    str.w   r4, [r0]                // store the ciphertext
    str     r5, [r0, #4]
    str     r6, [r0, #8]
    str     r7, [r0, #12]
    str.w   r8, [r1]                // store the ciphertext
    str     r9, [r1, #4]
    str     r10,[r1, #8]
    str     r11,[r1, #12]
    pop     {r2-r12, r14}           // restore context
    bx      lr