https://github.com/mupq/pqm4
Revision 992f0f226503d43b6d33278ecb60a9168ed8d787 authored by Michiel Van Beirendonck on 18 February 2021, 05:57:25 UTC, committed by GitHub on 18 February 2021, 05:57:25 UTC
* This is a large commit, grouping two types of changes on top of the NTT-based Saber.

Firstly, this commit merges improvements between different Saber implementations.

1) For round 3, the Saber reference code was thoroughly refactored and the codebase reduced [https://github.com/KULeuven-COSIC/SABER]. These changes are now integrated into the m4 code.

2) All unnecessary modular reductions have been removed. The only modular reductions are now in the packing functions.

3) Packing/unpacking functions are simplified [PQClean, commit f8503cb].

4) The secret-key is stored in compressed format [ia.cr/2020/268, Section 4.1]. This reduces the secret-key size, and the packing/unpacking functions are faster. (This requires a fix in pqm4’s testvectors.c, as the secret-key is checked against the one produced by PQclean).

5) During re-encryption, the verification of the ciphertext is performed in place [ia.cr/2020/268, Section 4.2].

6) Use symlinks for Light/FireSaber to make (minimal) differences with Saber more clear.

Secondly, this commit implements some optimizations and reduces the memory footprint of the NTT-based multiplication.

1) Saber does not require any modular reduction apart from bitstream packing. Elements can be kept in int16_t (central-reduced) format.

1.a) The secret-key is sign-extended from 4-bit to 16-bit when unpacked.
1.b) The vectors b and b' are sign-extended from 10-bit to 16-bit when unpacked.
1.c) 1.a and 1.b allow to remove NTT_pk (with central reduction) and use NTT (without central reduction) uniformly.
1.d) NTT_inv and NTT_inv_inner include a final step that converts from int16_t back to mod_p or mod_q. This is not necessary and removed.

2) During encryption, the NTT of s' is only computed once and reused between A*s' and b*s'.

3) Some just-in-time memory optimizations of [ia.cr/2018/682, Section 2.2] are implemented for the NTT-based multiplication. Polynomial vectors are generated from their seed just-in-time, converted to NTT domain, and pointwise multiplied. The next polynomial vectors can reuse all the buffers.

The idea is to extend this from polynomial vectors to individual polynomials. This still requires a new my_mul function.

For {Fire,Light}Saber (keygen/encaps/decaps) the resulting implementation is approximately (2.3-2.6%/4.7-5.5%/7.4-9.5%) faster and uses (27-36%/47-61%/49-62%) less dynamic memory than the current version in pqm4.

* Add central reduction for matrix A

* Add benchmarks

* WIP : more memory-efficient NTT implementation

* Make secret key compression optional
and comment out non-stack-optimized (very slightly faster) functions

* Reclaim ~1kB more stack space

shake_out was SABER_POLYVECBYTES instead of only SABER_POLYBYTES.

Introduced a few unions to overlap memory.

* rm redundant files

* clean ups; add soft links

* Reclaim ~1kB more stack space

shake_out was SABER_POLYVECBYTES instead of only SABER_POLYBYTES.

Introduced a few unions to overlap memory.

* typo

* Noinline no longer needed without fast funcs

* add benchmarks

Co-authored-by: vincentvbh <b05902122@ntu.edu.tw>
Co-authored-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
1 parent 5fb6938
Raw File
Tip revision: 992f0f226503d43b6d33278ecb60a9168ed8d787 authored by Michiel Van Beirendonck on 18 February 2021, 05:57:25 UTC
Stack optimizations and refactoring of NTT-based Saber (#181)
Tip revision: 992f0f2
aes-publicinputs.S
.syntax unified
.thumb

.section .data.aestable
.global AES_Te0
.type AES_Te0,%object
.align 2
AES_Te0:
.word 0x63c6a563, 0x7cf8847c, 0x77ee9977, 0x7bf68d7b
.word 0xf2ff0df2, 0x6bd6bd6b, 0x6fdeb16f, 0xc59154c5
.word 0x30605030, 0x01020301, 0x67cea967, 0x2b567d2b
.word 0xfee719fe, 0xd7b562d7, 0xab4de6ab, 0x76ec9a76
.word 0xca8f45ca, 0x821f9d82, 0xc98940c9, 0x7dfa877d
.word 0xfaef15fa, 0x59b2eb59, 0x478ec947, 0xf0fb0bf0
.word 0xad41ecad, 0xd4b367d4, 0xa25ffda2, 0xaf45eaaf
.word 0x9c23bf9c, 0xa453f7a4, 0x72e49672, 0xc09b5bc0
.word 0xb775c2b7, 0xfde11cfd, 0x933dae93, 0x264c6a26
.word 0x366c5a36, 0x3f7e413f, 0xf7f502f7, 0xcc834fcc
.word 0x34685c34, 0xa551f4a5, 0xe5d134e5, 0xf1f908f1
.word 0x71e29371, 0xd8ab73d8, 0x31625331, 0x152a3f15
.word 0x04080c04, 0xc79552c7, 0x23466523, 0xc39d5ec3
.word 0x18302818, 0x9637a196, 0x050a0f05, 0x9a2fb59a
.word 0x070e0907, 0x12243612, 0x801b9b80, 0xe2df3de2
.word 0xebcd26eb, 0x274e6927, 0xb27fcdb2, 0x75ea9f75
.word 0x09121b09, 0x831d9e83, 0x2c58742c, 0x1a342e1a
.word 0x1b362d1b, 0x6edcb26e, 0x5ab4ee5a, 0xa05bfba0
.word 0x52a4f652, 0x3b764d3b, 0xd6b761d6, 0xb37dceb3
.word 0x29527b29, 0xe3dd3ee3, 0x2f5e712f, 0x84139784
.word 0x53a6f553, 0xd1b968d1, 0x00000000, 0xedc12ced
.word 0x20406020, 0xfce31ffc, 0xb179c8b1, 0x5bb6ed5b
.word 0x6ad4be6a, 0xcb8d46cb, 0xbe67d9be, 0x39724b39
.word 0x4a94de4a, 0x4c98d44c, 0x58b0e858, 0xcf854acf
.word 0xd0bb6bd0, 0xefc52aef, 0xaa4fe5aa, 0xfbed16fb
.word 0x4386c543, 0x4d9ad74d, 0x33665533, 0x85119485
.word 0x458acf45, 0xf9e910f9, 0x02040602, 0x7ffe817f
.word 0x50a0f050, 0x3c78443c, 0x9f25ba9f, 0xa84be3a8
.word 0x51a2f351, 0xa35dfea3, 0x4080c040, 0x8f058a8f
.word 0x923fad92, 0x9d21bc9d, 0x38704838, 0xf5f104f5
.word 0xbc63dfbc, 0xb677c1b6, 0xdaaf75da, 0x21426321
.word 0x10203010, 0xffe51aff, 0xf3fd0ef3, 0xd2bf6dd2
.word 0xcd814ccd, 0x0c18140c, 0x13263513, 0xecc32fec
.word 0x5fbee15f, 0x9735a297, 0x4488cc44, 0x172e3917
.word 0xc49357c4, 0xa755f2a7, 0x7efc827e, 0x3d7a473d
.word 0x64c8ac64, 0x5dbae75d, 0x19322b19, 0x73e69573
.word 0x60c0a060, 0x81199881, 0x4f9ed14f, 0xdca37fdc
.word 0x22446622, 0x2a547e2a, 0x903bab90, 0x880b8388
.word 0x468cca46, 0xeec729ee, 0xb86bd3b8, 0x14283c14
.word 0xdea779de, 0x5ebce25e, 0x0b161d0b, 0xdbad76db
.word 0xe0db3be0, 0x32645632, 0x3a744e3a, 0x0a141e0a
.word 0x4992db49, 0x060c0a06, 0x24486c24, 0x5cb8e45c
.word 0xc29f5dc2, 0xd3bd6ed3, 0xac43efac, 0x62c4a662
.word 0x9139a891, 0x9531a495, 0xe4d337e4, 0x79f28b79
.word 0xe7d532e7, 0xc88b43c8, 0x376e5937, 0x6ddab76d
.word 0x8d018c8d, 0xd5b164d5, 0x4e9cd24e, 0xa949e0a9
.word 0x6cd8b46c, 0x56acfa56, 0xf4f307f4, 0xeacf25ea
.word 0x65caaf65, 0x7af48e7a, 0xae47e9ae, 0x08101808
.word 0xba6fd5ba, 0x78f08878, 0x254a6f25, 0x2e5c722e
.word 0x1c38241c, 0xa657f1a6, 0xb473c7b4, 0xc69751c6
.word 0xe8cb23e8, 0xdda17cdd, 0x74e89c74, 0x1f3e211f
.word 0x4b96dd4b, 0xbd61dcbd, 0x8b0d868b, 0x8a0f858a
.word 0x70e09070, 0x3e7c423e, 0xb571c4b5, 0x66ccaa66
.word 0x4890d848, 0x03060503, 0xf6f701f6, 0x0e1c120e
.word 0x61c2a361, 0x356a5f35, 0x57aef957, 0xb969d0b9
.word 0x86179186, 0xc19958c1, 0x1d3a271d, 0x9e27b99e
.word 0xe1d938e1, 0xf8eb13f8, 0x982bb398, 0x11223311
.word 0x69d2bb69, 0xd9a970d9, 0x8e07898e, 0x9433a794
.word 0x9b2db69b, 0x1e3c221e, 0x87159287, 0xe9c920e9
.word 0xce8749ce, 0x55aaff55, 0x28507828, 0xdfa57adf
.word 0x8c038f8c, 0xa159f8a1, 0x89098089, 0x0d1a170d
.word 0xbf65dabf, 0xe6d731e6, 0x4284c642, 0x68d0b868
.word 0x4182c341, 0x9929b099, 0x2d5a772d, 0x0f1e110f
.word 0xb07bcbb0, 0x54a8fc54, 0xbb6dd6bb, 0x162c3a16
.size AES_Te0,.-AES_Te0

.section .text.aes128
@ void aes128_keyexp_publicinputs_asm(const uint8_t *key,
@       uint8_t *rk) {
.global aes128_keyexp_publicinputs_asm
.type   aes128_keyexp_publicinputs_asm,%function
.align 2
aes128_keyexp_publicinputs_asm:

    //function prologue, preserve registers
    push {r4-r11}

    //load key
    //pointer may be non-aligned, so avoid using ldm/stm
    ldr r4, [r0, #0]
    ldr r5, [r0, #4]
    ldr r6, [r0, #8]
    ldr r7, [r0, #12]

    //load table address once
    ldr r3, =AES_Te0

    //round 1
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r3, r8, lsl #2]
    ldrb r9, [r3, r9, lsl #2]
    ldrb r10, [r3, r10, lsl #2]
    ldrb r11, [r3, r11, lsl #2]

    eor r4, #0x00000001 //rcon
    eor r4, r4, r8
    eor r4, r4, r9, lsl #8
    eor r4, r4, r10, lsl #16
    eor r4, r4, r11, lsl #24 //rk[4]
    eor r5, r4 //rk[5]
    eor r6, r5 //rk[6]
    eor r7, r6 //rk[7]

    //write to memory
    str r4, [r1, #0]
    str r5, [r1, #4]
    str r6, [r1, #8]
    str r7, [r1, #12]

    //round 2
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r3, r8, lsl #2]
    ldrb r9, [r3, r9, lsl #2]
    ldrb r10, [r3, r10, lsl #2]
    ldrb r11, [r3, r11, lsl #2]

    eor r4, #0x00000002 //rcon
    eor r4, r4, r8
    eor r4, r4, r9, lsl #8
    eor r4, r4, r10, lsl #16
    eor r4, r4, r11, lsl #24 //rk[8]
    eor r5, r4 //rk[9]
    eor r6, r5 //rk[10]
    eor r7, r6 //rk[11]

    //write to memory
    str r4, [r1, #16]
    str r5, [r1, #20]
    str r6, [r1, #24]
    str r7, [r1, #28]

    //round 3
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r3, r8, lsl #2]
    ldrb r9, [r3, r9, lsl #2]
    ldrb r10, [r3, r10, lsl #2]
    ldrb r11, [r3, r11, lsl #2]

    eor r4, #0x00000004 //rcon
    eor r4, r4, r8
    eor r4, r4, r9, lsl #8
    eor r4, r4, r10, lsl #16
    eor r4, r4, r11, lsl #24 //rk[12]
    eor r5, r4 //rk[13]
    eor r6, r5 //rk[14]
    eor r7, r6 //rk[15]

    //write to memory
    str r4, [r1, #32]
    str r5, [r1, #36]
    str r6, [r1, #40]
    str r7, [r1, #44]

    //round 4
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r3, r8, lsl #2]
    ldrb r9, [r3, r9, lsl #2]
    ldrb r10, [r3, r10, lsl #2]
    ldrb r11, [r3, r11, lsl #2]

    eor r4, #0x00000008 //rcon
    eor r4, r4, r8
    eor r4, r4, r9, lsl #8
    eor r4, r4, r10, lsl #16
    eor r4, r4, r11, lsl #24 //rk[16]
    eor r5, r4 //rk[17]
    eor r6, r5 //rk[18]
    eor r7, r6 //rk[19]

    //write to memory
    str r4, [r1, #48]
    str r5, [r1, #52]
    str r6, [r1, #56]
    str r7, [r1, #60]

    //round 5
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r3, r8, lsl #2]
    ldrb r9, [r3, r9, lsl #2]
    ldrb r10, [r3, r10, lsl #2]
    ldrb r11, [r3, r11, lsl #2]

    eor r4, #0x00000010 //rcon
    eor r4, r4, r8
    eor r4, r4, r9, lsl #8
    eor r4, r4, r10, lsl #16
    eor r4, r4, r11, lsl #24 //rk[20]
    eor r5, r4 //rk[21]
    eor r6, r5 //rk[22]
    eor r7, r6 //rk[23]

    //write to memory
    str r4, [r1, #64]
    str r5, [r1, #68]
    str r6, [r1, #72]
    str r7, [r1, #76]

    //round 6
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r3, r8, lsl #2]
    ldrb r9, [r3, r9, lsl #2]
    ldrb r10, [r3, r10, lsl #2]
    ldrb r11, [r3, r11, lsl #2]

    eor r4, #0x00000020 //rcon
    eor r4, r4, r8
    eor r4, r4, r9, lsl #8
    eor r4, r4, r10, lsl #16
    eor r4, r4, r11, lsl #24 //rk[24]
    eor r5, r4 //rk[25]
    eor r6, r5 //rk[26]
    eor r7, r6 //rk[27]

    //write to memory
    str r4, [r1, #80]
    str r5, [r1, #84]
    str r6, [r1, #88]
    str r7, [r1, #92]

    //round 7
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r3, r8, lsl #2]
    ldrb r9, [r3, r9, lsl #2]
    ldrb r10, [r3, r10, lsl #2]
    ldrb r11, [r3, r11, lsl #2]

    eor r4, #0x00000040 //rcon
    eor r4, r4, r8
    eor r4, r4, r9, lsl #8
    eor r4, r4, r10, lsl #16
    eor r4, r4, r11, lsl #24 //rk[28]
    eor r5, r4 //rk[29]
    eor r6, r5 //rk[30]
    eor r7, r6 //rk[31]

    //write to memory
    str r4, [r1, #96]
    str r5, [r1, #100]
    str r6, [r1, #104]
    str r7, [r1, #108]

    //round 8
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r3, r8, lsl #2]
    ldrb r9, [r3, r9, lsl #2]
    ldrb r10, [r3, r10, lsl #2]
    ldrb r11, [r3, r11, lsl #2]

    eor r4, #0x00000080 //rcon
    eor r4, r4, r8
    eor r4, r4, r9, lsl #8
    eor r4, r4, r10, lsl #16
    eor r4, r4, r11, lsl #24 //rk[32]
    eor r5, r4 //rk[33]
    eor r6, r5 //rk[34]
    eor r7, r6 //rk[35]

    //write to memory
    str r4, [r1, #112]
    str r5, [r1, #116]
    str r6, [r1, #120]
    str r7, [r1, #124]

    add r1, #128

    //round 9
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r3, r8, lsl #2]
    ldrb r9, [r3, r9, lsl #2]
    ldrb r10, [r3, r10, lsl #2]
    ldrb r11, [r3, r11, lsl #2]

    eor r4, #0x0000001B //rcon
    eor r4, r4, r8
    eor r4, r4, r9, lsl #8
    eor r4, r4, r10, lsl #16
    eor r4, r4, r11, lsl #24 //rk[36]
    eor r5, r4 //rk[37]
    eor r6, r5 //rk[38]
    eor r7, r6 //rk[39]

    //write to memory
    str r4, [r1, #0]
    str r5, [r1, #4]
    str r6, [r1, #8]
    str r7, [r1, #12]

    //round 10
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r3, r8, lsl #2]
    ldrb r9, [r3, r9, lsl #2]
    ldrb r10, [r3, r10, lsl #2]
    ldrb r11, [r3, r11, lsl #2]

    eor r4, #0x00000036 //rcon
    eor r4, r4, r8
    eor r4, r4, r9, lsl #8
    eor r4, r4, r10, lsl #16
    eor r4, r4, r11, lsl #24 //rk[40]
    eor r5, r4 //rk[41]
    eor r6, r5 //rk[42]
    eor r7, r6 //rk[43]

    //write to memory
    str r4, [r1, #16]
    str r5, [r1, #20]
    str r6, [r1, #24]
    str r7, [r1, #28]

    //function epilogue, restore state
    pop {r4-r11}
    bx lr
.size aes128_keyexp_publicinputs_asm,.-aes128_keyexp_publicinputs_asm

.macro aesencrypt_oddround
    ldr r8, [r14], #4
    ldr r9, [r14], #4
    ldr r10, [r14], #4
    ldr r11, [r14], #4

    uxtb r0, r4
    uxtb r1, r5
    uxtb r2, r6
    uxtb r3, r7
    ldr r0, [r12, r0, lsl #2]
    ldr r1, [r12, r1, lsl #2]
    ldr r2, [r12, r2, lsl #2]
    ldr r3, [r12, r3, lsl #2]
    eor r8, r8, r0, ror #16
    eor r9, r9, r1, ror #16
    eor r10, r10, r2, ror #16
    eor r11, r11, r3, ror #16

    uxtb r0, r5, ror #8
    uxtb r1, r6, ror #8
    uxtb r2, r7, ror #8
    uxtb r3, r4, ror #8
    ldr r0, [r12, r0, lsl #2]
    ldr r1, [r12, r1, lsl #2]
    ldr r2, [r12, r2, lsl #2]
    ldr r3, [r12, r3, lsl #2]
    eor r8, r8, r0, ror #8
    eor r9, r9, r1, ror #8
    eor r10, r10, r2, ror #8
    eor r11, r11, r3, ror #8

    uxtb r0, r6, ror #16
    uxtb r1, r7, ror #16
    uxtb r2, r4, ror #16
    uxtb r3, r5, ror #16
    ldr r0, [r12, r0, lsl #2]
    ldr r1, [r12, r1, lsl #2]
    ldr r2, [r12, r2, lsl #2]
    ldr r3, [r12, r3, lsl #2]
    eor r8, r0
    eor r9, r1
    eor r10, r2
    eor r11, r3

    uxtb r0, r7, ror #24
    uxtb r1, r4, ror #24
    uxtb r2, r5, ror #24
    uxtb r3, r6, ror #24
    ldr r0, [r12, r0, lsl #2]
    ldr r1, [r12, r1, lsl #2]
    ldr r2, [r12, r2, lsl #2]
    ldr r3, [r12, r3, lsl #2]
    eor r8, r8, r0, ror #24
    eor r9, r9, r1, ror #24
    eor r10, r10, r2, ror #24
    eor r11, r11, r3, ror #24
.endm

.macro aesencrypt_evenround
    ldr r4, [r14], #4
    ldr r5, [r14], #4
    ldr r6, [r14], #4
    ldr r7, [r14], #4

    uxtb r0, r8
    uxtb r1, r9
    uxtb r2, r10
    uxtb r3, r11
    ldr r0, [r12, r0, lsl #2]
    ldr r1, [r12, r1, lsl #2]
    ldr r2, [r12, r2, lsl #2]
    ldr r3, [r12, r3, lsl #2]
    eor r4, r4, r0, ror #16
    eor r5, r5, r1, ror #16
    eor r6, r6, r2, ror #16
    eor r7, r7, r3, ror #16

    uxtb r0, r9, ror #8
    uxtb r1, r10, ror #8
    uxtb r2, r11, ror #8
    uxtb r3, r8, ror #8
    ldr r0, [r12, r0, lsl #2]
    ldr r1, [r12, r1, lsl #2]
    ldr r2, [r12, r2, lsl #2]
    ldr r3, [r12, r3, lsl #2]
    eor r4, r4, r0, ror #8
    eor r5, r5, r1, ror #8
    eor r6, r6, r2, ror #8
    eor r7, r7, r3, ror #8

    uxtb r0, r10, ror #16
    uxtb r1, r11, ror #16
    uxtb r2, r8, ror #16
    uxtb r3, r9, ror #16
    ldr r0, [r12, r0, lsl #2]
    ldr r1, [r12, r1, lsl #2]
    ldr r2, [r12, r2, lsl #2]
    ldr r3, [r12, r3, lsl #2]
    eor r4, r0
    eor r5, r1
    eor r6, r2
    eor r7, r3

    uxtb r0, r11, ror #24
    uxtb r1, r8, ror #24
    uxtb r2, r9, ror #24
    uxtb r3, r10, ror #24
    ldr r0, [r12, r0, lsl #2]
    ldr r1, [r12, r1, lsl #2]
    ldr r2, [r12, r2, lsl #2]
    ldr r3, [r12, r3, lsl #2]
    eor r4, r4, r0, ror #24
    eor r5, r5, r1, ror #24
    eor r6, r6, r2, ror #24
    eor r7, r7, r3, ror #24
.endm

.macro aesencrypt_finalround
    uxtb r0, r11, ror #24
    uxtb r1, r8, ror #24
    uxtb r2, r9, ror #24
    uxtb r3, r10, ror #24
    ldr r4, [r12, r0, lsl #2]
    ldr r5, [r12, r1, lsl #2]
    ldr r6, [r12, r2, lsl #2]
    ldr r7, [r12, r3, lsl #2]

    uxtb r0, r10, ror #16
    uxtb r1, r11, ror #16
    uxtb r2, r8, ror #16
    uxtb r3, r9, ror #16
    ldr r0, [r12, r0, lsl #2]
    ldr r1, [r12, r1, lsl #2]
    ldr r2, [r12, r2, lsl #2]
    ldr r3, [r12, r3, lsl #2]
    bfi r4, r0, #24, #8
    bfi r5, r1, #24, #8
    bfi r6, r2, #24, #8
    bfi r7, r3, #24, #8

    uxtb r0, r8
    uxtb r1, r9
    uxtb r2, r10
    uxtb r3, r11
    ldr r0, [r12, r0, lsl #2]
    ldr r1, [r12, r1, lsl #2]
    ldr r2, [r12, r2, lsl #2]
    ldr r3, [r12, r3, lsl #2]
    bfi r4, r0, #8, #8
    bfi r5, r1, #8, #8
    bfi r6, r2, #8, #8
    bfi r7, r3, #8, #8

    uxtb r0, r9, ror #8
    uxtb r1, r10, ror #8
    uxtb r2, r11, ror #8
    uxtb r3, r8, ror #8
    ldr r0, [r12, r0, lsl #2]
    ldr r1, [r12, r1, lsl #2]
    ldr r2, [r12, r2, lsl #2]
    ldr r3, [r12, r3, lsl #2]
    bfi r4, r0, #16, #8
    bfi r5, r1, #16, #8
    bfi r6, r2, #16, #8
    bfi r7, r3, #16, #8
.endm

@ void aes128_encrypt_publicinputs_asm(const uint8_t *rk,
@       const uint8_t *in, uint8_t *out) {
.global aes128_encrypt_publicinputs_asm
.type   aes128_encrypt_publicinputs_asm,%function
.align 2
aes128_encrypt_publicinputs_asm:

    //function prologue, preserve registers and free r2
    push {r2,r4-r12,r14}

    //load input
    ldr r4, [r1, #0]
    ldr r5, [r1, #4]
    ldr r6, [r1, #8]
    ldr r7, [r1, #12]
    //r1 now free to overwrite
    //load key
    ldr r8, [r0], #4
    ldr r9, [r0], #4
    ldr r10, [r0], #4
    ldr r11, [r0], #4
    mov.w r14, r0

    //load table address once
    ldr r12, =AES_Te0

    //initial addroundkey
    eor r4, r8
    eor r5, r9
    eor r6, r10
    eor r7, r11

.rept 4
    aesencrypt_oddround
    aesencrypt_evenround
.endr
    aesencrypt_oddround
    aesencrypt_finalround

    //rk[40]-rk[43]
    ldr r0, [r14, #0]
    ldr r1, [r14, #4]
    ldr r2, [r14, #8]
    ldr r3, [r14, #12]

    eor r0, r0, r4, ror #8
    eor r1, r1, r5, ror #8
    eor r2, r2, r6, ror #8
    pop.w {r4}
    eor r3, r3, r7, ror #8

    //write output
    str r0, [r4, #0]
    str r1, [r4, #4]
    str r2, [r4, #8]
    str r3, [r4, #12]

    //function epilogue, restore state
    pop {r4-r12,r14}
    bx lr
.size aes128_encrypt_publicinputs_asm,.-aes128_encrypt_publicinputs_asm

.section .text.aes192
@ void aes192_keyexp_publicinputs_asm(const uint8_t *key,
@       uint8_t *rk) {
.global aes192_keyexp_publicinputs_asm
.type   aes192_keyexp_publicinputs_asm,%function
.align 2
aes192_keyexp_publicinputs_asm:

    //function prologue, preserve registers
    push {r4-r11}

    //load key
    ldr r2, [r0, #0]
    ldr r3, [r0, #4]
    ldr r4, [r0, #8]
    ldr r5, [r0, #12]
    ldr r6, [r0, #16]
    ldr r7, [r0, #20]

    //load table address once
    ldr r0, =AES_Te0

    //round 1
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r0, r8, lsl #2]
    ldrb r9, [r0, r9, lsl #2]
    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]

    eor r2, #0x00000001 //rcon
    eor r2, r2, r8
    eor r2, r2, r9, lsl #8
    eor r2, r2, r10, lsl #16
    eor r2, r2, r11, lsl #24 //rk[6]
    eor r3, r2 //rk[7]
    eor r4, r3 //rk[8]
    eor r5, r4 //rk[9]
    eor r6, r5 //rk[10]
    eor r7, r6 //rk[11]

    //write to memory
    str r2, [r1, #0]
    str r3, [r1, #4]
    str r4, [r1, #8]
    str r5, [r1, #12]
    str r6, [r1, #16]
    str r7, [r1, #20]

    //round 2
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r0, r8, lsl #2]
    ldrb r9, [r0, r9, lsl #2]
    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]

    eor r2, #0x00000002 //rcon
    eor r2, r2, r8
    eor r2, r2, r9, lsl #8
    eor r2, r2, r10, lsl #16
    eor r2, r2, r11, lsl #24 //rk[12]
    eor r3, r2 //rk[13]
    eor r4, r3 //rk[14]
    eor r5, r4 //rk[15]
    eor r6, r5 //rk[16]
    eor r7, r6 //rk[17]

    //write to memory
    str r2, [r1, #24]
    str r3, [r1, #28]
    str r4, [r1, #32]
    str r5, [r1, #36]
    str r6, [r1, #40]
    str r7, [r1, #44]

    //round 3
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r0, r8, lsl #2]
    ldrb r9, [r0, r9, lsl #2]
    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]

    eor r2, #0x00000004 //rcon
    eor r2, r2, r8
    eor r2, r2, r9, lsl #8
    eor r2, r2, r10, lsl #16
    eor r2, r2, r11, lsl #24 //rk[18]
    eor r3, r2 //rk[19]
    eor r4, r3 //rk[20]
    eor r5, r4 //rk[21]
    eor r6, r5 //rk[22]
    eor r7, r6 //rk[23]

    //write to memory
    str r2, [r1, #48]
    str r3, [r1, #52]
    str r4, [r1, #56]
    str r5, [r1, #60]
    str r6, [r1, #64]
    str r7, [r1, #68]

    //round 4
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r0, r8, lsl #2]
    ldrb r9, [r0, r9, lsl #2]
    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]

    eor r2, #0x00000008 //rcon
    eor r2, r2, r8
    eor r2, r2, r9, lsl #8
    eor r2, r2, r10, lsl #16
    eor r2, r2, r11, lsl #24 //rk[24]
    eor r3, r2 //rk[25]
    eor r4, r3 //rk[26]
    eor r5, r4 //rk[27]
    eor r6, r5 //rk[28]
    eor r7, r6 //rk[29]

    //write to memory
    str r2, [r1, #72]
    str r3, [r1, #76]
    str r4, [r1, #80]
    str r5, [r1, #84]
    str r6, [r1, #88]
    str r7, [r1, #92]

    //round 5
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r0, r8, lsl #2]
    ldrb r9, [r0, r9, lsl #2]
    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]

    eor r2, #0x00000010 //rcon
    eor r2, r2, r8
    eor r2, r2, r9, lsl #8
    eor r2, r2, r10, lsl #16
    eor r2, r2, r11, lsl #24 //rk[30]
    eor r3, r2 //rk[31]
    eor r4, r3 //rk[32]
    eor r5, r4 //rk[33]
    eor r6, r5 //rk[34]
    eor r7, r6 //rk[35]

    //write to memory
    str r2, [r1, #96]
    str r3, [r1, #100]
    str r4, [r1, #104]
    str r5, [r1, #108]
    str r6, [r1, #112]
    str r7, [r1, #116]

    add r1, #120

    //round 6
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r0, r8, lsl #2]
    ldrb r9, [r0, r9, lsl #2]
    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]

    eor r2, #0x00000020 //rcon
    eor r2, r2, r8
    eor r2, r2, r9, lsl #8
    eor r2, r2, r10, lsl #16
    eor r2, r2, r11, lsl #24 //rk[36]
    eor r3, r2 //rk[37]
    eor r4, r3 //rk[38]
    eor r5, r4 //rk[39]
    eor r6, r5 //rk[40]
    eor r7, r6 //rk[41]

    //write to memory
    str r2, [r1, #0]
    str r3, [r1, #4]
    str r4, [r1, #8]
    str r5, [r1, #12]
    str r6, [r1, #16]
    str r7, [r1, #20]

    //round 7
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r0, r8, lsl #2]
    ldrb r9, [r0, r9, lsl #2]
    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]

    eor r2, #0x00000040 //rcon
    eor r2, r2, r8
    eor r2, r2, r9, lsl #8
    eor r2, r2, r10, lsl #16
    eor r2, r2, r11, lsl #24 //rk[42]
    eor r3, r2 //rk[43]
    eor r4, r3 //rk[44]
    eor r5, r4 //rk[45]
    eor r6, r5 //rk[46]
    eor r7, r6 //rk[47]

    //write to memory
    str r2, [r1, #24]
    str r3, [r1, #28]
    str r4, [r1, #32]
    str r5, [r1, #36]
    str r6, [r1, #40]
    str r7, [r1, #44]

    //round 8
    uxtb r8, r7, ror #8
    uxtb r9, r7, ror #16
    uxtb r10, r7, ror #24
    uxtb r11, r7

    ldrb r8, [r0, r8, lsl #2]
    ldrb r9, [r0, r9, lsl #2]
    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]

    eor r2, #0x00000080 //rcon
    eor r2, r2, r8
    eor r2, r2, r9, lsl #8
    eor r2, r2, r10, lsl #16
    eor r2, r2, r11, lsl #24 //rk[48]
    eor r3, r2 //rk[49]
    eor r4, r3 //rk[50]
    eor r5, r4 //rk[51]
    //write to memory
    str r2, [r1, #48]
    str r3, [r1, #52]
    str r4, [r1, #56]
    str r5, [r1, #60]

    //function epilogue, restore state
    pop {r4-r11}
    bx lr
.size aes192_keyexp_publicinputs_asm,.-aes192_keyexp_publicinputs_asm

@ void aes192_encrypt_publicinputs_asm(const uint8_t *rk,
@       const uint8_t *in, uint8_t *out) {
.global aes192_encrypt_publicinputs_asm
.type   aes192_encrypt_publicinputs_asm,%function
.align 2
aes192_encrypt_publicinputs_asm:

    //function prologue, preserve registers and free r2
    push {r2,r4-r12,r14}

    //load input
    ldr r4, [r1, #0]
    ldr r5, [r1, #4]
    ldr r6, [r1, #8]
    ldr r7, [r1, #12]
    //r1 now free to overwrite
    //load key
    ldr r8, [r0], #4
    ldr r9, [r0], #4
    ldr r10, [r0], #4
    ldr r11, [r0], #4
    mov.w r14, r0

    //load table address once
    ldr r12, =AES_Te0

    //initial addroundkey
    eor r4, r8
    eor r5, r9
    eor r6, r10
    eor r7, r11

.rept 5
    aesencrypt_oddround
    aesencrypt_evenround
.endr
    aesencrypt_oddround
    aesencrypt_finalround

    //rk[48]-rk[51]
    ldr r0, [r14, #0]
    ldr r1, [r14, #4]
    ldr r2, [r14, #8]
    ldr r3, [r14, #12]

    eor r0, r0, r4, ror #8
    eor r1, r1, r5, ror #8
    eor r2, r2, r6, ror #8
    pop.w {r4}
    eor r3, r3, r7, ror #8

    //write output
    str r0, [r4, #0]
    str r1, [r4, #4]
    str r2, [r4, #8]
    str r3, [r4, #12]

    //function epilogue, restore state
    pop {r4-r12,r14}
    bx lr
.size aes192_encrypt_publicinputs_asm,.-aes192_encrypt_publicinputs_asm

.section .text.aes256
@ void aes256_keyexp_publicinputs_asm(const uint8_t *key,
@       uint8_t *rk) {
.global aes256_keyexp_publicinputs_asm
.type   aes256_keyexp_publicinputs_asm,%function
.align 2
aes256_keyexp_publicinputs_asm:

    //function prologue, preserve registers
    push {r4-r12,r14}

    //load key
    ldr r2, [r0, #0]
    ldr r3, [r0, #4]
    ldr r4, [r0, #8]
    ldr r5, [r0, #12]
    ldr r6, [r0, #16]
    ldr r7, [r0, #20]
    ldr r8, [r0, #24]
    ldr r9, [r0, #28]

    //load table address once
    ldr r0, =AES_Te0

    //round 1
    uxtb r10, r9, ror #8
    uxtb r11, r9, ror #16
    uxtb r12, r9, ror #24
    uxtb r14, r9

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r2, #0x00000001 //rcon
    eor r2, r2, r10
    eor r2, r2, r11, lsl #8
    eor r2, r2, r12, lsl #16
    eor r2, r2, r14, lsl #24 //rk[8]
    eor r3, r2 //rk[9]
    eor r4, r3 //rk[10]
    eor r5, r4 //rk[11]

    uxtb r10, r5, ror #16
    uxtb r11, r5, ror #8
    uxtb r12, r5
    uxtb r14, r5, ror #24

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r6, r6, r10, lsl #16
    eor r6, r6, r11, lsl #8
    eor r6, r12
    eor r6, r6, r14, lsl #24 //rk[12]
    eor r7, r6 //rk[13]
    eor r8, r7 //rk[14]
    eor r9, r8 //rk[15]

    //write to memory
    str r2, [r1, #0]
    str r3, [r1, #4]
    str r4, [r1, #8]
    str r5, [r1, #12]
    str r6, [r1, #16]
    str r7, [r1, #20]
    str r8, [r1, #24]
    str r9, [r1, #28]

    //round 2
    uxtb r10, r9, ror #8
    uxtb r11, r9, ror #16
    uxtb r12, r9, ror #24
    uxtb r14, r9

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r2, #0x00000002 //rcon
    eor r2, r2, r10
    eor r2, r2, r11, lsl #8
    eor r2, r2, r12, lsl #16
    eor r2, r2, r14, lsl #24 //rk[16]
    eor r3, r2 //rk[17]
    eor r4, r3 //rk[18]
    eor r5, r4 //rk[19]

    uxtb r10, r5, ror #16
    uxtb r11, r5, ror #8
    uxtb r12, r5
    uxtb r14, r5, ror #24

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r6, r6, r10, lsl #16
    eor r6, r6, r11, lsl #8
    eor r6, r12
    eor r6, r6, r14, lsl #24 //rk[20]
    eor r7, r6 //rk[21]
    eor r8, r7 //rk[22]
    eor r9, r8 //rk[23]

    //write to memory
    str r2, [r1, #32]
    str r3, [r1, #36]
    str r4, [r1, #40]
    str r5, [r1, #44]
    str r6, [r1, #48]
    str r7, [r1, #52]
    str r8, [r1, #56]
    str r9, [r1, #60]

    //round 3
    uxtb r10, r9, ror #8
    uxtb r11, r9, ror #16
    uxtb r12, r9, ror #24
    uxtb r14, r9

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r2, #0x00000004 //rcon
    eor r2, r2, r10
    eor r2, r2, r11, lsl #8
    eor r2, r2, r12, lsl #16
    eor r2, r2, r14, lsl #24 //rk[24]
    eor r3, r2 //rk[25]
    eor r4, r3 //rk[26]
    eor r5, r4 //rk[27]

    uxtb r10, r5, ror #16
    uxtb r11, r5, ror #8
    uxtb r12, r5
    uxtb r14, r5, ror #24

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r6, r6, r10, lsl #16
    eor r6, r6, r11, lsl #8
    eor r6, r12
    eor r6, r6, r14, lsl #24 //rk[28]
    eor r7, r6 //rk[29]
    eor r8, r7 //rk[30]
    eor r9, r8 //rk[31]

    //write to memory
    str r2, [r1, #64]
    str r3, [r1, #68]
    str r4, [r1, #72]
    str r5, [r1, #76]
    str r6, [r1, #80]
    str r7, [r1, #84]
    str r8, [r1, #88]
    str r9, [r1, #92]

    //round 4
    uxtb r10, r9, ror #8
    uxtb r11, r9, ror #16
    uxtb r12, r9, ror #24
    uxtb r14, r9

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r2, #0x00000008 //rcon
    eor r2, r2, r10
    eor r2, r2, r11, lsl #8
    eor r2, r2, r12, lsl #16
    eor r2, r2, r14, lsl #24 //rk[32]
    eor r3, r2 //rk[33]
    eor r4, r3 //rk[34]
    eor r5, r4 //rk[35]

    uxtb r10, r5, ror #16
    uxtb r11, r5, ror #8
    uxtb r12, r5
    uxtb r14, r5, ror #24

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r6, r6, r10, lsl #16
    eor r6, r6, r11, lsl #8
    eor r6, r12
    eor r6, r6, r14, lsl #24 //rk[36]
    eor r7, r6 //rk[37]
    eor r8, r7 //rk[38]
    eor r9, r8 //rk[39]

    //write to memory
    str r2, [r1, #96]
    str r3, [r1, #100]
    str r4, [r1, #104]
    str r5, [r1, #108]
    str r6, [r1, #112]
    str r7, [r1, #116]
    str r8, [r1, #120]
    str r9, [r1, #124]

    add r1, #128

    //round 5
    uxtb r10, r9, ror #8
    uxtb r11, r9, ror #16
    uxtb r12, r9, ror #24
    uxtb r14, r9

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r2, #0x00000010 //rcon
    eor r2, r2, r10
    eor r2, r2, r11, lsl #8
    eor r2, r2, r12, lsl #16
    eor r2, r2, r14, lsl #24 //rk[40]
    eor r3, r2 //rk[41]
    eor r4, r3 //rk[42]
    eor r5, r4 //rk[43]

    uxtb r10, r5, ror #16
    uxtb r11, r5, ror #8
    uxtb r12, r5
    uxtb r14, r5, ror #24

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r6, r6, r10, lsl #16
    eor r6, r6, r11, lsl #8
    eor r6, r12
    eor r6, r6, r14, lsl #24 //rk[44]
    eor r7, r6 //rk[45]
    eor r8, r7 //rk[46]
    eor r9, r8 //rk[47]

    //write to memory
    str r2, [r1, #0]
    str r3, [r1, #4]
    str r4, [r1, #8]
    str r5, [r1, #12]
    str r6, [r1, #16]
    str r7, [r1, #20]
    str r8, [r1, #24]
    str r9, [r1, #28]

    //round 6
    uxtb r10, r9, ror #8
    uxtb r11, r9, ror #16
    uxtb r12, r9, ror #24
    uxtb r14, r9

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r2, #0x00000020 //rcon
    eor r2, r2, r10
    eor r2, r2, r11, lsl #8
    eor r2, r2, r12, lsl #16
    eor r2, r2, r14, lsl #24 //rk[48]
    eor r3, r2 //rk[49]
    eor r4, r3 //rk[50]
    eor r5, r4 //rk[51]

    uxtb r10, r5, ror #16
    uxtb r11, r5, ror #8
    uxtb r12, r5
    uxtb r14, r5, ror #24

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r6, r6, r10, lsl #16
    eor r6, r6, r11, lsl #8
    eor r6, r12
    eor r6, r6, r14, lsl #24 //rk[52]
    eor r7, r6 //rk[53]
    eor r8, r7 //rk[54]
    eor r9, r8 //rk[55]

    //write to memory
    str r2, [r1, #32]
    str r3, [r1, #36]
    str r4, [r1, #40]
    str r5, [r1, #44]
    str r6, [r1, #48]
    str r7, [r1, #52]
    str r8, [r1, #56]
    str r9, [r1, #60]

    //round 7
    uxtb r10, r9, ror #8
    uxtb r11, r9, ror #16
    uxtb r12, r9, ror #24
    uxtb r14, r9

    ldrb r10, [r0, r10, lsl #2]
    ldrb r11, [r0, r11, lsl #2]
    ldrb r12, [r0, r12, lsl #2]
    ldrb r14, [r0, r14, lsl #2]

    eor r2, #0x00000040 //rcon
    eor r2, r2, r10
    eor r2, r2, r11, lsl #8
    eor r2, r2, r12, lsl #16
    eor r2, r2, r14, lsl #24 //rk[56]
    eor r3, r2 //rk[57]
    eor r4, r3 //rk[58]
    eor r5, r4 //rk[59]

    //write to memory
    str r2, [r1, #64]
    str r3, [r1, #68]
    str r4, [r1, #72]
    str r5, [r1, #76]

    //function epilogue, restore state
    pop {r4-r12,r14}
    bx lr
.size aes256_keyexp_publicinputs_asm,.-aes256_keyexp_publicinputs_asm

.align 2
.ltorg

@ void aes256_encrypt_publicinputs_asm(const uint8_t *rk,
@       const uint8_t *in, uint8_t *out) {
.global aes256_encrypt_publicinputs_asm
.type   aes256_encrypt_publicinputs_asm,%function
.align 2
aes256_encrypt_publicinputs_asm:

    //function prologue, preserve registers and free r2
    push {r2,r4-r12,r14}

    //load input
    ldr r4, [r1, #0]
    ldr r5, [r1, #4]
    ldr r6, [r1, #8]
    ldr r7, [r1, #12]
    //r1 now free to overwrite
    //load key
    ldr r8, [r0], #4
    ldr r9, [r0], #4
    ldr r10, [r0], #4
    ldr r11, [r0], #4
    mov.w r14, r0

    //load table address once
    ldr r12, =AES_Te0

    //initial addroundkey
    eor r4, r8
    eor r5, r9
    eor r6, r10
    eor r7, r11

.rept 6
    aesencrypt_oddround
    aesencrypt_evenround
.endr
    aesencrypt_oddround
    aesencrypt_finalround

    //rk[56]-rk[59]
    ldr r0, [r14, #0]
    ldr r1, [r14, #4]
    ldr r2, [r14, #8]
    ldr r3, [r14, #12]

    eor r0, r0, r4, ror #8
    eor r1, r1, r5, ror #8
    eor r2, r2, r6, ror #8
    pop.w {r4}
    eor r3, r3, r7, ror #8

    //write output
    str r0, [r4, #0]
    str r1, [r4, #4]
    str r2, [r4, #8]
    str r3, [r4, #12]

    //function epilogue, restore state
    pop {r4-r12,r14}
    bx lr
.size aes256_encrypt_publicinputs_asm,.-aes256_encrypt_publicinputs_asm
back to top