https://github.com/mupq/pqm4
Tip revision: 8d44b724396ddbc0db55d5de93bec252cedb9c04 authored by Matthias J. Kannwischer on 13 August 2024, 00:06:44 UTC
init msg buffer in {speed,hashing}.c (#351)
init msg buffer in {speed,hashing}.c (#351)
Tip revision: 8d44b72
keccakf1600.S
@
@ Implementation by the Keccak, Keyak and Ketje Teams, namely, Guido Bertoni,
@ Joan Daemen, Michaël Peeters, Gilles Van Assche and Ronny Van Keer, hereby
@ denoted as "the implementer".
@ Additional optimizations by Alexandre Adomnicai.
@
@ For more information, feedback or questions, please refer to our websites:
@ http://keccak.noekeon.org/
@ http://keyak.noekeon.org/
@ http://ketje.noekeon.org/
@
@ To the extent possible under law, the implementer has waived all copyright
@ and related or neighboring rights to the source code in this file.
@ http://creativecommons.org/publicdomain/zero/1.0/
@
@ WARNING: These functions work only on little-endian CPUs with the ARMv7-M architecture (ARM Cortex-M3, ...).
.thumb
.syntax unified
.text
@ Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
@
@ toBitInterleaving x0, x1, s0, s1, t, over
@ Splits the bits of the register pair x0/x1 by position parity: the
@ even-numbered bits of x0 and x1 are packed into the low and high half-word
@ of s0, the odd-numbered bits into the low and high half-word of s1.
@ - x0, x1 input registers (not modified)
@ - s0, s1 output registers. s0 must not alias x0 or x1 and s1 must not
@ alias x1, because those inputs are read again after the
@ outputs have first been written
@ - t scratch register (clobbered)
@ - over non-zero: s0/s1 are overwritten with the result
@ zero: the result is XORed into the current s0/s1
@ The closing eors updates the condition flags.
.macro toBitInterleaving x0,x1,s0,s1,t,over
@ even bits of x0: squeeze them together step by step
and \t,\x0,#0x55555555
orr \t,\t,\t, LSR #1
and \t,\t,#0x33333333
orr \t,\t,\t, LSR #2
and \t,\t,#0x0F0F0F0F
orr \t,\t,\t, LSR #4
and \t,\t,#0x00FF00FF
@ after the bfi the 16 gathered bits sit in bits 8..23 of t
bfi \t,\t,#8, #8
.if \over != 0
lsr \s0,\t, #8
.else
eor \s0,\s0,\t, LSR #8
.endif
@ even bits of x1: gathered in bits 0..15 of t, merged into the high half of s0
and \t,\x1,#0x55555555
orr \t,\t,\t, LSR #1
and \t,\t,#0x33333333
orr \t,\t,\t, LSR #2
and \t,\t,#0x0F0F0F0F
orr \t,\t,\t, LSR #4
and \t,\t,#0x00FF00FF
orr \t,\t,\t, LSR #8
eor \s0,\s0,\t, LSL #16
@ odd bits of x0: gathered in bits 16..31 of t, moved to the low half of s1
and \t,\x0,#0xAAAAAAAA
orr \t,\t,\t, LSL #1
and \t,\t,#0xCCCCCCCC
orr \t,\t,\t, LSL #2
and \t,\t,#0xF0F0F0F0
orr \t,\t,\t, LSL #4
and \t,\t,#0xFF00FF00
orr \t,\t,\t, LSL #8
.if \over != 0
lsr \s1,\t, #16
.else
eor \s1,\s1,\t, LSR #16
.endif
@ odd bits of x1: gathered in bits 16..31 of t, merged into the high half of s1
and \t,\x1,#0xAAAAAAAA
orr \t,\t,\t, LSL #1
and \t,\t,#0xCCCCCCCC
orr \t,\t,\t, LSL #2
and \t,\t,#0xF0F0F0F0
orr \t,\t,\t, LSL #4
and \t,\t,#0xFF00FF00
orr \t,\t,\t, LSL #8
@ drop the leftover low half before merging
bfc \t, #0, #16
eors \s1,\s1,\t
.endm
@ Credit: Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002
@
@ fromBitInterleaving x0, x1, t
@ In-place inverse of toBitInterleaving. On entry x0 holds the even-numbered
@ bits and x1 the odd-numbered bits of a register pair (low half-word: bits of
@ the first word, high half-word: bits of the second word). On exit x0 and x1
@ hold the first and second word with their bits back in natural order.
@ - x0, x1 registers converted in place
@ - t scratch register (clobbered)
@ The movs/eors instructions update the condition flags.
.macro fromBitInterleaving x0, x1, t
@ regroup the half-words: x0 = both halves of the first word,
@ x1 = both halves of the second word
movs \t, \x0 @ t = x0
bfi \x0, \x1, #16, #16 @ x0 = (x0 & 0x0000FFFF) | (x1 << 16)
bfc \x1, #0, #16 @ x1 = (t >> 16) | (x1 & 0xFFFF0000)
orr \x1, \x1, \t, LSR #16
@ shuffle x0: swap ever smaller bit groups (8, 4, 2, 1 bits wide)
eor \t, \x0, \x0, LSR #8 @ t = (x0 ^ (x0 >> 8)) & 0x0000FF00, then x0 = x0 ^ t ^ (t << 8)
and \t, #0x0000FF00
eors \x0, \x0, \t
eor \x0, \x0, \t, LSL #8
eor \t, \x0, \x0, LSR #4 @ t = (x0 ^ (x0 >> 4)) & 0x00F000F0, then x0 = x0 ^ t ^ (t << 4)
and \t, #0x00F000F0
eors \x0, \x0, \t
eor \x0, \x0, \t, LSL #4
eor \t, \x0, \x0, LSR #2 @ t = (x0 ^ (x0 >> 2)) & 0x0C0C0C0C, then x0 = x0 ^ t ^ (t << 2)
and \t, #0x0C0C0C0C
eors \x0, \x0, \t
eor \x0, \x0, \t, LSL #2
eor \t, \x0, \x0, LSR #1 @ t = (x0 ^ (x0 >> 1)) & 0x22222222, then x0 = x0 ^ t ^ (t << 1)
and \t, #0x22222222
eors \x0, \x0, \t
eor \x0, \x0, \t, LSL #1
@ same shuffle for x1
eor \t, \x1, \x1, LSR #8 @ t = (x1 ^ (x1 >> 8)) & 0x0000FF00, then x1 = x1 ^ t ^ (t << 8)
and \t, #0x0000FF00
eors \x1, \x1, \t
eor \x1, \x1, \t, LSL #8
eor \t, \x1, \x1, LSR #4 @ t = (x1 ^ (x1 >> 4)) & 0x00F000F0, then x1 = x1 ^ t ^ (t << 4)
and \t, #0x00F000F0
eors \x1, \x1, \t
eor \x1, \x1, \t, LSL #4
eor \t, \x1, \x1, LSR #2 @ t = (x1 ^ (x1 >> 2)) & 0x0C0C0C0C, then x1 = x1 ^ t ^ (t << 2)
and \t, #0x0C0C0C0C
eors \x1, \x1, \t
eor \x1, \x1, \t, LSL #2
eor \t, \x1, \x1, LSR #1 @ t = (x1 ^ (x1 >> 1)) & 0x22222222, then x1 = x1 ^ t ^ (t << 1)
and \t, #0x22222222
eors \x1, \x1, \t
eor \x1, \x1, \t, LSL #1
.endm
@ --- offsets in state
@ Byte offsets of the 32-bit words of the state, relative to the state pointer.
@ Every lane Axy occupies two consecutive words, Axy0 followed by Axy1. The
@ absorb/extract routines of this file keep the even-numbered bits of a lane
@ in the first word and the odd-numbered bits in the second one.
.equ Aba0, 0*4
.equ Aba1, 1*4
.equ Abe0, 2*4
.equ Abe1, 3*4
.equ Abi0, 4*4
.equ Abi1, 5*4
.equ Abo0, 6*4
.equ Abo1, 7*4
.equ Abu0, 8*4
.equ Abu1, 9*4
.equ Aga0, 10*4
.equ Aga1, 11*4
.equ Age0, 12*4
.equ Age1, 13*4
.equ Agi0, 14*4
.equ Agi1, 15*4
.equ Ago0, 16*4
.equ Ago1, 17*4
.equ Agu0, 18*4
.equ Agu1, 19*4
.equ Aka0, 20*4
.equ Aka1, 21*4
.equ Ake0, 22*4
.equ Ake1, 23*4
.equ Aki0, 24*4
.equ Aki1, 25*4
.equ Ako0, 26*4
.equ Ako1, 27*4
.equ Aku0, 28*4
.equ Aku1, 29*4
.equ Ama0, 30*4
.equ Ama1, 31*4
.equ Ame0, 32*4
.equ Ame1, 33*4
.equ Ami0, 34*4
.equ Ami1, 35*4
.equ Amo0, 36*4
.equ Amo1, 37*4
.equ Amu0, 38*4
.equ Amu1, 39*4
.equ Asa0, 40*4
.equ Asa1, 41*4
.equ Ase0, 42*4
.equ Ase1, 43*4
.equ Asi0, 44*4
.equ Asi1, 45*4
.equ Aso0, 46*4
.equ Aso1, 47*4
.equ Asu0, 48*4
.equ Asu1, 49*4
@ --- offsets on stack
@ Byte offsets into the scratch frame that KeccakF1600_StatePermute reserves
@ on the stack. The first five slots receive Theta values that the round
@ macros spill and reload, mRC holds the pointer to the current round
@ constants, and mSize is the size of the whole frame.
.equ mDa0, 0*4
.equ mDa1, 1*4
.equ mDo0, 2*4
.equ mDo1, 3*4
.equ mDi0, 4*4
.equ mRC , 5*4
.equ mSize, 6*4
/******************************************************************************
* eorror dst, src1, src2, rot1, rot2
*
* Exclusive-OR of two registers that carry different pending rotations.
* src2 is rotated right by (rot1 - rot2) modulo 32 with the barrel shifter so
* that it lines up with src1, then XORed with src1. The result is therefore
* aligned like src1.
* - dst destination register
* - src1, src2 source registers
* - rot1, rot2 rotation values attached to src1 and src2
*****************************************************************************/
.macro eorror dst, src1, src2, rot1, rot2
    .if \rot1 < \rot2
        @ negative difference: wrap it into the 0..31 range
        eor \dst, \src1, \src2, ror 32+\rot1-\rot2
    .else
        eor \dst, \src1, \src2, ror \rot1-\rot2
    .endif
.endm
/******************************************************************************
* bicror dst, src1, src2, rot1, rot2
*
* Bit clear (src1 AND NOT src2) of two registers that carry different pending
* rotations. src2 is rotated right by (rot1 - rot2) modulo 32 with the barrel
* shifter so that it lines up with src1 before its bits are cleared from
* src1. The result is therefore aligned like src1.
* - dst destination register
* - src1, src2 source registers
* - rot1, rot2 rotation values attached to src1 and src2
*****************************************************************************/
.macro bicror dst, src1, src2, rot1, rot2
    .if \rot1 < \rot2
        @ negative difference: wrap it into the 0..31 range
        bic \dst, \src1, \src2, ror 32+\rot1-\rot2
    .else
        bic \dst, \src1, \src2, ror \rot1-\rot2
    .endif
.endm
/******************************************************************************
* Load 5 words from the state (base address in r0) and XOR them all together.
* It is used to compute the parity columns for the Theta step.
* Note that all operands may be misaligned (i.e. rotated by a certain amount
* of bits), as well as the result: words 2 to 5 are each rotated right by
* (rot1 - rotN) modulo 32 (see eorror) before being XORed into word 1.
* - dst destination register, must not be r1, r5, r11 or r12
* - src1-src5 byte offsets of the five words, relative to r0
* - rot1-rot5 rotation values of the five words
* Clobbers r1, r5, r11 and r12.
*****************************************************************************/
.macro xor5 dst, src1, src2, src3, src4, src5, rot1, rot2, rot3, rot4, rot5
@ all five loads are issued before the first XOR
ldr.w \dst, [r0, #\src1]
ldr.w r1, [r0, #\src2]
ldr.w r5, [r0, #\src3]
ldr r11, [r0, #\src4]
ldr r12, [r0, #\src5]
eorror \dst, \dst, r1, \rot1, \rot2
eorror \dst, \dst, r5, \rot1, \rot3
eorror \dst, \dst, r11, \rot1, \rot4
eorror \dst, \dst, r12, \rot1, \rot5
.endm
/******************************************************************************
* Same as xor5, except that a previous result is stored to memory right after
* the loads. This allows to have the str instruction for free.
* - dst destination register, must not be r1, r5, r11 or r12
* - src1-src5 byte offsets of the five words, relative to r0
* - rot1-rot5 rotation values of the five words
* - strreg register from previous calculations to be stored in memory
* - stradr register holding the base address for the store (the callers
* in this file pass either sp or r0)
* - strofs byte offset added to stradr for the str instruction
* Clobbers r1, r5, r11 and r12.
*****************************************************************************/
.macro xor5str dst, src1, src2, src3, src4, src5, rot1, rot2, rot3, rot4, rot5, strreg, stradr, strofs
ldr.w \dst, [r0, #\src1]
ldr.w r1, [r0, #\src2]
ldr.w r5, [r0, #\src3]
ldr r11, [r0, #\src4]
ldr r12, [r0, #\src5]
@ store placed between the loads and the first XOR
str.w \strreg, [\stradr, #\strofs]
eorror \dst, \dst, r1, \rot1, \rot2
eorror \dst, \dst, r5, \rot1, \rot3
eorror \dst, \dst, r11, \rot1, \rot4
eorror \dst, \dst, r12, \rot1, \rot5
.endm
/******************************************************************************
* Exclusive-OR where the 2nd operand is rotated by 1 bit to the left.
* The instruction emitted rotates src2 right by rot-1, so rot = 32 gives a
* plain rotation by 1 bit to the left (right by 31).
* - dst destination register
* - src1-src2 source registers
* - rot differential rotation btw src1 & src2 (i.e. rot=rot1-rot2),
* expected in the range 1..32 so that rot-1 is a valid amount
*****************************************************************************/
.macro xorrol dst, src1, src2, rot
eor \dst, \src1, \src2, ror \rot-1
.endm
/******************************************************************************
* Bitslice implementation of the Chi step with misaligned operands.
* Computes src1 ^ (~src2 & src3), with src1 and src2 brought in line with src3
* by the barrel shifter, and stores the result in the state. The result is left
* aligned like src3, i.e. the rotation rot3 is not applied here.
* - resofs memory offset within the internal state to store the result
* - src1-src3 source registers
* - rot1-rot3 rotation values
* Clobbers r1, which holds the stored value on exit.
*****************************************************************************/
.macro xandnotlazystr resofs, src1, src2, src3, rot1, rot2, rot3
bicror r1, \src3, \src2, \rot3, \rot2
eorror r1, r1, \src1, \rot3, \rot1
str.w r1, [r0, #\resofs]
.endm
/******************************************************************************
* Same as xandnotlazystr but without the str instruction which will be carried
* out later in order to take advantage of future ldr instructions.
* The result is left in r1.
* - src1-src3 source registers
* - rot1-rot3 rotation values
*****************************************************************************/
.macro xandnotlazy src1, src2, src3, rot1, rot2, rot3
bicror r1, \src3, \src2, \rot3, \rot2
eorror r1, r1, \src1, \rot3, \rot1
.endm
/******************************************************************************
* Same as xandnotlazystr with an additional rotation in order to explicitly
* compute the Rho step. It is useful in KeccakRound3 in order to return to the
* classical representation every 4 rounds.
* - resofs memory offset within the internal state to store the result
* - src1-src3 source registers
* - rot1-rot3 rotation values
* Clobbers r1, which holds the stored value on exit.
*****************************************************************************/
.macro xandnotstr resofs, src1, src2, src3, rot1, rot2, rot3
bicror r1, \src3, \src2, \rot3, \rot2
eorror r1, r1, \src1, \rot3, \rot1
.if \rot3 > 0
@ rotate right by 32-rot3, i.e. left by rot3
ror r1, r1, #32-\rot3
.endif
str.w r1, [r0, #\resofs]
.endm
/******************************************************************************
* Same as xandnotstr but without the str instruction which will be carried
* out later in order to take advantage of future ldr instructions.
* The result is left in r1.
* - src1-src3 source registers
* - rot1-rot3 rotation values
*****************************************************************************/
.macro xandnot src1, src2, src3, rot1, rot2, rot3
bicror r1, \src3, \src2, \rot3, \rot2
eorror r1, r1, \src1, \rot3, \rot1
.if \rot3 > 0
@ rotate right by 32-rot3, i.e. left by rot3
ror r1, r1, #32-\rot3
.endif
.endm
/******************************************************************************
* Chi for the first word of a half-plane followed by the Iota step. Note that
* the source registers are not specified since they are always r3, r4 and r5.
* Computes out = RC ^ r3 ^ rol(r5 & ~ror(r4, rot3-rot2), rot3), where RC is the
* word found at offset rcofs from the round-constant pointer kept in the stack
* slot mRC.
* - out output reg (useful to store the result in the next round)
* - rot3, rot2 rotation values of r5 and r4 (note the parameter order)
* - rcofs memory offset to load the round constant
* - last Boolean to indicate whether it is the last round of the
* quadruple round routine. If 1, the round-constant pointer is
* advanced by 32 bytes and written back to mRC, and the word it
* then points to is loaded into r7 and compared with 0xFF. No
* later instruction of this macro sets the flags, so the caller
* can branch on the outcome of that comparison.
* Clobbers r1, r3, r4 and r5, plus r7 and the flags when last == 1.
*****************************************************************************/
.macro xandnotiota out, rot3, rot2, rcofs, last
bicror r5, r5, r4, \rot3, \rot2
@ r4 is free from here on and receives the round constant
ldr r1, [sp, #mRC]
ldr r4, [r1, #\rcofs]
.if \last == 1
@ pre-indexed load with writeback: r1 += 32, then r7 = [r1]
ldr r7, [r1, #32]!
str r1, [sp, #mRC]
cmp r7, #0xFF
.endif
.if \rot3 > 0
eor r3, r3, r5, ror 32-\rot3
.else
eor.w r3, r3, r5
.endif
eor.w \out, r4, r3
.endm
/******************************************************************************
* addparity par1, dly1, par2, dly2, par3, dly3, par4, dly4, par5, dly5
*
* XORs one parity register into each of the state registers r3-r7. A state
* register whose rotation from the previous round is still pending (non-zero
* delay) is rotated left by that delay on the fly: the barrel shifter rotates
* it right by 32 minus the delay.
* - par1-par5 registers containing the parity bits for r3, r4, r5, r6, r7
* - dly1-dly5 rotation values to compute the (delayed) Rho step
*****************************************************************************/
.macro addparity par1, dly1, par2, dly2, par3, dly3, par4, dly4, par5, dly5
    @ r3
    .if \dly1 <= 0
        eor.w r3, \par1, r3
    .else
        eor r3, \par1, r3, ror 32-\dly1
    .endif
    @ r4
    .if \dly2 <= 0
        eor.w r4, \par2, r4
    .else
        eor r4, \par2, r4, ror 32-\dly2
    .endif
    @ r5
    .if \dly3 <= 0
        eor.w r5, \par3, r5
    .else
        eor r5, \par3, r5, ror 32-\dly3
    .endif
    @ r6
    .if \dly4 <= 0
        eor.w r6, \par4, r6
    .else
        eor r6, \par4, r6, ror 32-\dly4
    .endif
    @ r7
    .if \dly5 <= 0
        eor.w r7, \par5, r7
    .else
        eor r7, \par5, r7, ror 32-\dly5
    .endif
.endm
/******************************************************************************
* Apply Theta, Pi, Chi and Iota steps to half a plane (i.e. 5 32-bit words) of
* the internal state.
* Note that the Rho step is calculated if and only if \lazy == 0, otherwise it
* is delayed until the next round using ''lazy reductions'' thanks to the
* inline barrel shifter.
* The five words are loaded into r3-r7. Results 2 to 5 are written back to the
* offsets src2-src5 they were loaded from, and the first result (the one that
* receives the round constant) is left in the register reg.
* - src1-src5 byte offsets (relative to r0) of the five words
* - par1-par5 registers containing the parity bits
* - rot2-rot5 rotation values to compute the current Rho step (the first
* word has no rotation parameter, 0 is used for it)
* - dly1-dly5 rotation values to compute the delayed Rho step
* - ofs offset of the round constant, passed on to xandnotiota
* - last passed on to xandnotiota (see there)
* - lazy Boolean to indicate whether lazy rotations are used or not
* - strofs state offset (relative to r0) at which r1, holding a result
* of the preceding macro, is stored after the loads
* - reg output reg related to the Iota step (to be stored later)
* Clobbers r1 and r3-r7.
*****************************************************************************/
.macro KeccakThetaRhoPiChiIota src1, par1, dly1, \
src2, par2, rot2, dly2, \
src3, par3, rot3, dly3, \
src4, par4, rot4, dly4, \
src5, par5, rot5, dly5, \
ofs, last, lazy, strofs, reg
ldr.w r3, [r0, #\src1]
ldr r4, [r0, #\src2]
ldr r5, [r0, #\src3]
ldr r6, [r0, #\src4]
ldr r7, [r0, #\src5]
@ pending result of the preceding macro, stored once the loads are issued
str.w r1, [r0, #\strofs]
addparity \par1, \dly1, \par2, \dly2, \par3, \dly3, \par4, \dly4, \par5, \dly5
.if \lazy == 1
xandnotlazystr \src2, r4, r5, r6, \rot2, \rot3, \rot4
xandnotlazystr \src3, r5, r6, r7, \rot3, \rot4, \rot5
xandnotlazystr \src4, r6, r7, r3, \rot4, \rot5, 0
xandnotlazystr \src5, r7, r3, r4, \rot5, 0, \rot2
.else
xandnotstr \src2, r4, r5, r6, \rot2, \rot3, \rot4
xandnotstr \src3, r5, r6, r7, \rot3, \rot4, \rot5
xandnotstr \src4, r6, r7, r3, \rot4, \rot5, 0
xandnotstr \src5, r7, r3, r4, \rot5, 0, \rot2
.endif
@ first word last: it overwrites r3-r5, which the lines above still need
xandnotiota \reg, \rot3, \rot2, \ofs, \last
.endm
/******************************************************************************
* Apply Theta, Pi, and Chi steps to half a plane (i.e. 5 32-bit words) of the
* internal state.
* Note that the Rho step is calculated if and only if \lazy == 0, otherwise it
* is delayed until the next round using ''lazy reductions'' thanks to the
* inline barrel shifter.
* The five words are loaded into r3-r7. Results 1 to 4 are stored at dst1-dst4;
* the fifth result is only left in r1, to be stored by the macro that follows
* (through its strofs argument).
* - src1-src5 byte offsets (relative to r0) of the five input words
* - dst1-dst5 memory offsets to store the output registers (dst5 is not
* referenced by this macro, see above)
* - par1-par5 registers containing the parity bits
* - rot1-rot5 rotation values to compute the current Rho step
* - dly1-dly5 rotation values to compute the delayed Rho step
* - lazy Boolean to indicate whether lazy rotations are used or not
* - strofs state offset (relative to r0) at which r1, the last output of
* the preceding macro, is stored after the loads
* Clobbers r1 and r3-r7.
*****************************************************************************/
.macro KeccakThetaRhoPiChi src1, dst1, par1, rot1, dly1, \
src2, dst2, par2, rot2, dly2, \
src3, dst3, par3, rot3, dly3, \
src4, dst4, par4, rot4, dly4, \
src5, dst5, par5, rot5, dly5, \
lazy, strofs
ldr.w r3, [r0, #\src1]
ldr.w r4, [r0, #\src2]
ldr.w r5, [r0, #\src3]
ldr.w r6, [r0, #\src4]
ldr.w r7, [r0, #\src5]
@ pending result of the preceding macro, stored once the loads are issued
str.w r1, [r0, #\strofs]
addparity \par1, \dly1, \par2, \dly2, \par3, \dly3, \par4, \dly4, \par5, \dly5
.if \lazy == 1
xandnotlazystr \dst1, r3, r4, r5, \rot1, \rot2, \rot3
xandnotlazystr \dst2, r4, r5, r6, \rot2, \rot3, \rot4
xandnotlazystr \dst3, r5, r6, r7, \rot3, \rot4, \rot5
xandnotlazystr \dst4, r6, r7, r3, \rot4, \rot5, \rot1
@ fifth result stays in r1
xandnotlazy r7, r3, r4, \rot5, \rot1, \rot2
.else
xandnotstr \dst1, r3, r4, r5, \rot1, \rot2, \rot3
xandnotstr \dst2, r4, r5, r6, \rot2, \rot3, \rot4
xandnotstr \dst3, r5, r6, r7, \rot3, \rot4, \rot5
xandnotstr \dst4, r6, r7, r3, \rot4, \rot5, \rot1
@ fifth result stays in r1
xandnot r7, r3, r4, \rot5, \rot1, \rot2
.endif
.endm
/******************************************************************************
* 1st round of the 4 unrolled rounds routine due to in-place processing.
* At the beginning of such rounds, the internal state is expected to match the
* classical representation (i.e. without transition and no delayed Rho step).
* Expects the state pointer in r0 and the stack frame described by the
* mDa0..mRC offsets at sp. Uses r1-r12 and r14 as working registers.
* On exit one result word is still held in r14; the first xor5str of
* KeccakRound1 stores it at Aba1.
*****************************************************************************/
.macro KeccakRound0
@ Theta: XOR the five words of each column together (all rotation values are
@ 0 in this round) and combine the column parities pairwise. Combined values
@ that are not kept in a register are spilled to the stack slots mDa0, mDo1,
@ mDi0, mDa1 and mDo0.
xor5 r3, Abu0, Agu0, Aku0, Amu0, Asu0, 0, 0, 0, 0, 0
xor5 r7, Abe1, Age1, Ake1, Ame1, Ase1, 0, 0, 0, 0, 0
xorrol r6, r3, r7, 32
xor5str r4, Abi1, Agi1, Aki1, Ami1, Asi1, 0, 0, 0, 0, 0, r6, sp, mDa0
eor.w r6, r3, r4
xor5str r3, Abo0, Ago0, Ako0, Amo0, Aso0, 0, 0, 0, 0, 0, r6, sp, mDo1
eor.w r2, r7, r3
xor5 r7, Aba0, Aga0, Aka0, Ama0, Asa0, 0, 0, 0, 0, 0
xorrol r10, r7, r4, 32
xor5 r4, Abo1, Ago1, Ako1, Amo1, Aso1, 0, 0, 0, 0, 0
eor r14, r4, r7
xor5 r7, Abe0, Age0, Ake0, Ame0, Ase0, 0, 0, 0, 0, 0
xorrol r6, r7, r4, 32
xor5str r4, Abu1, Agu1, Aku1, Amu1, Asu1, 0, 0, 0, 0, 0, r6, sp, mDi0
eor.w r8, r4, r7
xor5str r7, Abi0, Agi0, Aki0, Ami0, Asi0, 0, 0, 0, 0, 0, r8, sp, mDa1
xorrol r9, r7, r4, 32
xor5str r4, Aba1, Aga1, Aka1, Ama1, Asa1, 0, 0, 0, 0, 0, r9, sp, mDo0
eor r11, r4, r7
xorrol r12, r3, r4, 32
@ Remaining steps, five words at a time, with lazy rotations (lazy = 1).
@ Each invocation first stores the result the previous one left in r1. For
@ the very first invocation that store goes to Aka1, a word the same
@ invocation overwrites afterwards with its own first result.
KeccakThetaRhoPiChi Abo0, Aka1, r9, 14, 0, \
Agu0, Ame1, r12, 10, 0, \
Aka1, Asi1, r8, 2, 0, \
Ame1, Abo0, r11, 23, 0, \
Asi1, Agu0, r2, 31, 0, \
1, Aka1
KeccakThetaRhoPiChi Abe0, Asa1, r10, 0, 0, \
Agi1, Abe0, r2, 3, 0, \
Ako0, Agi1, r9, 12, 0, \
Amu1, Ako0, r14, 4, 0, \
Asa1, Amu1, r8, 9, 0, \
1, Agu0
@ reload the value spilled to mDa0
ldr r8, [sp, #mDa0]
KeccakThetaRhoPiChi Abu1, Aga0, r14, 14, 0, \
Aga0, Ake0, r8, 18, 0, \
Ake0, Ami1, r10, 5, 0, \
Ami1, Aso0, r2, 8, 0, \
Aso0, Abu1, r9, 28, 0, \
1, Amu1
KeccakThetaRhoPiChi Abi1, Ama0, r2, 31, 0, \
Ago0, Ase1, r9, 27, 0, \
Aku0, Abi1, r12, 19, 0, \
Ama0, Ago0, r8, 20, 0, \
Ase1, Aku0, r11, 1, 0, \
1, Abu1
@ reload the value spilled to mDo1
ldr r9, [sp, #mDo1]
@ half-plane that receives the round constant at offset 0; result left in r1
KeccakThetaRhoPiChiIota Aba0, r8, 0, \
Age0, r10, 22, 0, \
Aki1, r2, 22, 0, \
Amo1, r9, 11, 0, \
Asu0, r12, 7, 0, \
0, 0, 1, Aku0, r1
@ reload the value spilled to mDi0
ldr.w r2, [sp, #mDi0]
KeccakThetaRhoPiChi Abo1, Aka0, r9, 14, 0, \
Agu1, Ame0, r14, 10, 0, \
Aka0, Asi0, r8, 1, 0, \
Ame0, Abo1, r10, 22, 0, \
Asi0, Agu1, r2, 30, 0, \
1, Aba0
KeccakThetaRhoPiChi Abe1, Asa0, r11, 1, 0, \
Agi0, Abe1, r2, 3, 0, \
Ako1, Agi0, r9, 13, 0, \
Amu0, Ako1, r12, 4, 0, \
Asa0, Amu0, r8, 9, 0, \
1, Agu1
@ reload the value spilled to mDa1
ldr r8, [sp, #mDa1]
KeccakThetaRhoPiChi Abu0, Aga1, r12, 13, 0, \
Aga1, Ake1, r8, 18, 0, \
Ake1, Ami0, r11, 5, 0, \
Ami0, Aso1, r2, 7, 0, \
Aso1, Abu0, r9, 28, 0, \
1, Amu0
KeccakThetaRhoPiChi Abi0, Ama1, r2, 31, 0, \
Ago1, Ase0, r9, 28, 0, \
Aku1, Abi0, r14, 20, 0, \
Ama1, Ago1, r8, 21, 0, \
Ase0, Aku1, r10, 1, 0, \
1, Abu0
@ reload the value spilled to mDo0
ldr r9, [sp, #mDo0]
@ half-plane that receives the round constant at offset 4; result left in r14
KeccakThetaRhoPiChiIota Aba1, r8, 0, \
Age1, r11, 22, 0, \
Aki0, r2, 21, 0, \
Amo0, r9, 10, 0, \
Asu1, r14, 7, 0, \
4, 0, 1, Aku1, r14
.endm
/******************************************************************************
* 2nd round of the 4 unrolled rounds routine due to in-place processing.
* Expects the state pointer in r0, the stack frame described by the mDa0..mRC
* offsets at sp, and in r14 the result word the previous round left pending.
* The state words still carry the rotations delayed by the previous round,
* hence the non-zero rotation/delay arguments below.
* Uses r1-r12 and r14 as working registers. On exit one result word is still
* held in r14; the first xor5str of KeccakRound2 stores it at Aba1.
*****************************************************************************/
.macro KeccakRound1
@ Theta: column parities of the misaligned words. The first xor5str also
@ stores the pending r14 at Aba1. Combined values that are not kept in a
@ register are spilled to mDa0, mDo1, mDi0, mDa1 and mDo0.
xor5str r3, Asu0, Agu0, Amu0, Abu1, Aku1, 22, 10, 3, 18, 28, r14, r0, Aba1
xor5 r7, Age1, Ame0, Abe0, Ake1, Ase1, 10, 22, 4, 7, 20
ror r3, 32-22
xorrol r6, r3, r7, 32-10
xor5str r4, Aki0, Asi0, Agi1, Ami0, Abi1, 7, 30, 9, 28, 1, r6, sp, mDa0
eor r6, r3, r4, ror 32-7
xor5str r3, Amo1, Abo0, Ako1, Aso0, Ago1, 0, 14, 1, 14, 31, r6, sp, mDo1
eor r2, r3, r7, ror 32-10
xor5 r7, Aba0, Aka1, Asa0, Aga0, Ama1, 0, 2, 13, 5, 20
xorrol r10, r7, r4, 32-7
xor5 r4, Amo0, Abo1, Ako0, Aso1, Ago0, 0, 14, 0, 13, 31
eor r14, r4, r7
xor5 r7, Age0, Ame1, Abe1, Ake0, Ase0, 11, 23, 4, 8, 21
ror r7, 32-11
xorrol r6, r7, r4, 32
xor5str r4, Asu1, Agu1, Amu1, Abu0, Aku0, 22, 10, 3, 18, 27, r6, sp, mDi0
eor r8, r7, r4, ror 32-22
xor5str r7, Aki1, Asi1, Agi0, Ami1, Abi0, 7, 31, 9, 28, 1, r8, sp, mDa1
ror r7, 32-7
xorrol r9, r7, r4, 32-22
xor5str r4, Aba1, Aka0, Asa1, Aga1, Ama0, 0, 1, 12, 5, 19, r9, sp, mDo0
eor r11, r4, r7
xorrol r12, r3, r4, 32
@ Remaining steps, five words at a time, with lazy rotations (lazy = 1).
@ Each invocation first stores the result the previous one left in r1. For
@ the very first invocation that store goes to Asa1, a word the same
@ invocation overwrites afterwards with its own first result.
KeccakThetaRhoPiChi Amo1, Asa1, r9, 14, 0, \
Agu0, Ake1, r12, 10, 10, \
Asa1, Abi1, r8, 2, 12, \
Ake1, Amo1, r11, 23, 7, \
Abi1, Agu0, r2, 31, 1, \
1, Asa1
KeccakThetaRhoPiChi Age0, Ama0, r10, 0, 11, \
Asi0, Age0, r2, 3, 30, \
Ako1, Asi0, r9, 12, 1, \
Abu0, Ako1, r14, 4, 18, \
Ama0, Abu0, r8, 9, 19, \
1, Agu0
@ reload the value spilled to mDa0
ldr r8, [sp, #mDa0]
KeccakThetaRhoPiChi Asu1, Aka1, r14, 14, 22, \
Aka1, Abe1, r8, 18, 2, \
Abe1, Ami0, r10, 5, 4, \
Ami0, Ago1, r2, 8, 28, \
Ago1, Asu1, r9, 28, 31, \
1, Abu0
KeccakThetaRhoPiChi Aki0, Aga0, r2, 31, 7, \
Abo0, Ase1, r9, 27, 14, \
Amu0, Aki0, r12, 19, 3, \
Aga0, Abo0, r8, 20, 5, \
Ase1, Amu0, r11, 1, 20, \
1, Asu1
@ reload the value spilled to mDo1
ldr r9, [sp, #mDo1]
@ half-plane that receives the round constant at offset 8; result left in r1
KeccakThetaRhoPiChiIota Aba0, r8, 0, \
Ame1, r10, 22, 23, \
Agi1, r2, 22, 9, \
Aso1, r9, 11, 13, \
Aku1, r12, 7, 28, \
8, 0, 1, Amu0, r1
@ reload the value spilled to mDi0
ldr.w r2, [sp, #mDi0]
KeccakThetaRhoPiChi Amo0, Asa0, r9, 14, 0, \
Agu1, Ake0, r14, 10, 10, \
Asa0, Abi0, r8, 1, 13, \
Ake0, Amo0, r10, 22, 8, \
Abi0, Agu1, r2, 30, 1, \
1, Aba0
KeccakThetaRhoPiChi Age1, Ama1, r11, 1, 10, \
Asi1, Age1, r2, 3, 31, \
Ako0, Asi1, r9, 13, 0, \
Abu1, Ako0, r12, 4, 18, \
Ama1, Abu1, r8, 9, 20, \
1, Agu1
@ reload the value spilled to mDa1
ldr r8, [sp, #mDa1]
KeccakThetaRhoPiChi Asu0, Aka0, r12, 13, 22, \
Aka0, Abe0, r8, 18, 1, \
Abe0, Ami1, r11, 5, 4, \
Ami1, Ago0, r2, 7, 28, \
Ago0, Asu0, r9, 28, 31, \
1, Abu1
KeccakThetaRhoPiChi Aki1, Aga1, r2, 31, 7, \
Abo1, Ase0, r9, 28, 14, \
Amu1, Aki1, r14, 20, 3, \
Aga1, Abo1, r8, 21, 5, \
Ase0, Amu1, r10, 1, 21, \
1, Asu0
@ reload the value spilled to mDo0
ldr r9, [sp, #mDo0]
@ half-plane that receives the round constant at offset 12; result left in r14
KeccakThetaRhoPiChiIota Aba1, r8, 0, \
Ame0, r11, 22, 22, \
Agi0, r2, 21, 9, \
Aso0, r9, 10, 14, \
Aku0, r14, 7, 27, \
12, 0, 1, Amu1, r14
.endm
/******************************************************************************
* 3rd round of the 4 unrolled rounds routine due to in-place processing.
* Expects the state pointer in r0, the stack frame described by the mDa0..mRC
* offsets at sp, and in r14 the result word the previous round left pending.
* The state words still carry the rotations delayed by the previous round,
* hence the non-zero rotation/delay arguments below.
* Uses r1-r12 and r14 as working registers. On exit one result word is still
* held in r14; the first xor5str of KeccakRound3 stores it at Aba1.
*****************************************************************************/
.macro KeccakRound2
@ Theta: column parities of the misaligned words. The first xor5str also
@ stores the pending r14 at Aba1. Combined values that are not kept in a
@ register are spilled to mDa0, mDo1, mDi0, mDa1 and mDo0.
xor5str r3, Aku1, Agu0, Abu1, Asu1, Amu1, 22, 10, 3, 18, 28, r14, r0, Aba1
xor5 r7, Ame0, Ake0, Age0, Abe0, Ase1, 10, 22, 4, 7, 20
ror r3, 32-22
xorrol r6, r3, r7, 32-10
xor5str r4, Agi0, Abi0, Asi0, Ami1, Aki0, 7, 30, 9, 28, 1, r6, sp, mDa0
eor r6, r3, r4, ror 32-7
xor5str r3, Aso1, Amo1, Ako0, Ago1, Abo1, 0, 14, 1, 14, 31, r6, sp, mDo1
eor r2, r3, r7, ror 32-10
xor5 r7, Aba0, Asa1, Ama1, Aka1, Aga1, 0, 2, 13, 5, 20
xorrol r10, r7, r4, 32-7
xor5 r4, Aso0, Amo0, Ako1, Ago0, Abo0, 0, 14, 0, 13, 31
eor r14, r4, r7
xor5 r7, Ame1, Ake1, Age1, Abe1, Ase0, 11, 23, 4, 8, 21
ror r7, 32-11
xorrol r6, r7, r4, 32
xor5str r4, Aku0, Agu1, Abu0, Asu0, Amu0, 22, 10, 3, 18, 27, r6, sp, mDi0
eor r8, r7, r4, ror 32-22
xor5str r7, Agi1, Abi1, Asi1, Ami0, Aki1, 7, 31, 9, 28, 1, r8, sp, mDa1
ror r7, 32-7
xorrol r9, r7, r4, 32-22
xor5str r4, Aba1, Asa0, Ama0, Aka0, Aga0, 0, 1, 12, 5, 19, r9, sp, mDo0
eor r11, r4, r7
xorrol r12, r3, r4, 32
@ Remaining steps, five words at a time, with lazy rotations (lazy = 1).
@ Each invocation first stores the result the previous one left in r1. For
@ the very first invocation that store goes to Ama0, a word the same
@ invocation overwrites afterwards with its own first result.
KeccakThetaRhoPiChi Aso1, Ama0, r9, 14, 0, \
Agu0, Abe0, r12, 10, 10, \
Ama0, Aki0, r8, 2, 12, \
Abe0, Aso1, r11, 23, 7, \
Aki0, Agu0, r2, 31, 1, \
1, Ama0
KeccakThetaRhoPiChi Ame1, Aga0, r10, 0, 11, \
Abi0, Ame1, r2, 3, 30, \
Ako0, Abi0, r9, 12, 1, \
Asu0, Ako0, r14, 4, 18, \
Aga0, Asu0, r8, 9, 19, \
1, Agu0
@ reload the value spilled to mDa0
ldr r8, [sp, #mDa0]
KeccakThetaRhoPiChi Aku0, Asa1, r14, 14, 22, \
Asa1, Age1, r8, 18, 2, \
Age1, Ami1, r10, 5, 4, \
Ami1, Abo1, r2, 8, 28, \
Abo1, Aku0, r9, 28, 31, \
1, Asu0
KeccakThetaRhoPiChi Agi0, Aka1, r2, 31, 7, \
Amo1, Ase1, r9, 27, 14, \
Abu1, Agi0, r12, 19, 3, \
Aka1, Amo1, r8, 20, 5, \
Ase1, Abu1, r11, 1, 20, \
1, Aku0
@ reload the value spilled to mDo1
ldr r9, [sp, #mDo1]
@ half-plane that receives the round constant at offset 16; result left in r1
KeccakThetaRhoPiChiIota Aba0, r8, 0, \
Ake1, r10,22, 23, \
Asi0, r2, 22, 9, \
Ago0, r9, 11, 13, \
Amu1, r12, 7, 28, \
16, 0, 1, Abu1, r1
@ reload the value spilled to mDi0
ldr.w r2, [sp, #mDi0]
KeccakThetaRhoPiChi Aso0, Ama1, r9, 14, 0, \
Agu1, Abe1, r14, 10, 10, \
Ama1, Aki1, r8, 1, 13, \
Abe1, Aso0, r10, 22, 8, \
Aki1, Agu1, r2, 30, 1, \
1, Aba0
KeccakThetaRhoPiChi Ame0, Aga1, r11, 1, 10, \
Abi1, Ame0, r2, 3, 31, \
Ako1, Abi1, r9, 13, 0, \
Asu1, Ako1, r12, 4, 18, \
Aga1, Asu1, r8, 9, 20, \
1, Agu1
@ reload the value spilled to mDa1
ldr r8, [sp, #mDa1]
KeccakThetaRhoPiChi Aku1, Asa0, r12, 13, 22, \
Asa0, Age0, r8, 18, 1, \
Age0, Ami0, r11, 5, 4, \
Ami0, Abo0, r2, 7, 28, \
Abo0, Aku1, r9, 28, 31, \
1, Asu1
KeccakThetaRhoPiChi Agi1, Aka0, r2, 31, 7, \
Amo0, Ase0, r9, 28, 14, \
Abu0, Agi1, r14, 20, 3, \
Aka0, Amo0, r8, 21, 5, \
Ase0, Abu0, r10, 1, 21, \
1, Aku1
@ reload the value spilled to mDo0
ldr r9, [sp, #mDo0]
@ half-plane that receives the round constant at offset 20; result left in r14
KeccakThetaRhoPiChiIota Aba1, r8, 0, \
Ake0, r11, 22, 22, \
Asi1, r2, 21, 9, \
Ago1, r9, 10, 14, \
Amu0, r14, 7, 27, \
20, 0, 1, Abu0, r14
.endm
/******************************************************************************
* 4th round of the 4 unrolled rounds routine due to in-place processing.
* Note that the Rho step is *not* delayed so that the internal state is
* compliant w/ the classical representation at the end of the routine.
* Expects the state pointer in r0, the stack frame described by the mDa0..mRC
* offsets at sp, and in r14 the result word the previous round left pending.
* Uses r1-r12 and r14 as working registers.
* The last KeccakThetaRhoPiChiIota is invoked with last = 1: it advances the
* round-constant pointer and compares the next table word with 0xFF. The
* flags of that comparison are still valid at the end of this macro, because
* the closing str.w does not change them.
*****************************************************************************/
.macro KeccakRound3
@ Theta: column parities of the misaligned words. The first xor5str also
@ stores the pending r14 at Aba1. Combined values that are not kept in a
@ register are spilled to mDa0, mDo1, mDi0, mDa1 and mDo0.
xor5str r3, Amu1, Agu0, Asu1, Aku0, Abu0, 22, 10, 3, 18, 28, r14, r0, Aba1
xor5 r7, Ake0, Abe1, Ame1, Age0, Ase1, 10, 22, 4, 7, 20
ror r3, 32-22
xorrol r6, r3, r7, 32-10
xor5str r4, Asi1, Aki1, Abi0, Ami0, Agi0, 7, 30, 9, 28, 1, r6, sp, mDa0
eor r6, r3, r4, ror 32-7
xor5str r3, Ago0, Aso1, Ako1, Abo1, Amo0, 0, 14, 1, 14, 31, r6, sp, mDo1
eor r2, r3, r7, ror 32-10
xor5 r7, Aba0, Ama0, Aga1, Asa1, Aka0, 0, 2, 13, 5, 20
xorrol r10, r7, r4, 32-7
xor5 r4, Ago1, Aso0, Ako0, Abo0, Amo1, 0, 14, 0, 13, 31
eor r14, r4, r7
xor5 r7, Ake1, Abe0, Ame0, Age1, Ase0, 11, 23, 4, 8, 21
ror r7, #32-11
xorrol r6, r7, r4, 32
xor5str r4, Amu0, Agu1, Asu0, Aku1, Abu1, 22, 10, 3, 18, 27, r6, sp, mDi0
eor r8, r7, r4, ror 32-22
xor5str r7, Asi0, Aki0, Abi1, Ami1, Agi1, 7, 31, 9, 28, 1, r8, sp, mDa1
ror r7, 32-7
xorrol r9, r7, r4, 32-22
xor5str r4, Aba1, Ama1, Aga0, Asa0, Aka1, 0, 1, 12, 5, 19, r9, sp, mDo0
eor r11, r4, r7
xorrol r12, r3, r4, 32
@ Remaining steps, five words at a time, with the rotations applied
@ explicitly (lazy = 0). Each invocation first stores the result the previous
@ one left in r1. For the very first invocation that store goes to Aga0, a
@ word the same invocation overwrites afterwards with its own first result.
KeccakThetaRhoPiChi Ago0, Aga0, r9, 14, 0, \
Agu0, Age0, r12, 10, 10, \
Aga0, Agi0, r8, 2, 12, \
Age0, Ago0, r11, 23, 7, \
Agi0, Agu0, r2, 31, 1, \
0, Aga0
KeccakThetaRhoPiChi Ake1, Aka1, r10, 0, 11, \
Aki1, Ake1, r2, 3, 30, \
Ako1, Aki1, r9, 12, 1, \
Aku1, Ako1, r14, 4, 18, \
Aka1, Aku1, r8, 9, 19, \
0, Agu0
@ reload the value spilled to mDa0
ldr r8, [sp, #mDa0]
KeccakThetaRhoPiChi Amu0, Ama0, r14, 14, 22, \
Ama0, Ame0, r8, 18, 2, \
Ame0, Ami0, r10, 5, 4, \
Ami0, Amo0, r2, 8, 28, \
Amo0, Amu0, r9, 28, 31, \
0, Aku1
KeccakThetaRhoPiChi Asi1, Asa1, r2, 31, 7, \
Aso1, Ase1, r9, 27, 14, \
Asu1, Asi1, r12, 19, 3, \
Asa1, Aso1, r8, 20, 5, \
Ase1, Asu1, r11, 1, 20, \
0, Amu0
@ reload the value spilled to mDo1
ldr r9, [sp, #mDo1]
@ half-plane that receives the round constant at offset 24; result left in r1
KeccakThetaRhoPiChiIota Aba0, r8, 0, \
Abe0, r10, 22, 23, \
Abi0, r2, 22, 9, \
Abo0, r9, 11, 13, \
Abu0, r12, 7, 28, \
24, 0, 0, Asu1, r1
@ reload the value spilled to mDi0
ldr.w r2, [sp, #mDi0]
KeccakThetaRhoPiChi Ago1, Aga1, r9, 14, 0, \
Agu1, Age1, r14, 10, 10, \
Aga1, Agi1, r8, 1, 13, \
Age1, Ago1, r10, 22, 8, \
Agi1, Agu1, r2, 30, 1, \
0, Aba0
KeccakThetaRhoPiChi Ake0, Aka0, r11, 1, 10, \
Aki0, Ake0, r2, 3, 31, \
Ako0, Aki0, r9, 13, 0, \
Aku0, Ako0, r12, 4, 18, \
Aka0, Aku0, r8, 9, 20, \
0, Agu1
@ reload the value spilled to mDa1
ldr r8, [sp, #mDa1]
KeccakThetaRhoPiChi Amu1, Ama1, r12, 13, 22, \
Ama1, Ame1, r8, 18, 1, \
Ame1, Ami1, r11, 5, 4, \
Ami1, Amo1, r2, 7, 28, \
Amo1, Amu1, r9, 28, 31, \
0, Aku0
KeccakThetaRhoPiChi Asi0, Asa0, r2, 31, 7, \
Aso0, Ase0, r9, 28, 14, \
Asu0, Asi0, r14, 20, 3, \
Asa0, Aso0, r8, 21, 5, \
Ase0, Asu0, r10, 1, 21, \
0, Amu1
@ reload the value spilled to mDo0
ldr r9, [sp, #mDo0]
@ half-plane that receives the round constant at offset 28, with last = 1;
@ result left in r1
KeccakThetaRhoPiChiIota Aba1, r8, 0, \
Abe1, r11, 22, 22, \
Abi1, r2, 21, 9, \
Abo1, r9, 10, 14, \
Abu1, r14, 7, 27, \
28, 1, 0, Asu0, r1
@ no round follows inside this macro, so the pending result is stored here
str.w r1, [r0, #Aba1]
.endm
@----------------------------------------------------------------------------
@
@ void KeccakF1600_Initialize( void )
@
@ Nothing to set up in this implementation: returns immediately.
@
.align 8
.global KeccakF1600_Initialize
.type KeccakF1600_Initialize, %function @ typed as a function so that its address carries the Thumb bit
KeccakF1600_Initialize:
    bx lr
@----------------------------------------------------------------------------
@
@ void KeccakF1600_StateXORBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length)
@
@ XORs length bytes read from data into the state, starting at byte position
@ offset. The range is handled in up to three parts: the bytes up to the end
@ of a partly covered first lane, whole 8-byte lanes, and the leading bytes
@ of a last lane.
@
@ Input:
@ r0 state pointer
@ r1 data pointer
@ r2 offset (in bytes)
@ r3 length (in bytes)
@
@ r4-r8 are saved on entry and restored on exit. r8 keeps the remaining
@ length across the helper calls, which change r2-r7.
@
.align 8
.global KeccakF1600_StateXORBytes
.type KeccakF1600_StateXORBytes, %function @ typed as a function so that its address carries the Thumb bit
KeccakF1600_StateXORBytes:
    cbz r3, KeccakF1600_StateXORBytes_Exit1 @ .if length != 0
    push {r4 - r8, lr} @ then
    bic r4, r2, #7 @ r4 = offset & ~7
    adds r0, r0, r4 @ add whole lane offset to state pointer
    ands r2, r2, #7 @ offset &= 7 (part not lane aligned)
    beq KeccakF1600_StateXORBytes_CheckLanes @ .if offset != 0
    movs r4, r3 @ then, do remaining bytes in first lane
    rsb r5, r2, #8 @ max size in lane = 8 - offset
    cmp r4, r5
    bls KeccakF1600_StateXORBytes_BytesAlign @ unsigned compare, both values are byte counts
    movs r4, r5 @ clamp to the room left in the lane
KeccakF1600_StateXORBytes_BytesAlign:
    sub r8, r3, r4 @ size left
    movs r3, r4
    bl __KeccakF1600_StateXORBytesInLane
    mov r3, r8
KeccakF1600_StateXORBytes_CheckLanes:
    lsrs r2, r3, #3 @ .if length >= 8 (r2 = number of whole lanes)
    beq KeccakF1600_StateXORBytes_Bytes
    mov r8, r3 @ the helper changes r3
    bl __KeccakF1600_StateXORLanes
    and r3, r8, #7 @ bytes left after the whole lanes
KeccakF1600_StateXORBytes_Bytes:
    cbz r3, KeccakF1600_StateXORBytes_Exit
    movs r2, #0 @ the last lane is entered at its first byte
    bl __KeccakF1600_StateXORBytesInLane
KeccakF1600_StateXORBytes_Exit:
    pop {r4 - r8, pc}
KeccakF1600_StateXORBytes_Exit1:
    bx lr
@----------------------------------------------------------------------------
@
@ __KeccakF1600_StateXORLanes
@
@ XORs laneCount whole lanes (8 bytes each) from the data buffer into the
@ state, converting every lane to the bit-interleaved form on the way.
@ The counter is only tested after a lane has been processed, so laneCount
@ must not be zero.
@
@ Input:
@ r0 state pointer
@ r1 data pointer
@ r2 laneCount
@
@ Output:
@ r0 state pointer next lane
@ r1 data pointer next byte to input
@
@ Changed: r2-r7
@
.align 8
__KeccakF1600_StateXORLanes:
__KeccakF1600_StateXORLanes_NextLane:
    ldr r4, [r1], #4                            @ first data word, data pointer += 4
    ldr r5, [r1], #4                            @ second data word, data pointer += 4
    ldrd r6, r7, [r0]                           @ current state words of the lane
    toBitInterleaving r4, r5, r6, r7, r3, 0     @ r6/r7 ^= interleaved r4/r5 (r3 scratch)
    strd r6, r7, [r0], #8                       @ write back, state pointer += 8
    subs r2, r2, #1                             @ one lane done
    bne __KeccakF1600_StateXORLanes_NextLane
    bx lr
@----------------------------------------------------------------------------
@
@ __KeccakF1600_StateXORBytesInLane
@
@ XORs up to 8 data bytes into a single lane of the state. The bytes are
@ first copied into a zeroed 8-byte buffer on the stack, at the requested
@ position inside the lane; the buffer is then read back as two words,
@ bit-interleaved and XORed into the lane.
@ The counter is only tested after a byte has been copied, so length must
@ not be zero.
@
@ Input:
@ r0 state pointer
@ r1 data pointer
@ r2 offset in lane
@ r3 length
@
@ Output:
@ r0 state pointer next lane
@ r1 data pointer next byte to input
@
@ Changed: r2-r7
@
.align 8
__KeccakF1600_StateXORBytesInLane:
    movs r4, #0
    movs r5, #0
    push { r4 - r5 }                            @ 8 zero bytes on the stack
    add r2, r2, sp                              @ r2 = buffer address + offset in lane
__KeccakF1600_StateXORBytesInLane_CopyByte:
    ldrb r5, [r1], #1                           @ next data byte
    strb r5, [r2], #1                           @ into the buffer
    subs r3, r3, #1
    bne __KeccakF1600_StateXORBytesInLane_CopyByte
    pop { r4 - r5 }                             @ r4/r5 = buffer contents, stack released
    ldrd r6, r7, [r0]                           @ current state words of the lane
    toBitInterleaving r4, r5, r6, r7, r3, 0     @ r6/r7 ^= interleaved r4/r5 (r3 scratch)
    strd r6, r7, [r0], #8                       @ write back, state pointer += 8
    bx lr
@----------------------------------------------------------------------------
@
@ void KeccakF1600_StateExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length)
@
@ Copies length bytes of the state, starting at byte position offset, to the
@ buffer data. The state is only read; data is written. The range is handled
@ in up to three parts: the bytes up to the end of a partly covered first
@ lane, whole 8-byte lanes, and the leading bytes of a last lane.
@
@ Input:
@ r0 state pointer
@ r1 data pointer
@ r2 offset (in bytes)
@ r3 length (in bytes)
@
@ r4-r8 are saved on entry and restored on exit. r8 keeps the remaining
@ length across the helper calls, which change r2-r6.
@
.align 8
.global KeccakF1600_StateExtractBytes
.type KeccakF1600_StateExtractBytes, %function @ typed as a function so that its address carries the Thumb bit
KeccakF1600_StateExtractBytes:
    cbz r3, KeccakF1600_StateExtractBytes_Exit1 @ .if length != 0
    push {r4 - r8, lr} @ then
    bic r4, r2, #7 @ r4 = offset & ~7
    adds r0, r0, r4 @ add whole lane offset to state pointer
    ands r2, r2, #7 @ offset &= 7 (part not lane aligned)
    beq KeccakF1600_StateExtractBytes_CheckLanes @ .if offset != 0
    movs r4, r3 @ then, do remaining bytes in first lane
    rsb r5, r2, #8 @ max size in lane = 8 - offset
    cmp r4, r5
    bls KeccakF1600_StateExtractBytes_BytesAlign @ unsigned compare, both values are byte counts
    movs r4, r5 @ clamp to the room left in the lane
KeccakF1600_StateExtractBytes_BytesAlign:
    sub r8, r3, r4 @ size left
    movs r3, r4
    bl __KeccakF1600_StateExtractBytesInLane
    mov r3, r8
KeccakF1600_StateExtractBytes_CheckLanes:
    lsrs r2, r3, #3 @ .if length >= 8 (r2 = number of whole lanes)
    beq KeccakF1600_StateExtractBytes_Bytes
    mov r8, r3 @ the helper changes r3
    bl __KeccakF1600_StateExtractLanes
    and r3, r8, #7 @ bytes left after the whole lanes
KeccakF1600_StateExtractBytes_Bytes:
    cbz r3, KeccakF1600_StateExtractBytes_Exit
    movs r2, #0 @ the last lane is entered at its first byte
    bl __KeccakF1600_StateExtractBytesInLane
KeccakF1600_StateExtractBytes_Exit:
    pop {r4 - r8, pc}
KeccakF1600_StateExtractBytes_Exit1:
    bx lr
@----------------------------------------------------------------------------
@
@ __KeccakF1600_StateExtractLanes
@
@ Copies laneCount whole lanes (8 bytes each) from the state to the data
@ buffer, undoing the bit interleaving of every lane on the way.
@ The counter is only tested after a lane has been processed, so laneCount
@ must not be zero.
@
@ Input:
@ r0 state pointer
@ r1 data pointer
@ r2 laneCount
@
@ Output:
@ r0 state pointer next lane
@ r1 data pointer next byte to output
@
@ Changed: r2-r5
@
.align 8
__KeccakF1600_StateExtractLanes:
__KeccakF1600_StateExtractLanes_NextLane:
    ldrd r4, r5, [r0], #8                       @ both state words of the lane, state pointer += 8
    fromBitInterleaving r4, r5, r3              @ back to natural bit order (r3 scratch)
    str r4, [r1], #4                            @ first output word
    subs r2, r2, #1                             @ one lane done (str below leaves the flags alone)
    str r5, [r1], #4                            @ second output word
    bne __KeccakF1600_StateExtractLanes_NextLane
    bx lr
@----------------------------------------------------------------------------
@
@ __KeccakF1600_StateExtractBytesInLane
@
@ Copies up to 8 bytes of a single lane of the state to the data buffer. The
@ lane is de-interleaved, pushed on the stack as an 8-byte buffer and the
@ requested bytes are copied out of that buffer one by one.
@ The counter is only tested after a byte has been copied, so length must
@ not be zero.
@
@ Input:
@ r0 state pointer
@ r1 data pointer
@ r2 offset in lane
@ r3 length
@
@ Output:
@ r0 state pointer next lane
@ r1 data pointer next byte to output
@
@ Changed: r2-r6
@
.align 8
__KeccakF1600_StateExtractBytesInLane:
    ldrd r4, r5, [r0], #8                       @ both state words of the lane, state pointer += 8
    fromBitInterleaving r4, r5, r6              @ back to natural bit order (r6 scratch)
    push {r4, r5}                               @ lane bytes now addressable on the stack
    add r2, sp, r2                              @ r2 = buffer address + offset in lane
__KeccakF1600_StateExtractBytesInLane_CopyByte:
    ldrb r4, [r2], #1                           @ next lane byte
    subs r3, r3, #1
    strb r4, [r1], #1                           @ into the output (flags untouched)
    bne __KeccakF1600_StateExtractBytesInLane_CopyByte
    add sp, #8                                  @ drop the buffer
    bx lr
@ Round-constant table read by the round macros through the pointer kept in
@ the stack slot mRC. Each row holds the two words of one round: the first one
@ is XORed into the state word at Aba0, the second one into the word at Aba1.
@ The table has 24 rows; the pointer is advanced by 32 bytes (four rows) after
@ every group of four rounds. The word that follows the last row is the
@ terminator 0xFF, which the permutation loop tests for after each group.
.align 8
KeccakF1600_StatePermute_RoundConstantsWithTerminator:
@ word 0 (for Aba0), word 1 (for Aba1)
.long 0x00000001, 0x00000000
.long 0x00000000, 0x00000089
.long 0x00000000, 0x8000008b
.long 0x00000000, 0x80008080
.long 0x00000001, 0x0000008b
.long 0x00000001, 0x00008000
.long 0x00000001, 0x80008088
.long 0x00000001, 0x80000082
.long 0x00000000, 0x0000000b
.long 0x00000000, 0x0000000a
.long 0x00000001, 0x00008082
.long 0x00000000, 0x00008003
.long 0x00000001, 0x0000808b
.long 0x00000001, 0x8000000b
.long 0x00000001, 0x8000008a
.long 0x00000001, 0x80000081
.long 0x00000000, 0x80000081
.long 0x00000000, 0x80000008
.long 0x00000000, 0x00000083
.long 0x00000000, 0x80008003
.long 0x00000001, 0x80008088
.long 0x00000000, 0x80000088
.long 0x00000001, 0x00008000
.long 0x00000000, 0x80008082
.long 0x000000FF @terminator
@----------------------------------------------------------------------------
@
@ void KeccakF1600_StatePermute( void *state )
@
@ Applies the permutation to the state in place, four rounds per loop
@ iteration, until the terminator of the round-constant table is reached.
@
@ Input:
@ r0 state pointer
@
@ r4-r12 and lr are saved on entry and restored on exit. A scratch frame of
@ mSize bytes is reserved on the stack for the round macros; its slot mRC
@ holds the pointer to the round constants of the current group of rounds.
@
.align 8
.global KeccakF1600_StatePermute
.type KeccakF1600_StatePermute, %function @ typed as a function so that its address carries the Thumb bit
KeccakF1600_StatePermute:
    adr r1, KeccakF1600_StatePermute_RoundConstantsWithTerminator
    push { r4 - r12, lr }
    sub sp, #mSize                              @ scratch frame for the round macros
    str r1, [sp, #mRC]                          @ round-constant pointer starts at the table
KeccakF1600_StatePermute_RoundLoop:
    KeccakRound0
    KeccakRound1
    KeccakRound2
    KeccakRound3
    @ flags come from the compare against the 0xFF terminator inside KeccakRound3
    bne KeccakF1600_StatePermute_RoundLoop
    add sp, #mSize
    pop { r4 - r12, pc }