Skip to content

Commit

Permalink
benchmark: do not clobber callee-saved NEON registers
Browse files Browse the repository at this point in the history
Make each ARM assembly function that uses any of the NEON registers q4-q7
save and restore the ones it uses, since these are callee-saved registers.

This is not needed in the Linux kernel, or in other contexts where FPU
and NEON code generation is disabled, because the compiler-generated
callers then never keep live values in these registers.  If I recall
correctly, I had used -mfloat-abi=soft when testing this code originally.
But that option is not actually being set by the build system, and
building with that option is not always possible anyway, since some
systems don't have the soft-float libraries that it requires.

I'm now seeing a test failure caused by these registers being clobbered,
so let's fix it.
  • Loading branch information
ebiggers committed Dec 25, 2022
1 parent 818709d commit 0874175
Show file tree
Hide file tree
Showing 11 changed files with 70 additions and 9 deletions.
7 changes: 6 additions & 1 deletion benchmark/src/arm/cham-neon-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,6 @@

.macro _cham_xts_crypt decrypting
push {r4-r7}
mov r7, sp

/*
* The first four parameters were passed in registers r0-r3. Load the
Expand All @@ -209,6 +208,11 @@
ldr NBYTES, [sp, #16]
ldr TWEAK, [sp, #20]

// Save the callee-saved NEON registers.
vstmdb sp!, {q4-q7}

mov r7, sp

// Load the round keys
vld1.8 {ROUND_KEY_A, ROUND_KEY_B}, [ROUND_KEYS]

Expand Down Expand Up @@ -343,6 +347,7 @@
vst1.8 {TWEAKV}, [TWEAK]

mov sp, r7
vldmia sp!, {q4-q7}
pop {r4-r7}
bx lr
.endm
Expand Down
7 changes: 6 additions & 1 deletion benchmark/src/arm/chaskey-lts-neon-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -215,14 +215,18 @@

.macro _chaskey_lts_xts_crypt decrypting
push {r4-r5}
mov r5, sp

/*
* The first four parameters were passed in registers r0-r3. Load the
* additional parameter, which was passed on the stack.
*/
ldr TWEAK, [sp, #8]

// Save the callee-saved NEON registers.
vstmdb sp!, {q4-q7}

mov r5, sp

// Load key
vld1.8 {KEYV}, [KEY]

Expand Down Expand Up @@ -358,6 +362,7 @@
vst1.8 {TWEAKV}, [TWEAK]

mov sp, r5
vldmia sp!, {q4-q7}
pop {r4-r5}
bx lr
.endm
Expand Down
7 changes: 6 additions & 1 deletion benchmark/src/arm/lea-neon-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,6 @@

.macro _lea_xts_crypt is_lea128, decrypting
push {r4-r7}
mov r7, sp

/*
* The first four parameters were passed in registers r0-r3. Load the
Expand All @@ -302,6 +301,11 @@
ldr NBYTES, [sp, #16]
ldr TWEAK, [sp, #20]

// Save the callee-saved NEON registers.
vstmdb sp!, {q4-q7}

mov r7, sp

/*
* Allocate stack space to store 128 bytes worth of tweaks. For
* performance, this space is aligned to a 16-byte boundary so that we
Expand Down Expand Up @@ -419,6 +423,7 @@
bne .Lnext_128bytes_\@

mov sp, r7
vldmia sp!, {q4-q7}
pop {r4-r7}
bx lr
.endm
Expand Down
4 changes: 4 additions & 0 deletions benchmark/src/arm/nh-neon-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@
*/
ENTRY(nh_neon)

// Save the callee-saved NEON registers.
vstmdb sp!, {q4-q7}

vld1.32 {K0,K1}, [KEY]!
vmov.u64 PASS0_SUMS, #0
vmov.u64 PASS1_SUMS, #0
Expand Down Expand Up @@ -115,5 +118,6 @@ ENTRY(nh_neon)
vadd.u64 T1_L, PASS2_SUM_A, PASS2_SUM_B
vadd.u64 T1_H, PASS3_SUM_A, PASS3_SUM_B
vst1.8 {T0-T1}, [HASH]
vldmia sp!, {q4-q7}
bx lr
ENDPROC(nh_neon)
7 changes: 6 additions & 1 deletion benchmark/src/arm/noekeon-neon-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -269,14 +269,18 @@

.macro _noekeon_xts_crypt decrypting
push {r4-r5}
mov r5, sp

/*
* The first four parameters were passed in registers r0-r3. Load the
* additional parameter, which was passed on the stack.
*/
ldr TWEAK, [sp, #8]

// Save the callee-saved NEON registers.
vstmdb sp!, {q4-q7}

mov r5, sp

// Load the key
vld1.32 {K}, [KEY]

Expand Down Expand Up @@ -456,6 +460,7 @@
bne .Lnext_128bytes_\@

mov sp, r5
vldmia sp!, {q4-q7}
pop {r4-r5}
bx lr
.endm
Expand Down
7 changes: 6 additions & 1 deletion benchmark/src/arm/rc5-neon-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,6 @@
*/
.macro _rc5_xts_crypt n, decrypting
push {r4-r9}
mov r9, sp

/*
* The first four parameters were passed in registers r0-r3. Load the
Expand All @@ -271,6 +270,11 @@
ldr NBYTES, [sp, #24]
ldr TWEAK, [sp, #28]

// Save the callee-saved NEON registers.
vstmdb sp!, {q4-q7}

mov r9, sp

mov CONST_N_MINUS_1, #(\n - 1)
mov CONST_N, #\n

Expand Down Expand Up @@ -460,6 +464,7 @@
bne .Lnext_128bytes_\@

mov sp, r9
vldmia sp!, {q4-q7}
pop {r4-r9}
bx lr
.endm
Expand Down
7 changes: 6 additions & 1 deletion benchmark/src/arm/rc6-neon-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -229,14 +229,18 @@

.macro _rc6_xts_crypt decrypting
push {r4-r9}
mov r9, sp

/*
* The first four parameters were passed in registers r0-r3. Load the
* additional parameter, which was passed on the stack.
*/
ldr TWEAK, [sp, #24]

// Save the callee-saved NEON registers.
vstmdb sp!, {q4-q7}

mov r9, sp

mov ONE, #1
mov THIRTY_ONE, #31
mov THIRTY_TWO, #32
Expand Down Expand Up @@ -403,6 +407,7 @@
bne .Lnext_128bytes_\@

mov sp, r9
vldmia sp!, {q4-q7}
pop {r4-r9}
bx lr
.endm
Expand Down
7 changes: 6 additions & 1 deletion benchmark/src/arm/speck-neon-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,6 @@
*/
.macro _speck_xts_crypt n, decrypting
push {r4-r7}
mov r7, sp

/*
* The first four parameters were passed in registers r0-r3. Load the
Expand All @@ -230,6 +229,11 @@
ldr NBYTES, [sp, #16]
ldr TWEAK, [sp, #20]

// Save the callee-saved NEON registers.
vstmdb sp!, {q4-q7}

mov r7, sp

/*
* If decrypting, modify the ROUND_KEYS parameter to point to the last
* round key rather than the first, since for decryption the round keys
Expand Down Expand Up @@ -430,6 +434,7 @@
.endif

mov sp, r7
vldmia sp!, {q4-q7}
pop {r4-r7}
bx lr
.endm
Expand Down
7 changes: 6 additions & 1 deletion benchmark/src/arm/xtea-neon-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -138,14 +138,18 @@

.macro _xtea_xts_crypt decrypting
push {r4-r5}
mov r5, sp

/*
* The first four parameters were passed in registers r0-r3. Load the
* additional parameter, which was passed on the stack.
*/
ldr TWEAK, [sp, #8]

// Save the callee-saved NEON registers.
vstmdb sp!, {q4-q7}

mov r5, sp

// Load the key
vld1.8 {KEYV}, [KEY]

Expand Down Expand Up @@ -324,6 +328,7 @@
bne .Lnext_128bytes_\@

mov sp, r5
vldmia sp!, {q4-q7}
pop {r4-r5}
bx lr
.endm
Expand Down
6 changes: 5 additions & 1 deletion third_party/linux-kernel/arm/aes-neonbs-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -442,6 +442,7 @@ M0: .quad 0x02060a0e03070b0f, 0x0004080c0105090d
* void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
*/
ENTRY(aesbs_convert_key)
vstmdb sp!, {q4-q7} // Save callee-saved NEON registers
vld1.32 {q7}, [r1]! // load round 0 key
vld1.32 {q15}, [r1]! // load round 1 key

Expand Down Expand Up @@ -485,6 +486,7 @@ ENTRY(aesbs_convert_key)
vmov.i8 q7, #0x63 // compose .L63
veor q15, q15, q7
vst1.8 {q15}, [r0, :128]
vldmia sp!, {q4-q7}
bx lr
ENDPROC(aesbs_convert_key)

Expand Down Expand Up @@ -960,8 +962,9 @@ ENDPROC(__xts_prepare8)

.macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
push {r4-r8, lr}
mov r5, sp // preserve sp
ldrd r6, r7, [sp, #24] // get blocks and iv args
vstmdb sp!, {q4-q7} // Save callee-saved NEON registers
mov r5, sp // preserve sp
sub ip, sp, #128 // make room for 8x tweak
bic ip, ip, #0xf // align sp to 16 bytes
mov sp, ip
Expand Down Expand Up @@ -1013,6 +1016,7 @@ ENDPROC(__xts_prepare8)
bgt 99b

mov sp, r5
vldmia sp!, {q4-q7} // Restore callee-saved NEON registers
pop {r4-r8, pc}
.endm

Expand Down
13 changes: 13 additions & 0 deletions third_party/linux-kernel/arm/chacha-neon-core.S
Original file line number Diff line number Diff line change
Expand Up @@ -546,6 +546,9 @@ ENTRY(chacha_block_xor_neon)
// r2: 1 data block input, i
// r3: nrounds

// Save the callee-saved NEON registers.
vstmdb sp!, {q4-q7}

// x0..3 = s0..3
add ip, r0, #0x20
vld1.32 {q0-q1}, [r0]
Expand Down Expand Up @@ -582,6 +585,7 @@ ENTRY(chacha_block_xor_neon)
vst1.8 {q0-q1}, [r1]
vst1.8 {q2-q3}, [ip]

vldmia sp!, {q4-q7}
bx lr
ENDPROC(chacha_block_xor_neon)

Expand All @@ -591,6 +595,7 @@ ENDPROC(chacha_block_xor_neon)

.align 5
ENTRY(chacha_4block_xor_neon)
vstmdb sp!, {q4-q7} // Save callee-saved NEON registers
push {r4-r5}
mov r4, sp // preserve the stack pointer
sub ip, sp, #0x20 // allocate a 32 byte buffer
Expand Down Expand Up @@ -746,10 +751,14 @@ ENTRY(chacha_4block_xor_neon)
vst1.8 {q0-q1}, [r1]

pop {r4-r5}
vldmia sp!, {q4-q7}
bx lr
ENDPROC(chacha_4block_xor_neon)

ENTRY(chacha_perm_neon)
// Save the callee-saved NEON registers that are used.
vstmdb sp!, {q4-q5}

// r0: Input state matrix, s
// r1: nrounds
add ip, r0, #0x20
Expand All @@ -762,6 +771,7 @@ ENTRY(chacha_perm_neon)
vst1.8 {q0-q1}, [r0]!
vst1.8 {q2-q3}, [r0]

vldmia sp!, {q4-q5}
bx lr
ENDPROC(chacha_perm_neon)

Expand All @@ -782,6 +792,8 @@ ENDPROC(chacha_perm_neon)
* r3 = nrounds
*/
.macro _chacha_mem_crypt_4block_neon decrypting
// Save the callee-saved NEON registers.
vstmdb sp!, {q4-q7}

// 32-byte align the stack
push {r4}
Expand Down Expand Up @@ -991,6 +1003,7 @@ ENDPROC(chacha_perm_neon)

mov sp, r4
pop {r4}
vldmia sp!, {q4-q7}
bx lr
.endm

Expand Down

0 comments on commit 0874175

Please sign in to comment.