From 0874175169895866c2448bdcf9470acc4cc655e3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 24 Dec 2022 20:16:20 -0800 Subject: [PATCH] benchmark: do not clobber callee-saved NEON registers Make the ARM assembly code save and restore registers q4-q7 when they are used, since these are callee-saved registers. This is not needed in the Linux kernel, or in other contexts where FPU and NEON code generation is disabled. IIRC, I had used -mfloat-abi=soft when testing this code originally. But that option is not actually being set by the build system, and building with that option is not always possible anyway since some systems don't have the softfp library. I'm now seeing a test failure due to this, so let's fix it. --- benchmark/src/arm/cham-neon-core.S | 7 ++++++- benchmark/src/arm/chaskey-lts-neon-core.S | 7 ++++++- benchmark/src/arm/lea-neon-core.S | 7 ++++++- benchmark/src/arm/nh-neon-core.S | 4 ++++ benchmark/src/arm/noekeon-neon-core.S | 7 ++++++- benchmark/src/arm/rc5-neon-core.S | 7 ++++++- benchmark/src/arm/rc6-neon-core.S | 7 ++++++- benchmark/src/arm/speck-neon-core.S | 7 ++++++- benchmark/src/arm/xtea-neon-core.S | 7 ++++++- third_party/linux-kernel/arm/aes-neonbs-core.S | 6 +++++- third_party/linux-kernel/arm/chacha-neon-core.S | 13 +++++++++++++ 11 files changed, 70 insertions(+), 9 deletions(-) diff --git a/benchmark/src/arm/cham-neon-core.S b/benchmark/src/arm/cham-neon-core.S index fe770aa..db9f6ce 100644 --- a/benchmark/src/arm/cham-neon-core.S +++ b/benchmark/src/arm/cham-neon-core.S @@ -200,7 +200,6 @@ .macro _cham_xts_crypt decrypting push {r4-r7} - mov r7, sp /* * The first four parameters were passed in registers r0-r3. Load the @@ -209,6 +208,11 @@ ldr NBYTES, [sp, #16] ldr TWEAK, [sp, #20] + // Save the callee-saved NEON registers. + vstmdb sp!, {q4-q7} + + mov r7, sp + // Load the round keys vld1.8 {ROUND_KEY_A, ROUND_KEY_B}, [ROUND_KEYS] @@ -343,6 +347,7 @@ vst1.8 {TWEAKV}, [TWEAK] mov sp, r7 + vldmia sp!, {q4-q7} pop {r4-r7} bx lr .endm diff --git a/benchmark/src/arm/chaskey-lts-neon-core.S b/benchmark/src/arm/chaskey-lts-neon-core.S index 3fcbbab..21ad5ff 100644 --- a/benchmark/src/arm/chaskey-lts-neon-core.S +++ b/benchmark/src/arm/chaskey-lts-neon-core.S @@ -215,7 +215,6 @@ .macro _chaskey_lts_xts_crypt decrypting push {r4-r5} - mov r5, sp /* * The first four parameters were passed in registers r0-r3. Load the @@ -223,6 +222,11 @@ */ ldr TWEAK, [sp, #8] + // Save the callee-saved NEON registers. + vstmdb sp!, {q4-q7} + + mov r5, sp + // Load key vld1.8 {KEYV}, [KEY] @@ -358,6 +362,7 @@ vst1.8 {TWEAKV}, [TWEAK] mov sp, r5 + vldmia sp!, {q4-q7} pop {r4-r5} bx lr .endm diff --git a/benchmark/src/arm/lea-neon-core.S b/benchmark/src/arm/lea-neon-core.S index af42cc0..8e43204 100644 --- a/benchmark/src/arm/lea-neon-core.S +++ b/benchmark/src/arm/lea-neon-core.S @@ -293,7 +293,6 @@ .macro _lea_xts_crypt is_lea128, decrypting push {r4-r7} - mov r7, sp /* * The first four parameters were passed in registers r0-r3. Load the @@ -302,6 +301,11 @@ ldr NBYTES, [sp, #16] ldr TWEAK, [sp, #20] + // Save the callee-saved NEON registers. + vstmdb sp!, {q4-q7} + + mov r7, sp + /* * Allocate stack space to store 128 bytes worth of tweaks. For * performance, this space is aligned to a 16-byte boundary so that we @@ -419,6 +423,7 @@ bne .Lnext_128bytes_\@ mov sp, r7 + vldmia sp!, {q4-q7} pop {r4-r7} bx lr .endm diff --git a/benchmark/src/arm/nh-neon-core.S b/benchmark/src/arm/nh-neon-core.S index a20df17..abf42e6 100644 --- a/benchmark/src/arm/nh-neon-core.S +++ b/benchmark/src/arm/nh-neon-core.S @@ -78,6 +78,9 @@ */ ENTRY(nh_neon) + // Save the callee-saved NEON registers. + vstmdb sp!, {q4-q7} + vld1.32 {K0,K1}, [KEY]! vmov.u64 PASS0_SUMS, #0 vmov.u64 PASS1_SUMS, #0 @@ -115,5 +118,6 @@ ENTRY(nh_neon) vadd.u64 T1_L, PASS2_SUM_A, PASS2_SUM_B vadd.u64 T1_H, PASS3_SUM_A, PASS3_SUM_B vst1.8 {T0-T1}, [HASH] + vldmia sp!, {q4-q7} bx lr ENDPROC(nh_neon) diff --git a/benchmark/src/arm/noekeon-neon-core.S b/benchmark/src/arm/noekeon-neon-core.S index a2ec8bc..e7b38e9 100644 --- a/benchmark/src/arm/noekeon-neon-core.S +++ b/benchmark/src/arm/noekeon-neon-core.S @@ -269,7 +269,6 @@ .macro _noekeon_xts_crypt decrypting push {r4-r5} - mov r5, sp /* * The first four parameters were passed in registers r0-r3. Load the @@ -277,6 +276,11 @@ */ ldr TWEAK, [sp, #8] + // Save the callee-saved NEON registers. + vstmdb sp!, {q4-q7} + + mov r5, sp + // Load the key vld1.32 {K}, [KEY] @@ -456,6 +460,7 @@ bne .Lnext_128bytes_\@ mov sp, r5 + vldmia sp!, {q4-q7} pop {r4-r5} bx lr .endm diff --git a/benchmark/src/arm/rc5-neon-core.S b/benchmark/src/arm/rc5-neon-core.S index 0620d59..53f8891 100644 --- a/benchmark/src/arm/rc5-neon-core.S +++ b/benchmark/src/arm/rc5-neon-core.S @@ -262,7 +262,6 @@ */ .macro _rc5_xts_crypt n, decrypting push {r4-r9} - mov r9, sp /* * The first four parameters were passed in registers r0-r3. Load the @@ -271,6 +270,11 @@ ldr NBYTES, [sp, #24] ldr TWEAK, [sp, #28] + // Save the callee-saved NEON registers. + vstmdb sp!, {q4-q7} + + mov r9, sp + mov CONST_N_MINUS_1, #(\n - 1) mov CONST_N, #\n @@ -460,6 +464,7 @@ bne .Lnext_128bytes_\@ mov sp, r9 + vldmia sp!, {q4-q7} pop {r4-r9} bx lr .endm diff --git a/benchmark/src/arm/rc6-neon-core.S b/benchmark/src/arm/rc6-neon-core.S index 4294e5d..b1b621d 100644 --- a/benchmark/src/arm/rc6-neon-core.S +++ b/benchmark/src/arm/rc6-neon-core.S @@ -229,7 +229,6 @@ .macro _rc6_xts_crypt decrypting push {r4-r9} - mov r9, sp /* * The first four parameters were passed in registers r0-r3. Load the @@ -237,6 +236,11 @@ */ ldr TWEAK, [sp, #24] + // Save the callee-saved NEON registers. + vstmdb sp!, {q4-q7} + + mov r9, sp + mov ONE, #1 mov THIRTY_ONE, #31 mov THIRTY_TWO, #32 @@ -403,6 +407,7 @@ bne .Lnext_128bytes_\@ mov sp, r9 + vldmia sp!, {q4-q7} pop {r4-r9} bx lr .endm diff --git a/benchmark/src/arm/speck-neon-core.S b/benchmark/src/arm/speck-neon-core.S index 7bc9fdd..5bf247c 100644 --- a/benchmark/src/arm/speck-neon-core.S +++ b/benchmark/src/arm/speck-neon-core.S @@ -221,7 +221,6 @@ */ .macro _speck_xts_crypt n, decrypting push {r4-r7} - mov r7, sp /* * The first four parameters were passed in registers r0-r3. Load the @@ -230,6 +229,11 @@ ldr NBYTES, [sp, #16] ldr TWEAK, [sp, #20] + // Save the callee-saved NEON registers. + vstmdb sp!, {q4-q7} + + mov r7, sp + /* * If decrypting, modify the ROUND_KEYS parameter to point to the last * round key rather than the first, since for decryption the round keys @@ -430,6 +434,7 @@ .endif mov sp, r7 + vldmia sp!, {q4-q7} pop {r4-r7} bx lr .endm diff --git a/benchmark/src/arm/xtea-neon-core.S b/benchmark/src/arm/xtea-neon-core.S index 62d95c0..e07bf8d 100644 --- a/benchmark/src/arm/xtea-neon-core.S +++ b/benchmark/src/arm/xtea-neon-core.S @@ -138,7 +138,6 @@ .macro _xtea_xts_crypt decrypting push {r4-r5} - mov r5, sp /* * The first four parameters were passed in registers r0-r3. Load the @@ -146,6 +145,11 @@ */ ldr TWEAK, [sp, #8] + // Save the callee-saved NEON registers. + vstmdb sp!, {q4-q7} + + mov r5, sp + // Load the key vld1.8 {KEYV}, [KEY] @@ -324,6 +328,7 @@ bne .Lnext_128bytes_\@ mov sp, r5 + vldmia sp!, {q4-q7} pop {r4-r5} bx lr .endm diff --git a/third_party/linux-kernel/arm/aes-neonbs-core.S b/third_party/linux-kernel/arm/aes-neonbs-core.S index 8252685..c844d0f 100644 --- a/third_party/linux-kernel/arm/aes-neonbs-core.S +++ b/third_party/linux-kernel/arm/aes-neonbs-core.S @@ -442,6 +442,7 @@ M0: .quad 0x02060a0e03070b0f, 0x0004080c0105090d * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds) */ ENTRY(aesbs_convert_key) + vstmdb sp!, {q4-q7} // Save callee-saved NEON registers vld1.32 {q7}, [r1]! // load round 0 key vld1.32 {q15}, [r1]! // load round 1 key @@ -485,6 +486,7 @@ ENTRY(aesbs_convert_key) vmov.i8 q7, #0x63 // compose .L63 veor q15, q15, q7 vst1.8 {q15}, [r0, :128] + vldmia sp!, {q4-q7} bx lr ENDPROC(aesbs_convert_key) @@ -960,8 +962,9 @@ ENDPROC(__xts_prepare8) .macro __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7 push {r4-r8, lr} - mov r5, sp // preserve sp ldrd r6, r7, [sp, #24] // get blocks and iv args + vstmdb sp!, {q4-q7} // Save callee-saved NEON registers + mov r5, sp // preserve sp sub ip, sp, #128 // make room for 8x tweak bic ip, ip, #0xf // align sp to 16 bytes mov sp, ip @@ -1013,6 +1016,7 @@ ENDPROC(__xts_prepare8) bgt 99b mov sp, r5 + vldmia sp!, {q4-q7} // Restore callee-saved NEON registers pop {r4-r8, pc} .endm diff --git a/third_party/linux-kernel/arm/chacha-neon-core.S b/third_party/linux-kernel/arm/chacha-neon-core.S index fe53aae..9cf7c81 100644 --- a/third_party/linux-kernel/arm/chacha-neon-core.S +++ b/third_party/linux-kernel/arm/chacha-neon-core.S @@ -546,6 +546,9 @@ ENTRY(chacha_block_xor_neon) // r2: 1 data block input, i // r3: nrounds + // Save the callee-saved NEON registers. + vstmdb sp!, {q4-q7} + // x0..3 = s0..3 add ip, r0, #0x20 vld1.32 {q0-q1}, [r0] @@ -582,6 +585,7 @@ ENTRY(chacha_block_xor_neon) vst1.8 {q0-q1}, [r1] vst1.8 {q2-q3}, [ip] + vldmia sp!, {q4-q7} bx lr ENDPROC(chacha_block_xor_neon) @@ -591,6 +595,7 @@ ENDPROC(chacha_block_xor_neon) .align 5 ENTRY(chacha_4block_xor_neon) + vstmdb sp!, {q4-q7} // Save callee-saved NEON registers push {r4-r5} mov r4, sp // preserve the stack pointer sub ip, sp, #0x20 // allocate a 32 byte buffer @@ -746,10 +751,14 @@ ENTRY(chacha_4block_xor_neon) vst1.8 {q0-q1}, [r1] pop {r4-r5} + vldmia sp!, {q4-q7} bx lr ENDPROC(chacha_4block_xor_neon) ENTRY(chacha_perm_neon) + // Save the callee-saved NEON registers that are used. + vstmdb sp!, {q4-q5} + // r0: Input state matrix, s // r1: nrounds add ip, r0, #0x20 @@ -762,6 +771,7 @@ ENTRY(chacha_perm_neon) vst1.8 {q0-q1}, [r0]! vst1.8 {q2-q3}, [r0] + vldmia sp!, {q4-q5} bx lr ENDPROC(chacha_perm_neon) @@ -782,6 +792,8 @@ ENDPROC(chacha_perm_neon) * r3 = nrounds */ .macro _chacha_mem_crypt_4block_neon decrypting + // Save the callee-saved NEON registers. + vstmdb sp!, {q4-q7} // 32-byte align the stack push {r4} @@ -991,6 +1003,7 @@ ENDPROC(chacha_perm_neon) mov sp, r4 pop {r4} + vldmia sp!, {q4-q7} bx lr .endm