Skip to content

Commit

Permalink
Merge pull request wolfSSL#7728 from SparkiDev/poly1305_aarch64_uniq_name
Browse files Browse the repository at this point in the history

Poly1305 AArch64: unique naming of asm funcs
  • Loading branch information
JacobBarthelmeh authored Jul 16, 2024
2 parents e002b6e + 90836c7 commit bbd769d
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 28 deletions.
52 changes: 26 additions & 26 deletions wolfcrypt/src/port/arm/armv8-poly1305.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,12 @@
#include <stdio.h>
#endif

static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
size_t bytes)
static WC_INLINE void poly1305_blocks_aarch64_16(Poly1305* ctx,
const unsigned char *m, size_t bytes)
{
__asm__ __volatile__ (
"CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
"BLO L_poly1305_16_64_done_%= \n\t"
"BLO L_poly1305_aarch64_16_64_done_%= \n\t"
/* Load r and h */
"LDP x21, x23, %[ctx_r] \n\t"
"LDR w25, %[ctx_r_4] \n\t"
Expand Down Expand Up @@ -83,7 +83,7 @@ static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
"MUL w10, w25, w15 \n\t"
"\n"
".align 2 \n\t"
"L_poly1305_16_64_loop_%=: \n\t"
"L_poly1305_aarch64_16_64_loop_%=: \n\t"
/* t0 = U8TO64(&m[0]); */
/* t1 = U8TO64(&m[8]); */
"LDP x16, x17, [%[m]], #16 \n\t"
Expand Down Expand Up @@ -162,15 +162,15 @@ static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
"AND x5, x19, #0x3ffffff \n\t"
"SUB %[bytes], %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
"CMP %[bytes], %[POLY1305_BLOCK_SIZE] \n\t"
"BHS L_poly1305_16_64_loop_%= \n\t"
"BHS L_poly1305_aarch64_16_64_loop_%= \n\t"
/* Store h */
"ORR x2, x2, x3, LSL #32 \n\t"
"ORR x4, x4, x5, LSL #32 \n\t"
"STP x2, x4, %[ctx_h] \n\t"
"STR w6, %[ctx_h_4] \n\t"
"\n"
".align 2 \n\t"
"L_poly1305_16_64_done_%=: \n\t"
"L_poly1305_aarch64_16_64_done_%=: \n\t"
: [ctx_h] "+m" (ctx->h[0]),
[ctx_h_4] "+m" (ctx->h[4]),
[bytes] "+r" (bytes),
Expand All @@ -187,13 +187,13 @@ static WC_INLINE void poly1305_blocks_16(Poly1305* ctx, const unsigned char *m,
);
}

void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
size_t bytes)
void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
size_t bytes)
{
__asm__ __volatile__ (
/* If less than 4 blocks to process then use regular method */
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*4 \n\t"
"BLO L_poly1305_64_done_%= \n\t"
"BLO L_poly1305_aarch64_64_done_%= \n\t"
"MOV x9, #0x3ffffff \n\t"
/* Load h */
"LDP x20, x22, [%[h]] \n\t"
Expand Down Expand Up @@ -221,7 +221,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"MOV v26.D[1], x9 \n\t"
"DUP v30.4S, v26.S[0] \n\t"
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t"
"BLO L_poly1305_64_start_block_size_64_%= \n\t"
"BLO L_poly1305_aarch64_64_start_block_size_64_%= \n\t"
/* Load r^2 to NEON v0, v1, v2, v3, v4 */
"LD4 { v0.S-v3.S }[2], [%[r_2]], #16 \n\t"
"LD1 { v4.S }[2], [%[r_2]] \n\t"
Expand Down Expand Up @@ -284,7 +284,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"ADD v19.2S, v19.2S, v14.2S \n\t"
"\n"
".align 2 \n\t"
"L_poly1305_64_loop_128_%=: \n\t"
"L_poly1305_aarch64_64_loop_128_%=: \n\t"
/* d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1 */
/* d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2 */
/* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3 */
Expand Down Expand Up @@ -395,7 +395,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"UMLAL2 v25.2D, v14.4S, v0.4S \n\t"
/* If less than six message blocks left then leave loop */
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*6 \n\t"
"BLS L_poly1305_64_loop_128_final_%= \n\t"
"BLS L_poly1305_aarch64_64_loop_128_final_%= \n\t"
/* Load m */
/* Load four message blocks to NEON v10, v11, v12, v13, v14 */
"LD4 { v10.4S-v13.4S }, [%[m]], #64 \n\t"
Expand Down Expand Up @@ -447,10 +447,10 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"MOV v17.S[1], v17.S[2] \n\t"
"MOV v18.S[1], v18.S[2] \n\t"
"MOV v19.S[1], v19.S[2] \n\t"
"B L_poly1305_64_loop_128_%= \n\t"
"B L_poly1305_aarch64_64_loop_128_%= \n\t"
"\n"
".align 2 \n\t"
"L_poly1305_64_loop_128_final_%=: \n\t"
"L_poly1305_aarch64_64_loop_128_final_%=: \n\t"
/* Load m */
/* Load two message blocks to NEON v10, v11, v12, v13, v14 */
"LD2 { v10.2D-v11.2D }, [%[m]], #32 \n\t"
Expand Down Expand Up @@ -525,12 +525,12 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"MOV v19.S[1], v19.S[2] \n\t"
/* If less than 2 blocks left go straight to final multiplication. */
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
"BLO L_poly1305_64_last_mult_%= \n\t"
/* Else go to one loop of L_poly1305_64_loop_64 */
"B L_poly1305_64_loop_64_%= \n\t"
"BLO L_poly1305_aarch64_64_last_mult_%= \n\t"
/* Else go to one loop of L_poly1305_aarch64_64_loop_64 */
"B L_poly1305_aarch64_64_loop_64_%= \n\t"
"\n"
".align 2 \n\t"
"L_poly1305_64_start_block_size_64_%=: \n\t"
"L_poly1305_aarch64_64_start_block_size_64_%=: \n\t"
/* Load r^2 to NEON v0, v1, v2, v3, v4 */
"LD4R { v0.2S-v3.2S }, [%[r_2]], #16 \n\t"
"LD1R { v4.2S }, [%[r_2]] \n\t"
Expand Down Expand Up @@ -581,7 +581,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"ADD v19.2S, v19.2S, v14.2S \n\t"
"\n"
".align 2 \n\t"
"L_poly1305_64_loop_64_%=: \n\t"
"L_poly1305_aarch64_64_loop_64_%=: \n\t"
/* d0 = h0*r0 + h1*s4 + h2*s3 + h3*s2 + h4*s1 */
/* d1 = h0*r1 + h1*r0 + h2*s4 + h3*s3 + h4*s2 */
/* d2 = h0*r2 + h1*r1 + h2*r0 + h3*s4 + h4*s3 */
Expand Down Expand Up @@ -709,10 +709,10 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"MOV v19.S[1], v19.S[2] \n\t"
/* If at least two message blocks left then loop_64 */
"CMP %[bytes], %[POLY1305_BLOCK_SIZE]*2 \n\t"
"BHS L_poly1305_64_loop_64_%= \n\t"
"BHS L_poly1305_aarch64_64_loop_64_%= \n\t"
"\n"
".align 2 \n\t"
"L_poly1305_64_last_mult_%=: \n\t"
"L_poly1305_aarch64_64_last_mult_%=: \n\t"
/* Load r */
"LD4 { v0.S-v3.S }[1], [%[r]], #16 \n\t"
/* Compute h*r^2 */
Expand Down Expand Up @@ -849,7 +849,7 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"SUB %[h], %[h], #16 \n\t"
"\n"
".align 2 \n\t"
"L_poly1305_64_done_%=: \n\t"
"L_poly1305_aarch64_64_done_%=: \n\t"
: [bytes] "+r" (bytes),
[m] "+r" (m),
[ctx] "+m" (ctx)
Expand All @@ -869,12 +869,12 @@ void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
"x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27",
"x28", "x30"
);
poly1305_blocks_16(ctx, m, bytes);
poly1305_blocks_aarch64_16(ctx, m, bytes);
}

void poly1305_block(Poly1305* ctx, const unsigned char *m)
void poly1305_block_aarch64(Poly1305* ctx, const unsigned char *m)
{
poly1305_blocks_16(ctx, m, POLY1305_BLOCK_SIZE);
poly1305_blocks_aarch64_16(ctx, m, POLY1305_BLOCK_SIZE);
}

#if defined(POLY130564)
Expand Down Expand Up @@ -1092,7 +1092,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
for (; i < POLY1305_BLOCK_SIZE; i++)
ctx->buffer[i] = 0;
ctx->finished = 1;
poly1305_block(ctx, ctx->buffer);
poly1305_block_aarch64(ctx, ctx->buffer);
}

__asm__ __volatile__ (
Expand Down
7 changes: 5 additions & 2 deletions wolfssl/wolfcrypt/poly1305.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,12 @@ WOLFSSL_API int wc_Poly1305_MAC(Poly1305* ctx, const byte* additional,
word32 addSz, const byte* input, word32 sz, byte* tag, word32 tagSz);

#if defined(__aarch64__ ) && defined(WOLFSSL_ARMASM)
void poly1305_blocks(Poly1305* ctx, const unsigned char *m,
#define poly1305_blocks poly1305_blocks_aarch64
#define poly1305_block poly1305_block_aarch64

void poly1305_blocks_aarch64(Poly1305* ctx, const unsigned char *m,
size_t bytes);
void poly1305_block(Poly1305* ctx, const unsigned char *m);
void poly1305_block_aarch64(Poly1305* ctx, const unsigned char *m);
#endif

#ifdef __cplusplus
Expand Down

0 comments on commit bbd769d

Please sign in to comment.