diff --git a/arm/neon/chacha-3core.asm b/arm/neon/chacha-3core.asm
index bd1cf63c..b6d74ff7 100644
--- a/arm/neon/chacha-3core.asm
+++ b/arm/neon/chacha-3core.asm
@@ -64,10 +64,13 @@ define(`T3', `q7')
 
 	C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 PROLOGUE(_nettle_chacha_3core)
-	vldm	SRC, {X0,X1,X2,X3}
+IF_LE(`	vldm	SRC, {X0,X1,X2,X3}')
+IF_BE(`	mov	r12, SRC
+	vld1.32	{X0,X1}, [r12]!
+	vld1.32	{X2,X3}, [r12]')
 	vpush	{q4,q5,q6,q7}
 	adr	r12, .Lcount1
-	vld1.64	{Z3}, [r12]
+	vld1.32	{Z3}, [r12]
 
 	vadd.i64	Y3, X3, Z3	C Increment 64-bit counter
 	vadd.i64	Z3, Y3, Z3
@@ -213,17 +216,24 @@ PROLOGUE(_nettle_chacha_3core)
 	vadd.i32	Y3, Y3, T2
 	vadd.i32	Z3, Z3, T3
 
-	vldm	SRC, {T0,T1,T2,T3}
+IF_LE(`	vldm	SRC, {T0,T1,T2,T3}')
+IF_BE(`	vld1.32	{T0,T1}, [SRC]!	C SRC changed!
+	vld1.32	{T2,T3}, [SRC]')
 
 	vadd.i32	X0, X0, T0
 	vadd.i32	X1, X1, T1
 	vadd.i32	X2, X2, T2
 	vadd.i32	X3, X3, T3
 
-	vstmia	DST!, {X0,X1,X2,X3}
+	C vst1.8 because caller expects results little-endian
+IF_LE(`	vstmia	DST!, {X0,X1,X2,X3}')
+IF_BE(`	vst1.8	{X0,X1}, [DST]!
+	vst1.8	{X2,X3}, [DST]!')
 	vadd.i32	Y0, Y0, T0
 	vadd.i32	Y1, Y1, T1
 	vadd.i32	Y2, Y2, T2
-	vstmia	DST!, {Y0,Y1,Y2,Y3}
+IF_LE(`	vstmia	DST!, {Y0,Y1,Y2,Y3}')
+IF_BE(`	vst1.8	{Y0,Y1}, [DST]!
+	vst1.8	{Y2,Y3}, [DST]!')
 	vadd.i32	Z0, Z0, T0
 	vadd.i32	Z1, Z1, T1
@@ -231,15 +241,20 @@ PROLOGUE(_nettle_chacha_3core)
 
 	vpop	{q4,q5,q6,q7}
 
-	vstm	DST, {Z0,Z1,Z2,Z3}
+IF_LE(`	vstm	DST, {Z0,Z1,Z2,Z3}')
+IF_BE(`	vst1.8	{Z0,Z1}, [DST]!
+	vst1.8	{Z2,Z3}, [DST]')
 	bx	lr
 EPILOGUE(_nettle_chacha_3core)
 
 PROLOGUE(_nettle_chacha_3core32)
-	vldm	SRC, {X0,X1,X2,X3}
+IF_LE(`	vldm	SRC, {X0,X1,X2,X3}')
+IF_BE(`	mov	r12, SRC
+	vld1.32	{X0,X1}, [r12]!
+	vld1.32	{X2,X3}, [r12]')
 	vpush	{q4,q5,q6,q7}
 	adr	r12, .Lcount1
-	vld1.64	{Z3}, [r12]
+	vld1.32	{Z3}, [r12]
 
 	vadd.i32	Y3, X3, Z3	C Increment 32-bit counter
 	vadd.i32	Z3, Y3, Z3
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
index b0a775bd..6613df82 100644
--- a/arm/neon/chacha-core-internal.asm
+++ b/arm/neon/chacha-core-internal.asm
@@ -83,7 +83,9 @@ define(`QROUND', `
 
 	C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 PROLOGUE(_nettle_chacha_core)
-	vldm	SRC, {X0,X1,X2,X3}
+IF_LE(`	vldm	SRC, {X0,X1,X2,X3}')
+IF_BE(`	vld1.32	{X0,X1}, [SRC]!	C SRC changed!
+	vld1.32	{X2,X3}, [SRC]')
 
 	vmov	S0, X0
 	vmov	S1, X1
@@ -96,15 +98,6 @@ PROLOGUE(_nettle_chacha_core)
 	C 8 9 10 11	X2
 	C 12 13 14 15	X3
 
-	C Input rows big-endian:
-	C 1 0 3 2	X0
-	C 5 4 7 6	X1
-	C 9 8 11 10	X2
-	C 13 12 15 14	X3
-	C even and odd columns switched because
-	C vldm loads consecutive doublewords and
-	C switches words inside them to make them BE
-
 .Loop:
 	QROUND(X0, X1, X2, X3)
 
@@ -113,29 +106,17 @@ PROLOGUE(_nettle_chacha_core)
 	C 5 6 7 4	>>> 3
 	C 10 11 8 9	>>> 2
 	C 15 12 13 14	>>> 1
-
-	C In big-endian rotate rows, to get
-	C 1 0 3 2
-	C 6 5 4 7	>>> 1
-	C 11 10 9 8	>>> 2
-	C 12 15 14 13	>>> 3
-	C different number of elements needs to be
-	C extracted on BE because of different column order
-IF_LE(`	vext.32	X1, X1, X1, #1')
-IF_BE(`	vext.32	X1, X1, X1, #3')
+	vext.32	X1, X1, X1, #1
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #3')
-IF_BE(`	vext.32	X3, X3, X3, #1')
+	vext.32	X3, X3, X3, #3
 
 	QROUND(X0, X1, X2, X3)
 
 	subs	ROUNDS, ROUNDS, #2
 
 	C Inverse rotation
-IF_LE(`	vext.32	X1, X1, X1, #3')
-IF_BE(`	vext.32	X1, X1, X1, #1')
+	vext.32	X1, X1, X1, #3
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #1')
-IF_BE(`	vext.32	X3, X3, X3, #3')
+	vext.32	X3, X3, X3, #1
 
 	bhi	.Loop
@@ -144,13 +125,15 @@ IF_BE(` vext.32 X3, X3, X3, #3')
 	vadd.u32	X2, X2, S2
 	vadd.u32	X3, X3, S3
 
-	C caller expects result little-endian
-IF_BE(`	vrev32.u8	X0, X0
-	vrev32.u8	X1, X1
-	vrev32.u8	X2, X2
-	vrev32.u8	X3, X3')
+	C vst1.8 because caller expects results little-endian
+	C speed: https://developer.arm.com/documentation/ddi0344/b/instruction-cycle-timing/instruction-specific-scheduling-for-neon-instructions/neon-load-store-instructions
+	C 1 q register == 2 d registers; the doc counts d registers
+	C vstm: (number of registers/2) + mod(number of registers, 2) + 1 == (8/2) + mod(8, 2) + 1 == 4 + 0 + 1 == 5 cycles
+	C vst1.8: two 4-register unaligned stores: 2*3 == 6 cycles (plus potentially a mov to set up the address register)
+IF_LE(`	vstm	DST, {X0,X1,X2,X3}')
+IF_BE(`	vst1.8	{X0,X1}, [DST]!
+	vst1.8	{X2,X3}, [DST]')
 
-	vstm	DST, {X0,X1,X2,X3}
 	bx	lr
 EPILOGUE(_nettle_chacha_core)
 
diff --git a/arm/neon/salsa20-2core.asm b/arm/neon/salsa20-2core.asm
index d622edd6..c094ed58 100644
--- a/arm/neon/salsa20-2core.asm
+++ b/arm/neon/salsa20-2core.asm
@@ -58,11 +58,14 @@ define(`T3', `q15')
 
 	C _salsa20_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 PROLOGUE(_nettle_salsa20_2core)
-	vldm	SRC, {X0,X1,X2,X3}
+IF_LE(`	vldm	SRC, {X0,X1,X2,X3}')
+IF_BE(`	mov	r12, SRC
+	vld1.32	{X0,X1}, [r12]!
+	vld1.32	{X2,X3}, [r12]')
 
 	adr	r12, .Lcount1
 	vmov	Y3, X0
-	vld1.64	{Y1}, [r12]
+	vld1.32	{Y1}, [r12]
 	vmov	Y0, X1
 	vadd.i64	Y1, Y1, X2	C Increment counter
 	vmov	Y2, X3
@@ -180,7 +183,9 @@ C Inverse swaps and transpositions
 	vswp	D1REG(Y0), D1REG(Y2)
 	vswp	D1REG(Y1), D1REG(Y3)
 
-	vldm	SRC, {T0,T1,T2,T3}
+IF_LE(`	vldm	SRC, {T0,T1,T2,T3}')
+IF_BE(`	vld1.32	{T0,T1}, [SRC]!	C SRC changed!
+	vld1.32	{T2,T3}, [SRC]')
 
 	vtrn.32	X0, Y3
 	vtrn.32	X1, Y0
@@ -193,14 +198,19 @@ C Add in the original context
 	vadd.i32	X2, X2, T2
 	vadd.i32	X3, X3, T3
 
-	vstmia	DST!, {X0,X1,X2,X3}
-	vld1.64	{X0}, [r12]
+C vst1.8 because caller expects results little-endian
+IF_LE(`	vstmia	DST!, {X0,X1,X2,X3}')
+IF_BE(`	vst1.8	{X0,X1}, [DST]!
+	vst1.8	{X2,X3}, [DST]!')
+	vld1.32	{X0}, [r12]
 	vadd.i32	T0, T0, Y3
 	vadd.i64	T2, T2, X0
 	vadd.i32	T1, T1, Y0
 	vadd.i32	T2, T2, Y1
 	vadd.i32	T3, T3, Y2
 
-	vstm	DST, {T0,T1,T2,T3}
+IF_LE(`	vstm	DST, {T0,T1,T2,T3}')
+IF_BE(`	vst1.8	{T0,T1}, [DST]!
+	vst1.8	{T2,T3}, [DST]')
 	bx	lr
 EPILOGUE(_nettle_salsa20_2core)
diff --git a/arm/neon/salsa20-core-internal.asm b/arm/neon/salsa20-core-internal.asm
index d59d7b80..9d2578e5 100644
--- a/arm/neon/salsa20-core-internal.asm
+++ b/arm/neon/salsa20-core-internal.asm
@@ -86,7 +86,10 @@ define(`QROUND', `
 
 	C _salsa20_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
 PROLOGUE(_nettle_salsa20_core)
-	vldm	SRC, {X0,X1,X2,X3}
+IF_LE(`	vldm	SRC, {X0,X1,X2,X3}')
+IF_BE(`	mov	r12, SRC
+	vld1.32	{X0,X1}, [r12]!
+	vld1.32	{X2,X3}, [r12]')
 
 	C Input rows little-endian:
 	C 0 1 2 3	X0
@@ -99,23 +102,11 @@ PROLOGUE(_nettle_salsa20_core)
 	C 8 13 2 7
 	C 12 1 6 11
 
-	C Input rows big-endian:
-	C 1 0 3 2	X0
-	C 5 4 7 6	X1
-	C 9 8 11 10	X2
-	C 13 12 15 14	X3
-	C even and odd columns switched because
-	C vldm loads consecutive doublewords and
-	C switches words inside them to make them BE
-	C Permuted to:
-	C 5 0 15 10
-	C 9 4 3 14
-	C 13 8 7 2
-	C 1 12 11 6
 
-	C FIXME: Construct in some other way?
 	adr	r12, .Lmasks
-	vldm	r12, {M0101, M0110, M0011}
+IF_LE(`	vldm	r12, {M0101, M0110, M0011}')
+IF_BE(`	vld1.32	{M0101, M0110}, [r12]!
+	vld1.32	{M0011}, [r12]')
 
 	vmov	S1, X1
 	vmov	S2, X2
@@ -160,29 +151,17 @@ PROLOGUE(_nettle_salsa20_core)
 	C 3 4 9 14	>>> 1
 	C 2 7 8 13	>>> 2
 	C 1 6 11 12	>>> 3
-
-	C In big-endian rotate rows, to get
-	C 5 0 15 10
-	C 4 3 14 9	>>> 3
-	C 7 2 13 8	>>> 2
-	C 6 1 12 11	>>> 1
-	C different number of elements needs to be
-	C extracted on BE because of different column order
-IF_LE(`	vext.32	X1, X1, X1, #3')
-IF_BE(`	vext.32	X1, X1, X1, #1')
+	vext.32	X1, X1, X1, #3
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #1')
-IF_BE(`	vext.32	X3, X3, X3, #3')
+	vext.32	X3, X3, X3, #1
 
 	QROUND(X0, X3, X2, X1)
 
 	subs	ROUNDS, ROUNDS, #2
 
 	C Inverse rotation
-IF_LE(`	vext.32	X1, X1, X1, #1')
-IF_BE(`	vext.32	X1, X1, X1, #3')
+	vext.32	X1, X1, X1, #1
 	vext.32	X2, X2, X2, #2
-IF_LE(`	vext.32	X3, X3, X3, #3')
-IF_BE(`	vext.32	X3, X3, X3, #1')
+	vext.32	X3, X3, X3, #3
 
 	bhi	.Loop
@@ -202,19 +181,16 @@ IF_BE(` vext.32 X3, X3, X3, #1')
 	vbit	X2, X3, M0101
 	vbit	X3, T1, M0101
 
-	vld1.64	{T0}, [SRC]
+	vld1.32	{T0}, [SRC]
 	vadd.u32	X0, X0, T0
 	vadd.u32	X1, X1, S1
 	vadd.u32	X2, X2, S2
 	vadd.u32	X3, X3, S3
 
-	C caller expects result little-endian
-IF_BE(`	vrev32.u8	X0, X0
-	vrev32.u8	X1, X1
-	vrev32.u8	X2, X2
-	vrev32.u8	X3, X3')
-
-	vstm	DST, {X0,X1,X2,X3}
+	C vst1.8 because caller expects results little-endian
+IF_LE(`	vstm	DST, {X0,X1,X2,X3}')
+IF_BE(`	vst1.8	{X0,X1}, [DST]!
+	vst1.8	{X2,X3}, [DST]')
 	bx	lr
 EPILOGUE(_nettle_salsa20_core)
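
For reference, a short C sketch of the byte-order contract that the vst1.8 stores (and the "caller expects results little-endian" comments) are about. It is not part of the patch, and the helper name store_le32 is hypothetical: the callers of _chacha_core, _chacha_3core, _salsa20_core and _salsa20_2core consume dst as a little-endian byte string, so each 32-bit result word has to be written least-significant byte first on any host. vstm/vstmia give that layout only on a little-endian core; the IF_BE paths get it from byte-wise NEON loads and stores instead.

#include <stdint.h>
#include <stddef.h>

/* Hypothetical illustration, not nettle code: write each 32-bit word of a
   keystream block in little-endian byte order, independent of host
   endianness.  On a little-endian host this is equivalent to plain word
   stores (vstm/vstmia); on a big-endian host it matches what the vst1.8
   stores in the IF_BE paths above produce. */
static void
store_le32 (uint8_t *out, const uint32_t *words, size_t n)
{
  size_t i;
  for (i = 0; i < n; i++)
    {
      out[4*i + 0] = (uint8_t) (words[i] & 0xff);
      out[4*i + 1] = (uint8_t) ((words[i] >> 8) & 0xff);
      out[4*i + 2] = (uint8_t) ((words[i] >> 16) & 0xff);
      out[4*i + 3] = (uint8_t) ((words[i] >> 24) & 0xff);
    }
}

/* E.g. store_le32 (dst, x, 16) lays out one 64-byte ChaCha/Salsa20 block
   the way the assembly above stores it. */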