---
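Note: this patch adds big-endian support to the POWER7 ChaCha core. On BE builds the prologue now builds byte-swap masks (under IF_BE), and the output words are byte-reversed with a new LE_SWAP32 macro before the final stores; little-endian builds are unchanged. The hard-coded .align 4 is replaced by define(`FUNC_ALIGN', `5') so the PROLOGUE macro handles the entry-point alignment.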
 powerpc64/p7/chacha-core-internal.asm | 55 ++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)
diff --git a/powerpc64/p7/chacha-core-internal.asm b/powerpc64/p7/chacha-core-internal.asm
index 33c721c1..922050ff 100644
--- a/powerpc64/p7/chacha-core-internal.asm
+++ b/powerpc64/p7/chacha-core-internal.asm
@@ -53,6 +53,18 @@ define(`S1', `v9')
define(`S2', `v10')
define(`S3', `v11')
+C Big-endian byte-swap masks and temporaries
+define(`ROT24', `v12')
+define(`ODD', `v13')
+define(`EVEN', `v14')
+define(`ZERO', `v15')
+define(`NEG', `v16')
+
+define(`XR0', `v15')
+define(`XR1', `v16')
+define(`XR2', `v17')
+define(`XR3', `v18')
+
C QROUND(X0, X1, X2, X3)
define(`QROUND', `
C x0 += x1, x3 ^= x0, x3 lrot 16
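Note on the new defines above: XR0 and XR1 intentionally share v15 and v16 with ZERO and NEG. This is safe because ZERO and NEG are only live in the prologue while the ODD/EVEN masks are built, long before LE_SWAP32 touches the XR registers.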
@@ -77,10 +89,42 @@ define(`QROUND', `
vrlw $2, $2, ROT7
')
+C LE_SWAP32(X0, X1, X2, X3)
+define(`LE_SWAP32', `IF_BE(`
+ C xr = x lrot 8, xr &= 0x00FF00FF
+ C x = x lrot 24, x &= 0xFF00FF00
+ C x |= xr
+
+ vrlw XR0, $1, ROT8
+ vrlw XR1, $2, ROT8
+ vrlw XR2, $3, ROT8
+ vrlw XR3, $4, ROT8
+
+ vand XR0, XR0, ODD
+ vand XR1, XR1, ODD
+ vand XR2, XR2, ODD
+ vand XR3, XR3, ODD
+
+ vrlw $1, $1, ROT24
+ vrlw $2, $2, ROT24
+ vrlw $3, $3, ROT24
+ vrlw $4, $4, ROT24
+
+ vand $1, $1, EVEN
+ vand $2, $2, EVEN
+ vand $3, $3, EVEN
+ vand $4, $4, EVEN
+
+ vor $1, $1, XR0
+ vor $2, $2, XR1
+ vor $3, $3, XR2
+ vor $4, $4, XR3
+')')
+
.text
- .align 4
C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+define(`FUNC_ALIGN', `5')
PROLOGUE(_nettle_chacha_core)
li r6, 0x10 C set up some...
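For reference (not part of the patch): per 32-bit lane, LE_SWAP32 is a plain byte reversal built from two rotates and two masks. A minimal scalar C model of the same trick, assuming nothing beyond the C standard library:

  #include <assert.h>
  #include <stdint.h>

  /* Rotate left; only called with n = 8 and n = 24 here. */
  static uint32_t rotl32(uint32_t x, unsigned n)
  {
    return (x << n) | (x >> (32 - n));
  }

  /* Scalar model of LE_SWAP32: ABCD -> DCBA. */
  static uint32_t le_swap32(uint32_t x)
  {
    uint32_t xr = rotl32(x, 8) & 0x00FF00FF;  /* bytes 1 and 3 land in place */
    x = rotl32(x, 24) & 0xFF00FF00;           /* bytes 0 and 2 land in place */
    return x | xr;
  }

  int main(void)
  {
    assert(le_swap32(0x11223344) == 0x44332211);
    return 0;
  }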
@@ -91,6 +135,13 @@ PROLOGUE(_nettle_chacha_core)
vspltisw ROT12, 12
vspltisw ROT8, 8
vspltisw ROT7, 7
+IF_BE(`
+ vspltisw ZERO, 0
+ vspltisw NEG, -1
+ vmrghb ODD, ZERO, NEG
+ vmrghb EVEN, NEG, ZERO
+ vadduwm ROT24, ROT12, ROT12
+')
lxvw4x VSR(X0), 0, SRC
lxvw4x VSR(X1), r6, SRC
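Note on the IF_BE setup above: vspltisw takes a 5-bit signed immediate (-16..15), so 24 cannot be splatted directly; ROT24 is therefore formed as ROT12 + ROT12. The ODD/EVEN masks come from vmrghb interleaving an all-zero and an all-ones vector byte by byte, giving 0x00FF00FF and 0xFF00FF00 in every word. A quick host-side check of those constants (illustrative only):

  #include <assert.h>
  #include <stdint.h>

  int main(void)
  {
    /* Interleave 0x00/0xFF bytes, as vmrghb ODD, ZERO, NEG does. */
    uint32_t odd = 0, even = 0;
    for (int i = 0; i < 4; i++)
      {
        odd  = (odd  << 8) | (i & 1 ? 0xFFu : 0x00u);
        even = (even << 8) | (i & 1 ? 0x00u : 0xFFu);
      }
    assert(odd == 0x00FF00FFu && even == 0xFF00FF00u);
    return 0;
  }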
@@ -131,6 +182,8 @@ PROLOGUE(_nettle_chacha_core)
vadduwm X2, X2, S2
vadduwm X3, X3, S3
+ LE_SWAP32(X0, X1, X2, X3)
+
stxvw4x VSR(X0), 0, DST
stxvw4x VSR(X1), r6, DST
stxvw4x VSR(X2), r7, DST
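Note: the swap sits between the final state additions and the stxvw4x stores because ChaCha output is defined as the little-endian encoding of the state words; a big-endian build must byte-reverse each word before storing it through dst. On little-endian builds IF_BE expands to nothing, so the stores are unaffected.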
--
2.17.1