I reuploaded the patch as attachment since it didn't apply due to email line breaks, I also fixed the gcm table alignment issue, Thanks to Niels Möller.
On Tue, Nov 10, 2020 at 6:25 AM Maamoun TK maamoun.tk@googlemail.com wrote:
This implementation takes advantage of research made by Niels Möller to optimize GCM on PowerPC, this optimization yields a +27.7% performance boost on POWER8 over the previous implementation that was based on intel documents. The performance comparison is made by processing 4 blocks per loop without any further optimizations. I made some documentations between the lines but I suggest writing a document similar to the intel ones that go into more details and clarify the preference of this method. I'm also curious if this method can also make a difference in other architectures like ARM, I'm planning to try it out for ARM to figure that out.
configure.ac | 6 +- gcm.c | 49 +++-- powerpc64/p8/gcm-hash.asm | 502 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 542 insertions(+), 15 deletions(-) create mode 100644 powerpc64/p8/gcm-hash.asm
diff --git a/configure.ac b/configure.ac index 2a47f940..20f7cf74 100644 --- a/configure.ac +++ b/configure.ac @@ -497,7 +497,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
# Assembler files which generate additional object files if they are used. -asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \ +asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \ aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \ chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \ salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \ @@ -621,9 +621,9 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_ecc_secp384r1_redc #undef HAVE_NATIVE_ecc_secp521r1_modp #undef HAVE_NATIVE_ecc_secp521r1_redc -#undef HAVE_NATIVE_gcm_init_key8 +#undef HAVE_NATIVE_gcm_init_key +#undef HAVE_NATIVE_gcm_hash #undef HAVE_NATIVE_gcm_hash8 -#undef HAVE_NATIVE_gcm_fill #undef HAVE_NATIVE_salsa20_core #undef HAVE_NATIVE_salsa20_2core #undef HAVE_NATIVE_fat_salsa20_2core diff --git a/gcm.c b/gcm.c index 48b3e75a..81981c1c 100644 --- a/gcm.c +++ b/gcm.c @@ -140,6 +140,19 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) memcpy (x->b, Z.b, sizeof(Z)); } # elif GCM_TABLE_BITS == 8 +# if HAVE_NATIVE_gcm_init_key
+#define gcm_init_key _nettle_gcm_init_key +void +_nettle_gcm_init_key (union nettle_block16 *table); +# endif /* HAVE_NATIVE_gcm_init_key */ +# if HAVE_NATIVE_gcm_hash
+#define gcm_hash _nettle_gcm_hash +void +_nettle_gcm_hash (const struct gcm_key *key, union nettle_block16 *x,
- size_t length, const uint8_t *data);
+# endif /* HAVE_NATIVE_gcm_hash */ # if HAVE_NATIVE_gcm_hash8
#define gcm_hash _nettle_gcm_hash8 @@ -228,6 +241,29 @@ gcm_gf_mul (union nettle_block16 *x, const union nettle_block16 *table) /* Increment the rightmost 32 bits. */ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
+#ifndef gcm_init_key +static void +gcm_init_key(union nettle_block16 *table) +{ +#if GCM_TABLE_BITS
- /* Middle element if GCM_TABLE_BITS > 0, otherwise the first
element */
- unsigned i = (1<<GCM_TABLE_BITS)/2;
- /* Algorithm 3 from the gcm paper. First do powers of two, then do
the rest by adding. */
- while (i /= 2)
- block16_mulx_ghash(&table[i], &table[2*i]);
- for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
- {
unsigned j;
for (j = 1; j < i; j++)
- block16_xor3(&table[i+j], &table[i], &table[j]);
- }
+#endif +} +#endif /* !gcm_init_key */
/* Initialization of GCM.
- @ctx: The context of GCM
- @cipher: The context of the underlying block cipher
@@ -245,18 +281,7 @@ gcm_set_key(struct gcm_key *key, memset(key->h[0].b, 0, GCM_BLOCK_SIZE); f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
-#if GCM_TABLE_BITS
- /* Algorithm 3 from the gcm paper. First do powers of two, then do
the rest by adding. */
- while (i /= 2)
- block16_mulx_ghash(&key->h[i], &key->h[2*i]);
- for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
- {
unsigned j;
for (j = 1; j < i; j++)
- block16_xor3(&key->h[i+j], &key->h[i],&key->h[j]);
- }
-#endif
- gcm_init_key(key->h);
}
#ifndef gcm_hash diff --git a/powerpc64/p8/gcm-hash.asm b/powerpc64/p8/gcm-hash.asm new file mode 100644 index 00000000..e79fbdc2 --- /dev/null +++ b/powerpc64/p8/gcm-hash.asm @@ -0,0 +1,502 @@ +C powerpc64/p8/gcm-hash.asm
+ifelse(`
- Copyright (C) 2020 Niels Möller and Mamone Tarsha
- This file is part of GNU Nettle.
- GNU Nettle is free software: you can redistribute it and/or
- modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your
option) any later version.
- or
* the GNU General Public License as published by the Free
Software Foundation; either version 2 of the License, or (at your
option) any later version.
- or both in parallel, as here.
- GNU Nettle is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
- You should have received copies of the GNU General Public License and
- the GNU Lesser General Public License along with this program. If
- not, see http://www.gnu.org/licenses/.
+')
+C Alignment of gcm_key table elements, which is declared in gcm.h +define(`TableElemAlign', `0x100')
+C Register usage:
+define(`SP', `r1') +define(`TOCP', `r2')
+define(`TABLE', `r3')
+define(`ZERO', `v0') +define(`B1', `v1') +define(`EMSB', `v16') +define(`POLY', `v17') +define(`POLY_L', `v1')
+define(`H', `v2') +define(`H2', `v3') +define(`H3', `v4') +define(`H4', `v5') +define(`H1M', `v6') +define(`H1L', `v7') +define(`H2M', `v8') +define(`H2L', `v9') +define(`Hl', `v10') +define(`Hm', `v11') +define(`Hp', `v12') +define(`Hl2', `v13') +define(`Hm2', `v14') +define(`Hp2', `v15') +define(`R', `v13') +define(`F', `v14') +define(`T', `v15') +define(`R2', `v16') +define(`F2', `v17') +define(`T2', `v18')
+define(`LE_TEMP', `v18') +define(`LE_MASK', `v19')
+.file "gcm-hash.asm"
+.text
- C void gcm_init_key (union gcm_block *table)
+C This function populates the gcm table as the following layout +C
+C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ | +C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴) | +C | | +C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ | +C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴) | +C | | +C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ | +C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴) | +C | | +C | H4M = (H3 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴ | +C | H4L = (H3 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) | +C
+define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_init_key)
- DATA_LOAD_VEC(POLY,.polynomial,r7) C
0xC2000000000000000000000000000001 +IF_LE(`
- li r8,0
- lvsl LE_MASK,0,r8 C
0x000102030405060708090A0B0C0D0E0F
- vspltisb LE_TEMP,0x07 C
0x07070707070707070707070707070707
- vxor LE_MASK,LE_MASK,LE_TEMP C
0x07060504030201000F0E0D0C0B0A0908 +')
- C 'H' is assigned by gcm_set_key() to the middle element of the table
- li r10,8*TableElemAlign
- lxvd2x VSR(H),r10,TABLE C load 'H'
- C byte-reverse of each doubleword permuting on little-endian mode
+IF_LE(`
- vperm H,H,H,LE_MASK
+')
- C --- calculate H = H << 1 mod P(X), P(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1)
- vupkhsb EMSB,H C extend most
significant bit to first byte
- vspltisb B1,1 C
0x01010101010101010101010101010101
- vspltb EMSB,EMSB,0 C first byte
quadword-extend
- vsl H,H,B1 C H = H << 1
- vand EMSB,EMSB,POLY C EMSB &=
0xC2000000000000000000000000000001
- vxor ZERO,ZERO,ZERO C
0x00000000000000000000000000000000
- vxor H,H,EMSB C H ^= EMSB
- C --- calculate H^2 = H*H ---
- xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY) C
0x0000000000000000C200000000000000
- C --- Hp = (H mod x⁶⁴) / x⁶⁴ mod P(X) ---
- C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) mod P(X), deg(Hp) ≤ 127 ---
- C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) ---
- vpmsumd Hp,H,POLY_L C Hp = (H mod x⁶⁴) ×
(x⁶³+x⁶²+x⁵⁷)
- xxmrgld VSR(Hl),VSR(H),VSR(ZERO) C Hl = (H mod x⁶⁴) × x⁶⁴
- xxswapd VSR(Hm),VSR(H)
- vxor Hl,Hl,Hp C Hl = Hl + Hp
- vxor Hm,Hm,Hp C Hm = Hm + Hp
- xxmrghd VSR(H1M),VSR(H),VSR(Hl) C H1M = (H div x⁶⁴)||(Hl
div x⁶⁴)
- xxmrgld VSR(H1L),VSR(H),VSR(Hm) C H1L = (H mod x⁶⁴)||(Hl
mod x⁶⁴)
- vpmsumd F,H1L,H C F = (H1Lh × Hh) +
(H1Ll × Hl)
- vpmsumd R,H1M,H C R = (H1Mh × Hh) +
(H1Ml × Hl)
- C --- rduction ---
- vpmsumd T,F,POLY_L C T = (F mod x⁶⁴) ×
(x⁶³+x⁶²+x⁵⁷)
- xxswapd VSR(H2),VSR(F)
- vxor R,R,T C R = R + T
- vxor H2,R,H2
- xxmrgld VSR(Hl),VSR(H2),VSR(ZERO)
- xxswapd VSR(Hm),VSR(H2)
- vpmsumd Hp,H2,POLY_L
- vxor Hl,Hl,Hp
- vxor Hm,Hm,Hp
- xxmrghd VSR(H2M),VSR(H2),VSR(Hl)
- xxmrgld VSR(H2L),VSR(H2),VSR(Hm)
- C store H1M, H1L, H2M, H2L
- li r8,1*TableElemAlign
- li r9,2*TableElemAlign
- li r10,3*TableElemAlign
- stxvd2x VSR(H1M),0,TABLE
- stxvd2x VSR(H1L),r8,TABLE
- stxvd2x VSR(H2M),r9,TABLE
- stxvd2x VSR(H2L),r10,TABLE
- C --- calculate H^3 = H^1*H^2, H^4 = H^2*H^2 ---
- vpmsumd F,H1L,H2
- vpmsumd F2,H2L,H2
- vpmsumd R,H1M,H2
- vpmsumd R2,H2M,H2
- vpmsumd T,F,POLY_L
- vpmsumd T2,F2,POLY_L
- xxswapd VSR(H3),VSR(F)
- xxswapd VSR(H4),VSR(F2)
- vxor R,R,T
- vxor R2,R2,T2
- vxor H3,R,H3
- vxor H4,R2,H4
- xxmrgld VSR(Hl),VSR(H3),VSR(ZERO)
- xxmrgld VSR(Hl2),VSR(H4),VSR(ZERO)
- xxswapd VSR(Hm),VSR(H3)
- xxswapd VSR(Hm2),VSR(H4)
- vpmsumd Hp,H3,POLY_L
- vpmsumd Hp2,H4,POLY_L
- vxor Hl,Hl,Hp
- vxor Hl2,Hl2,Hp2
- vxor Hm,Hm,Hp
- vxor Hm2,Hm2,Hp2
- xxmrghd VSR(H1M),VSR(H3),VSR(Hl)
- xxmrghd VSR(H2M),VSR(H4),VSR(Hl2)
- xxmrgld VSR(H1L),VSR(H3),VSR(Hm)
- xxmrgld VSR(H2L),VSR(H4),VSR(Hm2)
- C store H3M, H3L, H4M, H4L
- li r7,4*TableElemAlign
- li r8,5*TableElemAlign
- li r9,6*TableElemAlign
- li r10,7*TableElemAlign
- stxvd2x VSR(H1M),r7,TABLE
- stxvd2x VSR(H1L),r8,TABLE
- stxvd2x VSR(H2M),r9,TABLE
- stxvd2x VSR(H2L),r10,TABLE
- blr
+EPILOGUE(_nettle_gcm_init_key)
+define(`TABLE', `r3') +define(`X', `r4') +define(`LENGTH', `r5') +define(`DATA', `r6')
+define(`ZERO', `v16') +define(`POLY', `v17') +define(`POLY_L', `v0')
+define(`D', `v1') +define(`C0', `v2') +define(`C1', `v3') +define(`C2', `v4') +define(`C3', `v5') +define(`H1M', `v6') +define(`H1L', `v7') +define(`H2M', `v8') +define(`H2L', `v9') +define(`H3M', `v10') +define(`H3L', `v11') +define(`H4M', `v12') +define(`H4L', `v13') +define(`R', `v14') +define(`F', `v15') +define(`R2', `v16') +define(`F2', `v17') +define(`R3', `v18') +define(`F3', `v20') +define(`R4', `v21') +define(`F4', `v22') +define(`T', `v23')
+define(`LE_TEMP', `v18') +define(`LE_MASK', `v19')
- C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
- C size_t length, const uint8_t *data)
+define(`FUNC_ALIGN', `5') +PROLOGUE(_nettle_gcm_hash)
- DATA_LOAD_VEC(POLY,.polynomial,r7)
+IF_LE(`
- li r8,0
- lvsl LE_MASK,0,r8
- vspltisb LE_TEMP,0x07
- vxor LE_MASK,LE_MASK,LE_TEMP
+')
- vxor ZERO,ZERO,ZERO
- xxmrghd VSR(POLY_L),VSR(ZERO),VSR(POLY)
- lxvd2x VSR(D),0,X C load 'X' pointer
- C byte-reverse of each doubleword permuting on little-endian mode
+IF_LE(`
- vperm D,D,D,LE_MASK
+')
- C --- process 4 blocks '128-bit each' per one loop ---
- srdi r7,LENGTH,6 C 4-blocks loop count
'LENGTH / (4 * 16)'
- cmpldi r7,0
- beq L2x
- mtctr r7 C assign counter
register to loop count
- C store non-volatile vector registers
- addi r8,SP,-64
- stvx 20,0,r8
- addi r8,r8,16
- stvx 21,0,r8
- addi r8,r8,16
- stvx 22,0,r8
- addi r8,r8,16
- stvx 23,0,r8
- C load table elements
- li r8,1*TableElemAlign
- li r9,2*TableElemAlign
- li r10,3*TableElemAlign
- lxvd2x VSR(H1M),0,TABLE
- lxvd2x VSR(H1L),r8,TABLE
- lxvd2x VSR(H2M),r9,TABLE
- lxvd2x VSR(H2L),r10,TABLE
- li r7,4*TableElemAlign
- li r8,5*TableElemAlign
- li r9,6*TableElemAlign
- li r10,7*TableElemAlign
- lxvd2x VSR(H3M),r7,TABLE
- lxvd2x VSR(H3L),r8,TABLE
- lxvd2x VSR(H4M),r9,TABLE
- lxvd2x VSR(H4L),r10,TABLE
- li r8,0x10
- li r9,0x20
- li r10,0x30
+.align 5 +L4x_loop:
- C input loading
- lxvd2x VSR(C0),0,DATA C load C0
- lxvd2x VSR(C1),r8,DATA C load C1
- lxvd2x VSR(C2),r9,DATA C load C2
- lxvd2x VSR(C3),r10,DATA C load C3
+IF_LE(`
- vperm C0,C0,C0,LE_MASK
- vperm C1,C1,C1,LE_MASK
- vperm C2,C2,C2,LE_MASK
- vperm C3,C3,C3,LE_MASK
+')
- C previous digest combining
- vxor C0,C0,D
- C polynomial multiplication
- vpmsumd F2,H3L,C1
- vpmsumd R2,H3M,C1
- vpmsumd F3,H2L,C2
- vpmsumd R3,H2M,C2
- vpmsumd F4,H1L,C3
- vpmsumd R4,H1M,C3
- vpmsumd F,H4L,C0
- vpmsumd R,H4M,C0
- C deferred recombination of partial products
- vxor F3,F3,F4
- vxor R3,R3,R4
- vxor F,F,F2
- vxor R,R,R2
- vxor F,F,F3
- vxor R,R,R3
- C reduction
- vpmsumd T,F,POLY_L
- xxswapd VSR(D),VSR(F)
- vxor R,R,T
- vxor D,R,D
- addi DATA,DATA,0x40
- bdnz L4x_loop
- C restore non-volatile vector registers
- addi r8,SP,-64
- lvx 20,0,r8
- addi r8,r8,16
- lvx 21,0,r8
- addi r8,r8,16
- lvx 22,0,r8
- addi r8,r8,16
- lvx 23,0,r8
- clrldi LENGTH,LENGTH,58 C 'set the high-order 58
bits to zeros' +L2x:
- C --- process 2 blocks ---
- srdi r7,LENGTH,5 C 'LENGTH / (2 * 16)'
- cmpldi r7,0
- beq L1x
- C load table elements
- li r8,1*TableElemAlign
- li r9,2*TableElemAlign
- li r10,3*TableElemAlign
- lxvd2x VSR(H1M),0,TABLE
- lxvd2x VSR(H1L),r8,TABLE
- lxvd2x VSR(H2M),r9,TABLE
- lxvd2x VSR(H2L),r10,TABLE
- C input loading
- li r10,0x10
- lxvd2x VSR(C0),0,DATA C load C0
- lxvd2x VSR(C1),r10,DATA C load C1
+IF_LE(`
- vperm C0,C0,C0,LE_MASK
- vperm C1,C1,C1,LE_MASK
+')
- C previous digest combining
- vxor C0,C0,D
- C polynomial multiplication
- vpmsumd F2,H1L,C1
- vpmsumd R2,H1M,C1
- vpmsumd F,H2L,C0
- vpmsumd R,H2M,C0
- C deferred recombination of partial products
- vxor F,F,F2
- vxor R,R,R2
- C reduction
- vpmsumd T,F,POLY_L
- xxswapd VSR(D),VSR(F)
- vxor R,R,T
- vxor D,R,D
- addi DATA,DATA,0x20
- clrldi LENGTH,LENGTH,59 C 'set the high-order 59
bits to zeros' +L1x:
- C --- process 1 block ---
- srdi r7,LENGTH,4 C 'LENGTH / (1 * 16)'
- cmpldi r7,0
- beq Lmod
- C load table elements
- li r8,1*TableElemAlign
- lxvd2x VSR(H1M),0,TABLE
- lxvd2x VSR(H1L),r8,TABLE
- C input loading
- lxvd2x VSR(C0),0,DATA C load C0
+IF_LE(`
- vperm C0,C0,C0,LE_MASK
+')
- C previous digest combining
- vxor C0,C0,D
- C polynomial multiplication
- vpmsumd F,H1L,C0
- vpmsumd R,H1M,C0
- C reduction
- vpmsumd T,F,POLY_L
- xxswapd VSR(D),VSR(F)
- vxor R,R,T
- vxor D,R,D
- addi DATA,DATA,0x10
- clrldi LENGTH,LENGTH,60 C 'set the high-order 60
bits to zeros' +Lmod:
- C --- process the modulo bytes, padding the low-order bytes with
zeros ---
- cmpldi LENGTH,0
- beq Ldone
- C load table elements
- li r8,1*TableElemAlign
- lxvd2x VSR(H1M),0,TABLE
- lxvd2x VSR(H1L),r8,TABLE
- C push every modulo byte to the stack and load them with padding into
vector register
- vxor ZERO,ZERO,ZERO
- addi r8,SP,-16
- stvx ZERO,0,r8
+Lstb_loop:
- subic. LENGTH,LENGTH,1
- lbzx r7,LENGTH,DATA
- stbx r7,LENGTH,r8
- bne Lstb_loop
- lxvd2x VSR(C0),0,r8
+IF_LE(`
- vperm C0,C0,C0,LE_MASK
+')
- C previous digest combining
- vxor C0,C0,D
- C polynomial multiplication
- vpmsumd F,H1L,C0
- vpmsumd R,H1M,C0
- C reduction
- vpmsumd T,F,POLY_L
- xxswapd VSR(D),VSR(F)
- vxor R,R,T
- vxor D,R,D
+Ldone:
- C byte-reverse of each doubleword permuting on little-endian mode
+IF_LE(`
- vperm D,D,D,LE_MASK
+')
- stxvd2x VSR(D),0,X C store digest 'D'
- blr
+EPILOGUE(_nettle_gcm_hash)
+.data
- C 0xC2000000000000000000000000000001
+.polynomial: +.align 4 +IF_BE(` +.byte 0xC2 +.rept 14 +.byte 0x00 +.endr +.byte 0x01 +',` +.byte 0x01 +.rept 14 +.byte 0x00 +.endr +.byte 0xC2 +')
-- 2.17.1