nisse@lysator.liu.se (Niels Möller) writes:
This is the speed I get for C implementations of poly1305_update on my x86_64 laptop:
Radix 26: 1.2 GByte/s (old code)
Radix 32: 1.3 GByte/s
Radix 64: 2.2 GByte/s
[...]
For comparison, the current x86_64 asm version: 2.5 GByte/s.
[...]
I've tried reworking folding, to reduce latency [...] With this trick I get on the same machine
Radix 32: 1.65 GByte/s
Radix 64: 2.75 GByte/s, i.e., faster than current x86_64 asm version.
And I've now tried the same method for the x86_64 implementation. See attached file + needed patch to asm.m4. This gives 2.9 GByte/s.
I'm not entirely sure cycle numbers are accurate, with clock frequence not being fixed. I think the machine runs bechmarks at 2.1GHz, and then this corresponds to 11.5 cycles per block, 0.7 cycles per byte, 4 instructions per cycle, 0.5 multiply instructions per cycle.
This laptop has an AMD zen2 processor, which should be capable of issuing four instructions per cycle and complete one multiply instruction per cycle (according to https://gmplib.org/~tege/x86-timing.pdf).
This seems to indicate that on this hardware, speed is not limited by multiplier throughput, instead, the bottleneck is instruction decoding/issuing, with max four instructions per cycle.
Regards, /Niels
C x86_64/poly1305-internal.asm
ifelse(` Copyright (C) 2013 Niels Möller
This file is part of GNU Nettle.
GNU Nettle is free software: you can redistribute it and/or modify it under the terms of either:
* the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version.
or
* the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
or both in parallel, as here.
GNU Nettle is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received copies of the GNU General Public License and the GNU Lesser General Public License along with this program. If not, see http://www.gnu.org/licenses/. ')
.file "poly1305-internal.asm"
C Registers mainly used by poly1305_block define(`CTX', `%rdi') C First argument to all functions
define(`KEY', `%rsi') define(`MASK',` %r8') C _poly1305_set_key(struct poly1305_ctx *ctx, const uint8_t key[16]) .text ALIGN(16) PROLOGUE(_nettle_poly1305_set_key) W64_ENTRY(2,0) mov $0x0ffffffc0fffffff, MASK mov (KEY), %rax and MASK, %rax and $-4, MASK mov %rax, P1305_R0 (CTX) imul $5, %rax mov %rax, P1305_S0 (CTX) mov 8(KEY), %rax and MASK, %rax mov %rax, P1305_R1 (CTX) shr $2, %rax imul $5, %rax mov %rax, P1305_S1 (CTX) xor XREG(%rax), XREG(%rax) mov %rax, P1305_H0 (CTX) mov %rax, P1305_H1 (CTX) mov %rax, P1305_H2 (CTX) W64_EXIT(2,0) ret
undefine(`KEY') undefine(`MASK')
EPILOGUE(_nettle_poly1305_set_key)
define(`T0', `%rcx') define(`T1', `%rsi') C Overlaps message input pointer. define(`T2', `%r8') define(`H0', `%r9') define(`H1', `%r10') define(`F0', `%r11') define(`F1', `%r12')
C Compute in parallel C C {H1,H0} = R0 T0 + S1 T1 + S0 (T2 >> 2) C {F1,F0} = R1 T0 + R0 T1 + S1 T2 C T = R0 * (T2 & 3) C C Then accumulate as C C +--+--+--+ C |T |H1|H0| C +--+--+--+ C + |F1|F0| C --+--+--+--+ C |H2|H1|H0| C +--+--+--+
C _poly1305_block (struct poly1305_ctx *ctx, const uint8_t m[16], unsigned hi) PROLOGUE(_nettle_poly1305_block) W64_ENTRY(3, 0) push %r12 mov (%rsi), T0 mov 8(%rsi), T1 mov XREG(%rdx), XREG(T2) C Also zero extends
add P1305_H0 (CTX), T0 adc P1305_H1 (CTX), T1 adc P1305_H2 (CTX), T2
mov P1305_R1 (CTX), %rax mul T0 C R1 T0 mov %rax, F0 mov %rdx, F1
mov T0, %rax C Last use of T0 input mov P1305_R0 (CTX), T0 mul T0 C R0*T0 mov %rax, H0 mov %rdx, H1
mov T1, %rax mul T0 C R0*T1 add %rax, F0 adc %rdx, F1
mov P1305_S1 (CTX), T0 mov T1, %rax C Last use of T1 input mul T0 C S1*T1 add %rax, H0 adc %rdx, H1
mov T2, %rax mul T0 C S1*T2 add %rax, F0 adc %rdx, F1
mov $3, XREG(T1) and T2, T1
shr $2, T2 mov P1305_S0 (CTX), %rax mul T2 C S0*(T2 >> 2) add %rax, H0 adc %rdx, H1
imul P1305_R0 (CTX), T1 C R0*(T2 & 3) add F0, H1 adc T1, F1
mov H0, P1305_H0 (CTX) mov H1, P1305_H1 (CTX) mov F1, P1305_H2 (CTX) pop %r12 W64_EXIT(3, 0) ret EPILOGUE(_nettle_poly1305_block) undefine(`T0') undefine(`T1') undefine(`T2') undefine(`H0') undefine(`H1') undefine(`F0') undefine(`F1')
C _poly1305_digest (struct poly1305_ctx *ctx, uint8_t *s) define(`S', `%rsi')
define(`T0', `%rcx') define(`T1', `%r8') define(`H0', `%r9') define(`H1', `%r10') define(`F0', `%r11') define(`F1', `%rrd') C Overlaps CTX
PROLOGUE(_nettle_poly1305_digest) W64_ENTRY(2, 0)
mov P1305_H0 (CTX), H0 mov P1305_H1 (CTX), H1 mov P1305_H2 (CTX), F0
xor XREG(%rax), XREG(%rax) mov %rax, P1305_H0 (CTX) mov %rax, P1305_H1 (CTX) mov %rax, P1305_H2 (CTX)
mov $3, XREG(%rax) and XREG(F0), XREG(%rax) shr $2, F0 imul $5, F0 add F0, H0 adc $0, H1 adc $0, XREG(%rax)
C Add 5, use result if >= 2^130 mov $5, T0 xor T1, T1 add H0, T0 adc H1, T1 adc $0, XREG(%rax) C Use adc $-4 ? cmp $4, XREG(%rax) cmovnc T0, H0 cmovnc T1, H1
add H0, (S) adc H1, 8(S)
W64_EXIT(2, 0) ret EPILOGUE(_nettle_poly1305_digest)