From f3e529a40fa376a9ce73a229fa223655504a1ac5 Mon Sep 17 00:00:00 2001
From: Nikos Mavrogiannopoulos
Date: Fri, 16 Jan 2015 16:43:23 +0100
Subject: [PATCH] Select SSE2 XOR when on Intel x86-64

---
 x86_64/fat/fat.c      |  33 +++++++
 x86_64/fat/memxor.asm | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 283 insertions(+)
 create mode 100644 x86_64/fat/memxor.asm

diff --git a/x86_64/fat/fat.c b/x86_64/fat/fat.c
index 3585cf5..e892537 100644
--- a/x86_64/fat/fat.c
+++ b/x86_64/fat/fat.c
@@ -36,6 +36,7 @@
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #include "nettle-types.h"
 
@@ -96,6 +97,12 @@ aes_crypt_internal_func _aes_decrypt IFUNC ("_aes_decrypt_resolve");
 aes_crypt_internal_func _nettle_aes_decrypt_x86_64;
 aes_crypt_internal_func _nettle_aes_decrypt_aesni;
 
+typedef void *(memxor_func)(void *dst_in, const void *src_in, size_t n);
+
+memxor_func nettle_memxor IFUNC ("_memxor_resolve");
+memxor_func _nettle_memxor_x86_64;
+memxor_func _nettle_memxor_sse2;
+
 #if HAVE_LINK_IFUNC
 #define _aes_encrypt_init NULL
 #define _aes_decrypt_init NULL
@@ -106,6 +113,7 @@ static aes_crypt_internal_func _aes_decrypt_init;
 
 static aes_crypt_internal_func *_aes_encrypt_vec = _aes_encrypt_init;
 static aes_crypt_internal_func *_aes_decrypt_vec = _aes_decrypt_init;
+static memxor_func *_memxor_vec = _nettle_memxor_x86_64;
 
 /* This function should usually be called only once, at startup. But
    it is idempotent, and on x86, pointer updates are atomic, so
@@ -144,6 +152,16 @@ fat_init (void)
       _aes_encrypt_vec = _nettle_aes_encrypt_x86_64;
       _aes_decrypt_vec = _nettle_aes_decrypt_x86_64;
     }
+
+  _nettle_cpuid (0, cpuid_data);
+  if (memcmp(&cpuid_data[1], "Genu", 4) == 0 &&
+      memcmp(&cpuid_data[3], "ineI", 4) == 0 &&
+      memcmp(&cpuid_data[2], "ntel", 4) == 0) {
+    if (verbose)
+      fprintf (stderr, "libnettle: intel SSE2 will be used for XOR.\n");
+    _memxor_vec = _nettle_memxor_sse2;
+  }
+
   /* FIXME: We ought to use some thread-aware memory barrier before
      setting the initialized flag. For now, just do another cpuinfo
      call to get some synchronization. */
@@ -179,6 +197,15 @@ _aes_decrypt_resolve (void)
   return (void_func *) _aes_decrypt_vec;
 }
 
+static void_func *
+_memxor_resolve (void)
+{
+  if (getenv ("NETTLE_FAT_VERBOSE"))
+    fprintf (stderr, "libnettle: _memxor_resolve\n");
+  fat_init ();
+  return (void_func *) _memxor_vec;
+}
+
 #else /* !HAVE_LINK_IFUNC */
 
 /* We need wrapper functions jumping via the function pointer. */
@@ -226,4 +253,10 @@ _aes_decrypt_init (unsigned rounds, const uint32_t *keys,
   _aes_decrypt (rounds, keys, T, length, dst, src);
 }
 
+void *
+memxor(void *dst_in, const void *src_in, size_t n)
+{
+  return _memxor_vec (dst_in, src_in, n);
+}
+
 #endif /* !HAVE_LINK_IFUNC */
diff --git a/x86_64/fat/memxor.asm b/x86_64/fat/memxor.asm
new file mode 100644
index 0000000..118447a
--- /dev/null
+++ b/x86_64/fat/memxor.asm
@@ -0,0 +1,250 @@
+C x86_64/memxor.asm
+
+ifelse(<
+   Copyright (C) 2010, 2014, Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+define(<DST>, <%rax>)	C Originally in %rdi
+define(<SRC>, <%rsi>)
+define(<N>, <%rdx>)
+define(<TMP>, <%r8>)
+define(<TMP2>, <%r9>)
+define(<CNT>, <%rdi>)
+define(<S0>, <%r11>)
+define(<S1>, <%rdi>)	C Overlaps with CNT
+
+	.file "memxor.asm"
+
+	.text
+
+	C memxor(void *dst, const void *src, size_t n)
+	C          %rdi              %rsi       %rdx
+	ALIGN(16)
+
+PROLOGUE(_nettle_memxor_x86_64)
+	W64_ENTRY(3, 0)
+
+	test	N, N
+	C Get number of unaligned bytes at the end
+	C %rdi is used as CNT, %rax as DST and as return value
+	mov	%rdi, %rax
+	jz	.Ldone
+	add	N, CNT
+	and	$7, CNT
+
+	jz	.Laligned
+
+	cmp	$8, N
+	jc	.Lfinal_next
+
+	C FIXME: Instead of this loop, could try cmov with memory
+	C destination, as a sequence of one 8-bit, one 16-bit and one
+	C 32-bit operations. (Except that cmov can't do 8-bit ops, so
+	C that step has to use a conditional).
+.Lalign_loop:
+
+	sub	$1, N
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
+	sub	$1, CNT
+	jnz	.Lalign_loop
+
+.Laligned:
+
+	C Next destination word is -8(DST, N)
+	C Setup for unrolling
+	test	$8, N
+	jz	.Lword_next
+
+	sub	$8, N
+	jz	.Lone_word
+
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
+
+	jmp	.Lword_next
+
+	ALIGN(16)
+
+.Lword_loop:
+	mov	8(SRC, N), TMP
+	mov	(SRC, N), TMP2
+	xor	TMP, 8(DST, N)
+	xor	TMP2, (DST, N)
+
+.Lword_next:
+	sub	$16, N
+	ja	.Lword_loop	C Not zero and no carry
+	jnz	.Lfinal
+
+	C Final operation is word aligned
+	mov	8(SRC, N), TMP
+	xor	TMP, 8(DST, N)
+
+.Lone_word:
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
+
+	W64_EXIT(3, 0)
+	ret
+
+.Lfinal:
+	add	$15, N
+
+.Lfinal_loop:
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
+.Lfinal_next:
+	sub	$1, N
+	jnc	.Lfinal_loop
+
+.Ldone:
+	W64_EXIT(3, 0)
+	ret
+
+EPILOGUE(_nettle_memxor_x86_64)
+
+PROLOGUE(_nettle_memxor_sse2)
+	W64_ENTRY(3, 0)
+
+	test	N, N
+	C Get number of unaligned bytes at the end
+	C %rdi is used as CNT, %rax as DST and as return value
+	mov	%rdi, %rax
+	jz	.SLdone
+	add	N, CNT
+	and	$7, CNT
+
+	jz	.SLaligned
+
+	cmp	$8, N
+	jc	.SLfinal_next
+
+	C FIXME: Instead of this loop, could try cmov with memory
+	C destination, as a sequence of one 8-bit, one 16-bit and one
+	C 32-bit operations. (Except that cmov can't do 8-bit ops, so
+	C that step has to use a conditional).
+.SLalign_loop:
+
+	sub	$1, N
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
+	sub	$1, CNT
+	jnz	.SLalign_loop
+
+.SLaligned:
+	cmp	$16, N
+	jnc	.SLsse2_case
+
+	C Next destination word is -8(DST, N)
+	C Setup for unrolling
+	test	$8, N
+	jz	.SLword_next
+
+	sub	$8, N
+	jz	.SLone_word
+
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
+
+	jmp	.SLword_next
+
+	ALIGN(16)
+
+.SLword_loop:
+	mov	8(SRC, N), TMP
+	mov	(SRC, N), TMP2
+	xor	TMP, 8(DST, N)
+	xor	TMP2, (DST, N)
+
+.SLword_next:
+	sub	$16, N
+	ja	.SLword_loop	C Not zero and no carry
+	jnz	.SLfinal
+
+	C Final operation is word aligned
+	mov	8(SRC, N), TMP
+	xor	TMP, 8(DST, N)
+
+.SLone_word:
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
+
+	W64_EXIT(3, 0)
+	ret
+
+.SLfinal:
+	add	$15, N
+
+.SLfinal_loop:
+	movb	(SRC, N), LREG(TMP)
+	xorb	LREG(TMP), (DST, N)
+.SLfinal_next:
+	sub	$1, N
+	jnc	.SLfinal_loop
+
+.SLdone:
+	W64_EXIT(3, 0)
+	ret
+
+.SLsse2_case:
+	lea	(DST, N), TMP
+	test	$8, TMP
+	jz	.SLsse2_next
+	sub	$8, N
+	mov	(SRC, N), TMP
+	xor	TMP, (DST, N)
+	jmp	.SLsse2_next
+
+	ALIGN(16)
+.SLsse2_loop:
+	movdqu	(SRC, N), %xmm0
+	movdqa	(DST, N), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST, N)
+.SLsse2_next:
+	sub	$16, N
+	ja	.SLsse2_loop
+
+	C FIXME: See if we can do a full word first, before the
+	C byte-wise final loop.
+	jnz	.SLfinal
+
+	C Final operation is aligned
+	movdqu	(SRC), %xmm0
+	movdqa	(DST), %xmm1
+	pxor	%xmm0, %xmm1
+	movdqa	%xmm1, (DST)
+
+	W64_EXIT(3, 0)
+	ret
+
+EPILOGUE(_nettle_memxor_sse2)
+
-- 
2.1.0
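
A note on the vendor check added to fat_init() above: CPUID leaf 0 returns
the vendor string "GenuineIntel" split across EBX ("Genu"), EDX ("ineI")
and ECX ("ntel"), which is why the patch compares cpuid_data[1],
cpuid_data[3] and cpuid_data[2] in that order. A minimal standalone sketch
of the same test in C, assuming GCC's <cpuid.h> wrapper (the helper name
below is made up and is not part of nettle):

  #include <cpuid.h>
  #include <string.h>

  static int
  is_genuine_intel (void)
  {
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid (0, &eax, &ebx, &ecx, &edx))
      return 0;

    /* Same register order as the memcmp calls in the patch:
       EBX = "Genu", EDX = "ineI", ECX = "ntel". */
    return memcmp (&ebx, "Genu", 4) == 0
      && memcmp (&edx, "ineI", 4) == 0
      && memcmp (&ecx, "ntel", 4) == 0;
  }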
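
The "memxor_func nettle_memxor IFUNC ("_memxor_resolve");" declaration
relies on the ELF ifunc mechanism that the HAVE_LINK_IFUNC branch assumes:
the resolver runs once when the dynamic linker binds the symbol, so later
calls jump directly to the variant chosen by fat_init(). A rough
self-contained sketch of that mechanism, with made-up names and assuming
GCC on an ELF target (this is not nettle's fat-setup machinery):

  #include <stddef.h>

  /* Portable fallback, so the sketch is self-contained. */
  static void *
  xor_generic (void *dst, const void *src, size_t n)
  {
    unsigned char *d = dst;
    const unsigned char *s = src;
    for (size_t i = 0; i < n; i++)
      d[i] ^= s[i];
    return dst;
  }

  /* The resolver is called once, at symbol binding time; a real
     resolver would run the CPU detection here and return the SSE2
     variant when appropriate. */
  static void *(*resolve_xor (void)) (void *, const void *, size_t)
  {
    return xor_generic;
  }

  /* Callers of xor_demo are bound to whatever the resolver returned,
     with no extra indirection on subsequent calls. */
  void *xor_demo (void *dst, const void *src, size_t n)
    __attribute__ ((ifunc ("resolve_xor")));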