x86/hweight: Get rid of the special calling convention
author Borislav Petkov <bp@suse.de>
Mon, 30 May 2016 10:56:27 +0000 (12:56 +0200)
committer Ingo Molnar <mingo@kernel.org>
Wed, 8 Jun 2016 13:01:02 +0000 (15:01 +0200)
People complained about ARCH_HWEIGHT_CFLAGS and how it throws a wrench
into kcov, LTO and similar experiments.

Add asm versions for __sw_hweight{32,64}() and do explicit saving and
restoring of clobbered registers. This gets rid of the special calling
convention. We get to call those functions on !X86_FEATURE_POPCNT CPUs.

We still need to hardcode POPCNT and its register operands because some old
gas versions which we support do not know about POPCNT.

Btw, remove redundant REX prefix from 32-bit POPCNT because alternatives
can do padding now.

Suggested-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1464605787-20603-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/Kconfig
arch/x86/include/asm/arch_hweight.h
arch/x86/kernel/i386_ksyms_32.c
arch/x86/kernel/x8664_ksyms_64.c
arch/x86/lib/Makefile
arch/x86/lib/hweight.S [new file with mode: 0644]
lib/Makefile
lib/hweight.c

index 0a7b885..729d41d 100644 (file)
@@ -294,11 +294,6 @@ config X86_32_LAZY_GS
        def_bool y
        depends on X86_32 && !CC_STACKPROTECTOR
 
-config ARCH_HWEIGHT_CFLAGS
-       string
-       default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
-       default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
-
 config ARCH_SUPPORTS_UPROBES
        def_bool y
 
index 02e799f..e7cd631 100644 (file)
@@ -4,8 +4,8 @@
 #include <asm/cpufeatures.h>
 
 #ifdef CONFIG_64BIT
-/* popcnt %edi, %eax -- redundant REX prefix for alignment */
-#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
+/* popcnt %edi, %eax */
+#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7"
 /* popcnt %rdi, %rax */
 #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
 #define REG_IN "D"
 #define REG_OUT "a"
 #endif
 
-/*
- * __sw_hweightXX are called from within the alternatives below
- * and callee-clobbered registers need to be taken care of. See
- * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
- * compiler switches.
- */
+#define __HAVE_ARCH_SW_HWEIGHT
+
 static __always_inline unsigned int __arch_hweight32(unsigned int w)
 {
-       unsigned int res = 0;
+       unsigned int res;
 
        asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
-                    : "="REG_OUT (res)
-                    : REG_IN (w));
+                        : "="REG_OUT (res)
+                        : REG_IN (w));
 
        return res;
 }
@@ -53,11 +49,11 @@ static inline unsigned long __arch_hweight64(__u64 w)
 #else
 static __always_inline unsigned long __arch_hweight64(__u64 w)
 {
-       unsigned long res = 0;
+       unsigned long res;
 
        asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
-                    : "="REG_OUT (res)
-                    : REG_IN (w));
+                        : "="REG_OUT (res)
+                        : REG_IN (w));
 
        return res;
 }
index 64341aa..d40ee8a 100644 (file)
@@ -42,3 +42,5 @@ EXPORT_SYMBOL(empty_zero_page);
 EXPORT_SYMBOL(___preempt_schedule);
 EXPORT_SYMBOL(___preempt_schedule_notrace);
 #endif
+
+EXPORT_SYMBOL(__sw_hweight32);
index cd05942..f1aebfb 100644 (file)
@@ -44,6 +44,9 @@ EXPORT_SYMBOL(clear_page);
 
 EXPORT_SYMBOL(csum_partial);
 
+EXPORT_SYMBOL(__sw_hweight32);
+EXPORT_SYMBOL(__sw_hweight64);
+
 /*
  * Export string functions. We normally rely on gcc builtin for most of these,
  * but gcc sometimes decides not to inline them.
index 72a5767..ec969cc 100644 (file)
@@ -25,7 +25,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
 
-obj-y += msr.o msr-reg.o msr-reg-export.o
+obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 
 ifeq ($(CONFIG_X86_32),y)
         obj-y += atomic64_32.o
diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S
new file mode 100644 (file)
index 0000000..02de3d7
--- /dev/null
@@ -0,0 +1,101 @@
+#include <linux/linkage.h>
+
+#include <asm/asm.h>
+
+/*
+ * unsigned int __sw_hweight32(unsigned int w)
+ *
+ * Software population count of the 32-bit value w.
+ *
+ * Input:  %edi on 64-bit, %eax on 32-bit
+ * Output: %eax
+ *
+ * Called from the ALTERNATIVE() in __arch_hweight32() whose inline asm
+ * declares no clobbers, so every register other than the result must be
+ * preserved: %edx/%rdx is saved and restored here, the input register is
+ * only read.
+ */
+ENTRY(__sw_hweight32)
+
+#ifdef CONFIG_X86_64
+       movl %edi, %eax                         # w
+#endif
+       __ASM_SIZE(push,) %__ASM_REG(dx)
+       movl %eax, %edx                         # w -> t
+       shrl %edx                               # t >>= 1
+       andl $0x55555555, %edx                  # t &= 0x55555555
+       subl %edx, %eax                         # w -= t
+
+       movl %eax, %edx                         # w -> t
+       shrl $2, %eax                           # w_tmp >>= 2
+       andl $0x33333333, %edx                  # t     &= 0x33333333
+       andl $0x33333333, %eax                  # w_tmp &= 0x33333333
+       addl %edx, %eax                         # w = w_tmp + t
+
+       movl %eax, %edx                         # w -> t
+       shrl $4, %edx                           # t >>= 4
+       addl %edx, %eax                         # w_tmp += t
+       andl  $0x0f0f0f0f, %eax                 # w_tmp &= 0x0f0f0f0f
+       imull $0x01010101, %eax, %eax           # w_tmp *= 0x01010101
+       shrl $24, %eax                          # w = w_tmp >> 24
+       __ASM_SIZE(pop,) %__ASM_REG(dx)
+       ret
+ENDPROC(__sw_hweight32)
+
+/*
+ * unsigned long __sw_hweight64(__u64 w)
+ *
+ * Software population count of the 64-bit value w.
+ *
+ * Input:  %rdi on 64-bit, the (%eax,%edx) register pair on 32-bit
+ * Output: %rax on 64-bit, %eax on 32-bit
+ *
+ * Like __sw_hweight32(), this is called from an ALTERNATIVE() whose inline
+ * asm declares no clobbers. The 64-bit version uses %rdi as scratch, so it
+ * has to be saved and restored along with %rdx; otherwise the caller's copy
+ * of w, which the compiler assumes is still live in %rdi, gets corrupted.
+ */
+ENTRY(__sw_hweight64)
+#ifdef CONFIG_X86_64
+       pushq   %rdi                            # modified below, caller expects it intact
+       pushq   %rdx
+
+       movq    %rdi, %rdx                      # w -> t
+       movabsq $0x5555555555555555, %rax
+       shrq    %rdx                            # t >>= 1
+       andq    %rdx, %rax                      # t &= 0x5555555555555555
+       movabsq $0x3333333333333333, %rdx
+       subq    %rax, %rdi                      # w -= t
+
+       movq    %rdi, %rax                      # w -> t
+       shrq    $2, %rdi                        # w_tmp >>= 2
+       andq    %rdx, %rax                      # t     &= 0x3333333333333333
+       andq    %rdi, %rdx                      # w_tmp &= 0x3333333333333333
+       addq    %rdx, %rax                      # w = w_tmp + t
+
+       movq    %rax, %rdx                      # w -> t
+       shrq    $4, %rdx                        # t >>= 4
+       addq    %rdx, %rax                      # w_tmp += t
+       movabsq $0x0f0f0f0f0f0f0f0f, %rdx
+       andq    %rdx, %rax                      # w_tmp &= 0x0f0f0f0f0f0f0f0f
+       movabsq $0x0101010101010101, %rdx
+       imulq   %rdx, %rax                      # w_tmp *= 0x0101010101010101
+       shrq    $56, %rax                       # w = w_tmp >> 56
+
+       popq    %rdx
+       popq    %rdi
+       ret
+#else /* CONFIG_X86_32 */
+       /* We're getting a u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
+       pushl   %ecx
+
+       call    __sw_hweight32
+       movl    %eax, %ecx                      # stash away result
+       movl    %edx, %eax                      # second part of input
+       call    __sw_hweight32
+       addl    %ecx, %eax                      # result
+
+       popl    %ecx
+       ret
+#endif
+ENDPROC(__sw_hweight64)
index ff6a7a6..07d06a8 100644 (file)
@@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n
 KCOV_INSTRUMENT_list_debug.o := n
 KCOV_INSTRUMENT_debugobjects.o := n
 KCOV_INSTRUMENT_dynamic_debug.o := n
-# Kernel does not boot if we instrument this file as it uses custom calling
-# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
-KCOV_INSTRUMENT_hweight.o := n
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
         rbtree.o radix-tree.o dump_stack.o timerqueue.o\
@@ -74,8 +71,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
-GCOV_PROFILE_hweight.o := n
-CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o
index 9a5c1f2..43273a7 100644 (file)
@@ -9,6 +9,7 @@
  * The Hamming Weight of a number is the total number of bits set in it.
  */
 
+#ifndef __HAVE_ARCH_SW_HWEIGHT
 unsigned int __sw_hweight32(unsigned int w)
 {
 #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
@@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w)
 #endif
 }
 EXPORT_SYMBOL(__sw_hweight32);
+#endif
 
 unsigned int __sw_hweight16(unsigned int w)
 {
@@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w)
 }
 EXPORT_SYMBOL(__sw_hweight8);
 
+#ifndef __HAVE_ARCH_SW_HWEIGHT
 unsigned long __sw_hweight64(__u64 w)
 {
 #if BITS_PER_LONG == 32
@@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w)
 #endif
 }
 EXPORT_SYMBOL(__sw_hweight64);
+#endif