Merge tag 'x86-cpu-2021-08-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
authorLinus Torvalds <torvalds@linux-foundation.org>
Mon, 30 Aug 2021 22:00:33 +0000 (15:00 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Mon, 30 Aug 2021 22:00:33 +0000 (15:00 -0700)
Pull x86 cache flush updates from Thomas Gleixner:
 "A reworked version of the opt-in L1D flush mechanism.

  This is a stopgap for potential future speculation-related hardware
  vulnerabilities and a mechanism for truly security-paranoid
  applications.

  It allows a task to request, via prctl(), that the L1D cache be
  flushed when the kernel switches to a different mm.

  Changes vs the previous versions:

   - Get rid of the software flush fallback

   - Make the handling consistent with other mitigations

   - Kill the task when it ends up on an SMT enabled core, which
     obviously defeats the purpose of L1D flushing"

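As a rough usage illustration (a minimal sketch, not part of the series itself): a
task opts in with PR_SET_SPECULATION_CTRL / PR_SPEC_L1D_FLUSH. The constant may be
missing from older uapi headers, so it is defined locally here, and the kernel is
assumed to be booted with the mitigation interface enabled (l1d_flush=on).

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_SPEC_L1D_FLUSH
	#define PR_SPEC_L1D_FLUSH	2	/* value from include/uapi/linux/prctl.h */
	#endif

	int main(void)
	{
		/* Ask for an L1D flush whenever the kernel switches away from this mm. */
		if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH,
			  PR_SPEC_ENABLE, 0, 0))
			perror("enabling PR_SPEC_L1D_FLUSH");

		/* Query the per-task state; a negative return means unsupported. */
		printf("l1d flush state: %d\n",
		       prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_L1D_FLUSH, 0, 0, 0));
		return 0;
	}
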
* tag 'x86-cpu-2021-08-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  Documentation: Add L1D flushing Documentation
  x86, prctl: Hook L1D flushing in via prctl
  x86/mm: Prepare for opt-in based L1D flush in switch_mm()
  x86/process: Make room for TIF_SPEC_L1D_FLUSH
  sched: Add task_work callback for paranoid L1D flush
  x86/mm: Refactor cond_ibpb() to support other use cases
  x86/smp: Add a per-cpu view of SMT state

arch/x86/Kconfig
arch/x86/include/asm/processor.h
include/linux/sched.h

diff --combined arch/x86/Kconfig
@@@ -119,6 -119,7 +119,7 @@@ config X8
        select ARCH_WANT_HUGE_PMD_SHARE
        select ARCH_WANT_LD_ORPHAN_WARN
        select ARCH_WANTS_THP_SWAP              if X86_64
+       select ARCH_HAS_PARANOID_L1D_FLUSH
        select BUILDTIME_TABLE_SORT
        select CLKEVT_I8253
        select CLOCKSOURCE_VALIDATE_LAST_CYCLE
        select HAVE_FUNCTION_TRACER
        select HAVE_GCC_PLUGINS
        select HAVE_HW_BREAKPOINT
 -      select HAVE_IDE
        select HAVE_IOREMAP_PROT
        select HAVE_IRQ_EXIT_ON_IRQ_STACK       if X86_64
        select HAVE_IRQ_TIME_ACCOUNTING
@@@ -136,6 -136,8 +136,8 @@@ struct cpuinfo_x86 
        u16                     logical_die_id;
        /* Index into per_cpu list: */
        u16                     cpu_index;
+       /*  Is SMT active on this core? */
+       bool                    smt_active;
        u32                     microcode;
        /* Address space bits used by the cache internally */
        u8                      x86_cache_bits;
@@@ -795,8 -797,6 +797,8 @@@ extern int set_tsc_mode(unsigned int va
  
  DECLARE_PER_CPU(u64, msr_misc_features_shadow);
  
 +extern u16 get_llc_id(unsigned int cpu);
 +
  #ifdef CONFIG_CPU_SUP_AMD
  extern u32 amd_get_nodes_per_socket(void);
  extern u32 amd_get_highest_perf(void);
diff --combined include/linux/sched.h
@@@ -95,9 -95,7 +95,9 @@@ struct task_group
  #define TASK_WAKING                   0x0200
  #define TASK_NOLOAD                   0x0400
  #define TASK_NEW                      0x0800
 -#define TASK_STATE_MAX                        0x1000
 +/* RT specific auxiliary flag to mark RT lock waiters */
 +#define TASK_RTLOCK_WAIT              0x1000
 +#define TASK_STATE_MAX                        0x2000
  
  /* Convenience macros for the sake of set_current_state: */
  #define TASK_KILLABLE                 (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
  
  #define task_is_stopped_or_traced(task)       ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
  
 -#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 -
  /*
   * Special states are those that do not use the normal wait-loop pattern. See
   * the comment with set_special_state().
  #define is_special_task_state(state)                          \
        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
  
 -#define __set_current_state(state_value)                      \
 -      do {                                                    \
 -              WARN_ON_ONCE(is_special_task_state(state_value));\
 -              current->task_state_change = _THIS_IP_;         \
 -              WRITE_ONCE(current->__state, (state_value));    \
 -      } while (0)
 -
 -#define set_current_state(state_value)                                \
 -      do {                                                    \
 -              WARN_ON_ONCE(is_special_task_state(state_value));\
 -              current->task_state_change = _THIS_IP_;         \
 -              smp_store_mb(current->__state, (state_value));  \
 +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 +# define debug_normal_state_change(state_value)                               \
 +      do {                                                            \
 +              WARN_ON_ONCE(is_special_task_state(state_value));       \
 +              current->task_state_change = _THIS_IP_;                 \
        } while (0)
  
 -#define set_special_state(state_value)                                        \
 +# define debug_special_state_change(state_value)                      \
        do {                                                            \
 -              unsigned long flags; /* may shadow */                   \
                WARN_ON_ONCE(!is_special_task_state(state_value));      \
 -              raw_spin_lock_irqsave(&current->pi_lock, flags);        \
                current->task_state_change = _THIS_IP_;                 \
 -              WRITE_ONCE(current->__state, (state_value));            \
 -              raw_spin_unlock_irqrestore(&current->pi_lock, flags);   \
        } while (0)
 +
 +# define debug_rtlock_wait_set_state()                                        \
 +      do {                                                             \
 +              current->saved_state_change = current->task_state_change;\
 +              current->task_state_change = _THIS_IP_;                  \
 +      } while (0)
 +
 +# define debug_rtlock_wait_restore_state()                            \
 +      do {                                                             \
 +              current->task_state_change = current->saved_state_change;\
 +      } while (0)
 +
  #else
 +# define debug_normal_state_change(cond)      do { } while (0)
 +# define debug_special_state_change(cond)     do { } while (0)
 +# define debug_rtlock_wait_set_state()                do { } while (0)
 +# define debug_rtlock_wait_restore_state()    do { } while (0)
 +#endif
 +
  /*
   * set_current_state() includes a barrier so that the write of current->state
   * is correctly serialised wrt the caller's subsequent test of whether to
   * Also see the comments of try_to_wake_up().
   */
  #define __set_current_state(state_value)                              \
 -      WRITE_ONCE(current->__state, (state_value))
 +      do {                                                            \
 +              debug_normal_state_change((state_value));               \
 +              WRITE_ONCE(current->__state, (state_value));            \
 +      } while (0)
  
  #define set_current_state(state_value)                                        \
 -      smp_store_mb(current->__state, (state_value))
 +      do {                                                            \
 +              debug_normal_state_change((state_value));               \
 +              smp_store_mb(current->__state, (state_value));          \
 +      } while (0)
  
  /*
   * set_special_state() should be used for those states when the blocking task
   * can not use the regular condition based wait-loop. In that case we must
 - * serialize against wakeups such that any possible in-flight TASK_RUNNING stores
 - * will not collide with our state change.
 + * serialize against wakeups such that any possible in-flight TASK_RUNNING
 + * stores will not collide with our state change.
   */
  #define set_special_state(state_value)                                        \
        do {                                                            \
                unsigned long flags; /* may shadow */                   \
 +                                                                      \
                raw_spin_lock_irqsave(&current->pi_lock, flags);        \
 +              debug_special_state_change((state_value));              \
                WRITE_ONCE(current->__state, (state_value));            \
                raw_spin_unlock_irqrestore(&current->pi_lock, flags);   \
        } while (0)
  
 -#endif
 +/*
 + * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
 + *
 + * RT's spin/rwlock substitutions are state preserving. The state of the
 + * task when blocking on the lock is saved in task_struct::saved_state and
 + * restored after the lock has been acquired.  These operations are
 + * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
 + * lock related wakeups while the task is blocked on the lock are
 + * redirected to operate on task_struct::saved_state to ensure that these
 + * are not dropped. On restore task_struct::saved_state is set to
 + * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
 + *
 + * The lock operation looks like this:
 + *
 + *    current_save_and_set_rtlock_wait_state();
 + *    for (;;) {
 + *            if (try_lock())
 + *                    break;
 + *            raw_spin_unlock_irq(&lock->wait_lock);
 + *            schedule_rtlock();
 + *            raw_spin_lock_irq(&lock->wait_lock);
 + *            set_current_state(TASK_RTLOCK_WAIT);
 + *    }
 + *    current_restore_rtlock_saved_state();
 + */
 +#define current_save_and_set_rtlock_wait_state()                      \
 +      do {                                                            \
 +              lockdep_assert_irqs_disabled();                         \
 +              raw_spin_lock(&current->pi_lock);                       \
 +              current->saved_state = current->__state;                \
 +              debug_rtlock_wait_set_state();                          \
 +              WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);         \
 +              raw_spin_unlock(&current->pi_lock);                     \
 +      } while (0);
 +
 +#define current_restore_rtlock_saved_state()                          \
 +      do {                                                            \
 +              lockdep_assert_irqs_disabled();                         \
 +              raw_spin_lock(&current->pi_lock);                       \
 +              debug_rtlock_wait_restore_state();                      \
 +              WRITE_ONCE(current->__state, current->saved_state);     \
 +              current->saved_state = TASK_RUNNING;                    \
 +              raw_spin_unlock(&current->pi_lock);                     \
 +      } while (0);
  
  #define get_current_state()   READ_ONCE(current->__state)
  
@@@ -288,9 -230,6 +288,9 @@@ extern long schedule_timeout_idle(long 
  asmlinkage void schedule(void);
  extern void schedule_preempt_disabled(void);
  asmlinkage void preempt_schedule_irq(void);
 +#ifdef CONFIG_PREEMPT_RT
 + extern void schedule_rtlock(void);
 +#endif
  
  extern int __must_check io_schedule_prepare(void);
  extern void io_schedule_finish(int token);
@@@ -729,11 -668,6 +729,11 @@@ struct task_struct 
  #endif
        unsigned int                    __state;
  
 +#ifdef CONFIG_PREEMPT_RT
 +      /* saved state for "spinlock sleepers" */
 +      unsigned int                    saved_state;
 +#endif
 +
        /*
         * This begins the randomizable portion of task_struct. Only
         * scheduling-critical items should be added above here.
        unsigned int                    policy;
        int                             nr_cpus_allowed;
        const cpumask_t                 *cpus_ptr;
 +      cpumask_t                       *user_cpus_ptr;
        cpumask_t                       cpus_mask;
        void                            *migration_pending;
  #ifdef CONFIG_SMP
        /* Used by page_owner=on to detect recursion in page tracking. */
        unsigned                        in_page_owner:1;
  #endif
 +#ifdef CONFIG_EVENTFD
 +      /* Recursion prevention for eventfd_signal() */
 +      unsigned                        in_eventfd_signal:1;
 +#endif
  
        unsigned long                   atomic_flags; /* Flags requiring atomic access. */
  
        struct kmap_ctrl                kmap_ctrl;
  #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        unsigned long                   task_state_change;
 +# ifdef CONFIG_PREEMPT_RT
 +      unsigned long                   saved_state_change;
 +# endif
  #endif
        int                             pagefault_disabled;
  #ifdef CONFIG_MMU
        struct llist_head               kretprobe_instances;
  #endif
  
+ #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
+       /*
+        * If L1D flush is supported on mm context switch
+        * then we use this callback head to queue kill work
+        * to kill tasks that are not running on SMT disabled
+        * cores
+        */
+       struct callback_head            l1d_flush_kill;
+ #endif
        /*
         * New fields for task_struct should be added above here, so that
         * they are included in the randomized portion of task_struct.
@@@ -1779,11 -1715,6 +1789,11 @@@ extern int task_can_attach(struct task_
  #ifdef CONFIG_SMP
  extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
  extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
 +extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
 +extern void release_user_cpus_ptr(struct task_struct *p);
 +extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
 +extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
 +extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
  #else
  static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  {
@@@ -1794,21 -1725,6 +1804,21 @@@ static inline int set_cpus_allowed_ptr(
                return -EINVAL;
        return 0;
  }
 +static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
 +{
 +      if (src->user_cpus_ptr)
 +              return -EINVAL;
 +      return 0;
 +}
 +static inline void release_user_cpus_ptr(struct task_struct *p)
 +{
 +      WARN_ON(p->user_cpus_ptr);
 +}
 +
 +static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
 +{
 +      return 0;
 +}
  #endif
  
  extern int yield_to(struct task_struct *p, bool preempt);