Merge tag 'sched-core-2021-08-30' of git://git.kernel.org/pub/scm/linux/kernel/git...
author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 30 Aug 2021 20:42:10 +0000 (13:42 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 30 Aug 2021 20:42:10 +0000 (13:42 -0700)
Pull scheduler updates from Ingo Molnar:

 - The biggest change in this cycle is scheduler support for asymmetric
   scheduling affinity, to support the execution of legacy 32-bit
   (AArch32) tasks on arm64 systems that also have 64-bit-only CPUs.

   Architectures can fill in this functionality by defining their own
   task_cpu_possible_mask(p). When this is done, the scheduler makes sure
   such tasks are only scheduled on CPUs that support them. (A minimal
   sketch follows this list.)

   (The actual arm64-specific changes are not part of this tree.)

   For other architectures there will be no change in functionality.

 - Add cgroup SCHED_IDLE support (a brief usage sketch follows this list)

 - Increase node-distance flexibility & delay determining it until a CPU
   is brought online. (This enables platforms where node distance isn't
   final until the CPU is onlined.)

 - Deadline scheduler enhancements & fixes

 - Misc fixes & cleanups.
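
   A minimal sketch of the kind of per-architecture override the
   asymmetric-affinity work enables (not the actual arm64 code, which is
   not part of this tree): the helper names below are hypothetical
   placeholders for whatever an architecture really provides, while the
   generic fallback stays cpu_possible_mask so other architectures are
   unaffected.

        /* illustrative arch override -- not the arm64 implementation */
        #define task_cpu_possible_mask(p)                               \
                (my_arch_task_is_32bit(p) ? &my_arch_32bit_cpu_mask :   \
                                            cpu_possible_mask)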

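   The new cgroup SCHED_IDLE knob surfaces as a "cpu.idle" file (see the
   cpu_idle_read_s64()/cpu_idle_write_s64() handlers further down). A
   hedged usage sketch, assuming cgroup2 is mounted at /sys/fs/cgroup and
   a child group named "background" already exists:

        /* illustrative only: mark every task in the group as SCHED_IDLE */
        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                int fd = open("/sys/fs/cgroup/background/cpu.idle", O_WRONLY);

                if (fd < 0) {
                        perror("open cpu.idle");
                        return 1;
                }
                /* sched_group_set_idle() accepts only 0 or 1 */
                if (write(fd, "1", 1) != 1)
                        perror("write cpu.idle");
                close(fd);
                return 0;
        }
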
* tag 'sched-core-2021-08-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (27 commits)
  eventfd: Make signal recursion protection a task bit
  sched/fair: Mark tg_is_idle() an inline in the !CONFIG_FAIR_GROUP_SCHED case
  sched: Introduce dl_task_check_affinity() to check proposed affinity
  sched: Allow task CPU affinity to be restricted on asymmetric systems
  sched: Split the guts of sched_setaffinity() into a helper function
  sched: Introduce task_struct::user_cpus_ptr to track requested affinity
  sched: Reject CPU affinity changes based on task_cpu_possible_mask()
  cpuset: Cleanup cpuset_cpus_allowed_fallback() use in select_fallback_rq()
  cpuset: Honour task_cpu_possible_mask() in guarantee_online_cpus()
  cpuset: Don't use the cpu_possible_mask as a last resort for cgroup v1
  sched: Introduce task_cpu_possible_mask() to limit fallback rq selection
  sched: Cgroup SCHED_IDLE support
  sched/topology: Skip updating masks for non-online nodes
  sched: Replace deprecated CPU-hotplug functions.
  sched: Skip priority checks with SCHED_FLAG_KEEP_PARAMS
  sched: Fix UCLAMP_FLAG_IDLE setting
  sched/deadline: Fix missing clock update in migrate_task_rq_dl()
  sched/fair: Avoid a second scan of target in select_idle_cpu
  sched/fair: Use prev instead of new target as recent_used_cpu
  sched: Don't report SCHED_FLAG_SUGOV in sched_getattr()
  ...

include/linux/sched.h
include/linux/wait.h
kernel/fork.c
kernel/sched/core.c
kernel/sched/fair.c
kernel/sched/sched.h

diff --combined include/linux/sched.h
@@@ -748,6 -748,7 +748,7 @@@ struct task_struct 
        unsigned int                    policy;
        int                             nr_cpus_allowed;
        const cpumask_t                 *cpus_ptr;
+       cpumask_t                       *user_cpus_ptr;
        cpumask_t                       cpus_mask;
        void                            *migration_pending;
  #ifdef CONFIG_SMP
        /* Used by page_owner=on to detect recursion in page tracking. */
        unsigned                        in_page_owner:1;
  #endif
+ #ifdef CONFIG_EVENTFD
+       /* Recursion prevention for eventfd_signal() */
+       unsigned                        in_eventfd_signal:1;
+ #endif
  
        unsigned long                   atomic_flags; /* Flags requiring atomic access. */
  
        /* Signal handlers: */
        struct signal_struct            *signal;
        struct sighand_struct __rcu             *sighand;
 -      struct sigqueue                 *sigqueue_cache;
        sigset_t                        blocked;
        sigset_t                        real_blocked;
        /* Restored if set_restore_sigmask() was used: */
@@@ -1705,6 -1711,11 +1710,11 @@@ extern int task_can_attach(struct task_
  #ifdef CONFIG_SMP
  extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
  extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
+ extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
+ extern void release_user_cpus_ptr(struct task_struct *p);
+ extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
+ extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
+ extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
  #else
  static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  {
@@@ -1715,6 -1726,21 +1725,21 @@@ static inline int set_cpus_allowed_ptr(
                return -EINVAL;
        return 0;
  }
+ static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
+ {
+       if (src->user_cpus_ptr)
+               return -EINVAL;
+       return 0;
+ }
+ static inline void release_user_cpus_ptr(struct task_struct *p)
+ {
+       WARN_ON(p->user_cpus_ptr);
+ }
+ static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
+ {
+       return 0;
+ }
  #endif
  
  extern int yield_to(struct task_struct *p, bool preempt);
@@@ -2028,8 -2054,6 +2053,8 @@@ static inline void set_task_cpu(struct 
  
  #endif /* CONFIG_SMP */
  
 +extern bool sched_task_on_rq(struct task_struct *p);
 +
  /*
   * In order to reduce various lock holder preemption latencies provide an
   * interface to see if a vCPU is currently running or not.
diff --combined include/linux/wait.h
@@@ -56,7 -56,7 +56,7 @@@ struct task_struct
  
  #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                                 \
        .lock           = __SPIN_LOCK_UNLOCKED(name.lock),                      \
-       .head           = { &(name).head, &(name).head } }
+       .head           = LIST_HEAD_INIT(name.head) }
  
  #define DECLARE_WAIT_QUEUE_HEAD(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
@@@ -1136,7 -1136,7 +1136,7 @@@ do {                                                                            
   * Waitqueues which are removed from the waitqueue_head at wakeup time
   */
  void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 -void prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
 +bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
  long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
  void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
  long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
diff --combined kernel/fork.c
@@@ -446,6 -446,7 +446,7 @@@ void put_task_stack(struct task_struct 
  
  void free_task(struct task_struct *tsk)
  {
+       release_user_cpus_ptr(tsk);
        scs_release(tsk);
  
  #ifndef CONFIG_THREAD_INFO_IN_TASK
@@@ -825,14 -826,9 +826,14 @@@ void __init fork_init(void
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];
  
 -      for (i = 0; i < UCOUNT_COUNTS; i++)
 +      for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
                init_user_ns.ucount_max[i] = max_threads/2;
  
 +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC,      RLIM_INFINITY);
 +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE,   RLIM_INFINITY);
 +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
 +      set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK,    RLIM_INFINITY);
 +
  #ifdef CONFIG_VMAP_STACK
        cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
                          NULL, free_vm_stack_cache);
@@@ -924,6 -920,7 +925,7 @@@ static struct task_struct *dup_task_str
  #endif
        if (orig->cpus_ptr == &orig->cpus_mask)
                tsk->cpus_ptr = &tsk->cpus_mask;
+       dup_user_cpus_ptr(tsk, orig, node);
  
        /*
         * One for the user space visible state that goes away when reaped.
@@@ -1035,6 -1032,7 +1037,6 @@@ static struct mm_struct *mm_init(struc
        mm_pgtables_bytes_init(mm);
        mm->map_count = 0;
        mm->locked_vm = 0;
 -      atomic_set(&mm->has_pinned, 0);
        atomic64_set(&mm->pinned_vm, 0);
        memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
        spin_lock_init(&mm->page_table_lock);
@@@ -1982,7 -1980,8 +1984,7 @@@ static __latent_entropy struct task_str
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
  #endif
        retval = -EAGAIN;
 -      if (atomic_read(&p->real_cred->user->processes) >=
 -                      task_rlimit(p, RLIMIT_NPROC)) {
 +      if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                if (p->real_cred->user != INIT_USER &&
                    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                        goto bad_fork_free;
        spin_lock_init(&p->alloc_lock);
  
        init_sigpending(&p->pending);
 -      p->sigqueue_cache = NULL;
  
        p->utime = p->stime = p->gtime = 0;
  #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
@@@ -2391,7 -2391,7 +2393,7 @@@ bad_fork_cleanup_threadgroup_lock
  #endif
        delayacct_tsk_free(p);
  bad_fork_cleanup_count:
 -      atomic_dec(&p->cred->user->processes);
 +      dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        exit_creds(p);
  bad_fork_free:
        WRITE_ONCE(p->__state, TASK_DEAD);
@@@ -3004,12 -3004,6 +3006,12 @@@ int ksys_unshare(unsigned long unshare_
        if (err)
                goto bad_unshare_cleanup_cred;
  
 +      if (new_cred) {
 +              err = set_cred_ucounts(new_cred);
 +              if (err)
 +                      goto bad_unshare_cleanup_cred;
 +      }
 +
        if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
                if (do_sysvsem) {
                        /*
diff --combined kernel/sched/core.c
@@@ -237,30 -237,9 +237,30 @@@ static DEFINE_MUTEX(sched_core_mutex)
  static atomic_t sched_core_count;
  static struct cpumask sched_core_mask;
  
 +static void sched_core_lock(int cpu, unsigned long *flags)
 +{
 +      const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 +      int t, i = 0;
 +
 +      local_irq_save(*flags);
 +      for_each_cpu(t, smt_mask)
 +              raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
 +}
 +
 +static void sched_core_unlock(int cpu, unsigned long *flags)
 +{
 +      const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 +      int t;
 +
 +      for_each_cpu(t, smt_mask)
 +              raw_spin_unlock(&cpu_rq(t)->__lock);
 +      local_irq_restore(*flags);
 +}
 +
  static void __sched_core_flip(bool enabled)
  {
 -      int cpu, t, i;
 +      unsigned long flags;
 +      int cpu, t;
  
        cpus_read_lock();
  
        for_each_cpu(cpu, &sched_core_mask) {
                const struct cpumask *smt_mask = cpu_smt_mask(cpu);
  
 -              i = 0;
 -              local_irq_disable();
 -              for_each_cpu(t, smt_mask) {
 -                      /* supports up to SMT8 */
 -                      raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
 -              }
 +              sched_core_lock(cpu, &flags);
  
                for_each_cpu(t, smt_mask)
                        cpu_rq(t)->core_enabled = enabled;
  
 -              for_each_cpu(t, smt_mask)
 -                      raw_spin_unlock(&cpu_rq(t)->__lock);
 -              local_irq_enable();
 +              sched_core_unlock(cpu, &flags);
  
                cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
        }
@@@ -1007,6 -993,7 +1007,7 @@@ int get_nohz_timer_target(void
  {
        int i, cpu = smp_processor_id(), default_cpu = -1;
        struct sched_domain *sd;
+       const struct cpumask *hk_mask;
  
        if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
                if (!idle_cpu(cpu))
                default_cpu = cpu;
        }
  
+       hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
        rcu_read_lock();
        for_each_domain(cpu, sd) {
-               for_each_cpu_and(i, sched_domain_span(sd),
-                       housekeeping_cpumask(HK_FLAG_TIMER)) {
+               for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
                        if (cpu == i)
                                continue;
  
@@@ -1633,6 -1621,23 +1635,23 @@@ static inline void uclamp_rq_dec(struc
                uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
+ static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+                                     enum uclamp_id clamp_id)
+ {
+       if (!p->uclamp[clamp_id].active)
+               return;
+       uclamp_rq_dec_id(rq, p, clamp_id);
+       uclamp_rq_inc_id(rq, p, clamp_id);
+       /*
+        * Make sure to clear the idle flag if we've transiently reached 0
+        * active tasks on rq.
+        */
+       if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+               rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+ }
  static inline void
  uclamp_update_active(struct task_struct *p)
  {
         * affecting a valid clamp bucket, the next time it's enqueued,
         * it will already see the updated clamp bucket value.
         */
-       for_each_clamp_id(clamp_id) {
-               if (p->uclamp[clamp_id].active) {
-                       uclamp_rq_dec_id(rq, p, clamp_id);
-                       uclamp_rq_inc_id(rq, p, clamp_id);
-               }
-       }
+       for_each_clamp_id(clamp_id)
+               uclamp_rq_reinc_id(rq, p, clamp_id);
  
        task_rq_unlock(rq, p, &rf);
  }
@@@ -1942,11 -1943,6 +1957,11 @@@ static inline void uclamp_post_fork(str
  static inline void init_uclamp(void) { }
  #endif /* CONFIG_UCLAMP_TASK */
  
 +bool sched_task_on_rq(struct task_struct *p)
 +{
 +      return task_on_rq_queued(p);
 +}
 +
  static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  {
        if (!(flags & ENQUEUE_NOCLOCK))
@@@ -1995,18 -1991,12 +2010,18 @@@ void deactivate_task(struct rq *rq, str
        dequeue_task(rq, p, flags);
  }
  
 -/*
 - * __normal_prio - return the priority that is based on the static prio
 - */
 -static inline int __normal_prio(struct task_struct *p)
 +static inline int __normal_prio(int policy, int rt_prio, int nice)
  {
 -      return p->static_prio;
 +      int prio;
 +
 +      if (dl_policy(policy))
 +              prio = MAX_DL_PRIO - 1;
 +      else if (rt_policy(policy))
 +              prio = MAX_RT_PRIO - 1 - rt_prio;
 +      else
 +              prio = NICE_TO_PRIO(nice);
 +
 +      return prio;
  }
  
  /*
   */
  static inline int normal_prio(struct task_struct *p)
  {
 -      int prio;
 -
 -      if (task_has_dl_policy(p))
 -              prio = MAX_DL_PRIO-1;
 -      else if (task_has_rt_policy(p))
 -              prio = MAX_RT_PRIO-1 - p->rt_priority;
 -      else
 -              prio = __normal_prio(p);
 -      return prio;
 +      return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
  }
  
  /*
@@@ -2175,7 -2173,7 +2190,7 @@@ static inline bool is_cpu_allowed(struc
  
        /* Non kernel threads are not allowed during either online or offline. */
        if (!(p->flags & PF_KTHREAD))
-               return cpu_active(cpu);
+               return cpu_active(cpu) && task_cpu_possible(cpu, p);
  
        /* KTHREAD_IS_PER_CPU is always allowed. */
        if (kthread_is_per_cpu(p))
@@@ -2482,6 -2480,34 +2497,34 @@@ void do_set_cpus_allowed(struct task_st
        __do_set_cpus_allowed(p, new_mask, 0);
  }
  
+ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+                     int node)
+ {
+       if (!src->user_cpus_ptr)
+               return 0;
+       dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+       if (!dst->user_cpus_ptr)
+               return -ENOMEM;
+       cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+       return 0;
+ }
+ static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+ {
+       struct cpumask *user_mask = NULL;
+       swap(p->user_cpus_ptr, user_mask);
+       return user_mask;
+ }
+ void release_user_cpus_ptr(struct task_struct *p)
+ {
+       kfree(clear_user_cpus_ptr(p));
+ }
  /*
   * This function is wildly self concurrent; here be dragons.
   *
@@@ -2699,28 -2725,26 +2742,26 @@@ static int affine_move_task(struct rq *
  }
  
  /*
-  * Change a given task's CPU affinity. Migrate the thread to a
-  * proper CPU and schedule it away if the CPU it's executing on
-  * is removed from the allowed bitmask.
-  *
-  * NOTE: the caller must have a valid reference to the task, the
-  * task must not exit() & deallocate itself prematurely. The
-  * call is not atomic; no spinlocks may be held.
+  * Called with both p->pi_lock and rq->lock held; drops both before returning.
   */
- static int __set_cpus_allowed_ptr(struct task_struct *p,
-                                 const struct cpumask *new_mask,
-                                 u32 flags)
+ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+                                        const struct cpumask *new_mask,
+                                        u32 flags,
+                                        struct rq *rq,
+                                        struct rq_flags *rf)
+       __releases(rq->lock)
+       __releases(p->pi_lock)
  {
+       const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
        const struct cpumask *cpu_valid_mask = cpu_active_mask;
+       bool kthread = p->flags & PF_KTHREAD;
+       struct cpumask *user_mask = NULL;
        unsigned int dest_cpu;
-       struct rq_flags rf;
-       struct rq *rq;
        int ret = 0;
  
-       rq = task_rq_lock(p, &rf);
        update_rq_clock(rq);
  
-       if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+       if (kthread || is_migration_disabled(p)) {
                /*
                 * Kernel threads are allowed on online && !active CPUs,
                 * however, during cpu-hot-unplug, even these might get pushed
                cpu_valid_mask = cpu_online_mask;
        }
  
+       if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+               ret = -EINVAL;
+               goto out;
+       }
        /*
         * Must re-check here, to close a race against __kthread_bind(),
         * sched_setaffinity() is not guaranteed to observe the flag.
  
        __do_set_cpus_allowed(p, new_mask, flags);
  
-       return affine_move_task(rq, p, &rf, dest_cpu, flags);
+       if (flags & SCA_USER)
+               user_mask = clear_user_cpus_ptr(p);
+       ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+       kfree(user_mask);
+       return ret;
  
  out:
-       task_rq_unlock(rq, p, &rf);
+       task_rq_unlock(rq, p, rf);
  
        return ret;
  }
  
+ /*
+  * Change a given task's CPU affinity. Migrate the thread to a
+  * proper CPU and schedule it away if the CPU it's executing on
+  * is removed from the allowed bitmask.
+  *
+  * NOTE: the caller must have a valid reference to the task, the
+  * task must not exit() & deallocate itself prematurely. The
+  * call is not atomic; no spinlocks may be held.
+  */
+ static int __set_cpus_allowed_ptr(struct task_struct *p,
+                                 const struct cpumask *new_mask, u32 flags)
+ {
+       struct rq_flags rf;
+       struct rq *rq;
+       rq = task_rq_lock(p, &rf);
+       return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+ }
  int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  {
        return __set_cpus_allowed_ptr(p, new_mask, 0);
  }
  EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  
+ /*
+  * Change a given task's CPU affinity to the intersection of its current
+  * affinity mask and @subset_mask, writing the resulting mask to @new_mask
+  * and pointing @p->user_cpus_ptr to a copy of the old mask.
+  * If the resulting mask is empty, leave the affinity unchanged and return
+  * -EINVAL.
+  */
+ static int restrict_cpus_allowed_ptr(struct task_struct *p,
+                                    struct cpumask *new_mask,
+                                    const struct cpumask *subset_mask)
+ {
+       struct cpumask *user_mask = NULL;
+       struct rq_flags rf;
+       struct rq *rq;
+       int err;
+       if (!p->user_cpus_ptr) {
+               user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+               if (!user_mask)
+                       return -ENOMEM;
+       }
+       rq = task_rq_lock(p, &rf);
+       /*
+        * Forcefully restricting the affinity of a deadline task is
+        * likely to cause problems, so fail and noisily override the
+        * mask entirely.
+        */
+       if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+               err = -EPERM;
+               goto err_unlock;
+       }
+       if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+               err = -EINVAL;
+               goto err_unlock;
+       }
+       /*
+        * We're about to butcher the task affinity, so keep track of what
+        * the user asked for in case we're able to restore it later on.
+        */
+       if (user_mask) {
+               cpumask_copy(user_mask, p->cpus_ptr);
+               p->user_cpus_ptr = user_mask;
+       }
+       return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+ err_unlock:
+       task_rq_unlock(rq, p, &rf);
+       kfree(user_mask);
+       return err;
+ }
+ /*
+  * Restrict the CPU affinity of task @p so that it is a subset of
+  * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
+  * old affinity mask. If the resulting mask is empty, we warn and walk
+  * up the cpuset hierarchy until we find a suitable mask.
+  */
+ void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+ {
+       cpumask_var_t new_mask;
+       const struct cpumask *override_mask = task_cpu_possible_mask(p);
+       alloc_cpumask_var(&new_mask, GFP_KERNEL);
+       /*
+        * __migrate_task() can fail silently in the face of concurrent
+        * offlining of the chosen destination CPU, so take the hotplug
+        * lock to ensure that the migration succeeds.
+        */
+       cpus_read_lock();
+       if (!cpumask_available(new_mask))
+               goto out_set_mask;
+       if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+               goto out_free_mask;
+       /*
+        * We failed to find a valid subset of the affinity mask for the
+        * task, so override it based on its cpuset hierarchy.
+        */
+       cpuset_cpus_allowed(p, new_mask);
+       override_mask = new_mask;
+ out_set_mask:
+       if (printk_ratelimit()) {
+               printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+                               task_pid_nr(p), p->comm,
+                               cpumask_pr_args(override_mask));
+       }
+       WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+ out_free_mask:
+       cpus_read_unlock();
+       free_cpumask_var(new_mask);
+ }
+ static int
+ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+ /*
+  * Restore the affinity of a task @p which was previously restricted by a
+  * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
+  * @p->user_cpus_ptr.
+  *
+  * It is the caller's responsibility to serialise this with any calls to
+  * force_compatible_cpus_allowed_ptr(@p).
+  */
+ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+ {
+       struct cpumask *user_mask = p->user_cpus_ptr;
+       unsigned long flags;
+       /*
+        * Try to restore the old affinity mask. If this fails, then
+        * we free the mask explicitly to avoid it being inherited across
+        * a subsequent fork().
+        */
+       if (!user_mask || !__sched_setaffinity(p, user_mask))
+               return;
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       user_mask = clear_user_cpus_ptr(p);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+       kfree(user_mask);
+ }
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
  #ifdef CONFIG_SCHED_DEBUG
@@@ -3126,9 -3313,7 +3330,7 @@@ static int select_fallback_rq(int cpu, 
  
                /* Look for allowed, online CPU in same node. */
                for_each_cpu(dest_cpu, nodemask) {
-                       if (!cpu_active(dest_cpu))
-                               continue;
-                       if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+                       if (is_cpu_allowed(p, dest_cpu))
                                return dest_cpu;
                }
        }
                /* No more Mr. Nice Guy. */
                switch (state) {
                case cpuset:
-                       if (IS_ENABLED(CONFIG_CPUSETS)) {
-                               cpuset_cpus_allowed_fallback(p);
+                       if (cpuset_cpus_allowed_fallback(p)) {
                                state = possible;
                                break;
                        }
                         *
                         * More yuck to audit.
                         */
-                       do_set_cpus_allowed(p, cpu_possible_mask);
+                       do_set_cpus_allowed(p, task_cpu_possible_mask(p));
                        state = fail;
                        break;
                case fail:
                        BUG();
                        break;
@@@ -4111,7 -4294,7 +4311,7 @@@ int sched_fork(unsigned long clone_flag
                } else if (PRIO_TO_NICE(p->static_prio) < 0)
                        p->static_prio = NICE_TO_PRIO(0);
  
 -              p->prio = p->normal_prio = __normal_prio(p);
 +              p->prio = p->normal_prio = p->static_prio;
                set_load_weight(p, false);
  
                /*
@@@ -4563,7 -4746,6 +4763,7 @@@ static struct rq *finish_task_switch(st
        vtime_task_switch(prev);
        perf_event_task_sched_in(prev, current);
        finish_task(prev);
 +      tick_nohz_task_switch();
        finish_lock_switch(rq);
        finish_arch_post_lock_switch();
        kcov_finish_switch(current);
                put_task_struct_rcu_user(prev);
        }
  
 -      tick_nohz_task_switch();
        return rq;
  }
  
@@@ -5674,11 -5857,9 +5874,9 @@@ static bool try_steal_cookie(int this, 
                if (p->core_occupation > dst->idle->core_occupation)
                        goto next;
  
-               p->on_rq = TASK_ON_RQ_MIGRATING;
                deactivate_task(src, p, 0);
                set_task_cpu(p, this);
                activate_task(dst, p, 0);
-               p->on_rq = TASK_ON_RQ_QUEUED;
  
                resched_curr(dst);
  
@@@ -5750,109 -5931,35 +5948,109 @@@ void queue_core_balance(struct rq *rq
        queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
  }
  
 -static inline void sched_core_cpu_starting(unsigned int cpu)
 +static void sched_core_cpu_starting(unsigned int cpu)
  {
        const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 -      struct rq *rq, *core_rq = NULL;
 -      int i;
 +      struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
 +      unsigned long flags;
 +      int t;
 +
 +      sched_core_lock(cpu, &flags);
  
 -      core_rq = cpu_rq(cpu)->core;
 +      WARN_ON_ONCE(rq->core != rq);
 +
 +      /* if we're the first, we'll be our own leader */
 +      if (cpumask_weight(smt_mask) == 1)
 +              goto unlock;
  
 -      if (!core_rq) {
 -              for_each_cpu(i, smt_mask) {
 -                      rq = cpu_rq(i);
 -                      if (rq->core && rq->core == rq)
 -                              core_rq = rq;
 +      /* find the leader */
 +      for_each_cpu(t, smt_mask) {
 +              if (t == cpu)
 +                      continue;
 +              rq = cpu_rq(t);
 +              if (rq->core == rq) {
 +                      core_rq = rq;
 +                      break;
                }
 +      }
  
 -              if (!core_rq)
 -                      core_rq = cpu_rq(cpu);
 +      if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
 +              goto unlock;
  
 -              for_each_cpu(i, smt_mask) {
 -                      rq = cpu_rq(i);
 +      /* install and validate core_rq */
 +      for_each_cpu(t, smt_mask) {
 +              rq = cpu_rq(t);
  
 -                      WARN_ON_ONCE(rq->core && rq->core != core_rq);
 +              if (t == cpu)
                        rq->core = core_rq;
 -              }
 +
 +              WARN_ON_ONCE(rq->core != core_rq);
        }
 +
 +unlock:
 +      sched_core_unlock(cpu, &flags);
  }
 +
 +static void sched_core_cpu_deactivate(unsigned int cpu)
 +{
 +      const struct cpumask *smt_mask = cpu_smt_mask(cpu);
 +      struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
 +      unsigned long flags;
 +      int t;
 +
 +      sched_core_lock(cpu, &flags);
 +
 +      /* if we're the last man standing, nothing to do */
 +      if (cpumask_weight(smt_mask) == 1) {
 +              WARN_ON_ONCE(rq->core != rq);
 +              goto unlock;
 +      }
 +
 +      /* if we're not the leader, nothing to do */
 +      if (rq->core != rq)
 +              goto unlock;
 +
 +      /* find a new leader */
 +      for_each_cpu(t, smt_mask) {
 +              if (t == cpu)
 +                      continue;
 +              core_rq = cpu_rq(t);
 +              break;
 +      }
 +
 +      if (WARN_ON_ONCE(!core_rq)) /* impossible */
 +              goto unlock;
 +
 +      /* copy the shared state to the new leader */
 +      core_rq->core_task_seq      = rq->core_task_seq;
 +      core_rq->core_pick_seq      = rq->core_pick_seq;
 +      core_rq->core_cookie        = rq->core_cookie;
 +      core_rq->core_forceidle     = rq->core_forceidle;
 +      core_rq->core_forceidle_seq = rq->core_forceidle_seq;
 +
 +      /* install new leader */
 +      for_each_cpu(t, smt_mask) {
 +              rq = cpu_rq(t);
 +              rq->core = core_rq;
 +      }
 +
 +unlock:
 +      sched_core_unlock(cpu, &flags);
 +}
 +
 +static inline void sched_core_cpu_dying(unsigned int cpu)
 +{
 +      struct rq *rq = cpu_rq(cpu);
 +
 +      if (rq->core != rq)
 +              rq->core = rq;
 +}
 +
  #else /* !CONFIG_SCHED_CORE */
  
  static inline void sched_core_cpu_starting(unsigned int cpu) {}
 +static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
 +static inline void sched_core_cpu_dying(unsigned int cpu) {}
  
  static struct task_struct *
  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
@@@ -6427,18 -6534,6 +6625,18 @@@ int default_wake_function(wait_queue_en
  }
  EXPORT_SYMBOL(default_wake_function);
  
 +static void __setscheduler_prio(struct task_struct *p, int prio)
 +{
 +      if (dl_prio(prio))
 +              p->sched_class = &dl_sched_class;
 +      else if (rt_prio(prio))
 +              p->sched_class = &rt_sched_class;
 +      else
 +              p->sched_class = &fair_sched_class;
 +
 +      p->prio = prio;
 +}
 +
  #ifdef CONFIG_RT_MUTEXES
  
  static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
@@@ -6554,19 -6649,22 +6752,19 @@@ void rt_mutex_setprio(struct task_struc
                } else {
                        p->dl.pi_se = &p->dl;
                }
 -              p->sched_class = &dl_sched_class;
        } else if (rt_prio(prio)) {
                if (dl_prio(oldprio))
                        p->dl.pi_se = &p->dl;
                if (oldprio < prio)
                        queue_flag |= ENQUEUE_HEAD;
 -              p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
                        p->dl.pi_se = &p->dl;
                if (rt_prio(oldprio))
                        p->rt.timeout = 0;
 -              p->sched_class = &fair_sched_class;
        }
  
 -      p->prio = prio;
 +      __setscheduler_prio(p, prio);
  
        if (queued)
                enqueue_task(rq, p, queue_flag);
@@@ -6919,6 -7017,35 +7117,6 @@@ static void __setscheduler_params(struc
        set_load_weight(p, true);
  }
  
 -/* Actually do priority change: must hold pi & rq lock. */
 -static void __setscheduler(struct rq *rq, struct task_struct *p,
 -                         const struct sched_attr *attr, bool keep_boost)
 -{
 -      /*
 -       * If params can't change scheduling class changes aren't allowed
 -       * either.
 -       */
 -      if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
 -              return;
 -
 -      __setscheduler_params(p, attr);
 -
 -      /*
 -       * Keep a potential priority boosting if called from
 -       * sched_setscheduler().
 -       */
 -      p->prio = normal_prio(p);
 -      if (keep_boost)
 -              p->prio = rt_effective_prio(p, p->prio);
 -
 -      if (dl_prio(p->prio))
 -              p->sched_class = &dl_sched_class;
 -      else if (rt_prio(p->prio))
 -              p->sched_class = &rt_sched_class;
 -      else
 -              p->sched_class = &fair_sched_class;
 -}
 -
  /*
   * Check the target process has a UID that matches the current process's:
   */
@@@ -6939,8 -7066,10 +7137,8 @@@ static int __sched_setscheduler(struct 
                                const struct sched_attr *attr,
                                bool user, bool pi)
  {
 -      int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 -                    MAX_RT_PRIO - 1 - attr->sched_priority;
 -      int retval, oldprio, oldpolicy = -1, queued, running;
 -      int new_effective_prio, policy = attr->sched_policy;
 +      int oldpolicy = -1, policy = attr->sched_policy;
 +      int retval, oldprio, newprio, queued, running;
        const struct sched_class *prev_class;
        struct callback_head *head;
        struct rq_flags rf;
@@@ -7138,7 -7267,6 +7336,7 @@@ change
        p->sched_reset_on_fork = reset_on_fork;
        oldprio = p->prio;
  
 +      newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
        if (pi) {
                /*
                 * Take priority boosted tasks into account. If the new
                 * the runqueue. This will be done when the task deboost
                 * itself.
                 */
 -              new_effective_prio = rt_effective_prio(p, newprio);
 -              if (new_effective_prio == oldprio)
 +              newprio = rt_effective_prio(p, newprio);
 +              if (newprio == oldprio)
                        queue_flags &= ~DEQUEUE_MOVE;
        }
  
  
        prev_class = p->sched_class;
  
 -      __setscheduler(rq, p, attr, pi);
 +      if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
 +              __setscheduler_params(p, attr);
 +              __setscheduler_prio(p, newprio);
 +      }
        __setscheduler_uclamp(p, attr);
  
        if (queued) {
@@@ -7388,6 -7513,16 +7586,16 @@@ err_size
        return -E2BIG;
  }
  
+ static void get_params(struct task_struct *p, struct sched_attr *attr)
+ {
+       if (task_has_dl_policy(p))
+               __getparam_dl(p, attr);
+       else if (task_has_rt_policy(p))
+               attr->sched_priority = p->rt_priority;
+       else
+               attr->sched_nice = task_nice(p);
+ }
  /**
   * sys_sched_setscheduler - set/change the scheduler policy and RT priority
   * @pid: the pid in question.
@@@ -7449,6 -7584,8 +7657,8 @@@ SYSCALL_DEFINE3(sched_setattr, pid_t, p
        rcu_read_unlock();
  
        if (likely(p)) {
+               if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+                       get_params(p, &attr);
                retval = sched_setattr(p, &attr);
                put_task_struct(p);
        }
@@@ -7597,12 -7734,8 +7807,8 @@@ SYSCALL_DEFINE4(sched_getattr, pid_t, p
        kattr.sched_policy = p->policy;
        if (p->sched_reset_on_fork)
                kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
-       if (task_has_dl_policy(p))
-               __getparam_dl(p, &kattr);
-       else if (task_has_rt_policy(p))
-               kattr.sched_priority = p->rt_priority;
-       else
-               kattr.sched_nice = task_nice(p);
+       get_params(p, &kattr);
+       kattr.sched_flags &= SCHED_FLAG_ALL;
  
  #ifdef CONFIG_UCLAMP_TASK
        /*
@@@ -7623,9 -7756,76 +7829,76 @@@ out_unlock
        return retval;
  }
  
- long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+ #ifdef CONFIG_SMP
+ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
  {
+       int ret = 0;
+       /*
+        * If the task isn't a deadline task or admission control is
+        * disabled then we don't care about affinity changes.
+        */
+       if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+               return 0;
+       /*
+        * Since bandwidth control happens on root_domain basis,
+        * if admission test is enabled, we only admit -deadline
+        * tasks allowed to run on all the CPUs in the task's
+        * root_domain.
+        */
+       rcu_read_lock();
+       if (!cpumask_subset(task_rq(p)->rd->span, mask))
+               ret = -EBUSY;
+       rcu_read_unlock();
+       return ret;
+ }
+ #endif
+ static int
+ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+ {
+       int retval;
        cpumask_var_t cpus_allowed, new_mask;
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+               return -ENOMEM;
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
+       cpuset_cpus_allowed(p, cpus_allowed);
+       cpumask_and(new_mask, mask, cpus_allowed);
+       retval = dl_task_check_affinity(p, new_mask);
+       if (retval)
+               goto out_free_new_mask;
+ again:
+       retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+       if (retval)
+               goto out_free_new_mask;
+       cpuset_cpus_allowed(p, cpus_allowed);
+       if (!cpumask_subset(new_mask, cpus_allowed)) {
+               /*
+                * We must have raced with a concurrent cpuset update.
+                * Just reset the cpumask to the cpuset's cpus_allowed.
+                */
+               cpumask_copy(new_mask, cpus_allowed);
+               goto again;
+       }
+ out_free_new_mask:
+       free_cpumask_var(new_mask);
+ out_free_cpus_allowed:
+       free_cpumask_var(cpus_allowed);
+       return retval;
+ }
+ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+ {
        struct task_struct *p;
        int retval;
  
                retval = -EINVAL;
                goto out_put_task;
        }
-       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
-               retval = -ENOMEM;
-               goto out_put_task;
-       }
-       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
-               retval = -ENOMEM;
-               goto out_free_cpus_allowed;
-       }
-       retval = -EPERM;
        if (!check_same_owner(p)) {
                rcu_read_lock();
                if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
                        rcu_read_unlock();
-                       goto out_free_new_mask;
+                       retval = -EPERM;
+                       goto out_put_task;
                }
                rcu_read_unlock();
        }
  
        retval = security_task_setscheduler(p);
        if (retval)
-               goto out_free_new_mask;
-       cpuset_cpus_allowed(p, cpus_allowed);
-       cpumask_and(new_mask, in_mask, cpus_allowed);
-       /*
-        * Since bandwidth control happens on root_domain basis,
-        * if admission test is enabled, we only admit -deadline
-        * tasks allowed to run on all the CPUs in the task's
-        * root_domain.
-        */
- #ifdef CONFIG_SMP
-       if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
-               rcu_read_lock();
-               if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
-                       retval = -EBUSY;
-                       rcu_read_unlock();
-                       goto out_free_new_mask;
-               }
-               rcu_read_unlock();
-       }
- #endif
- again:
-       retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+               goto out_put_task;
  
-       if (!retval) {
-               cpuset_cpus_allowed(p, cpus_allowed);
-               if (!cpumask_subset(new_mask, cpus_allowed)) {
-                       /*
-                        * We must have raced with a concurrent cpuset
-                        * update. Just reset the cpus_allowed to the
-                        * cpuset's cpus_allowed
-                        */
-                       cpumask_copy(new_mask, cpus_allowed);
-                       goto again;
-               }
-       }
- out_free_new_mask:
-       free_cpumask_var(new_mask);
- out_free_cpus_allowed:
-       free_cpumask_var(cpus_allowed);
+       retval = __sched_setaffinity(p, in_mask);
  out_put_task:
        put_task_struct(p);
        return retval;
@@@ -7849,17 -8003,6 +8076,17 @@@ int __sched __cond_resched(void
                preempt_schedule_common();
                return 1;
        }
 +      /*
 +       * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
 +       * whether the current CPU is in an RCU read-side critical section,
 +       * so the tick can report quiescent states even for CPUs looping
 +       * in kernel context.  In contrast, in non-preemptible kernels,
 +       * RCU readers leave no in-memory hints, which means that CPU-bound
 +       * processes executing in kernel context might never report an
 +       * RCU quiescent state.  Therefore, the following code causes
 +       * cond_resched() to report a quiescent state, but only when RCU
 +       * is in urgent need of one.
 +       */
  #ifndef CONFIG_PREEMPT_RCU
        rcu_all_qs();
  #endif
@@@ -8806,8 -8949,6 +9033,8 @@@ int sched_cpu_deactivate(unsigned int c
         */
        if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
                static_branch_dec_cpuslocked(&sched_smt_present);
 +
 +      sched_core_cpu_deactivate(cpu);
  #endif
  
        if (!sched_smp_initialized)
@@@ -8912,7 -9053,6 +9139,7 @@@ int sched_cpu_dying(unsigned int cpu
        calc_load_migrate(rq);
        update_max_interval();
        hrtick_clear(rq);
 +      sched_core_cpu_dying(cpu);
        return 0;
  }
  #endif
@@@ -9124,7 -9264,7 +9351,7 @@@ void __init sched_init(void
                atomic_set(&rq->nr_iowait, 0);
  
  #ifdef CONFIG_SCHED_CORE
 -              rq->core = NULL;
 +              rq->core = rq;
                rq->core_pick = NULL;
                rq->core_enabled = 0;
                rq->core_tree = RB_ROOT;
@@@ -9906,7 -10046,7 +10133,7 @@@ static int tg_set_cfs_bandwidth(struct 
         * Prevent race between setting of cfs_rq->runtime_enabled and
         * unthrottle_offline_cfs_rqs().
         */
-       get_online_cpus();
+       cpus_read_lock();
        mutex_lock(&cfs_constraints_mutex);
        ret = __cfs_schedulable(tg, period, quota);
        if (ret)
                cfs_bandwidth_usage_dec();
  out_unlock:
        mutex_unlock(&cfs_constraints_mutex);
-       put_online_cpus();
+       cpus_read_unlock();
  
        return ret;
  }
@@@ -10201,6 -10341,20 +10428,20 @@@ static u64 cpu_rt_period_read_uint(stru
  }
  #endif /* CONFIG_RT_GROUP_SCHED */
  
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+                              struct cftype *cft)
+ {
+       return css_tg(css)->idle;
+ }
+ static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+                               struct cftype *cft, s64 idle)
+ {
+       return sched_group_set_idle(css_tg(css), idle);
+ }
+ #endif
  static struct cftype cpu_legacy_files[] = {
  #ifdef CONFIG_FAIR_GROUP_SCHED
        {
                .read_u64 = cpu_shares_read_u64,
                .write_u64 = cpu_shares_write_u64,
        },
+       {
+               .name = "idle",
+               .read_s64 = cpu_idle_read_s64,
+               .write_s64 = cpu_idle_write_s64,
+       },
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
        {
@@@ -10415,6 -10574,12 +10661,12 @@@ static struct cftype cpu_files[] = 
                .read_s64 = cpu_weight_nice_read_s64,
                .write_s64 = cpu_weight_nice_write_s64,
        },
+       {
+               .name = "idle",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_s64 = cpu_idle_read_s64,
+               .write_s64 = cpu_idle_write_s64,
+       },
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
        {
diff --combined kernel/sched/fair.c
@@@ -431,6 -431,23 +431,23 @@@ find_matching_se(struct sched_entity **
        }
  }
  
+ static int tg_is_idle(struct task_group *tg)
+ {
+       return tg->idle > 0;
+ }
+ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+ {
+       return cfs_rq->idle > 0;
+ }
+ static int se_is_idle(struct sched_entity *se)
+ {
+       if (entity_is_task(se))
+               return task_has_idle_policy(task_of(se));
+       return cfs_rq_is_idle(group_cfs_rq(se));
+ }
  #else /* !CONFIG_FAIR_GROUP_SCHED */
  
  #define for_each_sched_entity(se) \
@@@ -468,6 -485,21 +485,21 @@@ find_matching_se(struct sched_entity **
  {
  }
  
+ static inline int tg_is_idle(struct task_group *tg)
+ {
+       return 0;
+ }
+ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+ {
+       return 0;
+ }
+ static int se_is_idle(struct sched_entity *se)
+ {
+       return 0;
+ }
  #endif        /* CONFIG_FAIR_GROUP_SCHED */
  
  static __always_inline
@@@ -1486,7 -1518,7 +1518,7 @@@ static inline bool is_core_idle(int cpu
                if (cpu == sibling)
                        continue;
  
-               if (!idle_cpu(cpu))
+               if (!idle_cpu(sibling))
                        return false;
        }
  #endif
@@@ -3037,9 -3069,8 +3069,9 @@@ enqueue_load_avg(struct cfs_rq *cfs_rq
  static inline void
  dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
  {
 +      u32 divider = get_pelt_divider(&se->avg);
        sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
 -      sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
 +      cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
  }
  #else
  static inline void
@@@ -3256,31 -3287,6 +3288,31 @@@ static inline void cfs_rq_util_change(s
  
  #ifdef CONFIG_SMP
  #ifdef CONFIG_FAIR_GROUP_SCHED
 +/*
 + * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
 + * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
 + * bottom-up, we only have to test whether the cfs_rq before us on the list
 + * is our child.
 + * If cfs_rq is not on the list, test whether a child needs to be added to
 + * connect a branch to the tree (see list_add_leaf_cfs_rq() for details).
 + */
 +static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
 +{
 +      struct cfs_rq *prev_cfs_rq;
 +      struct list_head *prev;
 +
 +      if (cfs_rq->on_list) {
 +              prev = cfs_rq->leaf_cfs_rq_list.prev;
 +      } else {
 +              struct rq *rq = rq_of(cfs_rq);
 +
 +              prev = rq->tmp_alone_branch;
 +      }
 +
 +      prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
 +
 +      return (prev_cfs_rq->tg->parent == cfs_rq->tg);
 +}
  
  static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
  {
        if (cfs_rq->avg.runnable_sum)
                return false;
  
 +      if (child_cfs_rq_on_list(cfs_rq))
 +              return false;
 +
        /*
         * _avg must be null when _sum are null because _avg = _sum / divider
         * Make sure that rounding and/or propagation of PELT values never
@@@ -4841,6 -4844,9 +4873,9 @@@ static bool throttle_cfs_rq(struct cfs_
  
                dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
  
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
                qcfs_rq->h_nr_running -= task_delta;
                qcfs_rq->idle_h_nr_running -= idle_task_delta;
  
                update_load_avg(qcfs_rq, se, 0);
                se_update_runnable(se);
  
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
                qcfs_rq->h_nr_running -= task_delta;
                qcfs_rq->idle_h_nr_running -= idle_task_delta;
        }
@@@ -4904,39 -4913,45 +4942,45 @@@ void unthrottle_cfs_rq(struct cfs_rq *c
        task_delta = cfs_rq->h_nr_running;
        idle_task_delta = cfs_rq->idle_h_nr_running;
        for_each_sched_entity(se) {
+               struct cfs_rq *qcfs_rq = cfs_rq_of(se);
                if (se->on_rq)
                        break;
-               cfs_rq = cfs_rq_of(se);
-               enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+               enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
  
-               cfs_rq->h_nr_running += task_delta;
-               cfs_rq->idle_h_nr_running += idle_task_delta;
+               qcfs_rq->h_nr_running += task_delta;
+               qcfs_rq->idle_h_nr_running += idle_task_delta;
  
                /* end evaluation on encountering a throttled cfs_rq */
-               if (cfs_rq_throttled(cfs_rq))
+               if (cfs_rq_throttled(qcfs_rq))
                        goto unthrottle_throttle;
        }
  
        for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
+               struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  
-               update_load_avg(cfs_rq, se, UPDATE_TG);
+               update_load_avg(qcfs_rq, se, UPDATE_TG);
                se_update_runnable(se);
  
-               cfs_rq->h_nr_running += task_delta;
-               cfs_rq->idle_h_nr_running += idle_task_delta;
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
  
+               qcfs_rq->h_nr_running += task_delta;
+               qcfs_rq->idle_h_nr_running += idle_task_delta;
  
                /* end evaluation on encountering a throttled cfs_rq */
-               if (cfs_rq_throttled(cfs_rq))
+               if (cfs_rq_throttled(qcfs_rq))
                        goto unthrottle_throttle;
  
                /*
                 * One parent has been throttled and cfs_rq removed from the
                 * list. Add it back to not break the leaf list.
                 */
-               if (throttled_hierarchy(cfs_rq))
-                       list_add_leaf_cfs_rq(cfs_rq);
+               if (throttled_hierarchy(qcfs_rq))
+                       list_add_leaf_cfs_rq(qcfs_rq);
        }
  
        /* At this point se is NULL and we are at root level*/
@@@ -4949,9 -4964,9 +4993,9 @@@ unthrottle_throttle
         * assertion below.
         */
        for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
+               struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  
-               if (list_add_leaf_cfs_rq(cfs_rq))
+               if (list_add_leaf_cfs_rq(qcfs_rq))
                        break;
        }
  
@@@ -5082,7 -5097,7 +5126,7 @@@ static const u64 cfs_bandwidth_slack_pe
  static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
  {
        struct hrtimer *refresh_timer = &cfs_b->period_timer;
 -      u64 remaining;
 +      s64 remaining;
  
        /* if the call-back is running a quota refresh is already occurring */
        if (hrtimer_callback_running(refresh_timer))
  
        /* is a quota refresh about to occur? */
        remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
 -      if (remaining < min_expire)
 +      if (remaining < (s64)min_expire)
                return 1;
  
        return 0;
@@@ -5574,6 -5589,9 +5618,9 @@@ enqueue_task_fair(struct rq *rq, struc
                cfs_rq->h_nr_running++;
                cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
                /* end evaluation on encountering a throttled cfs_rq */
                if (cfs_rq_throttled(cfs_rq))
                        goto enqueue_throttle;
                cfs_rq->h_nr_running++;
                cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
                /* end evaluation on encountering a throttled cfs_rq */
                if (cfs_rq_throttled(cfs_rq))
                        goto enqueue_throttle;
@@@ -5668,6 -5689,9 +5718,9 @@@ static void dequeue_task_fair(struct r
                cfs_rq->h_nr_running--;
                cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
                /* end evaluation on encountering a throttled cfs_rq */
                if (cfs_rq_throttled(cfs_rq))
                        goto dequeue_throttle;
                cfs_rq->h_nr_running--;
                cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
                /* end evaluation on encountering a throttled cfs_rq */
                if (cfs_rq_throttled(cfs_rq))
                        goto dequeue_throttle;
@@@ -6249,7 -6276,7 +6305,7 @@@ static int select_idle_cpu(struct task_
                time = cpu_clock(this);
        }
  
-       for_each_cpu_wrap(cpu, cpus, target) {
+       for_each_cpu_wrap(cpu, cpus, target + 1) {
                if (has_idle_core) {
                        i = select_idle_core(p, cpu, cpus, &idle_cpu);
                        if ((unsigned int)i < nr_cpumask_bits)
@@@ -6376,6 -6403,7 +6432,7 @@@ static int select_idle_sibling(struct t
  
        /* Check a recently used CPU as a potential idle candidate: */
        recent_used_cpu = p->recent_used_cpu;
+       p->recent_used_cpu = prev;
        if (recent_used_cpu != prev &&
            recent_used_cpu != target &&
            cpus_share_cache(recent_used_cpu, target) &&
@@@ -6902,9 -6930,6 +6959,6 @@@ select_task_rq_fair(struct task_struct 
        } else if (wake_flags & WF_TTWU) { /* XXX always ? */
                /* Fast path */
                new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-               if (want_affine)
-                       current->recent_used_cpu = cpu;
        }
        rcu_read_unlock();
  
@@@ -7041,24 -7066,22 +7095,22 @@@ wakeup_preempt_entity(struct sched_enti
  
  static void set_last_buddy(struct sched_entity *se)
  {
-       if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
-               return;
        for_each_sched_entity(se) {
                if (SCHED_WARN_ON(!se->on_rq))
                        return;
+               if (se_is_idle(se))
+                       return;
                cfs_rq_of(se)->last = se;
        }
  }
  
  static void set_next_buddy(struct sched_entity *se)
  {
-       if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
-               return;
        for_each_sched_entity(se) {
                if (SCHED_WARN_ON(!se->on_rq))
                        return;
+               if (se_is_idle(se))
+                       return;
                cfs_rq_of(se)->next = se;
        }
  }
@@@ -7079,6 -7102,7 +7131,7 @@@ static void check_preempt_wakeup(struc
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
        int scale = cfs_rq->nr_running >= sched_nr_latency;
        int next_buddy_marked = 0;
+       int cse_is_idle, pse_is_idle;
  
        if (unlikely(se == pse))
                return;
                return;
  
        find_matching_se(&se, &pse);
-       update_curr(cfs_rq_of(se));
        BUG_ON(!pse);
+       cse_is_idle = se_is_idle(se);
+       pse_is_idle = se_is_idle(pse);
+       /*
+        * Preempt an idle group in favor of a non-idle group (and don't preempt
+        * in the inverse case).
+        */
+       if (cse_is_idle && !pse_is_idle)
+               goto preempt;
+       if (cse_is_idle != pse_is_idle)
+               return;
+       update_curr(cfs_rq_of(se));
        if (wakeup_preempt_entity(se, pse) == 1) {
                /*
                 * Bias pick_next to pick the sched entity that is
@@@ -10217,9 -10254,11 +10283,11 @@@ static inline int on_null_domain(struc
  static inline int find_new_ilb(void)
  {
        int ilb;
+       const struct cpumask *hk_mask;
+       hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
  
-       for_each_cpu_and(ilb, nohz.idle_cpus_mask,
-                             housekeeping_cpumask(HK_FLAG_MISC)) {
+       for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
  
                if (ilb == smp_processor_id())
                        continue;
@@@ -11416,10 -11455,12 +11484,12 @@@ void init_tg_cfs_entry(struct task_grou
  
  static DEFINE_MUTEX(shares_mutex);
  
- int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+ static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
  {
        int i;
  
+       lockdep_assert_held(&shares_mutex);
+
        /*
         * We can't change the weight of the root cgroup.
         */
  
        shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
  
-       mutex_lock(&shares_mutex);
        if (tg->shares == shares)
-               goto done;
+               return 0;
  
        tg->shares = shares;
        for_each_possible_cpu(i) {
                rq_unlock_irqrestore(rq, &rf);
        }
  
- done:
+       return 0;
+ }
+
+ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+ {
+       int ret;
+
+       mutex_lock(&shares_mutex);
+       if (tg_is_idle(tg))
+               ret = -EINVAL;
+       else
+               ret = __sched_group_set_shares(tg, shares);
+       mutex_unlock(&shares_mutex);
+
+       return ret;
+ }
+
+ int sched_group_set_idle(struct task_group *tg, long idle)
+ {
+       int i;
+
+       if (tg == &root_task_group)
+               return -EINVAL;
+
+       if (idle < 0 || idle > 1)
+               return -EINVAL;
+
+       mutex_lock(&shares_mutex);
+
+       if (tg->idle == idle) {
+               mutex_unlock(&shares_mutex);
+               return 0;
+       }
+
+       tg->idle = idle;
+
+       for_each_possible_cpu(i) {
+               struct rq *rq = cpu_rq(i);
+               struct sched_entity *se = tg->se[i];
+               struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+               bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
+               long idle_task_delta;
+               struct rq_flags rf;
+
+               rq_lock_irqsave(rq, &rf);
+
+               grp_cfs_rq->idle = idle;
+               if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
+                       goto next_cpu;
+
+               idle_task_delta = grp_cfs_rq->h_nr_running -
+                                 grp_cfs_rq->idle_h_nr_running;
+               if (!cfs_rq_is_idle(grp_cfs_rq))
+                       idle_task_delta *= -1;
+
+               for_each_sched_entity(se) {
+                       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+                       if (!se->on_rq)
+                               break;
+
+                       cfs_rq->idle_h_nr_running += idle_task_delta;
+
+                       /* Already accounted at parent level and above. */
+                       if (cfs_rq_is_idle(cfs_rq))
+                               break;
+               }
+
+ next_cpu:
+               rq_unlock_irqrestore(rq, &rf);
+       }
+
+       /* Idle groups have minimum weight. */
+       if (tg_is_idle(tg))
+               __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
+       else
+               __sched_group_set_shares(tg, NICE_0_LOAD);
+
        mutex_unlock(&shares_mutex);
        return 0;
  }
  #else /* CONFIG_FAIR_GROUP_SCHED */
  
  void free_fair_sched_group(struct task_group *tg) { }
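sched_group_set_idle() is the backend for the new cgroup SCHED_IDLE support; the cgroup glue lives in kernel/sched/core.c and is not part of this excerpt. A sketch of how the setter is presumably exposed as a per-group cpu.idle file (handler names assumed, following the existing cpu.* cftype pattern):

	/* Sketch only: read/write handlers for a "cpu.idle" cgroup file that
	 * forward to the task_group helpers shown above.
	 */
	static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
				     struct cftype *cft)
	{
		return css_tg(css)->idle;
	}

	static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
				      struct cftype *cft, s64 idle)
	{
		return sched_group_set_idle(css_tg(css), idle);
	}

With something like that in place, writing 1 to a group's cpu.idle marks the whole group as SCHED_IDLE (and 0 clears it), while the -EINVAL in sched_group_set_shares() keeps the shares and idle knobs from fighting each other.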
diff --combined kernel/sched/sched.h
@@@ -227,6 -227,8 +227,8 @@@ static inline void update_avg(u64 *avg
   */
  #define SCHED_FLAG_SUGOV      0x10000000
  
+ #define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
  static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
  {
  #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
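SCHED_DL_FLAGS groups the three sched_attr flags that only make sense for deadline tasks. None of its call sites appear in this excerpt; a hypothetical helper, just to show the intent of the combined mask:

	/* Hypothetical illustration -- not a call site from this merge. */
	static inline bool attr_requests_dl_flags(const struct sched_attr *attr)
	{
		return attr->sched_flags & SCHED_DL_FLAGS;
	}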
@@@ -394,6 -396,9 +396,9 @@@ struct task_group 
        struct cfs_rq           **cfs_rq;
        unsigned long           shares;
  
+       /* A positive value indicates that this is a SCHED_IDLE group. */
+       int                     idle;
+
  #ifdef        CONFIG_SMP
        /*
         * load_avg can be heavily contended at clock tick time, so put
@@@ -503,6 -508,8 +508,8 @@@ extern void sched_move_task(struct task
  #ifdef CONFIG_FAIR_GROUP_SCHED
  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
  
+ extern int sched_group_set_idle(struct task_group *tg, long idle);
+
  #ifdef CONFIG_SMP
  extern void set_task_rq_fair(struct sched_entity *se,
                             struct cfs_rq *prev, struct cfs_rq *next);
@@@ -599,6 -606,9 +606,9 @@@ struct cfs_rq 
        struct list_head        leaf_cfs_rq_list;
        struct task_group       *tg;    /* group that "owns" this runqueue */
  
+       /* Locally cached copy of our task_group's idle value */
+       int                     idle;
+
  #ifdef CONFIG_CFS_BANDWIDTH
        int                     runtime_enabled;
        s64                     runtime_remaining;
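Both new idle fields are consumed through small predicates (tg_is_idle(), cfs_rq_is_idle()) that the hunks above call but do not show. Presumably they are trivial sign tests on these fields, along the lines of the sketch below (an assumption, not quoted from the tree):

	/* Sketch only: the group-scheduling case; the !CONFIG_FAIR_GROUP_SCHED
	 * variants would simply report "not idle".
	 */
	static inline int tg_is_idle(struct task_group *tg)
	{
		return tg->idle > 0;
	}

	static inline int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
	{
		return cfs_rq->idle > 0;
	}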
@@@ -1093,7 -1103,7 +1103,7 @@@ struct rq 
        unsigned int            core_sched_seq;
        struct rb_root          core_tree;
  
 -      /* shared state */
 +      /* shared state -- careful with sched_core_cpu_deactivate() */
        unsigned int            core_task_seq;
        unsigned int            core_pick_seq;
        unsigned long           core_cookie;
@@@ -2234,6 -2244,7 +2244,7 @@@ extern struct task_struct *pick_next_ta
  #define SCA_CHECK             0x01
  #define SCA_MIGRATE_DISABLE   0x02
  #define SCA_MIGRATE_ENABLE    0x04
+ #define SCA_USER              0x08
  
  #ifdef CONFIG_SMP
  
@@@ -2255,9 -2266,6 +2266,9 @@@ static inline struct task_struct *get_p
        if (p->nr_cpus_allowed == 1)
                return NULL;
  
 +      if (p->migration_disabled)
 +              return NULL;
 +
        rq->push_busy = true;
        return get_task_struct(p);
  }
@@@ -2388,6 -2396,21 +2399,21 @@@ extern void check_preempt_curr(struct r
  extern const_debug unsigned int sysctl_sched_nr_migrate;
  extern const_debug unsigned int sysctl_sched_migration_cost;
  
+ #ifdef CONFIG_SCHED_DEBUG
+ extern unsigned int sysctl_sched_latency;
+ extern unsigned int sysctl_sched_min_granularity;
+ extern unsigned int sysctl_sched_wakeup_granularity;
+ extern int sysctl_resched_latency_warn_ms;
+ extern int sysctl_resched_latency_warn_once;
+
+ extern unsigned int sysctl_sched_tunable_scaling;
+
+ extern unsigned int sysctl_numa_balancing_scan_delay;
+ extern unsigned int sysctl_numa_balancing_scan_period_min;
+ extern unsigned int sysctl_numa_balancing_scan_period_max;
+ extern unsigned int sysctl_numa_balancing_scan_size;
+ #endif
+
  #ifdef CONFIG_SCHED_HRTICK
  
  /*
@@@ -2821,27 -2844,20 +2847,27 @@@ static __always_inlin
  unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
                                  struct task_struct *p)
  {
 -      unsigned long min_util;
 -      unsigned long max_util;
 +      unsigned long min_util = 0;
 +      unsigned long max_util = 0;
  
        if (!static_branch_likely(&sched_uclamp_used))
                return util;
  
 -      min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
 -      max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
 -
        if (p) {
 -              min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
 -              max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
 +              min_util = uclamp_eff_value(p, UCLAMP_MIN);
 +              max_util = uclamp_eff_value(p, UCLAMP_MAX);
 +
 +              /*
 +               * Ignore last runnable task's max clamp, as this task will
 +               * reset it. Similarly, no need to read the rq's min clamp.
 +               */
 +              if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
 +                      goto out;
        }
  
 +      min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
 +      max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
 +out:
        /*
         * Since CPU's {min,max}_util clamps are MAX aggregated considering
         * RUNNABLE tasks with _different_ clamps, we can end up with an