Merge tag 'sched_urgent_for_v5.15_rc1' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Sun, 12 Sep 2021 18:37:41 +0000 (11:37 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sun, 12 Sep 2021 18:37:41 +0000 (11:37 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 12 Sep 2021 18:37:41 +0000 (11:37 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 12 Sep 2021 18:37:41 +0000 (11:37 -0700)
diff --combined kernel/sched/core.c

index c4462c4,b21a185..1bba412
--- 1/kernel/sched/core.c
--- 2/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@@ -1007,7 -1007,6 +1007,7 @@@ int get_nohz_timer_target(void
   {
         int i, cpu = smp_processor_id(), default_cpu = -1;
         struct sched_domain *sd;
+ +      const struct cpumask *hk_mask;
   
         if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
                 if (!idle_cpu(cpu))
@@@ -1015,11 -1014,10 +1015,11 @@@
                 default_cpu = cpu;
         }
   
+ +      hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
+ +
         rcu_read_lock();
         for_each_domain(cpu, sd) {
- -              for_each_cpu_and(i, sched_domain_span(sd),
- -                      housekeeping_cpumask(HK_FLAG_TIMER)) {
+ +              for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
                         if (cpu == i)
                                 continue;
   
@@@ -1635,23 -1633,6 +1635,23 @@@ static inline void uclamp_rq_dec(struc
                 uclamp_rq_dec_id(rq, p, clamp_id);
   }
   
+ +static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+ +                                    enum uclamp_id clamp_id)
+ +{
+ +      if (!p->uclamp[clamp_id].active)
+ +              return;
+ +
+ +      uclamp_rq_dec_id(rq, p, clamp_id);
+ +      uclamp_rq_inc_id(rq, p, clamp_id);
+ +
+ +      /*
+ +       * Make sure to clear the idle flag if we've transiently reached 0
+ +       * active tasks on rq.
+ +       */
+ +      if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ +              rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+ +}
+ +
   static inline void
   uclamp_update_active(struct task_struct *p)
   {
@@@ -1675,8 -1656,12 +1675,8 @@@
          * affecting a valid clamp bucket, the next time it's enqueued,
          * it will already see the updated clamp bucket value.
          */
- -      for_each_clamp_id(clamp_id) {
- -              if (p->uclamp[clamp_id].active) {
- -                      uclamp_rq_dec_id(rq, p, clamp_id);
- -                      uclamp_rq_inc_id(rq, p, clamp_id);
- -              }
- -      }
+ +      for_each_clamp_id(clamp_id)
+ +              uclamp_rq_reinc_id(rq, p, clamp_id);
   
         task_rq_unlock(rq, p, &rf);
   }
@@@ -2190,7 -2175,7 +2190,7 @@@ static inline bool is_cpu_allowed(struc
   
         /* Non kernel threads are not allowed during either online or offline. */
         if (!(p->flags & PF_KTHREAD))
- -              return cpu_active(cpu);
+ +              return cpu_active(cpu) && task_cpu_possible(cpu, p);
   
         /* KTHREAD_IS_PER_CPU is always allowed. */
         if (kthread_is_per_cpu(p))
@@@ -2497,34 -2482,6 +2497,34 @@@ void do_set_cpus_allowed(struct task_st
         __do_set_cpus_allowed(p, new_mask, 0);
   }
   
+ +int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+ +                    int node)
+ +{
+ +      if (!src->user_cpus_ptr)
+ +              return 0;
+ +
+ +      dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+ +      if (!dst->user_cpus_ptr)
+ +              return -ENOMEM;
+ +
+ +      cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ +      return 0;
+ +}
+ +
+ +static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+ +{
+ +      struct cpumask *user_mask = NULL;
+ +
+ +      swap(p->user_cpus_ptr, user_mask);
+ +
+ +      return user_mask;
+ +}
+ +
+ +void release_user_cpus_ptr(struct task_struct *p)
+ +{
+ +      kfree(clear_user_cpus_ptr(p));
+ +}
+ +
   /*
    * This function is wildly self concurrent; here be dragons.
    *
@@@ -2742,26 -2699,28 +2742,26 @@@ static int affine_move_task(struct rq *
   }
   
   /*
- - * Change a given task's CPU affinity. Migrate the thread to a
- - * proper CPU and schedule it away if the CPU it's executing on
- - * is removed from the allowed bitmask.
- - *
- - * NOTE: the caller must have a valid reference to the task, the
- - * task must not exit() & deallocate itself prematurely. The
- - * call is not atomic; no spinlocks may be held.
+ + * Called with both p->pi_lock and rq->lock held; drops both before returning.
    */
- -static int __set_cpus_allowed_ptr(struct task_struct *p,
- -                                const struct cpumask *new_mask,
- -                                u32 flags)
+ +static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+ +                                       const struct cpumask *new_mask,
+ +                                       u32 flags,
+ +                                       struct rq *rq,
+ +                                       struct rq_flags *rf)
+ +      __releases(rq->lock)
+ +      __releases(p->pi_lock)
   {
+ +      const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
         const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ +      bool kthread = p->flags & PF_KTHREAD;
+ +      struct cpumask *user_mask = NULL;
         unsigned int dest_cpu;
- -      struct rq_flags rf;
- -      struct rq *rq;
         int ret = 0;
   
- -      rq = task_rq_lock(p, &rf);
         update_rq_clock(rq);
   
- -      if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+ +      if (kthread || is_migration_disabled(p)) {
                 /*
                  * Kernel threads are allowed on online && !active CPUs,
                  * however, during cpu-hot-unplug, even these might get pushed
@@@ -2775,11 -2734,6 +2775,11 @@@
                 cpu_valid_mask = cpu_online_mask;
         }
   
+ +      if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+ +              ret = -EINVAL;
+ +              goto out;
+ +      }
+ +
         /*
          * Must re-check here, to close a race against __kthread_bind(),
          * sched_setaffinity() is not guaranteed to observe the flag.
@@@ -2814,178 -2768,20 +2814,178 @@@
   
         __do_set_cpus_allowed(p, new_mask, flags);
   
- -      return affine_move_task(rq, p, &rf, dest_cpu, flags);
+ +      if (flags & SCA_USER)
+ +              user_mask = clear_user_cpus_ptr(p);
+ +
+ +      ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+ +
+ +      kfree(user_mask);
+ +
+ +      return ret;
   
   out:
- -      task_rq_unlock(rq, p, &rf);
+ +      task_rq_unlock(rq, p, rf);
   
         return ret;
   }
   
+ +/*
+ + * Change a given task's CPU affinity. Migrate the thread to a
+ + * proper CPU and schedule it away if the CPU it's executing on
+ + * is removed from the allowed bitmask.
+ + *
+ + * NOTE: the caller must have a valid reference to the task, the
+ + * task must not exit() & deallocate itself prematurely. The
+ + * call is not atomic; no spinlocks may be held.
+ + */
+ +static int __set_cpus_allowed_ptr(struct task_struct *p,
+ +                                const struct cpumask *new_mask, u32 flags)
+ +{
+ +      struct rq_flags rf;
+ +      struct rq *rq;
+ +
+ +      rq = task_rq_lock(p, &rf);
+ +      return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+ +}
+ +
   int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
   {
         return __set_cpus_allowed_ptr(p, new_mask, 0);
   }
   EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
   
+ +/*
+ + * Change a given task's CPU affinity to the intersection of its current
+ + * affinity mask and @subset_mask, writing the resulting mask to @new_mask
+ + * and pointing @p->user_cpus_ptr to a copy of the old mask.
+ + * If the resulting mask is empty, leave the affinity unchanged and return
+ + * -EINVAL.
+ + */
+ +static int restrict_cpus_allowed_ptr(struct task_struct *p,
+ +                                   struct cpumask *new_mask,
+ +                                   const struct cpumask *subset_mask)
+ +{
+ +      struct cpumask *user_mask = NULL;
+ +      struct rq_flags rf;
+ +      struct rq *rq;
+ +      int err;
+ +
+ +      if (!p->user_cpus_ptr) {
+ +              user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+ +              if (!user_mask)
+ +                      return -ENOMEM;
+ +      }
+ +
+ +      rq = task_rq_lock(p, &rf);
+ +
+ +      /*
+ +       * Forcefully restricting the affinity of a deadline task is
+ +       * likely to cause problems, so fail and noisily override the
+ +       * mask entirely.
+ +       */
+ +      if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ +              err = -EPERM;
+ +              goto err_unlock;
+ +      }
+ +
+ +      if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+ +              err = -EINVAL;
+ +              goto err_unlock;
+ +      }
+ +
+ +      /*
+ +       * We're about to butcher the task affinity, so keep track of what
+ +       * the user asked for in case we're able to restore it later on.
+ +       */
+ +      if (user_mask) {
+ +              cpumask_copy(user_mask, p->cpus_ptr);
+ +              p->user_cpus_ptr = user_mask;
+ +      }
+ +
+ +      return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+ +
+ +err_unlock:
+ +      task_rq_unlock(rq, p, &rf);
+ +      kfree(user_mask);
+ +      return err;
+ +}
+ +
+ +/*
+ + * Restrict the CPU affinity of task @p so that it is a subset of
+ + * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
+ + * old affinity mask. If the resulting mask is empty, we warn and walk
+ + * up the cpuset hierarchy until we find a suitable mask.
+ + */
+ +void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+ +{
+ +      cpumask_var_t new_mask;
+ +      const struct cpumask *override_mask = task_cpu_possible_mask(p);
+ +
+ +      alloc_cpumask_var(&new_mask, GFP_KERNEL);
+ +
+ +      /*
+ +       * __migrate_task() can fail silently in the face of concurrent
+ +       * offlining of the chosen destination CPU, so take the hotplug
+ +       * lock to ensure that the migration succeeds.
+ +       */
+ +      cpus_read_lock();
+ +      if (!cpumask_available(new_mask))
+ +              goto out_set_mask;
+ +
+ +      if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+ +              goto out_free_mask;
+ +
+ +      /*
+ +       * We failed to find a valid subset of the affinity mask for the
+ +       * task, so override it based on its cpuset hierarchy.
+ +       */
+ +      cpuset_cpus_allowed(p, new_mask);
+ +      override_mask = new_mask;
+ +
+ +out_set_mask:
+ +      if (printk_ratelimit()) {
+ +              printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+ +                              task_pid_nr(p), p->comm,
+ +                              cpumask_pr_args(override_mask));
+ +      }
+ +
+ +      WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+ +out_free_mask:
+ +      cpus_read_unlock();
+ +      free_cpumask_var(new_mask);
+ +}
+ +
+ +static int
+ +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+ +
+ +/*
+ + * Restore the affinity of a task @p which was previously restricted by a
+ + * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
+ + * @p->user_cpus_ptr.
+ + *
+ + * It is the caller's responsibility to serialise this with any calls to
+ + * force_compatible_cpus_allowed_ptr(@p).
+ + */
+ +void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+ +{
+ +      struct cpumask *user_mask = p->user_cpus_ptr;
+ +      unsigned long flags;
+ +
+ +      /*
+ +       * Try to restore the old affinity mask. If this fails, then
+ +       * we free the mask explicitly to avoid it being inherited across
+ +       * a subsequent fork().
+ +       */
+ +      if (!user_mask || !__sched_setaffinity(p, user_mask))
+ +              return;
+ +
+ +      raw_spin_lock_irqsave(&p->pi_lock, flags);
+ +      user_mask = clear_user_cpus_ptr(p);
+ +      raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ +
+ +      kfree(user_mask);
+ +}
+ +
   void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
   {
   #ifdef CONFIG_SCHED_DEBUG
@@@ -3330,7 -3126,9 +3330,7 @@@ static int select_fallback_rq(int cpu, 
   
                 /* Look for allowed, online CPU in same node. */
                 for_each_cpu(dest_cpu, nodemask) {
- -                      if (!cpu_active(dest_cpu))
- -                              continue;
- -                      if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+ +                      if (is_cpu_allowed(p, dest_cpu))
                                 return dest_cpu;
                 }
         }
@@@ -3347,7 -3145,8 +3347,7 @@@
                 /* No more Mr. Nice Guy. */
                 switch (state) {
                 case cpuset:
- -                      if (IS_ENABLED(CONFIG_CPUSETS)) {
- -                              cpuset_cpus_allowed_fallback(p);
+ +                      if (cpuset_cpus_allowed_fallback(p)) {
                                 state = possible;
                                 break;
                         }
@@@ -3359,9 -3158,10 +3359,9 @@@
                          *
                          * More yuck to audit.
                          */
- -                      do_set_cpus_allowed(p, cpu_possible_mask);
+ +                      do_set_cpus_allowed(p, task_cpu_possible_mask(p));
                         state = fail;
                         break;
- -
                 case fail:
                         BUG();
                         break;
@@@ -3775,55 -3575,6 +3775,55 @@@ static void ttwu_queue(struct task_stru
         rq_unlock(rq, &rf);
   }
   
+ +/*
+ + * Invoked from try_to_wake_up() to check whether the task can be woken up.
+ + *
+ + * The caller holds p::pi_lock if p != current or has preemption
+ + * disabled when p == current.
+ + *
+ + * The rules of PREEMPT_RT saved_state:
+ + *
+ + *   The related locking code always holds p::pi_lock when updating
+ + *   p::saved_state, which means the code is fully serialized in both cases.
+ + *
+ + *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
+ + *   bits set. This allows all wakeup scenarios to be distinguished.
+ + */
+ +static __always_inline
+ +bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+ +{
+ +      if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
+ +              WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
+ +                           state != TASK_RTLOCK_WAIT);
+ +      }
+ +
+ +      if (READ_ONCE(p->__state) & state) {
+ +              *success = 1;
+ +              return true;
+ +      }
+ +
+ +#ifdef CONFIG_PREEMPT_RT
+ +      /*
+ +       * Saved state preserves the task state across blocking on
+ +       * an RT lock.  If the state matches, set p::saved_state to
+ +       * TASK_RUNNING, but do not wake the task because it waits
+ +       * for a lock wakeup. Also indicate success because from
+ +       * the regular waker's point of view this has succeeded.
+ +       *
+ +       * After acquiring the lock the task will restore p::__state
+ +       * from p::saved_state which ensures that the regular
+ +       * wakeup is not lost. The restore will also set
+ +       * p::saved_state to TASK_RUNNING so any further tests will
+ +       * not result in false positives vs. @success.
+ +       */
+ +      if (p->saved_state & state) {
+ +              p->saved_state = TASK_RUNNING;
+ +              *success = 1;
+ +      }
+ +#endif
+ +      return false;
+ +}
+ +
   /*
    * Notes on Program-Order guarantees on SMP systems.
    *
@@@ -3963,9 -3714,10 +3963,9 @@@ try_to_wake_up(struct task_struct *p, u
                  *  - we're serialized against set_special_state() by virtue of
                  *    it disabling IRQs (this allows not taking ->pi_lock).
                  */
- -              if (!(READ_ONCE(p->__state) & state))
+ +              if (!ttwu_state_match(p, state, &success))
                         goto out;
   
- -              success = 1;
                 trace_sched_waking(p);
                 WRITE_ONCE(p->__state, TASK_RUNNING);
                 trace_sched_wakeup(p);
@@@ -3980,11 -3732,14 +3980,11 @@@
          */
         raw_spin_lock_irqsave(&p->pi_lock, flags);
         smp_mb__after_spinlock();
- -      if (!(READ_ONCE(p->__state) & state))
+ +      if (!ttwu_state_match(p, state, &success))
                 goto unlock;
   
         trace_sched_waking(p);
   
- -      /* We're going to change ->state: */
- -      success = 1;
- -
         /*
          * Ensure we load p->on_rq _after_ p->state, otherwise it would
          * be possible to, falsely, observe p->on_rq == 0 and get stuck
@@@ -5919,9 -5674,11 +5919,9 @@@ static bool try_steal_cookie(int this, 
                 if (p->core_occupation > dst->idle->core_occupation)
                         goto next;
   
- -              p->on_rq = TASK_ON_RQ_MIGRATING;
                 deactivate_task(src, p, 0);
                 set_task_cpu(p, this);
                 activate_task(dst, p, 0);
- -              p->on_rq = TASK_ON_RQ_QUEUED;
   
                 resched_curr(dst);
   
@@@ -6105,24 -5862,6 +6105,24 @@@ pick_next_task(struct rq *rq, struct ta
   
   #endif /* CONFIG_SCHED_CORE */
   
+ +/*
+ + * Constants for the sched_mode argument of __schedule().
+ + *
+ + * The mode argument allows RT enabled kernels to differentiate a
+ + * preemption from blocking on a 'sleeping' spin/rwlock. Note that
+ + * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to
+ + * optimize the AND operation out and just check for zero.
+ + */
+ +#define SM_NONE                       0x0
+ +#define SM_PREEMPT            0x1
+ +#define SM_RTLOCK_WAIT                0x2
+ +
+ +#ifndef CONFIG_PREEMPT_RT
+ +# define SM_MASK_PREEMPT      (~0U)
+ +#else
+ +# define SM_MASK_PREEMPT      SM_PREEMPT
+ +#endif
+ +
   /*
    * __schedule() is the main scheduler function.
    *
@@@ -6162,7 -5901,7 +6162,7 @@@
    *
    * WARNING: must be called with preemption disabled!
    */
- -static void __sched notrace __schedule(bool preempt)
+ +static void __sched notrace __schedule(unsigned int sched_mode)
   {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
@@@ -6175,13 -5914,13 +6175,13 @@@
         rq = cpu_rq(cpu);
         prev = rq->curr;
   
- -      schedule_debug(prev, preempt);
+ +      schedule_debug(prev, !!sched_mode);
   
         if (sched_feat(HRTICK) || sched_feat(HRTICK_DL))
                 hrtick_clear(rq);
   
         local_irq_disable();
- -      rcu_note_context_switch(preempt);
+ +      rcu_note_context_switch(!!sched_mode);
   
         /*
          * Make sure that signal_pending_state()->signal_pending() below
@@@ -6215,7 -5954,7 +6215,7 @@@
          *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
          */
         prev_state = READ_ONCE(prev->__state);
- -      if (!preempt && prev_state) {
+ +      if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) {
                 if (signal_pending_state(prev_state, prev)) {
                         WRITE_ONCE(prev->__state, TASK_RUNNING);
                 } else {
@@@ -6281,7 -6020,7 +6281,7 @@@
                 migrate_disable_switch(rq, prev);
                 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
   
- -              trace_sched_switch(preempt, prev, next);
+ +              trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next);
   
                 /* Also unlocks the rq: */
                 rq = context_switch(rq, prev, next, &rf);
@@@ -6302,7 -6041,7 +6302,7 @@@ void __noreturn do_task_dead(void
         /* Tell freezer to ignore us: */
         current->flags |= PF_NOFREEZE;
   
- -      __schedule(false);
+ +      __schedule(SM_NONE);
         BUG();
   
         /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
@@@ -6363,7 -6102,7 +6363,7 @@@ asmlinkage __visible void __sched sched
         sched_submit_work(tsk);
         do {
                 preempt_disable();
- -              __schedule(false);
+ +              __schedule(SM_NONE);
                 sched_preempt_enable_no_resched();
         } while (need_resched());
         sched_update_worker(tsk);
@@@ -6391,7 -6130,7 +6391,7 @@@ void __sched schedule_idle(void
          */
         WARN_ON_ONCE(current->__state);
         do {
- -              __schedule(false);
+ +              __schedule(SM_NONE);
         } while (need_resched());
   }
   
@@@ -6426,18 -6165,6 +6426,18 @@@ void __sched schedule_preempt_disabled(
         preempt_disable();
   }
   
+ +#ifdef CONFIG_PREEMPT_RT
+ +void __sched notrace schedule_rtlock(void)
+ +{
+ +      do {
+ +              preempt_disable();
+ +              __schedule(SM_RTLOCK_WAIT);
+ +              sched_preempt_enable_no_resched();
+ +      } while (need_resched());
+ +}
+ +NOKPROBE_SYMBOL(schedule_rtlock);
+ +#endif
+ +
   static void __sched notrace preempt_schedule_common(void)
   {
         do {
@@@ -6456,7 -6183,7 +6456,7 @@@
                  */
                 preempt_disable_notrace();
                 preempt_latency_start(1);
- -              __schedule(true);
+ +              __schedule(SM_PREEMPT);
                 preempt_latency_stop(1);
                 preempt_enable_no_resched_notrace();
   
@@@ -6535,7 -6262,7 +6535,7 @@@ asmlinkage __visible void __sched notra
                  * an infinite recursion.
                  */
                 prev_ctx = exception_enter();
- -              __schedule(true);
+ +              __schedule(SM_PREEMPT);
                 exception_exit(prev_ctx);
   
                 preempt_latency_stop(1);
@@@ -6684,7 -6411,7 +6684,7 @@@ asmlinkage __visible void __sched preem
         do {
                 preempt_disable();
                 local_irq_enable();
- -              __schedule(true);
+ +              __schedule(SM_PREEMPT);
                 local_irq_disable();
                 sched_preempt_enable_no_resched();
         } while (need_resched());
@@@ -7661,16 -7388,6 +7661,16 @@@ err_size
         return -E2BIG;
   }
   
+ +static void get_params(struct task_struct *p, struct sched_attr *attr)
+ +{
+ +      if (task_has_dl_policy(p))
+ +              __getparam_dl(p, attr);
+ +      else if (task_has_rt_policy(p))
+ +              attr->sched_priority = p->rt_priority;
+ +      else
+ +              attr->sched_nice = task_nice(p);
+ +}
+ +
   /**
    * sys_sched_setscheduler - set/change the scheduler policy and RT priority
    * @pid: the pid in question.
@@@ -7732,8 -7449,6 +7732,8 @@@ SYSCALL_DEFINE3(sched_setattr, pid_t, p
         rcu_read_unlock();
   
         if (likely(p)) {
+ +              if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ +                      get_params(p, &attr);
                 retval = sched_setattr(p, &attr);
                 put_task_struct(p);
         }
@@@ -7882,8 -7597,12 +7882,8 @@@ SYSCALL_DEFINE4(sched_getattr, pid_t, p
         kattr.sched_policy = p->policy;
         if (p->sched_reset_on_fork)
                 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- -      if (task_has_dl_policy(p))
- -              __getparam_dl(p, &kattr);
- -      else if (task_has_rt_policy(p))
- -              kattr.sched_priority = p->rt_priority;
- -      else
- -              kattr.sched_nice = task_nice(p);
+ +      get_params(p, &kattr);
+ +      kattr.sched_flags &= SCHED_FLAG_ALL;
   
   #ifdef CONFIG_UCLAMP_TASK
         /*
@@@ -7904,76 -7623,9 +7904,76 @@@ out_unlock
         return retval;
   }
   
- -long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+ +#ifdef CONFIG_SMP
+ +int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
   {
+ +      int ret = 0;
+ +
+ +      /*
+ +       * If the task isn't a deadline task or admission control is
+ +       * disabled then we don't care about affinity changes.
+ +       */
+ +      if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+ +              return 0;
+ +
+ +      /*
+ +       * Since bandwidth control happens on root_domain basis,
+ +       * if admission test is enabled, we only admit -deadline
+ +       * tasks allowed to run on all the CPUs in the task's
+ +       * root_domain.
+ +       */
+ +      rcu_read_lock();
+ +      if (!cpumask_subset(task_rq(p)->rd->span, mask))
+ +              ret = -EBUSY;
+ +      rcu_read_unlock();
+ +      return ret;
+ +}
+ +#endif
+ +
+ +static int
+ +__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+ +{
+ +      int retval;
         cpumask_var_t cpus_allowed, new_mask;
+ +
+ +      if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+ +              return -ENOMEM;
+ +
+ +      if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ +              retval = -ENOMEM;
+ +              goto out_free_cpus_allowed;
+ +      }
+ +
+ +      cpuset_cpus_allowed(p, cpus_allowed);
+ +      cpumask_and(new_mask, mask, cpus_allowed);
+ +
+ +      retval = dl_task_check_affinity(p, new_mask);
+ +      if (retval)
+ +              goto out_free_new_mask;
+ +again:
+ +      retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+ +      if (retval)
+ +              goto out_free_new_mask;
+ +
+ +      cpuset_cpus_allowed(p, cpus_allowed);
+ +      if (!cpumask_subset(new_mask, cpus_allowed)) {
+ +              /*
+ +               * We must have raced with a concurrent cpuset update.
+ +               * Just reset the cpumask to the cpuset's cpus_allowed.
+ +               */
+ +              cpumask_copy(new_mask, cpus_allowed);
+ +              goto again;
+ +      }
+ +
+ +out_free_new_mask:
+ +      free_cpumask_var(new_mask);
+ +out_free_cpus_allowed:
+ +      free_cpumask_var(cpus_allowed);
+ +      return retval;
+ +}
+ +
+ +long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+ +{
         struct task_struct *p;
         int retval;
   
@@@ -7993,22 -7645,68 +7993,22 @@@
                 retval = -EINVAL;
                 goto out_put_task;
         }
- -      if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- -              retval = -ENOMEM;
- -              goto out_put_task;
- -      }
- -      if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
- -              retval = -ENOMEM;
- -              goto out_free_cpus_allowed;
- -      }
- -      retval = -EPERM;
+ +
         if (!check_same_owner(p)) {
                 rcu_read_lock();
                 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
                         rcu_read_unlock();
- -                      goto out_free_new_mask;
+ +                      retval = -EPERM;
+ +                      goto out_put_task;
                 }
                 rcu_read_unlock();
         }
   
         retval = security_task_setscheduler(p);
         if (retval)
- -              goto out_free_new_mask;
- -
- -
- -      cpuset_cpus_allowed(p, cpus_allowed);
- -      cpumask_and(new_mask, in_mask, cpus_allowed);
- -
- -      /*
- -       * Since bandwidth control happens on root_domain basis,
- -       * if admission test is enabled, we only admit -deadline
- -       * tasks allowed to run on all the CPUs in the task's
- -       * root_domain.
- -       */
- -#ifdef CONFIG_SMP
- -      if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
- -              rcu_read_lock();
- -              if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
- -                      retval = -EBUSY;
- -                      rcu_read_unlock();
- -                      goto out_free_new_mask;
- -              }
- -              rcu_read_unlock();
- -      }
- -#endif
- -again:
- -      retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+ +              goto out_put_task;
   
- -      if (!retval) {
- -              cpuset_cpus_allowed(p, cpus_allowed);
- -              if (!cpumask_subset(new_mask, cpus_allowed)) {
- -                      /*
- -                       * We must have raced with a concurrent cpuset
- -                       * update. Just reset the cpus_allowed to the
- -                       * cpuset's cpus_allowed
- -                       */
- -                      cpumask_copy(new_mask, cpus_allowed);
- -                      goto again;
- -              }
- -      }
- -out_free_new_mask:
- -      free_cpumask_var(new_mask);
- -out_free_cpus_allowed:
- -      free_cpumask_var(cpus_allowed);
+ +      retval = __sched_setaffinity(p, in_mask);
   out_put_task:
         put_task_struct(p);
         return retval;
@@@ -8151,17 -7849,6 +8151,17 @@@ int __sched __cond_resched(void
                 preempt_schedule_common();
                 return 1;
         }
+ +      /*
+ +       * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+ +       * whether the current CPU is in an RCU read-side critical section,
+ +       * so the tick can report quiescent states even for CPUs looping
+ +       * in kernel context.  In contrast, in non-preemptible kernels,
+ +       * RCU readers leave no in-memory hints, which means that CPU-bound
+ +       * processes executing in kernel context might never report an
+ +       * RCU quiescent state.  Therefore, the following code causes
+ +       * cond_resched() to report a quiescent state, but only when RCU
+ +       * is in urgent need of one.
+ +       */
   #ifndef CONFIG_PREEMPT_RCU
         rcu_all_qs();
   #endif
@@@ -8836,7 -8523,6 +8836,6 @@@ static void balance_push(struct rq *rq
         struct task_struct *push_task = rq->curr;
   
         lockdep_assert_rq_held(rq);
-       SCHED_WARN_ON(rq->cpu != smp_processor_id());
   
         /*
          * Ensure the thing is persistent until balance_push_set(.on = false);
@@@ -8844,9 -8530,10 +8843,10 @@@
         rq->balance_callback = &balance_push_callback;
   
         /*
-        * Only active while going offline.
+        * Only active while going offline and when invoked on the outgoing
+        * CPU.
          */
-       if (!cpu_dying(rq->cpu))
+       if (!cpu_dying(rq->cpu) || rq != this_rq())
                 return;
   
         /*
@@@ -10208,7 -9895,7 +10208,7 @@@ static int tg_set_cfs_bandwidth(struct 
          * Prevent race between setting of cfs_rq->runtime_enabled and
          * unthrottle_offline_cfs_rqs().
          */
- -      get_online_cpus();
+ +      cpus_read_lock();
         mutex_lock(&cfs_constraints_mutex);
         ret = __cfs_schedulable(tg, period, quota);
         if (ret)
@@@ -10252,7 -9939,7 +10252,7 @@@
                 cfs_bandwidth_usage_dec();
   out_unlock:
         mutex_unlock(&cfs_constraints_mutex);
- -      put_online_cpus();
+ +      cpus_read_unlock();
   
         return ret;
   }
@@@ -10503,20 -10190,6 +10503,20 @@@ static u64 cpu_rt_period_read_uint(stru
   }
   #endif /* CONFIG_RT_GROUP_SCHED */
   
+ +#ifdef CONFIG_FAIR_GROUP_SCHED
+ +static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ +                             struct cftype *cft)
+ +{
+ +      return css_tg(css)->idle;
+ +}
+ +
+ +static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ +                              struct cftype *cft, s64 idle)
+ +{
+ +      return sched_group_set_idle(css_tg(css), idle);
+ +}
+ +#endif
+ +
   static struct cftype cpu_legacy_files[] = {
   #ifdef CONFIG_FAIR_GROUP_SCHED
         {
@@@ -10524,11 -10197,6 +10524,11 @@@
                 .read_u64 = cpu_shares_read_u64,
                 .write_u64 = cpu_shares_write_u64,
         },
+ +      {
+ +              .name = "idle",
+ +              .read_s64 = cpu_idle_read_s64,
+ +              .write_s64 = cpu_idle_write_s64,
+ +      },
   #endif
   #ifdef CONFIG_CFS_BANDWIDTH
         {
@@@ -10736,12 -10404,6 +10736,12 @@@ static struct cftype cpu_files[] = 
                 .read_s64 = cpu_weight_nice_read_s64,
                 .write_s64 = cpu_weight_nice_write_s64,
         },
+ +      {
+ +              .name = "idle",
+ +              .flags = CFTYPE_NOT_ON_ROOT,
+ +              .read_s64 = cpu_idle_read_s64,
+ +              .write_s64 = cpu_idle_write_s64,
+ +      },
   #endif
   #ifdef CONFIG_CFS_BANDWIDTH
         {
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 12 Sep 2021 18:37:41 +0000 (11:37 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 12 Sep 2021 18:37:41 +0000 (11:37 -0700)