Merge tag 'sched-core-2021-08-30' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 30 Aug 2021 20:42:10 +0000 (13:42 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 30 Aug 2021 20:42:10 +0000 (13:42 -0700)
diff --git a/fs/aio.c b/fs/aio.c

index 76ce0cc..51b08ab 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
                 list_del(&iocb->ki_list);
                 iocb->ki_res.res = mangle_poll(mask);
                 req->done = true;
-               if (iocb->ki_eventfd && eventfd_signal_count()) {
+               if (iocb->ki_eventfd && eventfd_signal_allowed()) {
                         iocb = NULL;
                         INIT_WORK(&req->work, aio_poll_put_work);
                         schedule_work(&req->work);
diff --git a/fs/eventfd.c b/fs/eventfd.c

index e265b6d..3627dd7 100644 (file)
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -25,8 +25,6 @@
  #include <linux/idr.h>
  #include <linux/uio.h>
  
-DEFINE_PER_CPU(int, eventfd_wake_count);
-
  static DEFINE_IDA(eventfd_ida);
  
  struct eventfd_ctx {
@@ -67,21 +65,21 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
          * Deadlock or stack overflow issues can happen if we recurse here
          * through waitqueue wakeup handlers. If the caller users potentially
          * nested waitqueues with custom wakeup handlers, then it should
-        * check eventfd_signal_count() before calling this function. If
-        * it returns true, the eventfd_signal() call should be deferred to a
+        * check eventfd_signal_allowed() before calling this function. If
+        * it returns false, the eventfd_signal() call should be deferred to a
          * safe context.
          */
-       if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+       if (WARN_ON_ONCE(current->in_eventfd_signal))
                 return 0;
  
         spin_lock_irqsave(&ctx->wqh.lock, flags);
-       this_cpu_inc(eventfd_wake_count);
+       current->in_eventfd_signal = 1;
         if (ULLONG_MAX - ctx->count < n)
                 n = ULLONG_MAX - ctx->count;
         ctx->count += n;
         if (waitqueue_active(&ctx->wqh))
                 wake_up_locked_poll(&ctx->wqh, EPOLLIN);
-       this_cpu_dec(eventfd_wake_count);
+       current->in_eventfd_signal = 0;
         spin_unlock_irqrestore(&ctx->wqh.lock, flags);
  
         return n;
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h

index 04c20de..d2b9c41 100644 (file)
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -15,6 +15,7 @@
  #include <linux/cpumask.h>
  #include <linux/nodemask.h>
  #include <linux/mm.h>
+#include <linux/mmu_context.h>
  #include <linux/jump_label.h>
  
  #ifdef CONFIG_CPUSETS
@@ -58,7 +59,7 @@ extern void cpuset_wait_for_hotplug(void);
  extern void cpuset_read_lock(void);
  extern void cpuset_read_unlock(void);
  extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
-extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
+extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
  extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
  #define cpuset_current_mems_allowed (current->mems_allowed)
  void cpuset_init_current_mems_allowed(void);
@@ -184,11 +185,12 @@ static inline void cpuset_read_unlock(void) { }
  static inline void cpuset_cpus_allowed(struct task_struct *p,
                                        struct cpumask *mask)
  {
-       cpumask_copy(mask, cpu_possible_mask);
+       cpumask_copy(mask, task_cpu_possible_mask(p));
  }
  
-static inline void cpuset_cpus_allowed_fallback(struct task_struct *p)
+static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
  {
+       return false;
  }
  
  static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h

index fa0a524..305d5f1 100644 (file)
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -14,6 +14,7 @@
  #include <linux/err.h>
  #include <linux/percpu-defs.h>
  #include <linux/percpu.h>
+#include <linux/sched.h>
  
  /*
   * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
@@ -43,11 +44,9 @@ int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *w
                                   __u64 *cnt);
  void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
  
-DECLARE_PER_CPU(int, eventfd_wake_count);
-
-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
  {
-       return this_cpu_read(eventfd_wake_count);
+       return !current->in_eventfd_signal;
  }
  
  #else /* CONFIG_EVENTFD */
@@ -78,9 +77,9 @@ static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
         return -ENOSYS;
  }
  
-static inline bool eventfd_signal_count(void)
+static inline bool eventfd_signal_allowed(void)
  {
-       return false;
+       return true;
  }
  
  static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h

index 03dee12..b9b970f 100644 (file)
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -14,4 +14,18 @@
  static inline void leave_mm(int cpu) { }
  #endif
  
+/*
+ * CPUs that are capable of running user task @p. Must contain at least one
+ * active CPU. It is assumed that the kernel can run on all CPUs, so calling
+ * this for a kernel thread is pointless.
+ *
+ * By default, we assume a sane, homogeneous system.
+ */
+#ifndef task_cpu_possible_mask
+# define task_cpu_possible_mask(p)     cpu_possible_mask
+# define task_cpu_possible(cpu, p)     true
+#else
+# define task_cpu_possible(cpu, p)     cpumask_test_cpu((cpu), task_cpu_possible_mask(p))
+#endif
+
  #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h

index ec8d07d..7c6a77d 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -748,6 +748,7 @@ struct task_struct {
         unsigned int                    policy;
         int                             nr_cpus_allowed;
         const cpumask_t                 *cpus_ptr;
+       cpumask_t                       *user_cpus_ptr;
         cpumask_t                       cpus_mask;
         void                            *migration_pending;
  #ifdef CONFIG_SMP
@@ -863,6 +864,10 @@ struct task_struct {
         /* Used by page_owner=on to detect recursion in page tracking. */
         unsigned                        in_page_owner:1;
  #endif
+#ifdef CONFIG_EVENTFD
+       /* Recursion prevention for eventfd_signal() */
+       unsigned                        in_eventfd_signal:1;
+#endif
  
         unsigned long                   atomic_flags; /* Flags requiring atomic access. */
  
@@ -1705,6 +1710,11 @@ extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_
  #ifdef CONFIG_SMP
  extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
  extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
+extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
+extern void release_user_cpus_ptr(struct task_struct *p);
+extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
+extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
+extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
  #else
  static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  {
@@ -1715,6 +1725,21 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpuma
                 return -EINVAL;
         return 0;
  }
+static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
+{
+       if (src->user_cpus_ptr)
+               return -EINVAL;
+       return 0;
+}
+static inline void release_user_cpus_ptr(struct task_struct *p)
+{
+       WARN_ON(p->user_cpus_ptr);
+}
+
+static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
+{
+       return 0;
+}
  #endif
  
  extern int yield_to(struct task_struct *p, bool preempt);
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h

index db2c0f3..304f431 100644 (file)
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -28,30 +28,12 @@ enum { sysctl_hung_task_timeout_secs = 0 };
  
  extern unsigned int sysctl_sched_child_runs_first;
  
-extern unsigned int sysctl_sched_latency;
-extern unsigned int sysctl_sched_min_granularity;
-extern unsigned int sysctl_sched_wakeup_granularity;
-
  enum sched_tunable_scaling {
         SCHED_TUNABLESCALING_NONE,
         SCHED_TUNABLESCALING_LOG,
         SCHED_TUNABLESCALING_LINEAR,
         SCHED_TUNABLESCALING_END,
  };
-extern unsigned int sysctl_sched_tunable_scaling;
-
-extern unsigned int sysctl_numa_balancing_scan_delay;
-extern unsigned int sysctl_numa_balancing_scan_period_min;
-extern unsigned int sysctl_numa_balancing_scan_period_max;
-extern unsigned int sysctl_numa_balancing_scan_size;
-
-#ifdef CONFIG_SCHED_DEBUG
-extern __read_mostly unsigned int sysctl_sched_migration_cost;
-extern __read_mostly unsigned int sysctl_sched_nr_migrate;
-
-extern int sysctl_resched_latency_warn_ms;
-extern int sysctl_resched_latency_warn_once;
-#endif
  
  /*
   *  control realtime throttling:
diff --git a/include/linux/wait.h b/include/linux/wait.h

index 6598ae3..93dab0e 100644 (file)
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -56,7 +56,7 @@ struct task_struct;
  
  #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                                  \
         .lock           = __SPIN_LOCK_UNLOCKED(name.lock),                      \
-       .head           = { &(name).head, &(name).head } }
+       .head           = LIST_HEAD_INIT(name.head) }
  
  #define DECLARE_WAIT_QUEUE_HEAD(name) \
         struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)
diff --git a/init/init_task.c b/init/init_task.c

index 562f2ef..2d02406 100644 (file)
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -80,6 +80,7 @@ struct task_struct init_task
         .normal_prio    = MAX_PRIO - 20,
         .policy         = SCHED_NORMAL,
         .cpus_ptr       = &init_task.cpus_mask,
+       .user_cpus_ptr  = NULL,
         .cpus_mask      = CPU_MASK_ALL,
         .nr_cpus_allowed= NR_CPUS,
         .mm             = NULL,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c

index adb5190..6500cbe 100644 (file)
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -372,18 +372,29 @@ static inline bool is_in_v2_mode(void)
  }
  
  /*
- * Return in pmask the portion of a cpusets's cpus_allowed that
- * are online.  If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.
+ * Return in pmask the portion of a task's cpuset's cpus_allowed that
+ * are online and are capable of running the task.  If none are found,
+ * walk up the cpuset hierarchy until we find one that does have some
+ * appropriate cpus.
   *
   * One way or another, we guarantee to return some non-empty subset
   * of cpu_online_mask.
   *
   * Call with callback_lock or cpuset_mutex held.
   */
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+static void guarantee_online_cpus(struct task_struct *tsk,
+                                 struct cpumask *pmask)
  {
-       while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
+       const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+       struct cpuset *cs;
+
+       if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
+               cpumask_copy(pmask, cpu_online_mask);
+
+       rcu_read_lock();
+       cs = task_cs(tsk);
+
+       while (!cpumask_intersects(cs->effective_cpus, pmask)) {
                 cs = parent_cs(cs);
                 if (unlikely(!cs)) {
                         /*
@@ -393,11 +404,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
                          * cpuset's effective_cpus is on its way to be
                          * identical to cpu_online_mask.
                          */
-                       cpumask_copy(pmask, cpu_online_mask);
-                       return;
+                       goto out_unlock;
                 }
         }
-       cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+       cpumask_and(pmask, pmask, cs->effective_cpus);
+
+out_unlock:
+       rcu_read_unlock();
  }
  
  /*
@@ -2199,15 +2212,13 @@ static void cpuset_attach(struct cgroup_taskset *tset)
  
         percpu_down_write(&cpuset_rwsem);
  
-       /* prepare for attach */
-       if (cs == &top_cpuset)
-               cpumask_copy(cpus_attach, cpu_possible_mask);
-       else
-               guarantee_online_cpus(cs, cpus_attach);
-
         guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
  
         cgroup_taskset_for_each(task, css, tset) {
+               if (cs != &top_cpuset)
+                       guarantee_online_cpus(task, cpus_attach);
+               else
+                       cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
                 /*
                  * can_attach beforehand should guarantee that this doesn't
                  * fail.  TODO: have a better way to handle failure here
@@ -3302,9 +3313,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
         unsigned long flags;
  
         spin_lock_irqsave(&callback_lock, flags);
-       rcu_read_lock();
-       guarantee_online_cpus(task_cs(tsk), pmask);
-       rcu_read_unlock();
+       guarantee_online_cpus(tsk, pmask);
         spin_unlock_irqrestore(&callback_lock, flags);
  }
  
@@ -3318,13 +3327,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
   * which will not contain a sane cpumask during cases such as cpu hotplugging.
   * This is the absolute last resort for the scheduler and it is only used if
   * _every_ other avenue has been traveled.
+ *
+ * Returns true if the affinity of @tsk was changed, false otherwise.
   **/
  
-void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
  {
+       const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+       const struct cpumask *cs_mask;
+       bool changed = false;
+
         rcu_read_lock();
-       do_set_cpus_allowed(tsk, is_in_v2_mode() ?
-               task_cs(tsk)->cpus_allowed : cpu_possible_mask);
+       cs_mask = task_cs(tsk)->cpus_allowed;
+       if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
+               do_set_cpus_allowed(tsk, cs_mask);
+               changed = true;
+       }
         rcu_read_unlock();
  
         /*
@@ -3344,6 +3362,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
          * select_fallback_rq() will fix things ups and set cpu_possible_mask
          * if required.
          */
+       return changed;
  }
  
  void __init cpuset_init_current_mems_allowed(void)
diff --git a/kernel/fork.c b/kernel/fork.c

index 44f4c2d..757301c 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -446,6 +446,7 @@ void put_task_stack(struct task_struct *tsk)
  
  void free_task(struct task_struct *tsk)
  {
+       release_user_cpus_ptr(tsk);
         scs_release(tsk);
  
  #ifndef CONFIG_THREAD_INFO_IN_TASK
@@ -924,6 +925,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
  #endif
         if (orig->cpus_ptr == &orig->cpus_mask)
                 tsk->cpus_ptr = &tsk->cpus_mask;
+       dup_user_cpus_ptr(tsk, orig, node);
  
         /*
          * One for the user space visible state that goes away when reaped.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 2b9ed11..37bec9b 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1007,6 +1007,7 @@ int get_nohz_timer_target(void)
  {
         int i, cpu = smp_processor_id(), default_cpu = -1;
         struct sched_domain *sd;
+       const struct cpumask *hk_mask;
  
         if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
                 if (!idle_cpu(cpu))
@@ -1014,10 +1015,11 @@ int get_nohz_timer_target(void)
                 default_cpu = cpu;
         }
  
+       hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
+
         rcu_read_lock();
         for_each_domain(cpu, sd) {
-               for_each_cpu_and(i, sched_domain_span(sd),
-                       housekeeping_cpumask(HK_FLAG_TIMER)) {
+               for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
                         if (cpu == i)
                                 continue;
  
@@ -1633,6 +1635,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
                 uclamp_rq_dec_id(rq, p, clamp_id);
  }
  
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+                                     enum uclamp_id clamp_id)
+{
+       if (!p->uclamp[clamp_id].active)
+               return;
+
+       uclamp_rq_dec_id(rq, p, clamp_id);
+       uclamp_rq_inc_id(rq, p, clamp_id);
+
+       /*
+        * Make sure to clear the idle flag if we've transiently reached 0
+        * active tasks on rq.
+        */
+       if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+               rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
  static inline void
  uclamp_update_active(struct task_struct *p)
  {
@@ -1656,12 +1675,8 @@ uclamp_update_active(struct task_struct *p)
          * affecting a valid clamp bucket, the next time it's enqueued,
          * it will already see the updated clamp bucket value.
          */
-       for_each_clamp_id(clamp_id) {
-               if (p->uclamp[clamp_id].active) {
-                       uclamp_rq_dec_id(rq, p, clamp_id);
-                       uclamp_rq_inc_id(rq, p, clamp_id);
-               }
-       }
+       for_each_clamp_id(clamp_id)
+               uclamp_rq_reinc_id(rq, p, clamp_id);
  
         task_rq_unlock(rq, p, &rf);
  }
@@ -2175,7 +2190,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
  
         /* Non kernel threads are not allowed during either online or offline. */
         if (!(p->flags & PF_KTHREAD))
-               return cpu_active(cpu);
+               return cpu_active(cpu) && task_cpu_possible(cpu, p);
  
         /* KTHREAD_IS_PER_CPU is always allowed. */
         if (kthread_is_per_cpu(p))
@@ -2482,6 +2497,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
         __do_set_cpus_allowed(p, new_mask, 0);
  }
  
+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+                     int node)
+{
+       if (!src->user_cpus_ptr)
+               return 0;
+
+       dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+       if (!dst->user_cpus_ptr)
+               return -ENOMEM;
+
+       cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+       return 0;
+}
+
+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+{
+       struct cpumask *user_mask = NULL;
+
+       swap(p->user_cpus_ptr, user_mask);
+
+       return user_mask;
+}
+
+void release_user_cpus_ptr(struct task_struct *p)
+{
+       kfree(clear_user_cpus_ptr(p));
+}
+
  /*
   * This function is wildly self concurrent; here be dragons.
   *
@@ -2699,28 +2742,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
  }
  
  /*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
   */
-static int __set_cpus_allowed_ptr(struct task_struct *p,
-                                 const struct cpumask *new_mask,
-                                 u32 flags)
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+                                        const struct cpumask *new_mask,
+                                        u32 flags,
+                                        struct rq *rq,
+                                        struct rq_flags *rf)
+       __releases(rq->lock)
+       __releases(p->pi_lock)
  {
+       const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
         const struct cpumask *cpu_valid_mask = cpu_active_mask;
+       bool kthread = p->flags & PF_KTHREAD;
+       struct cpumask *user_mask = NULL;
         unsigned int dest_cpu;
-       struct rq_flags rf;
-       struct rq *rq;
         int ret = 0;
  
-       rq = task_rq_lock(p, &rf);
         update_rq_clock(rq);
  
-       if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+       if (kthread || is_migration_disabled(p)) {
                 /*
                  * Kernel threads are allowed on online && !active CPUs,
                  * however, during cpu-hot-unplug, even these might get pushed
@@ -2734,6 +2775,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
                 cpu_valid_mask = cpu_online_mask;
         }
  
+       if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+               ret = -EINVAL;
+               goto out;
+       }
+
         /*
          * Must re-check here, to close a race against __kthread_bind(),
          * sched_setaffinity() is not guaranteed to observe the flag.
@@ -2768,20 +2814,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
  
         __do_set_cpus_allowed(p, new_mask, flags);
  
-       return affine_move_task(rq, p, &rf, dest_cpu, flags);
+       if (flags & SCA_USER)
+               user_mask = clear_user_cpus_ptr(p);
+
+       ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+
+       kfree(user_mask);
+
+       return ret;
  
  out:
-       task_rq_unlock(rq, p, &rf);
+       task_rq_unlock(rq, p, rf);
  
         return ret;
  }
  
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+                                 const struct cpumask *new_mask, u32 flags)
+{
+       struct rq_flags rf;
+       struct rq *rq;
+
+       rq = task_rq_lock(p, &rf);
+       return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+}
+
  int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  {
         return __set_cpus_allowed_ptr(p, new_mask, 0);
  }
  EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  
+/*
+ * Change a given task's CPU affinity to the intersection of its current
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask
+ * and pointing @p->user_cpus_ptr to a copy of the old mask.
+ * If the resulting mask is empty, leave the affinity unchanged and return
+ * -EINVAL.
+ */
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
+                                    struct cpumask *new_mask,
+                                    const struct cpumask *subset_mask)
+{
+       struct cpumask *user_mask = NULL;
+       struct rq_flags rf;
+       struct rq *rq;
+       int err;
+
+       if (!p->user_cpus_ptr) {
+               user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+               if (!user_mask)
+                       return -ENOMEM;
+       }
+
+       rq = task_rq_lock(p, &rf);
+
+       /*
+        * Forcefully restricting the affinity of a deadline task is
+        * likely to cause problems, so fail and noisily override the
+        * mask entirely.
+        */
+       if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+               err = -EPERM;
+               goto err_unlock;
+       }
+
+       if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+               err = -EINVAL;
+               goto err_unlock;
+       }
+
+       /*
+        * We're about to butcher the task affinity, so keep track of what
+        * the user asked for in case we're able to restore it later on.
+        */
+       if (user_mask) {
+               cpumask_copy(user_mask, p->cpus_ptr);
+               p->user_cpus_ptr = user_mask;
+       }
+
+       return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+
+err_unlock:
+       task_rq_unlock(rq, p, &rf);
+       kfree(user_mask);
+       return err;
+}
+
+/*
+ * Restrict the CPU affinity of task @p so that it is a subset of
+ * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
+ * old affinity mask. If the resulting mask is empty, we warn and walk
+ * up the cpuset hierarchy until we find a suitable mask.
+ */
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+       cpumask_var_t new_mask;
+       const struct cpumask *override_mask = task_cpu_possible_mask(p);
+
+       alloc_cpumask_var(&new_mask, GFP_KERNEL);
+
+       /*
+        * __migrate_task() can fail silently in the face of concurrent
+        * offlining of the chosen destination CPU, so take the hotplug
+        * lock to ensure that the migration succeeds.
+        */
+       cpus_read_lock();
+       if (!cpumask_available(new_mask))
+               goto out_set_mask;
+
+       if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+               goto out_free_mask;
+
+       /*
+        * We failed to find a valid subset of the affinity mask for the
+        * task, so override it based on its cpuset hierarchy.
+        */
+       cpuset_cpus_allowed(p, new_mask);
+       override_mask = new_mask;
+
+out_set_mask:
+       if (printk_ratelimit()) {
+               printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+                               task_pid_nr(p), p->comm,
+                               cpumask_pr_args(override_mask));
+       }
+
+       WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+out_free_mask:
+       cpus_read_unlock();
+       free_cpumask_var(new_mask);
+}
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+
+/*
+ * Restore the affinity of a task @p which was previously restricted by a
+ * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
+ * @p->user_cpus_ptr.
+ *
+ * It is the caller's responsibility to serialise this with any calls to
+ * force_compatible_cpus_allowed_ptr(@p).
+ */
+void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+       struct cpumask *user_mask = p->user_cpus_ptr;
+       unsigned long flags;
+
+       /*
+        * Try to restore the old affinity mask. If this fails, then
+        * we free the mask explicitly to avoid it being inherited across
+        * a subsequent fork().
+        */
+       if (!user_mask || !__sched_setaffinity(p, user_mask))
+               return;
+
+       raw_spin_lock_irqsave(&p->pi_lock, flags);
+       user_mask = clear_user_cpus_ptr(p);
+       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+       kfree(user_mask);
+}
+
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
  #ifdef CONFIG_SCHED_DEBUG
@@ -3126,9 +3330,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
  
                 /* Look for allowed, online CPU in same node. */
                 for_each_cpu(dest_cpu, nodemask) {
-                       if (!cpu_active(dest_cpu))
-                               continue;
-                       if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+                       if (is_cpu_allowed(p, dest_cpu))
                                 return dest_cpu;
                 }
         }
@@ -3145,8 +3347,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                 /* No more Mr. Nice Guy. */
                 switch (state) {
                 case cpuset:
-                       if (IS_ENABLED(CONFIG_CPUSETS)) {
-                               cpuset_cpus_allowed_fallback(p);
+                       if (cpuset_cpus_allowed_fallback(p)) {
                                 state = possible;
                                 break;
                         }
@@ -3158,10 +3359,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
                          *
                          * More yuck to audit.
                          */
-                       do_set_cpus_allowed(p, cpu_possible_mask);
+                       do_set_cpus_allowed(p, task_cpu_possible_mask(p));
                         state = fail;
                         break;
-
                 case fail:
                         BUG();
                         break;
@@ -5674,11 +5874,9 @@ static bool try_steal_cookie(int this, int that)
                 if (p->core_occupation > dst->idle->core_occupation)
                         goto next;
  
-               p->on_rq = TASK_ON_RQ_MIGRATING;
                 deactivate_task(src, p, 0);
                 set_task_cpu(p, this);
                 activate_task(dst, p, 0);
-               p->on_rq = TASK_ON_RQ_QUEUED;
  
                 resched_curr(dst);
  
@@ -7388,6 +7586,16 @@ err_size:
         return -E2BIG;
  }
  
+static void get_params(struct task_struct *p, struct sched_attr *attr)
+{
+       if (task_has_dl_policy(p))
+               __getparam_dl(p, attr);
+       else if (task_has_rt_policy(p))
+               attr->sched_priority = p->rt_priority;
+       else
+               attr->sched_nice = task_nice(p);
+}
+
  /**
   * sys_sched_setscheduler - set/change the scheduler policy and RT priority
   * @pid: the pid in question.
@@ -7449,6 +7657,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
         rcu_read_unlock();
  
         if (likely(p)) {
+               if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+                       get_params(p, &attr);
                 retval = sched_setattr(p, &attr);
                 put_task_struct(p);
         }
@@ -7597,12 +7807,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
         kattr.sched_policy = p->policy;
         if (p->sched_reset_on_fork)
                 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
-       if (task_has_dl_policy(p))
-               __getparam_dl(p, &kattr);
-       else if (task_has_rt_policy(p))
-               kattr.sched_priority = p->rt_priority;
-       else
-               kattr.sched_nice = task_nice(p);
+       get_params(p, &kattr);
+       kattr.sched_flags &= SCHED_FLAG_ALL;
  
  #ifdef CONFIG_UCLAMP_TASK
         /*
@@ -7623,9 +7829,76 @@ out_unlock:
         return retval;
  }
  
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+#ifdef CONFIG_SMP
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
  {
+       int ret = 0;
+
+       /*
+        * If the task isn't a deadline task or admission control is
+        * disabled then we don't care about affinity changes.
+        */
+       if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+               return 0;
+
+       /*
+        * Since bandwidth control happens on root_domain basis,
+        * if admission test is enabled, we only admit -deadline
+        * tasks allowed to run on all the CPUs in the task's
+        * root_domain.
+        */
+       rcu_read_lock();
+       if (!cpumask_subset(task_rq(p)->rd->span, mask))
+               ret = -EBUSY;
+       rcu_read_unlock();
+       return ret;
+}
+#endif
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+{
+       int retval;
         cpumask_var_t cpus_allowed, new_mask;
+
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+               return -ENOMEM;
+
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
+
+       cpuset_cpus_allowed(p, cpus_allowed);
+       cpumask_and(new_mask, mask, cpus_allowed);
+
+       retval = dl_task_check_affinity(p, new_mask);
+       if (retval)
+               goto out_free_new_mask;
+again:
+       retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+       if (retval)
+               goto out_free_new_mask;
+
+       cpuset_cpus_allowed(p, cpus_allowed);
+       if (!cpumask_subset(new_mask, cpus_allowed)) {
+               /*
+                * We must have raced with a concurrent cpuset update.
+                * Just reset the cpumask to the cpuset's cpus_allowed.
+                */
+               cpumask_copy(new_mask, cpus_allowed);
+               goto again;
+       }
+
+out_free_new_mask:
+       free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+       free_cpumask_var(cpus_allowed);
+       return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
         struct task_struct *p;
         int retval;
  
@@ -7645,68 +7918,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
                 retval = -EINVAL;
                 goto out_put_task;
         }
-       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
-               retval = -ENOMEM;
-               goto out_put_task;
-       }
-       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
-               retval = -ENOMEM;
-               goto out_free_cpus_allowed;
-       }
-       retval = -EPERM;
+
         if (!check_same_owner(p)) {
                 rcu_read_lock();
                 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
                         rcu_read_unlock();
-                       goto out_free_new_mask;
+                       retval = -EPERM;
+                       goto out_put_task;
                 }
                 rcu_read_unlock();
         }
  
         retval = security_task_setscheduler(p);
         if (retval)
-               goto out_free_new_mask;
-
-
-       cpuset_cpus_allowed(p, cpus_allowed);
-       cpumask_and(new_mask, in_mask, cpus_allowed);
-
-       /*
-        * Since bandwidth control happens on root_domain basis,
-        * if admission test is enabled, we only admit -deadline
-        * tasks allowed to run on all the CPUs in the task's
-        * root_domain.
-        */
-#ifdef CONFIG_SMP
-       if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
-               rcu_read_lock();
-               if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
-                       retval = -EBUSY;
-                       rcu_read_unlock();
-                       goto out_free_new_mask;
-               }
-               rcu_read_unlock();
-       }
-#endif
-again:
-       retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+               goto out_put_task;
  
-       if (!retval) {
-               cpuset_cpus_allowed(p, cpus_allowed);
-               if (!cpumask_subset(new_mask, cpus_allowed)) {
-                       /*
-                        * We must have raced with a concurrent cpuset
-                        * update. Just reset the cpus_allowed to the
-                        * cpuset's cpus_allowed
-                        */
-                       cpumask_copy(new_mask, cpus_allowed);
-                       goto again;
-               }
-       }
-out_free_new_mask:
-       free_cpumask_var(new_mask);
-out_free_cpus_allowed:
-       free_cpumask_var(cpus_allowed);
+       retval = __sched_setaffinity(p, in_mask);
  out_put_task:
         put_task_struct(p);
         return retval;
@@ -9906,7 +10133,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
          * Prevent race between setting of cfs_rq->runtime_enabled and
          * unthrottle_offline_cfs_rqs().
          */
-       get_online_cpus();
+       cpus_read_lock();
         mutex_lock(&cfs_constraints_mutex);
         ret = __cfs_schedulable(tg, period, quota);
         if (ret)
@@ -9950,7 +10177,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
                 cfs_bandwidth_usage_dec();
  out_unlock:
         mutex_unlock(&cfs_constraints_mutex);
-       put_online_cpus();
+       cpus_read_unlock();
  
         return ret;
  }
@@ -10201,6 +10428,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
  }
  #endif /* CONFIG_RT_GROUP_SCHED */
  
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+                              struct cftype *cft)
+{
+       return css_tg(css)->idle;
+}
+
+static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+                               struct cftype *cft, s64 idle)
+{
+       return sched_group_set_idle(css_tg(css), idle);
+}
+#endif
+
  static struct cftype cpu_legacy_files[] = {
  #ifdef CONFIG_FAIR_GROUP_SCHED
         {
@@ -10208,6 +10449,11 @@ static struct cftype cpu_legacy_files[] = {
                 .read_u64 = cpu_shares_read_u64,
                 .write_u64 = cpu_shares_write_u64,
         },
+       {
+               .name = "idle",
+               .read_s64 = cpu_idle_read_s64,
+               .write_s64 = cpu_idle_write_s64,
+       },
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
         {
@@ -10415,6 +10661,12 @@ static struct cftype cpu_files[] = {
                 .read_s64 = cpu_weight_nice_read_s64,
                 .write_s64 = cpu_weight_nice_write_s64,
         },
+       {
+               .name = "idle",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_s64 = cpu_idle_read_s64,
+               .write_s64 = cpu_idle_write_s64,
+       },
  #endif
  #ifdef CONFIG_CFS_BANDWIDTH
         {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index aaacd6c..e943146 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1733,6 +1733,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
          */
         raw_spin_rq_lock(rq);
         if (p->dl.dl_non_contending) {
+               update_rq_clock(rq);
                 sub_running_bw(&p->dl, &rq->dl);
                 p->dl.dl_non_contending = 0;
                 /*
@@ -2741,7 +2742,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
         dl_se->dl_runtime = attr->sched_runtime;
         dl_se->dl_deadline = attr->sched_deadline;
         dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
-       dl_se->flags = attr->sched_flags;
+       dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
         dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
         dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
  }
@@ -2754,7 +2755,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
         attr->sched_runtime = dl_se->dl_runtime;
         attr->sched_deadline = dl_se->dl_deadline;
         attr->sched_period = dl_se->dl_period;
-       attr->sched_flags = dl_se->flags;
+       attr->sched_flags &= ~SCHED_DL_FLAGS;
+       attr->sched_flags |= dl_se->flags;
  }
  
  /*
@@ -2851,7 +2853,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
         if (dl_se->dl_runtime != attr->sched_runtime ||
             dl_se->dl_deadline != attr->sched_deadline ||
             dl_se->dl_period != attr->sched_period ||
-           dl_se->flags != attr->sched_flags)
+           dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
                 return true;
  
         return false;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 0c5ec27..4971622 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -388,6 +388,13 @@ void update_sched_domain_debugfs(void)
  {
         int cpu, i;
  
+       /*
+        * This can unfortunately be invoked before sched_debug_init() creates
+        * the debug directory. Don't touch sd_sysctl_cpus until then.
+        */
+       if (!debugfs_sched)
+               return;
+
         if (!cpumask_available(sd_sysctl_cpus)) {
                 if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
                         return;
@@ -600,6 +607,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
         SEQ_printf(m, "  .%-30s: %d\n", "nr_spread_over",
                         cfs_rq->nr_spread_over);
         SEQ_printf(m, "  .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+       SEQ_printf(m, "  .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+       SEQ_printf(m, "  .%-30s: %d\n", "idle_h_nr_running",
+                       cfs_rq->idle_h_nr_running);
         SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
  #ifdef CONFIG_SMP
         SEQ_printf(m, "  .%-30s: %lu\n", "load_avg",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 44c4520..ff69f24 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,6 +431,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
         }
  }
  
+static int tg_is_idle(struct task_group *tg)
+{
+       return tg->idle > 0;
+}
+
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+{
+       return cfs_rq->idle > 0;
+}
+
+static int se_is_idle(struct sched_entity *se)
+{
+       if (entity_is_task(se))
+               return task_has_idle_policy(task_of(se));
+       return cfs_rq_is_idle(group_cfs_rq(se));
+}
+
  #else  /* !CONFIG_FAIR_GROUP_SCHED */
  
  #define for_each_sched_entity(se) \
@@ -468,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  {
  }
  
+static inline int tg_is_idle(struct task_group *tg)
+{
+       return 0;
+}
+
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+{
+       return 0;
+}
+
+static int se_is_idle(struct sched_entity *se)
+{
+       return 0;
+}
+
  #endif /* CONFIG_FAIR_GROUP_SCHED */
  
  static __always_inline
@@ -1486,7 +1518,7 @@ static inline bool is_core_idle(int cpu)
                 if (cpu == sibling)
                         continue;
  
-               if (!idle_cpu(cpu))
+               if (!idle_cpu(sibling))
                         return false;
         }
  #endif
@@ -4841,6 +4873,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
  
                 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
  
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
+
                 qcfs_rq->h_nr_running -= task_delta;
                 qcfs_rq->idle_h_nr_running -= idle_task_delta;
  
@@ -4860,6 +4895,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
                 update_load_avg(qcfs_rq, se, 0);
                 se_update_runnable(se);
  
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
+
                 qcfs_rq->h_nr_running -= task_delta;
                 qcfs_rq->idle_h_nr_running -= idle_task_delta;
         }
@@ -4904,39 +4942,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
         task_delta = cfs_rq->h_nr_running;
         idle_task_delta = cfs_rq->idle_h_nr_running;
         for_each_sched_entity(se) {
+               struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+
                 if (se->on_rq)
                         break;
-               cfs_rq = cfs_rq_of(se);
-               enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+               enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
+
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
  
-               cfs_rq->h_nr_running += task_delta;
-               cfs_rq->idle_h_nr_running += idle_task_delta;
+               qcfs_rq->h_nr_running += task_delta;
+               qcfs_rq->idle_h_nr_running += idle_task_delta;
  
                 /* end evaluation on encountering a throttled cfs_rq */
-               if (cfs_rq_throttled(cfs_rq))
+               if (cfs_rq_throttled(qcfs_rq))
                         goto unthrottle_throttle;
         }
  
         for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
+               struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  
-               update_load_avg(cfs_rq, se, UPDATE_TG);
+               update_load_avg(qcfs_rq, se, UPDATE_TG);
                 se_update_runnable(se);
  
-               cfs_rq->h_nr_running += task_delta;
-               cfs_rq->idle_h_nr_running += idle_task_delta;
+               if (cfs_rq_is_idle(group_cfs_rq(se)))
+                       idle_task_delta = cfs_rq->h_nr_running;
  
+               qcfs_rq->h_nr_running += task_delta;
+               qcfs_rq->idle_h_nr_running += idle_task_delta;
  
                 /* end evaluation on encountering a throttled cfs_rq */
-               if (cfs_rq_throttled(cfs_rq))
+               if (cfs_rq_throttled(qcfs_rq))
                         goto unthrottle_throttle;
  
                 /*
                  * One parent has been throttled and cfs_rq removed from the
                  * list. Add it back to not break the leaf list.
                  */
-               if (throttled_hierarchy(cfs_rq))
-                       list_add_leaf_cfs_rq(cfs_rq);
+               if (throttled_hierarchy(qcfs_rq))
+                       list_add_leaf_cfs_rq(qcfs_rq);
         }
  
         /* At this point se is NULL and we are at root level*/
@@ -4949,9 +4993,9 @@ unthrottle_throttle:
          * assertion below.
          */
         for_each_sched_entity(se) {
-               cfs_rq = cfs_rq_of(se);
+               struct cfs_rq *qcfs_rq = cfs_rq_of(se);
  
-               if (list_add_leaf_cfs_rq(cfs_rq))
+               if (list_add_leaf_cfs_rq(qcfs_rq))
                         break;
         }
  
@@ -5574,6 +5618,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 cfs_rq->h_nr_running++;
                 cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
+
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
                         goto enqueue_throttle;
@@ -5591,6 +5638,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 cfs_rq->h_nr_running++;
                 cfs_rq->idle_h_nr_running += idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
+
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
                         goto enqueue_throttle;
@@ -5668,6 +5718,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 cfs_rq->h_nr_running--;
                 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
+
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
                         goto dequeue_throttle;
@@ -5697,6 +5750,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 cfs_rq->h_nr_running--;
                 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
  
+               if (cfs_rq_is_idle(cfs_rq))
+                       idle_h_nr_running = 1;
+
                 /* end evaluation on encountering a throttled cfs_rq */
                 if (cfs_rq_throttled(cfs_rq))
                         goto dequeue_throttle;
@@ -6249,7 +6305,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
                 time = cpu_clock(this);
         }
  
-       for_each_cpu_wrap(cpu, cpus, target) {
+       for_each_cpu_wrap(cpu, cpus, target + 1) {
                 if (has_idle_core) {
                         i = select_idle_core(p, cpu, cpus, &idle_cpu);
                         if ((unsigned int)i < nr_cpumask_bits)
@@ -6376,6 +6432,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
  
         /* Check a recently used CPU as a potential idle candidate: */
         recent_used_cpu = p->recent_used_cpu;
+       p->recent_used_cpu = prev;
         if (recent_used_cpu != prev &&
             recent_used_cpu != target &&
             cpus_share_cache(recent_used_cpu, target) &&
@@ -6902,9 +6959,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
         } else if (wake_flags & WF_TTWU) { /* XXX always ? */
                 /* Fast path */
                 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-
-               if (want_affine)
-                       current->recent_used_cpu = cpu;
         }
         rcu_read_unlock();
  
@@ -7041,24 +7095,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
  
  static void set_last_buddy(struct sched_entity *se)
  {
-       if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
-               return;
-
         for_each_sched_entity(se) {
                 if (SCHED_WARN_ON(!se->on_rq))
                         return;
+               if (se_is_idle(se))
+                       return;
                 cfs_rq_of(se)->last = se;
         }
  }
  
  static void set_next_buddy(struct sched_entity *se)
  {
-       if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
-               return;
-
         for_each_sched_entity(se) {
                 if (SCHED_WARN_ON(!se->on_rq))
                         return;
+               if (se_is_idle(se))
+                       return;
                 cfs_rq_of(se)->next = se;
         }
  }
@@ -7079,6 +7131,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
         int scale = cfs_rq->nr_running >= sched_nr_latency;
         int next_buddy_marked = 0;
+       int cse_is_idle, pse_is_idle;
  
         if (unlikely(se == pse))
                 return;
@@ -7123,8 +7176,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
                 return;
  
         find_matching_se(&se, &pse);
-       update_curr(cfs_rq_of(se));
         BUG_ON(!pse);
+
+       cse_is_idle = se_is_idle(se);
+       pse_is_idle = se_is_idle(pse);
+
+       /*
+        * Preempt an idle group in favor of a non-idle group (and don't preempt
+        * in the inverse case).
+        */
+       if (cse_is_idle && !pse_is_idle)
+               goto preempt;
+       if (cse_is_idle != pse_is_idle)
+               return;
+
+       update_curr(cfs_rq_of(se));
         if (wakeup_preempt_entity(se, pse) == 1) {
                 /*
                  * Bias pick_next to pick the sched entity that is
@@ -10217,9 +10283,11 @@ static inline int on_null_domain(struct rq *rq)
  static inline int find_new_ilb(void)
  {
         int ilb;
+       const struct cpumask *hk_mask;
+
+       hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
  
-       for_each_cpu_and(ilb, nohz.idle_cpus_mask,
-                             housekeeping_cpumask(HK_FLAG_MISC)) {
+       for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
  
                 if (ilb == smp_processor_id())
                         continue;
@@ -11416,10 +11484,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
  
  static DEFINE_MUTEX(shares_mutex);
  
-int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
  {
         int i;
  
+       lockdep_assert_held(&shares_mutex);
+
         /*
          * We can't change the weight of the root cgroup.
          */
@@ -11428,9 +11498,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
  
         shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
  
-       mutex_lock(&shares_mutex);
         if (tg->shares == shares)
-               goto done;
+               return 0;
  
         tg->shares = shares;
         for_each_possible_cpu(i) {
@@ -11448,10 +11517,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
                 rq_unlock_irqrestore(rq, &rf);
         }
  
-done:
+       return 0;
+}
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+       int ret;
+
+       mutex_lock(&shares_mutex);
+       if (tg_is_idle(tg))
+               ret = -EINVAL;
+       else
+               ret = __sched_group_set_shares(tg, shares);
+       mutex_unlock(&shares_mutex);
+
+       return ret;
+}
+
+int sched_group_set_idle(struct task_group *tg, long idle)
+{
+       int i;
+
+       if (tg == &root_task_group)
+               return -EINVAL;
+
+       if (idle < 0 || idle > 1)
+               return -EINVAL;
+
+       mutex_lock(&shares_mutex);
+
+       if (tg->idle == idle) {
+               mutex_unlock(&shares_mutex);
+               return 0;
+       }
+
+       tg->idle = idle;
+
+       for_each_possible_cpu(i) {
+               struct rq *rq = cpu_rq(i);
+               struct sched_entity *se = tg->se[i];
+               struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+               bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
+               long idle_task_delta;
+               struct rq_flags rf;
+
+               rq_lock_irqsave(rq, &rf);
+
+               grp_cfs_rq->idle = idle;
+               if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
+                       goto next_cpu;
+
+               idle_task_delta = grp_cfs_rq->h_nr_running -
+                                 grp_cfs_rq->idle_h_nr_running;
+               if (!cfs_rq_is_idle(grp_cfs_rq))
+                       idle_task_delta *= -1;
+
+               for_each_sched_entity(se) {
+                       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+                       if (!se->on_rq)
+                               break;
+
+                       cfs_rq->idle_h_nr_running += idle_task_delta;
+
+                       /* Already accounted at parent level and above. */
+                       if (cfs_rq_is_idle(cfs_rq))
+                               break;
+               }
+
+next_cpu:
+               rq_unlock_irqrestore(rq, &rf);
+       }
+
+       /* Idle groups have minimum weight. */
+       if (tg_is_idle(tg))
+               __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
+       else
+               __sched_group_set_shares(tg, NICE_0_LOAD);
+
         mutex_unlock(&shares_mutex);
         return 0;
  }
+
  #else /* CONFIG_FAIR_GROUP_SCHED */
  
  void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index ddefb04..3d3e579 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,6 +227,8 @@ static inline void update_avg(u64 *avg, u64 sample)
   */
  #define SCHED_FLAG_SUGOV       0x10000000
  
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
  static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
  {
  #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
@@ -394,6 +396,9 @@ struct task_group {
         struct cfs_rq           **cfs_rq;
         unsigned long           shares;
  
+       /* A positive value indicates that this is a SCHED_IDLE group. */
+       int                     idle;
+
  #ifdef CONFIG_SMP
         /*
          * load_avg can be heavily contended at clock tick time, so put
@@ -503,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk);
  #ifdef CONFIG_FAIR_GROUP_SCHED
  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
  
+extern int sched_group_set_idle(struct task_group *tg, long idle);
+
  #ifdef CONFIG_SMP
  extern void set_task_rq_fair(struct sched_entity *se,
                              struct cfs_rq *prev, struct cfs_rq *next);
@@ -599,6 +606,9 @@ struct cfs_rq {
         struct list_head        leaf_cfs_rq_list;
         struct task_group       *tg;    /* group that "owns" this runqueue */
  
+       /* Locally cached copy of our task_group's idle value */
+       int                     idle;
+
  #ifdef CONFIG_CFS_BANDWIDTH
         int                     runtime_enabled;
         s64                     runtime_remaining;
@@ -2234,6 +2244,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq);
  #define SCA_CHECK              0x01
  #define SCA_MIGRATE_DISABLE    0x02
  #define SCA_MIGRATE_ENABLE     0x04
+#define SCA_USER               0x08
  
  #ifdef CONFIG_SMP
  
@@ -2388,6 +2399,21 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
  extern const_debug unsigned int sysctl_sched_nr_migrate;
  extern const_debug unsigned int sysctl_sched_migration_cost;
  
+#ifdef CONFIG_SCHED_DEBUG
+extern unsigned int sysctl_sched_latency;
+extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_wakeup_granularity;
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
+
+extern unsigned int sysctl_sched_tunable_scaling;
+
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_size;
+#endif
+
  #ifdef CONFIG_SCHED_HRTICK
  
  /*
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c

index b77ad49..4e8698e 100644 (file)
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1482,6 +1482,8 @@ int                               sched_max_numa_distance;
  static int                     *sched_domains_numa_distance;
  static struct cpumask          ***sched_domains_numa_masks;
  int __read_mostly              node_reclaim_distance = RECLAIM_DISTANCE;
+
+static unsigned long __read_mostly *sched_numa_onlined_nodes;
  #endif
  
  /*
@@ -1833,6 +1835,16 @@ void sched_init_numa(void)
                         sched_domains_numa_masks[i][j] = mask;
  
                         for_each_node(k) {
+                               /*
+                                * Distance information can be unreliable for
+                                * offline nodes, defer building the node
+                                * masks to its bringup.
+                                * This relies on all unique distance values
+                                * still being visible at init time.
+                                */
+                               if (!node_online(j))
+                                       continue;
+
                                 if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
                                         sched_numa_warn("Node-distance not symmetric");
  
@@ -1886,6 +1898,53 @@ void sched_init_numa(void)
         sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
  
         init_numa_topology_type();
+
+       sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
+       if (!sched_numa_onlined_nodes)
+               return;
+
+       bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
+       for_each_online_node(i)
+               bitmap_set(sched_numa_onlined_nodes, i, 1);
+}
+
+static void __sched_domains_numa_masks_set(unsigned int node)
+{
+       int i, j;
+
+       /*
+        * NUMA masks are not built for offline nodes in sched_init_numa().
+        * Thus, when a CPU of a never-onlined-before node gets plugged in,
+        * adding that new CPU to the right NUMA masks is not sufficient: the
+        * masks of that CPU's node must also be updated.
+        */
+       if (test_bit(node, sched_numa_onlined_nodes))
+               return;
+
+       bitmap_set(sched_numa_onlined_nodes, node, 1);
+
+       for (i = 0; i < sched_domains_numa_levels; i++) {
+               for (j = 0; j < nr_node_ids; j++) {
+                       if (!node_online(j) || node == j)
+                               continue;
+
+                       if (node_distance(j, node) > sched_domains_numa_distance[i])
+                               continue;
+
+                       /* Add remote nodes in our masks */
+                       cpumask_or(sched_domains_numa_masks[i][node],
+                                  sched_domains_numa_masks[i][node],
+                                  sched_domains_numa_masks[0][j]);
+               }
+       }
+
+       /*
+        * A new node has been brought up, potentially changing the topology
+        * classification.
+        *
+        * Note that this is racy vs any use of sched_numa_topology_type :/
+        */
+       init_numa_topology_type();
  }
  
  void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu)
         int node = cpu_to_node(cpu);
         int i, j;
  
+       __sched_domains_numa_masks_set(node);
+
         for (i = 0; i < sched_domains_numa_levels; i++) {
                 for (j = 0; j < nr_node_ids; j++) {
+                       if (!node_online(j))
+                               continue;
+
+                       /* Set ourselves in the remote node's masks */
                         if (node_distance(j, node) <= sched_domains_numa_distance[i])
                                 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
                 }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 30 Aug 2021 20:42:10 +0000 (13:42 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 30 Aug 2021 20:42:10 +0000 (13:42 -0700)
fs/aio.c		patch \| blob \| history
fs/eventfd.c		patch \| blob \| history
include/linux/cpuset.h		patch \| blob \| history
include/linux/eventfd.h		patch \| blob \| history
include/linux/mmu_context.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
include/linux/sched/sysctl.h		patch \| blob \| history
include/linux/wait.h		patch \| blob \| history
init/init_task.c		patch \| blob \| history
kernel/cgroup/cpuset.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/deadline.c		patch \| blob \| history
kernel/sched/debug.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history
kernel/sched/topology.c		patch \| blob \| history