Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>
Mon, 6 May 2019 21:31:50 +0000 (14:31 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Mon, 6 May 2019 21:31:50 +0000 (14:31 -0700)
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig

index fab0bf4..88a4fb3 100644 (file)
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -313,6 +313,10 @@ config ARCH_SUSPEND_POSSIBLE
                    (PPC_85xx && !PPC_E500MC) || PPC_86xx || PPC_PSERIES \
                    || 44x || 40x
  
+config ARCH_SUSPEND_NONZERO_CPU
+       def_bool y
+       depends on PPC_POWERNV || PPC_PSERIES
+
  config PPC_DCR_NATIVE
         bool
  
diff --git a/include/linux/cpu.h b/include/linux/cpu.h

index 2d9c6f4..2d62f01 100644 (file)
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -137,9 +137,26 @@ static inline int disable_nonboot_cpus(void)
         return freeze_secondary_cpus(0);
  }
  extern void enable_nonboot_cpus(void);
+
+static inline int suspend_disable_secondary_cpus(void)
+{
+       int cpu = 0;
+
+       if (IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU))
+               cpu = -1;
+
+       return freeze_secondary_cpus(cpu);
+}
+static inline void suspend_enable_secondary_cpus(void)
+{
+       return enable_nonboot_cpus();
+}
+
  #else /* !CONFIG_PM_SLEEP_SMP */
  static inline int disable_nonboot_cpus(void) { return 0; }
  static inline void enable_nonboot_cpus(void) {}
+static inline int suspend_disable_secondary_cpus(void) { return 0; }
+static inline void suspend_enable_secondary_cpus(void) { }
  #endif /* !CONFIG_PM_SLEEP_SMP */
  
  void cpu_startup_entry(enum cpuhp_state state);
diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h

index 90bfa32..563290f 100644 (file)
--- a/include/linux/rcuwait.h
+++ b/include/linux/rcuwait.h
@@ -18,7 +18,7 @@
   * awoken.
   */
  struct rcuwait {
-       struct task_struct *task;
+       struct task_struct __rcu *task;
  };
  
  #define __RCUWAIT_INITIALIZER(name)            \
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h

index 57c7ed3..cfc0a89 100644 (file)
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -76,8 +76,8 @@ struct sched_domain_shared {
  
  struct sched_domain {
         /* These fields must be setup */
-       struct sched_domain *parent;    /* top domain must be null terminated */
-       struct sched_domain *child;     /* bottom domain must be null terminated */
+       struct sched_domain __rcu *parent;      /* top domain must be null terminated */
+       struct sched_domain __rcu *child;       /* bottom domain must be null terminated */
         struct sched_group *groups;     /* the balancing groups of the domain */
         unsigned long min_interval;     /* Minimum balance interval ms */
         unsigned long max_interval;     /* Maximum balance interval ms */
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c

index 4834c42..6a1942e 100644 (file)
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -740,11 +740,10 @@ static inline int nr_cpusets(void)
   * Must be called with cpuset_mutex held.
   *
   * The three key local variables below are:
- *    q  - a linked-list queue of cpuset pointers, used to implement a
- *        top-down scan of all cpusets.  This scan loads a pointer
- *        to each cpuset marked is_sched_load_balance into the
- *        array 'csa'.  For our purposes, rebuilding the schedulers
- *        sched domains, we can ignore !is_sched_load_balance cpusets.
+ *    cp - cpuset pointer, used (together with pos_css) to perform a
+ *        top-down scan of all cpusets. For our purposes, rebuilding
+ *        the scheduler's sched domains, we can ignore cpusets that
+ *        are !is_sched_load_balance.
   *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
   *        that need to be load balanced, for convenient iterative
   *        access by the subsequent code that finds the best partition,
@@ -775,7 +774,7 @@ static inline int nr_cpusets(void)
  static int generate_sched_domains(cpumask_var_t **domains,
                         struct sched_domain_attr **attributes)
  {
-       struct cpuset *cp;      /* scans q */
+       struct cpuset *cp;      /* top-down scan of cpusets */
         struct cpuset **csa;    /* array of all cpuset ptrs */
         int csn;                /* how many cpuset ptrs in csa so far */
         int i, j, k;            /* indices for partition finding loops */
diff --git a/kernel/cpu.c b/kernel/cpu.c

index 43e741e..1e890f5 100644 (file)
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -9,6 +9,7 @@
  #include <linux/notifier.h>
  #include <linux/sched/signal.h>
  #include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
  #include <linux/sched/task.h>
  #include <linux/sched/smt.h>
  #include <linux/unistd.h>
@@ -1199,8 +1200,15 @@ int freeze_secondary_cpus(int primary)
         int cpu, error = 0;
  
         cpu_maps_update_begin();
-       if (!cpu_online(primary))
+       if (primary == -1) {
                 primary = cpumask_first(cpu_online_mask);
+               if (!housekeeping_cpu(primary, HK_FLAG_TIMER))
+                       primary = housekeeping_any_cpu(HK_FLAG_TIMER);
+       } else {
+               if (!cpu_online(primary))
+                       primary = cpumask_first(cpu_online_mask);
+       }
+
         /*
          * We take down all of the non-boot CPUs in one shot to avoid races
          * with the userspace trying to use the CPU hotplug at the same time
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c

index d714044..fd5c95f 100644 (file)
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1150,7 +1150,7 @@ int kernel_kexec(void)
                 error = dpm_suspend_end(PMSG_FREEZE);
                 if (error)
                         goto Resume_devices;
-               error = disable_nonboot_cpus();
+               error = suspend_disable_secondary_cpus();
                 if (error)
                         goto Enable_cpus;
                 local_irq_disable();
@@ -1183,7 +1183,7 @@ int kernel_kexec(void)
   Enable_irqs:
                 local_irq_enable();
   Enable_cpus:
-               enable_nonboot_cpus();
+               suspend_enable_secondary_cpus();
                 dpm_resume_start(PMSG_RESTORE);
   Resume_devices:
                 dpm_resume_end(PMSG_RESTORE);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig

index f8fe57d..9bbaaab 100644 (file)
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -114,6 +114,15 @@ config PM_SLEEP_SMP
         depends on PM_SLEEP
         select HOTPLUG_CPU
  
+config PM_SLEEP_SMP_NONZERO_CPU
+       def_bool y
+       depends on PM_SLEEP_SMP
+       depends on ARCH_SUSPEND_NONZERO_CPU
+       ---help---
+       If an arch can suspend (for suspend, hibernate, kexec, etc) on a
+       non-zero numbered CPU, it may define ARCH_SUSPEND_NONZERO_CPU. This
+       will allow nohz_full mask to include CPU0.
+
  config PM_AUTOSLEEP
         bool "Opportunistic sleep"
         depends on PM_SLEEP
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c

index abef759..cfc7a57 100644 (file)
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -281,7 +281,7 @@ static int create_image(int platform_mode)
         if (error || hibernation_test(TEST_PLATFORM))
                 goto Platform_finish;
  
-       error = disable_nonboot_cpus();
+       error = suspend_disable_secondary_cpus();
         if (error || hibernation_test(TEST_CPUS))
                 goto Enable_cpus;
  
@@ -323,7 +323,7 @@ static int create_image(int platform_mode)
         local_irq_enable();
  
   Enable_cpus:
-       enable_nonboot_cpus();
+       suspend_enable_secondary_cpus();
  
   Platform_finish:
         platform_finish(platform_mode);
@@ -417,7 +417,7 @@ int hibernation_snapshot(int platform_mode)
  
  int __weak hibernate_resume_nonboot_cpu_disable(void)
  {
-       return disable_nonboot_cpus();
+       return suspend_disable_secondary_cpus();
  }
  
  /**
@@ -486,7 +486,7 @@ static int resume_target_kernel(bool platform_mode)
         local_irq_enable();
  
   Enable_cpus:
-       enable_nonboot_cpus();
+       suspend_enable_secondary_cpus();
  
   Cleanup:
         platform_restore_cleanup(platform_mode);
@@ -564,7 +564,7 @@ int hibernation_platform_enter(void)
         if (error)
                 goto Platform_finish;
  
-       error = disable_nonboot_cpus();
+       error = suspend_disable_secondary_cpus();
         if (error)
                 goto Enable_cpus;
  
@@ -586,7 +586,7 @@ int hibernation_platform_enter(void)
         local_irq_enable();
  
   Enable_cpus:
-       enable_nonboot_cpus();
+       suspend_enable_secondary_cpus();
  
   Platform_finish:
         hibernation_ops->finish();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c

index 0bd595a..59b6def 100644 (file)
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -428,7 +428,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
         if (suspend_test(TEST_PLATFORM))
                 goto Platform_wake;
  
-       error = disable_nonboot_cpus();
+       error = suspend_disable_secondary_cpus();
         if (error || suspend_test(TEST_CPUS))
                 goto Enable_cpus;
  
@@ -458,7 +458,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
         BUG_ON(irqs_disabled());
  
   Enable_cpus:
-       enable_nonboot_cpus();
+       suspend_enable_secondary_cpus();
  
   Platform_wake:
         platform_resume_noirq(state);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index ade3f22..102dfcf 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -792,10 +792,14 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
                 rq->nr_uninterruptible--;
  
         enqueue_task(rq, p, flags);
+
+       p->on_rq = TASK_ON_RQ_QUEUED;
  }
  
  void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  {
+       p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
+
         if (task_contributes_to_load(p))
                 rq->nr_uninterruptible++;
  
@@ -920,7 +924,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
  }
  
  /*
- * Per-CPU kthreads are allowed to run on !actie && online CPUs, see
+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see
   * __set_cpus_allowed_ptr() and select_fallback_rq().
   */
  static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
@@ -1236,11 +1240,9 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
                 rq_pin_lock(src_rq, &srf);
                 rq_pin_lock(dst_rq, &drf);
  
-               p->on_rq = TASK_ON_RQ_MIGRATING;
                 deactivate_task(src_rq, p, 0);
                 set_task_cpu(p, cpu);
                 activate_task(dst_rq, p, 0);
-               p->on_rq = TASK_ON_RQ_QUEUED;
                 check_preempt_curr(dst_rq, p, 0);
  
                 rq_unpin_lock(dst_rq, &drf);
@@ -1680,16 +1682,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
                 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
  }
  
-static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
-{
-       activate_task(rq, p, en_flags);
-       p->on_rq = TASK_ON_RQ_QUEUED;
-
-       /* If a worker is waking up, notify the workqueue: */
-       if (p->flags & PF_WQ_WORKER)
-               wq_worker_waking_up(p, cpu_of(rq));
-}
-
  /*
   * Mark the task runnable and perform wakeup-preemption.
   */
@@ -1741,7 +1733,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
                 en_flags |= ENQUEUE_MIGRATED;
  #endif
  
-       ttwu_activate(rq, p, en_flags);
+       activate_task(rq, p, en_flags);
         ttwu_do_wakeup(rq, p, wake_flags, rf);
  }
  
@@ -2105,56 +2097,6 @@ out:
         return success;
  }
  
-/**
- * try_to_wake_up_local - try to wake up a local task with rq lock held
- * @p: the thread to be awakened
- * @rf: request-queue flags for pinning
- *
- * Put @p on the run-queue if it's not already there. The caller must
- * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task.
- */
-static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
-{
-       struct rq *rq = task_rq(p);
-
-       if (WARN_ON_ONCE(rq != this_rq()) ||
-           WARN_ON_ONCE(p == current))
-               return;
-
-       lockdep_assert_held(&rq->lock);
-
-       if (!raw_spin_trylock(&p->pi_lock)) {
-               /*
-                * This is OK, because current is on_cpu, which avoids it being
-                * picked for load-balance and preemption/IRQs are still
-                * disabled avoiding further scheduler activity on it and we've
-                * not yet picked a replacement task.
-                */
-               rq_unlock(rq, rf);
-               raw_spin_lock(&p->pi_lock);
-               rq_relock(rq, rf);
-       }
-
-       if (!(p->state & TASK_NORMAL))
-               goto out;
-
-       trace_sched_waking(p);
-
-       if (!task_on_rq_queued(p)) {
-               if (p->in_iowait) {
-                       delayacct_blkio_end(p);
-                       atomic_dec(&rq->nr_iowait);
-               }
-               ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
-       }
-
-       ttwu_do_wakeup(rq, p, 0, rf);
-       ttwu_stat(p, smp_processor_id(), 0);
-out:
-       raw_spin_unlock(&p->pi_lock);
-}
-
  /**
   * wake_up_process - Wake up a specific process
   * @p: The process to be woken up.
@@ -2466,7 +2408,6 @@ void wake_up_new_task(struct task_struct *p)
         post_init_entity_util_avg(p);
  
         activate_task(rq, p, ENQUEUE_NOCLOCK);
-       p->on_rq = TASK_ON_RQ_QUEUED;
         trace_sched_wakeup_new(p);
         check_preempt_curr(rq, p, WF_FORK);
  #ifdef CONFIG_SMP
@@ -3465,25 +3406,11 @@ static void __sched notrace __schedule(bool preempt)
                         prev->state = TASK_RUNNING;
                 } else {
                         deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
-                       prev->on_rq = 0;
  
                         if (prev->in_iowait) {
                                 atomic_inc(&rq->nr_iowait);
                                 delayacct_blkio_start();
                         }
-
-                       /*
-                        * If a worker went to sleep, notify and ask workqueue
-                        * whether it wants to wake up a task to maintain
-                        * concurrency.
-                        */
-                       if (prev->flags & PF_WQ_WORKER) {
-                               struct task_struct *to_wakeup;
-
-                               to_wakeup = wq_worker_sleeping(prev);
-                               if (to_wakeup)
-                                       try_to_wake_up_local(to_wakeup, &rf);
-                       }
                 }
                 switch_count = &prev->nvcsw;
         }
@@ -3543,6 +3470,20 @@ static inline void sched_submit_work(struct task_struct *tsk)
  {
         if (!tsk->state || tsk_is_pi_blocked(tsk))
                 return;
+
+       /*
+        * If a worker went to sleep, notify and ask workqueue whether
+        * it wants to wake up a task to maintain concurrency.
+        * As this function is called inside the schedule() context,
+        * we disable preemption to avoid it calling schedule() again
+        * in the possible wakeup of a kworker.
+        */
+       if (tsk->flags & PF_WQ_WORKER) {
+               preempt_disable();
+               wq_worker_sleeping(tsk);
+               preempt_enable_no_resched();
+       }
+
         /*
          * If we are going to sleep and we have plugged IO queued,
          * make sure to submit it to avoid deadlocks.
@@ -3551,6 +3492,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
                 blk_schedule_flush_plug(tsk);
  }
  
+static void sched_update_worker(struct task_struct *tsk)
+{
+       if (tsk->flags & PF_WQ_WORKER)
+               wq_worker_running(tsk);
+}
+
  asmlinkage __visible void __sched schedule(void)
  {
         struct task_struct *tsk = current;
@@ -3561,6 +3508,7 @@ asmlinkage __visible void __sched schedule(void)
                 __schedule(false);
                 sched_preempt_enable_no_resched();
         } while (need_resched());
+       sched_update_worker(tsk);
  }
  EXPORT_SYMBOL(schedule);
  
@@ -5917,7 +5865,7 @@ void __init sched_init_smp(void)
  
  static int __init migration_init(void)
  {
-       sched_rq_cpu_starting(smp_processor_id());
+       sched_cpu_starting(smp_processor_id());
         return 0;
  }
  early_initcall(migration_init);
@@ -6558,6 +6506,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
  static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
                                 struct cftype *cftype, u64 shareval)
  {
+       if (shareval > scale_load_down(ULONG_MAX))
+               shareval = MAX_SHARES;
         return sched_group_set_shares(css_tg(css), scale_load(shareval));
  }
  
@@ -6573,7 +6523,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
  static DEFINE_MUTEX(cfs_constraints_mutex);
  
  const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
-const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
+static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
  
  static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
  
@@ -6653,20 +6603,22 @@ out_unlock:
         return ret;
  }
  
-int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
+static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
  {
         u64 quota, period;
  
         period = ktime_to_ns(tg->cfs_bandwidth.period);
         if (cfs_quota_us < 0)
                 quota = RUNTIME_INF;
-       else
+       else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
                 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
+       else
+               return -EINVAL;
  
         return tg_set_cfs_bandwidth(tg, period, quota);
  }
  
-long tg_get_cfs_quota(struct task_group *tg)
+static long tg_get_cfs_quota(struct task_group *tg)
  {
         u64 quota_us;
  
@@ -6679,17 +6631,20 @@ long tg_get_cfs_quota(struct task_group *tg)
         return quota_us;
  }
  
-int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
+static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
  {
         u64 quota, period;
  
+       if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
+               return -EINVAL;
+
         period = (u64)cfs_period_us * NSEC_PER_USEC;
         quota = tg->cfs_bandwidth.quota;
  
         return tg_set_cfs_bandwidth(tg, period, quota);
  }
  
-long tg_get_cfs_period(struct task_group *tg)
+static long tg_get_cfs_period(struct task_group *tg)
  {
         u64 cfs_period_us;
  
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c

index 835671f..b5dcd1d 100644 (file)
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -7,7 +7,7 @@
   */
  #include "sched.h"
  
-DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DEFINE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
  
  /**
   * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 8039d62..678bfb9 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -702,7 +702,7 @@ do {                                                                        \
  
  static const char *sched_tunable_scaling_names[] = {
         "none",
-       "logaritmic",
+       "logarithmic",
         "linear"
  };
  
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 35f3ea3..f35930f 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2597,7 +2597,7 @@ out:
  /*
   * Drive the periodic memory faults..
   */
-void task_tick_numa(struct rq *rq, struct task_struct *curr)
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
  {
         struct callback_head *work = &curr->numa_work;
         u64 period, now;
@@ -3571,7 +3571,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
   * Synchronize entity load avg of dequeued entity without locking
   * the previous rq.
   */
-void sync_entity_load_avg(struct sched_entity *se)
+static void sync_entity_load_avg(struct sched_entity *se)
  {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         u64 last_update_time;
@@ -3584,7 +3584,7 @@ void sync_entity_load_avg(struct sched_entity *se)
   * Task first catches up with cfs_rq, and then subtract
   * itself from the cfs_rq (task must be off the queue now).
   */
-void remove_entity_load_avg(struct sched_entity *se)
+static void remove_entity_load_avg(struct sched_entity *se)
  {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         unsigned long flags;
@@ -5145,7 +5145,6 @@ static inline void hrtick_update(struct rq *rq)
  
  #ifdef CONFIG_SMP
  static inline unsigned long cpu_util(int cpu);
-static unsigned long capacity_of(int cpu);
  
  static inline bool cpu_overutilized(int cpu)
  {
@@ -7521,7 +7520,6 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
  {
         lockdep_assert_held(&env->src_rq->lock);
  
-       p->on_rq = TASK_ON_RQ_MIGRATING;
         deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
         set_task_cpu(p, env->dst_cpu);
  }
@@ -7657,7 +7655,6 @@ static void attach_task(struct rq *rq, struct task_struct *p)
  
         BUG_ON(task_rq(p) != rq);
         activate_task(rq, p, ENQUEUE_NOCLOCK);
-       p->on_rq = TASK_ON_RQ_QUEUED;
         check_preempt_curr(rq, p, 0);
  }
  
@@ -9551,22 +9548,26 @@ static inline int on_null_domain(struct rq *rq)
   * - When one of the busy CPUs notice that there may be an idle rebalancing
   *   needed, they will kick the idle load balancer, which then does idle
   *   load balancing for all the idle CPUs.
+ * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED is not
+ *   set anywhere yet.
   */
  
  static inline int find_new_ilb(void)
  {
-       int ilb = cpumask_first(nohz.idle_cpus_mask);
+       int ilb;
  
-       if (ilb < nr_cpu_ids && idle_cpu(ilb))
-               return ilb;
+       for_each_cpu_and(ilb, nohz.idle_cpus_mask,
+                             housekeeping_cpumask(HK_FLAG_MISC)) {
+               if (idle_cpu(ilb))
+                       return ilb;
+       }
  
         return nr_cpu_ids;
  }
  
  /*
- * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
- * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
- * CPU (if there is one).
+ * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
+ * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
   */
  static void kick_ilb(unsigned int flags)
  {
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c

index b02d148..6873020 100644 (file)
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -65,6 +65,7 @@ void __init housekeeping_init(void)
  static int __init housekeeping_setup(char *str, enum hk_flags flags)
  {
         cpumask_var_t non_housekeeping_mask;
+       cpumask_var_t tmp;
         int err;
  
         alloc_bootmem_cpumask_var(&non_housekeeping_mask);
@@ -75,16 +76,23 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
                 return 0;
         }
  
+       alloc_bootmem_cpumask_var(&tmp);
         if (!housekeeping_flags) {
                 alloc_bootmem_cpumask_var(&housekeeping_mask);
                 cpumask_andnot(housekeeping_mask,
                                cpu_possible_mask, non_housekeeping_mask);
-               if (cpumask_empty(housekeeping_mask))
+
+               cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
+               if (cpumask_empty(tmp)) {
+                       pr_warn("Housekeeping: must include one present CPU, "
+                               "using boot CPU:%d\n", smp_processor_id());
                         __cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
+                       __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
+               }
         } else {
-               cpumask_var_t tmp;
-
-               alloc_bootmem_cpumask_var(&tmp);
+               cpumask_andnot(tmp, cpu_present_mask, non_housekeeping_mask);
+               if (cpumask_empty(tmp))
+                       __cpumask_clear_cpu(smp_processor_id(), non_housekeeping_mask);
                 cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
                 if (!cpumask_equal(tmp, housekeeping_mask)) {
                         pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
@@ -92,8 +100,8 @@ static int __init housekeeping_setup(char *str, enum hk_flags flags)
                         free_bootmem_cpumask_var(non_housekeeping_mask);
                         return 0;
                 }
-               free_bootmem_cpumask_var(tmp);
         }
+       free_bootmem_cpumask_var(tmp);
  
         if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
                 if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c

index 90fa23d..1e6b909 100644 (file)
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2555,6 +2555,8 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
         rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
         if (rt_runtime_us < 0)
                 rt_runtime = RUNTIME_INF;
+       else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
+               return -EINVAL;
  
         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
  }
@@ -2575,6 +2577,9 @@ int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
  {
         u64 rt_runtime, rt_period;
  
+       if (rt_period_us > U64_MAX / NSEC_PER_USEC)
+               return -EINVAL;
+
         rt_period = rt_period_us * NSEC_PER_USEC;
         rt_runtime = tg->rt_bandwidth.rt_runtime;
  
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index efa686e..b52ed1a 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -780,7 +780,7 @@ struct root_domain {
          * NULL-terminated list of performance domains intersecting with the
          * CPUs of the rd. Protected by RCU.
          */
-       struct perf_domain      *pd;
+       struct perf_domain __rcu *pd;
  };
  
  extern struct root_domain def_root_domain;
@@ -869,8 +869,8 @@ struct rq {
         atomic_t                nr_iowait;
  
  #ifdef CONFIG_SMP
-       struct root_domain      *rd;
-       struct sched_domain     *sd;
+       struct root_domain              *rd;
+       struct sched_domain __rcu       *sd;
  
         unsigned long           cpu_capacity;
         unsigned long           cpu_capacity_orig;
@@ -1324,13 +1324,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
         return sd;
  }
  
-DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
  DECLARE_PER_CPU(int, sd_llc_size);
  DECLARE_PER_CPU(int, sd_llc_id);
-DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DECLARE_PER_CPU(struct sched_domain *, sd_numa);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
  extern struct static_key_false sched_asym_cpucapacity;
  
  struct sched_group_capacity {
@@ -2185,7 +2185,7 @@ static inline u64 irq_time_read(int cpu)
  #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
  
  #ifdef CONFIG_CPU_FREQ
-DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data);
  
  /**
   * cpufreq_update_util - Take a note about CPU utilization changes.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c

index ab7f371..f53f89d 100644 (file)
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -615,13 +615,13 @@ static void destroy_sched_domains(struct sched_domain *sd)
   * the cpumask of the domain), this allows us to quickly tell if
   * two CPUs are in the same cache domain, see cpus_share_cache().
   */
-DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
  DEFINE_PER_CPU(int, sd_llc_size);
  DEFINE_PER_CPU(int, sd_llc_id);
-DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
-DEFINE_PER_CPU(struct sched_domain *, sd_numa);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_packing);
-DEFINE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity);
+DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
  DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
  
  static void update_top_cache_domain(int cpu)
@@ -1059,6 +1059,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
         struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
         struct sched_domain *child = sd->child;
         struct sched_group *sg;
+       bool already_visited;
  
         if (child)
                 cpu = cpumask_first(sched_domain_span(child));
@@ -1066,9 +1067,14 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
         sg = *per_cpu_ptr(sdd->sg, cpu);
         sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
  
-       /* For claim_allocations: */
-       atomic_inc(&sg->ref);
-       atomic_inc(&sg->sgc->ref);
+       /* Increase refcounts for claim_allocations: */
+       already_visited = atomic_inc_return(&sg->ref) > 1;
+       /* sgc visits should follow a similar trend as sg */
+       WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
+
+       /* If we have already visited that group, it's already initialized. */
+       if (already_visited)
+               return sg;
  
         if (child) {
                 cpumask_copy(sched_group_span(sg), sched_domain_span(child));
@@ -1087,8 +1093,8 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
  
  /*
   * build_sched_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_capacity to 0.
+ * covered by the given span, will set each group's ->cpumask correctly,
+ * and will initialize their ->sgc.
   *
   * Assumes the sched_domain tree is fully constructed
   */
@@ -2075,9 +2081,8 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
  }
  
  /*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- * For now this just excludes isolated CPUs, but could be used to
- * exclude other special cases in the future.
+ * Set up scheduler domains and groups.  For now this just excludes isolated
+ * CPUs, but could be used to exclude other special cases in the future.
   */
  int sched_init_domains(const struct cpumask *cpu_map)
  {
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c

index df40146..e49e809 100644 (file)
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -46,6 +46,14 @@ ktime_t tick_period;
   *    procedure also covers cpu hotplug.
   */
  int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
+ * tick_do_timer_cpu and it should be taken over by an eligible secondary
+ * when one comes online.
+ */
+static int tick_do_timer_boot_cpu __read_mostly = -1;
+#endif
  
  /*
   * Debugging: see timer_list.c
@@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
         }
  }
  
+#ifdef CONFIG_NO_HZ_FULL
+static void giveup_do_timer(void *info)
+{
+       int cpu = *(unsigned int *)info;
+
+       WARN_ON(tick_do_timer_cpu != smp_processor_id());
+
+       tick_do_timer_cpu = cpu;
+}
+
+static void tick_take_do_timer_from_boot(void)
+{
+       int cpu = smp_processor_id();
+       int from = tick_do_timer_boot_cpu;
+
+       if (from >= 0 && from != cpu)
+               smp_call_function_single(from, giveup_do_timer, &cpu, 1);
+}
+#endif
+
  /*
   * Setup the tick device
   */
@@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td,
                  * this cpu:
                  */
                 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
-                       if (!tick_nohz_full_cpu(cpu))
-                               tick_do_timer_cpu = cpu;
-                       else
-                               tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+                       tick_do_timer_cpu = cpu;
+
                         tick_next_period = ktime_get();
                         tick_period = NSEC_PER_SEC / HZ;
+#ifdef CONFIG_NO_HZ_FULL
+                       /*
+                        * The boot CPU may be nohz_full, in which case set
+                        * tick_do_timer_boot_cpu so the first housekeeping
+                        * secondary that comes up will take do_timer from
+                        * us.
+                        */
+                       if (tick_nohz_full_cpu(cpu))
+                               tick_do_timer_boot_cpu = cpu;
+
+               } else if (tick_do_timer_boot_cpu != -1 &&
+                                               !tick_nohz_full_cpu(cpu)) {
+                       tick_take_do_timer_from_boot();
+                       tick_do_timer_boot_cpu = -1;
+                       WARN_ON(tick_do_timer_cpu != cpu);
+#endif
                 }
  
                 /*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c

index 6fa52cd..4aa917a 100644 (file)
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -121,10 +121,16 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
          * into a long sleep. If two CPUs happen to assign themselves to
          * this duty, then the jiffies update is still serialized by
          * jiffies_lock.
+        *
+        * If nohz_full is enabled, this should not happen because the
+        * tick_do_timer_cpu never relinquishes the duty.
          */
-       if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
-           && !tick_nohz_full_cpu(cpu))
+       if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
+#ifdef CONFIG_NO_HZ_FULL
+               WARN_ON(tick_nohz_full_running);
+#endif
                 tick_do_timer_cpu = cpu;
+       }
  #endif
  
         /* Check, if the jiffies need an update */
@@ -395,8 +401,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
  static int tick_nohz_cpu_down(unsigned int cpu)
  {
         /*
-        * The boot CPU handles housekeeping duty (unbound timers,
-        * workqueues, timekeeping, ...) on behalf of full dynticks
+        * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+        * timers, workqueues, timekeeping, ...) on behalf of full dynticks
          * CPUs. It must remain online when nohz full is enabled.
          */
         if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
@@ -423,12 +429,15 @@ void __init tick_nohz_init(void)
                 return;
         }
  
-       cpu = smp_processor_id();
+       if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
+                       !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
+               cpu = smp_processor_id();
  
-       if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
-               pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
-                       cpu);
-               cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+               if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+                       pr_warn("NO_HZ: Clearing %d from nohz_full range "
+                               "for timekeeping\n", cpu);
+                       cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+               }
         }
  
         for_each_cpu(cpu, tick_nohz_full_mask)
@@ -904,8 +913,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
                 /*
                  * Boot safety: make sure the timekeeping duty has been
                  * assigned before entering dyntick-idle mode,
+                * i.e. tick_do_timer_cpu is no longer TICK_DO_TIMER_BOOT.
                  */
-               if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+               if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT))
+                       return false;
+
+               /* Should not happen for nohz-full */
+               if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
                         return false;
         }
  
diff --git a/kernel/workqueue.c b/kernel/workqueue.c

index ddee541..56180c9 100644 (file)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -841,43 +841,32 @@ static void wake_up_worker(struct worker_pool *pool)
  }
  
  /**
- * wq_worker_waking_up - a worker is waking up
+ * wq_worker_running - a worker is running again
   * @task: task waking up
- * @cpu: CPU @task is waking up to
   *
- * This function is called during try_to_wake_up() when a worker is
- * being awoken.
- *
- * CONTEXT:
- * spin_lock_irq(rq->lock)
+ * This function is called when a worker returns from schedule()
   */
-void wq_worker_waking_up(struct task_struct *task, int cpu)
+void wq_worker_running(struct task_struct *task)
  {
         struct worker *worker = kthread_data(task);
  
-       if (!(worker->flags & WORKER_NOT_RUNNING)) {
-               WARN_ON_ONCE(worker->pool->cpu != cpu);
+       if (!worker->sleeping)
+               return;
+       if (!(worker->flags & WORKER_NOT_RUNNING))
                 atomic_inc(&worker->pool->nr_running);
-       }
+       worker->sleeping = 0;
  }
  
  /**
   * wq_worker_sleeping - a worker is going to sleep
   * @task: task going to sleep
   *
- * This function is called during schedule() when a busy worker is
- * going to sleep.  Worker on the same cpu can be woken up by
- * returning pointer to its task.
- *
- * CONTEXT:
- * spin_lock_irq(rq->lock)
- *
- * Return:
- * Worker task on @cpu to wake up, %NULL if none.
+ * This function is called from schedule() when a busy worker is
+ * going to sleep.
   */
-struct task_struct *wq_worker_sleeping(struct task_struct *task)
+void wq_worker_sleeping(struct task_struct *task)
  {
-       struct worker *worker = kthread_data(task), *to_wakeup = NULL;
+       struct worker *next, *worker = kthread_data(task);
         struct worker_pool *pool;
  
         /*
@@ -886,13 +875,15 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
          * checking NOT_RUNNING.
          */
         if (worker->flags & WORKER_NOT_RUNNING)
-               return NULL;
+               return;
  
         pool = worker->pool;
  
-       /* this can only happen on the local cpu */
-       if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id()))
-               return NULL;
+       if (WARN_ON_ONCE(worker->sleeping))
+               return;
+
+       worker->sleeping = 1;
+       spin_lock_irq(&pool->lock);
  
         /*
          * The counterpart of the following dec_and_test, implied mb,
@@ -906,9 +897,12 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task)
          * lock is safe.
          */
         if (atomic_dec_and_test(&pool->nr_running) &&
-           !list_empty(&pool->worklist))
-               to_wakeup = first_idle_worker(pool);
-       return to_wakeup ? to_wakeup->task : NULL;
+           !list_empty(&pool->worklist)) {
+               next = first_idle_worker(pool);
+               if (next)
+                       wake_up_process(next->task);
+       }
+       spin_unlock_irq(&pool->lock);
  }
  
  /**
@@ -4929,7 +4923,7 @@ static void rebind_workers(struct worker_pool *pool)
                  *
                  * WRITE_ONCE() is necessary because @worker->flags may be
                  * tested without holding any lock in
-                * wq_worker_waking_up().  Without it, NOT_RUNNING test may
+                * wq_worker_running().  Without it, NOT_RUNNING test may
                  * fail incorrectly leading to premature concurrency
                  * management operations.
                  */
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h

index cb68b03..498de0e 100644 (file)
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -44,6 +44,7 @@ struct worker {
         unsigned long           last_active;    /* L: last active timestamp */
         unsigned int            flags;          /* X: flags */
         int                     id;             /* I: worker id */
+       int                     sleeping;       /* None: set in wq_worker_sleeping(), cleared in wq_worker_running() */
  
         /*
          * Opaque string set with work_set_desc().  Printed out with task
@@ -72,8 +73,8 @@ static inline struct worker *current_wq_worker(void)
   * Scheduler hooks for concurrency managed workqueue.  Only to be used from
   * sched/ and workqueue.c.
   */
-void wq_worker_waking_up(struct task_struct *task, int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task);
+void wq_worker_running(struct task_struct *task);
+void wq_worker_sleeping(struct task_struct *task);
  work_func_t wq_worker_last_func(struct task_struct *task);
  
  #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
author	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 6 May 2019 21:31:50 +0000 (14:31 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Mon, 6 May 2019 21:31:50 +0000 (14:31 -0700)
arch/powerpc/Kconfig		patch \| blob \| history
include/linux/cpu.h		patch \| blob \| history
include/linux/rcuwait.h		patch \| blob \| history
include/linux/sched/topology.h		patch \| blob \| history
kernel/cgroup/cpuset.c		patch \| blob \| history
kernel/cpu.c		patch \| blob \| history
kernel/kexec_core.c		patch \| blob \| history
kernel/power/Kconfig		patch \| blob \| history
kernel/power/hibernate.c		patch \| blob \| history
kernel/power/suspend.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/cpufreq.c		patch \| blob \| history
kernel/sched/debug.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/isolation.c		patch \| blob \| history
kernel/sched/rt.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history
kernel/sched/topology.c		patch \| blob \| history
kernel/time/tick-common.c		patch \| blob \| history
kernel/time/tick-sched.c		patch \| blob \| history
kernel/workqueue.c		patch \| blob \| history
kernel/workqueue_internal.h		patch \| blob \| history