sched, isolcpu: make cpu_isolated_map visible outside scheduler

[linux-2.6-microblaze.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 5eab11d..b578bb2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
  {
         s64 delta;
  
-       if (rq->skip_clock_update > 0)
+       lockdep_assert_held(&rq->lock);
+
+       if (rq->clock_skip_update & RQCF_ACT_SKIP)
                 return;
  
         delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -304,65 +306,8 @@ __read_mostly int scheduler_running;
   */
  int sysctl_sched_rt_runtime = 950000;
  
-/*
- * __task_rq_lock - lock the rq @p resides on.
- */
-static inline struct rq *__task_rq_lock(struct task_struct *p)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       lockdep_assert_held(&p->pi_lock);
-
-       for (;;) {
-               rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-                       return rq;
-               raw_spin_unlock(&rq->lock);
-
-               while (unlikely(task_on_rq_migrating(p)))
-                       cpu_relax();
-       }
-}
-
-/*
- * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
- */
-static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
-       __acquires(p->pi_lock)
-       __acquires(rq->lock)
-{
-       struct rq *rq;
-
-       for (;;) {
-               raw_spin_lock_irqsave(&p->pi_lock, *flags);
-               rq = task_rq(p);
-               raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
-                       return rq;
-               raw_spin_unlock(&rq->lock);
-               raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-
-               while (unlikely(task_on_rq_migrating(p)))
-                       cpu_relax();
-       }
-}
-
-static void __task_rq_unlock(struct rq *rq)
-       __releases(rq->lock)
-{
-       raw_spin_unlock(&rq->lock);
-}
-
-static inline void
-task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
-       __releases(rq->lock)
-       __releases(p->pi_lock)
-{
-       raw_spin_unlock(&rq->lock);
-       raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
-}
+/* cpus with isolated domains */
+cpumask_var_t cpu_isolated_map;
  
  /*
   * this_rq_lock - lock this runqueue and disable interrupts.
@@ -490,6 +435,11 @@ static __init void init_hrtick(void)
   */
  void hrtick_start(struct rq *rq, u64 delay)
  {
+       /*
+        * Don't schedule slices shorter than 10000ns, that just
+        * doesn't make sense. Rely on vruntime for fairness.
+        */
+       delay = max_t(u64, delay, 10000LL);
         __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
                         HRTIMER_MODE_REL_PINNED, 0);
  }
@@ -1046,7 +996,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
          * this case, we can save a useless back to back clock update.
          */
         if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
-               rq->skip_clock_update = 1;
+               rq_clock_skip_update(rq, true);
  }
  
  #ifdef CONFIG_SMP
@@ -1082,7 +1032,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                 if (p->sched_class->migrate_task_rq)
                         p->sched_class->migrate_task_rq(p, new_cpu);
                 p->se.nr_migrations++;
-               perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+               perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
         }
  
         __set_task_cpu(p, new_cpu);
@@ -1836,6 +1786,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         p->se.prev_sum_exec_runtime     = 0;
         p->se.nr_migrations             = 0;
         p->se.vruntime                  = 0;
+#ifdef CONFIG_SMP
+       p->se.avg.decay_count           = 0;
+#endif
         INIT_LIST_HEAD(&p->se.group_node);
  
  #ifdef CONFIG_SCHEDSTATS
@@ -2755,6 +2708,10 @@ again:
   *          - explicit schedule() call
   *          - return from syscall or exception to user-space
   *          - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
   */
  static void __sched __schedule(void)
  {
@@ -2763,7 +2720,6 @@ static void __sched __schedule(void)
         struct rq *rq;
         int cpu;
  
-need_resched:
         preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
@@ -2783,6 +2739,8 @@ need_resched:
         smp_mb__before_spinlock();
         raw_spin_lock_irq(&rq->lock);
  
+       rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+
         switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                 if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2807,13 +2765,13 @@ need_resched:
                 switch_count = &prev->nvcsw;
         }
  
-       if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
+       if (task_on_rq_queued(prev))
                 update_rq_clock(rq);
  
         next = pick_next_task(rq, prev);
         clear_tsk_need_resched(prev);
         clear_preempt_need_resched();
-       rq->skip_clock_update = 0;
+       rq->clock_skip_update = 0;
  
         if (likely(prev != next)) {
                 rq->nr_switches++;
@@ -2828,8 +2786,6 @@ need_resched:
         post_schedule(rq);
  
         sched_preempt_enable_no_resched();
-       if (need_resched())
-               goto need_resched;
  }
  
  static inline void sched_submit_work(struct task_struct *tsk)
@@ -2849,7 +2805,9 @@ asmlinkage __visible void __sched schedule(void)
         struct task_struct *tsk = current;
  
         sched_submit_work(tsk);
-       __schedule();
+       do {
+               __schedule();
+       } while (need_resched());
  }
  EXPORT_SYMBOL(schedule);
  
@@ -2884,6 +2842,21 @@ void __sched schedule_preempt_disabled(void)
         preempt_disable();
  }
  
+static void __sched notrace preempt_schedule_common(void)
+{
+       do {
+               __preempt_count_add(PREEMPT_ACTIVE);
+               __schedule();
+               __preempt_count_sub(PREEMPT_ACTIVE);
+
+               /*
+                * PREEMPT_ACTIVE is dropped again; loop via need_resched() in
+                * case a preemption request arrived between schedule and now.
+                */
+               barrier();
+       } while (need_resched());
+}
+
  #ifdef CONFIG_PREEMPT
  /*
   * this is the entry point to schedule() from in-kernel preemption
@@ -2899,17 +2872,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
         if (likely(!preemptible()))
                 return;
  
-       do {
-               __preempt_count_add(PREEMPT_ACTIVE);
-               __schedule();
-               __preempt_count_sub(PREEMPT_ACTIVE);
-
-               /*
-                * Check again in case we missed a preemption opportunity
-                * between schedule and now.
-                */
-               barrier();
-       } while (need_resched());
+       preempt_schedule_common();
  }
  NOKPROBE_SYMBOL(preempt_schedule);
  EXPORT_SYMBOL(preempt_schedule);
@@ -3405,6 +3368,20 @@ static bool check_same_owner(struct task_struct *p)
         return match;
  }
  
+/* Report whether @attr requests deadline parameters differing from @p's. */
+static bool dl_param_changed(struct task_struct *p,
+               const struct sched_attr *attr)
+{
+       struct sched_dl_entity *cur = &p->dl;
+       bool same;
+
+       same = cur->dl_runtime == attr->sched_runtime &&
+              cur->dl_deadline == attr->sched_deadline &&
+              cur->dl_period == attr->sched_period &&
+              cur->flags == attr->sched_flags;
+       return !same;
+}
+
  static int __sched_setscheduler(struct task_struct *p,
                                 const struct sched_attr *attr,
                                 bool user)
@@ -3533,7 +3510,7 @@ recheck:
                         goto change;
                 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
                         goto change;
-               if (dl_policy(policy))
+               if (dl_policy(policy) && dl_param_changed(p, attr))
                         goto change;
  
                 p->sched_reset_on_fork = reset_on_fork;
@@ -4225,17 +4202,10 @@ SYSCALL_DEFINE0(sched_yield)
         return 0;
  }
  
-static void __cond_resched(void)
-{
-       __preempt_count_add(PREEMPT_ACTIVE);
-       __schedule();
-       __preempt_count_sub(PREEMPT_ACTIVE);
-}
-
  int __sched _cond_resched(void)
  {
         if (should_resched()) {
-               __cond_resched();
+               preempt_schedule_common();
                 return 1;
         }
         return 0;
@@ -4260,7 +4230,7 @@ int __cond_resched_lock(spinlock_t *lock)
         if (spin_needbreak(lock) || resched) {
                 spin_unlock(lock);
                 if (resched)
-                       __cond_resched();
+                       preempt_schedule_common();
                 else
                         cpu_relax();
                 ret = 1;
@@ -4276,7 +4246,7 @@ int __sched __cond_resched_softirq(void)
  
         if (should_resched()) {
                 local_bh_enable();
-               __cond_resched();
+               preempt_schedule_common();
                 local_bh_disable();
                 return 1;
         }
@@ -4391,36 +4361,29 @@ EXPORT_SYMBOL_GPL(yield_to);
   * This task is about to go to sleep on IO. Increment rq->nr_iowait so
   * that process accounting knows that this is a task in IO wait state.
   */
-void __sched io_schedule(void)
-{
-       struct rq *rq = raw_rq();
-
-       delayacct_blkio_start();
-       atomic_inc(&rq->nr_iowait);
-       blk_flush_plug(current);
-       current->in_iowait = 1;
-       schedule();
-       current->in_iowait = 0;
-       atomic_dec(&rq->nr_iowait);
-       delayacct_blkio_end();
-}
-EXPORT_SYMBOL(io_schedule);
-
  long __sched io_schedule_timeout(long timeout)
  {
-       struct rq *rq = raw_rq();
+       int old_iowait = current->in_iowait;
+       struct rq *rq;
         long ret;
  
+       current->in_iowait = 1;
+       if (old_iowait)
+               blk_schedule_flush_plug(current);
+       else
+               blk_flush_plug(current);
+
         delayacct_blkio_start();
+       rq = raw_rq();
         atomic_inc(&rq->nr_iowait);
-       blk_flush_plug(current);
-       current->in_iowait = 1;
         ret = schedule_timeout(timeout);
-       current->in_iowait = 0;
+       current->in_iowait = old_iowait;
         atomic_dec(&rq->nr_iowait);
         delayacct_blkio_end();
+
         return ret;
  }
+EXPORT_SYMBOL(io_schedule_timeout);
  
  /**
   * sys_sched_get_priority_max - return maximum RT priority.
@@ -4531,9 +4494,10 @@ void sched_show_task(struct task_struct *p)
  {
         unsigned long free = 0;
         int ppid;
-       unsigned state;
+       unsigned long state = p->state;
  
-       state = p->state ? __ffs(p->state) + 1 : 0;
+       if (state)
+               state = __ffs(state) + 1;
         printk(KERN_INFO "%-15.15s %c", p->comm,
                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
  #if BITS_PER_LONG == 32
@@ -4766,7 +4730,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
  
  void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  {
-       if (p->sched_class && p->sched_class->set_cpus_allowed)
+       if (p->sched_class->set_cpus_allowed)
                 p->sched_class->set_cpus_allowed(p, new_mask);
  
         cpumask_copy(&p->cpus_allowed, new_mask);
@@ -5434,9 +5398,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                   struct cpumask *groupmask)
  {
         struct sched_group *group = sd->groups;
-       char str[256];
  
-       cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
         cpumask_clear(groupmask);
  
         printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
@@ -5449,7 +5411,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                 return -1;
         }
  
-       printk(KERN_CONT "span %s level %s\n", str, sd->name);
+       printk(KERN_CONT "span %*pbl level %s\n",
+              cpumask_pr_args(sched_domain_span(sd)), sd->name);
  
         if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -5494,9 +5457,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
  
                 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
  
-               cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-
-               printk(KERN_CONT " %s", str);
+               printk(KERN_CONT " %*pbl",
+                      cpumask_pr_args(sched_group_cpus(group)));
                 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
                         printk(KERN_CONT " (cpu_capacity = %d)",
                                 group->sgc->capacity);
@@ -5852,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
         update_top_cache_domain(cpu);
  }
  
-/* cpus with isolated domains */
-static cpumask_var_t cpu_isolated_map;
-
  /* Setup the mask of cpus configured for isolated domains */
  static int __init isolated_cpu_setup(char *str)
  {
@@ -7275,6 +7234,11 @@ void __init sched_init(void)
         atomic_inc(&init_mm.mm_count);
         enter_lazy_tlb(&init_mm, current);
  
+       /*
+        * During early bootup we pretend to be a normal task:
+        */
+       current->sched_class = &fair_sched_class;
+
         /*
          * Make us the idle thread. Technically, schedule() should not be
          * called from this thread, however somewhere below it might be,
@@ -7285,11 +7249,6 @@ void __init sched_init(void)
  
         calc_load_update = jiffies + LOAD_FREQ;
  
-       /*
-        * During early bootup we pretend to be a normal task:
-        */
-       current->sched_class = &fair_sched_class;
-
  #ifdef CONFIG_SMP
         zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
         /* May be allocated at isolcpus cmdline parse time */
@@ -7350,6 +7309,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
                         in_atomic(), irqs_disabled(),
                         current->pid, current->comm);
  
+       if (task_stack_end_corrupted(current))
+               printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
         debug_show_held_locks(current);
         if (irqs_disabled())
                 print_irqtrace_events(current);
@@ -7613,6 +7575,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
  {
         struct task_struct *g, *p;
  
+       /*
+        * Autogroups do not have RT tasks; see autogroup_create().
+        */
+       if (task_group_is_autogroup(tg))
+               return 0;
+
         for_each_process_thread(g, p) {
                 if (rt_task(p) && task_group(p) == tg)
                         return 1;
@@ -7705,6 +7673,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
  {
         int i, err = 0;
  
+       /*
+        * Disallowing the root group RT runtime is BAD: it would prevent the
+        * kernel from creating (and/or operating) RT threads.
+        */
+       if (tg == &root_task_group && rt_runtime == 0)
+               return -EINVAL;
+
+       /* No period doesn't make any sense. */
+       if (rt_period == 0)
+               return -EINVAL;
+
         mutex_lock(&rt_constraints_mutex);
         read_lock(&tasklist_lock);
         err = __rt_schedulable(tg, rt_period, rt_runtime);
@@ -7761,9 +7740,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
         rt_period = (u64)rt_period_us * NSEC_PER_USEC;
         rt_runtime = tg->rt_bandwidth.rt_runtime;
  
-       if (rt_period == 0)
-               return -EINVAL;
-
         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
  }