sched: Remove get_online_cpus() usage

[linux-2.6-microblaze.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 6808d35..c06b8d3 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1013,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         __set_task_cpu(p, new_cpu);
  }
  
+static void __migrate_swap_task(struct task_struct *p, int cpu)
+{
+       if (p->on_rq) { /* queued: move it to cpu's runqueue right away */
+               struct rq *src_rq, *dst_rq;
+
+               src_rq = task_rq(p);
+               dst_rq = cpu_rq(cpu);
+
+               deactivate_task(src_rq, p, 0);
+               set_task_cpu(p, cpu);
+               activate_task(dst_rq, p, 0);
+               check_preempt_curr(dst_rq, p, 0);
+       } else {
+               /*
+                * Task isn't running anymore; make it appear like we migrated
+                * it before it went to sleep. This means on wakeup we make the
+                * previous cpu our target instead of where it really is.
+                */
+               p->wake_cpu = cpu;
+       }
+}
+
+struct migration_swap_arg {    /* filled by migrate_swap() for migrate_swap_stop() */
+       struct task_struct *src_task, *dst_task;        /* the two tasks to exchange */
+       int src_cpu, dst_cpu;   /* task_cpu() of each task, sampled by migrate_swap() */
+};
+
+static int migrate_swap_stop(void *data)
+{
+       struct migration_swap_arg *arg = data;  /* set up by migrate_swap() */
+       struct rq *src_rq, *dst_rq;
+       int ret = -EAGAIN;      /* returned if any recheck below fails */
+
+       src_rq = cpu_rq(arg->src_cpu);
+       dst_rq = cpu_rq(arg->dst_cpu);
+
+       double_raw_lock(&arg->src_task->pi_lock,
+                       &arg->dst_task->pi_lock);
+       double_rq_lock(src_rq, dst_rq);
+       if (task_cpu(arg->dst_task) != arg->dst_cpu)    /* dst_task moved since sampling */
+               goto unlock;
+
+       if (task_cpu(arg->src_task) != arg->src_cpu)    /* src_task moved since sampling */
+               goto unlock;
+
+       if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
+               goto unlock;    /* src_task may not run on dst_cpu */
+
+       if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
+               goto unlock;    /* dst_task may not run on src_cpu */
+
+       __migrate_swap_task(arg->src_task, arg->dst_cpu);
+       __migrate_swap_task(arg->dst_task, arg->src_cpu);
+
+       ret = 0;        /* both tasks were handed to __migrate_swap_task() */
+
+unlock:        /* rq locks are dropped before the two pi_locks */
+       double_rq_unlock(src_rq, dst_rq);
+       raw_spin_unlock(&arg->dst_task->pi_lock);
+       raw_spin_unlock(&arg->src_task->pi_lock);
+
+       return ret;
+}
+
+/*
+ * Cross migrate two tasks: cur is sent to p's cpu and p to cur's cpu.
+ */
+int migrate_swap(struct task_struct *cur, struct task_struct *p)
+{
+       struct migration_swap_arg arg;
+       int ret = -EINVAL;      /* returned when a pre-check below bails out */
+
+       arg = (struct migration_swap_arg){
+               .src_task = cur,
+               .src_cpu = task_cpu(cur),
+               .dst_task = p,
+               .dst_cpu = task_cpu(p),
+       };
+
+       if (arg.src_cpu == arg.dst_cpu) /* same cpu: nothing to swap */
+               goto out;
+
+       /*
+        * These three tests are all lockless; this is OK since all of them
+        * will be re-checked with proper locks held further down the line.
+        */
+       if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
+               goto out;
+
+       if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
+               goto out;
+
+       if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
+               goto out;
+
+       ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
+
+out:
+       return ret;
+}
+
  struct migration_arg {
         struct task_struct *task;
         int dest_cpu;
@@ -1232,9 +1333,9 @@ out:
   * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
   */
  static inline
-int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
  {
-       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+       cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
  
         /*
          * In order not to call set_task_cpu() on a blocking task we need
@@ -1518,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         if (p->sched_class->task_waking)
                 p->sched_class->task_waking(p);
  
-       cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+       cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
         if (task_cpu(p) != cpu) {
                 wake_flags |= WF_MIGRATED;
                 set_task_cpu(p, cpu);
@@ -1600,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
   *
   * __sched_fork() is basic setup used by init_idle() too:
   */
-static void __sched_fork(struct task_struct *p)
+static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
  {
         p->on_rq                        = 0;
  
@@ -1625,16 +1726,23 @@ static void __sched_fork(struct task_struct *p)
  #ifdef CONFIG_NUMA_BALANCING
         if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
                 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
-               p->mm->numa_next_reset = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
                 p->mm->numa_scan_seq = 0;
         }
  
+       if (clone_flags & CLONE_VM)
+               p->numa_preferred_nid = current->numa_preferred_nid;
+       else
+               p->numa_preferred_nid = -1;
+
         p->node_stamp = 0ULL;
         p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-       p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
         p->numa_scan_period = sysctl_numa_balancing_scan_delay;
         p->numa_work.next = &p->numa_work;
         p->numa_faults = NULL;
+       p->numa_faults_buffer = NULL;
+
+       INIT_LIST_HEAD(&p->numa_entry);
+       p->numa_group = NULL;
  #endif /* CONFIG_NUMA_BALANCING */
  }
  
@@ -1660,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
  /*
   * fork()/clone()-time setup:
   */
-void sched_fork(struct task_struct *p)
+void sched_fork(unsigned long clone_flags, struct task_struct *p)
  {
         unsigned long flags;
         int cpu = get_cpu();
  
-       __sched_fork(p);
+       __sched_fork(clone_flags, p);
         /*
          * We mark the process as running here. This guarantees that
          * nobody will actually run it, and a signal or other external
@@ -1750,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)
          *  - cpus_allowed can change in the fork path
          *  - any previously selected cpu might disappear through hotplug
          */
-       set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
+       set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
  #endif
  
         /* Initialize new task's runnable average */
@@ -2078,7 +2186,7 @@ void sched_exec(void)
         int dest_cpu;
  
         raw_spin_lock_irqsave(&p->pi_lock, flags);
-       dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
+       dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
         if (dest_cpu == smp_processor_id())
                 goto unlock;
  
@@ -3603,7 +3711,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
         struct task_struct *p;
         int retval;
  
-       get_online_cpus();
         rcu_read_lock();
  
         p = find_process_by_pid(pid);
@@ -3666,7 +3773,6 @@ out_free_cpus_allowed:
         free_cpumask_var(cpus_allowed);
  out_put_task:
         put_task_struct(p);
-       put_online_cpus();
         return retval;
  }
  
@@ -3711,7 +3817,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
         unsigned long flags;
         int retval;
  
-       get_online_cpus();
         rcu_read_lock();
  
         retval = -ESRCH;
@@ -3724,12 +3829,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
                 goto out_unlock;
  
         raw_spin_lock_irqsave(&p->pi_lock, flags);
-       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
+       cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
  
  out_unlock:
         rcu_read_unlock();
-       put_online_cpus();
  
         return retval;
  }
@@ -4186,7 +4290,7 @@ void init_idle(struct task_struct *idle, int cpu)
  
         raw_spin_lock_irqsave(&rq->lock, flags);
  
-       __sched_fork(idle);
+       __sched_fork(0, idle);
         idle->state = TASK_RUNNING;
         idle->se.exec_start = sched_clock();
  
@@ -4346,6 +4450,53 @@ fail:
         return ret;
  }
  
+#ifdef CONFIG_NUMA_BALANCING
+/* Migrate current task p to target_cpu; -EINVAL if p may not run there */
+int migrate_task_to(struct task_struct *p, int target_cpu)
+{
+       struct migration_arg arg = { p, target_cpu };
+       int curr_cpu = task_cpu(p);
+
+       if (curr_cpu == target_cpu)
+               return 0;       /* already on target_cpu */
+
+       if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
+               return -EINVAL;
+
+       /* TODO: This is not properly updating schedstats */
+
+       return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
+}
+
+/*
+ * Set p's preferred NUMA node to nid, requeueing the task around the update
+ * to accurately track the number of NUMA tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+       struct rq *rq;
+       unsigned long flags;
+       bool on_rq, running;
+
+       rq = task_rq_lock(p, &flags);
+       on_rq = p->on_rq;
+       running = task_current(rq, p);
+
+       if (on_rq)
+               dequeue_task(rq, p, 0);
+       if (running)
+               p->sched_class->put_prev_task(rq, p);
+
+       p->numa_preferred_nid = nid;    /* changed while p is off the queue */
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (on_rq)
+               enqueue_task(rq, p, 0);
+       task_rq_unlock(rq, p, &flags);
+}
+#endif
+
  /*
   * migration_cpu_stop - this will be executed by a highprio stopper thread
   * and performs thread migration by bumping thread off CPU then
@@ -5119,6 +5270,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  DEFINE_PER_CPU(struct sched_domain *, sd_llc);
  DEFINE_PER_CPU(int, sd_llc_size);
  DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
  
  static void update_top_cache_domain(int cpu)
  {
@@ -5135,6 +5287,9 @@ static void update_top_cache_domain(int cpu)
         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
         per_cpu(sd_llc_size, cpu) = size;
         per_cpu(sd_llc_id, cpu) = id;
+
+       sd = lowest_flag_domain(cpu, SD_NUMA);
+       rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
  }
  
  /*
@@ -5654,6 +5809,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
                                         | 0*SD_SHARE_PKG_RESOURCES
                                         | 1*SD_SERIALIZE
                                         | 0*SD_PREFER_SIBLING
+                                       | 1*SD_NUMA
                                         | sd_local_flags(level)
                                         ,
                 .last_balance           = jiffies,
@@ -6335,14 +6491,17 @@ void __init sched_init_smp(void)
  
         sched_init_numa();
  
-       get_online_cpus();
+       /*
+        * There's no userspace yet to cause hotplug operations; hence all the
+        * cpu masks are stable and all blatant races in the below code cannot
+        * happen.
+        */
         mutex_lock(&sched_domains_mutex);
         init_sched_domains(cpu_active_mask);
         cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
         if (cpumask_empty(non_isolated_cpus))
                 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
         mutex_unlock(&sched_domains_mutex);
-       put_online_cpus();
  
         hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
         hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);