sched: Optimize finish_lock_switch()

[linux-2.6-microblaze.git] / kernel / sched / sched.h
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 28709f6..12ada79 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -67,7 +67,6 @@
  #include <linux/tsacct_kern.h>
  
  #include <asm/tlb.h>
-#include <asm-generic/vmlinux.lds.h>
  
  #ifdef CONFIG_PARAVIRT
  # include <asm/paravirt.h>
@@ -257,30 +256,6 @@ struct rt_bandwidth {
  
  void __dl_clear_params(struct task_struct *p);
  
-/*
- * To keep the bandwidth of -deadline tasks and groups under control
- * we need some place where:
- *  - store the maximum -deadline bandwidth of the system (the group);
- *  - cache the fraction of that bandwidth that is currently allocated.
- *
- * This is all done in the data structure below. It is similar to the
- * one used for RT-throttling (rt_bandwidth), with the main difference
- * that, since here we are only interested in admission control, we
- * do not decrease any runtime while the group "executes", neither we
- * need a timer to replenish it.
- *
- * With respect to SMP, the bandwidth is given on a per-CPU basis,
- * meaning that:
- *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
- *  - dl_total_bw array contains, in the i-eth element, the currently
- *    allocated bandwidth on the i-eth CPU.
- * Moreover, groups consume bandwidth on each CPU, while tasks only
- * consume bandwidth on the CPU they're running on.
- * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
- * that will be shown the next time the proc or cgroup controls will
- * be red. It on its turn can be changed by writing on its own
- * control.
- */
  struct dl_bandwidth {
         raw_spinlock_t          dl_runtime_lock;
         u64                     dl_runtime;
@@ -292,6 +267,24 @@ static inline int dl_bandwidth_enabled(void)
         return sysctl_sched_rt_runtime >= 0;
  }
  
+/*
+ * To keep the bandwidth of -deadline tasks under control
+ * we need some place to:
+ *  - store the maximum -deadline bandwidth of each CPU;
+ *  - cache the fraction of bandwidth that is currently allocated in
+ *    each root domain;
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", nor do we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, bandwidth is given on a per root domain basis,
+ * meaning that:
+ *  - bw (< 100%) is the deadline bandwidth of each CPU;
+ *  - total_bw is the currently allocated bandwidth in each root domain;
+ */
  struct dl_bw {
         raw_spinlock_t          lock;
         u64                     bw;
@@ -801,6 +794,15 @@ struct root_domain {
         struct dl_bw            dl_bw;
         struct cpudl            cpudl;
  
+       /*
+        * Indicate whether a root_domain's dl_bw has been checked or
+        * updated. It is a monotonically increasing value.
+        *
+        * Corner cases such as wrap-around would be dangerous, but given
+        * that u64 is 'big enough', that shouldn't be a concern.
+        */
+       u64 visit_gen;
+
  #ifdef HAVE_RT_PUSH_IPI
         /*
          * For IPI pull requests, loop across the rto_mask.
@@ -1003,6 +1005,10 @@ struct rq {
  
         /* This is used to determine avg_idle's max value */
         u64                     max_idle_balance_cost;
+
+#ifdef CONFIG_HOTPLUG_CPU
+       struct rcuwait          hotplug_wait;
+#endif
  #endif /* CONFIG_SMP */
  
  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -1048,6 +1054,12 @@ struct rq {
         /* Must be inspected within a rcu lock section */
         struct cpuidle_state    *idle_state;
  #endif
+
+#ifdef CONFIG_SMP
+       unsigned int            nr_pinned;
+#endif
+       unsigned int            push_busy;
+       struct cpu_stop_work    push_work;
  };
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1075,6 +1087,16 @@ static inline int cpu_of(struct rq *rq)
  #endif
  }
  
+#define MDF_PUSH       0x01
+
+static inline bool is_migration_disabled(struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+       return p->migration_disabled;   /* any nonzero value converts to true */
+#else
+       return false;                   /* without CONFIG_SMP the answer is always false */
+#endif
+}
  
  #ifdef CONFIG_SCHED_SMT
  extern void __update_idle_core(struct rq *rq);
@@ -1203,6 +1225,8 @@ struct rq_flags {
  #endif
  };
  
+extern struct callback_head balance_push_callback;
+
  /*
   * Lockdep annotation that avoids accidental unlocks; it's like a
   * sticky/continuous lockdep_assert_held().
@@ -1220,6 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
  #ifdef CONFIG_SCHED_DEBUG
         rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
         rf->clock_update_flags = 0;
+#ifdef CONFIG_SMP
+       SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback);
+#endif
  #endif
  }
  
@@ -1389,7 +1416,7 @@ queue_balance_callback(struct rq *rq,
  {
         lockdep_assert_held(&rq->lock);
  
-       if (unlikely(head->next))
+       if (unlikely(head->next || rq->balance_callback == &balance_push_callback))
                 return;
  
         head->func = (void (*)(struct callback_head *))func;
@@ -1471,7 +1498,7 @@ struct sched_group_capacity {
         int                     id;
  #endif
  
-       unsigned long           cpumask[0];             /* Balance mask */
+       unsigned long           cpumask[];              /* Balance mask */
  };
  
  struct sched_group {
@@ -1629,7 +1656,7 @@ enum {
  
  #undef SCHED_FEAT
  
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
+#ifdef CONFIG_SCHED_DEBUG
  
  /*
   * To support run-time toggling of sched features, all the translation units
@@ -1637,6 +1664,7 @@ enum {
   */
  extern const_debug unsigned int sysctl_sched_features;
  
+#ifdef CONFIG_JUMP_LABEL
  #define SCHED_FEAT(name, enabled)                                      \
  static __always_inline bool static_branch_##name(struct static_key *key) \
  {                                                                      \
@@ -1649,7 +1677,13 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
  extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
  #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
  
-#else /* !(SCHED_DEBUG && CONFIG_JUMP_LABEL) */
+#else /* !CONFIG_JUMP_LABEL */
+
+#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+
+#endif /* CONFIG_JUMP_LABEL */
+
+#else /* !SCHED_DEBUG */
  
  /*
   * Each translation unit has its own copy of sysctl_sched_features to allow
@@ -1665,7 +1699,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
  
  #define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
  
-#endif /* SCHED_DEBUG && CONFIG_JUMP_LABEL */
+#endif /* SCHED_DEBUG */
  
  extern struct static_key_false sched_numa_balancing;
  extern struct static_key_false sched_schedstats;
@@ -1707,13 +1741,20 @@ static inline int task_on_rq_migrating(struct task_struct *p)
         return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
  }
  
-/*
- * wake flags
- */
-#define WF_SYNC                        0x01            /* Waker goes to sleep after wakeup */
-#define WF_FORK                        0x02            /* Child wakeup after fork */
-#define WF_MIGRATED            0x04            /* Internal use, task got migrated */
-#define WF_ON_CPU              0x08            /* Wakee is on_cpu */
+/* Wake flags. The first three directly map to some SD flag value */
+#define WF_EXEC     0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */
+#define WF_FORK     0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */
+#define WF_TTWU     0x08 /* Wakeup;            maps to SD_BALANCE_WAKE */
+
+#define WF_SYNC     0x10 /* Waker goes to sleep after wakeup */
+#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
+#define WF_ON_CPU   0x40 /* Wakee is on_cpu */
+
+#ifdef CONFIG_SMP
+static_assert(WF_EXEC == SD_BALANCE_EXEC);
+static_assert(WF_FORK == SD_BALANCE_FORK);
+static_assert(WF_TTWU == SD_BALANCE_WAKE);
+#endif
  
  /*
   * To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1789,16 +1830,19 @@ struct sched_class {
  
  #ifdef CONFIG_SMP
         int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
-       int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+       int  (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
         void (*migrate_task_rq)(struct task_struct *p, int new_cpu);
  
         void (*task_woken)(struct rq *this_rq, struct task_struct *task);
  
         void (*set_cpus_allowed)(struct task_struct *p,
-                                const struct cpumask *newmask);
+                                const struct cpumask *newmask,
+                                u32 flags);
  
         void (*rq_online)(struct rq *rq);
         void (*rq_offline)(struct rq *rq);
+
+       struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
  #endif
  
         void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
@@ -1826,7 +1870,7 @@ struct sched_class {
  #ifdef CONFIG_FAIR_GROUP_SCHED
         void (*task_change_group)(struct task_struct *p, int type);
  #endif
-} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */
+};
  
  static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
  {
@@ -1840,6 +1884,20 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
         next->sched_class->set_next_task(rq, next, false);
  }
  
+
+/*
+ * Helper to define a sched_class instance; each one is placed in a separate
+ * section which is ordered by the linker script:
+ *
+ *   include/asm-generic/vmlinux.lds.h
+ *
+ * Also enforce alignment on the instance, not the type, to guarantee layout.
+ */
+#define DEFINE_SCHED_CLASS(name) \
+const struct sched_class name##_sched_class \
+       __aligned(__alignof__(struct sched_class)) \
+       __section("__" #name "_sched_class")
+
  /* Defined in include/asm-generic/vmlinux.lds.h */
  extern struct sched_class __begin_sched_classes[];
  extern struct sched_class __end_sched_classes[];
@@ -1882,13 +1940,35 @@ static inline bool sched_fair_runnable(struct rq *rq)
  extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
  extern struct task_struct *pick_next_task_idle(struct rq *rq);
  
+#define SCA_CHECK              0x01
+#define SCA_MIGRATE_DISABLE    0x02
+#define SCA_MIGRATE_ENABLE     0x04
+
  #ifdef CONFIG_SMP
  
  extern void update_group_capacity(struct sched_domain *sd, int cpu);
  
  extern void trigger_load_balance(struct rq *rq);
  
-extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static inline struct task_struct *get_push_task(struct rq *rq)
+{
+       struct task_struct *p = rq->curr;       /* candidate is the task in rq->curr */
+
+       lockdep_assert_held(&rq->lock);         /* caller is expected to hold rq->lock */
+
+       if (rq->push_busy)                      /* push_busy already set: hand out no task */
+               return NULL;
+
+       if (p->nr_cpus_allowed == 1)            /* nr_cpus_allowed is 1: hand out no task */
+               return NULL;
+
+       rq->push_busy = true;                   /* claim the slot; it is not cleared in this function */
+       return get_task_struct(p);              /* NOTE(review): presumably returns p with a reference taken - confirm */
+}
+
+extern int push_cpu_stop(void *arg);
  
  #endif