Merge remote-tracking branch 'torvalds/master' into perf/core
[linux-2.6-microblaze.git] / kernel / sched / debug.c
index 486f403..c5aacbd 100644 (file)
@@ -8,8 +8,6 @@
  */
 #include "sched.h"
 
-static DEFINE_SPINLOCK(sched_debug_lock);
-
 /*
  * This allows printing both to /proc/sched_debug and
  * to the console
@@ -169,245 +167,258 @@ static const struct file_operations sched_feat_fops = {
        .release        = single_release,
 };
 
-__read_mostly bool sched_debug_enabled;
+#ifdef CONFIG_SMP
 
-static __init int sched_init_debug(void)
+static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
+                                  size_t cnt, loff_t *ppos)
 {
-       debugfs_create_file("sched_features", 0644, NULL, NULL,
-                       &sched_feat_fops);
+       char buf[16];
 
-       debugfs_create_bool("sched_debug", 0644, NULL,
-                       &sched_debug_enabled);
+       if (cnt > 15)
+               cnt = 15;
 
-       return 0;
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+               return -EINVAL;
+
+       if (sched_update_scaling())
+               return -EINVAL;
+
+       *ppos += cnt;
+       return cnt;
 }
-late_initcall(sched_init_debug);
 
-#ifdef CONFIG_SMP
+static int sched_scaling_show(struct seq_file *m, void *v)
+{
+       seq_printf(m, "%d\n", sysctl_sched_tunable_scaling);
+       return 0;
+}
 
-#ifdef CONFIG_SYSCTL
/* debugfs open: route reads through sched_scaling_show() via seq_file. */
static int sched_scaling_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_scaling_show, NULL);
}
 
-static struct ctl_table sd_ctl_dir[] = {
-       {
-               .procname       = "sched_domain",
-               .mode           = 0555,
-       },
-       {}
/* "tunable_scaling": read via seq_file, writes parsed by sched_scaling_write(). */
static const struct file_operations sched_scaling_fops = {
	.open		= sched_scaling_open,
	.write		= sched_scaling_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
 
-static struct ctl_table sd_ctl_root[] = {
-       {
-               .procname       = "kernel",
-               .mode           = 0555,
-               .child          = sd_ctl_dir,
-       },
-       {}
-};
+#endif /* SMP */
 
-static struct ctl_table *sd_alloc_ctl_entry(int n)
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+                                  size_t cnt, loff_t *ppos)
 {
-       struct ctl_table *entry =
-               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+       char buf[16];
+       int mode;
+
+       if (cnt > 15)
+               cnt = 15;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
 
-       return entry;
+       buf[cnt] = 0;
+       mode = sched_dynamic_mode(strstrip(buf));
+       if (mode < 0)
+               return mode;
+
+       sched_dynamic_update(mode);
+
+       *ppos += cnt;
+
+       return cnt;
 }
 
-static void sd_free_ctl_entry(struct ctl_table **tablep)
+static int sched_dynamic_show(struct seq_file *m, void *v)
 {
-       struct ctl_table *entry;
-
-       /*
-        * In the intermediate directories, both the child directory and
-        * procname are dynamically allocated and could fail but the mode
-        * will always be set. In the lowest directory the names are
-        * static strings and all have proc handlers.
-        */
-       for (entry = *tablep; entry->mode; entry++) {
-               if (entry->child)
-                       sd_free_ctl_entry(&entry->child);
-               if (entry->proc_handler == NULL)
-                       kfree(entry->procname);
+       static const char * preempt_modes[] = {
+               "none", "voluntary", "full"
+       };
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+               if (preempt_dynamic_mode == i)
+                       seq_puts(m, "(");
+               seq_puts(m, preempt_modes[i]);
+               if (preempt_dynamic_mode == i)
+                       seq_puts(m, ")");
+
+               seq_puts(m, " ");
        }
 
-       kfree(*tablep);
-       *tablep = NULL;
+       seq_puts(m, "\n");
+       return 0;
 }
 
-static void
-set_table_entry(struct ctl_table *entry,
-               const char *procname, void *data, int maxlen,
-               umode_t mode, proc_handler *proc_handler)
/* debugfs open: route reads of "preempt" through sched_dynamic_show(). */
static int sched_dynamic_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_dynamic_show, NULL);
}
 
-static int sd_ctl_doflags(struct ctl_table *table, int write,
-                         void *buffer, size_t *lenp, loff_t *ppos)
/* "preempt": readable mode list, writable to switch modes at runtime. */
static const struct file_operations sched_dynamic_fops = {
	.open		= sched_dynamic_open,
	.write		= sched_dynamic_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

#endif /* CONFIG_PREEMPT_DYNAMIC */
+
+__read_mostly bool sched_debug_verbose;
+
+static const struct seq_operations sched_debug_sops;
+
/* Open /sys/kernel/debug/sched/debug: full dump driven by sched_debug_sops. */
static int sched_debug_open(struct inode *inode, struct file *filp)
{
	return seq_open(filp, &sched_debug_sops);
}
 
-       if (write)
-               return 0;
/* Read-only "debug" file backed by the sched_debug_sops iterator. */
static const struct file_operations sched_debug_fops = {
	.open		= sched_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
 
-       for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
-               char *name = sd_flag_debug[idx].name;
+static struct dentry *debugfs_sched;
 
-               /* Name plus whitespace */
-               data_size += strlen(name) + 1;
-       }
/*
 * Create the /sys/kernel/debug/sched/ hierarchy exposing the scheduler
 * debug knobs (this commit moves them here from sysctl).  Runs as a
 * late initcall so debugfs and the scheduler topology are already up.
 */
static __init int sched_init_debug(void)
{
	struct dentry __maybe_unused *numa;

	debugfs_sched = debugfs_create_dir("sched", NULL);

	debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
	debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
#ifdef CONFIG_PREEMPT_DYNAMIC
	debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
#endif

	debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
	debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
	debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);

	debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
	debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);

#ifdef CONFIG_SMP
	debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
	debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
	debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);

	/* Serialize against concurrent topology changes while populating. */
	mutex_lock(&sched_domains_mutex);
	update_sched_domain_debugfs();
	mutex_unlock(&sched_domains_mutex);
#endif

#ifdef CONFIG_NUMA_BALANCING
	numa = debugfs_create_dir("numa_balancing", debugfs_sched);

	debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay);
	debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
	debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
	debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
#endif

	debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);

	return 0;
}
late_initcall(sched_init_debug);
+
+#ifdef CONFIG_SMP
 
-static struct ctl_table *
-sd_alloc_ctl_domain_table(struct sched_domain *sd)
+static cpumask_var_t           sd_sysctl_cpus;
+static struct dentry           *sd_dentry;
+
+static int sd_flags_show(struct seq_file *m, void *v)
 {
-       struct ctl_table *table = sd_alloc_ctl_entry(9);
-
-       if (table == NULL)
-               return NULL;
-
-       set_table_entry(&table[0], "min_interval",        &sd->min_interval,        sizeof(long), 0644, proc_doulongvec_minmax);
-       set_table_entry(&table[1], "max_interval",        &sd->max_interval,        sizeof(long), 0644, proc_doulongvec_minmax);
-       set_table_entry(&table[2], "busy_factor",         &sd->busy_factor,         sizeof(int),  0644, proc_dointvec_minmax);
-       set_table_entry(&table[3], "imbalance_pct",       &sd->imbalance_pct,       sizeof(int),  0644, proc_dointvec_minmax);
-       set_table_entry(&table[4], "cache_nice_tries",    &sd->cache_nice_tries,    sizeof(int),  0644, proc_dointvec_minmax);
-       set_table_entry(&table[5], "flags",               &sd->flags,               sizeof(int),  0444, sd_ctl_doflags);
-       set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
-       set_table_entry(&table[7], "name",                sd->name,            CORENAME_MAX_SIZE, 0444, proc_dostring);
-       /* &table[8] is terminator */
-
-       return table;
+       unsigned long flags = *(unsigned int *)m->private;
+       int idx;
+
+       for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+               seq_puts(m, sd_flag_debug[idx].name);
+               seq_puts(m, " ");
+       }
+       seq_puts(m, "\n");
+
+       return 0;
 }
 
-static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
/*
 * debugfs open: i_private carries the pointer to this domain's flags
 * word (set by debugfs_create_file() in register_sd()); pass it through
 * as the seq_file private data for sd_flags_show().
 */
static int sd_flags_open(struct inode *inode, struct file *file)
{
	return single_open(file, sd_flags_show, inode->i_private);
}
 
-static cpumask_var_t           sd_sysctl_cpus;
-static struct ctl_table_header *sd_sysctl_header;
/* Read-only per-domain "flags" file, decoded by sd_flags_show(). */
static const struct file_operations sd_flags_fops = {
	.open		= sd_flags_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
 
-void register_sched_domain_sysctl(void)
/*
 * Populate @parent (one per-domain debugfs directory) with files for the
 * tunable fields of @sd.  SDM() stamps out one debugfs_create_<type>()
 * call per member, naming the file after the struct member itself.
 */
static void register_sd(struct sched_domain *sd, struct dentry *parent)
{
#define SDM(type, mode, member)	\
	debugfs_create_##type(#member, mode, parent, &sd->member)

	SDM(ulong, 0644, min_interval);
	SDM(ulong, 0644, max_interval);
	SDM(u64,   0644, max_newidle_lb_cost);
	SDM(u32,   0644, busy_factor);
	SDM(u32,   0644, imbalance_pct);
	SDM(u32,   0644, cache_nice_tries);
	SDM(str,   0444, name);

#undef SDM

	/* "flags" needs a custom show routine to decode the bitmask. */
	debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
}
 
-               /* deal with sparse possible map */
-               for_each_possible_cpu(i) {
-                       cpu_idx[i] = e;
-                       e++;
-               }
-       }
+void update_sched_domain_debugfs(void)
+{
+       int cpu, i;
 
        if (!cpumask_available(sd_sysctl_cpus)) {
                if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
                        return;
-       }
-
-       if (!init_done) {
-               init_done = true;
-               /* init to possible to not have holes in @cpu_entries */
                cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
        }
 
-       for_each_cpu(i, sd_sysctl_cpus) {
-               struct ctl_table *e = cpu_idx[i];
+       if (!sd_dentry)
+               sd_dentry = debugfs_create_dir("domains", debugfs_sched);
 
-               if (e->child)
-                       sd_free_ctl_entry(&e->child);
+       for_each_cpu(cpu, sd_sysctl_cpus) {
+               struct sched_domain *sd;
+               struct dentry *d_cpu;
+               char buf[32];
 
-               if (!e->procname) {
-                       snprintf(buf, 32, "cpu%d", i);
-                       e->procname = kstrdup(buf, GFP_KERNEL);
+               snprintf(buf, sizeof(buf), "cpu%d", cpu);
+               debugfs_remove(debugfs_lookup(buf, sd_dentry));
+               d_cpu = debugfs_create_dir(buf, sd_dentry);
+
+               i = 0;
+               for_each_domain(cpu, sd) {
+                       struct dentry *d_sd;
+
+                       snprintf(buf, sizeof(buf), "domain%d", i);
+                       d_sd = debugfs_create_dir(buf, d_cpu);
+
+                       register_sd(sd, d_sd);
+                       i++;
                }
-               e->mode = 0555;
-               e->child = sd_alloc_ctl_cpu_table(i);
 
-               __cpumask_clear_cpu(i, sd_sysctl_cpus);
+               __cpumask_clear_cpu(cpu, sd_sysctl_cpus);
        }
-
-       WARN_ON(sd_sysctl_header);
-       sd_sysctl_header = register_sysctl_table(sd_ctl_root);
 }
 
 void dirty_sched_domain_sysctl(int cpu)
@@ -416,13 +427,6 @@ void dirty_sched_domain_sysctl(int cpu)
                __cpumask_set_cpu(cpu, sd_sysctl_cpus);
 }
 
-/* may be called multiple times per register */
-void unregister_sched_domain_sysctl(void)
-{
-       unregister_sysctl_table(sd_sysctl_header);
-       sd_sysctl_header = NULL;
-}
-#endif /* CONFIG_SYSCTL */
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -470,16 +474,37 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 #endif
 
 #ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(sched_debug_lock);
 static char group_path[PATH_MAX];
 
-static char *task_group_path(struct task_group *tg)
/*
 * Write @tg's path into @path (at most @plen bytes).  If autogroup_path()
 * claims the group (returns non-zero), its name is used; otherwise the
 * group's cgroup path is emitted.
 */
static void task_group_path(struct task_group *tg, char *path, int plen)
{
	if (autogroup_path(tg, path, plen))
		return;

	cgroup_path(tg->css.cgroup, path, plen);
}
 
-       return group_path;
/*
 * Only 1 SEQ_printf_task_group_path() caller can use the full length
 * group_path[] for cgroup path. Other simultaneous callers will have
 * to use a shorter stack buffer. A "..." suffix is appended at the end
 * of the stack buffer so that it will show up in case the output length
 * matches the given buffer size to indicate possible path name truncation.
 *
 * NOTE(review): this macro deliberately expands to a bare '{ ... }'
 * block rather than the usual do { ... } while (0): existing call sites
 * invoke it without a trailing semicolon. Keep that in mind when adding
 * call sites inside unbraced if/else arms. The "..." is written at
 * bufend - 1 (buf[124..127]); since %s stops at the first NUL, it is
 * only visible when task_group_path() filled the buffer to its limit.
 */
#define SEQ_printf_task_group_path(m, tg, fmt...)			\
{									\
	if (spin_trylock(&sched_debug_lock)) {				\
		task_group_path(tg, group_path, sizeof(group_path));	\
		SEQ_printf(m, fmt, group_path);				\
		spin_unlock(&sched_debug_lock);				\
	} else {							\
		char buf[128];						\
		char *bufend = buf + sizeof(buf) - 3;			\
		task_group_path(tg, buf, bufend - buf);			\
		strcpy(bufend - 1, "...");				\
		SEQ_printf(m, fmt, buf);				\
	}								\
}
 #endif
 
@@ -506,7 +531,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
        SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
 #endif
 #ifdef CONFIG_CGROUP_SCHED
-       SEQ_printf(m, " %s", task_group_path(task_group(p)));
+       SEQ_printf_task_group_path(m, task_group(p), " %s")
 #endif
 
        SEQ_printf(m, "\n");
@@ -543,7 +568,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        SEQ_printf(m, "\n");
-       SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
+       SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu);
 #else
        SEQ_printf(m, "\n");
        SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
@@ -614,7 +639,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
        SEQ_printf(m, "\n");
-       SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
+       SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu);
 #else
        SEQ_printf(m, "\n");
        SEQ_printf(m, "rt_rq[%d]:\n", cpu);
@@ -666,7 +691,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
 static void print_cpu(struct seq_file *m, int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
 
 #ifdef CONFIG_X86
        {
@@ -717,13 +741,11 @@ do {                                                                      \
        }
 #undef P
 
-       spin_lock_irqsave(&sched_debug_lock, flags);
        print_cfs_stats(m, cpu);
        print_rt_stats(m, cpu);
        print_dl_stats(m, cpu);
 
        print_rq(m, rq, cpu);
-       spin_unlock_irqrestore(&sched_debug_lock, flags);
        SEQ_printf(m, "\n");
 }
 
@@ -815,7 +837,7 @@ void sysrq_sched_debug_show(void)
 }
 
 /*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
  * It returns 1 for the header position.
  * This means 2 is CPU 0.
  * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
@@ -860,18 +882,10 @@ static const struct seq_operations sched_debug_sops = {
        .show           = sched_debug_show,
 };
 
-static int __init init_sched_debug_procfs(void)
-{
-       if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops))
-               return -ENOMEM;
-       return 0;
-}
-
-__initcall(init_sched_debug_procfs);
-
 #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
 #define __P(F) __PS(#F, F)
 #define   P(F) __PS(#F, p->F)
+#define   PM(F, M) __PS(#F, p->F & (M))
 #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
 #define __PN(F) __PSN(#F, F)
 #define   PN(F) __PSN(#F, p->F)
@@ -998,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
        P(se.avg.util_avg);
        P(se.avg.last_update_time);
        P(se.avg.util_est.ewma);
-       P(se.avg.util_est.enqueued);
+       PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
 #endif
 #ifdef CONFIG_UCLAMP_TASK
        __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
@@ -1033,3 +1047,13 @@ void proc_sched_set_task(struct task_struct *p)
        memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 }
+
+void resched_latency_warn(int cpu, u64 latency)
+{
+       static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
+
+       WARN(__ratelimit(&latency_check_ratelimit),
+            "sched: CPU %d need_resched set for > %llu ns (%d ticks) "
+            "without schedule\n",
+            cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+}