trace/hwlat: Protect kdata->kthread with get/put_online_cpus

[linux-2.6-microblaze.git] / kernel / trace / trace_hwlat.c
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c

index 632ef88..20e31f7 100644 (file)
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -34,7 +34,7 @@
   * Copyright (C) 2008-2009 Jon Masters, Red Hat, Inc. <jcm@redhat.com>
   * Copyright (C) 2013-2016 Steven Rostedt, Red Hat, Inc. <srostedt@redhat.com>
   *
- * Includes useful feedback from Clark Williams <clark@redhat.com>
+ * Includes useful feedback from Clark Williams <williams@redhat.com>
   *
   */
  #include <linux/kthread.h>
@@ -54,20 +54,33 @@ static struct trace_array   *hwlat_trace;
  #define DEFAULT_SAMPLE_WIDTH   500000                  /* 0.5s */
  #define DEFAULT_LAT_THRESHOLD  10                      /* 10us */
  
-/* sampling thread*/
-static struct task_struct *hwlat_kthread;
-
  static struct dentry *hwlat_sample_width;      /* sample width us */
  static struct dentry *hwlat_sample_window;     /* sample window us */
+static struct dentry *hwlat_thread_mode;       /* hwlat thread mode */
+
+/*
+ * hwlatd thread modes, selectable through the "mode" tracefs file:
+ *
+ *  MODE_NONE:        a single hwlatd thread; the scheduler handles migration
+ *  MODE_ROUND_ROBIN: a single hwlatd thread that migrates among the allowed
+ *                    CPUs in a round-robin fashion (the default)
+ *  MODE_PER_CPU:     one hwlatd thread per allowed CPU
+ *  MODE_MAX:         number of modes; not a valid mode itself
+ */
+enum {
+       MODE_NONE = 0,
+       MODE_ROUND_ROBIN,
+       MODE_PER_CPU,
+       MODE_MAX
+};
+/* Mode names as shown by and accepted from the "mode" file, indexed by mode. */
+static char *thread_mode_str[] = { "none", "round-robin", "per-cpu" };
  
  /* Save the previous tracing_thresh value */
  static unsigned long save_tracing_thresh;
  
-/* NMI timestamp counters */
-static u64 nmi_ts_start;
-static u64 nmi_total_ts;
-static int nmi_count;
-static int nmi_cpu;
+/*
+ * Runtime data of one hwlatd sampling thread. A single shared instance
+ * (hwlat_single_cpu_data) is used unless the thread mode is MODE_PER_CPU,
+ * in which case each CPU uses its own instance (hwlat_per_cpu_data).
+ * get_cpu_data() selects the instance.
+ */
+struct hwlat_kthread_data {
+       /* sampling thread; trace_hwlat_callback() returns early while NULL */
+       struct task_struct      *kthread;
+       /* NMI timestamp counters */
+       /* time_get() value taken on NMI entry */
+       u64                     nmi_ts_start;
+       /* time accumulated inside NMIs; zeroed at the start of get_sample() */
+       u64                     nmi_total_ts;
+       /* number of NMI entries; zeroed at the start of get_sample() */
+       int                     nmi_count;
+       /* NOTE(review): no reader or writer of nmi_cpu appears in this patch - confirm it is still needed */
+       int                     nmi_cpu;
+};
+
+/* NOTE(review): neither instance is declared static - confirm nothing outside this file needs them */
+struct hwlat_kthread_data hwlat_single_cpu_data;
+DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data);
  
  /* Tells NMIs to call back to the hwlat tracer to record timestamps */
  bool trace_hwlat_callback_enabled;
@@ -96,11 +109,24 @@ static struct hwlat_data {
         u64     sample_window;          /* total sampling window (on+off) */
         u64     sample_width;           /* active sampling portion of window */
  
+       int     thread_mode;            /* thread mode */
+
  } hwlat_data = {
         .sample_window          = DEFAULT_SAMPLE_WINDOW,
         .sample_width           = DEFAULT_SAMPLE_WIDTH,
+       .thread_mode            = MODE_ROUND_ROBIN
  };
  
+/*
+ * get_cpu_data - return the kthread data that applies to the current context
+ *
+ * In MODE_PER_CPU this is the per-cpu instance of the CPU the caller is
+ * running on; in every other mode it is the shared hwlat_single_cpu_data.
+ */
+static struct hwlat_kthread_data *get_cpu_data(void)
+{
+       if (hwlat_data.thread_mode == MODE_PER_CPU)
+               return this_cpu_ptr(&hwlat_per_cpu_data);
+       else
+               return &hwlat_single_cpu_data;
+}
+
+static bool hwlat_busy;
+
  static void trace_hwlat_sample(struct hwlat_sample *sample)
  {
         struct trace_array *tr = hwlat_trace;
@@ -136,7 +162,9 @@ static void trace_hwlat_sample(struct hwlat_sample *sample)
  
  void trace_hwlat_callback(bool enter)
  {
-       if (smp_processor_id() != nmi_cpu)
+       struct hwlat_kthread_data *kdata = get_cpu_data();
+
+       if (!kdata->kthread)
                 return;
  
         /*
@@ -145,15 +173,24 @@ void trace_hwlat_callback(bool enter)
          */
         if (!IS_ENABLED(CONFIG_GENERIC_SCHED_CLOCK)) {
                 if (enter)
-                       nmi_ts_start = time_get();
+                       kdata->nmi_ts_start = time_get();
                 else
-                       nmi_total_ts += time_get() - nmi_ts_start;
+                       kdata->nmi_total_ts += time_get() - kdata->nmi_ts_start;
         }
  
         if (enter)
-               nmi_count++;
+               kdata->nmi_count++;
  }
  
+/*
+ * hwlat_err - report a hwlat error.
+ *
+ * The message is written to the buffer of the hwlat trace array
+ * (hwlat_trace) through trace_array_printk_buf(), tagged with the
+ * caller's instruction pointer.
+ *
+ * NOTE(review): msg appears to be passed in the format-string position, so
+ * callers should only pass string literals without conversion specifiers -
+ * confirm against trace_array_printk_buf(). hwlat_trace is dereferenced
+ * without a NULL check.
+ */
+#define hwlat_err(msg) ({                                                      \
+       struct trace_array *tr = hwlat_trace;                                   \
+                                                                               \
+       trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg);        \
+})
+
  /**
   * get_sample - sample the CPU TSC and look for likely hardware latencies
   *
@@ -163,6 +200,7 @@ void trace_hwlat_callback(bool enter)
   */
  static int get_sample(void)
  {
+       struct hwlat_kthread_data *kdata = get_cpu_data();
         struct trace_array *tr = hwlat_trace;
         struct hwlat_sample s;
         time_type start, t1, t2, last_t2;
@@ -175,9 +213,8 @@ static int get_sample(void)
  
         do_div(thresh, NSEC_PER_USEC); /* modifies interval value */
  
-       nmi_cpu = smp_processor_id();
-       nmi_total_ts = 0;
-       nmi_count = 0;
+       kdata->nmi_total_ts = 0;
+       kdata->nmi_count = 0;
         /* Make sure NMIs see this first */
         barrier();
  
@@ -197,7 +234,7 @@ static int get_sample(void)
                         outer_diff = time_to_us(time_sub(t1, last_t2));
                         /* This shouldn't happen */
                         if (outer_diff < 0) {
-                               pr_err(BANNER "time running backwards\n");
+                               hwlat_err(BANNER "time running backwards\n");
                                 goto out;
                         }
                         if (outer_diff > outer_sample)
@@ -209,7 +246,7 @@ static int get_sample(void)
  
                 /* Check for possible overflows */
                 if (total < last_total) {
-                       pr_err("Time total overflowed\n");
+                       hwlat_err("Time total overflowed\n");
                         break;
                 }
                 last_total = total;
@@ -225,7 +262,7 @@ static int get_sample(void)
  
                 /* This shouldn't happen */
                 if (diff < 0) {
-                       pr_err(BANNER "time running backwards\n");
+                       hwlat_err(BANNER "time running backwards\n");
                         goto out;
                 }
  
@@ -247,15 +284,15 @@ static int get_sample(void)
                 ret = 1;
  
                 /* We read in microseconds */
-               if (nmi_total_ts)
-                       do_div(nmi_total_ts, NSEC_PER_USEC);
+               if (kdata->nmi_total_ts)
+                       do_div(kdata->nmi_total_ts, NSEC_PER_USEC);
  
                 hwlat_data.count++;
                 s.seqnum = hwlat_data.count;
                 s.duration = sample;
                 s.outer_duration = outer_sample;
-               s.nmi_total_ts = nmi_total_ts;
-               s.nmi_count = nmi_count;
+               s.nmi_total_ts = kdata->nmi_total_ts;
+               s.nmi_count = kdata->nmi_count;
                 s.count = count;
                 trace_hwlat_sample(&s);
  
@@ -273,7 +310,6 @@ out:
  }
  
  static struct cpumask save_cpumask;
-static bool disable_migrate;
  
  static void move_to_next_cpu(void)
  {
@@ -281,15 +317,13 @@ static void move_to_next_cpu(void)
         struct trace_array *tr = hwlat_trace;
         int next_cpu;
  
-       if (disable_migrate)
-               return;
         /*
          * If for some reason the user modifies the CPU affinity
          * of this thread, then stop migrating for the duration
          * of the current test.
          */
         if (!cpumask_equal(current_mask, current->cpus_ptr))
-               goto disable;
+               goto change_mode;
  
         get_online_cpus();
         cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
@@ -300,7 +334,7 @@ static void move_to_next_cpu(void)
                 next_cpu = cpumask_first(current_mask);
  
         if (next_cpu >= nr_cpu_ids) /* Shouldn't happen! */
-               goto disable;
+               goto change_mode;
  
         cpumask_clear(current_mask);
         cpumask_set_cpu(next_cpu, current_mask);
@@ -308,8 +342,9 @@ static void move_to_next_cpu(void)
         sched_setaffinity(0, current_mask);
         return;
  
- disable:
-       disable_migrate = true;
+ change_mode:
+       hwlat_data.thread_mode = MODE_NONE;
+       pr_info(BANNER "cpumask changed while in round-robin mode, switching to mode none\n");
  }
  
  /*
@@ -328,7 +363,8 @@ static int kthread_fn(void *data)
  
         while (!kthread_should_stop()) {
  
-               move_to_next_cpu();
+               if (hwlat_data.thread_mode == MODE_ROUND_ROBIN)
+                       move_to_next_cpu();
  
                 local_irq_disable();
                 get_sample();
@@ -351,178 +387,317 @@ static int kthread_fn(void *data)
         return 0;
  }
  
-/**
- * start_kthread - Kick off the hardware latency sampling/detector kthread
+/*
+ * stop_single_kthread - Inform the hardware latency sampling/detector kthread to stop
+ *
+ * This kicks the running hardware latency sampling/detector kernel thread and
+ * tells it to stop sampling now. Use this on unload and at system shutdown.
+ *
+ * The kthread pointer is read and cleared between get_online_cpus() and
+ * put_online_cpus(). Nothing is stopped when no kthread is recorded.
+ */
+static void stop_single_kthread(void)
+{
+       struct hwlat_kthread_data *kdata = get_cpu_data();
+       struct task_struct *kthread;
+
+       get_online_cpus();
+       kthread = kdata->kthread;
+
+       /* no thread recorded: nothing to stop */
+       if (!kthread)
+               goto out_put_cpus;
+
+       kthread_stop(kthread);
+       /* trace_hwlat_callback() returns early once this is NULL */
+       kdata->kthread = NULL;
+
+out_put_cpus:
+       put_online_cpus();
+}
+
+
+/*
+ * start_single_kthread - Kick off the hardware latency sampling/detector kthread
   *
   * This starts the kernel thread that will sit and sample the CPU timestamp
   * counter (TSC or similar) and look for potential hardware latencies.
   */
-static int start_kthread(struct trace_array *tr)
+static int start_single_kthread(struct trace_array *tr)
  {
+       struct hwlat_kthread_data *kdata = get_cpu_data();
         struct cpumask *current_mask = &save_cpumask;
         struct task_struct *kthread;
         int next_cpu;
  
-       if (hwlat_kthread)
-               return 0;
-
-       /* Just pick the first CPU on first iteration */
         get_online_cpus();
-       cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
-       put_online_cpus();
-       next_cpu = cpumask_first(current_mask);
+       if (kdata->kthread)
+               goto out_put_cpus;
  
         kthread = kthread_create(kthread_fn, NULL, "hwlatd");
         if (IS_ERR(kthread)) {
                 pr_err(BANNER "could not start sampling thread\n");
+               put_online_cpus();
                 return -ENOMEM;
         }
  
-       cpumask_clear(current_mask);
-       cpumask_set_cpu(next_cpu, current_mask);
+       /* Just pick the first CPU on first iteration */
+       cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
+
+       if (hwlat_data.thread_mode == MODE_ROUND_ROBIN) {
+               next_cpu = cpumask_first(current_mask);
+               cpumask_clear(current_mask);
+               cpumask_set_cpu(next_cpu, current_mask);
+
+       }
+
         sched_setaffinity(kthread->pid, current_mask);
  
-       hwlat_kthread = kthread;
+       kdata->kthread = kthread;
         wake_up_process(kthread);
  
+out_put_cpus:
+       put_online_cpus();
         return 0;
  }
  
-/**
- * stop_kthread - Inform the hardware latency sampling/detector kthread to stop
- *
- * This kicks the running hardware latency sampling/detector kernel thread and
- * tells it to stop sampling now. Use this on unload and at system shutdown.
+/*
+ * stop_cpu_kthread - Stop a hwlat cpu kthread
+ * @cpu: CPU whose per-cpu hwlat kthread should be stopped
+ *
+ * Stops the kthread recorded in hwlat_per_cpu_data for @cpu, if any, and
+ * clears the recorded pointer. Clearing it keeps a later stop from calling
+ * kthread_stop() on a stale task and makes trace_hwlat_callback() return
+ * early on this CPU, matching what stop_single_kthread() does.
   */
-static void stop_kthread(void)
+static void stop_cpu_kthread(unsigned int cpu)
  {
-       if (!hwlat_kthread)
-               return;
-       kthread_stop(hwlat_kthread);
-       hwlat_kthread = NULL;
+       struct task_struct *kthread;
+
+       kthread = per_cpu(hwlat_per_cpu_data, cpu).kthread;
+       if (kthread)
+               kthread_stop(kthread);
+       per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL;
  }
  
  /*
- * hwlat_read - Wrapper read function for reading both window and width
- * @filp: The active open file structure
- * @ubuf: The userspace provided buffer to read value into
- * @cnt: The maximum number of bytes to read
- * @ppos: The current "file" position
+ * stop_per_cpu_kthreads - Inform the hardware latency sampling/detector kthreads to stop
   *
- * This function provides a generic read implementation for the global state
- * "hwlat_data" structure filesystem entries.
+ * This kicks the running hardware latency sampling/detector kernel threads and
+ * tells them to stop sampling now. Use this on unload and at system shutdown.
   */
-static ssize_t hwlat_read(struct file *filp, char __user *ubuf,
-                         size_t cnt, loff_t *ppos)
+static void stop_per_cpu_kthreads(void)
  {
-       char buf[U64STR_SIZE];
-       u64 *entry = filp->private_data;
-       u64 val;
-       int len;
+       unsigned int cpu;
  
-       if (!entry)
-               return -EFAULT;
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               stop_cpu_kthread(cpu);
+       put_online_cpus();
+}
  
-       if (cnt > sizeof(buf))
-               cnt = sizeof(buf);
+/*
+ * start_cpu_kthread - Start a hwlat cpu kthread
+ */
+static int start_cpu_kthread(unsigned int cpu)
+{
+       struct task_struct *kthread;
+       char comm[24];
  
-       val = *entry;
+       snprintf(comm, 24, "hwlatd/%d", cpu);
  
-       len = snprintf(buf, sizeof(buf), "%llu\n", val);
+       kthread = kthread_create_on_cpu(kthread_fn, NULL, cpu, comm);
+       if (IS_ERR(kthread)) {
+               pr_err(BANNER "could not start sampling thread\n");
+               return -ENOMEM;
+       }
+
+       per_cpu(hwlat_per_cpu_data, cpu).kthread = kthread;
+       wake_up_process(kthread);
  
-       return simple_read_from_buffer(ubuf, cnt, ppos, buf, len);
+       return 0;
  }
  
-/**
- * hwlat_width_write - Write function for "width" entry
- * @filp: The active open file structure
- * @ubuf: The user buffer that contains the value to write
- * @cnt: The maximum number of bytes to write to "file"
- * @ppos: The current position in @file
+/*
+ * start_per_cpu_kthreads - Kick off the hardware latency sampling/detector kthreads
   *
- * This function provides a write implementation for the "width" interface
- * to the hardware latency detector. It can be used to configure
- * for how many us of the total window us we will actively sample for any
- * hardware-induced latency periods. Obviously, it is not possible to
- * sample constantly and have the system respond to a sample reader, or,
- * worse, without having the system appear to have gone out to lunch. It
- * is enforced that width is less that the total window size.
+ * This starts the kernel threads that will sit on potentially all cpus and
+ * sample the CPU timestamp counter (TSC or similar) and look for potential
+ * hardware latencies.
   */
-static ssize_t
-hwlat_width_write(struct file *filp, const char __user *ubuf,
-                 size_t cnt, loff_t *ppos)
+static int start_per_cpu_kthreads(struct trace_array *tr)
  {
-       u64 val;
-       int err;
+       struct cpumask *current_mask = &save_cpumask;
+       unsigned int cpu;
+       int retval;
  
-       err = kstrtoull_from_user(ubuf, cnt, 10, &val);
-       if (err)
-               return err;
+       get_online_cpus();
+       /*
+        * Run only on CPUs on which hwlat is allowed to run.
+        */
+       cpumask_and(current_mask, cpu_online_mask, tr->tracing_cpumask);
+
+       for_each_online_cpu(cpu)
+               per_cpu(hwlat_per_cpu_data, cpu).kthread = NULL;
+
+       for_each_cpu(cpu, current_mask) {
+               retval = start_cpu_kthread(cpu);
+               if (retval)
+                       goto out_error;
+       }
+       put_online_cpus();
+
+       return 0;
+
+out_error:
+       put_online_cpus();
+       stop_per_cpu_kthreads();
+       return retval;
+}
+
+static void *s_mode_start(struct seq_file *s, loff_t *pos)
+{
+       int mode = *pos;
  
         mutex_lock(&hwlat_data.lock);
-       if (val < hwlat_data.sample_window)
-               hwlat_data.sample_width = val;
+
+       if (mode >= MODE_MAX)
+               return NULL;
+
+       return pos;
+}
+
+/*
+ * s_mode_next - seq_file ->next callback for the "mode" file
+ *
+ * Advances *pos to the next mode index. Returns pos while the index is
+ * below MODE_MAX, or NULL once every mode has been visited.
+ */
+static void *s_mode_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       int mode = ++(*pos);
+
+       if (mode >= MODE_MAX)
+               return NULL;
+
+       return pos;
+}
+
+static int s_mode_show(struct seq_file *s, void *v)
+{
+       loff_t *pos = v;
+       int mode = *pos;
+
+       if (mode == hwlat_data.thread_mode)
+               seq_printf(s, "[%s]", thread_mode_str[mode]);
         else
-               err = -EINVAL;
-       mutex_unlock(&hwlat_data.lock);
+               seq_printf(s, "%s", thread_mode_str[mode]);
  
-       if (err)
-               return err;
+       if (mode != MODE_MAX)
+               seq_puts(s, " ");
  
-       return cnt;
+       return 0;
  }
  
+/*
+ * s_mode_stop - seq_file ->stop callback for the "mode" file
+ *
+ * Emits a newline and releases hwlat_data.lock, which s_mode_start() takes.
+ */
+static void s_mode_stop(struct seq_file *s, void *v)
+{
+       seq_puts(s, "\n");
+       mutex_unlock(&hwlat_data.lock);
+}
+
+/* seq_file iterator over the thread modes; installed by hwlat_mode_open() */
+static const struct seq_operations thread_mode_seq_ops = {
+       .start          = s_mode_start,
+       .next           = s_mode_next,
+       .show           = s_mode_show,
+       .stop           = s_mode_stop
+};
+
+/*
+ * hwlat_mode_open - open callback of the "mode" file
+ *
+ * Sets the file up as a seq_file driven by thread_mode_seq_ops and returns
+ * the result of seq_open().
+ */
+static int hwlat_mode_open(struct inode *inode, struct file *file)
+{
+       return seq_open(file, &thread_mode_seq_ops);
+}
+
+static void hwlat_tracer_start(struct trace_array *tr);
+static void hwlat_tracer_stop(struct trace_array *tr);
+
  /**
- * hwlat_window_write - Write function for "window" entry
+ * hwlat_mode_write - Write function for "mode" entry
   * @filp: The active open file structure
   * @ubuf: The user buffer that contains the value to write
   * @cnt: The maximum number of bytes to write to "file"
   * @ppos: The current position in @file
   *
- * This function provides a write implementation for the "window" interface
- * to the hardware latency detector. The window is the total time
- * in us that will be considered one sample period. Conceptually, windows
- * occur back-to-back and contain a sample width period during which
- * actual sampling occurs. Can be used to write a new total window size. It
- * is enforced that any value written must be greater than the sample width
- * size, or an error results.
+ * This function provides a write implementation for the "mode" interface
+ * to the hardware latency detector. hwlatd has different operation modes.
+ * The "none" mode sets the allowed cpumask for a single hwlatd thread at
+ * startup and lets the scheduler handle the migration. The default mode is
+ * the "round-robin" one, in which a single hwlatd thread runs, migrating
+ * among the allowed CPUs in a round-robin fashion. The "per-cpu" mode
+ * creates one hwlatd thread per allowed CPU.
   */
-static ssize_t
-hwlat_window_write(struct file *filp, const char __user *ubuf,
-                  size_t cnt, loff_t *ppos)
+static ssize_t hwlat_mode_write(struct file *filp, const char __user *ubuf,
+                                size_t cnt, loff_t *ppos)
  {
-       u64 val;
-       int err;
+       struct trace_array *tr = hwlat_trace;
+       const char *mode;
+       char buf[64];
+       int ret, i;
  
-       err = kstrtoull_from_user(ubuf, cnt, 10, &val);
-       if (err)
-               return err;
+       if (cnt >= sizeof(buf))
+               return -EINVAL;
+
+       if (copy_from_user(buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       mode = strstrip(buf);
+
+       ret = -EINVAL;
+
+       /*
+        * trace_types_lock is taken to avoid concurrency on start/stop
+        * and hwlat_busy.
+        */
+       mutex_lock(&trace_types_lock);
+       if (hwlat_busy)
+               hwlat_tracer_stop(tr);
  
         mutex_lock(&hwlat_data.lock);
-       if (hwlat_data.sample_width < val)
-               hwlat_data.sample_window = val;
-       else
-               err = -EINVAL;
+
+       for (i = 0; i < MODE_MAX; i++) {
+               if (strcmp(mode, thread_mode_str[i]) == 0) {
+                       hwlat_data.thread_mode = i;
+                       ret = cnt;
+               }
+       }
+
         mutex_unlock(&hwlat_data.lock);
  
-       if (err)
-               return err;
+       if (hwlat_busy)
+               hwlat_tracer_start(tr);
+       mutex_unlock(&trace_types_lock);
+
+       *ppos += cnt;
  
-       return cnt;
+
+
+       return ret;
  }
  
-static const struct file_operations width_fops = {
-       .open           = tracing_open_generic,
-       .read           = hwlat_read,
-       .write          = hwlat_width_write,
+/*
+ * The width parameter is read/write using the generic trace_min_max_param
+ * method. The *val is protected by the hwlat_data lock and is upper
+ * bounded by the window parameter.
+ */
+static struct trace_min_max_param hwlat_width = {
+       .lock           = &hwlat_data.lock,
+       .val            = &hwlat_data.sample_width,
+       .max            = &hwlat_data.sample_window,
+       .min            = NULL,
  };
  
-static const struct file_operations window_fops = {
-       .open           = tracing_open_generic,
-       .read           = hwlat_read,
-       .write          = hwlat_window_write,
+/*
+ * The window parameter is read/write using the generic trace_min_max_param
+ * method. The *val is protected by the hwlat_data lock and is lower
+ * bounded by the width parameter.
+ */
+static struct trace_min_max_param hwlat_window = {
+       .lock           = &hwlat_data.lock,
+       .val            = &hwlat_data.sample_window,
+       .max            = NULL,
+       .min            = &hwlat_data.sample_width,
  };
  
+/*
+ * File operations of the "mode" file: reads go through the seq_file helpers
+ * set up by hwlat_mode_open(), writes are handled by hwlat_mode_write().
+ */
+static const struct file_operations thread_mode_fops = {
+       .open           = hwlat_mode_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release,
+       .write          = hwlat_mode_write
+};
  /**
   * init_tracefs - A function to initialize the tracefs interface files
   *
@@ -546,18 +721,25 @@ static int init_tracefs(void)
  
         hwlat_sample_window = tracefs_create_file("window", 0640,
                                                   top_dir,
-                                                 &hwlat_data.sample_window,
-                                                 &window_fops);
+                                                 &hwlat_window,
+                                                 &trace_min_max_fops);
         if (!hwlat_sample_window)
                 goto err;
  
         hwlat_sample_width = tracefs_create_file("width", 0644,
                                                  top_dir,
-                                                &hwlat_data.sample_width,
-                                                &width_fops);
+                                                &hwlat_width,
+                                                &trace_min_max_fops);
         if (!hwlat_sample_width)
                 goto err;
  
+       hwlat_thread_mode = trace_create_file("mode", 0644,
+                                             top_dir,
+                                             NULL,
+                                             &thread_mode_fops);
+       if (!hwlat_thread_mode)
+               goto err;
+
         return 0;
  
   err:
@@ -569,18 +751,22 @@ static void hwlat_tracer_start(struct trace_array *tr)
  {
         int err;
  
-       err = start_kthread(tr);
+       if (hwlat_data.thread_mode == MODE_PER_CPU)
+               err = start_per_cpu_kthreads(tr);
+       else
+               err = start_single_kthread(tr);
         if (err)
                 pr_err(BANNER "Cannot start hwlat kthread\n");
  }
  
  static void hwlat_tracer_stop(struct trace_array *tr)
  {
-       stop_kthread();
+       if (hwlat_data.thread_mode == MODE_PER_CPU)
+               stop_per_cpu_kthreads();
+       else
+               stop_single_kthread();
  }
  
-static bool hwlat_busy;
-
  static int hwlat_tracer_init(struct trace_array *tr)
  {
         /* Only allow one instance to enable this */
@@ -589,7 +775,6 @@ static int hwlat_tracer_init(struct trace_array *tr)
  
         hwlat_trace = tr;
  
-       disable_migrate = false;
         hwlat_data.count = 0;
         tr->max_latency = 0;
         save_tracing_thresh = tracing_thresh;
@@ -608,7 +793,7 @@ static int hwlat_tracer_init(struct trace_array *tr)
  
  static void hwlat_tracer_reset(struct trace_array *tr)
  {
-       stop_kthread();
+       hwlat_tracer_stop(tr);
  
         /* the tracing threshold is static between runs */
         last_tracing_thresh = tracing_thresh;