perf: Fix list corruption in perf_cgroup_switch()

[linux-2.6-microblaze.git] / kernel / events / core.c
diff --git a/kernel/events/core.c b/kernel/events/core.c

index fc18664..6859229 100644 (file)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -674,6 +674,23 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state)
         WRITE_ONCE(event->state, state);
  }
  
+/*
+ * UP store-release, load-acquire: barrier() before the store, barrier() after the load
+ */
+
+#define __store_release(ptr, val)                                      \
+do {                                                                   \
+       barrier();                                                      \
+       WRITE_ONCE(*(ptr), (val));                                      \
+} while (0)
+
+#define __load_acquire(ptr)                                            \
+({                                                                     \
+       __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));        \
+       barrier();                                                      \
+       ___p;                                                           \
+})
+
  #ifdef CONFIG_CGROUP_PERF
  
  static inline bool
@@ -719,34 +736,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
         return t->time;
  }
  
-static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
  {
-       struct perf_cgroup_info *info;
-       u64 now;
-
-       now = perf_clock();
+       struct perf_cgroup_info *t;
  
-       info = this_cpu_ptr(cgrp->info);
+       t = per_cpu_ptr(event->cgrp->info, event->cpu);
+       if (!__load_acquire(&t->active))
+               return t->time; /* not active: use the stored cgroup time */
+       now += READ_ONCE(t->timeoffset); /* now + (time - timestamp) */
+       return now;
+}
  
-       info->time += now - info->timestamp;
+static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
+{
+       if (adv) /* only advance time when requested; timestamp is always reset */
+               info->time += now - info->timestamp;
         info->timestamp = now;
+       /*
+        * see the timeoffset comment in __update_context_time()
+        */
+       WRITE_ONCE(info->timeoffset, info->time - info->timestamp);
  }
  
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
  {
         struct perf_cgroup *cgrp = cpuctx->cgrp;
         struct cgroup_subsys_state *css;
+       struct perf_cgroup_info *info;
  
         if (cgrp) {
+               u64 now = perf_clock(); /* sampled once for the whole ancestor walk */
+
                 for (css = &cgrp->css; css; css = css->parent) {
                         cgrp = container_of(css, struct perf_cgroup, css);
-                       __update_cgrp_time(cgrp);
+                       info = this_cpu_ptr(cgrp->info);
+
+                       __update_cgrp_time(info, now, true);
+                       if (final)
+                               __store_release(&info->active, 0); /* clear active after the time update */
                 }
         }
  }
  
  static inline void update_cgrp_time_from_event(struct perf_event *event)
  {
+       struct perf_cgroup_info *info;
         struct perf_cgroup *cgrp;
  
         /*
@@ -760,8 +794,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
         /*
          * Do not update time when cgroup is not active
          */
-       if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
-               __update_cgrp_time(event->cgrp);
+       if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) {
+               info = this_cpu_ptr(event->cgrp->info);
+               __update_cgrp_time(info, perf_clock(), true);
+       }
  }
  
  static inline void
@@ -785,7 +821,8 @@ perf_cgroup_set_timestamp(struct task_struct *task,
         for (css = &cgrp->css; css; css = css->parent) {
                 cgrp = container_of(css, struct perf_cgroup, css);
                 info = this_cpu_ptr(cgrp->info);
-               info->timestamp = ctx->timestamp;
+               __update_cgrp_time(info, ctx->timestamp, false);
+               __store_release(&info->active, 1);
         }
  }
  
@@ -802,7 +839,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
   */
  static void perf_cgroup_switch(struct task_struct *task, int mode)
  {
-       struct perf_cpu_context *cpuctx;
+       struct perf_cpu_context *cpuctx, *tmp;
         struct list_head *list;
         unsigned long flags;
  
@@ -813,7 +850,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
         local_irq_save(flags);
  
         list = this_cpu_ptr(&cgrp_cpuctx_list);
-       list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+       list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
  
                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
@@ -981,14 +1018,6 @@ out:
         return ret;
  }
  
-static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
-{
-       struct perf_cgroup_info *t;
-       t = per_cpu_ptr(event->cgrp->info, event->cpu);
-       event->shadow_ctx_time = now - t->timestamp;
-}
-
  static inline void
  perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
  {
@@ -1066,7 +1095,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
  {
  }
  
-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
+                                               bool final)
  {
  }
  
@@ -1098,12 +1128,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
  {
  }
  
-static inline void
-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
  {
+       return 0;
  }
  
-static inline u64 perf_cgroup_event_time(struct perf_event *event)
+static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
  {
         return 0;
  }
@@ -1525,22 +1555,59 @@ static void perf_unpin_context(struct perf_event_context *ctx)
  /*
   * Update the record of the current time in a context.
   */
-static void update_context_time(struct perf_event_context *ctx)
+static void __update_context_time(struct perf_event_context *ctx, bool adv)
  {
         u64 now = perf_clock();
  
-       ctx->time += now - ctx->timestamp;
+       if (adv)
+               ctx->time += now - ctx->timestamp;
         ctx->timestamp = now;
+
+       /*
+        * The above: time' = time + (now - timestamp), can be re-arranged
+        * into: time' = now + (time - timestamp), which gives a single
+        * offset value from which future time can be computed locklessly.
+        *
+        * See perf_event_time_now(), which can be used from NMI context where
+        * it's (obviously) not possible to acquire ctx->lock in order to read
+        * both the above values in a consistent manner.
+        */
+       WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
+}
+
+static void update_context_time(struct perf_event_context *ctx)
+{
+       __update_context_time(ctx, true); /* adv: accumulate elapsed time into ctx->time */
  }
  
  static u64 perf_event_time(struct perf_event *event)
  {
         struct perf_event_context *ctx = event->ctx;
  
+       if (unlikely(!ctx)) /* checked first, so cgroup events without a ctx also yield 0 */
+               return 0;
+
         if (is_cgroup_event(event))
                 return perf_cgroup_event_time(event);
  
-       return ctx ? ctx->time : 0;
+       return ctx->time;
+}
+
+static u64 perf_event_time_now(struct perf_event *event, u64 now)
+{
+       struct perf_event_context *ctx = event->ctx;
+
+       if (unlikely(!ctx))
+               return 0;
+
+       if (is_cgroup_event(event))
+               return perf_cgroup_event_time_now(event, now);
+
+       if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
+               return ctx->time; /* EVENT_TIME clear: return the stored ctx->time */
+
+       now += READ_ONCE(ctx->timeoffset); /* now + (time - timestamp) */
+       return now;
  }
  
  static enum event_type_t get_event_type(struct perf_event *event)
@@ -2350,7 +2417,7 @@ __perf_remove_from_context(struct perf_event *event,
  
         if (ctx->is_active & EVENT_TIME) {
                 update_context_time(ctx);
-               update_cgrp_time_from_cpuctx(cpuctx);
+               update_cgrp_time_from_cpuctx(cpuctx, false);
         }
  
         event_sched_out(event, cpuctx, ctx);
@@ -2361,6 +2428,9 @@ __perf_remove_from_context(struct perf_event *event,
         list_del_event(event, ctx);
  
         if (!ctx->nr_events && ctx->is_active) {
+               if (ctx == &cpuctx->ctx)
+                       update_cgrp_time_from_cpuctx(cpuctx, true);
+
                 ctx->is_active = 0;
                 ctx->rotate_necessary = 0;
                 if (ctx->task) {
@@ -2392,7 +2462,11 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla
          * event_function_call() user.
          */
         raw_spin_lock_irq(&ctx->lock);
-       if (!ctx->is_active) {
+       /*
+        * Cgroup events are per-cpu events, and must IPI because of
+        * cgrp_cpuctx_list.
+        */
+       if (!ctx->is_active && !is_cgroup_event(event)) {
                 __perf_remove_from_context(event, __get_cpu_context(ctx),
                                            ctx, (void *)flags);
                 raw_spin_unlock_irq(&ctx->lock);
@@ -2482,40 +2556,6 @@ void perf_event_disable_inatomic(struct perf_event *event)
         irq_work_queue(&event->pending);
  }
  
-static void perf_set_shadow_time(struct perf_event *event,
-                                struct perf_event_context *ctx)
-{
-       /*
-        * use the correct time source for the time snapshot
-        *
-        * We could get by without this by leveraging the
-        * fact that to get to this function, the caller
-        * has most likely already called update_context_time()
-        * and update_cgrp_time_xx() and thus both timestamp
-        * are identical (or very close). Given that tstamp is,
-        * already adjusted for cgroup, we could say that:
-        *    tstamp - ctx->timestamp
-        * is equivalent to
-        *    tstamp - cgrp->timestamp.
-        *
-        * Then, in perf_output_read(), the calculation would
-        * work with no changes because:
-        * - event is guaranteed scheduled in
-        * - no scheduled out in between
-        * - thus the timestamp would be the same
-        *
-        * But this is a bit hairy.
-        *
-        * So instead, we have an explicit cgroup call to remain
-        * within the time source all along. We believe it
-        * is cleaner and simpler to understand.
-        */
-       if (is_cgroup_event(event))
-               perf_cgroup_set_shadow_time(event, event->tstamp);
-       else
-               event->shadow_ctx_time = event->tstamp - ctx->timestamp;
-}
-
  #define MAX_INTERRUPTS (~0ULL)
  
  static void perf_log_throttle(struct perf_event *event, int enable);
@@ -2556,8 +2596,6 @@ event_sched_in(struct perf_event *event,
  
         perf_pmu_disable(event->pmu);
  
-       perf_set_shadow_time(event, ctx);
-
         perf_log_itrace_start(event);
  
         if (event->pmu->add(event, PERF_EF_START)) {
@@ -2861,11 +2899,14 @@ perf_install_in_context(struct perf_event_context *ctx,
          * perf_event_attr::disabled events will not run and can be initialized
          * without IPI. Except when this is the first event for the context, in
          * that case we need the magic of the IPI to set ctx->is_active.
+        * Similarly, cgroup events for the context also need the IPI to
+        * manipulate the cgrp_cpuctx_list.
          *
          * The IOC_ENABLE that is sure to follow the creation of a disabled
          * event will issue the IPI and reprogram the hardware.
          */
-       if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
+       if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
+           ctx->nr_events && !is_cgroup_event(event)) {
                 raw_spin_lock_irq(&ctx->lock);
                 if (ctx->task == TASK_TOMBSTONE) {
                         raw_spin_unlock_irq(&ctx->lock);
@@ -3197,6 +3238,15 @@ static int perf_event_modify_breakpoint(struct perf_event *bp,
         return err;
  }
  
+/*
+ * Copy event-type-independent attributes that may be modified (only sig_data here).
+ */
+static void perf_event_modify_copy_attr(struct perf_event_attr *to,
+                                       const struct perf_event_attr *from)
+{
+       to->sig_data = from->sig_data;
+}
+
  static int perf_event_modify_attr(struct perf_event *event,
                                   struct perf_event_attr *attr)
  {
@@ -3219,10 +3269,17 @@ static int perf_event_modify_attr(struct perf_event *event,
         WARN_ON_ONCE(event->ctx->parent_ctx);
  
         mutex_lock(&event->child_mutex);
+       /*
+        * Event-type-independent attributes must be copied before event-type
+        * modification, which will validate that final attributes match the
+        * source attributes after all relevant attributes have been copied.
+        */
+       perf_event_modify_copy_attr(&event->attr, attr);
         err = func(event, attr);
         if (err)
                 goto out;
         list_for_each_entry(child, &event->child_list, child_list) {
+               perf_event_modify_copy_attr(&child->attr, attr);
                 err = func(child, attr);
                 if (err)
                         goto out;
@@ -3251,16 +3308,6 @@ static void ctx_sched_out(struct perf_event_context *ctx,
                 return;
         }
  
-       ctx->is_active &= ~event_type;
-       if (!(ctx->is_active & EVENT_ALL))
-               ctx->is_active = 0;
-
-       if (ctx->task) {
-               WARN_ON_ONCE(cpuctx->task_ctx != ctx);
-               if (!ctx->is_active)
-                       cpuctx->task_ctx = NULL;
-       }
-
         /*
          * Always update time if it was set; not only when it changes.
          * Otherwise we can 'forget' to update time for any but the last
@@ -3274,7 +3321,22 @@ static void ctx_sched_out(struct perf_event_context *ctx,
         if (is_active & EVENT_TIME) {
                 /* update (and stop) ctx time */
                 update_context_time(ctx);
-               update_cgrp_time_from_cpuctx(cpuctx);
+               update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
+               /*
+                * CPU-release for the below ->is_active store,
+                * see __load_acquire() in perf_event_time_now()
+                */
+               barrier();
+       }
+
+       ctx->is_active &= ~event_type;
+       if (!(ctx->is_active & EVENT_ALL))
+               ctx->is_active = 0;
+
+       if (ctx->task) {
+               WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+               if (!ctx->is_active)
+                       cpuctx->task_ctx = NULL;
         }
  
         is_active ^= ctx->is_active; /* changed bits */
@@ -3711,13 +3773,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
         return 0;
  }
  
+/*
+ * Because the userpage is strictly per-event (there is no concept of context,
+ * so there cannot be a context indirection), every userpage must be updated
+ * when context time starts :-(
+ *
+ * IOW, we must not miss EVENT_TIME edges.
+ */
  static inline bool event_update_userpage(struct perf_event *event)
  {
         if (likely(!atomic_read(&event->mmap_count)))
                 return false;
  
         perf_event_update_time(event);
-       perf_set_shadow_time(event, event->ctx);
         perf_event_update_userpage(event);
  
         return true;
@@ -3801,13 +3869,23 @@ ctx_sched_in(struct perf_event_context *ctx,
              struct task_struct *task)
  {
         int is_active = ctx->is_active;
-       u64 now;
  
         lockdep_assert_held(&ctx->lock);
  
         if (likely(!ctx->nr_events))
                 return;
  
+       if (is_active ^ EVENT_TIME) {
+               /* start ctx time */
+               __update_context_time(ctx, false);
+               perf_cgroup_set_timestamp(task, ctx);
+               /*
+                * CPU-release for the below ->is_active store,
+                * see __load_acquire() in perf_event_time_now()
+                */
+               barrier();
+       }
+
         ctx->is_active |= (event_type | EVENT_TIME);
         if (ctx->task) {
                 if (!is_active)
@@ -3818,13 +3896,6 @@ ctx_sched_in(struct perf_event_context *ctx,
  
         is_active ^= ctx->is_active; /* changed bits */
  
-       if (is_active & EVENT_TIME) {
-               /* start ctx time */
-               now = perf_clock();
-               ctx->timestamp = now;
-               perf_cgroup_set_timestamp(task, ctx);
-       }
-
         /*
          * First go through the list and put on any pinned groups
          * in order to give them the best chance of going on.
@@ -4418,6 +4489,18 @@ static inline u64 perf_event_count(struct perf_event *event)
         return local64_read(&event->count) + atomic64_read(&event->child_count);
  }
  
+static void calc_timer_values(struct perf_event *event,
+                               u64 *now,
+                               u64 *enabled,
+                               u64 *running)
+{
+       u64 ctx_time;
+
+       *now = perf_clock(); /* clock sample handed back to the caller */
+       ctx_time = perf_event_time_now(event, *now); /* event time evaluated at that same sample */
+       __perf_update_times(event, ctx_time, enabled, running);
+}
+
  /*
   * NMI-safe method to read a local event, that is an event that
   * is:
@@ -4477,10 +4560,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
  
         *value = local64_read(&event->count);
         if (enabled || running) {
-               u64 now = event->shadow_ctx_time + perf_clock();
-               u64 __enabled, __running;
+               u64 __enabled, __running, __now; /* __now: written by calc_timer_values() */
  
-               __perf_update_times(event, now, &__enabled, &__running);
+               calc_timer_values(event, &__now, &__enabled, &__running);
                 if (enabled)
                         *enabled = __enabled;
                 if (running)
@@ -5802,18 +5884,6 @@ static int perf_event_index(struct perf_event *event)
         return event->pmu->event_idx(event);
  }
  
-static void calc_timer_values(struct perf_event *event,
-                               u64 *now,
-                               u64 *enabled,
-                               u64 *running)
-{
-       u64 ctx_time;
-
-       *now = perf_clock();
-       ctx_time = event->shadow_ctx_time + *now;
-       __perf_update_times(event, ctx_time, enabled, running);
-}
-
  static void perf_event_init_userpage(struct perf_event *event)
  {
         struct perf_event_mmap_page *userpg;
@@ -5938,6 +6008,8 @@ static void ring_buffer_attach(struct perf_event *event,
         struct perf_buffer *old_rb = NULL;
         unsigned long flags;
  
+       WARN_ON_ONCE(event->parent);
+
         if (event->rb) {
                 /*
                  * Should be impossible, we set this when removing
@@ -5995,6 +6067,9 @@ static void ring_buffer_wakeup(struct perf_event *event)
  {
         struct perf_buffer *rb;
  
+       if (event->parent)
+               event = event->parent;
+
         rcu_read_lock();
         rb = rcu_dereference(event->rb);
         if (rb) {
@@ -6008,6 +6083,9 @@ struct perf_buffer *ring_buffer_get(struct perf_event *event)
  {
         struct perf_buffer *rb;
  
+       if (event->parent)
+               event = event->parent;
+
         rcu_read_lock();
         rb = rcu_dereference(event->rb);
         if (rb) {
@@ -6353,7 +6431,6 @@ accounting:
                 ring_buffer_attach(event, rb);
  
                 perf_event_update_time(event);
-               perf_set_shadow_time(event, event->ctx);
                 perf_event_init_userpage(event);
                 perf_event_update_userpage(event);
         } else {
@@ -6717,7 +6794,7 @@ static unsigned long perf_prepare_sample_aux(struct perf_event *event,
         if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
                 goto out;
  
-       rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+       rb = ring_buffer_get(sampler);
         if (!rb)
                 goto out;
  
@@ -6783,7 +6860,7 @@ static void perf_aux_sample_output(struct perf_event *event,
         if (WARN_ON_ONCE(!sampler || !data->aux_size))
                 return;
  
-       rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
+       rb = ring_buffer_get(sampler);
         if (!rb)
                 return;