Merge branch 'perf/urgent' into perf/core, to pick up fixes
author     Ingo Molnar <mingo@kernel.org>
           Sat, 24 Mar 2018 08:21:47 +0000 (09:21 +0100)
committer  Ingo Molnar <mingo@kernel.org>
           Sat, 24 Mar 2018 08:21:47 +0000 (09:21 +0100)
With the cherry-picked perf/urgent commit merged separately we can now
merge all the fixes without conflicts.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/events/core.c
arch/x86/events/intel/core.c
arch/x86/events/intel/ds.c
arch/x86/events/perf_event.h
kernel/events/core.c

diff --combined arch/x86/events/core.c
@@@ -990,7 -990,7 +990,7 @@@ static int collect_events(struct cpu_hw
        if (!dogrp)
                return n;
  
 -      list_for_each_entry(event, &leader->sibling_list, group_entry) {
 +      for_each_sibling_event(event, leader) {
                if (!is_x86_event(event) ||
                    event->state <= PERF_EVENT_STATE_OFF)
                        continue;
@@@ -1156,13 -1156,16 +1156,13 @@@ int x86_perf_event_set_period(struct pe
  
        per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
  
 -      if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
 -          local64_read(&hwc->prev_count) != (u64)-left) {
 -              /*
 -               * The hw event starts counting from this event offset,
 -               * mark it to be able to extra future deltas:
 -               */
 -              local64_set(&hwc->prev_count, (u64)-left);
 +      /*
 +       * The hw event starts counting from this event offset,
 +       * mark it to be able to extract future deltas:
 +       */
 +      local64_set(&hwc->prev_count, (u64)-left);
  
 -              wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
 -      }
 +      wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
  
        /*
         * Due to erratum on certain cpu we need
@@@ -1881,8 -1884,6 +1881,8 @@@ early_initcall(init_hw_perf_events)
  
  static inline void x86_pmu_read(struct perf_event *event)
  {
 +      if (x86_pmu.read)
 +              return x86_pmu.read(event);
        x86_perf_event_update(event);
  }
  
@@@ -2118,7 -2119,7 +2118,7 @@@ static int x86_pmu_event_init(struct pe
        }
  
        if (READ_ONCE(x86_pmu.attr_rdpmc) &&
-           !(event->hw.flags & PERF_X86_EVENT_FREERUNNING))
+           !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
                event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
  
        return err;
diff --combined arch/x86/events/intel/core.c
@@@ -2060,14 -2060,6 +2060,14 @@@ static void intel_pmu_del_event(struct 
                intel_pmu_pebs_del(event);
  }
  
 +static void intel_pmu_read_event(struct perf_event *event)
 +{
 +      if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
 +              intel_pmu_auto_reload_read(event);
 +      else
 +              x86_perf_event_update(event);
 +}
 +
  static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
  {
        int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
@@@ -2209,15 -2201,9 +2209,15 @@@ static int intel_pmu_handle_irq(struct 
        int bit, loops;
        u64 status;
        int handled;
 +      int pmu_enabled;
  
        cpuc = this_cpu_ptr(&cpu_hw_events);
  
 +      /*
 +       * Save the PMU state.
 +       * It needs to be restored when leaving the handler.
 +       */
 +      pmu_enabled = cpuc->enabled;
        /*
         * No known reason to not always do late ACK,
         * but just in case do it opt-in.
        if (!x86_pmu.late_ack)
                apic_write(APIC_LVTPC, APIC_DM_NMI);
        intel_bts_disable_local();
 +      cpuc->enabled = 0;
        __intel_pmu_disable_all();
        handled = intel_pmu_drain_bts_buffer();
        handled += intel_bts_interrupt();
@@@ -2335,8 -2320,7 +2335,8 @@@ again
  
  done:
        /* Only restore PMU state when it's active. See x86_pmu_disable(). */
 -      if (cpuc->enabled)
 +      cpuc->enabled = pmu_enabled;
 +      if (pmu_enabled)
                __intel_pmu_enable_all(0, true);
        intel_bts_enable_local();
  
@@@ -2968,9 -2952,9 +2968,9 @@@ static void intel_pebs_aliases_skl(stru
        return intel_pebs_aliases_precdist(event);
  }
  
- static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
+ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event)
  {
-       unsigned long flags = x86_pmu.free_running_flags;
+       unsigned long flags = x86_pmu.large_pebs_flags;
  
        if (event->attr.use_clockid)
                flags &= ~PERF_SAMPLE_TIME;
@@@ -2992,8 -2976,8 +2992,8 @@@ static int intel_pmu_hw_config(struct p
                if (!event->attr.freq) {
                        event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
                        if (!(event->attr.sample_type &
-                             ~intel_pmu_free_running_flags(event)))
-                               event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
+                             ~intel_pmu_large_pebs_flags(event)))
+                               event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
                }
                if (x86_pmu.pebs_aliases)
                        x86_pmu.pebs_aliases(event);
@@@ -3204,13 -3188,13 +3204,13 @@@ glp_get_event_constraints(struct cpu_hw
   * Therefore the effective (average) period matches the requested period,
   * despite coarser hardware granularity.
   */
 -static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
 +static u64 bdw_limit_period(struct perf_event *event, u64 left)
  {
        if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
                        X86_CONFIG(.event=0xc0, .umask=0x01)) {
                if (left < 128)
                        left = 128;
-               left &= ~0x3fu;
+               left &= ~0x3fULL;
        }
        return left;
  }
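
The effect of this workaround is easiest to see with concrete numbers. Below is a minimal user-space sketch of the same rounding, with made-up sample periods (an illustration, not kernel code): the period is clamped to at least 128 and then rounded down to a multiple of 64.

    #include <stdio.h>
    #include <stdint.h>

    /* Same rounding as bdw_limit_period(): clamp to >= 128, clear the low 6 bits. */
    static uint64_t bdw_limit(uint64_t left)
    {
        if (left < 128)
            left = 128;
        return left & ~0x3fULL;
    }

    int main(void)
    {
        uint64_t periods[] = { 100, 128, 200, 1000003 };

        for (unsigned int i = 0; i < sizeof(periods) / sizeof(periods[0]); i++)
            printf("requested %llu -> programmed %llu\n",
                   (unsigned long long)periods[i],
                   (unsigned long long)bdw_limit(periods[i]));
        return 0;
    }

This prints 128, 128, 192 and 1000000 respectively, so the programmed period stays close to the requested one on average despite the coarser granularity.
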
@@@ -3476,7 -3460,7 +3476,7 @@@ static __initconst const struct x86_pm
        .event_map              = intel_pmu_event_map,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
        .apic                   = 1,
-       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
+       .large_pebs_flags       = LARGE_PEBS_FLAGS,
  
        /*
         * Intel PMCs cannot be accessed sanely above 32-bit width,
@@@ -3511,7 -3495,6 +3511,7 @@@ static __initconst const struct x86_pm
        .disable                = intel_pmu_disable_event,
        .add                    = intel_pmu_add_event,
        .del                    = intel_pmu_del_event,
 +      .read                   = intel_pmu_read_event,
        .hw_config              = intel_pmu_hw_config,
        .schedule_events        = x86_schedule_events,
        .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
        .event_map              = intel_pmu_event_map,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
        .apic                   = 1,
-       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
+       .large_pebs_flags       = LARGE_PEBS_FLAGS,
        /*
         * Intel PMCs cannot be accessed sanely above 32 bit width,
         * so we install an artificial 1<<31 period regardless of
diff --combined arch/x86/events/intel/ds.c
@@@ -935,7 -935,7 +935,7 @@@ void intel_pmu_pebs_add(struct perf_eve
        bool needed_cb = pebs_needs_sched_cb(cpuc);
  
        cpuc->n_pebs++;
-       if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+       if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
                cpuc->n_large_pebs++;
  
        pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
@@@ -975,7 -975,7 +975,7 @@@ void intel_pmu_pebs_del(struct perf_eve
        bool needed_cb = pebs_needs_sched_cb(cpuc);
  
        cpuc->n_pebs--;
-       if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+       if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
                cpuc->n_large_pebs--;
  
        pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
@@@ -1306,93 -1306,17 +1306,93 @@@ get_next_pebs_record_by_bit(void *base
        return NULL;
  }
  
 +void intel_pmu_auto_reload_read(struct perf_event *event)
 +{
 +      WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
 +
 +      perf_pmu_disable(event->pmu);
 +      intel_pmu_drain_pebs_buffer();
 +      perf_pmu_enable(event->pmu);
 +}
 +
 +/*
 + * Special variant of intel_pmu_save_and_restart() for auto-reload.
 + */
 +static int
 +intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
 +{
 +      struct hw_perf_event *hwc = &event->hw;
 +      int shift = 64 - x86_pmu.cntval_bits;
 +      u64 period = hwc->sample_period;
 +      u64 prev_raw_count, new_raw_count;
 +      s64 new, old;
 +
 +      WARN_ON(!period);
 +
 +      /*
 +       * drain_pebs() only happens when the PMU is disabled.
 +       */
 +      WARN_ON(this_cpu_read(cpu_hw_events.enabled));
 +
 +      prev_raw_count = local64_read(&hwc->prev_count);
 +      rdpmcl(hwc->event_base_rdpmc, new_raw_count);
 +      local64_set(&hwc->prev_count, new_raw_count);
 +
 +      /*
 +       * Since the counter increments a negative counter value and
 +       * overflows on the sign switch, giving the interval:
 +       *
 +       *   [-period, 0]
 +       *
 +       * the difference between two consecutive reads is:
 +       *
 +       *   A) value2 - value1;
 +       *      when no overflows have happened in between,
 +       *
 +       *   B) (0 - value1) + (value2 - (-period));
 +       *      when one overflow happened in between,
 +       *
 +       *   C) (0 - value1) + (n - 1) * (period) + (value2 - (-period));
 +       *      when @n overflows happened in between.
 +       *
 +       * Here A) is the obvious difference, B) is the extension to the
 +       * discrete interval, where the first term is to the top of the
 +       * interval and the second term is from the bottom of the next
 +       * interval and C) the extension to multiple intervals, where the
 +       * middle term is the whole intervals covered.
 +       *
 +       * An equivalent of C, by reduction, is:
 +       *
 +       *   value2 - value1 + n * period
 +       */
 +      new = ((s64)(new_raw_count << shift) >> shift);
 +      old = ((s64)(prev_raw_count << shift) >> shift);
 +      local64_add(new - old + count * period, &event->count);
 +
 +      perf_event_update_userpage(event);
 +
 +      return 0;
 +}
 +
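
The case analysis above collapses into the single expression used at the end of the function. A stand-alone sketch of that arithmetic with made-up numbers follows (the 48-bit counter width and the sample values are assumptions for illustration):

    #include <stdio.h>
    #include <stdint.h>

    #define CNTVAL_BITS 48              /* assumed counter width */

    /* Sign-extend a raw counter value, like the shift pair in the kernel code. */
    static int64_t sext(uint64_t raw)
    {
        int shift = 64 - CNTVAL_BITS;
        return (int64_t)(raw << shift) >> shift;
    }

    int main(void)
    {
        const uint64_t mask = (1ULL << CNTVAL_BITS) - 1;
        int64_t period = 1000000;
        /* Counter counts up from -period and is reloaded on overflow. */
        uint64_t prev_raw = (uint64_t)-900000LL & mask;   /* value1 */
        uint64_t new_raw  = (uint64_t)-250000LL & mask;   /* value2 */
        int64_t records = 3;    /* drained PEBS records == overflows in between */

        /* value2 - value1 + n * period */
        int64_t delta = sext(new_raw) - sext(prev_raw) + records * period;
        printf("event->count grows by %lld\n", (long long)delta);
        return 0;
    }

With these numbers the formula gives 3650000: 900000 to finish the first interval, two whole periods, and 750000 into the last one.
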
  static void __intel_pmu_pebs_event(struct perf_event *event,
                                   struct pt_regs *iregs,
                                   void *base, void *top,
                                   int bit, int count)
  {
 +      struct hw_perf_event *hwc = &event->hw;
        struct perf_sample_data data;
        struct pt_regs regs;
        void *at = get_next_pebs_record_by_bit(base, top, bit);
  
 -      if (!intel_pmu_save_and_restart(event) &&
 -          !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
 +      if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
 +              /*
 +               * For now, auto-reload is only enabled in fixed period mode.
 +               * The reload value is always hwc->sample_period.
 +               * This may need to change if auto-reload is enabled in
 +               * freq mode later.
 +               */
 +              intel_pmu_save_and_restart_reload(event, count);
 +      } else if (!intel_pmu_save_and_restart(event))
                return;
  
        while (count > 1) {
@@@ -1444,11 -1368,8 +1444,11 @@@ static void intel_pmu_drain_pebs_core(s
                return;
  
        n = top - at;
 -      if (n <= 0)
 +      if (n <= 0) {
 +              if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
 +                      intel_pmu_save_and_restart_reload(event, 0);
                return;
 +      }
  
        __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
  }
@@@ -1471,22 -1392,8 +1471,22 @@@ static void intel_pmu_drain_pebs_nhm(st
  
        ds->pebs_index = ds->pebs_buffer_base;
  
 -      if (unlikely(base >= top))
 +      if (unlikely(base >= top)) {
 +              /*
 +               * drain_pebs() can be called twice in a short period for an
 +               * auto-reload event in pmu::read(), with no overflows having
 +               * happened in between.
 +               * intel_pmu_save_and_restart_reload() still needs to be called
 +               * to update event->count for this case.
 +               */
 +              for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled,
 +                               x86_pmu.max_pebs_events) {
 +                      event = cpuc->events[bit];
 +                      if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
 +                              intel_pmu_save_and_restart_reload(event, 0);
 +              }
                return;
 +      }
  
        for (at = base; at < top; at += x86_pmu.pebs_record_size) {
                struct pebs_record_nhm *p = at;
@@@ -1623,7 -1530,7 +1623,7 @@@ void __init intel_ds_init(void
                        x86_pmu.pebs_record_size =
                                                sizeof(struct pebs_record_skl);
                        x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-                       x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
+                       x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
                        break;
  
                default:
diff --combined arch/x86/events/perf_event.h
@@@ -69,7 -69,7 +69,7 @@@ struct event_constraint 
  #define PERF_X86_EVENT_RDPMC_ALLOWED  0x0100 /* grant rdpmc permission */
  #define PERF_X86_EVENT_EXCL_ACCT      0x0200 /* accounted EXCL event */
  #define PERF_X86_EVENT_AUTO_RELOAD    0x0400 /* use PEBS auto-reload */
- #define PERF_X86_EVENT_FREERUNNING    0x0800 /* use freerunning PEBS */
+ #define PERF_X86_EVENT_LARGE_PEBS     0x0800 /* use large PEBS */
  
  
  struct amd_nb {
@@@ -88,7 -88,7 +88,7 @@@
   * REGS_USER can be handled for events limited to ring 3.
   *
   */
- #define PEBS_FREERUNNING_FLAGS \
+ #define LARGE_PEBS_FLAGS \
        (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
        PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
        PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
@@@ -520,7 -520,6 +520,7 @@@ struct x86_pmu 
        void            (*disable)(struct perf_event *);
        void            (*add)(struct perf_event *);
        void            (*del)(struct perf_event *);
 +      void            (*read)(struct perf_event *event);
        int             (*hw_config)(struct perf_event *event);
        int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
        unsigned        eventsel;
        struct x86_pmu_quirk *quirks;
        int             perfctr_second_write;
        bool            late_ack;
 -      unsigned        (*limit_period)(struct perf_event *event, unsigned l);
 +      u64             (*limit_period)(struct perf_event *event, u64 l);
  
        /*
         * sysfs attrs
        struct event_constraint *pebs_constraints;
        void            (*pebs_aliases)(struct perf_event *event);
        int             max_pebs_events;
-       unsigned long   free_running_flags;
+       unsigned long   large_pebs_flags;
  
        /*
         * Intel LBR
@@@ -924,8 -923,6 +924,8 @@@ void intel_pmu_pebs_disable_all(void)
  
  void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
  
 +void intel_pmu_auto_reload_read(struct perf_event *event);
 +
  void intel_ds_init(void);
  
  void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
diff --combined kernel/events/core.c
@@@ -430,7 -430,7 +430,7 @@@ static void update_perf_cpu_limits(void
        WRITE_ONCE(perf_sample_allowed_ns, tmp);
  }
  
 -static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 +static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
  
  int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
@@@ -643,7 -643,7 +643,7 @@@ static void perf_event_update_sibling_t
  {
        struct perf_event *sibling;
  
 -      list_for_each_entry(sibling, &leader->sibling_list, group_entry)
 +      for_each_sibling_event(sibling, leader)
                perf_event_update_time(sibling);
  }
  
@@@ -724,9 -724,15 +724,15 @@@ static inline void __update_cgrp_time(s
  
  static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
  {
-       struct perf_cgroup *cgrp_out = cpuctx->cgrp;
-       if (cgrp_out)
-               __update_cgrp_time(cgrp_out);
+       struct perf_cgroup *cgrp = cpuctx->cgrp;
+       struct cgroup_subsys_state *css;
+       if (cgrp) {
+               for (css = &cgrp->css; css; css = css->parent) {
+                       cgrp = container_of(css, struct perf_cgroup, css);
+                       __update_cgrp_time(cgrp);
+               }
+       }
  }
  
  static inline void update_cgrp_time_from_event(struct perf_event *event)
@@@ -754,6 -760,7 +760,7 @@@ perf_cgroup_set_timestamp(struct task_s
  {
        struct perf_cgroup *cgrp;
        struct perf_cgroup_info *info;
+       struct cgroup_subsys_state *css;
  
        /*
         * ctx->lock held by caller
                return;
  
        cgrp = perf_cgroup_from_task(task, ctx);
-       info = this_cpu_ptr(cgrp->info);
-       info->timestamp = ctx->timestamp;
+       for (css = &cgrp->css; css; css = css->parent) {
+               cgrp = container_of(css, struct perf_cgroup, css);
+               info = this_cpu_ptr(cgrp->info);
+               info->timestamp = ctx->timestamp;
+       }
  }
  
  static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
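
Both hunks above turn a single-cgroup time update into a walk over the whole ancestor chain, so a cgroup's time also accounts for activity in its descendants. A toy stand-alone model of that walk (simplified structs, not the kernel's perf_cgroup/css types):

    #include <stdio.h>
    #include <stdint.h>

    /* Toy stand-ins for perf_cgroup/css: each node accumulates time. */
    struct node {
        struct node *parent;
        uint64_t time;
    };

    /* Charge @delta to a cgroup and every ancestor, as the hunks now do. */
    static void charge_hierarchy(struct node *n, uint64_t delta)
    {
        for (; n; n = n->parent)
            n->time += delta;
    }

    int main(void)
    {
        struct node root = { NULL, 0 };
        struct node mid  = { &root, 0 };
        struct node leaf = { &mid, 0 };

        charge_hierarchy(&leaf, 100);   /* an event ran 100ns in the leaf cgroup */
        printf("leaf=%llu mid=%llu root=%llu\n",
               (unsigned long long)leaf.time,
               (unsigned long long)mid.time,
               (unsigned long long)root.time);
        return 0;
    }
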
@@@ -937,39 -948,27 +948,39 @@@ list_update_cgroup_event(struct perf_ev
        if (!is_cgroup_event(event))
                return;
  
 -      if (add && ctx->nr_cgroups++)
 -              return;
 -      else if (!add && --ctx->nr_cgroups)
 -              return;
        /*
         * Because cgroup events are always per-cpu events,
         * this will always be called from the right CPU.
         */
        cpuctx = __get_cpu_context(ctx);
 -      cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
 -      /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
 -      if (add) {
 +
 +      /*
 +       * Since setting cpuctx->cgrp is conditional on the current @cgrp
 +       * matching the event's cgroup, we must do this for every new event,
 +       * because if the first would mismatch, the second would not try again
 +       * and we would leave cpuctx->cgrp unset.
 +       */
 +      if (add && !cpuctx->cgrp) {
                struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
  
 -              list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
                if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
                        cpuctx->cgrp = cgrp;
 -      } else {
 -              list_del(cpuctx_entry);
 -              cpuctx->cgrp = NULL;
        }
 +
 +      if (add && ctx->nr_cgroups++)
 +              return;
 +      else if (!add && --ctx->nr_cgroups)
 +              return;
 +
 +      /* no cgroup running */
 +      if (!add)
 +              cpuctx->cgrp = NULL;
 +
 +      cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
 +      if (add)
 +              list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
 +      else
 +              list_del(cpuctx_entry);
  }
  
  #else /* !CONFIG_CGROUP_PERF */
@@@ -1053,7 -1052,7 +1064,7 @@@ list_update_cgroup_event(struct perf_ev
  static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
  {
        struct perf_cpu_context *cpuctx;
 -      int rotations = 0;
 +      bool rotations;
  
        lockdep_assert_irqs_disabled();
  
@@@ -1472,21 -1471,8 +1483,21 @@@ static enum event_type_t get_event_type
        return event_type;
  }
  
 -static struct list_head *
 -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 +/*
 + * Helper function to initialize event group nodes.
 + */
 +static void init_event_group(struct perf_event *event)
 +{
 +      RB_CLEAR_NODE(&event->group_node);
 +      event->group_index = 0;
 +}
 +
 +/*
 + * Extract pinned or flexible groups from the context
 + * based on event attrs bits.
 + */
 +static struct perf_event_groups *
 +get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
  {
        if (event->attr.pinned)
                return &ctx->pinned_groups;
                return &ctx->flexible_groups;
  }
  
 +/*
 + * Helper function to initialize perf_event_group trees.
 + */
 +static void perf_event_groups_init(struct perf_event_groups *groups)
 +{
 +      groups->tree = RB_ROOT;
 +      groups->index = 0;
 +}
 +
 +/*
 + * Compare function for event groups;
 + *
 + * Implements a composite key that sorts first by CPU and then by a virtual
 + * index, which provides ordering when rotating groups for the same CPU.
 + */
 +static bool
 +perf_event_groups_less(struct perf_event *left, struct perf_event *right)
 +{
 +      if (left->cpu < right->cpu)
 +              return true;
 +      if (left->cpu > right->cpu)
 +              return false;
 +
 +      if (left->group_index < right->group_index)
 +              return true;
 +      if (left->group_index > right->group_index)
 +              return false;
 +
 +      return false;
 +}
 +
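
The ordering this comparator defines can be seen in isolation. Here is a small user-space sketch that sorts a few {cpu, group_index} pairs with the same rule; the values are made up, and cpu == -1 stands for per-task (any-CPU) events, which visit_groups_merge() below relies on sorting first:

    #include <stdio.h>
    #include <stdlib.h>

    /* Mirror of the composite key: primary sort on cpu, secondary on index. */
    struct key { int cpu; unsigned long group_index; };

    static int key_cmp(const void *a, const void *b)
    {
        const struct key *l = a, *r = b;

        if (l->cpu != r->cpu)
            return l->cpu < r->cpu ? -1 : 1;
        if (l->group_index != r->group_index)
            return l->group_index < r->group_index ? -1 : 1;
        return 0;
    }

    int main(void)
    {
        struct key keys[] = { {2, 7}, {-1, 3}, {0, 5}, {0, 1}, {-1, 9} };
        int n = sizeof(keys) / sizeof(keys[0]);

        qsort(keys, n, sizeof(keys[0]), key_cmp);
        for (int i = 0; i < n; i++)
            printf("{cpu=%d, idx=%lu}\n", keys[i].cpu, keys[i].group_index);
        return 0;
    }

The output places the cpu == -1 entries first and keeps each CPU's entries in insertion order, which is what later makes per-CPU rotation cheap.
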
 +/*
 + * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
 + * key (see perf_event_groups_less). This places it last inside the CPU
 + * subtree.
 + */
 +static void
 +perf_event_groups_insert(struct perf_event_groups *groups,
 +                       struct perf_event *event)
 +{
 +      struct perf_event *node_event;
 +      struct rb_node *parent;
 +      struct rb_node **node;
 +
 +      event->group_index = ++groups->index;
 +
 +      node = &groups->tree.rb_node;
 +      parent = *node;
 +
 +      while (*node) {
 +              parent = *node;
 +              node_event = container_of(*node, struct perf_event, group_node);
 +
 +              if (perf_event_groups_less(event, node_event))
 +                      node = &parent->rb_left;
 +              else
 +                      node = &parent->rb_right;
 +      }
 +
 +      rb_link_node(&event->group_node, parent, node);
 +      rb_insert_color(&event->group_node, &groups->tree);
 +}
 +
 +/*
 + * Helper function to insert event into the pinned or flexible groups.
 + */
 +static void
 +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
 +{
 +      struct perf_event_groups *groups;
 +
 +      groups = get_event_groups(event, ctx);
 +      perf_event_groups_insert(groups, event);
 +}
 +
 +/*
 + * Delete a group from a tree.
 + */
 +static void
 +perf_event_groups_delete(struct perf_event_groups *groups,
 +                       struct perf_event *event)
 +{
 +      WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
 +                   RB_EMPTY_ROOT(&groups->tree));
 +
 +      rb_erase(&event->group_node, &groups->tree);
 +      init_event_group(event);
 +}
 +
 +/*
 + * Helper function to delete event from its groups.
 + */
 +static void
 +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
 +{
 +      struct perf_event_groups *groups;
 +
 +      groups = get_event_groups(event, ctx);
 +      perf_event_groups_delete(groups, event);
 +}
 +
 +/*
 + * Get the leftmost event in the @cpu subtree.
 + */
 +static struct perf_event *
 +perf_event_groups_first(struct perf_event_groups *groups, int cpu)
 +{
 +      struct perf_event *node_event = NULL, *match = NULL;
 +      struct rb_node *node = groups->tree.rb_node;
 +
 +      while (node) {
 +              node_event = container_of(node, struct perf_event, group_node);
 +
 +              if (cpu < node_event->cpu) {
 +                      node = node->rb_left;
 +              } else if (cpu > node_event->cpu) {
 +                      node = node->rb_right;
 +              } else {
 +                      match = node_event;
 +                      node = node->rb_left;
 +              }
 +      }
 +
 +      return match;
 +}
 +
 +/*
 + * Like rb_next(), but limited to the @cpu subtree.
 + */
 +static struct perf_event *
 +perf_event_groups_next(struct perf_event *event)
 +{
 +      struct perf_event *next;
 +
 +      next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
 +      if (next && next->cpu == event->cpu)
 +              return next;
 +
 +      return NULL;
 +}
 +
 +/*
 + * Iterate through the whole groups tree.
 + */
 +#define perf_event_groups_for_each(event, groups)                     \
 +      for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
 +                              typeof(*event), group_node); event;     \
 +              event = rb_entry_safe(rb_next(&event->group_node),      \
 +                              typeof(*event), group_node))
 +
  /*
   * Add an event to the lists for its context.
   * Must be called with ctx->mutex and ctx->lock held.
@@@ -1664,8 -1500,12 +1675,8 @@@ list_add_event(struct perf_event *event
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
 -              struct list_head *list;
 -
                event->group_caps = event->event_caps;
 -
 -              list = ctx_group_list(event, ctx);
 -              list_add_tail(&event->group_entry, list);
 +              add_event_to_groups(event, ctx);
        }
  
        list_update_cgroup_event(event, ctx, true);
@@@ -1823,12 -1663,12 +1834,12 @@@ static void perf_group_attach(struct pe
  
        group_leader->group_caps &= event->event_caps;
  
 -      list_add_tail(&event->group_entry, &group_leader->sibling_list);
 +      list_add_tail(&event->sibling_list, &group_leader->sibling_list);
        group_leader->nr_siblings++;
  
        perf_event__header_size(group_leader);
  
 -      list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
 +      for_each_sibling_event(pos, group_leader)
                perf_event__header_size(pos);
  }
  
@@@ -1859,7 -1699,7 +1870,7 @@@ list_del_event(struct perf_event *event
        list_del_rcu(&event->event_entry);
  
        if (event->group_leader == event)
 -              list_del_init(&event->group_entry);
 +              del_event_from_groups(event, ctx);
  
        /*
         * If event was in error state, then keep it
  static void perf_group_detach(struct perf_event *event)
  {
        struct perf_event *sibling, *tmp;
 -      struct list_head *list = NULL;
 +      struct perf_event_context *ctx = event->ctx;
  
 -      lockdep_assert_held(&event->ctx->lock);
 +      lockdep_assert_held(&ctx->lock);
  
        /*
         * We can have double detach due to exit/hot-unplug + close.
         * If this is a sibling, remove it from its group.
         */
        if (event->group_leader != event) {
 -              list_del_init(&event->group_entry);
 +              list_del_init(&event->sibling_list);
                event->group_leader->nr_siblings--;
                goto out;
        }
  
 -      if (!list_empty(&event->group_entry))
 -              list = &event->group_entry;
 -
        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
         */
 -      list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 -              if (list)
 -                      list_move_tail(&sibling->group_entry, list);
 +      list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
 +
                sibling->group_leader = sibling;
 +              list_del_init(&sibling->sibling_list);
  
                /* Inherit group flags from the previous leader */
                sibling->group_caps = event->group_caps;
  
 +              if (!RB_EMPTY_NODE(&event->group_node)) {
 +                      add_event_to_groups(sibling, event->ctx);
 +
 +                      if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
 +                              struct list_head *list = sibling->attr.pinned ?
 +                                      &ctx->pinned_active : &ctx->flexible_active;
 +
 +                              list_add_tail(&sibling->active_list, list);
 +                      }
 +              }
 +
                WARN_ON_ONCE(sibling->ctx != event->ctx);
        }
  
  out:
        perf_event__header_size(event->group_leader);
  
 -      list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
 +      for_each_sibling_event(tmp, event->group_leader)
                perf_event__header_size(tmp);
  }
  
@@@ -1951,13 -1783,13 +1962,13 @@@ static inline int __pmu_filter_match(st
   */
  static inline int pmu_filter_match(struct perf_event *event)
  {
 -      struct perf_event *child;
 +      struct perf_event *sibling;
  
        if (!__pmu_filter_match(event))
                return 0;
  
 -      list_for_each_entry(child, &event->sibling_list, group_entry) {
 -              if (!__pmu_filter_match(child))
 +      for_each_sibling_event(sibling, event) {
 +              if (!__pmu_filter_match(sibling))
                        return 0;
        }
  
@@@ -1984,13 -1816,6 +1995,13 @@@ event_sched_out(struct perf_event *even
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;
  
 +      /*
 +       * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
 +       * we can schedule events _OUT_ individually through things like
 +       * __perf_remove_from_context().
 +       */
 +      list_del_init(&event->active_list);
 +
        perf_pmu_disable(event->pmu);
  
        event->pmu->del(event, 0);
@@@ -2031,7 -1856,7 +2042,7 @@@ group_sched_out(struct perf_event *grou
        /*
         * Schedule out siblings (if any):
         */
 -      list_for_each_entry(event, &group_event->sibling_list, group_entry)
 +      for_each_sibling_event(event, group_event)
                event_sched_out(event, cpuctx, ctx);
  
        perf_pmu_enable(ctx->pmu);
@@@ -2310,7 -2135,7 +2321,7 @@@ group_sched_in(struct perf_event *group
        /*
         * Schedule in siblings as one group (if any):
         */
 -      list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 +      for_each_sibling_event(event, group_event) {
                if (event_sched_in(event, cpuctx, ctx)) {
                        partial_group = event;
                        goto group_error;
@@@ -2326,7 -2151,7 +2337,7 @@@ group_error
         * partial group before returning:
         * The events up to the failed event are scheduled out normally.
         */
 -      list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 +      for_each_sibling_event(event, group_event) {
                if (event == partial_group)
                        break;
  
@@@ -2503,18 -2328,6 +2514,18 @@@ static int  __perf_install_in_context(v
                raw_spin_lock(&task_ctx->lock);
        }
  
 +#ifdef CONFIG_CGROUP_PERF
 +      if (is_cgroup_event(event)) {
 +              /*
 +               * If the current cgroup doesn't match the event's
 +               * cgroup, we should not try to schedule it.
 +               */
 +              struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
 +              reprogram = cgroup_is_descendant(cgrp->css.cgroup,
 +                                      event->cgrp->css.cgroup);
 +      }
 +#endif
 +
        if (reprogram) {
                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
@@@ -2848,47 -2661,12 +2859,47 @@@ int perf_event_refresh(struct perf_even
  }
  EXPORT_SYMBOL_GPL(perf_event_refresh);
  
 +static int perf_event_modify_breakpoint(struct perf_event *bp,
 +                                       struct perf_event_attr *attr)
 +{
 +      int err;
 +
 +      _perf_event_disable(bp);
 +
 +      err = modify_user_hw_breakpoint_check(bp, attr, true);
 +      if (err) {
 +              if (!bp->attr.disabled)
 +                      _perf_event_enable(bp);
 +
 +              return err;
 +      }
 +
 +      if (!attr->disabled)
 +              _perf_event_enable(bp);
 +      return 0;
 +}
 +
 +static int perf_event_modify_attr(struct perf_event *event,
 +                                struct perf_event_attr *attr)
 +{
 +      if (event->attr.type != attr->type)
 +              return -EINVAL;
 +
 +      switch (event->attr.type) {
 +      case PERF_TYPE_BREAKPOINT:
 +              return perf_event_modify_breakpoint(event, attr);
 +      default:
 +              /* Place holder for future additions. */
 +              return -EOPNOTSUPP;
 +      }
 +}
 +
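
PERF_EVENT_IOC_MODIFY_ATTRIBUTES is only honoured for breakpoint events at this point. A hedged user-space sketch of how it might be driven is below; the perf_event_open() wrapper, the watched variables and the minimal error handling are simplifications, and the ioctl constant needs uapi headers that already contain this series:

    #include <linux/hw_breakpoint.h>
    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Minimal perf_event_open() wrapper; glibc does not provide one. */
    static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
    {
        return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    static int watched_a, watched_b;    /* two variables to watch in turn */

    int main(void)
    {
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_BREAKPOINT;
        attr.size = sizeof(attr);
        attr.bp_type = HW_BREAKPOINT_W;
        attr.bp_addr = (uint64_t)(unsigned long)&watched_a;
        attr.bp_len = HW_BREAKPOINT_LEN_4;

        int fd = perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0) {
            perror("perf_event_open");
            return 1;
        }

        watched_a = 1;  /* counted by the breakpoint */

        /* Re-point the same event at another address instead of close()+reopen. */
        attr.bp_addr = (uint64_t)(unsigned long)&watched_b;
        if (ioctl(fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &attr) < 0)
            perror("PERF_EVENT_IOC_MODIFY_ATTRIBUTES");

        watched_b = 1;  /* now counted against the new address */

        close(fd);
        return 0;
    }
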
  static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
                          enum event_type_t event_type)
  {
 +      struct perf_event *event, *tmp;
        int is_active = ctx->is_active;
 -      struct perf_event *event;
  
        lockdep_assert_held(&ctx->lock);
  
  
        perf_pmu_disable(ctx->pmu);
        if (is_active & EVENT_PINNED) {
 -              list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 +              list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
                        group_sched_out(event, cpuctx, ctx);
        }
  
        if (is_active & EVENT_FLEXIBLE) {
 -              list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 +              list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
                        group_sched_out(event, cpuctx, ctx);
        }
        perf_pmu_enable(ctx->pmu);
@@@ -3227,116 -3005,53 +3238,116 @@@ static void cpu_ctx_sched_out(struct pe
        ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
  }
  
 -static void
 -ctx_pinned_sched_in(struct perf_event_context *ctx,
 -                  struct perf_cpu_context *cpuctx)
 +static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
 +                            int (*func)(struct perf_event *, void *), void *data)
  {
 -      struct perf_event *event;
 +      struct perf_event **evt, *evt1, *evt2;
 +      int ret;
  
 -      list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 -              if (event->state <= PERF_EVENT_STATE_OFF)
 -                      continue;
 -              if (!event_filter_match(event))
 -                      continue;
 +      evt1 = perf_event_groups_first(groups, -1);
 +      evt2 = perf_event_groups_first(groups, cpu);
 +
 +      while (evt1 || evt2) {
 +              if (evt1 && evt2) {
 +                      if (evt1->group_index < evt2->group_index)
 +                              evt = &evt1;
 +                      else
 +                              evt = &evt2;
 +              } else if (evt1) {
 +                      evt = &evt1;
 +              } else {
 +                      evt = &evt2;
 +              }
  
 -              if (group_can_go_on(event, cpuctx, 1))
 -                      group_sched_in(event, cpuctx, ctx);
 +              ret = func(*evt, data);
 +              if (ret)
 +                      return ret;
  
 -              /*
 -               * If this pinned group hasn't been scheduled,
 -               * put it in error state.
 -               */
 -              if (event->state == PERF_EVENT_STATE_INACTIVE)
 -                      perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 +              *evt = perf_event_groups_next(*evt);
 +      }
 +
 +      return 0;
 +}
 +
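
visit_groups_merge() is essentially a two-way merge of the cpu == -1 subtree and the current CPU's subtree, ordered by group_index. A stand-alone model of that merge, with plain sorted arrays standing in for the rb-tree iterators (indices are illustrative only):

    #include <stdio.h>

    int main(void)
    {
        /* Two streams already sorted by group_index, like the two subtrees. */
        const unsigned long any_cpu[]  = { 3, 9 };      /* events with cpu == -1 */
        const unsigned long this_cpu[] = { 1, 5, 7 };   /* events on this CPU */
        const unsigned int na = sizeof(any_cpu) / sizeof(any_cpu[0]);
        const unsigned int nb = sizeof(this_cpu) / sizeof(this_cpu[0]);
        unsigned int i = 0, j = 0;

        /* Always visit whichever stream has the smaller group_index next. */
        while (i < na || j < nb) {
            if (j >= nb || (i < na && any_cpu[i] < this_cpu[j]))
                printf("visit group_index %lu (any-cpu)\n", any_cpu[i++]);
            else
                printf("visit group_index %lu (this-cpu)\n", this_cpu[j++]);
        }
        return 0;
    }
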
 +struct sched_in_data {
 +      struct perf_event_context *ctx;
 +      struct perf_cpu_context *cpuctx;
 +      int can_add_hw;
 +};
 +
 +static int pinned_sched_in(struct perf_event *event, void *data)
 +{
 +      struct sched_in_data *sid = data;
 +
 +      if (event->state <= PERF_EVENT_STATE_OFF)
 +              return 0;
 +
 +      if (!event_filter_match(event))
 +              return 0;
 +
 +      if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
 +              if (!group_sched_in(event, sid->cpuctx, sid->ctx))
 +                      list_add_tail(&event->active_list, &sid->ctx->pinned_active);
        }
 +
 +      /*
 +       * If this pinned group hasn't been scheduled,
 +       * put it in error state.
 +       */
 +      if (event->state == PERF_EVENT_STATE_INACTIVE)
 +              perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 +
 +      return 0;
 +}
 +
 +static int flexible_sched_in(struct perf_event *event, void *data)
 +{
 +      struct sched_in_data *sid = data;
 +
 +      if (event->state <= PERF_EVENT_STATE_OFF)
 +              return 0;
 +
 +      if (!event_filter_match(event))
 +              return 0;
 +
 +      if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
 +              if (!group_sched_in(event, sid->cpuctx, sid->ctx))
 +                      list_add_tail(&event->active_list, &sid->ctx->flexible_active);
 +              else
 +                      sid->can_add_hw = 0;
 +      }
 +
 +      return 0;
 +}
 +
 +static void
 +ctx_pinned_sched_in(struct perf_event_context *ctx,
 +                  struct perf_cpu_context *cpuctx)
 +{
 +      struct sched_in_data sid = {
 +              .ctx = ctx,
 +              .cpuctx = cpuctx,
 +              .can_add_hw = 1,
 +      };
 +
 +      visit_groups_merge(&ctx->pinned_groups,
 +                         smp_processor_id(),
 +                         pinned_sched_in, &sid);
  }
  
  static void
  ctx_flexible_sched_in(struct perf_event_context *ctx,
                      struct perf_cpu_context *cpuctx)
  {
 -      struct perf_event *event;
 -      int can_add_hw = 1;
 -
 -      list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
 -              /* Ignore events in OFF or ERROR state */
 -              if (event->state <= PERF_EVENT_STATE_OFF)
 -                      continue;
 -              /*
 -               * Listen to the 'cpu' scheduling filter constraint
 -               * of events:
 -               */
 -              if (!event_filter_match(event))
 -                      continue;
 +      struct sched_in_data sid = {
 +              .ctx = ctx,
 +              .cpuctx = cpuctx,
 +              .can_add_hw = 1,
 +      };
  
 -              if (group_can_go_on(event, cpuctx, can_add_hw)) {
 -                      if (group_sched_in(event, cpuctx, ctx))
 -                              can_add_hw = 0;
 -              }
 -      }
 +      visit_groups_merge(&ctx->flexible_groups,
 +                         smp_processor_id(),
 +                         flexible_sched_in, &sid);
  }
  
  static void
@@@ -3417,7 -3132,7 +3428,7 @@@ static void perf_event_context_sched_in
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
 -      if (!list_empty(&ctx->pinned_groups))
 +      if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        perf_event_sched_in(cpuctx, ctx, task);
        perf_pmu_enable(ctx->pmu);
@@@ -3646,81 -3361,55 +3657,81 @@@ static void perf_adjust_freq_unthr_cont
  }
  
  /*
 - * Round-robin a context's events:
 + * Move @event to the tail of the @ctx's eligible events.
   */
 -static void rotate_ctx(struct perf_event_context *ctx)
 +static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
  {
        /*
         * Rotate the first entry last of non-pinned groups. Rotation might be
         * disabled by the inheritance code.
         */
 -      if (!ctx->rotate_disable)
 -              list_rotate_left(&ctx->flexible_groups);
 +      if (ctx->rotate_disable)
 +              return;
 +
 +      perf_event_groups_delete(&ctx->flexible_groups, event);
 +      perf_event_groups_insert(&ctx->flexible_groups, event);
  }
  
 -static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 +static inline struct perf_event *
 +ctx_first_active(struct perf_event_context *ctx)
  {
 +      return list_first_entry_or_null(&ctx->flexible_active,
 +                                      struct perf_event, active_list);
 +}
 +
 +static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 +{
 +      struct perf_event *cpu_event = NULL, *task_event = NULL;
 +      bool cpu_rotate = false, task_rotate = false;
        struct perf_event_context *ctx = NULL;
 -      int rotate = 0;
 +
 +      /*
 +       * Since we run this from IRQ context, nobody can install new
 +       * events, thus the event count values are stable.
 +       */
  
        if (cpuctx->ctx.nr_events) {
                if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
 -                      rotate = 1;
 +                      cpu_rotate = true;
        }
  
        ctx = cpuctx->task_ctx;
        if (ctx && ctx->nr_events) {
                if (ctx->nr_events != ctx->nr_active)
 -                      rotate = 1;
 +                      task_rotate = true;
        }
  
 -      if (!rotate)
 -              goto done;
 +      if (!(cpu_rotate || task_rotate))
 +              return false;
  
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(cpuctx->ctx.pmu);
  
 -      cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 -      if (ctx)
 +      if (task_rotate)
 +              task_event = ctx_first_active(ctx);
 +      if (cpu_rotate)
 +              cpu_event = ctx_first_active(&cpuctx->ctx);
 +
 +      /*
 +       * As per the order given at ctx_resched(), first 'pop' task flexible
 +       * and then, if needed, CPU flexible.
 +       */
 +      if (task_event || (ctx && cpu_event))
                ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
 +      if (cpu_event)
 +              cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
  
 -      rotate_ctx(&cpuctx->ctx);
 -      if (ctx)
 -              rotate_ctx(ctx);
 +      if (task_event)
 +              rotate_ctx(ctx, task_event);
 +      if (cpu_event)
 +              rotate_ctx(&cpuctx->ctx, cpu_event);
  
        perf_event_sched_in(cpuctx, ctx, current);
  
        perf_pmu_enable(cpuctx->ctx.pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 -done:
  
 -      return rotate;
 +      return true;
  }
  
  void perf_event_task_tick(void)
@@@ -3865,7 -3554,7 +3876,7 @@@ static void __perf_event_read(void *inf
  
        pmu->read(event);
  
 -      list_for_each_entry(sub, &event->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, event) {
                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
                        /*
                         * Use sibling's PMU rather than @event's since
@@@ -4039,11 -3728,9 +4050,11 @@@ static void __perf_event_init_context(s
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
        INIT_LIST_HEAD(&ctx->active_ctx_list);
 -      INIT_LIST_HEAD(&ctx->pinned_groups);
 -      INIT_LIST_HEAD(&ctx->flexible_groups);
 +      perf_event_groups_init(&ctx->pinned_groups);
 +      perf_event_groups_init(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
 +      INIT_LIST_HEAD(&ctx->pinned_active);
 +      INIT_LIST_HEAD(&ctx->flexible_active);
        atomic_set(&ctx->refcount, 1);
  }
  
@@@ -4713,7 -4400,7 +4724,7 @@@ static int __perf_read_group_add(struc
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
  
 -      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, leader) {
                values[n++] += perf_event_count(sub);
                if (read_format & PERF_FORMAT_ID)
                        values[n++] = primary_event_id(sub);
@@@ -4907,7 -4594,7 +4918,7 @@@ static void perf_event_for_each(struct 
        event = event->group_leader;
  
        perf_event_for_each_child(event, func);
 -      list_for_each_entry(sibling, &event->sibling_list, group_entry)
 +      for_each_sibling_event(sibling, event)
                perf_event_for_each_child(sibling, func);
  }
  
@@@ -4989,8 -4676,6 +5000,8 @@@ static int perf_event_set_output(struc
                                 struct perf_event *output_event);
  static int perf_event_set_filter(struct perf_event *event, void __user *arg);
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 +static int perf_copy_attr(struct perf_event_attr __user *uattr,
 +                        struct perf_event_attr *attr);
  
  static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
  {
  
        case PERF_EVENT_IOC_QUERY_BPF:
                return perf_event_query_prog_array(event, (void __user *)arg);
 +
 +      case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
 +              struct perf_event_attr new_attr;
 +              int err = perf_copy_attr((struct perf_event_attr __user *)arg,
 +                                       &new_attr);
 +
 +              if (err)
 +                      return err;
 +
 +              return perf_event_modify_attr(event,  &new_attr);
 +      }
        default:
                return -ENOTTY;
        }
@@@ -6069,8 -5743,7 +6080,8 @@@ static void perf_output_read_group(stru
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                values[n++] = running;
  
 -      if (leader != event)
 +      if ((leader != event) &&
 +          (leader->state == PERF_EVENT_STATE_ACTIVE))
                leader->pmu->read(leader);
  
        values[n++] = perf_event_count(leader);
  
        __output_copy(handle, values, n * sizeof(u64));
  
 -      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, leader) {
                n = 0;
  
                if ((sub != event) &&
@@@ -8336,119 -8009,9 +8347,119 @@@ static struct pmu perf_tracepoint = 
        .read           = perf_swevent_read,
  };
  
 +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
 +/*
 + * Flags in config, used by dynamic PMU kprobe and uprobe
 + * The flags should match following PMU_FORMAT_ATTR().
 + *
 + * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 + *                               if not set, create kprobe/uprobe
 + */
 +enum perf_probe_config {
 +      PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
 +};
 +
 +PMU_FORMAT_ATTR(retprobe, "config:0");
 +
 +static struct attribute *probe_attrs[] = {
 +      &format_attr_retprobe.attr,
 +      NULL,
 +};
 +
 +static struct attribute_group probe_format_group = {
 +      .name = "format",
 +      .attrs = probe_attrs,
 +};
 +
 +static const struct attribute_group *probe_attr_groups[] = {
 +      &probe_format_group,
 +      NULL,
 +};
 +#endif
 +
 +#ifdef CONFIG_KPROBE_EVENTS
 +static int perf_kprobe_event_init(struct perf_event *event);
 +static struct pmu perf_kprobe = {
 +      .task_ctx_nr    = perf_sw_context,
 +      .event_init     = perf_kprobe_event_init,
 +      .add            = perf_trace_add,
 +      .del            = perf_trace_del,
 +      .start          = perf_swevent_start,
 +      .stop           = perf_swevent_stop,
 +      .read           = perf_swevent_read,
 +      .attr_groups    = probe_attr_groups,
 +};
 +
 +static int perf_kprobe_event_init(struct perf_event *event)
 +{
 +      int err;
 +      bool is_retprobe;
 +
 +      if (event->attr.type != perf_kprobe.type)
 +              return -ENOENT;
 +      /*
 +       * no branch sampling for probe events
 +       */
 +      if (has_branch_stack(event))
 +              return -EOPNOTSUPP;
 +
 +      is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
 +      err = perf_kprobe_init(event, is_retprobe);
 +      if (err)
 +              return err;
 +
 +      event->destroy = perf_kprobe_destroy;
 +
 +      return 0;
 +}
 +#endif /* CONFIG_KPROBE_EVENTS */
 +
 +#ifdef CONFIG_UPROBE_EVENTS
 +static int perf_uprobe_event_init(struct perf_event *event);
 +static struct pmu perf_uprobe = {
 +      .task_ctx_nr    = perf_sw_context,
 +      .event_init     = perf_uprobe_event_init,
 +      .add            = perf_trace_add,
 +      .del            = perf_trace_del,
 +      .start          = perf_swevent_start,
 +      .stop           = perf_swevent_stop,
 +      .read           = perf_swevent_read,
 +      .attr_groups    = probe_attr_groups,
 +};
 +
 +static int perf_uprobe_event_init(struct perf_event *event)
 +{
 +      int err;
 +      bool is_retprobe;
 +
 +      if (event->attr.type != perf_uprobe.type)
 +              return -ENOENT;
 +      /*
 +       * no branch sampling for probe events
 +       */
 +      if (has_branch_stack(event))
 +              return -EOPNOTSUPP;
 +
 +      is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
 +      err = perf_uprobe_init(event, is_retprobe);
 +      if (err)
 +              return err;
 +
 +      event->destroy = perf_uprobe_destroy;
 +
 +      return 0;
 +}
 +#endif /* CONFIG_UPROBE_EVENTS */
 +
  static inline void perf_tp_register(void)
  {
        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
 +#ifdef CONFIG_KPROBE_EVENTS
 +      perf_pmu_register(&perf_kprobe, "kprobe", -1);
 +#endif
 +#ifdef CONFIG_UPROBE_EVENTS
 +      perf_pmu_register(&perf_uprobe, "uprobe", -1);
 +#endif
  }
  
  static void perf_event_free_filter(struct perf_event *event)
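
With the 'kprobe' and 'uprobe' PMUs registered, probes can be created directly through perf_event_open() instead of going through tracefs. A hedged user-space sketch follows; the sysfs location of the dynamic type id follows the usual event_source layout, config1/config2 carry the probed symbol and offset per the matching uapi update, the probed symbol is only an example, and sufficient privileges are assumed:

    #include <fcntl.h>
    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Minimal perf_event_open() wrapper; glibc does not provide one. */
    static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
    {
        return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
        struct perf_event_attr attr;
        unsigned int type;
        uint64_t count;
        FILE *f;

        /* Dynamic PMUs advertise their type id via sysfs. */
        f = fopen("/sys/bus/event_source/devices/kprobe/type", "r");
        if (!f || fscanf(f, "%u", &type) != 1)
            return 1;
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = 0;        /* bit 0 ("retprobe" format) would make it a kretprobe */
        /* config1/config2 carry the probed symbol and offset in the updated uapi
         * (aliased there as kprobe_func/probe_offset); the symbol is an example. */
        attr.config1 = (uint64_t)(unsigned long)"do_sys_open";
        attr.config2 = 0;

        int fd = perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0) {
            perror("perf_event_open");
            return 1;
        }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
        close(open("/dev/null", O_RDONLY));     /* trigger some opens */
        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

        if (read(fd, &count, sizeof(count)) == sizeof(count))
            printf("kprobe hits: %llu\n", (unsigned long long)count);
        close(fd);
        return 0;
    }
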
@@@ -8525,32 -8088,13 +8536,32 @@@ static void perf_event_free_bpf_handler
  }
  #endif
  
 +/*
 + * returns true if the event is a tracepoint, or a kprobe/uprobe created
 + * with perf_event_open()
 + */
 +static inline bool perf_event_is_tracing(struct perf_event *event)
 +{
 +      if (event->pmu == &perf_tracepoint)
 +              return true;
 +#ifdef CONFIG_KPROBE_EVENTS
 +      if (event->pmu == &perf_kprobe)
 +              return true;
 +#endif
 +#ifdef CONFIG_UPROBE_EVENTS
 +      if (event->pmu == &perf_uprobe)
 +              return true;
 +#endif
 +      return false;
 +}
 +
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
  {
        bool is_kprobe, is_tracepoint, is_syscall_tp;
        struct bpf_prog *prog;
        int ret;
  
 -      if (event->attr.type != PERF_TYPE_TRACEPOINT)
 +      if (!perf_event_is_tracing(event))
                return perf_event_set_bpf_handler(event, prog_fd);
  
        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
  
  static void perf_event_free_bpf_prog(struct perf_event *event)
  {
 -      if (event->attr.type != PERF_TYPE_TRACEPOINT) {
 +      if (!perf_event_is_tracing(event)) {
                perf_event_free_bpf_handler(event);
                return;
        }
@@@ -9015,36 -8559,47 +9026,36 @@@ fail_clear_files
        return ret;
  }
  
 -static int
 -perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
 -{
 -      struct perf_event_context *ctx = event->ctx;
 -      int ret;
 -
 -      /*
 -       * Beware, here be dragons!!
 -       *
 -       * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
 -       * stuff does not actually need it. So temporarily drop ctx->mutex. As per
 -       * perf_event_ctx_lock() we already have a reference on ctx.
 -       *
 -       * This can result in event getting moved to a different ctx, but that
 -       * does not affect the tracepoint state.
 -       */
 -      mutex_unlock(&ctx->mutex);
 -      ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
 -      mutex_lock(&ctx->mutex);
 -
 -      return ret;
 -}
 -
  static int perf_event_set_filter(struct perf_event *event, void __user *arg)
  {
 -      char *filter_str;
        int ret = -EINVAL;
 -
 -      if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
 -          !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
 -          !has_addr_filter(event))
 -              return -EINVAL;
 +      char *filter_str;
  
        filter_str = strndup_user(arg, PAGE_SIZE);
        if (IS_ERR(filter_str))
                return PTR_ERR(filter_str);
  
 -      if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
 -          event->attr.type == PERF_TYPE_TRACEPOINT)
 -              ret = perf_tracepoint_set_filter(event, filter_str);
 -      else if (has_addr_filter(event))
 +#ifdef CONFIG_EVENT_TRACING
 +      if (perf_event_is_tracing(event)) {
 +              struct perf_event_context *ctx = event->ctx;
 +
 +              /*
 +               * Beware, here be dragons!!
 +               *
 +               * the tracepoint muck will deadlock against ctx->mutex, but
 +               * the tracepoint stuff does not actually need it. So
 +               * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
 +               * already have a reference on ctx.
 +               *
 +               * This can result in event getting moved to a different ctx,
 +               * but that does not affect the tracepoint state.
 +               */
 +              mutex_unlock(&ctx->mutex);
 +              ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
 +              mutex_lock(&ctx->mutex);
 +      } else
 +#endif
 +      if (has_addr_filter(event))
                ret = perf_event_set_addr_filter(event, filter_str);
  
        kfree(filter_str);
@@@ -9897,10 -9452,9 +9908,10 @@@ perf_event_alloc(struct perf_event_att
        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);
  
 -      INIT_LIST_HEAD(&event->group_entry);
        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
 +      INIT_LIST_HEAD(&event->active_list);
 +      init_event_group(event);
        INIT_LIST_HEAD(&event->rb_entry);
        INIT_LIST_HEAD(&event->active_entry);
        INIT_LIST_HEAD(&event->addr_filters.list);
@@@ -10175,9 -9729,6 +10186,9 @@@ static int perf_copy_attr(struct perf_e
                        ret = -EINVAL;
        }
  
 +      if (!attr->sample_max_stack)
 +              attr->sample_max_stack = sysctl_perf_event_max_stack;
 +
        if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
                ret = perf_reg_validate(attr->sample_regs_intr);
  out:
@@@ -10391,6 -9942,9 +10402,6 @@@ SYSCALL_DEFINE5(perf_event_open
            perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                return -EACCES;
  
 -      if (!attr.sample_max_stack)
 -              attr.sample_max_stack = sysctl_perf_event_max_stack;
 -
        /*
         * In cgroup mode, the pid argument is used to pass the fd
         * opened to the cgroup directory in cgroupfs. The cpu argument
                perf_remove_from_context(group_leader, 0);
                put_ctx(gctx);
  
 -              list_for_each_entry(sibling, &group_leader->sibling_list,
 -                                  group_entry) {
 +              for_each_sibling_event(sibling, group_leader) {
                        perf_remove_from_context(sibling, 0);
                        put_ctx(gctx);
                }
                 * By installing siblings first we NO-OP because they're not
                 * reachable through the group lists.
                 */
 -              list_for_each_entry(sibling, &group_leader->sibling_list,
 -                                  group_entry) {
 +              for_each_sibling_event(sibling, group_leader) {
                        perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
                        get_ctx(ctx);
@@@ -11324,7 -10880,7 +11335,7 @@@ static int inherit_group(struct perf_ev
         * case inherit_event() will create individual events, similar to what
         * perf_group_detach() would do anyway.
         */
 -      list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, parent_event) {
                child_ctr = inherit_event(sub, parent, parent_ctx,
                                            child, leader, child_ctx);
                if (IS_ERR(child_ctr))
@@@ -11423,7 -10979,7 +11434,7 @@@ static int perf_event_init_context(stru
         * We dont have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
 -      list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
 +      perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                if (ret)
        parent_ctx->rotate_disable = 1;
        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
  
 -      list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
 +      perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                if (ret)