Merge branch 'perf/urgent' into perf/core, to pick up fixes
author     Ingo Molnar <mingo@kernel.org>
           Sat, 24 Mar 2018 08:21:47 +0000 (09:21 +0100)
committer  Ingo Molnar <mingo@kernel.org>
           Sat, 24 Mar 2018 08:21:47 +0000 (09:21 +0100)
With the cherry-picked perf/urgent commit merged separately we can now
merge all the fixes without conflicts.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/events/core.c
arch/x86/events/intel/core.c
arch/x86/events/intel/ds.c
arch/x86/events/perf_event.h
kernel/events/core.c

diff --combined arch/x86/events/core.c
@@@ -990,7 -990,7 +990,7 @@@ static int collect_events(struct cpu_hw
        if (!dogrp)
                return n;
  
 -      list_for_each_entry(event, &leader->sibling_list, group_entry) {
 +      for_each_sibling_event(event, leader) {
                if (!is_x86_event(event) ||
                    event->state <= PERF_EVENT_STATE_OFF)
                        continue;
@@@ -1156,13 -1156,16 +1156,13 @@@ int x86_perf_event_set_period(struct pe
  
        per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
  
 -      if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) ||
 -          local64_read(&hwc->prev_count) != (u64)-left) {
 -              /*
 -               * The hw event starts counting from this event offset,
 -               * mark it to be able to extra future deltas:
 -               */
 -              local64_set(&hwc->prev_count, (u64)-left);
 +      /*
 +       * The hw event starts counting from this event offset,
 +       * mark it to be able to extract future deltas:
 +       */
 +      local64_set(&hwc->prev_count, (u64)-left);
  
 -              wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
 -      }
 +      wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
  
        /*
         * Due to erratum on certain cpu we need
@@@ -1881,8 -1884,6 +1881,8 @@@ early_initcall(init_hw_perf_events)
  
  static inline void x86_pmu_read(struct perf_event *event)
  {
 +      if (x86_pmu.read)
 +              return x86_pmu.read(event);
        x86_perf_event_update(event);
  }
  
@@@ -2118,7 -2119,7 +2118,7 @@@ static int x86_pmu_event_init(struct pe
        }
  
        if (READ_ONCE(x86_pmu.attr_rdpmc) &&
-           !(event->hw.flags & PERF_X86_EVENT_FREERUNNING))
+           !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
                event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
  
        return err;
diff --combined arch/x86/events/intel/core.c
@@@ -2060,14 -2060,6 +2060,14 @@@ static void intel_pmu_del_event(struct 
                intel_pmu_pebs_del(event);
  }
  
 +static void intel_pmu_read_event(struct perf_event *event)
 +{
 +      if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
 +              intel_pmu_auto_reload_read(event);
 +      else
 +              x86_perf_event_update(event);
 +}
 +
  static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
  {
        int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
@@@ -2209,15 -2201,9 +2209,15 @@@ static int intel_pmu_handle_irq(struct 
        int bit, loops;
        u64 status;
        int handled;
 +      int pmu_enabled;
  
        cpuc = this_cpu_ptr(&cpu_hw_events);
  
 +      /*
 +       * Save the PMU state.
 +       * It needs to be restored when leaving the handler.
 +       */
 +      pmu_enabled = cpuc->enabled;
        /*
         * No known reason to not always do late ACK,
         * but just in case do it opt-in.
        if (!x86_pmu.late_ack)
                apic_write(APIC_LVTPC, APIC_DM_NMI);
        intel_bts_disable_local();
 +      cpuc->enabled = 0;
        __intel_pmu_disable_all();
        handled = intel_pmu_drain_bts_buffer();
        handled += intel_bts_interrupt();
@@@ -2335,8 -2320,7 +2335,8 @@@ again
  
  done:
        /* Only restore PMU state when it's active. See x86_pmu_disable(). */
 -      if (cpuc->enabled)
 +      cpuc->enabled = pmu_enabled;
 +      if (pmu_enabled)
                __intel_pmu_enable_all(0, true);
        intel_bts_enable_local();
  
@@@ -2968,9 -2952,9 +2968,9 @@@ static void intel_pebs_aliases_skl(stru
        return intel_pebs_aliases_precdist(event);
  }
  
- static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
+ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event)
  {
-       unsigned long flags = x86_pmu.free_running_flags;
+       unsigned long flags = x86_pmu.large_pebs_flags;
  
        if (event->attr.use_clockid)
                flags &= ~PERF_SAMPLE_TIME;
@@@ -2992,8 -2976,8 +2992,8 @@@ static int intel_pmu_hw_config(struct p
                if (!event->attr.freq) {
                        event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
                        if (!(event->attr.sample_type &
-                             ~intel_pmu_free_running_flags(event)))
-                               event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
+                             ~intel_pmu_large_pebs_flags(event)))
+                               event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS;
                }
                if (x86_pmu.pebs_aliases)
                        x86_pmu.pebs_aliases(event);
@@@ -3204,13 -3188,13 +3204,13 @@@ glp_get_event_constraints(struct cpu_hw
   * Therefore the effective (average) period matches the requested period,
   * despite coarser hardware granularity.
   */
 -static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
 +static u64 bdw_limit_period(struct perf_event *event, u64 left)
  {
        if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
                        X86_CONFIG(.event=0xc0, .umask=0x01)) {
                if (left < 128)
                        left = 128;
-               left &= ~0x3fu;
+               left &= ~0x3fULL;
        }
        return left;
  }
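
The effect of this workaround is easiest to see with concrete numbers. Below is a minimal user-space sketch of the same rounding, with made-up sample periods (an illustration, not kernel code): the period is clamped to at least 128 and then rounded down to a multiple of 64.

    #include <stdio.h>
    #include <stdint.h>

    /* Same rounding as bdw_limit_period(): clamp to >= 128, clear the low 6 bits. */
    static uint64_t bdw_limit(uint64_t left)
    {
        if (left < 128)
            left = 128;
        return left & ~0x3fULL;
    }

    int main(void)
    {
        uint64_t periods[] = { 100, 128, 200, 1000003 };

        for (unsigned int i = 0; i < sizeof(periods) / sizeof(periods[0]); i++)
            printf("requested %llu -> programmed %llu\n",
                   (unsigned long long)periods[i],
                   (unsigned long long)bdw_limit(periods[i]));
        return 0;
    }

This prints 128, 128, 192 and 1000000 respectively, so the programmed period stays close to the requested one on average despite the coarser granularity.
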
@@@ -3476,7 -3460,7 +3476,7 @@@ static __initconst const struct x86_pm
        .event_map              = intel_pmu_event_map,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
        .apic                   = 1,
-       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
+       .large_pebs_flags       = LARGE_PEBS_FLAGS,
  
        /*
         * Intel PMCs cannot be accessed sanely above 32-bit width,
@@@ -3511,7 -3495,6 +3511,7 @@@ static __initconst const struct x86_pm
        .disable                = intel_pmu_disable_event,
        .add                    = intel_pmu_add_event,
        .del                    = intel_pmu_del_event,
 +      .read                   = intel_pmu_read_event,
        .hw_config              = intel_pmu_hw_config,
        .schedule_events        = x86_schedule_events,
        .eventsel               = MSR_ARCH_PERFMON_EVENTSEL0,
        .event_map              = intel_pmu_event_map,
        .max_events             = ARRAY_SIZE(intel_perfmon_event_map),
        .apic                   = 1,
-       .free_running_flags     = PEBS_FREERUNNING_FLAGS,
+       .large_pebs_flags       = LARGE_PEBS_FLAGS,
        /*
         * Intel PMCs cannot be accessed sanely above 32 bit width,
         * so we install an artificial 1<<31 period regardless of
diff --combined arch/x86/events/intel/ds.c
@@@ -935,7 -935,7 +935,7 @@@ void intel_pmu_pebs_add(struct perf_eve
        bool needed_cb = pebs_needs_sched_cb(cpuc);
  
        cpuc->n_pebs++;
-       if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+       if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
                cpuc->n_large_pebs++;
  
        pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
@@@ -975,7 -975,7 +975,7 @@@ void intel_pmu_pebs_del(struct perf_eve
        bool needed_cb = pebs_needs_sched_cb(cpuc);
  
        cpuc->n_pebs--;
-       if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+       if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS)
                cpuc->n_large_pebs--;
  
        pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
@@@ -1306,93 -1306,17 +1306,93 @@@ get_next_pebs_record_by_bit(void *base
        return NULL;
  }
  
 +void intel_pmu_auto_reload_read(struct perf_event *event)
 +{
 +      WARN_ON(!(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD));
 +
 +      perf_pmu_disable(event->pmu);
 +      intel_pmu_drain_pebs_buffer();
 +      perf_pmu_enable(event->pmu);
 +}
 +
 +/*
 + * Special variant of intel_pmu_save_and_restart() for auto-reload.
 + */
 +static int
 +intel_pmu_save_and_restart_reload(struct perf_event *event, int count)
 +{
 +      struct hw_perf_event *hwc = &event->hw;
 +      int shift = 64 - x86_pmu.cntval_bits;
 +      u64 period = hwc->sample_period;
 +      u64 prev_raw_count, new_raw_count;
 +      s64 new, old;
 +
 +      WARN_ON(!period);
 +
 +      /*
 +       * drain_pebs() only happens when the PMU is disabled.
 +       */
 +      WARN_ON(this_cpu_read(cpu_hw_events.enabled));
 +
 +      prev_raw_count = local64_read(&hwc->prev_count);
 +      rdpmcl(hwc->event_base_rdpmc, new_raw_count);
 +      local64_set(&hwc->prev_count, new_raw_count);
 +
 +      /*
 +       * Since the counter increments a negative counter value and
 +       * overflows on the sign switch, giving the interval:
 +       *
 +       *   [-period, 0]
 +       *
 +       * the difference between two consecutive reads is:
 +       *
 +       *   A) value2 - value1;
 +       *      when no overflows have happened in between,
 +       *
 +       *   B) (0 - value1) + (value2 - (-period));
 +       *      when one overflow happened in between,
 +       *
 +       *   C) (0 - value1) + (n - 1) * (period) + (value2 - (-period));
 +       *      when @n overflows happened in between.
 +       *
 +       * Here A) is the obvious difference, B) is the extension to the
 +       * discrete interval, where the first term is to the top of the
 +       * interval and the second term is from the bottom of the next
 +       * interval and C) the extension to multiple intervals, where the
 +       * middle term is the whole intervals covered.
 +       *
 +       * An equivalent of C, by reduction, is:
 +       *
 +       *   value2 - value1 + n * period
 +       */
 +      new = ((s64)(new_raw_count << shift) >> shift);
 +      old = ((s64)(prev_raw_count << shift) >> shift);
 +      local64_add(new - old + count * period, &event->count);
 +
 +      perf_event_update_userpage(event);
 +
 +      return 0;
 +}
 +
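
The case analysis above collapses into the single expression used at the end of the function. A stand-alone sketch of that arithmetic with made-up numbers follows (the 48-bit counter width and the sample values are assumptions for illustration):

    #include <stdio.h>
    #include <stdint.h>

    #define CNTVAL_BITS 48              /* assumed counter width */

    /* Sign-extend a raw counter value, like the shift pair in the kernel code. */
    static int64_t sext(uint64_t raw)
    {
        int shift = 64 - CNTVAL_BITS;
        return (int64_t)(raw << shift) >> shift;
    }

    int main(void)
    {
        const uint64_t mask = (1ULL << CNTVAL_BITS) - 1;
        int64_t period = 1000000;
        /* Counter counts up from -period and is reloaded on overflow. */
        uint64_t prev_raw = (uint64_t)-900000LL & mask;   /* value1 */
        uint64_t new_raw  = (uint64_t)-250000LL & mask;   /* value2 */
        int64_t records = 3;    /* drained PEBS records == overflows in between */

        /* value2 - value1 + n * period */
        int64_t delta = sext(new_raw) - sext(prev_raw) + records * period;
        printf("event->count grows by %lld\n", (long long)delta);
        return 0;
    }

With these numbers the formula gives 3650000: 900000 to finish the first interval, two whole periods, and 750000 into the last one.
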
  static void __intel_pmu_pebs_event(struct perf_event *event,
                                   struct pt_regs *iregs,
                                   void *base, void *top,
                                   int bit, int count)
  {
 +      struct hw_perf_event *hwc = &event->hw;
        struct perf_sample_data data;
        struct pt_regs regs;
        void *at = get_next_pebs_record_by_bit(base, top, bit);
  
 -      if (!intel_pmu_save_and_restart(event) &&
 -          !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD))
 +      if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
 +              /*
 +               * For now, auto-reload is only enabled in fixed period mode.
 +               * The reload value is always hwc->sample_period.
 +               * This may need to change if auto-reload is enabled in
 +               * freq mode later.
 +               */
 +              intel_pmu_save_and_restart_reload(event, count);
 +      } else if (!intel_pmu_save_and_restart(event))
                return;
  
        while (count > 1) {
@@@ -1444,11 -1368,8 +1444,11 @@@ static void intel_pmu_drain_pebs_core(s
                return;
  
        n = top - at;
 -      if (n <= 0)
 +      if (n <= 0) {
 +              if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
 +                      intel_pmu_save_and_restart_reload(event, 0);
                return;
 +      }
  
        __intel_pmu_pebs_event(event, iregs, at, top, 0, n);
  }
@@@ -1471,22 -1392,8 +1471,22 @@@ static void intel_pmu_drain_pebs_nhm(st
  
        ds->pebs_index = ds->pebs_buffer_base;
  
 -      if (unlikely(base >= top))
 +      if (unlikely(base >= top)) {
 +              /*
 +               * drain_pebs() can be called twice in a short period for an
 +               * auto-reload event in pmu::read(), with no overflows having
 +               * happened in between.
 +               * intel_pmu_save_and_restart_reload() still needs to be called
 +               * to update event->count for this case.
 +               */
 +              for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled,
 +                               x86_pmu.max_pebs_events) {
 +                      event = cpuc->events[bit];
 +                      if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)
 +                              intel_pmu_save_and_restart_reload(event, 0);
 +              }
                return;
 +      }
  
        for (at = base; at < top; at += x86_pmu.pebs_record_size) {
                struct pebs_record_nhm *p = at;
@@@ -1623,7 -1530,7 +1623,7 @@@ void __init intel_ds_init(void
                        x86_pmu.pebs_record_size =
                                                sizeof(struct pebs_record_skl);
                        x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
-                       x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
+                       x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME;
                        break;
  
                default:
diff --combined arch/x86/events/perf_event.h
@@@ -69,7 -69,7 +69,7 @@@ struct event_constraint 
  #define PERF_X86_EVENT_RDPMC_ALLOWED  0x0100 /* grant rdpmc permission */
  #define PERF_X86_EVENT_EXCL_ACCT      0x0200 /* accounted EXCL event */
  #define PERF_X86_EVENT_AUTO_RELOAD    0x0400 /* use PEBS auto-reload */
- #define PERF_X86_EVENT_FREERUNNING    0x0800 /* use freerunning PEBS */
+ #define PERF_X86_EVENT_LARGE_PEBS     0x0800 /* use large PEBS */
  
  
  struct amd_nb {
@@@ -88,7 -88,7 +88,7 @@@
   * REGS_USER can be handled for events limited to ring 3.
   *
   */
- #define PEBS_FREERUNNING_FLAGS \
+ #define LARGE_PEBS_FLAGS \
        (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
        PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
        PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
@@@ -520,7 -520,6 +520,7 @@@ struct x86_pmu 
        void            (*disable)(struct perf_event *);
        void            (*add)(struct perf_event *);
        void            (*del)(struct perf_event *);
 +      void            (*read)(struct perf_event *event);
        int             (*hw_config)(struct perf_event *event);
        int             (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
        unsigned        eventsel;
        struct x86_pmu_quirk *quirks;
        int             perfctr_second_write;
        bool            late_ack;
 -      unsigned        (*limit_period)(struct perf_event *event, unsigned l);
 +      u64             (*limit_period)(struct perf_event *event, u64 l);
  
        /*
         * sysfs attrs
        struct event_constraint *pebs_constraints;
        void            (*pebs_aliases)(struct perf_event *event);
        int             max_pebs_events;
-       unsigned long   free_running_flags;
+       unsigned long   large_pebs_flags;
  
        /*
         * Intel LBR
@@@ -924,8 -923,6 +924,8 @@@ void intel_pmu_pebs_disable_all(void)
  
  void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
  
 +void intel_pmu_auto_reload_read(struct perf_event *event);
 +
  void intel_ds_init(void);
  
  void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
diff --combined kernel/events/core.c
@@@ -430,7 -430,7 +430,7 @@@ static void update_perf_cpu_limits(void
        WRITE_ONCE(perf_sample_allowed_ns, tmp);
  }
  
 -static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 +static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
  
  int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
@@@ -643,7 -643,7 +643,7 @@@ static void perf_event_update_sibling_t
  {
        struct perf_event *sibling;
  
 -      list_for_each_entry(sibling, &leader->sibling_list, group_entry)
 +      for_each_sibling_event(sibling, leader)
                perf_event_update_time(sibling);
  }
  
@@@ -724,9 -724,15 +724,15 @@@ static inline void __update_cgrp_time(s
  
  static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
  {
-       struct perf_cgroup *cgrp_out = cpuctx->cgrp;
-       if (cgrp_out)
-               __update_cgrp_time(cgrp_out);
+       struct perf_cgroup *cgrp = cpuctx->cgrp;
+       struct cgroup_subsys_state *css;
+       if (cgrp) {
+               for (css = &cgrp->css; css; css = css->parent) {
+                       cgrp = container_of(css, struct perf_cgroup, css);
+                       __update_cgrp_time(cgrp);
+               }
+       }
  }
  
  static inline void update_cgrp_time_from_event(struct perf_event *event)
@@@ -754,6 -760,7 +760,7 @@@ perf_cgroup_set_timestamp(struct task_s
  {
        struct perf_cgroup *cgrp;
        struct perf_cgroup_info *info;
+       struct cgroup_subsys_state *css;
  
        /*
         * ctx->lock held by caller
                return;
  
        cgrp = perf_cgroup_from_task(task, ctx);
-       info = this_cpu_ptr(cgrp->info);
-       info->timestamp = ctx->timestamp;
+       for (css = &cgrp->css; css; css = css->parent) {
+               cgrp = container_of(css, struct perf_cgroup, css);
+               info = this_cpu_ptr(cgrp->info);
+               info->timestamp = ctx->timestamp;
+       }
  }
  
  static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
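
Both hunks above turn a single-cgroup time update into a walk over the whole ancestor chain, so a cgroup's time also accounts for activity in its descendants. A toy stand-alone model of that walk (simplified structs, not the kernel's perf_cgroup/css types):

    #include <stdio.h>
    #include <stdint.h>

    /* Toy stand-ins for perf_cgroup/css: each node accumulates time. */
    struct node {
        struct node *parent;
        uint64_t time;
    };

    /* Charge @delta to a cgroup and every ancestor, as the hunks now do. */
    static void charge_hierarchy(struct node *n, uint64_t delta)
    {
        for (; n; n = n->parent)
            n->time += delta;
    }

    int main(void)
    {
        struct node root = { NULL, 0 };
        struct node mid  = { &root, 0 };
        struct node leaf = { &mid, 0 };

        charge_hierarchy(&leaf, 100);   /* an event ran 100ns in the leaf cgroup */
        printf("leaf=%llu mid=%llu root=%llu\n",
               (unsigned long long)leaf.time,
               (unsigned long long)mid.time,
               (unsigned long long)root.time);
        return 0;
    }
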
@@@ -937,39 -948,27 +948,39 @@@ list_update_cgroup_event(struct perf_ev
        if (!is_cgroup_event(event))
                return;
  
 -      if (add && ctx->nr_cgroups++)
 -              return;
 -      else if (!add && --ctx->nr_cgroups)
 -              return;
        /*
         * Because cgroup events are always per-cpu events,
         * this will always be called from the right CPU.
         */
        cpuctx = __get_cpu_context(ctx);
 -      cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
 -      /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
 -      if (add) {
 +
 +      /*
 +       * Since setting cpuctx->cgrp is conditional on the current @cgrp
 +       * matching the event's cgroup, we must do this for every new event,
 +       * because if the first would mismatch, the second would not try again
 +       * and we would leave cpuctx->cgrp unset.
 +       */
 +      if (add && !cpuctx->cgrp) {
                struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
  
 -              list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
                if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
                        cpuctx->cgrp = cgrp;
 -      } else {
 -              list_del(cpuctx_entry);
 -              cpuctx->cgrp = NULL;
        }
 +
 +      if (add && ctx->nr_cgroups++)
 +              return;
 +      else if (!add && --ctx->nr_cgroups)
 +              return;
 +
 +      /* no cgroup running */
 +      if (!add)
 +              cpuctx->cgrp = NULL;
 +
 +      cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
 +      if (add)
 +              list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
 +      else
 +              list_del(cpuctx_entry);
  }
  
  #else /* !CONFIG_CGROUP_PERF */
@@@ -1053,7 -1052,7 +1064,7 @@@ list_update_cgroup_event(struct perf_ev
  static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
  {
        struct perf_cpu_context *cpuctx;
 -      int rotations = 0;
 +      bool rotations;
  
        lockdep_assert_irqs_disabled();
  
@@@ -1472,21 -1471,8 +1483,21 @@@ static enum event_type_t get_event_type
        return event_type;
  }
  
 -static struct list_head *
 -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 +/*
 + * Helper function to initialize event group nodes.
 + */
 +static void init_event_group(struct perf_event *event)
 +{
 +      RB_CLEAR_NODE(&event->group_node);
 +      event->group_index = 0;
 +}
 +
 +/*
 + * Extract pinned or flexible groups from the context
 + * based on event attrs bits.
 + */
 +static struct perf_event_groups *
 +get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
  {
        if (event->attr.pinned)
                return &ctx->pinned_groups;
                return &ctx->flexible_groups;
  }
  
 +/*
 + * Helper function to initialize perf_event_group trees.
 + */
 +static void perf_event_groups_init(struct perf_event_groups *groups)
 +{
 +      groups->tree = RB_ROOT;
 +      groups->index = 0;
 +}
 +
 +/*
 + * Compare function for event groups;
 + *
 + * Implements a composite key that sorts first by CPU and then by a virtual
 + * index, which provides ordering when rotating groups for the same CPU.
 + */
 +static bool
 +perf_event_groups_less(struct perf_event *left, struct perf_event *right)
 +{
 +      if (left->cpu < right->cpu)
 +              return true;
 +      if (left->cpu > right->cpu)
 +              return false;
 +
 +      if (left->group_index < right->group_index)
 +              return true;
 +      if (left->group_index > right->group_index)
 +              return false;
 +
 +      return false;
 +}
 +
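
The ordering this comparator defines can be seen in isolation. Here is a small user-space sketch that sorts a few {cpu, group_index} pairs with the same rule; the values are made up, and cpu == -1 stands for per-task (any-CPU) events, which visit_groups_merge() below relies on sorting first:

    #include <stdio.h>
    #include <stdlib.h>

    /* Mirror of the composite key: primary sort on cpu, secondary on index. */
    struct key { int cpu; unsigned long group_index; };

    static int key_cmp(const void *a, const void *b)
    {
        const struct key *l = a, *r = b;

        if (l->cpu != r->cpu)
            return l->cpu < r->cpu ? -1 : 1;
        if (l->group_index != r->group_index)
            return l->group_index < r->group_index ? -1 : 1;
        return 0;
    }

    int main(void)
    {
        struct key keys[] = { {2, 7}, {-1, 3}, {0, 5}, {0, 1}, {-1, 9} };
        int n = sizeof(keys) / sizeof(keys[0]);

        qsort(keys, n, sizeof(keys[0]), key_cmp);
        for (int i = 0; i < n; i++)
            printf("{cpu=%d, idx=%lu}\n", keys[i].cpu, keys[i].group_index);
        return 0;
    }

The output places the cpu == -1 entries first and keeps each CPU's entries in insertion order, which is what later makes per-CPU rotation cheap.
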
 +/*
 + * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
 + * key (see perf_event_groups_less). This places it last inside the CPU
 + * subtree.
 + */
 +static void
 +perf_event_groups_insert(struct perf_event_groups *groups,
 +                       struct perf_event *event)
 +{
 +      struct perf_event *node_event;
 +      struct rb_node *parent;
 +      struct rb_node **node;
 +
 +      event->group_index = ++groups->index;
 +
 +      node = &groups->tree.rb_node;
 +      parent = *node;
 +
 +      while (*node) {
 +              parent = *node;
 +              node_event = container_of(*node, struct perf_event, group_node);
 +
 +              if (perf_event_groups_less(event, node_event))
 +                      node = &parent->rb_left;
 +              else
 +                      node = &parent->rb_right;
 +      }
 +
 +      rb_link_node(&event->group_node, parent, node);
 +      rb_insert_color(&event->group_node, &groups->tree);
 +}
 +
 +/*
 + * Helper function to insert event into the pinned or flexible groups.
 + */
 +static void
 +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
 +{
 +      struct perf_event_groups *groups;
 +
 +      groups = get_event_groups(event, ctx);
 +      perf_event_groups_insert(groups, event);
 +}
 +
 +/*
 + * Delete a group from a tree.
 + */
 +static void
 +perf_event_groups_delete(struct perf_event_groups *groups,
 +                       struct perf_event *event)
 +{
 +      WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
 +                   RB_EMPTY_ROOT(&groups->tree));
 +
 +      rb_erase(&event->group_node, &groups->tree);
 +      init_event_group(event);
 +}
 +
 +/*
 + * Helper function to delete event from its groups.
 + */
 +static void
 +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
 +{
 +      struct perf_event_groups *groups;
 +
 +      groups = get_event_groups(event, ctx);
 +      perf_event_groups_delete(groups, event);
 +}
 +
 +/*
 + * Get the leftmost event in the @cpu subtree.
 + */
 +static struct perf_event *
 +perf_event_groups_first(struct perf_event_groups *groups, int cpu)
 +{
 +      struct perf_event *node_event = NULL, *match = NULL;
 +      struct rb_node *node = groups->tree.rb_node;
 +
 +      while (node) {
 +              node_event = container_of(node, struct perf_event, group_node);
 +
 +              if (cpu < node_event->cpu) {
 +                      node = node->rb_left;
 +              } else if (cpu > node_event->cpu) {
 +                      node = node->rb_right;
 +              } else {
 +                      match = node_event;
 +                      node = node->rb_left;
 +              }
 +      }
 +
 +      return match;
 +}
 +
 +/*
 + * Like rb_next(), but limited to the @cpu subtree.
 + */
 +static struct perf_event *
 +perf_event_groups_next(struct perf_event *event)
 +{
 +      struct perf_event *next;
 +
 +      next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
 +      if (next && next->cpu == event->cpu)
 +              return next;
 +
 +      return NULL;
 +}
 +
 +/*
 + * Iterate through the whole groups tree.
 + */
 +#define perf_event_groups_for_each(event, groups)                     \
 +      for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
 +                              typeof(*event), group_node); event;     \
 +              event = rb_entry_safe(rb_next(&event->group_node),      \
 +                              typeof(*event), group_node))
 +
  /*
   * Add an event to the lists for its context.
   * Must be called with ctx->mutex and ctx->lock held.
@@@ -1664,8 -1500,12 +1675,8 @@@ list_add_event(struct perf_event *event
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
 -              struct list_head *list;
 -
                event->group_caps = event->event_caps;
 -
 -              list = ctx_group_list(event, ctx);
 -              list_add_tail(&event->group_entry, list);
 +              add_event_to_groups(event, ctx);
        }
  
        list_update_cgroup_event(event, ctx, true);
@@@ -1823,12 -1663,12 +1834,12 @@@ static void perf_group_attach(struct pe
  
        group_leader->group_caps &= event->event_caps;
  
 -      list_add_tail(&event->group_entry, &group_leader->sibling_list);
 +      list_add_tail(&event->sibling_list, &group_leader->sibling_list);
        group_leader->nr_siblings++;
  
        perf_event__header_size(group_leader);
  
 -      list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
 +      for_each_sibling_event(pos, group_leader)
                perf_event__header_size(pos);
  }
  
@@@ -1859,7 -1699,7 +1870,7 @@@ list_del_event(struct perf_event *event
        list_del_rcu(&event->event_entry);
  
        if (event->group_leader == event)
 -              list_del_init(&event->group_entry);
 +              del_event_from_groups(event, ctx);
  
        /*
         * If event was in error state, then keep it
  static void perf_group_detach(struct perf_event *event)
  {
        struct perf_event *sibling, *tmp;
 -      struct list_head *list = NULL;
 +      struct perf_event_context *ctx = event->ctx;
  
 -      lockdep_assert_held(&event->ctx->lock);
 +      lockdep_assert_held(&ctx->lock);
  
        /*
         * We can have double detach due to exit/hot-unplug + close.
         * If this is a sibling, remove it from its group.
         */
        if (event->group_leader != event) {
 -              list_del_init(&event->group_entry);
 +              list_del_init(&event->sibling_list);
                event->group_leader->nr_siblings--;
                goto out;
        }
  
 -      if (!list_empty(&event->group_entry))
 -              list = &event->group_entry;
 -
        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
         */
 -      list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 -              if (list)
 -                      list_move_tail(&sibling->group_entry, list);
 +      list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
 +
                sibling->group_leader = sibling;
 +              list_del_init(&sibling->sibling_list);
  
                /* Inherit group flags from the previous leader */
                sibling->group_caps = event->group_caps;
  
 +              if (!RB_EMPTY_NODE(&event->group_node)) {
 +                      add_event_to_groups(sibling, event->ctx);
 +
 +                      if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
 +                              struct list_head *list = sibling->attr.pinned ?
 +                                      &ctx->pinned_active : &ctx->flexible_active;
 +
 +                              list_add_tail(&sibling->active_list, list);
 +                      }
 +              }
 +
                WARN_ON_ONCE(sibling->ctx != event->ctx);
        }
  
  out:
        perf_event__header_size(event->group_leader);
  
 -      list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
 +      for_each_sibling_event(tmp, event->group_leader)
                perf_event__header_size(tmp);
  }
  
@@@ -1951,13 -1783,13 +1962,13 @@@ static inline int __pmu_filter_match(st
   */
  static inline int pmu_filter_match(struct perf_event *event)
  {
 -      struct perf_event *child;
 +      struct perf_event *sibling;
  
        if (!__pmu_filter_match(event))
                return 0;
  
 -      list_for_each_entry(child, &event->sibling_list, group_entry) {
 -              if (!__pmu_filter_match(child))
 +      for_each_sibling_event(sibling, event) {
 +              if (!__pmu_filter_match(sibling))
                        return 0;
        }
  
@@@ -1984,13 -1816,6 +1995,13 @@@ event_sched_out(struct perf_event *even
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;
  
 +      /*
 +       * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
 +       * we can schedule events _OUT_ individually through things like
 +       * __perf_remove_from_context().
 +       */
 +      list_del_init(&event->active_list);
 +
        perf_pmu_disable(event->pmu);
  
        event->pmu->del(event, 0);
@@@ -2031,7 -1856,7 +2042,7 @@@ group_sched_out(struct perf_event *grou
        /*
         * Schedule out siblings (if any):
         */
 -      list_for_each_entry(event, &group_event->sibling_list, group_entry)
 +      for_each_sibling_event(event, group_event)
                event_sched_out(event, cpuctx, ctx);
  
        perf_pmu_enable(ctx->pmu);
@@@ -2310,7 -2135,7 +2321,7 @@@ group_sched_in(struct perf_event *group
        /*
         * Schedule in siblings as one group (if any):
         */
 -      list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 +      for_each_sibling_event(event, group_event) {
                if (event_sched_in(event, cpuctx, ctx)) {
                        partial_group = event;
                        goto group_error;
@@@ -2326,7 -2151,7 +2337,7 @@@ group_error
         * partial group before returning:
         * The events up to the failed event are scheduled out normally.
         */
 -      list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 +      for_each_sibling_event(event, group_event) {
                if (event == partial_group)
                        break;
  
@@@ -2503,18 -2328,6 +2514,18 @@@ static int  __perf_install_in_context(v
                raw_spin_lock(&task_ctx->lock);
        }
  
 +#ifdef CONFIG_CGROUP_PERF
 +      if (is_cgroup_event(event)) {
 +              /*
 +               * If the current cgroup doesn't match the event's
 +               * cgroup, we should not try to schedule it.
 +               */
 +              struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
 +              reprogram = cgroup_is_descendant(cgrp->css.cgroup,
 +                                      event->cgrp->css.cgroup);
 +      }
 +#endif
 +
        if (reprogram) {
                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
@@@ -2848,47 -2661,12 +2859,47 @@@ int perf_event_refresh(struct perf_even
  }
  EXPORT_SYMBOL_GPL(perf_event_refresh);
  
 +static int perf_event_modify_breakpoint(struct perf_event *bp,
 +                                       struct perf_event_attr *attr)
 +{
 +      int err;
 +
 +      _perf_event_disable(bp);
 +
 +      err = modify_user_hw_breakpoint_check(bp, attr, true);
 +      if (err) {
 +              if (!bp->attr.disabled)
 +                      _perf_event_enable(bp);
 +
 +              return err;
 +      }
 +
 +      if (!attr->disabled)
 +              _perf_event_enable(bp);
 +      return 0;
 +}
 +
 +static int perf_event_modify_attr(struct perf_event *event,
 +                                struct perf_event_attr *attr)
 +{
 +      if (event->attr.type != attr->type)
 +              return -EINVAL;
 +
 +      switch (event->attr.type) {
 +      case PERF_TYPE_BREAKPOINT:
 +              return perf_event_modify_breakpoint(event, attr);
 +      default:
 +              /* Place holder for future additions. */
 +              return -EOPNOTSUPP;
 +      }
 +}
 +
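
PERF_EVENT_IOC_MODIFY_ATTRIBUTES is only honoured for breakpoint events at this point. A hedged user-space sketch of how it might be driven is below; the perf_event_open() wrapper, the watched variables and the minimal error handling are simplifications, and the ioctl constant needs uapi headers that already contain this series:

    #include <linux/hw_breakpoint.h>
    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Minimal perf_event_open() wrapper; glibc does not provide one. */
    static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
    {
        return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    static int watched_a, watched_b;    /* two variables to watch in turn */

    int main(void)
    {
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_BREAKPOINT;
        attr.size = sizeof(attr);
        attr.bp_type = HW_BREAKPOINT_W;
        attr.bp_addr = (uint64_t)(unsigned long)&watched_a;
        attr.bp_len = HW_BREAKPOINT_LEN_4;

        int fd = perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0) {
            perror("perf_event_open");
            return 1;
        }

        watched_a = 1;  /* counted by the breakpoint */

        /* Re-point the same event at another address instead of close()+reopen. */
        attr.bp_addr = (uint64_t)(unsigned long)&watched_b;
        if (ioctl(fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &attr) < 0)
            perror("PERF_EVENT_IOC_MODIFY_ATTRIBUTES");

        watched_b = 1;  /* now counted against the new address */

        close(fd);
        return 0;
    }
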
  static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
                          enum event_type_t event_type)
  {
 +      struct perf_event *event, *tmp;
        int is_active = ctx->is_active;
 -      struct perf_event *event;
  
        lockdep_assert_held(&ctx->lock);
  
  
        perf_pmu_disable(ctx->pmu);
        if (is_active & EVENT_PINNED) {
 -              list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 +              list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
                        group_sched_out(event, cpuctx, ctx);
        }
  
        if (is_active & EVENT_FLEXIBLE) {
 -              list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 +              list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
                        group_sched_out(event, cpuctx, ctx);
        }
        perf_pmu_enable(ctx->pmu);
@@@ -3227,116 -3005,53 +3238,116 @@@ static void cpu_ctx_sched_out(struct pe
        ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
  }
  
 -static void
 -ctx_pinned_sched_in(struct perf_event_context *ctx,
 -                  struct perf_cpu_context *cpuctx)
 +static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
 +                            int (*func)(struct perf_event *, void *), void *data)
  {
 -      struct perf_event *event;
 +      struct perf_event **evt, *evt1, *evt2;
 +      int ret;
  
 -      list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 -              if (event->state <= PERF_EVENT_STATE_OFF)
 -                      continue;
 -              if (!event_filter_match(event))
 -                      continue;
 +      evt1 = perf_event_groups_first(groups, -1);
 +      evt2 = perf_event_groups_first(groups, cpu);
 +
 +      while (evt1 || evt2) {
 +              if (evt1 && evt2) {
 +                      if (evt1->group_index < evt2->group_index)
 +                              evt = &evt1;
 +                      else
 +                              evt = &evt2;
 +              } else if (evt1) {
 +                      evt = &evt1;
 +              } else {
 +                      evt = &evt2;
 +              }
  
 -              if (group_can_go_on(event, cpuctx, 1))
 -                      group_sched_in(event, cpuctx, ctx);
 +              ret = func(*evt, data);
 +              if (ret)
 +                      return ret;
  
 -              /*
 -               * If this pinned group hasn't been scheduled,
 -               * put it in error state.
 -               */
 -              if (event->state == PERF_EVENT_STATE_INACTIVE)
 -                      perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 +              *evt = perf_event_groups_next(*evt);
 +      }
 +
 +      return 0;
 +}
 +
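
visit_groups_merge() is essentially a two-way merge of the cpu == -1 subtree and the current CPU's subtree, ordered by group_index. A stand-alone model of that merge, with plain sorted arrays standing in for the rb-tree iterators (indices are illustrative only):

    #include <stdio.h>

    int main(void)
    {
        /* Two streams already sorted by group_index, like the two subtrees. */
        const unsigned long any_cpu[]  = { 3, 9 };      /* events with cpu == -1 */
        const unsigned long this_cpu[] = { 1, 5, 7 };   /* events on this CPU */
        const unsigned int na = sizeof(any_cpu) / sizeof(any_cpu[0]);
        const unsigned int nb = sizeof(this_cpu) / sizeof(this_cpu[0]);
        unsigned int i = 0, j = 0;

        /* Always visit whichever stream has the smaller group_index next. */
        while (i < na || j < nb) {
            if (j >= nb || (i < na && any_cpu[i] < this_cpu[j]))
                printf("visit group_index %lu (any-cpu)\n", any_cpu[i++]);
            else
                printf("visit group_index %lu (this-cpu)\n", this_cpu[j++]);
        }
        return 0;
    }
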
 +struct sched_in_data {
 +      struct perf_event_context *ctx;
 +      struct perf_cpu_context *cpuctx;
 +      int can_add_hw;
 +};
 +
 +static int pinned_sched_in(struct perf_event *event, void *data)
 +{
 +      struct sched_in_data *sid = data;
 +
 +      if (event->state <= PERF_EVENT_STATE_OFF)
 +              return 0;
 +
 +      if (!event_filter_match(event))
 +              return 0;
 +
 +      if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
 +              if (!group_sched_in(event, sid->cpuctx, sid->ctx))
 +                      list_add_tail(&event->active_list, &sid->ctx->pinned_active);
        }
 +
 +      /*
 +       * If this pinned group hasn't been scheduled,
 +       * put it in error state.
 +       */
 +      if (event->state == PERF_EVENT_STATE_INACTIVE)
 +              perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 +
 +      return 0;
 +}
 +
 +static int flexible_sched_in(struct perf_event *event, void *data)
 +{
 +      struct sched_in_data *sid = data;
 +
 +      if (event->state <= PERF_EVENT_STATE_OFF)
 +              return 0;
 +
 +      if (!event_filter_match(event))
 +              return 0;
 +
 +      if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
 +              if (!group_sched_in(event, sid->cpuctx, sid->ctx))
 +                      list_add_tail(&event->active_list, &sid->ctx->flexible_active);
 +              else
 +                      sid->can_add_hw = 0;
 +      }
 +
 +      return 0;
 +}
 +
 +static void
 +ctx_pinned_sched_in(struct perf_event_context *ctx,
 +                  struct perf_cpu_context *cpuctx)
 +{
 +      struct sched_in_data sid = {
 +              .ctx = ctx,
 +              .cpuctx = cpuctx,
 +              .can_add_hw = 1,
 +      };
 +
 +      visit_groups_merge(&ctx->pinned_groups,
 +                         smp_processor_id(),
 +                         pinned_sched_in, &sid);
  }
  
  static void
  ctx_flexible_sched_in(struct perf_event_context *ctx,
                      struct perf_cpu_context *cpuctx)
  {
 -      struct perf_event *event;
 -      int can_add_hw = 1;
 -
 -      list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
 -              /* Ignore events in OFF or ERROR state */
 -              if (event->state <= PERF_EVENT_STATE_OFF)
 -                      continue;
 -              /*
 -               * Listen to the 'cpu' scheduling filter constraint
 -               * of events:
 -               */
 -              if (!event_filter_match(event))
 -                      continue;
 +      struct sched_in_data sid = {
 +              .ctx = ctx,
 +              .cpuctx = cpuctx,
 +              .can_add_hw = 1,
 +      };
  
 -              if (group_can_go_on(event, cpuctx, can_add_hw)) {
 -                      if (group_sched_in(event, cpuctx, ctx))
 -                              can_add_hw = 0;
 -              }
 -      }
 +      visit_groups_merge(&ctx->flexible_groups,
 +                         smp_processor_id(),
 +                         flexible_sched_in, &sid);
  }
  
  static void
@@@ -3417,7 -3132,7 +3428,7 @@@ static void perf_event_context_sched_in
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
 -      if (!list_empty(&ctx->pinned_groups))
 +      if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        perf_event_sched_in(cpuctx, ctx, task);
        perf_pmu_enable(ctx->pmu);
@@@ -3646,81 -3361,55 +3657,81 @@@ static void perf_adjust_freq_unthr_cont
  }
  
  /*
 - * Round-robin a context's events:
 + * Move @event to the tail of the @ctx's eligible events.
   */
 -static void rotate_ctx(struct perf_event_context *ctx)
 +static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
  {
        /*
         * Rotate the first entry last of non-pinned groups. Rotation might be
         * disabled by the inheritance code.
         */
 -      if (!ctx->rotate_disable)
 -              list_rotate_left(&ctx->flexible_groups);
 +      if (ctx->rotate_disable)
 +              return;
 +
 +      perf_event_groups_delete(&ctx->flexible_groups, event);
 +      perf_event_groups_insert(&ctx->flexible_groups, event);
  }
  
 -static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 +static inline struct perf_event *
 +ctx_first_active(struct perf_event_context *ctx)
  {
 +      return list_first_entry_or_null(&ctx->flexible_active,
 +                                      struct perf_event, active_list);
 +}
 +
 +static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 +{
 +      struct perf_event *cpu_event = NULL, *task_event = NULL;
 +      bool cpu_rotate = false, task_rotate = false;
        struct perf_event_context *ctx = NULL;
 -      int rotate = 0;
 +
 +      /*
 +       * Since we run this from IRQ context, nobody can install new
 +       * events, thus the event count values are stable.
 +       */
  
        if (cpuctx->ctx.nr_events) {
                if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
 -                      rotate = 1;
 +                      cpu_rotate = true;
        }
  
        ctx = cpuctx->task_ctx;
        if (ctx && ctx->nr_events) {
                if (ctx->nr_events != ctx->nr_active)
 -                      rotate = 1;
 +                      task_rotate = true;
        }
  
 -      if (!rotate)
 -              goto done;
 +      if (!(cpu_rotate || task_rotate))
 +              return false;
  
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(cpuctx->ctx.pmu);
  
 -      cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 -      if (ctx)
 +      if (task_rotate)
 +              task_event = ctx_first_active(ctx);
 +      if (cpu_rotate)
 +              cpu_event = ctx_first_active(&cpuctx->ctx);
 +
 +      /*
 +       * As per the order given at ctx_resched(), first 'pop' task flexible
 +       * and then, if needed, CPU flexible.
 +       */
 +      if (task_event || (ctx && cpu_event))
                ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
 +      if (cpu_event)
 +              cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
  
 -      rotate_ctx(&cpuctx->ctx);
 -      if (ctx)
 -              rotate_ctx(ctx);
 +      if (task_event)
 +              rotate_ctx(ctx, task_event);
 +      if (cpu_event)
 +              rotate_ctx(&cpuctx->ctx, cpu_event);
  
        perf_event_sched_in(cpuctx, ctx, current);
  
        perf_pmu_enable(cpuctx->ctx.pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 -done:
  
 -      return rotate;
 +      return true;
  }
  
  void perf_event_task_tick(void)
@@@ -3865,7 -3554,7 +3876,7 @@@ static void __perf_event_read(void *inf
  
        pmu->read(event);
  
 -      list_for_each_entry(sub, &event->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, event) {
                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
                        /*
                         * Use sibling's PMU rather than @event's since
@@@ -4039,11 -3728,9 +4050,11 @@@ static void __perf_event_init_context(s
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
        INIT_LIST_HEAD(&ctx->active_ctx_list);
 -      INIT_LIST_HEAD(&ctx->pinned_groups);
 -      INIT_LIST_HEAD(&ctx->flexible_groups);
 +      perf_event_groups_init(&ctx->pinned_groups);
 +      perf_event_groups_init(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
 +      INIT_LIST_HEAD(&ctx->pinned_active);
 +      INIT_LIST_HEAD(&ctx->flexible_active);
        atomic_set(&ctx->refcount, 1);
  }
  
@@@ -4713,7 -4400,7 +4724,7 @@@ static int __perf_read_group_add(struc
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
  
 -      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, leader) {
                values[n++] += perf_event_count(sub);
                if (read_format & PERF_FORMAT_ID)
                        values[n++] = primary_event_id(sub);
@@@ -4907,7 -4594,7 +4918,7 @@@ static void perf_event_for_each(struct 
        event = event->group_leader;
  
        perf_event_for_each_child(event, func);
 -      list_for_each_entry(sibling, &event->sibling_list, group_entry)
 +      for_each_sibling_event(sibling, event)
                perf_event_for_each_child(sibling, func);
  }
  
@@@ -4989,8 -4676,6 +5000,8 @@@ static int perf_event_set_output(struc
                                 struct perf_event *output_event);
  static int perf_event_set_filter(struct perf_event *event, void __user *arg);
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 +static int perf_copy_attr(struct perf_event_attr __user *uattr,
 +                        struct perf_event_attr *attr);
  
  static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
  {
  
        case PERF_EVENT_IOC_QUERY_BPF:
                return perf_event_query_prog_array(event, (void __user *)arg);
 +
 +      case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
 +              struct perf_event_attr new_attr;
 +              int err = perf_copy_attr((struct perf_event_attr __user *)arg,
 +                                       &new_attr);
 +
 +              if (err)
 +                      return err;
 +
 +              return perf_event_modify_attr(event,  &new_attr);
 +      }
        default:
                return -ENOTTY;
        }
@@@ -6069,8 -5743,7 +6080,8 @@@ static void perf_output_read_group(stru
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                values[n++] = running;
  
 -      if (leader != event)
 +      if ((leader != event) &&
 +          (leader->state == PERF_EVENT_STATE_ACTIVE))
                leader->pmu->read(leader);
  
        values[n++] = perf_event_count(leader);
  
        __output_copy(handle, values, n * sizeof(u64));
  
 -      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, leader) {
                n = 0;
  
                if ((sub != event) &&
@@@ -8336,119 -8009,9 +8347,119 @@@ static struct pmu perf_tracepoint = 
        .read           = perf_swevent_read,
  };
  
 +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
 +/*
 + * Flags in config, used by dynamic PMU kprobe and uprobe
 + * The flags should match following PMU_FORMAT_ATTR().
 + *
 + * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 + *                               if not set, create kprobe/uprobe
 + */
 +enum perf_probe_config {
 +      PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
 +};
 +
 +PMU_FORMAT_ATTR(retprobe, "config:0");
 +
 +static struct attribute *probe_attrs[] = {
 +      &format_attr_retprobe.attr,
 +      NULL,
 +};
 +
 +static struct attribute_group probe_format_group = {
 +      .name = "format",
 +      .attrs = probe_attrs,
 +};
 +
 +static const struct attribute_group *probe_attr_groups[] = {
 +      &probe_format_group,
 +      NULL,
 +};
 +#endif
 +
 +#ifdef CONFIG_KPROBE_EVENTS
 +static int perf_kprobe_event_init(struct perf_event *event);
 +static struct pmu perf_kprobe = {
 +      .task_ctx_nr    = perf_sw_context,
 +      .event_init     = perf_kprobe_event_init,
 +      .add            = perf_trace_add,
 +      .del            = perf_trace_del,
 +      .start          = perf_swevent_start,
 +      .stop           = perf_swevent_stop,
 +      .read           = perf_swevent_read,
 +      .attr_groups    = probe_attr_groups,
 +};
 +
 +static int perf_kprobe_event_init(struct perf_event *event)
 +{
 +      int err;
 +      bool is_retprobe;
 +
 +      if (event->attr.type != perf_kprobe.type)
 +              return -ENOENT;
 +      /*
 +       * no branch sampling for probe events
 +       */
 +      if (has_branch_stack(event))
 +              return -EOPNOTSUPP;
 +
 +      is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
 +      err = perf_kprobe_init(event, is_retprobe);
 +      if (err)
 +              return err;
 +
 +      event->destroy = perf_kprobe_destroy;
 +
 +      return 0;
 +}
 +#endif /* CONFIG_KPROBE_EVENTS */
 +
 +#ifdef CONFIG_UPROBE_EVENTS
 +static int perf_uprobe_event_init(struct perf_event *event);
 +static struct pmu perf_uprobe = {
 +      .task_ctx_nr    = perf_sw_context,
 +      .event_init     = perf_uprobe_event_init,
 +      .add            = perf_trace_add,
 +      .del            = perf_trace_del,
 +      .start          = perf_swevent_start,
 +      .stop           = perf_swevent_stop,
 +      .read           = perf_swevent_read,
 +      .attr_groups    = probe_attr_groups,
 +};
 +
 +static int perf_uprobe_event_init(struct perf_event *event)
 +{
 +      int err;
 +      bool is_retprobe;
 +
 +      if (event->attr.type != perf_uprobe.type)
 +              return -ENOENT;
 +      /*
 +       * no branch sampling for probe events
 +       */
 +      if (has_branch_stack(event))
 +              return -EOPNOTSUPP;
 +
 +      is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
 +      err = perf_uprobe_init(event, is_retprobe);
 +      if (err)
 +              return err;
 +
 +      event->destroy = perf_uprobe_destroy;
 +
 +      return 0;
 +}
 +#endif /* CONFIG_UPROBE_EVENTS */
 +
  static inline void perf_tp_register(void)
  {
        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
 +#ifdef CONFIG_KPROBE_EVENTS
 +      perf_pmu_register(&perf_kprobe, "kprobe", -1);
 +#endif
 +#ifdef CONFIG_UPROBE_EVENTS
 +      perf_pmu_register(&perf_uprobe, "uprobe", -1);
 +#endif
  }
  
  static void perf_event_free_filter(struct perf_event *event)
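
With the 'kprobe' and 'uprobe' PMUs registered, probes can be created directly through perf_event_open() instead of going through tracefs. A hedged user-space sketch follows; the sysfs location of the dynamic type id follows the usual event_source layout, config1/config2 carry the probed symbol and offset per the matching uapi update, the probed symbol is only an example, and sufficient privileges are assumed:

    #include <fcntl.h>
    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Minimal perf_event_open() wrapper; glibc does not provide one. */
    static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
    {
        return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
        struct perf_event_attr attr;
        unsigned int type;
        uint64_t count;
        FILE *f;

        /* Dynamic PMUs advertise their type id via sysfs. */
        f = fopen("/sys/bus/event_source/devices/kprobe/type", "r");
        if (!f || fscanf(f, "%u", &type) != 1)
            return 1;
        fclose(f);

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = type;
        attr.config = 0;        /* bit 0 ("retprobe" format) would make it a kretprobe */
        /* config1/config2 carry the probed symbol and offset in the updated uapi
         * (aliased there as kprobe_func/probe_offset); the symbol is an example. */
        attr.config1 = (uint64_t)(unsigned long)"do_sys_open";
        attr.config2 = 0;

        int fd = perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0) {
            perror("perf_event_open");
            return 1;
        }

        ioctl(fd, PERF_EVENT_IOC_RESET, 0);
        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
        close(open("/dev/null", O_RDONLY));     /* trigger some opens */
        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

        if (read(fd, &count, sizeof(count)) == sizeof(count))
            printf("kprobe hits: %llu\n", (unsigned long long)count);
        close(fd);
        return 0;
    }
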
@@@ -8525,32 -8088,13 +8536,32 @@@ static void perf_event_free_bpf_handler
  }
  #endif
  
 +/*
 + * returns true if the event is a tracepoint, or a kprobe/uprobe created
 + * with perf_event_open()
 + */
 +static inline bool perf_event_is_tracing(struct perf_event *event)
 +{
 +      if (event->pmu == &perf_tracepoint)
 +              return true;
 +#ifdef CONFIG_KPROBE_EVENTS
 +      if (event->pmu == &perf_kprobe)
 +              return true;
 +#endif
 +#ifdef CONFIG_UPROBE_EVENTS
 +      if (event->pmu == &perf_uprobe)
 +              return true;
 +#endif
 +      return false;
 +}
 +
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
  {
        bool is_kprobe, is_tracepoint, is_syscall_tp;
        struct bpf_prog *prog;
        int ret;
  
 -      if (event->attr.type != PERF_TYPE_TRACEPOINT)
 +      if (!perf_event_is_tracing(event))
                return perf_event_set_bpf_handler(event, prog_fd);
  
        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
  
  static void perf_event_free_bpf_prog(struct perf_event *event)
  {
 -      if (event->attr.type != PERF_TYPE_TRACEPOINT) {
 +      if (!perf_event_is_tracing(event)) {
                perf_event_free_bpf_handler(event);
                return;
        }
@@@ -9015,36 -8559,47 +9026,36 @@@ fail_clear_files
        return ret;
  }
  
 -static int
 -perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
 -{
 -      struct perf_event_context *ctx = event->ctx;
 -      int ret;
 -
 -      /*
 -       * Beware, here be dragons!!
 -       *
 -       * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
 -       * stuff does not actually need it. So temporarily drop ctx->mutex. As per
 -       * perf_event_ctx_lock() we already have a reference on ctx.
 -       *
 -       * This can result in event getting moved to a different ctx, but that
 -       * does not affect the tracepoint state.
 -       */
 -      mutex_unlock(&ctx->mutex);
 -      ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
 -      mutex_lock(&ctx->mutex);
 -
 -      return ret;
 -}
 -
  static int perf_event_set_filter(struct perf_event *event, void __user *arg)
  {
 -      char *filter_str;
        int ret = -EINVAL;
 -
 -      if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
 -          !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
 -          !has_addr_filter(event))
 -              return -EINVAL;
 +      char *filter_str;
  
        filter_str = strndup_user(arg, PAGE_SIZE);
        if (IS_ERR(filter_str))
                return PTR_ERR(filter_str);
  
 -      if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
 -          event->attr.type == PERF_TYPE_TRACEPOINT)
 -              ret = perf_tracepoint_set_filter(event, filter_str);
 -      else if (has_addr_filter(event))
 +#ifdef CONFIG_EVENT_TRACING
 +      if (perf_event_is_tracing(event)) {
 +              struct perf_event_context *ctx = event->ctx;
 +
 +              /*
 +               * Beware, here be dragons!!
 +               *
 +               * the tracepoint muck will deadlock against ctx->mutex, but
 +               * the tracepoint stuff does not actually need it. So
 +               * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
 +               * already have a reference on ctx.
 +               *
 +               * This can result in event getting moved to a different ctx,
 +               * but that does not affect the tracepoint state.
 +               */
 +              mutex_unlock(&ctx->mutex);
 +              ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
 +              mutex_lock(&ctx->mutex);
 +      } else
 +#endif
 +      if (has_addr_filter(event))
                ret = perf_event_set_addr_filter(event, filter_str);
  
        kfree(filter_str);
@@@ -9897,10 -9452,9 +9908,10 @@@ perf_event_alloc(struct perf_event_att
        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);
  
 -      INIT_LIST_HEAD(&event->group_entry);
        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
 +      INIT_LIST_HEAD(&event->active_list);
 +      init_event_group(event);
        INIT_LIST_HEAD(&event->rb_entry);
        INIT_LIST_HEAD(&event->active_entry);
        INIT_LIST_HEAD(&event->addr_filters.list);
@@@ -10175,9 -9729,6 +10186,9 @@@ static int perf_copy_attr(struct perf_e
                        ret = -EINVAL;
        }
  
 +      if (!attr->sample_max_stack)
 +              attr->sample_max_stack = sysctl_perf_event_max_stack;
 +
        if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
                ret = perf_reg_validate(attr->sample_regs_intr);
  out:
@@@ -10391,6 -9942,9 +10402,6 @@@ SYSCALL_DEFINE5(perf_event_open
            perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                return -EACCES;
  
 -      if (!attr.sample_max_stack)
 -              attr.sample_max_stack = sysctl_perf_event_max_stack;
 -
        /*
         * In cgroup mode, the pid argument is used to pass the fd
         * opened to the cgroup directory in cgroupfs. The cpu argument
                perf_remove_from_context(group_leader, 0);
                put_ctx(gctx);
  
 -              list_for_each_entry(sibling, &group_leader->sibling_list,
 -                                  group_entry) {
 +              for_each_sibling_event(sibling, group_leader) {
                        perf_remove_from_context(sibling, 0);
                        put_ctx(gctx);
                }
                 * By installing siblings first we NO-OP because they're not
                 * reachable through the group lists.
                 */
 -              list_for_each_entry(sibling, &group_leader->sibling_list,
 -                                  group_entry) {
 +              for_each_sibling_event(sibling, group_leader) {
                        perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
                        get_ctx(ctx);
@@@ -11324,7 -10880,7 +11335,7 @@@ static int inherit_group(struct perf_ev
         * case inherit_event() will create individual events, similar to what
         * perf_group_detach() would do anyway.
         */
 -      list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, parent_event) {
                child_ctr = inherit_event(sub, parent, parent_ctx,
                                            child, leader, child_ctx);
                if (IS_ERR(child_ctr))
@@@ -11423,7 -10979,7 +11434,7 @@@ static int perf_event_init_context(stru
         * We dont have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
 -      list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
 +      perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                if (ret)
        parent_ctx->rotate_disable = 1;
        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
  
 -      list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
 +      perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                if (ret)