diff --git a/kernel/events/core.c b/kernel/events/core.c
index 03db40f..928b166 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -405,6 +405,7 @@ static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
 static struct srcu_struct pmus_srcu;
 static cpumask_var_t perf_online_mask;
+static struct kmem_cache *perf_event_cache;
 
 /*
  * perf event paranoia level:
@@ -2204,6 +2205,26 @@ out:
        perf_event__header_size(leader);
 }
 
+static void sync_child_event(struct perf_event *child_event);
+
+static void perf_child_detach(struct perf_event *event)
+{
+       struct perf_event *parent_event = event->parent;
+
+       if (!(event->attach_state & PERF_ATTACH_CHILD))
+               return;
+
+       event->attach_state &= ~PERF_ATTACH_CHILD;
+
+       if (WARN_ON_ONCE(!parent_event))
+               return;
+
+       lockdep_assert_held(&parent_event->child_mutex);
+
+       sync_child_event(event);
+       list_del_init(&event->child_list);
+}
+
 static bool is_orphaned_event(struct perf_event *event)
 {
        return event->state == PERF_EVENT_STATE_DEAD;
@@ -2311,6 +2332,7 @@ group_sched_out(struct perf_event *group_event,
 }
 
 #define DETACH_GROUP   0x01UL
+#define DETACH_CHILD   0x02UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -2334,6 +2356,8 @@ __perf_remove_from_context(struct perf_event *event,
        event_sched_out(event, cpuctx, ctx);
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
+       if (flags & DETACH_CHILD)
+               perf_child_detach(event);
        list_del_event(event, ctx);
 
        if (!ctx->nr_events && ctx->is_active) {
@@ -2362,25 +2386,21 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
 
        lockdep_assert_held(&ctx->mutex);
 
-       event_function_call(event, __perf_remove_from_context, (void *)flags);
-
        /*
-        * The above event_function_call() can NO-OP when it hits
-        * TASK_TOMBSTONE. In that case we must already have been detached
-        * from the context (by perf_event_exit_event()) but the grouping
-        * might still be in-tact.
+        * Because of perf_event_exit_task(), perf_remove_from_context() ought
+        * to work in the face of TASK_TOMBSTONE, unlike every other
+        * event_function_call() user.
         */
-       WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
-       if ((flags & DETACH_GROUP) &&
-           (event->attach_state & PERF_ATTACH_GROUP)) {
-               /*
-                * Since in that case we cannot possibly be scheduled, simply
-                * detach now.
-                */
-               raw_spin_lock_irq(&ctx->lock);
-               perf_group_detach(event);
+       raw_spin_lock_irq(&ctx->lock);
+       if (!ctx->is_active) {
+               __perf_remove_from_context(event, __get_cpu_context(ctx),
+                                          ctx, (void *)flags);
                raw_spin_unlock_irq(&ctx->lock);
+               return;
        }
+       raw_spin_unlock_irq(&ctx->lock);
+
+       event_function_call(event, __perf_remove_from_context, (void *)flags);
 }
 
 /*
@@ -3180,16 +3200,36 @@ static int perf_event_modify_breakpoint(struct perf_event *bp,
 static int perf_event_modify_attr(struct perf_event *event,
                                  struct perf_event_attr *attr)
 {
+       int (*func)(struct perf_event *, struct perf_event_attr *);
+       struct perf_event *child;
+       int err;
+
        if (event->attr.type != attr->type)
                return -EINVAL;
 
        switch (event->attr.type) {
        case PERF_TYPE_BREAKPOINT:
-               return perf_event_modify_breakpoint(event, attr);
+               func = perf_event_modify_breakpoint;
+               break;
        default:
                /* Place holder for future additions. */
                return -EOPNOTSUPP;
        }
+
+       WARN_ON_ONCE(event->ctx->parent_ctx);
+
+       mutex_lock(&event->child_mutex);
+       err = func(event, attr);
+       if (err)
+               goto out;
+       list_for_each_entry(child, &event->child_list, child_list) {
+               err = func(child, attr);
+               if (err)
+                       goto out;
+       }
+out:
+       mutex_unlock(&event->child_mutex);
+       return err;
 }
 
 static void ctx_sched_out(struct perf_event_context *ctx,
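
With this change, PERF_EVENT_IOC_MODIFY_ATTRIBUTES (today only meaningful for
PERF_TYPE_BREAKPOINT) also updates the inherited child events, under the
parent's child_mutex. A minimal user-space sketch of the user-visible effect;
the syscall wrapper, function name and addresses are illustrative, not part of
the patch:

  #include <linux/hw_breakpoint.h>
  #include <linux/perf_event.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                             int cpu, int group_fd, unsigned long flags)
  {
          return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);
  }

  /* Set a write watchpoint on old_addr, then re-point it at new_addr. */
  int watch_and_move(void *old_addr, void *new_addr)
  {
          struct perf_event_attr attr;
          int fd;

          memset(&attr, 0, sizeof(attr));
          attr.size    = sizeof(attr);
          attr.type    = PERF_TYPE_BREAKPOINT;
          attr.bp_type = HW_BREAKPOINT_W;
          attr.bp_addr = (unsigned long)old_addr;
          attr.bp_len  = HW_BREAKPOINT_LEN_4;
          attr.inherit = 1;                /* children get inherited copies */

          fd = perf_event_open(&attr, 0, -1, -1, 0);
          if (fd < 0)
                  return -1;

          /* Previously only the parent event behind fd was modified; with
           * this change the update is also applied to every event on its
           * child_list. */
          attr.bp_addr = (unsigned long)new_addr;
          if (ioctl(fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &attr) < 0)
                  return -1;

          return fd;
  }
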
@@ -4208,6 +4248,57 @@ out:
                put_ctx(clone_ctx);
 }
 
+static void perf_remove_from_owner(struct perf_event *event);
+static void perf_event_exit_event(struct perf_event *event,
+                                 struct perf_event_context *ctx);
+
+/*
+ * Removes all events from the current task that have been marked
+ * remove-on-exec, and feeds their values back to parent events.
+ */
+static void perf_event_remove_on_exec(int ctxn)
+{
+       struct perf_event_context *ctx, *clone_ctx = NULL;
+       struct perf_event *event, *next;
+       LIST_HEAD(free_list);
+       unsigned long flags;
+       bool modified = false;
+
+       ctx = perf_pin_task_context(current, ctxn);
+       if (!ctx)
+               return;
+
+       mutex_lock(&ctx->mutex);
+
+       if (WARN_ON_ONCE(ctx->task != current))
+               goto unlock;
+
+       list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
+               if (!event->attr.remove_on_exec)
+                       continue;
+
+               if (!is_kernel_event(event))
+                       perf_remove_from_owner(event);
+
+               modified = true;
+
+               perf_event_exit_event(event, ctx);
+       }
+
+       raw_spin_lock_irqsave(&ctx->lock, flags);
+       if (modified)
+               clone_ctx = unclone_ctx(ctx);
+       --ctx->pin_count;
+       raw_spin_unlock_irqrestore(&ctx->lock, flags);
+
+unlock:
+       mutex_unlock(&ctx->mutex);
+
+       put_ctx(ctx);
+       if (clone_ctx)
+               put_ctx(clone_ctx);
+}
+
 struct perf_read_data {
        struct perf_event *event;
        bool group;
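
perf_event_remove_on_exec() implements the new perf_event_attr::remove_on_exec
bit (the uapi side of the bit lives outside this file). A hedged user-space
sketch of the intended use; the helper name is illustrative:

  #include <linux/perf_event.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Count user-space instructions in this task, but have the kernel tear the
   * event down at execve(); an inherited child event's count is folded back
   * into its parent at that point. Note the perf_copy_attr() checks added
   * below: remove_on_exec is mutually exclusive with enable_on_exec. */
  int open_until_exec_counter(void)
  {
          struct perf_event_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.size           = sizeof(attr);
          attr.type           = PERF_TYPE_HARDWARE;
          attr.config         = PERF_COUNT_HW_INSTRUCTIONS;
          attr.exclude_kernel = 1;
          attr.remove_on_exec = 1;    /* new bit, assumed from this series */

          return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
  }
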
@@ -4611,7 +4702,7 @@ static void free_event_rcu(struct rcu_head *head)
        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
-       kfree(event);
+       kmem_cache_free(perf_event_cache, event);
 }
 
 static void ring_buffer_attach(struct perf_event *event,
@@ -6301,6 +6392,33 @@ void perf_event_wakeup(struct perf_event *event)
        }
 }
 
+static void perf_sigtrap(struct perf_event *event)
+{
+       struct kernel_siginfo info;
+
+       /*
+        * We'd expect this to only occur if the irq_work is delayed and either
+        * ctx->task or current has changed in the meantime. This can be the
+        * case on architectures that do not implement arch_irq_work_raise().
+        */
+       if (WARN_ON_ONCE(event->ctx->task != current))
+               return;
+
+       /*
+        * perf_pending_event() can race with the task exiting.
+        */
+       if (current->flags & PF_EXITING)
+               return;
+
+       clear_siginfo(&info);
+       info.si_signo = SIGTRAP;
+       info.si_code = TRAP_PERF;
+       info.si_errno = event->attr.type;
+       info.si_perf = event->attr.sig_data;
+       info.si_addr = (void __user *)event->pending_addr;
+       force_sig_info(&info);
+}
+
 static void perf_pending_event_disable(struct perf_event *event)
 {
        int cpu = READ_ONCE(event->pending_disable);
@@ -6310,6 +6428,13 @@ static void perf_pending_event_disable(struct perf_event *event)
 
        if (cpu == smp_processor_id()) {
                WRITE_ONCE(event->pending_disable, -1);
+
+               if (event->attr.sigtrap) {
+                       perf_sigtrap(event);
+                       atomic_set_release(&event->event_limit, 1); /* rearm event */
+                       return;
+               }
+
                perf_event_disable_local(event);
                return;
        }
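
perf_sigtrap() is the delivery half of the new perf_event_attr::sigtrap
interface: on overflow the owning task gets a synchronous SIGTRAP with si_code
TRAP_PERF, si_errno carrying attr.type, si_perf carrying attr.sig_data and
si_addr the address recorded in event->pending_addr; and instead of being
disabled, the event is re-armed (event_limit back to 1) so every overflow
signals. A sketch of the receiving side, assuming libc/uapi headers that
already expose TRAP_PERF and siginfo_t::si_perf from this series:

  #include <signal.h>
  #include <string.h>

  static volatile sig_atomic_t perf_traps;
  static volatile unsigned long long last_sig_data;
  static void *volatile last_addr;

  static void perf_trap_handler(int sig, siginfo_t *info, void *ucontext)
  {
          if (info->si_code != TRAP_PERF)
                  return;

          /* si_errno = attr.type, si_perf = attr.sig_data,
           * si_addr  = address captured at overflow (may be 0). */
          last_sig_data = info->si_perf;
          last_addr     = info->si_addr;
          perf_traps++;
  }

  void install_perf_trap_handler(void)
  {
          struct sigaction sa;

          memset(&sa, 0, sizeof(sa));
          sa.sa_sigaction = perf_trap_handler;
          sa.sa_flags     = SA_SIGINFO;
          sigaction(SIGTRAP, &sa, NULL);
  }
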
@@ -7520,18 +7645,18 @@ void perf_event_exec(void)
        struct perf_event_context *ctx;
        int ctxn;
 
-       rcu_read_lock();
        for_each_task_context_nr(ctxn) {
-               ctx = current->perf_event_ctxp[ctxn];
-               if (!ctx)
-                       continue;
-
                perf_event_enable_on_exec(ctxn);
+               perf_event_remove_on_exec(ctxn);
 
-               perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
-                                  true);
+               rcu_read_lock();
+               ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+               if (ctx) {
+                       perf_iterate_ctx(ctx, perf_event_addr_filters_exec,
+                                        NULL, true);
+               }
+               rcu_read_unlock();
        }
-       rcu_read_unlock();
 }
 
 struct remote_output {
@@ -9012,6 +9137,7 @@ static int __perf_event_overflow(struct perf_event *event,
        if (events && atomic_dec_and_test(&event->event_limit)) {
                ret = 1;
                event->pending_kill = POLL_HUP;
+               event->pending_addr = data->addr;
 
                perf_event_disable_inatomic(event);
        }
@@ -11094,6 +11220,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
 
 static struct pmu *perf_init_event(struct perf_event *event)
 {
+       bool extended_type = false;
        int idx, type, ret;
        struct pmu *pmu;
 
@@ -11112,16 +11239,27 @@ static struct pmu *perf_init_event(struct perf_event *event)
         * are often aliases for PERF_TYPE_RAW.
         */
        type = event->attr.type;
-       if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
-               type = PERF_TYPE_RAW;
+       if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
+               type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
+               if (!type) {
+                       type = PERF_TYPE_RAW;
+               } else {
+                       extended_type = true;
+                       event->attr.config &= PERF_HW_EVENT_MASK;
+               }
+       }
 
 again:
        rcu_read_lock();
        pmu = idr_find(&pmu_idr, type);
        rcu_read_unlock();
        if (pmu) {
+               if (event->attr.type != type && type != PERF_TYPE_RAW &&
+                   !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
+                       goto fail;
+
                ret = perf_try_init_event(pmu, event);
-               if (ret == -ENOENT && event->attr.type != type) {
+               if (ret == -ENOENT && event->attr.type != type && !extended_type) {
                        type = event->attr.type;
                        goto again;
                }
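
The new decoding above gives PERF_TYPE_HARDWARE / PERF_TYPE_HW_CACHE events an
optional PMU selector in the upper 32 bits of attr.config (PERF_PMU_TYPE_SHIFT
and PERF_HW_EVENT_MASK are defined alongside this change, outside this file),
provided the target PMU advertises PERF_PMU_CAP_EXTENDED_HW_TYPE. A hedged
sketch of the encoding from user space; the PMU name is illustrative (e.g.
"cpu_core" or "cpu_atom" on hybrid x86):

  #include <linux/perf_event.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Dynamic PMU id as exported by sysfs. */
  static int read_pmu_type(const char *pmu)
  {
          char path[128];
          int type = -1;
          FILE *f;

          snprintf(path, sizeof(path),
                   "/sys/bus/event_source/devices/%s/type", pmu);
          f = fopen(path, "r");
          if (!f)
                  return -1;
          if (fscanf(f, "%d", &type) != 1)
                  type = -1;
          fclose(f);
          return type;
  }

  /* Generic "cycles", but pinned to one specific hardware PMU. */
  int open_cycles_on(const char *pmu)
  {
          struct perf_event_attr attr;
          int type = read_pmu_type(pmu);

          if (type < 0)
                  return -1;

          memset(&attr, 0, sizeof(attr));
          attr.size   = sizeof(attr);
          attr.type   = PERF_TYPE_HARDWARE;
          /* config[63:32] = PMU type, config[31:0] = generic hardware event */
          attr.config = ((__u64)type << 32) | PERF_COUNT_HW_CPU_CYCLES;

          return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
  }
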
@@ -11142,6 +11280,7 @@ again:
                        goto unlock;
                }
        }
+fail:
        pmu = ERR_PTR(-ENOENT);
 unlock:
        srcu_read_unlock(&pmus_srcu, idx);
@@ -11287,13 +11426,20 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        struct perf_event *event;
        struct hw_perf_event *hwc;
        long err = -EINVAL;
+       int node;
 
        if ((unsigned)cpu >= nr_cpu_ids) {
                if (!task || cpu != -1)
                        return ERR_PTR(-EINVAL);
        }
+       if (attr->sigtrap && !task) {
+               /* Requires a task: avoid signalling random tasks. */
+               return ERR_PTR(-EINVAL);
+       }
 
-       event = kzalloc(sizeof(*event), GFP_KERNEL);
+       node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
+       event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
+                                     node);
        if (!event)
                return ERR_PTR(-ENOMEM);
 
@@ -11338,6 +11484,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
        event->state            = PERF_EVENT_STATE_INACTIVE;
 
+       if (event->attr.sigtrap)
+               atomic_set(&event->event_limit, 1);
+
        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
                /*
@@ -11497,7 +11646,7 @@ err_ns:
                put_pid_ns(event->ns);
        if (event->hw.target)
                put_task_struct(event->hw.target);
-       kfree(event);
+       kmem_cache_free(perf_event_cache, event);
 
        return ERR_PTR(err);
 }
@@ -11610,6 +11759,15 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
            (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
                return -EINVAL;
 
+       if (!attr->inherit && attr->inherit_thread)
+               return -EINVAL;
+
+       if (attr->remove_on_exec && attr->enable_on_exec)
+               return -EINVAL;
+
+       if (attr->sigtrap && !attr->remove_on_exec)
+               return -EINVAL;
+
 out:
        return ret;
 
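
Taken together, the new perf_copy_attr() checks pin down the valid
combinations: inherit_thread needs inherit, remove_on_exec excludes
enable_on_exec, and sigtrap requires remove_on_exec; perf_event_alloc() above
additionally rejects sigtrap events that are not bound to a task. A sketch of
an attr that passes all of these (field names assumed from the uapi side of
this series), modelled on a per-thread SIGTRAP watchpoint:

  #include <linux/hw_breakpoint.h>
  #include <linux/perf_event.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* SIGTRAP (si_code TRAP_PERF) to the current task whenever *addr is
   * written; 'cookie' is returned to the handler in si_perf. */
  int open_sigtrap_watchpoint(void *addr, unsigned long long cookie)
  {
          struct perf_event_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.size           = sizeof(attr);
          attr.type           = PERF_TYPE_BREAKPOINT;
          attr.bp_type        = HW_BREAKPOINT_W;
          attr.bp_addr        = (unsigned long)addr;
          attr.bp_len         = HW_BREAKPOINT_LEN_4;
          attr.sample_period  = 1;    /* signal on every hit */
          attr.sigtrap        = 1;    /* requires remove_on_exec ... */
          attr.remove_on_exec = 1;    /* ... which excludes enable_on_exec */
          attr.inherit        = 1;    /* required for inherit_thread */
          attr.inherit_thread = 1;    /* follow CLONE_THREAD children only */
          attr.sig_data       = cookie;

          return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
  }
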
@@ -11829,12 +11987,12 @@ SYSCALL_DEFINE5(perf_event_open,
                        return err;
        }
 
-       err = security_locked_down(LOCKDOWN_PERF);
-       if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
-               /* REGS_INTR can leak data, lockdown must prevent this */
-               return err;
-
-       err = 0;
+       /* REGS_INTR can leak data, lockdown must prevent this */
+       if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
+               err = security_locked_down(LOCKDOWN_PERF);
+               if (err)
+                       return err;
+       }
 
        /*
         * In cgroup mode, the pid argument is used to pass the fd
@@ -12373,14 +12531,17 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
 }
 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
 
-static void sync_child_event(struct perf_event *child_event,
-                              struct task_struct *child)
+static void sync_child_event(struct perf_event *child_event)
 {
        struct perf_event *parent_event = child_event->parent;
        u64 child_val;
 
-       if (child_event->attr.inherit_stat)
-               perf_event_read_event(child_event, child);
+       if (child_event->attr.inherit_stat) {
+               struct task_struct *task = child_event->ctx->task;
+
+               if (task && task != TASK_TOMBSTONE)
+                       perf_event_read_event(child_event, task);
+       }
 
        child_val = perf_event_count(child_event);
 
@@ -12395,60 +12556,53 @@ static void sync_child_event(struct perf_event *child_event,
 }
 
 static void
-perf_event_exit_event(struct perf_event *child_event,
-                     struct perf_event_context *child_ctx,
-                     struct task_struct *child)
+perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-       struct perf_event *parent_event = child_event->parent;
+       struct perf_event *parent_event = event->parent;
+       unsigned long detach_flags = 0;
 
-       /*
-        * Do not destroy the 'original' grouping; because of the context
-        * switch optimization the original events could've ended up in a
-        * random child task.
-        *
-        * If we were to destroy the original group, all group related
-        * operations would cease to function properly after this random
-        * child dies.
-        *
-        * Do destroy all inherited groups, we don't care about those
-        * and being thorough is better.
-        */
-       raw_spin_lock_irq(&child_ctx->lock);
-       WARN_ON_ONCE(child_ctx->is_active);
+       if (parent_event) {
+               /*
+                * Do not destroy the 'original' grouping; because of the
+                * context switch optimization the original events could've
+                * ended up in a random child task.
+                *
+                * If we were to destroy the original group, all group related
+                * operations would cease to function properly after this
+                * random child dies.
+                *
+                * Do destroy all inherited groups, we don't care about those
+                * and being thorough is better.
+                */
+               detach_flags = DETACH_GROUP | DETACH_CHILD;
+               mutex_lock(&parent_event->child_mutex);
+       }
 
-       if (parent_event)
-               perf_group_detach(child_event);
-       list_del_event(child_event, child_ctx);
-       perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
-       raw_spin_unlock_irq(&child_ctx->lock);
+       perf_remove_from_context(event, detach_flags);
+
+       raw_spin_lock_irq(&ctx->lock);
+       if (event->state > PERF_EVENT_STATE_EXIT)
+               perf_event_set_state(event, PERF_EVENT_STATE_EXIT);
+       raw_spin_unlock_irq(&ctx->lock);
 
        /*
-        * Parent events are governed by their filedesc, retain them.
+        * Child events can be freed.
         */
-       if (!parent_event) {
-               perf_event_wakeup(child_event);
+       if (parent_event) {
+               mutex_unlock(&parent_event->child_mutex);
+               /*
+                * Kick perf_poll() for is_event_hup().
+                */
+               perf_event_wakeup(parent_event);
+               free_event(event);
+               put_event(parent_event);
                return;
        }
-       /*
-        * Child events can be cleaned up.
-        */
-
-       sync_child_event(child_event, child);
 
        /*
-        * Remove this event from the parent's list
-        */
-       WARN_ON_ONCE(parent_event->ctx->parent_ctx);
-       mutex_lock(&parent_event->child_mutex);
-       list_del_init(&child_event->child_list);
-       mutex_unlock(&parent_event->child_mutex);
-
-       /*
-        * Kick perf_poll() for is_event_hup().
+        * Parent events are governed by their filedesc, retain them.
         */
-       perf_event_wakeup(parent_event);
-       free_event(child_event);
-       put_event(parent_event);
+       perf_event_wakeup(event);
 }
 
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
@@ -12505,7 +12659,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
        perf_event_task(child, child_ctx, 0);
 
        list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
-               perf_event_exit_event(child_event, child_ctx, child);
+               perf_event_exit_event(child_event, child_ctx);
 
        mutex_unlock(&child_ctx->mutex);
 
@@ -12765,6 +12919,7 @@ inherit_event(struct perf_event *parent_event,
         */
        raw_spin_lock_irqsave(&child_ctx->lock, flags);
        add_event_to_ctx(child_event, child_ctx);
+       child_event->attach_state |= PERF_ATTACH_CHILD;
        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 
        /*
@@ -12833,12 +12988,15 @@ static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
                   struct perf_event_context *parent_ctx,
                   struct task_struct *child, int ctxn,
-                  int *inherited_all)
+                  u64 clone_flags, int *inherited_all)
 {
        int ret;
        struct perf_event_context *child_ctx;
 
-       if (!event->attr.inherit) {
+       if (!event->attr.inherit ||
+           (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD)) ||
+           /* Do not inherit if sigtrap and signal handlers were cleared. */
+           (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))) {
                *inherited_all = 0;
                return 0;
        }
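
Seen from the fork() side, the new clone_flags checks mean: with inherit = 1
and inherit_thread = 1 only CLONE_THREAD children (new threads) inherit a copy
of the event, while fork()ed processes do not; and a sigtrap event is never
inherited across clone3() with CLONE_CLEAR_SIGHAND, since that child has just
dropped its SIGTRAP handler. A small illustrative sketch (no perf calls, it
only marks which children would be counted):

  #include <pthread.h>
  #include <stdlib.h>
  #include <sys/wait.h>
  #include <unistd.h>

  static void *worker(void *arg)
  {
          /* Same thread group as the event owner: inherited when
           * inherit_thread = 1. */
          return NULL;
  }

  void spawn_both(void)
  {
          pthread_t t;

          pthread_create(&t, NULL, worker, NULL);  /* CLONE_THREAD: inherited */

          if (fork() == 0)                         /* no CLONE_THREAD: */
                  _exit(EXIT_SUCCESS);             /* not inherited    */

          wait(NULL);
          pthread_join(t, NULL);
  }
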
@@ -12870,7 +13028,8 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 /*
  * Initialize the perf_event context in task_struct
  */
-static int perf_event_init_context(struct task_struct *child, int ctxn)
+static int perf_event_init_context(struct task_struct *child, int ctxn,
+                                  u64 clone_flags)
 {
        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
@@ -12910,7 +13069,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
         */
        perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
-                                        child, ctxn, &inherited_all);
+                                        child, ctxn, clone_flags,
+                                        &inherited_all);
                if (ret)
                        goto out_unlock;
        }
@@ -12926,7 +13086,8 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
 
        perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
-                                        child, ctxn, &inherited_all);
+                                        child, ctxn, clone_flags,
+                                        &inherited_all);
                if (ret)
                        goto out_unlock;
        }
@@ -12968,7 +13129,7 @@ out_unlock:
 /*
  * Initialize the perf_event context in task_struct
  */
-int perf_event_init_task(struct task_struct *child)
+int perf_event_init_task(struct task_struct *child, u64 clone_flags)
 {
        int ctxn, ret;
 
@@ -12977,7 +13138,7 @@ int perf_event_init_task(struct task_struct *child)
        INIT_LIST_HEAD(&child->perf_event_list);
 
        for_each_task_context_nr(ctxn) {
-               ret = perf_event_init_context(child, ctxn);
+               ret = perf_event_init_context(child, ctxn, clone_flags);
                if (ret) {
                        perf_event_free_task(child);
                        return ret;
@@ -13130,6 +13291,8 @@ void __init perf_event_init(void)
        ret = init_hw_breakpoint();
        WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
 
+       perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);
+
        /*
         * Build time assertion that we keep the data_head at the intended
         * location.  IOW, validation we got the __reserved[] size right.
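
The KMEM_CACHE() call above is the creation side of the dedicated slab cache
introduced at the top of the diff; perf_event_alloc() allocates from it with
kmem_cache_alloc_node() using the node of the target CPU (or -1, i.e.
NUMA_NO_NODE, for task events) and free_event_rcu() releases through
kmem_cache_free(). A minimal stand-alone sketch of the same pattern, with
purely illustrative names (foo, foo_cache):

  #include <linux/module.h>
  #include <linux/numa.h>
  #include <linux/slab.h>

  struct foo {
          int val;
  };

  static struct kmem_cache *foo_cache;

  static int __init foo_demo_init(void)
  {
          struct foo *f;
          /* perf_event_alloc() uses cpu_to_node(cpu) for CPU-bound events;
           * NUMA_NO_NODE (-1) means "no preference", as for task events. */
          int node = NUMA_NO_NODE;

          /* Cache named after the struct; SLAB_PANIC panics if the cache
           * itself cannot be created, as perf_event_init() relies on. */
          foo_cache = KMEM_CACHE(foo, SLAB_PANIC);

          f = kmem_cache_alloc_node(foo_cache, GFP_KERNEL | __GFP_ZERO, node);
          if (!f) {
                  kmem_cache_destroy(foo_cache);
                  return -ENOMEM;
          }
          f->val = 42;
          kmem_cache_free(foo_cache, f);
          return 0;
  }

  static void __exit foo_demo_exit(void)
  {
          kmem_cache_destroy(foo_cache);
  }

  module_init(foo_demo_init);
  module_exit(foo_demo_exit);
  MODULE_LICENSE("GPL");
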