Merge tag 'trace-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux...
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 8 Aug 2020 01:29:15 +0000 (18:29 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 8 Aug 2020 01:29:15 +0000 (18:29 -0700)
Pull tracing updates from Steven Rostedt:

 - The biggest news is that the tracing ring buffer can now time events
   that interrupted other ring buffer events.

   Before this change, if an interrupt came in while recording another
   event, and that interrupt also had an event, those events would all
   have the same time stamp as the event they interrupted.

   Now, with the new design, those events will each have a unique time
   stamp, correctly showing when the events recorded while interrupting
   another event actually happened.
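
   As a rough illustration of the writer path this affects, here is a
   minimal sketch (not code from this series; it assumes a previously
   allocated struct trace_buffer *buffer):

      struct ring_buffer_event *ev;

      /* Outer event: reserve space, fill it in, then commit it. */
      ev = ring_buffer_lock_reserve(buffer, sizeof(u32));
      if (ev) {
              *(u32 *)ring_buffer_event_data(ev) = 42;
              /*
               * An interrupt firing here may reserve and commit its own
               * event.  That nested event now records its own time stamp
               * instead of inheriting the stamp of the event it
               * interrupted.
               */
              ring_buffer_unlock_commit(buffer, ev);
      }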

 - Bootconfig now has an "override" operator that lets users ship a
   default config and then add options that override those defaults.
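
   For example, with a hypothetical key (a sketch of the documented
   syntax: a plain "=" refuses to redefine a key, while ":=" replaces
   the earlier value):

      # "feature.setting" ends up as "site-override"
      feature.setting = "default"
      feature.setting := "site-override"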

 - A fix was made to properly filter function graph tracing to the
   ftrace PIDs. This came in at the end of the -rc cycle, and needs to
   be backported.
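
   For reference, the PID filtering itself is driven through tracefs
   (shown here only as a usage sketch, assuming tracefs is mounted at
   /sys/kernel/tracing; it is not part of the fix):

      # echo $$ > /sys/kernel/tracing/set_ftrace_pid
      # echo function_graph > /sys/kernel/tracing/current_tracer
      # cat /sys/kernel/tracing/trace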

 - Several clean ups, performance updates, and minor fixes as well.

* tag 'trace-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace: (39 commits)
  tracing: Add trace_array_init_printk() to initialize instance trace_printk() buffers
  kprobes: Fix compiler warning for !CONFIG_KPROBES_ON_FTRACE
  tracing: Use trace_sched_process_free() instead of exit() for pid tracing
  bootconfig: Fix to find the initargs correctly
  Documentation: bootconfig: Add bootconfig override operator
  tools/bootconfig: Add testcases for value override operator
  lib/bootconfig: Add override operator support
  kprobes: Remove show_registers() function prototype
  tracing/uprobe: Remove dead code in trace_uprobe_register()
  kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler
  ftrace: Fix ftrace_trace_task return value
  tracepoint: Use __used attribute definitions from compiler_attributes.h
  tracepoint: Mark __tracepoint_string's __used
  trace : Have tracing buffer info use kvzalloc instead of kzalloc
  tracing: Remove outdated comment in stack handling
  ftrace: Do not let direct or IPMODIFY ftrace_ops be added to module and set trampolines
  ftrace: Setup correct FTRACE_FL_REGS flags for module
  tracing/hwlat: Honor the tracing_cpumask
  tracing/hwlat: Drop the duplicate assignment in start_kthread()
  tracing: Save one trace_event->type by using __TRACE_LAST_TYPE
  ...

include/linux/kprobes.h
init/main.c
kernel/kprobes.c
kernel/trace/ftrace.c
kernel/trace/ring_buffer.c
kernel/trace/trace.c

diff --combined include/linux/kprobes.h
@@@ -227,7 -227,6 +227,6 @@@ extern int arch_prepare_kprobe(struct k
  extern void arch_arm_kprobe(struct kprobe *p);
  extern void arch_disarm_kprobe(struct kprobe *p);
  extern int arch_init_kprobes(void);
- extern void show_registers(struct pt_regs *regs);
  extern void kprobes_inc_nmissed_count(struct kprobe *p);
  extern bool arch_within_kprobe_blacklist(unsigned long addr);
  extern int arch_populate_kprobe_blacklist(void);
@@@ -242,7 -241,6 +241,7 @@@ struct kprobe_insn_cache 
        struct mutex mutex;
        void *(*alloc)(void);   /* allocate insn page */
        void (*free)(void *);   /* free insn page */
 +      const char *sym;        /* symbol for insn pages */
        struct list_head pages; /* list of kprobe_insn_page */
        size_t insn_size;       /* size of instruction slot */
        int nr_garbage;
@@@ -273,10 -271,6 +272,10 @@@ static inline bool is_kprobe_##__name##
  {                                                                     \
        return __is_insn_slot_addr(&kprobe_##__name##_slots, addr);     \
  }
 +#define KPROBE_INSN_PAGE_SYM          "kprobe_insn_page"
 +#define KPROBE_OPTINSN_PAGE_SYM               "kprobe_optinsn_page"
 +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
 +                           unsigned long *value, char *type, char *sym);
  #else /* __ARCH_WANT_KPROBES_INSN_SLOT */
  #define DEFINE_INSN_CACHE_OPS(__name)                                 \
  static inline bool is_kprobe_##__name##_slot(unsigned long addr)      \
@@@ -382,11 -376,6 +381,11 @@@ void dump_kprobe(struct kprobe *kp)
  void *alloc_insn_page(void);
  void free_insn_page(void *page);
  
 +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 +                     char *sym);
 +
 +int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
 +                          char *type, char *sym);
  #else /* !CONFIG_KPROBES: */
  
  static inline int kprobes_built_in(void)
@@@ -449,11 -438,6 +448,11 @@@ static inline bool within_kprobe_blackl
  {
        return true;
  }
 +static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value,
 +                                   char *type, char *sym)
 +{
 +      return -ERANGE;
 +}
  #endif /* CONFIG_KPROBES */
  static inline int disable_kretprobe(struct kretprobe *rp)
  {
diff --combined init/main.c
@@@ -96,7 -96,6 +96,7 @@@
  #include <linux/jump_label.h>
  #include <linux/mem_encrypt.h>
  #include <linux/kcsan.h>
 +#include <linux/init_syscalls.h>
  
  #include <asm/io.h>
  #include <asm/bugs.h>
@@@ -155,7 -154,7 +155,7 @@@ static bool initargs_found
  #endif
  
  static char *execute_command;
 -static char *ramdisk_execute_command;
 +static char *ramdisk_execute_command = "/init";
  
  /*
   * Used to generate warnings if static_key manipulation functions are used
@@@ -388,8 -387,6 +388,6 @@@ static int __init bootconfig_params(cha
  {
        if (strcmp(param, "bootconfig") == 0) {
                bootconfig_found = true;
-       } else if (strcmp(param, "--") == 0) {
-               initargs_found = true;
        }
        return 0;
  }
@@@ -400,19 -397,23 +398,23 @@@ static void __init setup_boot_config(co
        const char *msg;
        int pos;
        u32 size, csum;
-       char *data, *copy;
+       char *data, *copy, *err;
        int ret;
  
        /* Cut out the bootconfig data even if we have no bootconfig option */
        data = get_boot_config_from_initrd(&size, &csum);
  
        strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
-       parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
-                  bootconfig_params);
+       err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
+                        bootconfig_params);
  
-       if (!bootconfig_found)
+       if (IS_ERR(err) || !bootconfig_found)
                return;
  
+       /* parse_args() stops at '--' and returns an address */
+       if (err)
+               initargs_found = true;
        if (!data) {
                pr_err("'bootconfig' found on command line, but no bootconfig found\n");
                return;
@@@ -780,16 -781,14 +782,16 @@@ static void __init report_meminit(void
  {
        const char *stack;
  
 -      if (IS_ENABLED(CONFIG_INIT_STACK_ALL))
 -              stack = "all";
 +      if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
 +              stack = "all(pattern)";
 +      else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
 +              stack = "all(zero)";
        else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
 -              stack = "byref_all";
 +              stack = "byref_all(zero)";
        else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
 -              stack = "byref";
 +              stack = "byref(zero)";
        else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
 -              stack = "__user";
 +              stack = "__user(zero)";
        else
                stack = "off";
  
@@@ -830,7 -829,7 +832,7 @@@ void __init __weak arch_call_rest_init(
        rest_init();
  }
  
 -asmlinkage __visible void __init start_kernel(void)
 +asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
  {
        char *command_line;
        char *after_dashes;
@@@ -1332,7 -1331,9 +1334,7 @@@ static int run_init_process(const char 
        pr_debug("  with environment:\n");
        for (p = envp_init; *p; p++)
                pr_debug("    %s\n", *p);
 -      return do_execve(getname_kernel(init_filename),
 -              (const char __user *const __user *)argv_init,
 -              (const char __user *const __user *)envp_init);
 +      return kernel_execve(init_filename, argv_init, envp_init);
  }
  
  static int try_to_run_init_process(const char *init_filename)
@@@ -1458,19 -1459,15 +1460,19 @@@ static int __ref kernel_init(void *unus
              "See Linux Documentation/admin-guide/init.rst for guidance.");
  }
  
 -void console_on_rootfs(void)
 +/* Open /dev/console, for stdin/stdout/stderr, this should never fail */
 +void __init console_on_rootfs(void)
  {
 -      /* Open the /dev/console as stdin, this should never fail */
 -      if (ksys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
 -              pr_err("Warning: unable to open an initial console.\n");
 +      struct file *file = filp_open("/dev/console", O_RDWR, 0);
  
 -      /* create stdout/stderr */
 -      (void) ksys_dup(0);
 -      (void) ksys_dup(0);
 +      if (IS_ERR(file)) {
 +              pr_err("Warning: unable to open an initial console.\n");
 +              return;
 +      }
 +      init_dup(file);
 +      init_dup(file);
 +      init_dup(file);
 +      fput(file);
  }
  
  static noinline void __init kernel_init_freeable(void)
         * check if there is an early userspace init.  If yes, let it do all
         * the work
         */
 -
 -      if (!ramdisk_execute_command)
 -              ramdisk_execute_command = "/init";
 -
 -      if (ksys_access((const char __user *)
 -                      ramdisk_execute_command, 0) != 0) {
 +      if (init_eaccess(ramdisk_execute_command) != 0) {
                ramdisk_execute_command = NULL;
                prepare_namespace();
        }
diff --combined kernel/kprobes.c
@@@ -35,7 -35,6 +35,7 @@@
  #include <linux/ftrace.h>
  #include <linux/cpu.h>
  #include <linux/jump_label.h>
 +#include <linux/perf_event.h>
  
  #include <asm/sections.h>
  #include <asm/cacheflush.h>
@@@ -124,7 -123,6 +124,7 @@@ struct kprobe_insn_cache kprobe_insn_sl
        .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
        .alloc = alloc_insn_page,
        .free = free_insn_page,
 +      .sym = KPROBE_INSN_PAGE_SYM,
        .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
        .insn_size = MAX_INSN_SIZE,
        .nr_garbage = 0,
@@@ -190,10 -188,6 +190,10 @@@ kprobe_opcode_t *__get_insn_slot(struc
        kip->cache = c;
        list_add_rcu(&kip->list, &c->pages);
        slot = kip->insns;
 +
 +      /* Record the perf ksymbol register event after adding the page */
 +      perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
 +                         PAGE_SIZE, false, c->sym);
  out:
        mutex_unlock(&c->mutex);
        return slot;
@@@ -212,13 -206,6 +212,13 @@@ static int collect_one_slot(struct kpro
                 * next time somebody inserts a probe.
                 */
                if (!list_is_singular(&kip->list)) {
 +                      /*
 +                       * Record perf ksymbol unregister event before removing
 +                       * the page.
 +                       */
 +                      perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
 +                                         (unsigned long)kip->insns, PAGE_SIZE, true,
 +                                         kip->cache->sym);
                        list_del_rcu(&kip->list);
                        synchronize_rcu();
                        kip->cache->free(kip->insns);
@@@ -308,34 -295,12 +308,34 @@@ bool __is_insn_slot_addr(struct kprobe_
        return ret;
  }
  
 +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
 +                           unsigned long *value, char *type, char *sym)
 +{
 +      struct kprobe_insn_page *kip;
 +      int ret = -ERANGE;
 +
 +      rcu_read_lock();
 +      list_for_each_entry_rcu(kip, &c->pages, list) {
 +              if ((*symnum)--)
 +                      continue;
 +              strlcpy(sym, c->sym, KSYM_NAME_LEN);
 +              *type = 't';
 +              *value = (unsigned long)kip->insns;
 +              ret = 0;
 +              break;
 +      }
 +      rcu_read_unlock();
 +
 +      return ret;
 +}
 +
  #ifdef CONFIG_OPTPROBES
  /* For optimized_kprobe buffer */
  struct kprobe_insn_cache kprobe_optinsn_slots = {
        .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
        .alloc = alloc_insn_page,
        .free = free_insn_page,
 +      .sym = KPROBE_OPTINSN_PAGE_SYM,
        .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
        /* .insn_size is initialized later */
        .nr_garbage = 0,
@@@ -598,6 -563,8 +598,6 @@@ static void kprobe_optimizer(struct wor
        mutex_lock(&kprobe_mutex);
        cpus_read_lock();
        mutex_lock(&text_mutex);
 -      /* Lock modules while optimizing kprobes */
 -      mutex_lock(&module_mutex);
  
        /*
         * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
        /* Step 4: Free cleaned kprobes after quiesence period */
        do_free_cleaned_kprobes();
  
 -      mutex_unlock(&module_mutex);
        mutex_unlock(&text_mutex);
        cpus_read_unlock();
  
@@@ -1111,9 -1079,20 +1111,20 @@@ static int disarm_kprobe_ftrace(struct 
                ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
  }
  #else /* !CONFIG_KPROBES_ON_FTRACE */
- #define prepare_kprobe(p)     arch_prepare_kprobe(p)
- #define arm_kprobe_ftrace(p)  (-ENODEV)
- #define disarm_kprobe_ftrace(p)       (-ENODEV)
+ static inline int prepare_kprobe(struct kprobe *p)
+ {
+       return arch_prepare_kprobe(p);
+ }
+ static inline int arm_kprobe_ftrace(struct kprobe *p)
+ {
+       return -ENODEV;
+ }
+ static inline int disarm_kprobe_ftrace(struct kprobe *p)
+ {
+       return -ENODEV;
+ }
  #endif
  
  /* Arm a kprobe with text_mutex */
@@@ -2145,6 -2124,13 +2156,13 @@@ static void kill_kprobe(struct kprobe *
         * the original probed function (which will be freed soon) any more.
         */
        arch_remove_kprobe(p);
+       /*
+        * The module is going away. We should disarm the kprobe which
+        * is using ftrace.
+        */
+       if (kprobe_ftrace(p))
+               disarm_kprobe_ftrace(p);
  }
  
  /* Disable one kprobe */
@@@ -2264,28 -2250,6 +2282,28 @@@ static void kprobe_remove_ksym_blacklis
        kprobe_remove_area_blacklist(entry, entry + 1);
  }
  
 +int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
 +                                 char *type, char *sym)
 +{
 +      return -ERANGE;
 +}
 +
 +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 +                     char *sym)
 +{
 +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
 +      if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym))
 +              return 0;
 +#ifdef CONFIG_OPTPROBES
 +      if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym))
 +              return 0;
 +#endif
 +#endif
 +      if (!arch_kprobe_get_kallsym(&symnum, value, type, sym))
 +              return 0;
 +      return -ERANGE;
 +}
 +
  int __init __weak arch_populate_kprobe_blacklist(void)
  {
        return 0;
@@@ -2502,7 -2466,7 +2520,7 @@@ static void report_probe(struct seq_fil
        else
                kprobe_type = "k";
  
 -      if (!kallsyms_show_value())
 +      if (!kallsyms_show_value(pi->file->f_cred))
                addr = NULL;
  
        if (sym)
@@@ -2594,7 -2558,7 +2612,7 @@@ static int kprobe_blacklist_seq_show(st
         * If /proc/kallsyms is not showing kernel address, we won't
         * show them here either.
         */
 -      if (!kallsyms_show_value())
 +      if (!kallsyms_show_value(m->file->f_cred))
                seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL,
                           (void *)ent->start_addr);
        else
diff --combined kernel/trace/ftrace.c
@@@ -139,9 -139,6 +139,6 @@@ static inline void ftrace_ops_init(stru
  #endif
  }
  
- #define FTRACE_PID_IGNORE     -1
- #define FTRACE_PID_TRACE      -2
  static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
                            struct ftrace_ops *op, struct pt_regs *regs)
  {
@@@ -2388,6 -2385,14 +2385,14 @@@ struct ftrace_ops direct_ops = 
        .flags          = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
                          | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
                          | FTRACE_OPS_FL_PERMANENT,
+       /*
+        * By declaring the main trampoline as this trampoline
+        * it will never have one allocated for it. Allocated
+        * trampolines should not call direct functions.
+        * The direct_ops should only be called by the builtin
+        * ftrace_regs_caller trampoline.
+        */
+       .trampoline     = FTRACE_REGS_ADDR,
  };
  #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
  
@@@ -2764,50 -2769,6 +2769,50 @@@ void __weak arch_ftrace_trampoline_free
  {
  }
  
 +/* List of trace_ops that have allocated trampolines */
 +static LIST_HEAD(ftrace_ops_trampoline_list);
 +
 +static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops)
 +{
 +      lockdep_assert_held(&ftrace_lock);
 +      list_add_rcu(&ops->list, &ftrace_ops_trampoline_list);
 +}
 +
 +static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
 +{
 +      lockdep_assert_held(&ftrace_lock);
 +      list_del_rcu(&ops->list);
 +}
 +
 +/*
 + * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols
 + * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is
 + * not a module.
 + */
 +#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace"
 +#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline"
 +
 +static void ftrace_trampoline_free(struct ftrace_ops *ops)
 +{
 +      if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
 +          ops->trampoline) {
 +              /*
 +               * Record the text poke event before the ksymbol unregister
 +               * event.
 +               */
 +              perf_event_text_poke((void *)ops->trampoline,
 +                                   (void *)ops->trampoline,
 +                                   ops->trampoline_size, NULL, 0);
 +              perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
 +                                 ops->trampoline, ops->trampoline_size,
 +                                 true, FTRACE_TRAMPOLINE_SYM);
 +              /* Remove from kallsyms after the perf events */
 +              ftrace_remove_trampoline_from_kallsyms(ops);
 +      }
 +
 +      arch_ftrace_trampoline_free(ops);
 +}
 +
  static void ftrace_startup_enable(int command)
  {
        if (saved_ftrace_func != ftrace_trace_function) {
@@@ -2978,7 -2939,7 +2983,7 @@@ int ftrace_shutdown(struct ftrace_ops *
                        synchronize_rcu_tasks();
  
   free_ops:
 -              arch_ftrace_trampoline_free(ops);
 +              ftrace_trampoline_free(ops);
        }
  
        return 0;
@@@ -6222,27 -6183,6 +6227,27 @@@ struct ftrace_mod_map 
        unsigned int            num_funcs;
  };
  
 +static int ftrace_get_trampoline_kallsym(unsigned int symnum,
 +                                       unsigned long *value, char *type,
 +                                       char *name, char *module_name,
 +                                       int *exported)
 +{
 +      struct ftrace_ops *op;
 +
 +      list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) {
 +              if (!op->trampoline || symnum--)
 +                      continue;
 +              *value = op->trampoline;
 +              *type = 't';
 +              strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN);
 +              strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN);
 +              *exported = 0;
 +              return 0;
 +      }
 +
 +      return -ERANGE;
 +}
 +
  #ifdef CONFIG_MODULES
  
  #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@@ -6255,8 -6195,19 +6260,19 @@@ static int referenced_filters(struct dy
        int cnt = 0;
  
        for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
-               if (ops_references_rec(ops, rec))
-                   cnt++;
+               if (ops_references_rec(ops, rec)) {
+                       if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT))
+                               continue;
+                       if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+                               continue;
+                       cnt++;
+                       if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+                               rec->flags |= FTRACE_FL_REGS;
+                       if (cnt == 1 && ops->trampoline)
+                               rec->flags |= FTRACE_FL_TRAMP;
+                       else
+                               rec->flags &= ~FTRACE_FL_TRAMP;
+               }
        }
  
        return cnt;
@@@ -6435,8 -6386,8 +6451,8 @@@ void ftrace_module_enable(struct modul
                if (ftrace_start_up)
                        cnt += referenced_filters(rec);
  
-               /* This clears FTRACE_FL_DISABLED */
-               rec->flags = cnt;
+               rec->flags &= ~FTRACE_FL_DISABLED;
+               rec->flags += cnt;
  
                if (ftrace_start_up && cnt) {
                        int failed = __ftrace_replace_code(rec, 1);
@@@ -6579,7 -6530,6 +6595,7 @@@ int ftrace_mod_get_kallsym(unsigned in
  {
        struct ftrace_mod_map *mod_map;
        struct ftrace_mod_func *mod_func;
 +      int ret;
  
        preempt_disable();
        list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
                WARN_ON(1);
                break;
        }
 +      ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
 +                                          module_name, exported);
        preempt_enable();
 -      return -ERANGE;
 +      return ret;
  }
  
  #else
@@@ -6621,18 -6569,6 +6637,18 @@@ allocate_ftrace_mod_map(struct module *
  {
        return NULL;
  }
 +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
 +                         char *type, char *name, char *module_name,
 +                         int *exported)
 +{
 +      int ret;
 +
 +      preempt_disable();
 +      ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
 +                                          module_name, exported);
 +      preempt_enable();
 +      return ret;
 +}
  #endif /* CONFIG_MODULES */
  
  struct ftrace_init_func {
@@@ -6813,24 -6749,7 +6829,24 @@@ void __weak arch_ftrace_update_trampoli
  
  static void ftrace_update_trampoline(struct ftrace_ops *ops)
  {
 +      unsigned long trampoline = ops->trampoline;
 +
        arch_ftrace_update_trampoline(ops);
 +      if (ops->trampoline && ops->trampoline != trampoline &&
 +          (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) {
 +              /* Add to kallsyms before the perf events */
 +              ftrace_add_trampoline_to_kallsyms(ops);
 +              perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
 +                                 ops->trampoline, ops->trampoline_size, false,
 +                                 FTRACE_TRAMPOLINE_SYM);
 +              /*
 +               * Record the perf text poke event after the ksymbol register
 +               * event.
 +               */
 +              perf_event_text_poke((void *)ops->trampoline, NULL, 0,
 +                                   (void *)ops->trampoline,
 +                                   ops->trampoline_size);
 +      }
  }
  
  void ftrace_init_trace_array(struct trace_array *tr)
@@@ -7066,12 -6985,12 +7082,12 @@@ void ftrace_pid_follow_fork(struct trac
        if (enable) {
                register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
                                                  tr);
-               register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+               register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
                                                  tr);
        } else {
                unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
                                                    tr);
-               unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+               unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
                                                    tr);
        }
  }
@@@ -270,6 -270,9 +270,9 @@@ EXPORT_SYMBOL_GPL(ring_buffer_event_dat
  #define for_each_buffer_cpu(buffer, cpu)              \
        for_each_cpu(cpu, buffer->cpumask)
  
+ #define for_each_online_buffer_cpu(buffer, cpu)               \
+       for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
  #define TS_SHIFT      27
  #define TS_MASK               ((1ULL << TS_SHIFT) - 1)
  #define TS_DELTA_TEST (~TS_MASK)
@@@ -413,11 -416,26 +416,26 @@@ struct rb_irq_work 
  struct rb_event_info {
        u64                     ts;
        u64                     delta;
+       u64                     before;
+       u64                     after;
        unsigned long           length;
        struct buffer_page      *tail_page;
        int                     add_timestamp;
  };
  
+ /*
+  * Used for the add_timestamp
+  *  NONE
+  *  EXTEND - wants a time extend
+  *  ABSOLUTE - the buffer requests all events to have absolute time stamps
+  *  FORCE - force a full time stamp.
+  */
+ enum {
+       RB_ADD_STAMP_NONE               = 0,
+       RB_ADD_STAMP_EXTEND             = BIT(1),
+       RB_ADD_STAMP_ABSOLUTE           = BIT(2),
+       RB_ADD_STAMP_FORCE              = BIT(3)
+ };
  /*
   * Used for which event context the event is in.
   *  NMI     = 0
@@@ -435,6 -453,28 +453,28 @@@ enum 
        RB_CTX_MAX
  };
  
+ #if BITS_PER_LONG == 32
+ #define RB_TIME_32
+ #endif
+ /* To test on 64 bit machines */
+ //#define RB_TIME_32
+ #ifdef RB_TIME_32
+ struct rb_time_struct {
+       local_t         cnt;
+       local_t         top;
+       local_t         bottom;
+ };
+ #else
+ #include <asm/local64.h>
+ struct rb_time_struct {
+       local64_t       time;
+ };
+ #endif
+ typedef struct rb_time_struct rb_time_t;
  /*
   * head_page == tail_page && head == tail then buffer is empty.
   */
@@@ -470,7 -510,8 +510,8 @@@ struct ring_buffer_per_cpu 
        size_t                          shortest_full;
        unsigned long                   read;
        unsigned long                   read_bytes;
-       u64                             write_stamp;
+       rb_time_t                       write_stamp;
+       rb_time_t                       before_stamp;
        u64                             read_stamp;
        /* ring buffer pages to update, > 0 to add, < 0 to remove */
        long                            nr_pages_to_update;
@@@ -513,6 -554,189 +554,189 @@@ struct ring_buffer_iter 
        int                             missed_events;
  };
  
+ #ifdef RB_TIME_32
+ /*
+  * On 32 bit machines, local64_t is very expensive. As the ring
+  * buffer doesn't need all the features of a true 64 bit atomic,
+  * on 32 bit, it uses these functions (64 still uses local64_t).
+  *
+  * For the ring buffer, 64 bit required operations for the time is
+  * the following:
+  *
+  *  - Only need 59 bits (uses 60 to make it even).
+  *  - Reads may fail if it interrupted a modification of the time stamp.
+  *      It will succeed if it did not interrupt another write even if
+  *      the read itself is interrupted by a write.
+  *      It returns whether it was successful or not.
+  *
+  *  - Writes always succeed and will overwrite other writes and writes
+  *      that were done by events interrupting the current write.
+  *
+  *  - A write followed by a read of the same time stamp will always succeed,
+  *      but may not contain the same value.
+  *
+  *  - A cmpxchg will fail if it interrupted another write or cmpxchg.
+  *      Other than that, it acts like a normal cmpxchg.
+  *
+  * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
+  *  (bottom being the least significant 30 bits of the 60 bit time stamp).
+  *
+  * The two most significant bits of each half holds a 2 bit counter (0-3).
+  * Each update will increment this counter by one.
+  * When reading the top and bottom, if the two counter bits match then the
+  *  top and bottom together make a valid 60 bit number.
+  */
+ #define RB_TIME_SHIFT 30
+ #define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+ static inline int rb_time_cnt(unsigned long val)
+ {
+       return (val >> RB_TIME_SHIFT) & 3;
+ }
+ static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
+ {
+       u64 val;
+       val = top & RB_TIME_VAL_MASK;
+       val <<= RB_TIME_SHIFT;
+       val |= bottom & RB_TIME_VAL_MASK;
+       return val;
+ }
+ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
+ {
+       unsigned long top, bottom;
+       unsigned long c;
+       /*
+        * If the read is interrupted by a write, then the cnt will
+        * be different. Loop until both top and bottom have been read
+        * without interruption.
+        */
+       do {
+               c = local_read(&t->cnt);
+               top = local_read(&t->top);
+               bottom = local_read(&t->bottom);
+       } while (c != local_read(&t->cnt));
+       *cnt = rb_time_cnt(top);
+       /* If top and bottom counts don't match, this interrupted a write */
+       if (*cnt != rb_time_cnt(bottom))
+               return false;
+       *ret = rb_time_val(top, bottom);
+       return true;
+ }
+ static bool rb_time_read(rb_time_t *t, u64 *ret)
+ {
+       unsigned long cnt;
+       return __rb_time_read(t, ret, &cnt);
+ }
+ static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
+ {
+       return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
+ }
+ static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+ {
+       *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
+       *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+ }
+ static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
+ {
+       val = rb_time_val_cnt(val, cnt);
+       local_set(t, val);
+ }
+ static void rb_time_set(rb_time_t *t, u64 val)
+ {
+       unsigned long cnt, top, bottom;
+       rb_time_split(val, &top, &bottom);
+       /* Writes always succeed with a valid number even if it gets interrupted. */
+       do {
+               cnt = local_inc_return(&t->cnt);
+               rb_time_val_set(&t->top, top, cnt);
+               rb_time_val_set(&t->bottom, bottom, cnt);
+       } while (cnt != local_read(&t->cnt));
+ }
+ static inline bool
+ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
+ {
+       unsigned long ret;
+       ret = local_cmpxchg(l, expect, set);
+       return ret == expect;
+ }
+ static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+ {
+       unsigned long cnt, top, bottom;
+       unsigned long cnt2, top2, bottom2;
+       u64 val;
+       /* The cmpxchg always fails if it interrupted an update */
+        if (!__rb_time_read(t, &val, &cnt2))
+                return false;
+        if (val != expect)
+                return false;
+        cnt = local_read(&t->cnt);
+        if ((cnt & 3) != cnt2)
+                return false;
+        cnt2 = cnt + 1;
+        rb_time_split(val, &top, &bottom);
+        top = rb_time_val_cnt(top, cnt);
+        bottom = rb_time_val_cnt(bottom, cnt);
+        rb_time_split(set, &top2, &bottom2);
+        top2 = rb_time_val_cnt(top2, cnt2);
+        bottom2 = rb_time_val_cnt(bottom2, cnt2);
+       if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
+               return false;
+       if (!rb_time_read_cmpxchg(&t->top, top, top2))
+               return false;
+       if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
+               return false;
+       return true;
+ }
+ #else /* 64 bits */
+ /* local64_t always succeeds */
+ static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+ {
+       *ret = local64_read(&t->time);
+       return true;
+ }
+ static void rb_time_set(rb_time_t *t, u64 val)
+ {
+       local64_set(&t->time, val);
+ }
+ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+ {
+       u64 val;
+       val = local64_cmpxchg(&t->time, expect, set);
+       return val == expect;
+ }
+ #endif
  /**
   * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
   * @buffer: The ring_buffer to get the number of pages from
@@@ -577,7 -801,7 +801,7 @@@ static void rb_wake_up_waiters(struct i
   */
  int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
  {
 -      struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
 +      struct ring_buffer_per_cpu *cpu_buffer;
        DEFINE_WAIT(wait);
        struct rb_irq_work *work;
        int ret = 0;
@@@ -746,8 -970,16 +970,16 @@@ __poll_t ring_buffer_poll_wait(struct t
  
  static inline u64 rb_time_stamp(struct trace_buffer *buffer)
  {
+       u64 ts;
+       /* Skip retpolines :-( */
+       if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+               ts = trace_clock_local();
+       else
+               ts = buffer->clock();
        /* shift to debug/test normalization and TIME_EXTENTS */
-       return buffer->clock() << DEBUG_SHIFT;
+       return ts << DEBUG_SHIFT;
  }
  
  u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
@@@ -2372,8 -2604,8 +2604,8 @@@ rb_move_tail(struct ring_buffer_per_cp
        return NULL;
  }
  
- /* Slow path, do not inline */
- static noinline struct ring_buffer_event *
+ /* Slow path */
+ static struct ring_buffer_event *
  rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
  {
        if (abs)
  static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
                                     struct ring_buffer_event *event);
  
+ #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ static inline bool sched_clock_stable(void)
+ {
+       return true;
+ }
+ #endif
+ static void
+ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+                  struct rb_event_info *info)
+ {
+       u64 write_stamp;
+       WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
+                 (unsigned long long)info->delta,
+                 (unsigned long long)info->ts,
+                 (unsigned long long)info->before,
+                 (unsigned long long)info->after,
+                 (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+                 sched_clock_stable() ? "" :
+                 "If you just came from a suspend/resume,\n"
+                 "please switch to the trace global clock:\n"
+                 "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
+                 "or add trace_clock=global to the kernel command line\n");
+ }
+ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+                                     struct ring_buffer_event **event,
+                                     struct rb_event_info *info,
+                                     u64 *delta,
+                                     unsigned int *length)
+ {
+       bool abs = info->add_timestamp &
+               (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
+       if (unlikely(info->delta > (1ULL << 59))) {
+               /* did the clock go backwards */
+               if (info->before == info->after && info->before > info->ts) {
+                       /* not interrupted */
+                       static int once;
+                       /*
+                        * This is possible with a recalibrating of the TSC.
+                        * Do not produce a call stack, but just report it.
+                        */
+                       if (!once) {
+                               once++;
+                               pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
+                                       info->before, info->ts);
+                       }
+               } else
+                       rb_check_timestamp(cpu_buffer, info);
+               if (!abs)
+                       info->delta = 0;
+       }
+       *event = rb_add_time_stamp(*event, info->delta, abs);
+       *length -= RB_LEN_TIME_EXTEND;
+       *delta = 0;
+ }
  /**
   * rb_update_event - update event type and data
   * @cpu_buffer: The per cpu buffer of the @event
@@@ -2416,21 -2708,12 +2708,12 @@@ rb_update_event(struct ring_buffer_per_
        unsigned length = info->length;
        u64 delta = info->delta;
  
-       /* Only a commit updates the timestamp */
-       if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
-               delta = 0;
        /*
         * If we need to add a timestamp, then we
         * add it to the start of the reserved space.
         */
-       if (unlikely(info->add_timestamp)) {
-               bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
-               event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
-               length -= RB_LEN_TIME_EXTEND;
-               delta = 0;
-       }
+       if (unlikely(info->add_timestamp))
+               rb_add_timestamp(cpu_buffer, &event, info, &delta, &length);
  
        event->time_delta = delta;
        length -= RB_EVNT_HDR_SIZE;
@@@ -2473,12 -2756,38 +2756,38 @@@ static unsigned rb_calculate_event_leng
        return length;
  }
  
- #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- static inline bool sched_clock_stable(void)
+ static __always_inline bool
+ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+                  struct ring_buffer_event *event)
  {
-       return true;
+       unsigned long addr = (unsigned long)event;
+       unsigned long index;
+       index = rb_event_index(event);
+       addr &= PAGE_MASK;
+       return cpu_buffer->commit_page->page == (void *)addr &&
+               rb_commit_index(cpu_buffer) == index;
+ }
+ static u64 rb_time_delta(struct ring_buffer_event *event)
+ {
+       switch (event->type_len) {
+       case RINGBUF_TYPE_PADDING:
+               return 0;
+       case RINGBUF_TYPE_TIME_EXTEND:
+               return ring_buffer_event_time_stamp(event);
+       case RINGBUF_TYPE_TIME_STAMP:
+               return 0;
+       case RINGBUF_TYPE_DATA:
+               return event->time_delta;
+       default:
+               return 0;
+       }
  }
- #endif
  
  static inline int
  rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
        struct buffer_page *bpage;
        unsigned long index;
        unsigned long addr;
+       u64 write_stamp;
+       u64 delta;
  
        new_index = rb_event_index(event);
        old_index = new_index + rb_event_ts_length(event);
  
        bpage = READ_ONCE(cpu_buffer->tail_page);
  
+       delta = rb_time_delta(event);
+       if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
+               return 0;
+       /* Make sure the write stamp is read before testing the location */
+       barrier();
        if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
                unsigned long write_mask =
                        local_read(&bpage->write) & ~RB_WRITE_MASK;
                unsigned long event_length = rb_event_length(event);
+               /* Something came in, can't discard */
+               if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
+                                      write_stamp, write_stamp - delta))
+                       return 0;
+               /*
+                * If an event were to come in now, it would see that the
+                * write_stamp and the before_stamp are different, and assume
+                * that this event just added itself before updating
+                * the write stamp. The interrupting event will fix the
+                * write stamp for us, and use the before stamp as its delta.
+                */
                /*
                 * This is on the tail page. It is possible that
                 * a write could come in and move the tail page
@@@ -2551,10 -2884,6 +2884,6 @@@ rb_set_commit_to_write(struct ring_buff
                local_set(&cpu_buffer->commit_page->page->commit,
                          rb_page_write(cpu_buffer->commit_page));
                rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
-               /* Only update the write stamp if the page has an event */
-               if (rb_page_write(cpu_buffer->commit_page))
-                       cpu_buffer->write_stamp =
-                               cpu_buffer->commit_page->page->time_stamp;
                /* add barrier to keep gcc from optimizing too much */
                barrier();
        }
@@@ -2626,54 -2955,10 +2955,10 @@@ static inline void rb_event_discard(str
                event->time_delta = 1;
  }
  
- static __always_inline bool
- rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
-                  struct ring_buffer_event *event)
- {
-       unsigned long addr = (unsigned long)event;
-       unsigned long index;
-       index = rb_event_index(event);
-       addr &= PAGE_MASK;
-       return cpu_buffer->commit_page->page == (void *)addr &&
-               rb_commit_index(cpu_buffer) == index;
- }
- static __always_inline void
- rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-                     struct ring_buffer_event *event)
- {
-       u64 delta;
-       /*
-        * The event first in the commit queue updates the
-        * time stamp.
-        */
-       if (rb_event_is_commit(cpu_buffer, event)) {
-               /*
-                * A commit event that is first on a page
-                * updates the write timestamp with the page stamp
-                */
-               if (!rb_event_index(event))
-                       cpu_buffer->write_stamp =
-                               cpu_buffer->commit_page->page->time_stamp;
-               else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
-                       delta = ring_buffer_event_time_stamp(event);
-                       cpu_buffer->write_stamp += delta;
-               } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
-                       delta = ring_buffer_event_time_stamp(event);
-                       cpu_buffer->write_stamp = delta;
-               } else
-                       cpu_buffer->write_stamp += event->time_delta;
-       }
- }
  static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
                      struct ring_buffer_event *event)
  {
        local_inc(&cpu_buffer->entries);
-       rb_update_write_stamp(cpu_buffer, event);
        rb_end_commit(cpu_buffer);
  }
  
@@@ -2864,58 -3149,138 +3149,138 @@@ int ring_buffer_unlock_commit(struct tr
  }
  EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
  
- static noinline void
- rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
-                   struct rb_event_info *info)
- {
-       WARN_ONCE(info->delta > (1ULL << 59),
-                 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
-                 (unsigned long long)info->delta,
-                 (unsigned long long)info->ts,
-                 (unsigned long long)cpu_buffer->write_stamp,
-                 sched_clock_stable() ? "" :
-                 "If you just came from a suspend/resume,\n"
-                 "please switch to the trace global clock:\n"
-                 "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
-                 "or add trace_clock=global to the kernel command line\n");
-       info->add_timestamp = 1;
- }
  static struct ring_buffer_event *
  __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                  struct rb_event_info *info)
  {
        struct ring_buffer_event *event;
        struct buffer_page *tail_page;
-       unsigned long tail, write;
-       /*
-        * If the time delta since the last event is too big to
-        * hold in the time field of the event, then we append a
-        * TIME EXTEND event ahead of the data event.
-        */
-       if (unlikely(info->add_timestamp))
-               info->length += RB_LEN_TIME_EXTEND;
+       unsigned long tail, write, w;
+       bool a_ok;
+       bool b_ok;
  
        /* Don't let the compiler play games with cpu_buffer->tail_page */
        tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
-       write = local_add_return(info->length, &tail_page->write);
+  /*A*/        w = local_read(&tail_page->write) & RB_WRITE_MASK;
+       barrier();
+       b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+       a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+       barrier();
+       info->ts = rb_time_stamp(cpu_buffer->buffer);
+       if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) {
+               info->delta = info->ts;
+       } else {
+               /*
+                * If interrupting an event time update, we may need an
+                * absolute timestamp.
+                * Don't bother if this is the start of a new page (w == 0).
+                */
+               if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+                       info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
+                       info->length += RB_LEN_TIME_EXTEND;
+               } else {
+                       info->delta = info->ts - info->after;
+                       if (unlikely(test_time_stamp(info->delta))) {
+                               info->add_timestamp |= RB_ADD_STAMP_EXTEND;
+                               info->length += RB_LEN_TIME_EXTEND;
+                       }
+               }
+       }
+  /*B*/        rb_time_set(&cpu_buffer->before_stamp, info->ts);
+  /*C*/        write = local_add_return(info->length, &tail_page->write);
  
        /* set write to only the index of the write */
        write &= RB_WRITE_MASK;
        tail = write - info->length;
  
+       /* See if we shot pass the end of this buffer page */
+       if (unlikely(write > BUF_PAGE_SIZE)) {
+               if (tail != w) {
+                       /* before and after may now be different, fix it up */
+                       b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+                       a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+                       if (a_ok && b_ok && info->before != info->after)
+                               (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
+                                                     info->before, info->after);
+               }
+               return rb_move_tail(cpu_buffer, tail, info);
+       }
+       if (likely(tail == w)) {
+               u64 save_before;
+               bool s_ok;
+               /* Nothing interrupted us between A and C */
+  /*D*/                rb_time_set(&cpu_buffer->write_stamp, info->ts);
+               barrier();
+  /*E*/                s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
+               RB_WARN_ON(cpu_buffer, !s_ok);
+               if (likely(!(info->add_timestamp &
+                            (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
+                       /* This did not interrupt any time update */
+                       info->delta = info->ts - info->after;
+               else
+                       /* Just use full timestamp for interrupting event */
+                       info->delta = info->ts;
+               barrier();
+               if (unlikely(info->ts != save_before)) {
+                       /* SLOW PATH - Interrupted between C and E */
+                       a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+                       RB_WARN_ON(cpu_buffer, !a_ok);
+                       /* Write stamp must only go forward */
+                       if (save_before > info->after) {
+                               /*
+                                * We do not care about the result, only that
+                                * it gets updated atomically.
+                                */
+                               (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
+                                                     info->after, save_before);
+                       }
+               }
+       } else {
+               u64 ts;
+               /* SLOW PATH - Interrupted between A and C */
+               a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+               /* Was interrupted before here, write_stamp must be valid */
+               RB_WARN_ON(cpu_buffer, !a_ok);
+               ts = rb_time_stamp(cpu_buffer->buffer);
+               barrier();
+  /*E*/                if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+                   info->after < ts) {
+                       /* Nothing came after this event between C and E */
+                       info->delta = ts - info->after;
+                       (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
+                                             info->after, info->ts);
+                       info->ts = ts;
+               } else {
+                       /*
+                        * Interrupted between C and E:
+                        * Lost the previous events time stamp. Just set the
+                        * delta to zero, and this will be the same time as
+                        * the event this event interrupted. And the events that
+                        * came after this will still be correct (as they would
+                        * have built their delta on the previous event.
+                        * have built their delta on the previous event).
+                       info->delta = 0;
+               }
+               info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
+       }
        /*
         * If this is the first commit on the page, then it has the same
         * timestamp as the page itself.
         */
-       if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
+       if (unlikely(!tail && !(info->add_timestamp &
+                               (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
                info->delta = 0;
  
-       /* See if we shot pass the end of this buffer page */
-       if (unlikely(write > BUF_PAGE_SIZE))
-               return rb_move_tail(cpu_buffer, tail, info);
        /* We reserved something on the buffer */
  
        event = __rb_page_index(tail_page, tail);
         * If this is the first commit on the page, then update
         * its timestamp.
         */
-       if (!tail)
+       if (unlikely(!tail))
                tail_page->page->time_stamp = info->ts;
  
        /* account for these added bytes */
@@@ -2944,9 -3309,10 +3309,10 @@@ rb_reserve_next_event(struct trace_buff
        struct ring_buffer_event *event;
        struct rb_event_info info;
        int nr_loops = 0;
-       u64 diff;
+       int add_ts_default;
  
        rb_start_commit(cpu_buffer);
+       /* The commit page can not change after this */
  
  #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
        /*
  #endif
  
        info.length = rb_calculate_event_length(length);
+       if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
+               add_ts_default = RB_ADD_STAMP_ABSOLUTE;
+               info.length += RB_LEN_TIME_EXTEND;
+       } else {
+               add_ts_default = RB_ADD_STAMP_NONE;
+       }
   again:
-       info.add_timestamp = 0;
+       info.add_timestamp = add_ts_default;
        info.delta = 0;
  
        /*
        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
                goto out_fail;
  
-       info.ts = rb_time_stamp(cpu_buffer->buffer);
-       diff = info.ts - cpu_buffer->write_stamp;
-       /* make sure this diff is calculated here */
-       barrier();
-       if (ring_buffer_time_stamp_abs(buffer)) {
-               info.delta = info.ts;
-               rb_handle_timestamp(cpu_buffer, &info);
-       } else /* Did the write stamp get updated already? */
-               if (likely(info.ts >= cpu_buffer->write_stamp)) {
-               info.delta = diff;
-               if (unlikely(test_time_stamp(info.delta)))
-                       rb_handle_timestamp(cpu_buffer, &info);
-       }
        event = __rb_reserve_next(cpu_buffer, &info);
  
        if (unlikely(PTR_ERR(event) == -EAGAIN)) {
-               if (info.add_timestamp)
+               if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND))
                        info.length -= RB_LEN_TIME_EXTEND;
                goto again;
        }
  
-       if (!event)
-               goto out_fail;
-       return event;
+       if (likely(event))
+               return event;
   out_fail:
        rb_end_commit(cpu_buffer);
        return NULL;
@@@ -3154,11 -3509,6 +3509,6 @@@ void ring_buffer_discard_commit(struct 
        if (rb_try_to_discard(cpu_buffer, event))
                goto out;
  
-       /*
-        * The commit is still visible by the reader, so we
-        * must still update the timestamp.
-        */
-       rb_update_write_stamp(cpu_buffer, event);
   out:
        rb_end_commit(cpu_buffer);
  
@@@ -4475,8 -4825,8 +4825,8 @@@ rb_reset_cpu(struct ring_buffer_per_cp
        cpu_buffer->read = 0;
        cpu_buffer->read_bytes = 0;
  
-       cpu_buffer->write_stamp = 0;
-       cpu_buffer->read_stamp = 0;
+       rb_time_set(&cpu_buffer->write_stamp, 0);
+       rb_time_set(&cpu_buffer->before_stamp, 0);
  
        cpu_buffer->lost_events = 0;
        cpu_buffer->last_overrun = 0;
        rb_head_page_activate(cpu_buffer);
  }
  
+ /* Must have disabled the cpu buffer then done a synchronize_rcu */
+ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+ {
+       unsigned long flags;
+       raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+       if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
+               goto out;
+       arch_spin_lock(&cpu_buffer->lock);
+       rb_reset_cpu(cpu_buffer);
+       arch_spin_unlock(&cpu_buffer->lock);
+  out:
+       raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ }
  /**
   * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
   * @buffer: The ring buffer to reset a per cpu buffer of
  void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
  {
        struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
-       unsigned long flags;
  
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
                return;
        /* Make sure all commits have finished */
        synchronize_rcu();
  
-       raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+       reset_disabled_cpu_buffer(cpu_buffer);
  
-       if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
-               goto out;
+       atomic_dec(&cpu_buffer->record_disabled);
+       atomic_dec(&cpu_buffer->resize_disabled);
+ }
+ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
  
-       arch_spin_lock(&cpu_buffer->lock);
+ /**
+  * ring_buffer_reset_online_cpus - reset all online per-CPU buffers of a ring buffer
+  * @buffer: The ring buffer to reset the per-CPU buffers of
+  */
+ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
+ {
+       struct ring_buffer_per_cpu *cpu_buffer;
+       int cpu;
  
-       rb_reset_cpu(cpu_buffer);
+       for_each_online_buffer_cpu(buffer, cpu) {
+               cpu_buffer = buffer->buffers[cpu];
  
-       arch_spin_unlock(&cpu_buffer->lock);
+               atomic_inc(&cpu_buffer->resize_disabled);
+               atomic_inc(&cpu_buffer->record_disabled);
+       }
  
-  out:
-       raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+       /* Make sure all commits have finished */
+       synchronize_rcu();
  
-       atomic_dec(&cpu_buffer->record_disabled);
-       atomic_dec(&cpu_buffer->resize_disabled);
+       for_each_online_buffer_cpu(buffer, cpu) {
+               cpu_buffer = buffer->buffers[cpu];
+               reset_disabled_cpu_buffer(cpu_buffer);
+               atomic_dec(&cpu_buffer->record_disabled);
+               atomic_dec(&cpu_buffer->resize_disabled);
+       }
  }
- EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
  
  /**
   * ring_buffer_reset - reset a ring buffer
   */
  void ring_buffer_reset(struct trace_buffer *buffer)
  {
+       struct ring_buffer_per_cpu *cpu_buffer;
        int cpu;
  
-       for_each_buffer_cpu(buffer, cpu)
-               ring_buffer_reset_cpu(buffer, cpu);
+       for_each_buffer_cpu(buffer, cpu) {
+               cpu_buffer = buffer->buffers[cpu];
+               atomic_inc(&cpu_buffer->resize_disabled);
+               atomic_inc(&cpu_buffer->record_disabled);
+       }
+       /* Make sure all commits have finished */
+       synchronize_rcu();
+       for_each_buffer_cpu(buffer, cpu) {
+               cpu_buffer = buffer->buffers[cpu];
+               reset_disabled_cpu_buffer(cpu_buffer);
+               atomic_dec(&cpu_buffer->record_disabled);
+               atomic_dec(&cpu_buffer->resize_disabled);
+       }
  }
  EXPORT_SYMBOL_GPL(ring_buffer_reset);
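The reworked reset paths above share one design point: every per-cpu buffer is disabled first, a single synchronize_rcu() then covers all of them, and only afterwards is each buffer reset, so a caller pays one RCU grace period regardless of the number of CPUs. A minimal caller sketch of the old versus the new pattern follows; only ring_buffer_reset_cpu() and ring_buffer_reset_online_cpus() are real, the wrapper function is hypothetical.

#include <linux/ring_buffer.h>

/* Hypothetical helper: clear every online per-cpu buffer of an instance. */
static void example_reset_all(struct trace_buffer *buffer)
{
	/*
	 * Old pattern: one synchronize_rcu() per CPU, hidden inside each
	 * ring_buffer_reset_cpu() call:
	 *
	 *	int cpu;
	 *	for_each_online_cpu(cpu)
	 *		ring_buffer_reset_cpu(buffer, cpu);
	 */

	/* New pattern: disable all buffers, wait once, reset them all. */
	ring_buffer_reset_online_cpus(buffer);
}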
  
diff --combined kernel/trace/trace.c
@@@ -1543,7 -1543,8 +1543,7 @@@ static void latency_fsnotify_workfn(str
  {
        struct trace_array *tr = container_of(work, struct trace_array,
                                              fsnotify_work);
 -      fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
 -               tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 +      fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
  }
  
  static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
@@@ -2002,7 -2003,6 +2002,6 @@@ static void tracing_reset_cpu(struct ar
  void tracing_reset_online_cpus(struct array_buffer *buf)
  {
        struct trace_buffer *buffer = buf->buffer;
-       int cpu;
  
        if (!buffer)
                return;
  
        buf->time_start = buffer_ftrace_now(buf, buf->cpu);
  
-       for_each_online_cpu(cpu)
-               ring_buffer_reset_cpu(buffer, cpu);
+       ring_buffer_reset_online_cpus(buffer);
  
        ring_buffer_record_enable(buffer);
  }
@@@ -2931,12 -2930,6 +2929,6 @@@ static void __ftrace_trace_stack(struc
                skip++;
  #endif
  
-       /*
-        * Since events can happen in NMIs there's no safe way to
-        * use the per cpu ftrace_stacks. We reserve it and if an interrupt
-        * or NMI comes in, it will just have to use the default
-        * FTRACE_STACK_SIZE.
-        */
        preempt_disable_notrace();
  
        stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
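The hunk above drops an outdated comment; the remaining code reserves one of several per-cpu stack slots per nesting level, which is why the removed comment (written for an older single-slot scheme) no longer applies. Below is a generic sketch of that reservation pattern; the slot structure, its dimensions, and the function are hypothetical, only the per-cpu and preemption primitives are real.

#include <linux/percpu.h>
#include <linux/preempt.h>

#define EXAMPLE_NR_SLOTS	4	/* hypothetical maximum nesting depth */

struct example_slot {
	unsigned long entries[64];	/* hypothetical per-level scratch space */
};

struct example_slots {
	struct example_slot slot[EXAMPLE_NR_SLOTS];
};

static DEFINE_PER_CPU(struct example_slots, example_slots);
static DEFINE_PER_CPU(int, example_reserve);

static void example_use_slot(void)
{
	struct example_slot *s;
	int stackidx;

	preempt_disable_notrace();

	/* Each nesting level on this CPU claims the next free slot. */
	stackidx = __this_cpu_inc_return(example_reserve) - 1;
	if (stackidx < EXAMPLE_NR_SLOTS) {
		s = &this_cpu_ptr(&example_slots)->slot[stackidx];
		s->entries[0] = 0;	/* ... fill the reserved slot ... */
	}

	/* Release the slot for the next user at this nesting level. */
	__this_cpu_dec(example_reserve);
	preempt_enable_notrace();
}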
@@@ -3136,6 -3129,9 +3128,9 @@@ static int alloc_percpu_trace_buffer(vo
  {
        struct trace_buffer_struct *buffers;
  
+       if (trace_percpu_buffer)
+               return 0;
        buffers = alloc_percpu(struct trace_buffer_struct);
        if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
                return -ENOMEM;
@@@ -3338,6 -3334,26 +3333,26 @@@ int trace_array_vprintk(struct trace_ar
        return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
  }
  
+ /**
+  * trace_array_printk - Print a message to a specific instance
+  * @tr: The instance trace_array descriptor
+  * @ip: The instruction pointer that this is called from.
+  * @fmt: The format to print (printf format)
+  *
+  * If a subsystem sets up its own instance, it may printk strings into
+  * its tracing instance buffer using this function. Note, this function
+  * will not write into the top level buffer (use trace_printk() for
+  * that), as the top level buffer should only hold events that can be
+  * individually disabled. trace_printk() is meant only for debugging a
+  * kernel, and should never be incorporated into normal use.
+  *
+  * trace_array_printk() can be used, as it will not add noise to the
+  * top level tracing buffer.
+  *
+  * Note, trace_array_init_printk() must be called on @tr before this
+  * can be used.
+  */
  __printf(3, 0)
  int trace_array_printk(struct trace_array *tr,
                       unsigned long ip, const char *fmt, ...)
        int ret;
        va_list ap;
  
-       if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
-               return 0;
        if (!tr)
                return -ENOENT;
  
+       /* This is only allowed for created instances */
+       if (tr == &global_trace)
+               return 0;
+       if (!(tr->trace_flags & TRACE_ITER_PRINTK))
+               return 0;
        va_start(ap, fmt);
        ret = trace_array_vprintk(tr, ip, fmt, ap);
        va_end(ap);
  }
  EXPORT_SYMBOL_GPL(trace_array_printk);
  
+ /**
+  * trace_array_init_printk - Initialize buffers for trace_array_printk()
+  * @tr: The trace array to initialize the buffers for
+  *
+  * As trace_array_printk() only writes into instances, calls to it are
+  * fine to leave in the kernel (unlike trace_printk()). This needs to be
+  * called before trace_array_printk() can be used on a trace_array.
+  */
+ int trace_array_init_printk(struct trace_array *tr)
+ {
+       if (!tr)
+               return -ENOENT;
+       /* This is only allowed for created instances */
+       if (tr == &global_trace)
+               return -EINVAL;
+       return alloc_percpu_trace_buffer();
+ }
+ EXPORT_SYMBOL_GPL(trace_array_init_printk);
  __printf(3, 4)
  int trace_array_printk_buf(struct trace_buffer *buffer,
                           unsigned long ip, const char *fmt, ...)
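The kerneldoc added above spells out a calling order that is easy to miss: trace_array_init_printk() must be called on an instance before its first trace_array_printk(), and neither call is meant for the top level buffer. A minimal module-init sketch under those rules follows; the instance name and error handling are made up, and the header locations of the prototypes are an assumption, while the trace_array_*() functions are the ones documented above.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/trace.h>
#include <linux/trace_events.h>

static struct trace_array *example_tr;	/* hypothetical private instance */

static int __init example_init(void)
{
	/* Get (or create) a named instance; never the top level buffer. */
	example_tr = trace_array_get_by_name("example");
	if (!example_tr)
		return -ENOMEM;

	/* Allocate the per-cpu trace_printk() buffers for this instance. */
	if (trace_array_init_printk(example_tr))
		return -ENOMEM;

	trace_array_printk(example_tr, _THIS_IP_, "example instance ready\n");
	return 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");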
@@@ -5886,7 -5927,7 +5926,7 @@@ int tracing_set_tracer(struct trace_arr
        }
  
        /* If trace pipe files are being read, we can't change the tracer */
-       if (tr->current_trace->ref) {
+       if (tr->trace_ref) {
                ret = -EBUSY;
                goto out;
        }
@@@ -6102,7 -6143,7 +6142,7 @@@ static int tracing_open_pipe(struct ino
  
        nonseekable_open(inode, filp);
  
-       tr->current_trace->ref++;
+       tr->trace_ref++;
  out:
        mutex_unlock(&trace_types_lock);
        return ret;
@@@ -6121,7 -6162,7 +6161,7 @@@ static int tracing_release_pipe(struct 
  
        mutex_lock(&trace_types_lock);
  
-       tr->current_trace->ref--;
+       tr->trace_ref--;
  
        if (iter->trace->pipe_close)
                iter->trace->pipe_close(iter);
@@@ -7405,7 -7446,7 +7445,7 @@@ static int tracing_buffers_open(struct 
        if (ret)
                return ret;
  
-       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       info = kvzalloc(sizeof(*info), GFP_KERNEL);
        if (!info) {
                trace_array_put(tr);
                return -ENOMEM;
  
        filp->private_data = info;
  
-       tr->current_trace->ref++;
+       tr->trace_ref++;
  
        mutex_unlock(&trace_types_lock);
  
@@@ -7524,14 -7565,14 +7564,14 @@@ static int tracing_buffers_release(stru
  
        mutex_lock(&trace_types_lock);
  
-       iter->tr->current_trace->ref--;
+       iter->tr->trace_ref--;
  
        __trace_array_put(iter->tr);
  
        if (info->spare)
                ring_buffer_free_read_page(iter->array_buffer->buffer,
                                           info->spare_cpu, info->spare);
-       kfree(info);
+       kvfree(info);
  
        mutex_unlock(&trace_types_lock);
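The two hunks above move the per-reader buffer info from kzalloc()/kfree() to kvzalloc()/kvfree(), letting the allocation fall back to vmalloc() when contiguous pages are hard to come by. The pairing is the important part: memory that may have come from vmalloc() must be released with kvfree(), not kfree(). A minimal sketch of the pattern, with a hypothetical structure:

#include <linux/mm.h>
#include <linux/slab.h>

/* Hypothetical structure, big enough that kmalloc() can fail under fragmentation. */
struct example_info {
	char scratch[64 * 1024];
};

static struct example_info *example_info_alloc(void)
{
	/* Zeroed allocation: tries kmalloc() first, falls back to vmalloc(). */
	return kvzalloc(sizeof(struct example_info), GFP_KERNEL);
}

static void example_info_free(struct example_info *info)
{
	/* kvfree() copes with either origin; plain kfree() would not. */
	kvfree(info);
}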
  
@@@ -8732,7 -8773,7 +8772,7 @@@ static int __remove_instance(struct tra
        int i;
  
        /* Reference counter for a newly created trace array = 1. */
-       if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref))
+       if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
                return -EBUSY;
  
        list_del(&tr->list);
@@@ -8944,7 -8985,9 +8984,7 @@@ struct dentry *tracing_init_dentry(void
        if (tr->dir)
                return NULL;
  
 -      if (WARN_ON(!tracefs_initialized()) ||
 -              (IS_ENABLED(CONFIG_DEBUG_FS) &&
 -               WARN_ON(!debugfs_initialized())))
 +      if (WARN_ON(!tracefs_initialized()))
                return ERR_PTR(-ENODEV);
  
        /*