Merge tag 'trace-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux...
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 8 Aug 2020 01:29:15 +0000 (18:29 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 8 Aug 2020 01:29:15 +0000 (18:29 -0700)
Pull tracing updates from Steven Rostedt:

 - The biggest news is that the tracing ring buffer can now time events
   that interrupted other ring buffer events.

   Before this change, if an interrupt came in while recording another
   event, and that interrupt also had an event, those events would all
   have the same time stamp as the event they interrupted.

   Now, with the new design, those events will each have a unique time
   stamp, correctly showing when the events recorded while interrupting
   another event actually happened.
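
   As a rough illustration of the writer path this affects, here is a
   minimal sketch (not code from this series; it assumes a previously
   allocated struct trace_buffer *buffer):

      struct ring_buffer_event *ev;

      /* Outer event: reserve space, fill it in, then commit it. */
      ev = ring_buffer_lock_reserve(buffer, sizeof(u32));
      if (ev) {
              *(u32 *)ring_buffer_event_data(ev) = 42;
              /*
               * An interrupt firing here may reserve and commit its own
               * event.  That nested event now records its own time stamp
               * instead of inheriting the stamp of the event it
               * interrupted.
               */
              ring_buffer_unlock_commit(buffer, ev);
      }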

 - Bootconfig now has an "override" operator that lets users ship a
   default config and then add options that override those defaults.
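
   For example, with a hypothetical key (a sketch of the documented
   syntax: a plain "=" refuses to redefine a key, while ":=" replaces
   the earlier value):

      # "feature.setting" ends up as "site-override"
      feature.setting = "default"
      feature.setting := "site-override"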

 - A fix was made to properly filter function graph tracing to the
   ftrace PIDs. This came in at the end of the -rc cycle, and needs to
   be backported.
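
   For reference, the PID filtering itself is driven through tracefs
   (shown here only as a usage sketch, assuming tracefs is mounted at
   /sys/kernel/tracing; it is not part of the fix):

      # echo $$ > /sys/kernel/tracing/set_ftrace_pid
      # echo function_graph > /sys/kernel/tracing/current_tracer
      # cat /sys/kernel/tracing/trace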

 - Several clean ups, performance updates, and minor fixes as well.

* tag 'trace-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/rostedt/linux-trace: (39 commits)
  tracing: Add trace_array_init_printk() to initialize instance trace_printk() buffers
  kprobes: Fix compiler warning for !CONFIG_KPROBES_ON_FTRACE
  tracing: Use trace_sched_process_free() instead of exit() for pid tracing
  bootconfig: Fix to find the initargs correctly
  Documentation: bootconfig: Add bootconfig override operator
  tools/bootconfig: Add testcases for value override operator
  lib/bootconfig: Add override operator support
  kprobes: Remove show_registers() function prototype
  tracing/uprobe: Remove dead code in trace_uprobe_register()
  kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler
  ftrace: Fix ftrace_trace_task return value
  tracepoint: Use __used attribute definitions from compiler_attributes.h
  tracepoint: Mark __tracepoint_string's __used
  trace : Have tracing buffer info use kvzalloc instead of kzalloc
  tracing: Remove outdated comment in stack handling
  ftrace: Do not let direct or IPMODIFY ftrace_ops be added to module and set trampolines
  ftrace: Setup correct FTRACE_FL_REGS flags for module
  tracing/hwlat: Honor the tracing_cpumask
  tracing/hwlat: Drop the duplicate assignment in start_kthread()
  tracing: Save one trace_event->type by using __TRACE_LAST_TYPE
  ...

include/linux/kprobes.h
init/main.c
kernel/kprobes.c
kernel/trace/ftrace.c
kernel/trace/ring_buffer.c
kernel/trace/trace.c

diff --combined include/linux/kprobes.h
@@@ -227,7 -227,6 +227,6 @@@ extern int arch_prepare_kprobe(struct k
  extern void arch_arm_kprobe(struct kprobe *p);
  extern void arch_disarm_kprobe(struct kprobe *p);
  extern int arch_init_kprobes(void);
- extern void show_registers(struct pt_regs *regs);
  extern void kprobes_inc_nmissed_count(struct kprobe *p);
  extern bool arch_within_kprobe_blacklist(unsigned long addr);
  extern int arch_populate_kprobe_blacklist(void);
@@@ -242,7 -241,6 +241,7 @@@ struct kprobe_insn_cache 
        struct mutex mutex;
        void *(*alloc)(void);   /* allocate insn page */
        void (*free)(void *);   /* free insn page */
 +      const char *sym;        /* symbol for insn pages */
        struct list_head pages; /* list of kprobe_insn_page */
        size_t insn_size;       /* size of instruction slot */
        int nr_garbage;
@@@ -273,10 -271,6 +272,10 @@@ static inline bool is_kprobe_##__name##
  {                                                                     \
        return __is_insn_slot_addr(&kprobe_##__name##_slots, addr);     \
  }
 +#define KPROBE_INSN_PAGE_SYM          "kprobe_insn_page"
 +#define KPROBE_OPTINSN_PAGE_SYM               "kprobe_optinsn_page"
 +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
 +                           unsigned long *value, char *type, char *sym);
  #else /* __ARCH_WANT_KPROBES_INSN_SLOT */
  #define DEFINE_INSN_CACHE_OPS(__name)                                 \
  static inline bool is_kprobe_##__name##_slot(unsigned long addr)      \
@@@ -382,11 -376,6 +381,11 @@@ void dump_kprobe(struct kprobe *kp)
  void *alloc_insn_page(void);
  void free_insn_page(void *page);
  
 +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 +                     char *sym);
 +
 +int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
 +                          char *type, char *sym);
  #else /* !CONFIG_KPROBES: */
  
  static inline int kprobes_built_in(void)
@@@ -449,11 -438,6 +448,11 @@@ static inline bool within_kprobe_blackl
  {
        return true;
  }
 +static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value,
 +                                   char *type, char *sym)
 +{
 +      return -ERANGE;
 +}
  #endif /* CONFIG_KPROBES */
  static inline int disable_kretprobe(struct kretprobe *rp)
  {
diff --combined init/main.c
@@@ -96,7 -96,6 +96,7 @@@
  #include <linux/jump_label.h>
  #include <linux/mem_encrypt.h>
  #include <linux/kcsan.h>
 +#include <linux/init_syscalls.h>
  
  #include <asm/io.h>
  #include <asm/bugs.h>
@@@ -155,7 -154,7 +155,7 @@@ static bool initargs_found
  #endif
  
  static char *execute_command;
 -static char *ramdisk_execute_command;
 +static char *ramdisk_execute_command = "/init";
  
  /*
   * Used to generate warnings if static_key manipulation functions are used
@@@ -388,8 -387,6 +388,6 @@@ static int __init bootconfig_params(cha
  {
        if (strcmp(param, "bootconfig") == 0) {
                bootconfig_found = true;
-       } else if (strcmp(param, "--") == 0) {
-               initargs_found = true;
        }
        return 0;
  }
@@@ -400,19 -397,23 +398,23 @@@ static void __init setup_boot_config(co
        const char *msg;
        int pos;
        u32 size, csum;
-       char *data, *copy;
+       char *data, *copy, *err;
        int ret;
  
        /* Cut out the bootconfig data even if we have no bootconfig option */
        data = get_boot_config_from_initrd(&size, &csum);
  
        strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
-       parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
-                  bootconfig_params);
+       err = parse_args("bootconfig", tmp_cmdline, NULL, 0, 0, 0, NULL,
+                        bootconfig_params);
  
-       if (!bootconfig_found)
+       if (IS_ERR(err) || !bootconfig_found)
                return;
  
+       /* parse_args() stops at '--' and returns an address */
+       if (err)
+               initargs_found = true;
        if (!data) {
                pr_err("'bootconfig' found on command line, but no bootconfig found\n");
                return;
@@@ -780,16 -781,14 +782,16 @@@ static void __init report_meminit(void
  {
        const char *stack;
  
 -      if (IS_ENABLED(CONFIG_INIT_STACK_ALL))
 -              stack = "all";
 +      if (IS_ENABLED(CONFIG_INIT_STACK_ALL_PATTERN))
 +              stack = "all(pattern)";
 +      else if (IS_ENABLED(CONFIG_INIT_STACK_ALL_ZERO))
 +              stack = "all(zero)";
        else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL))
 -              stack = "byref_all";
 +              stack = "byref_all(zero)";
        else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF))
 -              stack = "byref";
 +              stack = "byref(zero)";
        else if (IS_ENABLED(CONFIG_GCC_PLUGIN_STRUCTLEAK_USER))
 -              stack = "__user";
 +              stack = "__user(zero)";
        else
                stack = "off";
  
@@@ -830,7 -829,7 +832,7 @@@ void __init __weak arch_call_rest_init(
        rest_init();
  }
  
 -asmlinkage __visible void __init start_kernel(void)
 +asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
  {
        char *command_line;
        char *after_dashes;
@@@ -1332,7 -1331,9 +1334,7 @@@ static int run_init_process(const char 
        pr_debug("  with environment:\n");
        for (p = envp_init; *p; p++)
                pr_debug("    %s\n", *p);
 -      return do_execve(getname_kernel(init_filename),
 -              (const char __user *const __user *)argv_init,
 -              (const char __user *const __user *)envp_init);
 +      return kernel_execve(init_filename, argv_init, envp_init);
  }
  
  static int try_to_run_init_process(const char *init_filename)
@@@ -1458,19 -1459,15 +1460,19 @@@ static int __ref kernel_init(void *unus
              "See Linux Documentation/admin-guide/init.rst for guidance.");
  }
  
 -void console_on_rootfs(void)
 +/* Open /dev/console, for stdin/stdout/stderr, this should never fail */
 +void __init console_on_rootfs(void)
  {
 -      /* Open the /dev/console as stdin, this should never fail */
 -      if (ksys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
 -              pr_err("Warning: unable to open an initial console.\n");
 +      struct file *file = filp_open("/dev/console", O_RDWR, 0);
  
 -      /* create stdout/stderr */
 -      (void) ksys_dup(0);
 -      (void) ksys_dup(0);
 +      if (IS_ERR(file)) {
 +              pr_err("Warning: unable to open an initial console.\n");
 +              return;
 +      }
 +      init_dup(file);
 +      init_dup(file);
 +      init_dup(file);
 +      fput(file);
  }
  
  static noinline void __init kernel_init_freeable(void)
         * check if there is an early userspace init.  If yes, let it do all
         * the work
         */
 -
 -      if (!ramdisk_execute_command)
 -              ramdisk_execute_command = "/init";
 -
 -      if (ksys_access((const char __user *)
 -                      ramdisk_execute_command, 0) != 0) {
 +      if (init_eaccess(ramdisk_execute_command) != 0) {
                ramdisk_execute_command = NULL;
                prepare_namespace();
        }
diff --combined kernel/kprobes.c
@@@ -35,7 -35,6 +35,7 @@@
  #include <linux/ftrace.h>
  #include <linux/cpu.h>
  #include <linux/jump_label.h>
 +#include <linux/perf_event.h>
  
  #include <asm/sections.h>
  #include <asm/cacheflush.h>
@@@ -124,7 -123,6 +124,7 @@@ struct kprobe_insn_cache kprobe_insn_sl
        .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
        .alloc = alloc_insn_page,
        .free = free_insn_page,
 +      .sym = KPROBE_INSN_PAGE_SYM,
        .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
        .insn_size = MAX_INSN_SIZE,
        .nr_garbage = 0,
@@@ -190,10 -188,6 +190,10 @@@ kprobe_opcode_t *__get_insn_slot(struc
        kip->cache = c;
        list_add_rcu(&kip->list, &c->pages);
        slot = kip->insns;
 +
 +      /* Record the perf ksymbol register event after adding the page */
 +      perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
 +                         PAGE_SIZE, false, c->sym);
  out:
        mutex_unlock(&c->mutex);
        return slot;
@@@ -212,13 -206,6 +212,13 @@@ static int collect_one_slot(struct kpro
                 * next time somebody inserts a probe.
                 */
                if (!list_is_singular(&kip->list)) {
 +                      /*
 +                       * Record perf ksymbol unregister event before removing
 +                       * the page.
 +                       */
 +                      perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
 +                                         (unsigned long)kip->insns, PAGE_SIZE, true,
 +                                         kip->cache->sym);
                        list_del_rcu(&kip->list);
                        synchronize_rcu();
                        kip->cache->free(kip->insns);
@@@ -308,34 -295,12 +308,34 @@@ bool __is_insn_slot_addr(struct kprobe_
        return ret;
  }
  
 +int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
 +                           unsigned long *value, char *type, char *sym)
 +{
 +      struct kprobe_insn_page *kip;
 +      int ret = -ERANGE;
 +
 +      rcu_read_lock();
 +      list_for_each_entry_rcu(kip, &c->pages, list) {
 +              if ((*symnum)--)
 +                      continue;
 +              strlcpy(sym, c->sym, KSYM_NAME_LEN);
 +              *type = 't';
 +              *value = (unsigned long)kip->insns;
 +              ret = 0;
 +              break;
 +      }
 +      rcu_read_unlock();
 +
 +      return ret;
 +}
 +
  #ifdef CONFIG_OPTPROBES
  /* For optimized_kprobe buffer */
  struct kprobe_insn_cache kprobe_optinsn_slots = {
        .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
        .alloc = alloc_insn_page,
        .free = free_insn_page,
 +      .sym = KPROBE_OPTINSN_PAGE_SYM,
        .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
        /* .insn_size is initialized later */
        .nr_garbage = 0,
@@@ -598,6 -563,8 +598,6 @@@ static void kprobe_optimizer(struct wor
        mutex_lock(&kprobe_mutex);
        cpus_read_lock();
        mutex_lock(&text_mutex);
 -      /* Lock modules while optimizing kprobes */
 -      mutex_lock(&module_mutex);
  
        /*
         * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
        /* Step 4: Free cleaned kprobes after quiesence period */
        do_free_cleaned_kprobes();
  
 -      mutex_unlock(&module_mutex);
        mutex_unlock(&text_mutex);
        cpus_read_unlock();
  
@@@ -1111,9 -1079,20 +1111,20 @@@ static int disarm_kprobe_ftrace(struct 
                ipmodify ? &kprobe_ipmodify_enabled : &kprobe_ftrace_enabled);
  }
  #else /* !CONFIG_KPROBES_ON_FTRACE */
- #define prepare_kprobe(p)     arch_prepare_kprobe(p)
- #define arm_kprobe_ftrace(p)  (-ENODEV)
- #define disarm_kprobe_ftrace(p)       (-ENODEV)
+ static inline int prepare_kprobe(struct kprobe *p)
+ {
+       return arch_prepare_kprobe(p);
+ }
+ static inline int arm_kprobe_ftrace(struct kprobe *p)
+ {
+       return -ENODEV;
+ }
+ static inline int disarm_kprobe_ftrace(struct kprobe *p)
+ {
+       return -ENODEV;
+ }
  #endif
  
  /* Arm a kprobe with text_mutex */
@@@ -2145,6 -2124,13 +2156,13 @@@ static void kill_kprobe(struct kprobe *
         * the original probed function (which will be freed soon) any more.
         */
        arch_remove_kprobe(p);
+       /*
+        * The module is going away. We should disarm the kprobe which
+        * is using ftrace.
+        */
+       if (kprobe_ftrace(p))
+               disarm_kprobe_ftrace(p);
  }
  
  /* Disable one kprobe */
@@@ -2264,28 -2250,6 +2282,28 @@@ static void kprobe_remove_ksym_blacklis
        kprobe_remove_area_blacklist(entry, entry + 1);
  }
  
 +int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
 +                                 char *type, char *sym)
 +{
 +      return -ERANGE;
 +}
 +
 +int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 +                     char *sym)
 +{
 +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
 +      if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym))
 +              return 0;
 +#ifdef CONFIG_OPTPROBES
 +      if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym))
 +              return 0;
 +#endif
 +#endif
 +      if (!arch_kprobe_get_kallsym(&symnum, value, type, sym))
 +              return 0;
 +      return -ERANGE;
 +}
 +
  int __init __weak arch_populate_kprobe_blacklist(void)
  {
        return 0;
@@@ -2502,7 -2466,7 +2520,7 @@@ static void report_probe(struct seq_fil
        else
                kprobe_type = "k";
  
 -      if (!kallsyms_show_value())
 +      if (!kallsyms_show_value(pi->file->f_cred))
                addr = NULL;
  
        if (sym)
@@@ -2594,7 -2558,7 +2612,7 @@@ static int kprobe_blacklist_seq_show(st
         * If /proc/kallsyms is not showing kernel address, we won't
         * show them here either.
         */
 -      if (!kallsyms_show_value())
 +      if (!kallsyms_show_value(m->file->f_cred))
                seq_printf(m, "0x%px-0x%px\t%ps\n", NULL, NULL,
                           (void *)ent->start_addr);
        else
diff --combined kernel/trace/ftrace.c
@@@ -139,9 -139,6 +139,6 @@@ static inline void ftrace_ops_init(stru
  #endif
  }
  
- #define FTRACE_PID_IGNORE     -1
- #define FTRACE_PID_TRACE      -2
  static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
                            struct ftrace_ops *op, struct pt_regs *regs)
  {
@@@ -2388,6 -2385,14 +2385,14 @@@ struct ftrace_ops direct_ops = 
        .flags          = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE
                          | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
                          | FTRACE_OPS_FL_PERMANENT,
+       /*
+        * By declaring the main trampoline as this trampoline
+        * it will never have one allocated for it. Allocated
+        * trampolines should not call direct functions.
+        * The direct_ops should only be called by the builtin
+        * ftrace_regs_caller trampoline.
+        */
+       .trampoline     = FTRACE_REGS_ADDR,
  };
  #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
  
@@@ -2764,50 -2769,6 +2769,50 @@@ void __weak arch_ftrace_trampoline_free
  {
  }
  
 +/* List of trace_ops that have allocated trampolines */
 +static LIST_HEAD(ftrace_ops_trampoline_list);
 +
 +static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops)
 +{
 +      lockdep_assert_held(&ftrace_lock);
 +      list_add_rcu(&ops->list, &ftrace_ops_trampoline_list);
 +}
 +
 +static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
 +{
 +      lockdep_assert_held(&ftrace_lock);
 +      list_del_rcu(&ops->list);
 +}
 +
 +/*
 + * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols
 + * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is
 + * not a module.
 + */
 +#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace"
 +#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline"
 +
 +static void ftrace_trampoline_free(struct ftrace_ops *ops)
 +{
 +      if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
 +          ops->trampoline) {
 +              /*
 +               * Record the text poke event before the ksymbol unregister
 +               * event.
 +               */
 +              perf_event_text_poke((void *)ops->trampoline,
 +                                   (void *)ops->trampoline,
 +                                   ops->trampoline_size, NULL, 0);
 +              perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
 +                                 ops->trampoline, ops->trampoline_size,
 +                                 true, FTRACE_TRAMPOLINE_SYM);
 +              /* Remove from kallsyms after the perf events */
 +              ftrace_remove_trampoline_from_kallsyms(ops);
 +      }
 +
 +      arch_ftrace_trampoline_free(ops);
 +}
 +
  static void ftrace_startup_enable(int command)
  {
        if (saved_ftrace_func != ftrace_trace_function) {
@@@ -2978,7 -2939,7 +2983,7 @@@ int ftrace_shutdown(struct ftrace_ops *
                        synchronize_rcu_tasks();
  
   free_ops:
 -              arch_ftrace_trampoline_free(ops);
 +              ftrace_trampoline_free(ops);
        }
  
        return 0;
@@@ -6222,27 -6183,6 +6227,27 @@@ struct ftrace_mod_map 
        unsigned int            num_funcs;
  };
  
 +static int ftrace_get_trampoline_kallsym(unsigned int symnum,
 +                                       unsigned long *value, char *type,
 +                                       char *name, char *module_name,
 +                                       int *exported)
 +{
 +      struct ftrace_ops *op;
 +
 +      list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) {
 +              if (!op->trampoline || symnum--)
 +                      continue;
 +              *value = op->trampoline;
 +              *type = 't';
 +              strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN);
 +              strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN);
 +              *exported = 0;
 +              return 0;
 +      }
 +
 +      return -ERANGE;
 +}
 +
  #ifdef CONFIG_MODULES
  
  #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@@ -6255,8 -6195,19 +6260,19 @@@ static int referenced_filters(struct dy
        int cnt = 0;
  
        for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
-               if (ops_references_rec(ops, rec))
-                   cnt++;
+               if (ops_references_rec(ops, rec)) {
+                       if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_DIRECT))
+                               continue;
+                       if (WARN_ON_ONCE(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+                               continue;
+                       cnt++;
+                       if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+                               rec->flags |= FTRACE_FL_REGS;
+                       if (cnt == 1 && ops->trampoline)
+                               rec->flags |= FTRACE_FL_TRAMP;
+                       else
+                               rec->flags &= ~FTRACE_FL_TRAMP;
+               }
        }
  
        return cnt;
@@@ -6435,8 -6386,8 +6451,8 @@@ void ftrace_module_enable(struct modul
                if (ftrace_start_up)
                        cnt += referenced_filters(rec);
  
-               /* This clears FTRACE_FL_DISABLED */
-               rec->flags = cnt;
+               rec->flags &= ~FTRACE_FL_DISABLED;
+               rec->flags += cnt;
  
                if (ftrace_start_up && cnt) {
                        int failed = __ftrace_replace_code(rec, 1);
@@@ -6579,7 -6530,6 +6595,7 @@@ int ftrace_mod_get_kallsym(unsigned in
  {
        struct ftrace_mod_map *mod_map;
        struct ftrace_mod_func *mod_func;
 +      int ret;
  
        preempt_disable();
        list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
                WARN_ON(1);
                break;
        }
 +      ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
 +                                          module_name, exported);
        preempt_enable();
 -      return -ERANGE;
 +      return ret;
  }
  
  #else
@@@ -6621,18 -6569,6 +6637,18 @@@ allocate_ftrace_mod_map(struct module *
  {
        return NULL;
  }
 +int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
 +                         char *type, char *name, char *module_name,
 +                         int *exported)
 +{
 +      int ret;
 +
 +      preempt_disable();
 +      ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
 +                                          module_name, exported);
 +      preempt_enable();
 +      return ret;
 +}
  #endif /* CONFIG_MODULES */
  
  struct ftrace_init_func {
@@@ -6813,24 -6749,7 +6829,24 @@@ void __weak arch_ftrace_update_trampoli
  
  static void ftrace_update_trampoline(struct ftrace_ops *ops)
  {
 +      unsigned long trampoline = ops->trampoline;
 +
        arch_ftrace_update_trampoline(ops);
 +      if (ops->trampoline && ops->trampoline != trampoline &&
 +          (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) {
 +              /* Add to kallsyms before the perf events */
 +              ftrace_add_trampoline_to_kallsyms(ops);
 +              perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
 +                                 ops->trampoline, ops->trampoline_size, false,
 +                                 FTRACE_TRAMPOLINE_SYM);
 +              /*
 +               * Record the perf text poke event after the ksymbol register
 +               * event.
 +               */
 +              perf_event_text_poke((void *)ops->trampoline, NULL, 0,
 +                                   (void *)ops->trampoline,
 +                                   ops->trampoline_size);
 +      }
  }
  
  void ftrace_init_trace_array(struct trace_array *tr)
@@@ -7066,12 -6985,12 +7082,12 @@@ void ftrace_pid_follow_fork(struct trac
        if (enable) {
                register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
                                                  tr);
-               register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+               register_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
                                                  tr);
        } else {
                unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork,
                                                    tr);
-               unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit,
+               unregister_trace_sched_process_free(ftrace_pid_follow_sched_process_exit,
                                                    tr);
        }
  }
@@@ -270,6 -270,9 +270,9 @@@ EXPORT_SYMBOL_GPL(ring_buffer_event_dat
  #define for_each_buffer_cpu(buffer, cpu)              \
        for_each_cpu(cpu, buffer->cpumask)
  
+ #define for_each_online_buffer_cpu(buffer, cpu)               \
+       for_each_cpu_and(cpu, buffer->cpumask, cpu_online_mask)
  #define TS_SHIFT      27
  #define TS_MASK               ((1ULL << TS_SHIFT) - 1)
  #define TS_DELTA_TEST (~TS_MASK)
@@@ -413,11 -416,26 +416,26 @@@ struct rb_irq_work 
  struct rb_event_info {
        u64                     ts;
        u64                     delta;
+       u64                     before;
+       u64                     after;
        unsigned long           length;
        struct buffer_page      *tail_page;
        int                     add_timestamp;
  };
  
+ /*
+  * Used for the add_timestamp
+  *  NONE
+  *  EXTEND - wants a time extend
+  *  ABSOLUTE - the buffer requests all events to have absolute time stamps
+  *  FORCE - force a full time stamp.
+  */
+ enum {
+       RB_ADD_STAMP_NONE               = 0,
+       RB_ADD_STAMP_EXTEND             = BIT(1),
+       RB_ADD_STAMP_ABSOLUTE           = BIT(2),
+       RB_ADD_STAMP_FORCE              = BIT(3)
+ };
  /*
   * Used for which event context the event is in.
   *  NMI     = 0
@@@ -435,6 -453,28 +453,28 @@@ enum 
        RB_CTX_MAX
  };
  
+ #if BITS_PER_LONG == 32
+ #define RB_TIME_32
+ #endif
+ /* To test on 64 bit machines */
+ //#define RB_TIME_32
+ #ifdef RB_TIME_32
+ struct rb_time_struct {
+       local_t         cnt;
+       local_t         top;
+       local_t         bottom;
+ };
+ #else
+ #include <asm/local64.h>
+ struct rb_time_struct {
+       local64_t       time;
+ };
+ #endif
+ typedef struct rb_time_struct rb_time_t;
  /*
   * head_page == tail_page && head == tail then buffer is empty.
   */
@@@ -470,7 -510,8 +510,8 @@@ struct ring_buffer_per_cpu 
        size_t                          shortest_full;
        unsigned long                   read;
        unsigned long                   read_bytes;
-       u64                             write_stamp;
+       rb_time_t                       write_stamp;
+       rb_time_t                       before_stamp;
        u64                             read_stamp;
        /* ring buffer pages to update, > 0 to add, < 0 to remove */
        long                            nr_pages_to_update;
@@@ -513,6 -554,189 +554,189 @@@ struct ring_buffer_iter 
        int                             missed_events;
  };
  
+ #ifdef RB_TIME_32
+ /*
+  * On 32 bit machines, local64_t is very expensive. As the ring
+  * buffer doesn't need all the features of a true 64 bit atomic,
+  * on 32 bit, it uses these functions (64 still uses local64_t).
+  *
+  * For the ring buffer, 64 bit required operations for the time is
+  * the following:
+  *
+  *  - Only need 59 bits (uses 60 to make it even).
+  *  - Reads may fail if it interrupted a modification of the time stamp.
+  *      It will succeed if it did not interrupt another write even if
+  *      the read itself is interrupted by a write.
+  *      It returns whether it was successful or not.
+  *
+  *  - Writes always succeed and will overwrite other writes and writes
+  *      that were done by events interrupting the current write.
+  *
+  *  - A write followed by a read of the same time stamp will always succeed,
+  *      but may not contain the same value.
+  *
+  *  - A cmpxchg will fail if it interrupted another write or cmpxchg.
+  *      Other than that, it acts like a normal cmpxchg.
+  *
+  * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
+  *  (bottom being the least significant 30 bits of the 60 bit time stamp).
+  *
+  * The two most significant bits of each half holds a 2 bit counter (0-3).
+  * Each update will increment this counter by one.
+  * When reading the top and bottom, if the two counter bits match then the
+  *  top and bottom together make a valid 60 bit number.
+  */
+ #define RB_TIME_SHIFT 30
+ #define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
+ static inline int rb_time_cnt(unsigned long val)
+ {
+       return (val >> RB_TIME_SHIFT) & 3;
+ }
+ static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
+ {
+       u64 val;
+       val = top & RB_TIME_VAL_MASK;
+       val <<= RB_TIME_SHIFT;
+       val |= bottom & RB_TIME_VAL_MASK;
+       return val;
+ }
+ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
+ {
+       unsigned long top, bottom;
+       unsigned long c;
+       /*
+        * If the read is interrupted by a write, then the cnt will
+        * be different. Loop until both top and bottom have been read
+        * without interruption.
+        */
+       do {
+               c = local_read(&t->cnt);
+               top = local_read(&t->top);
+               bottom = local_read(&t->bottom);
+       } while (c != local_read(&t->cnt));
+       *cnt = rb_time_cnt(top);
+       /* If top and bottom counts don't match, this interrupted a write */
+       if (*cnt != rb_time_cnt(bottom))
+               return false;
+       *ret = rb_time_val(top, bottom);
+       return true;
+ }
+ static bool rb_time_read(rb_time_t *t, u64 *ret)
+ {
+       unsigned long cnt;
+       return __rb_time_read(t, ret, &cnt);
+ }
+ static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
+ {
+       return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
+ }
+ static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom)
+ {
+       *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
+       *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
+ }
+ static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
+ {
+       val = rb_time_val_cnt(val, cnt);
+       local_set(t, val);
+ }
+ static void rb_time_set(rb_time_t *t, u64 val)
+ {
+       unsigned long cnt, top, bottom;
+       rb_time_split(val, &top, &bottom);
+       /* Writes always succeed with a valid number even if it gets interrupted. */
+       do {
+               cnt = local_inc_return(&t->cnt);
+               rb_time_val_set(&t->top, top, cnt);
+               rb_time_val_set(&t->bottom, bottom, cnt);
+       } while (cnt != local_read(&t->cnt));
+ }
+ static inline bool
+ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
+ {
+       unsigned long ret;
+       ret = local_cmpxchg(l, expect, set);
+       return ret == expect;
+ }
+ static int rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+ {
+       unsigned long cnt, top, bottom;
+       unsigned long cnt2, top2, bottom2;
+       u64 val;
+       /* The cmpxchg always fails if it interrupted an update */
+        if (!__rb_time_read(t, &val, &cnt2))
+                return false;
+        if (val != expect)
+                return false;
+        cnt = local_read(&t->cnt);
+        if ((cnt & 3) != cnt2)
+                return false;
+        cnt2 = cnt + 1;
+        rb_time_split(val, &top, &bottom);
+        top = rb_time_val_cnt(top, cnt);
+        bottom = rb_time_val_cnt(bottom, cnt);
+        rb_time_split(set, &top2, &bottom2);
+        top2 = rb_time_val_cnt(top2, cnt2);
+        bottom2 = rb_time_val_cnt(bottom2, cnt2);
+       if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
+               return false;
+       if (!rb_time_read_cmpxchg(&t->top, top, top2))
+               return false;
+       if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
+               return false;
+       return true;
+ }
+ #else /* 64 bits */
+ /* local64_t always succeeds */
+ static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+ {
+       *ret = local64_read(&t->time);
+       return true;
+ }
+ static void rb_time_set(rb_time_t *t, u64 val)
+ {
+       local64_set(&t->time, val);
+ }
+ static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+ {
+       u64 val;
+       val = local64_cmpxchg(&t->time, expect, set);
+       return val == expect;
+ }
+ #endif
  /**
   * ring_buffer_nr_pages - get the number of buffer pages in the ring buffer
   * @buffer: The ring_buffer to get the number of pages from
@@@ -577,7 -801,7 +801,7 @@@ static void rb_wake_up_waiters(struct i
   */
  int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full)
  {
 -      struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
 +      struct ring_buffer_per_cpu *cpu_buffer;
        DEFINE_WAIT(wait);
        struct rb_irq_work *work;
        int ret = 0;
@@@ -746,8 -970,16 +970,16 @@@ __poll_t ring_buffer_poll_wait(struct t
  
  static inline u64 rb_time_stamp(struct trace_buffer *buffer)
  {
+       u64 ts;
+       /* Skip retpolines :-( */
+       if (IS_ENABLED(CONFIG_RETPOLINE) && likely(buffer->clock == trace_clock_local))
+               ts = trace_clock_local();
+       else
+               ts = buffer->clock();
        /* shift to debug/test normalization and TIME_EXTENTS */
-       return buffer->clock() << DEBUG_SHIFT;
+       return ts << DEBUG_SHIFT;
  }
  
  u64 ring_buffer_time_stamp(struct trace_buffer *buffer, int cpu)
@@@ -2372,8 -2604,8 +2604,8 @@@ rb_move_tail(struct ring_buffer_per_cp
        return NULL;
  }
  
- /* Slow path, do not inline */
- static noinline struct ring_buffer_event *
+ /* Slow path */
+ static struct ring_buffer_event *
  rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
  {
        if (abs)
  static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
                                     struct ring_buffer_event *event);
  
+ #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ static inline bool sched_clock_stable(void)
+ {
+       return true;
+ }
+ #endif
+ static void
+ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+                  struct rb_event_info *info)
+ {
+       u64 write_stamp;
+       WARN_ONCE(1, "Delta way too big! %llu ts=%llu before=%llu after=%llu write stamp=%llu\n%s",
+                 (unsigned long long)info->delta,
+                 (unsigned long long)info->ts,
+                 (unsigned long long)info->before,
+                 (unsigned long long)info->after,
+                 (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+                 sched_clock_stable() ? "" :
+                 "If you just came from a suspend/resume,\n"
+                 "please switch to the trace global clock:\n"
+                 "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
+                 "or add trace_clock=global to the kernel command line\n");
+ }
+ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+                                     struct ring_buffer_event **event,
+                                     struct rb_event_info *info,
+                                     u64 *delta,
+                                     unsigned int *length)
+ {
+       bool abs = info->add_timestamp &
+               (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE);
+       if (unlikely(info->delta > (1ULL << 59))) {
+               /* did the clock go backwards */
+               if (info->before == info->after && info->before > info->ts) {
+                       /* not interrupted */
+                       static int once;
+                       /*
+                        * This is possible with a recalibrating of the TSC.
+                        * Do not produce a call stack, but just report it.
+                        */
+                       if (!once) {
+                               once++;
+                               pr_warn("Ring buffer clock went backwards: %llu -> %llu\n",
+                                       info->before, info->ts);
+                       }
+               } else
+                       rb_check_timestamp(cpu_buffer, info);
+               if (!abs)
+                       info->delta = 0;
+       }
+       *event = rb_add_time_stamp(*event, info->delta, abs);
+       *length -= RB_LEN_TIME_EXTEND;
+       *delta = 0;
+ }
  /**
   * rb_update_event - update event type and data
   * @cpu_buffer: The per cpu buffer of the @event
@@@ -2416,21 -2708,12 +2708,12 @@@ rb_update_event(struct ring_buffer_per_
        unsigned length = info->length;
        u64 delta = info->delta;
  
-       /* Only a commit updates the timestamp */
-       if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
-               delta = 0;
        /*
         * If we need to add a timestamp, then we
         * add it to the start of the reserved space.
         */
-       if (unlikely(info->add_timestamp)) {
-               bool abs = ring_buffer_time_stamp_abs(cpu_buffer->buffer);
-               event = rb_add_time_stamp(event, abs ? info->delta : delta, abs);
-               length -= RB_LEN_TIME_EXTEND;
-               delta = 0;
-       }
+       if (unlikely(info->add_timestamp))
+               rb_add_timestamp(cpu_buffer, &event, info, &delta, &length);
  
        event->time_delta = delta;
        length -= RB_EVNT_HDR_SIZE;
@@@ -2473,12 -2756,38 +2756,38 @@@ static unsigned rb_calculate_event_leng
        return length;
  }
  
- #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- static inline bool sched_clock_stable(void)
+ static __always_inline bool
+ rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+                  struct ring_buffer_event *event)
  {
-       return true;
+       unsigned long addr = (unsigned long)event;
+       unsigned long index;
+       index = rb_event_index(event);
+       addr &= PAGE_MASK;
+       return cpu_buffer->commit_page->page == (void *)addr &&
+               rb_commit_index(cpu_buffer) == index;
+ }
+ static u64 rb_time_delta(struct ring_buffer_event *event)
+ {
+       switch (event->type_len) {
+       case RINGBUF_TYPE_PADDING:
+               return 0;
+       case RINGBUF_TYPE_TIME_EXTEND:
+               return ring_buffer_event_time_stamp(event);
+       case RINGBUF_TYPE_TIME_STAMP:
+               return 0;
+       case RINGBUF_TYPE_DATA:
+               return event->time_delta;
+       default:
+               return 0;
+       }
  }
- #endif
  
  static inline int
  rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
        struct buffer_page *bpage;
        unsigned long index;
        unsigned long addr;
+       u64 write_stamp;
+       u64 delta;
  
        new_index = rb_event_index(event);
        old_index = new_index + rb_event_ts_length(event);
  
        bpage = READ_ONCE(cpu_buffer->tail_page);
  
+       delta = rb_time_delta(event);
+       if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
+               return 0;
+       /* Make sure the write stamp is read before testing the location */
+       barrier();
        if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
                unsigned long write_mask =
                        local_read(&bpage->write) & ~RB_WRITE_MASK;
                unsigned long event_length = rb_event_length(event);
+               /* Something came in, can't discard */
+               if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
+                                      write_stamp, write_stamp - delta))
+                       return 0;
+               /*
+                * If an event were to come in now, it would see that the
+                * write_stamp and the before_stamp are different, and assume
+                * that this event just added itself before updating
+                * the write stamp. The interrupting event will fix the
+                * write stamp for us, and use the before stamp as its delta.
+                */
                /*
                 * This is on the tail page. It is possible that
                 * a write could come in and move the tail page
@@@ -2551,10 -2884,6 +2884,6 @@@ rb_set_commit_to_write(struct ring_buff
                local_set(&cpu_buffer->commit_page->page->commit,
                          rb_page_write(cpu_buffer->commit_page));
                rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
-               /* Only update the write stamp if the page has an event */
-               if (rb_page_write(cpu_buffer->commit_page))
-                       cpu_buffer->write_stamp =
-                               cpu_buffer->commit_page->page->time_stamp;
                /* add barrier to keep gcc from optimizing too much */
                barrier();
        }
@@@ -2626,54 -2955,10 +2955,10 @@@ static inline void rb_event_discard(str
                event->time_delta = 1;
  }
  
- static __always_inline bool
- rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
-                  struct ring_buffer_event *event)
- {
-       unsigned long addr = (unsigned long)event;
-       unsigned long index;
-       index = rb_event_index(event);
-       addr &= PAGE_MASK;
-       return cpu_buffer->commit_page->page == (void *)addr &&
-               rb_commit_index(cpu_buffer) == index;
- }
- static __always_inline void
- rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
-                     struct ring_buffer_event *event)
- {
-       u64 delta;
-       /*
-        * The event first in the commit queue updates the
-        * time stamp.
-        */
-       if (rb_event_is_commit(cpu_buffer, event)) {
-               /*
-                * A commit event that is first on a page
-                * updates the write timestamp with the page stamp
-                */
-               if (!rb_event_index(event))
-                       cpu_buffer->write_stamp =
-                               cpu_buffer->commit_page->page->time_stamp;
-               else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
-                       delta = ring_buffer_event_time_stamp(event);
-                       cpu_buffer->write_stamp += delta;
-               } else if (event->type_len == RINGBUF_TYPE_TIME_STAMP) {
-                       delta = ring_buffer_event_time_stamp(event);
-                       cpu_buffer->write_stamp = delta;
-               } else
-                       cpu_buffer->write_stamp += event->time_delta;
-       }
- }
  static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
                      struct ring_buffer_event *event)
  {
        local_inc(&cpu_buffer->entries);
-       rb_update_write_stamp(cpu_buffer, event);
        rb_end_commit(cpu_buffer);
  }
  
@@@ -2864,58 -3149,138 +3149,138 @@@ int ring_buffer_unlock_commit(struct tr
  }
  EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
  
- static noinline void
- rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
-                   struct rb_event_info *info)
- {
-       WARN_ONCE(info->delta > (1ULL << 59),
-                 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
-                 (unsigned long long)info->delta,
-                 (unsigned long long)info->ts,
-                 (unsigned long long)cpu_buffer->write_stamp,
-                 sched_clock_stable() ? "" :
-                 "If you just came from a suspend/resume,\n"
-                 "please switch to the trace global clock:\n"
-                 "  echo global > /sys/kernel/debug/tracing/trace_clock\n"
-                 "or add trace_clock=global to the kernel command line\n");
-       info->add_timestamp = 1;
- }
  static struct ring_buffer_event *
  __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                  struct rb_event_info *info)
  {
        struct ring_buffer_event *event;
        struct buffer_page *tail_page;
-       unsigned long tail, write;
-       /*
-        * If the time delta since the last event is too big to
-        * hold in the time field of the event, then we append a
-        * TIME EXTEND event ahead of the data event.
-        */
-       if (unlikely(info->add_timestamp))
-               info->length += RB_LEN_TIME_EXTEND;
+       unsigned long tail, write, w;
+       bool a_ok;
+       bool b_ok;
  
        /* Don't let the compiler play games with cpu_buffer->tail_page */
        tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
-       write = local_add_return(info->length, &tail_page->write);
+  /*A*/        w = local_read(&tail_page->write) & RB_WRITE_MASK;
+       barrier();
+       b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+       a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+       barrier();
+       info->ts = rb_time_stamp(cpu_buffer->buffer);
+       if ((info->add_timestamp & RB_ADD_STAMP_ABSOLUTE)) {
+               info->delta = info->ts;
+       } else {
+               /*
+                * If interrupting an event time update, we may need an
+                * absolute timestamp.
+                * Don't bother if this is the start of a new page (w == 0).
+                */
+               if (unlikely(!a_ok || !b_ok || (info->before != info->after && w))) {
+                       info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
+                       info->length += RB_LEN_TIME_EXTEND;
+               } else {
+                       info->delta = info->ts - info->after;
+                       if (unlikely(test_time_stamp(info->delta))) {
+                               info->add_timestamp |= RB_ADD_STAMP_EXTEND;
+                               info->length += RB_LEN_TIME_EXTEND;
+                       }
+               }
+       }
+  /*B*/        rb_time_set(&cpu_buffer->before_stamp, info->ts);
+  /*C*/        write = local_add_return(info->length, &tail_page->write);
  
        /* set write to only the index of the write */
        write &= RB_WRITE_MASK;
        tail = write - info->length;
  
+       /* See if we shot pass the end of this buffer page */
+       if (unlikely(write > BUF_PAGE_SIZE)) {
+               if (tail != w) {
+                       /* before and after may now be different, fix it up */
+                       b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+                       a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+                       if (a_ok && b_ok && info->before != info->after)
+                               (void)rb_time_cmpxchg(&cpu_buffer->before_stamp,
+                                                     info->before, info->after);
+               }
+               return rb_move_tail(cpu_buffer, tail, info);
+       }
+       if (likely(tail == w)) {
+               u64 save_before;
+               bool s_ok;
+               /* Nothing interrupted us between A and C */
+  /*D*/                rb_time_set(&cpu_buffer->write_stamp, info->ts);
+               barrier();
+  /*E*/                s_ok = rb_time_read(&cpu_buffer->before_stamp, &save_before);
+               RB_WARN_ON(cpu_buffer, !s_ok);
+               if (likely(!(info->add_timestamp &
+                            (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
+                       /* This did not interrupt any time update */
+                       info->delta = info->ts - info->after;
+               else
+                       /* Just use full timestamp for interrupting event */
+                       info->delta = info->ts;
+               barrier();
+               if (unlikely(info->ts != save_before)) {
+                       /* SLOW PATH - Interrupted between C and E */
+                       a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+                       RB_WARN_ON(cpu_buffer, !a_ok);
+                       /* Write stamp must only go forward */
+                       if (save_before > info->after) {
+                               /*
+                                * We do not care about the result, only that
+                                * it gets updated atomically.
+                                */
+                               (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
+                                                     info->after, save_before);
+                       }
+               }
+       } else {
+               u64 ts;
+               /* SLOW PATH - Interrupted between A and C */
+               a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+               /* Was interrupted before here, write_stamp must be valid */
+               RB_WARN_ON(cpu_buffer, !a_ok);
+               ts = rb_time_stamp(cpu_buffer->buffer);
+               barrier();
+  /*E*/                if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+                   info->after < ts) {
+                       /* Nothing came after this event between C and E */
+                       info->delta = ts - info->after;
+                       (void)rb_time_cmpxchg(&cpu_buffer->write_stamp,
+                                             info->after, info->ts);
+                       info->ts = ts;
+               } else {
+                       /*
+                        * Interrupted between C and E:
+                        * Lost the previous events time stamp. Just set the
+                        * delta to zero, and this will be the same time as
+                        * the event this event interrupted. And the events that
+                        * came after this will still be correct (as they would
+                        * have built their delta on the previous event.
+                        * have built their delta on the previous event).
+                       info->delta = 0;
+               }
+               info->add_timestamp &= ~RB_ADD_STAMP_FORCE;
+       }
        /*
         * If this is the first commit on the page, then it has the same
         * timestamp as the page itself.
         */
-       if (!tail && !ring_buffer_time_stamp_abs(cpu_buffer->buffer))
+       if (unlikely(!tail && !(info->add_timestamp &
+                               (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE))))
                info->delta = 0;
  
-       /* See if we shot pass the end of this buffer page */
-       if (unlikely(write > BUF_PAGE_SIZE))
-               return rb_move_tail(cpu_buffer, tail, info);
        /* We reserved something on the buffer */
  
        event = __rb_page_index(tail_page, tail);
         * If this is the first commit on the page, then update
         * its timestamp.
         */
-       if (!tail)
+       if (unlikely(!tail))
                tail_page->page->time_stamp = info->ts;
  
        /* account for these added bytes */
@@@ -2944,9 -3309,10 +3309,10 @@@ rb_reserve_next_event(struct trace_buff
        struct ring_buffer_event *event;
        struct rb_event_info info;
        int nr_loops = 0;
-       u64 diff;
+       int add_ts_default;
  
        rb_start_commit(cpu_buffer);
+       /* The commit page can not change after this */
  
  #ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
        /*
  #endif
  
        info.length = rb_calculate_event_length(length);
+       if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
+               add_ts_default = RB_ADD_STAMP_ABSOLUTE;
+               info.length += RB_LEN_TIME_EXTEND;
+       } else {
+               add_ts_default = RB_ADD_STAMP_NONE;
+       }
   again:
-       info.add_timestamp = 0;
+       info.add_timestamp = add_ts_default;
        info.delta = 0;
  
        /*
        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
                goto out_fail;
  
-       info.ts = rb_time_stamp(cpu_buffer->buffer);
-       diff = info.ts - cpu_buffer->write_stamp;
-       /* make sure this diff is calculated here */
-       barrier();
-       if (ring_buffer_time_stamp_abs(buffer)) {
-               info.delta = info.ts;
-               rb_handle_timestamp(cpu_buffer, &info);
-       } else /* Did the write stamp get updated already? */
-               if (likely(info.ts >= cpu_buffer->write_stamp)) {
-               info.delta = diff;
-               if (unlikely(test_time_stamp(info.delta)))
-                       rb_handle_timestamp(cpu_buffer, &info);
-       }
        event = __rb_reserve_next(cpu_buffer, &info);
  
        if (unlikely(PTR_ERR(event) == -EAGAIN)) {
-               if (info.add_timestamp)
+               if (info.add_timestamp & (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND))
                        info.length -= RB_LEN_TIME_EXTEND;
                goto again;
        }
  
-       if (!event)
-               goto out_fail;
-       return event;
+       if (likely(event))
+               return event;
   out_fail:
        rb_end_commit(cpu_buffer);
        return NULL;
@@@ -3154,11 -3509,6 +3509,6 @@@ void ring_buffer_discard_commit(struct 
        if (rb_try_to_discard(cpu_buffer, event))
                goto out;
  
-       /*
-        * The commit is still visible by the reader, so we
-        * must still update the timestamp.
-        */
-       rb_update_write_stamp(cpu_buffer, event);
   out:
        rb_end_commit(cpu_buffer);
  
@@@ -4475,8 -4825,8 +4825,8 @@@ rb_reset_cpu(struct ring_buffer_per_cp
        cpu_buffer->read = 0;
        cpu_buffer->read_bytes = 0;
  
-       cpu_buffer->write_stamp = 0;
-       cpu_buffer->read_stamp = 0;
+       rb_time_set(&cpu_buffer->write_stamp, 0);
+       rb_time_set(&cpu_buffer->before_stamp, 0);
  
        cpu_buffer->lost_events = 0;
        cpu_buffer->last_overrun = 0;
        rb_head_page_activate(cpu_buffer);
  }
  
+ /* Must have disabled the cpu buffer then done a synchronize_rcu */
+ static void reset_disabled_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+ {
+       unsigned long flags;
+       raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+       if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
+               goto out;
+       arch_spin_lock(&cpu_buffer->lock);
+       rb_reset_cpu(cpu_buffer);
+       arch_spin_unlock(&cpu_buffer->lock);
+  out:
+       raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ }
  /**
   * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
   * @buffer: The ring buffer to reset a per cpu buffer of
  void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu)
  {
        struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
-       unsigned long flags;
  
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
                return;
        /* Make sure all commits have finished */
        synchronize_rcu();
  
-       raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+       reset_disabled_cpu_buffer(cpu_buffer);
  
-       if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
-               goto out;
+       atomic_dec(&cpu_buffer->record_disabled);
+       atomic_dec(&cpu_buffer->resize_disabled);
+ }
+ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
  
-       arch_spin_lock(&cpu_buffer->lock);
+ /**
+  * ring_buffer_reset_online_cpus - reset all online per-CPU buffers of a ring buffer
+  * @buffer: The ring buffer to reset the per-CPU buffers of
+  */
+ void ring_buffer_reset_online_cpus(struct trace_buffer *buffer)
+ {
+       struct ring_buffer_per_cpu *cpu_buffer;
+       int cpu;
  
-       rb_reset_cpu(cpu_buffer);
+       for_each_online_buffer_cpu(buffer, cpu) {
+               cpu_buffer = buffer->buffers[cpu];
  
-       arch_spin_unlock(&cpu_buffer->lock);
+               atomic_inc(&cpu_buffer->resize_disabled);
+               atomic_inc(&cpu_buffer->record_disabled);
+       }
  
-  out:
-       raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+       /* Make sure all commits have finished */
+       synchronize_rcu();
  
-       atomic_dec(&cpu_buffer->record_disabled);
-       atomic_dec(&cpu_buffer->resize_disabled);
+       for_each_online_buffer_cpu(buffer, cpu) {
+               cpu_buffer = buffer->buffers[cpu];
+               reset_disabled_cpu_buffer(cpu_buffer);
+               atomic_dec(&cpu_buffer->record_disabled);
+               atomic_dec(&cpu_buffer->resize_disabled);
+       }
  }
- EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
  
  /**
   * ring_buffer_reset - reset a ring buffer
   */
  void ring_buffer_reset(struct trace_buffer *buffer)
  {
+       struct ring_buffer_per_cpu *cpu_buffer;
        int cpu;
  
-       for_each_buffer_cpu(buffer, cpu)
-               ring_buffer_reset_cpu(buffer, cpu);
+       for_each_buffer_cpu(buffer, cpu) {
+               cpu_buffer = buffer->buffers[cpu];
+               atomic_inc(&cpu_buffer->resize_disabled);
+               atomic_inc(&cpu_buffer->record_disabled);
+       }
+       /* Make sure all commits have finished */
+       synchronize_rcu();
+       for_each_buffer_cpu(buffer, cpu) {
+               cpu_buffer = buffer->buffers[cpu];
+               reset_disabled_cpu_buffer(cpu_buffer);
+               atomic_dec(&cpu_buffer->record_disabled);
+               atomic_dec(&cpu_buffer->resize_disabled);
+       }
  }
  EXPORT_SYMBOL_GPL(ring_buffer_reset);
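The reworked reset paths above share one design point: every per-cpu buffer is disabled first, a single synchronize_rcu() then covers all of them, and only afterwards is each buffer reset, so a caller pays one RCU grace period regardless of the number of CPUs. A minimal caller sketch of the old versus the new pattern follows; only ring_buffer_reset_cpu() and ring_buffer_reset_online_cpus() are real, the wrapper function is hypothetical.

#include <linux/ring_buffer.h>

/* Hypothetical helper: clear every online per-cpu buffer of an instance. */
static void example_reset_all(struct trace_buffer *buffer)
{
	/*
	 * Old pattern: one synchronize_rcu() per CPU, hidden inside each
	 * ring_buffer_reset_cpu() call:
	 *
	 *	int cpu;
	 *	for_each_online_cpu(cpu)
	 *		ring_buffer_reset_cpu(buffer, cpu);
	 */

	/* New pattern: disable all buffers, wait once, reset them all. */
	ring_buffer_reset_online_cpus(buffer);
}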
  
diff --combined kernel/trace/trace.c
@@@ -1543,7 -1543,8 +1543,7 @@@ static void latency_fsnotify_workfn(str
  {
        struct trace_array *tr = container_of(work, struct trace_array,
                                              fsnotify_work);
 -      fsnotify(tr->d_max_latency->d_inode, FS_MODIFY,
 -               tr->d_max_latency->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 +      fsnotify_inode(tr->d_max_latency->d_inode, FS_MODIFY);
  }
  
  static void latency_fsnotify_workfn_irq(struct irq_work *iwork)
@@@ -2002,7 -2003,6 +2002,6 @@@ static void tracing_reset_cpu(struct ar
  void tracing_reset_online_cpus(struct array_buffer *buf)
  {
        struct trace_buffer *buffer = buf->buffer;
-       int cpu;
  
        if (!buffer)
                return;
  
        buf->time_start = buffer_ftrace_now(buf, buf->cpu);
  
-       for_each_online_cpu(cpu)
-               ring_buffer_reset_cpu(buffer, cpu);
+       ring_buffer_reset_online_cpus(buffer);
  
        ring_buffer_record_enable(buffer);
  }
@@@ -2931,12 -2930,6 +2929,6 @@@ static void __ftrace_trace_stack(struc
                skip++;
  #endif
  
-       /*
-        * Since events can happen in NMIs there's no safe way to
-        * use the per cpu ftrace_stacks. We reserve it and if an interrupt
-        * or NMI comes in, it will just have to use the default
-        * FTRACE_STACK_SIZE.
-        */
        preempt_disable_notrace();
  
        stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1;
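The hunk above drops an outdated comment; the remaining code reserves one of several per-cpu stack slots per nesting level, which is why the removed comment (written for an older single-slot scheme) no longer applies. Below is a generic sketch of that reservation pattern; the slot structure, its dimensions, and the function are hypothetical, only the per-cpu and preemption primitives are real.

#include <linux/percpu.h>
#include <linux/preempt.h>

#define EXAMPLE_NR_SLOTS	4	/* hypothetical maximum nesting depth */

struct example_slot {
	unsigned long entries[64];	/* hypothetical per-level scratch space */
};

struct example_slots {
	struct example_slot slot[EXAMPLE_NR_SLOTS];
};

static DEFINE_PER_CPU(struct example_slots, example_slots);
static DEFINE_PER_CPU(int, example_reserve);

static void example_use_slot(void)
{
	struct example_slot *s;
	int stackidx;

	preempt_disable_notrace();

	/* Each nesting level on this CPU claims the next free slot. */
	stackidx = __this_cpu_inc_return(example_reserve) - 1;
	if (stackidx < EXAMPLE_NR_SLOTS) {
		s = &this_cpu_ptr(&example_slots)->slot[stackidx];
		s->entries[0] = 0;	/* ... fill the reserved slot ... */
	}

	/* Release the slot for the next user at this nesting level. */
	__this_cpu_dec(example_reserve);
	preempt_enable_notrace();
}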
@@@ -3136,6 -3129,9 +3128,9 @@@ static int alloc_percpu_trace_buffer(vo
  {
        struct trace_buffer_struct *buffers;
  
+       if (trace_percpu_buffer)
+               return 0;
        buffers = alloc_percpu(struct trace_buffer_struct);
        if (MEM_FAIL(!buffers, "Could not allocate percpu trace_printk buffer"))
                return -ENOMEM;
@@@ -3338,6 -3334,26 +3333,26 @@@ int trace_array_vprintk(struct trace_ar
        return __trace_array_vprintk(tr->array_buffer.buffer, ip, fmt, args);
  }
  
+ /**
+  * trace_array_printk - Print a message to a specific instance
+  * @tr: The instance trace_array descriptor
+  * @ip: The instruction pointer that this is called from.
+  * @fmt: The format to print (printf format)
+  *
+  * If a subsystem sets up its own instance, it may printk strings into
+  * its tracing instance buffer using this function. Note, this function
+  * will not write into the top level buffer (use trace_printk() for
+  * that), as the top level buffer should only hold events that can be
+  * individually disabled. trace_printk() is meant only for debugging a
+  * kernel, and should never be incorporated into normal use.
+  *
+  * trace_array_printk() can be used, as it will not add noise to the
+  * top level tracing buffer.
+  *
+  * Note, trace_array_init_printk() must be called on @tr before this
+  * can be used.
+  */
  __printf(3, 0)
  int trace_array_printk(struct trace_array *tr,
                       unsigned long ip, const char *fmt, ...)
        int ret;
        va_list ap;
  
-       if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
-               return 0;
        if (!tr)
                return -ENOENT;
  
+       /* This is only allowed for created instances */
+       if (tr == &global_trace)
+               return 0;
+       if (!(tr->trace_flags & TRACE_ITER_PRINTK))
+               return 0;
        va_start(ap, fmt);
        ret = trace_array_vprintk(tr, ip, fmt, ap);
        va_end(ap);
  }
  EXPORT_SYMBOL_GPL(trace_array_printk);
  
+ /**
+  * trace_array_init_printk - Initialize buffers for trace_array_printk()
+  * @tr: The trace array to initialize the buffers for
+  *
+  * As trace_array_printk() only writes into instances, calls to it are
+  * fine to leave in the kernel (unlike trace_printk()). This needs to be
+  * called before trace_array_printk() can be used on a trace_array.
+  */
+ int trace_array_init_printk(struct trace_array *tr)
+ {
+       if (!tr)
+               return -ENOENT;
+       /* This is only allowed for created instances */
+       if (tr == &global_trace)
+               return -EINVAL;
+       return alloc_percpu_trace_buffer();
+ }
+ EXPORT_SYMBOL_GPL(trace_array_init_printk);
  __printf(3, 4)
  int trace_array_printk_buf(struct trace_buffer *buffer,
                           unsigned long ip, const char *fmt, ...)
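The kerneldoc added above spells out a calling order that is easy to miss: trace_array_init_printk() must be called on an instance before its first trace_array_printk(), and neither call is meant for the top level buffer. A minimal module-init sketch under those rules follows; the instance name and error handling are made up, and the header locations of the prototypes are an assumption, while the trace_array_*() functions are the ones documented above.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/trace.h>
#include <linux/trace_events.h>

static struct trace_array *example_tr;	/* hypothetical private instance */

static int __init example_init(void)
{
	/* Get (or create) a named instance; never the top level buffer. */
	example_tr = trace_array_get_by_name("example");
	if (!example_tr)
		return -ENOMEM;

	/* Allocate the per-cpu trace_printk() buffers for this instance. */
	if (trace_array_init_printk(example_tr))
		return -ENOMEM;

	trace_array_printk(example_tr, _THIS_IP_, "example instance ready\n");
	return 0;
}
module_init(example_init);
MODULE_LICENSE("GPL");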
@@@ -5886,7 -5927,7 +5926,7 @@@ int tracing_set_tracer(struct trace_arr
        }
  
        /* If trace pipe files are being read, we can't change the tracer */
-       if (tr->current_trace->ref) {
+       if (tr->trace_ref) {
                ret = -EBUSY;
                goto out;
        }
@@@ -6102,7 -6143,7 +6142,7 @@@ static int tracing_open_pipe(struct ino
  
        nonseekable_open(inode, filp);
  
-       tr->current_trace->ref++;
+       tr->trace_ref++;
  out:
        mutex_unlock(&trace_types_lock);
        return ret;
@@@ -6121,7 -6162,7 +6161,7 @@@ static int tracing_release_pipe(struct 
  
        mutex_lock(&trace_types_lock);
  
-       tr->current_trace->ref--;
+       tr->trace_ref--;
  
        if (iter->trace->pipe_close)
                iter->trace->pipe_close(iter);
@@@ -7405,7 -7446,7 +7445,7 @@@ static int tracing_buffers_open(struct 
        if (ret)
                return ret;
  
-       info = kzalloc(sizeof(*info), GFP_KERNEL);
+       info = kvzalloc(sizeof(*info), GFP_KERNEL);
        if (!info) {
                trace_array_put(tr);
                return -ENOMEM;
  
        filp->private_data = info;
  
-       tr->current_trace->ref++;
+       tr->trace_ref++;
  
        mutex_unlock(&trace_types_lock);
  
@@@ -7524,14 -7565,14 +7564,14 @@@ static int tracing_buffers_release(stru
  
        mutex_lock(&trace_types_lock);
  
-       iter->tr->current_trace->ref--;
+       iter->tr->trace_ref--;
  
        __trace_array_put(iter->tr);
  
        if (info->spare)
                ring_buffer_free_read_page(iter->array_buffer->buffer,
                                           info->spare_cpu, info->spare);
-       kfree(info);
+       kvfree(info);
  
        mutex_unlock(&trace_types_lock);
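The two hunks above move the per-reader buffer info from kzalloc()/kfree() to kvzalloc()/kvfree(), letting the allocation fall back to vmalloc() when contiguous pages are hard to come by. The pairing is the important part: memory that may have come from vmalloc() must be released with kvfree(), not kfree(). A minimal sketch of the pattern, with a hypothetical structure:

#include <linux/mm.h>
#include <linux/slab.h>

/* Hypothetical structure, big enough that kmalloc() can fail under fragmentation. */
struct example_info {
	char scratch[64 * 1024];
};

static struct example_info *example_info_alloc(void)
{
	/* Zeroed allocation: tries kmalloc() first, falls back to vmalloc(). */
	return kvzalloc(sizeof(struct example_info), GFP_KERNEL);
}

static void example_info_free(struct example_info *info)
{
	/* kvfree() copes with either origin; plain kfree() would not. */
	kvfree(info);
}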
  
@@@ -8732,7 -8773,7 +8772,7 @@@ static int __remove_instance(struct tra
        int i;
  
        /* Reference counter for a newly created trace array = 1. */
-       if (tr->ref > 1 || (tr->current_trace && tr->current_trace->ref))
+       if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
                return -EBUSY;
  
        list_del(&tr->list);
@@@ -8944,7 -8985,9 +8984,7 @@@ struct dentry *tracing_init_dentry(void
        if (tr->dir)
                return NULL;
  
 -      if (WARN_ON(!tracefs_initialized()) ||
 -              (IS_ENABLED(CONFIG_DEBUG_FS) &&
 -               WARN_ON(!debugfs_initialized())))
 +      if (WARN_ON(!tracefs_initialized()))
                return ERR_PTR(-ENODEV);
  
        /*