do_wait: make PIDTYPE_PID case O(1) instead of O(n)
author Jim Newsome <jnewsome@torproject.org>
Fri, 7 May 2021 01:04:22 +0000 (18:04 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 7 May 2021 02:24:13 +0000 (19:24 -0700)
Add a special-case when waiting on a pid (via waitpid, waitid, wait4, etc)
to avoid doing an O(n) scan of children and tracees, and instead do an
O(1) lookup.  This improves performance when waiting on a pid from a
thread group with many children and/or tracees.

Time to fork and then call waitpid on the child, from a task that already
has N children [1]:

N    | Before  | After
-----|---------|------
1    | 74 us   | 74 us
20   | 72 us   | 75 us
100  | 83 us   | 77 us
500  | 99 us   | 74 us
1000 | 179 us  | 75 us
5000 | 804 us  | 79 us
8000 | 1268 us | 78 us

[1]: https://lkml.org/lkml/2021/3/12/1567

This can make a substantial performance improvement for applications with
a thread that has many children or tracees and frequently needs to wait on
them.  Tools that use ptrace to intercept syscalls for a large number of
processes are likely to fall into this category.  In particular this patch
was developed while building a ptrace-based second generation of the
Shadow emulator [2], for which it allows us to avoid quadratic scaling
(without having to use a workaround that introduces a ~40% performance
penalty) [3].  Other examples of tools that fall into this category which
this patch may help include User Mode Linux [4] and DetTrace [5].

[2]: https://shadow.github.io/
[3]: https://github.com/shadow/shadow/issues/1134#issuecomment-798992292
[4]: https://en.wikipedia.org/wiki/User-mode_Linux
[5]: https://github.com/dettrace/dettrace

Link: https://lkml.kernel.org/r/20210314231544.9379-1-jnewsome@torproject.org
Signed-off-by: James Newsome <jnewsome@torproject.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: Christian Brauner <christian@brauner.io>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
diff --git a/kernel/exit.c b/kernel/exit.c
index 0596526..fd1c041 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1440,9 +1440,48 @@ void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
                           TASK_INTERRUPTIBLE, p);
 }
 
+static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
+                                struct task_struct *target)
+{
+       struct task_struct *parent =
+               !ptrace ? target->real_parent : target->parent;
+
+       return current == parent || (!(wo->wo_flags & __WNOTHREAD) &&
+                                    same_thread_group(current, parent));
+}
+
+/*
+ * Optimization for waiting on PIDTYPE_PID. No need to iterate through child
+ * and tracee lists to find the target task.
+ */
+static int do_wait_pid(struct wait_opts *wo)
+{
+       bool ptrace;
+       struct task_struct *target;
+       int retval;
+
+       ptrace = false;
+       target = pid_task(wo->wo_pid, PIDTYPE_TGID);
+       if (target && is_effectively_child(wo, ptrace, target)) {
+               retval = wait_consider_task(wo, ptrace, target);
+               if (retval)
+                       return retval;
+       }
+
+       ptrace = true;
+       target = pid_task(wo->wo_pid, PIDTYPE_PID);
+       if (target && target->ptrace &&
+           is_effectively_child(wo, ptrace, target)) {
+               retval = wait_consider_task(wo, ptrace, target);
+               if (retval)
+                       return retval;
+       }
+
+       return 0;
+}
+
 static long do_wait(struct wait_opts *wo)
 {
-       struct task_struct *tsk;
        int retval;
 
        trace_sched_process_wait(wo->wo_pid);
@@ -1464,19 +1503,27 @@ repeat:
 
        set_current_state(TASK_INTERRUPTIBLE);
        read_lock(&tasklist_lock);
-       tsk = current;
-       do {
-               retval = do_wait_thread(wo, tsk);
-               if (retval)
-                       goto end;
 
-               retval = ptrace_do_wait(wo, tsk);
+       if (wo->wo_type == PIDTYPE_PID) {
+               retval = do_wait_pid(wo);
                if (retval)
                        goto end;
+       } else {
+               struct task_struct *tsk = current;
+
+               do {
+                       retval = do_wait_thread(wo, tsk);
+                       if (retval)
+                               goto end;
 
-               if (wo->wo_flags & __WNOTHREAD)
-                       break;
-       } while_each_thread(current, tsk);
+                       retval = ptrace_do_wait(wo, tsk);
+                       if (retval)
+                               goto end;
+
+                       if (wo->wo_flags & __WNOTHREAD)
+                               break;
+               } while_each_thread(current, tsk);
+       }
        read_unlock(&tasklist_lock);
 
 notask: