kernel/latencytop.c: rename clear_all_latency_tracing to clear_tsk_latency_tracing
[linux-2.6-microblaze.git] / kernel / fork.c
index fbe9dfc..b4cba95 100644 (file)
@@ -11,6 +11,7 @@
  * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
  */
 
+#include <linux/anon_inodes.h>
 #include <linux/slab.h>
 #include <linux/sched/autogroup.h>
 #include <linux/sched/mm.h>
@@ -21,6 +22,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/seq_file.h>
 #include <linux/rtmutex.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
@@ -953,6 +955,15 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
/*
 * Clear @mm's owner pointer if it still refers to @p, so that a task
 * being torn down (e.g. on a failed fork) does not leave a dangling
 * mm->owner behind.  No-op unless CONFIG_MEMCG, which is the only
 * configuration where struct mm_struct has an owner field.
 *
 * WRITE_ONCE() is used because mm->owner is presumably also read
 * locklessly elsewhere — NOTE(review): confirm against memcg readers.
 */
static __always_inline void mm_clear_owner(struct mm_struct *mm,
					   struct task_struct *p)
{
#ifdef CONFIG_MEMCG
	if (mm->owner == p)
		WRITE_ONCE(mm->owner, NULL);
#endif
}
+
 static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 {
 #ifdef CONFIG_MEMCG
@@ -1223,7 +1234,9 @@ static int wait_for_vfork_done(struct task_struct *child,
        int killed;
 
        freezer_do_not_count();
+       cgroup_enter_frozen();
        killed = wait_for_completion_killable(vfork);
+       cgroup_leave_frozen(false);
        freezer_count();
 
        if (killed) {
@@ -1339,6 +1352,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
 free_pt:
        /* don't put binfmt in mmput, we haven't got module yet */
        mm->binfmt = NULL;
+       mm_init_owner(mm, NULL);
        mmput(mm);
 
 fail_nomem:
@@ -1670,6 +1684,73 @@ static inline void rcu_copy_process(struct task_struct *p)
 #endif /* #ifdef CONFIG_TASKS_RCU */
 }
 
+static int pidfd_release(struct inode *inode, struct file *file)
+{
+       struct pid *pid = file->private_data;
+
+       file->private_data = NULL;
+       put_pid(pid);
+       return 0;
+}
+
#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo handler: print "Pid:\t<nr>" for the pid this pidfd
 * references, translated into the pid namespace of the procfs mount
 * the fdinfo file was opened through.
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct pid *pid = f->private_data;

	seq_put_decimal_ull(m, "Pid:\t",
			    pid_nr_ns(pid, proc_pid_ns(file_inode(m->file))));
	seq_putc(m, '\n');
}
#endif
+
/*
 * File operations for pidfd file descriptors (CLONE_PIDFD).
 * ->release drops the struct pid reference taken at creation time;
 * ->show_fdinfo (procfs builds only) reports the referenced pid number.
 */
const struct file_operations pidfd_fops = {
	.release = pidfd_release,
#ifdef CONFIG_PROC_FS
	.show_fdinfo = pidfd_show_fdinfo,
#endif
};
+
+/**
+ * pidfd_create() - Create a new pid file descriptor.
+ *
+ * @pid:  struct pid that the pidfd will reference
+ *
+ * This creates a new pid file descriptor with the O_CLOEXEC flag set.
+ *
+ * Note, that this function can only be called after the fd table has
+ * been unshared to avoid leaking the pidfd to the new process.
+ *
+ * Return: On success, a cloexec pidfd is returned.
+ *         On error, a negative errno number will be returned.
+ */
+static int pidfd_create(struct pid *pid)
+{
+       int fd;
+
+       fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
+                             O_RDWR | O_CLOEXEC);
+       if (fd < 0)
+               put_pid(pid);
+
+       return fd;
+}
+
+static void __delayed_free_task(struct rcu_head *rhp)
+{
+       struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
+       free_task(tsk);
+}
+
+static __always_inline void delayed_free_task(struct task_struct *tsk)
+{
+       if (IS_ENABLED(CONFIG_MEMCG))
+               call_rcu(&tsk->rcu, __delayed_free_task);
+       else
+               free_task(tsk);
+}
+
 /*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
@@ -1682,13 +1763,14 @@ static __latent_entropy struct task_struct *copy_process(
                                        unsigned long clone_flags,
                                        unsigned long stack_start,
                                        unsigned long stack_size,
+                                       int __user *parent_tidptr,
                                        int __user *child_tidptr,
                                        struct pid *pid,
                                        int trace,
                                        unsigned long tls,
                                        int node)
 {
-       int retval;
+       int pidfd = -1, retval;
        struct task_struct *p;
        struct multiprocess_signals delayed;
 
@@ -1738,6 +1820,31 @@ static __latent_entropy struct task_struct *copy_process(
                        return ERR_PTR(-EINVAL);
        }
 
+       if (clone_flags & CLONE_PIDFD) {
+               int reserved;
+
+               /*
+                * - CLONE_PARENT_SETTID is useless for pidfds and also
+                *   parent_tidptr is used to return pidfds.
+                * - CLONE_DETACHED is blocked so that we can potentially
+                *   reuse it later for CLONE_PIDFD.
+                * - CLONE_THREAD is blocked until someone really needs it.
+                */
+               if (clone_flags &
+                   (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
+                       return ERR_PTR(-EINVAL);
+
+               /*
+                * Verify that parent_tidptr is sane so we can potentially
+                * reuse it later.
+                */
+               if (get_user(reserved, parent_tidptr))
+                       return ERR_PTR(-EFAULT);
+
+               if (reserved != 0)
+                       return ERR_PTR(-EINVAL);
+       }
+
        /*
         * Force any signals received before this point to be delivered
         * before the fork happens.  Collect up signals sent to multiple
@@ -1944,6 +2051,22 @@ static __latent_entropy struct task_struct *copy_process(
                }
        }
 
+       /*
+        * This has to happen after we've potentially unshared the file
+        * descriptor table (so that the pidfd doesn't leak into the child
+        * if the fd table isn't shared).
+        */
+       if (clone_flags & CLONE_PIDFD) {
+               retval = pidfd_create(pid);
+               if (retval < 0)
+                       goto bad_fork_free_pid;
+
+               pidfd = retval;
+               retval = put_user(pidfd, parent_tidptr);
+               if (retval)
+                       goto bad_fork_put_pidfd;
+       }
+
 #ifdef CONFIG_BLOCK
        p->plug = NULL;
 #endif
@@ -1970,7 +2093,7 @@ static __latent_entropy struct task_struct *copy_process(
 #ifdef TIF_SYSCALL_EMU
        clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
 #endif
-       clear_all_latency_tracing(p);
+       clear_tsk_latency_tracing(p);
 
        /* ok, now we should be set up.. */
        p->pid = pid_nr(pid);
@@ -2004,7 +2127,7 @@ static __latent_entropy struct task_struct *copy_process(
         */
        retval = cgroup_can_fork(p);
        if (retval)
-               goto bad_fork_free_pid;
+               goto bad_fork_cgroup_threadgroup_change_end;
 
        /*
         * From this point on we must avoid any synchronous user-space
@@ -2119,8 +2242,12 @@ bad_fork_cancel_cgroup:
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
        cgroup_cancel_fork(p);
-bad_fork_free_pid:
+bad_fork_cgroup_threadgroup_change_end:
        cgroup_threadgroup_change_end(current);
+bad_fork_put_pidfd:
+       if (clone_flags & CLONE_PIDFD)
+               ksys_close(pidfd);
+bad_fork_free_pid:
        if (pid != &init_struct_pid)
                free_pid(pid);
 bad_fork_cleanup_thread:
@@ -2131,8 +2258,10 @@ bad_fork_cleanup_io:
 bad_fork_cleanup_namespaces:
        exit_task_namespaces(p);
 bad_fork_cleanup_mm:
-       if (p->mm)
+       if (p->mm) {
+               mm_clear_owner(p->mm, p);
                mmput(p->mm);
+       }
 bad_fork_cleanup_signal:
        if (!(clone_flags & CLONE_THREAD))
                free_signal_struct(p->signal);
@@ -2163,7 +2292,7 @@ bad_fork_cleanup_count:
 bad_fork_free:
        p->state = TASK_DEAD;
        put_task_stack(p);
-       free_task(p);
+       delayed_free_task(p);
 fork_out:
        spin_lock_irq(&current->sighand->siglock);
        hlist_del_init(&delayed.node);
@@ -2184,7 +2313,7 @@ static inline void init_idle_pids(struct task_struct *idle)
 struct task_struct *fork_idle(int cpu)
 {
        struct task_struct *task;
-       task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0,
+       task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
                            cpu_to_node(cpu));
        if (!IS_ERR(task)) {
                init_idle_pids(task);
@@ -2236,7 +2365,7 @@ long _do_fork(unsigned long clone_flags,
                        trace = 0;
        }
 
-       p = copy_process(clone_flags, stack_start, stack_size,
+       p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
                         child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
        add_latent_entropy();