Merge tag 'hwlock-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/andersson...
diff --git a/kernel/fork.c b/kernel/fork.c
index 48ed227..4d32190 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -94,8 +94,8 @@
 #include <linux/thread_info.h>
 #include <linux/stackleak.h>
 #include <linux/kasan.h>
+#include <linux/scs.h>
 
-#include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -261,7 +261,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
                                             THREAD_SIZE_ORDER);
 
        if (likely(page)) {
-               tsk->stack = page_address(page);
+               tsk->stack = kasan_reset_tag(page_address(page));
                return tsk->stack;
        }
        return NULL;
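
Both kasan_reset_tag() calls added in this file (here and in the kmem_cache-backed allocator below) untag the freshly allocated stack pointer so tag-based KASAN treats it as a native kernel pointer. A minimal sketch of the idea, assuming arm64-style top-byte tags; with generic KASAN the real helper is simply a no-op:

#include <stdint.h>

#define KASAN_TAG_SHIFT		56
#define KASAN_TAG_KERNEL	0xffUL	/* native tag: accesses match any memory tag */

/* Return the pointer with its tag byte reset to the native kernel tag. */
static inline void *reset_tag_sketch(const void *addr)
{
	uintptr_t v = (uintptr_t)addr;

	v &= ~(0xffUL << KASAN_TAG_SHIFT);
	v |= KASAN_TAG_KERNEL << KASAN_TAG_SHIFT;
	return (void *)v;
}
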
@@ -276,13 +276,8 @@ static inline void free_thread_stack(struct task_struct *tsk)
        if (vm) {
                int i;
 
-               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
-                       mod_memcg_page_state(vm->pages[i],
-                                            MEMCG_KERNEL_STACK_KB,
-                                            -(int)(PAGE_SIZE / 1024));
-
+               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
                        memcg_kmem_uncharge_page(vm->pages[i], 0);
-               }
 
                for (i = 0; i < NR_CACHED_STACKS; i++) {
                        if (this_cpu_cmpxchg(cached_stacks[i],
@@ -307,6 +302,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 {
        unsigned long *stack;
        stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+       stack = kasan_reset_tag(stack);
        tsk->stack = stack;
        return stack;
 }
@@ -359,7 +355,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
        struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
        if (new) {
-               *new = *orig;
+               ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+               ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+               /*
+                * orig->shared.rb may be modified concurrently, but the clone
+                * will be reinitialized.
+                */
+               *new = data_race(*orig);
                INIT_LIST_HEAD(&new->anon_vma_chain);
                new->vm_next = new->vm_prev = NULL;
        }
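
ASSERT_EXCLUSIVE_WRITER() and data_race() are KCSAN annotations: the former asks the race detector to complain if any concurrent writer touches the named member while the copy runs, the latter marks a racy access as intentional so it is not reported. The same data_race() idiom guards the lockless nr_threads >= max_threads check further down. A hedged sketch with made-up struct and field names:

#include <linux/compiler.h>		/* data_race() */
#include <linux/kcsan-checks.h>		/* ASSERT_EXCLUSIVE_WRITER() */

struct stats {
	unsigned long committed;	/* written only under stats_lock */
	unsigned long approx;		/* read locklessly for reporting */
};

static unsigned long report_stats(struct stats *s)
{
	/* KCSAN flags a bug if ->committed is written concurrently. */
	ASSERT_EXCLUSIVE_WRITER(s->committed);

	/* A stale or torn value of ->approx is tolerated here. */
	return s->committed + data_race(s->approx);
}
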
@@ -376,31 +378,14 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
        void *stack = task_stack_page(tsk);
        struct vm_struct *vm = task_stack_vm_area(tsk);
 
-       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
-
-       if (vm) {
-               int i;
-
-               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
-               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
-                       mod_zone_page_state(page_zone(vm->pages[i]),
-                                           NR_KERNEL_STACK_KB,
-                                           PAGE_SIZE / 1024 * account);
-               }
-       } else {
-               /*
-                * All stack pages are in the same zone and belong to the
-                * same memcg.
-                */
-               struct page *first_page = virt_to_page(stack);
-
-               mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
-                                   THREAD_SIZE / 1024 * account);
 
-               mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
-                                   account * (THREAD_SIZE / 1024));
-       }
+       /* All stack pages are in the same node. */
+       if (vm)
+               mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
+                                     account * (THREAD_SIZE / 1024));
+       else
+               mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
+                                     account * (THREAD_SIZE / 1024));
 }
 
 static int memcg_charge_kernel_stack(struct task_struct *tsk)
@@ -409,24 +394,23 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
        struct vm_struct *vm = task_stack_vm_area(tsk);
        int ret;
 
+       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
        if (vm) {
                int i;
 
+               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
                for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
                        /*
                         * If memcg_kmem_charge_page() fails, page->mem_cgroup
-                        * pointer is NULL, and both memcg_kmem_uncharge_page()
-                        * and mod_memcg_page_state() in free_thread_stack()
-                        * will ignore this page. So it's safe.
+                        * pointer is NULL, and memcg_kmem_uncharge_page() in
+                        * free_thread_stack() will ignore this page.
                         */
                        ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
                                                     0);
                        if (ret)
                                return ret;
-
-                       mod_memcg_page_state(vm->pages[i],
-                                            MEMCG_KERNEL_STACK_KB,
-                                            PAGE_SIZE / 1024);
                }
        }
 #endif
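
The dropped mod_memcg_page_state() calls are no longer needed because per-memcg kernel-stack bytes are now accounted through the lruvec counters in account_kernel_stack(). The charge/uncharge pairing the remaining comment relies on looks roughly like this (illustrative helpers, not functions from this file):

#include <linux/gfp.h>
#include <linux/memcontrol.h>

/*
 * Charge nr stack pages to the current memcg.  If a charge fails the
 * caller frees the whole stack, and memcg_kmem_uncharge_page() simply
 * skips any page that never had a memcg attached.
 */
static int charge_stack_pages(struct page **pages, int nr)
{
	int i, ret;

	for (i = 0; i < nr; i++) {
		ret = memcg_kmem_charge_page(pages[i], GFP_KERNEL, 0);
		if (ret)
			return ret;
	}
	return 0;
}

static void uncharge_stack_pages(struct page **pages, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		memcg_kmem_uncharge_page(pages[i], 0);	/* no-op for uncharged pages */
}
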
@@ -456,6 +440,8 @@ void put_task_stack(struct task_struct *tsk)
 
 void free_task(struct task_struct *tsk)
 {
+       scs_release(tsk);
+
 #ifndef CONFIG_THREAD_INFO_IN_TASK
        /*
         * The task is finally done with both the stack and thread_info,
@@ -471,7 +457,6 @@ void free_task(struct task_struct *tsk)
 #endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
-       put_seccomp_filter(tsk);
        arch_release_task_struct(tsk);
        if (tsk->flags & PF_KTHREAD)
                free_kthread_struct(tsk);
@@ -490,7 +475,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
        LIST_HEAD(uf);
 
        uprobe_start_dup_mmap();
-       if (down_write_killable(&oldmm->mmap_sem)) {
+       if (mmap_write_lock_killable(oldmm)) {
                retval = -EINTR;
                goto fail_uprobe_end;
        }
@@ -499,7 +484,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
        /*
         * Not linked in yet - no deadlock potential:
         */
-       down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+       mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
 
        /* No ordering required: file already has been exposed. */
        RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
@@ -615,9 +600,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
        /* a new mm has just been created */
        retval = arch_dup_mmap(oldmm, mm);
 out:
-       up_write(&mm->mmap_sem);
+       mmap_write_unlock(mm);
        flush_tlb_mm(oldmm);
-       up_write(&oldmm->mmap_sem);
+       mmap_write_unlock(oldmm);
        dup_userfaultfd_complete(&uf);
 fail_uprobe_end:
        uprobe_end_dup_mmap();
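
All of the mmap_sem operations in this diff now go through the mmap locking API from include/linux/mmap_lock.h (mmap_write_lock(), mmap_write_lock_killable(), mmap_write_lock_nested(), mmap_write_unlock(), mmap_init_lock(), plus the read-side variants). A hedged sketch of a typical read-side user, with an invented helper name:

#include <linux/mmap_lock.h>

static int inspect_own_mm(struct mm_struct *mm)
{
	if (mmap_read_lock_killable(mm))	/* interruptible; fails with -EINTR */
		return -EINTR;

	/* ... walk mm->mmap / mm->map_count under the lock ... */

	mmap_read_unlock(mm);
	return 0;
}
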
@@ -647,9 +632,9 @@ static inline void mm_free_pgd(struct mm_struct *mm)
 #else
 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-       down_write(&oldmm->mmap_sem);
+       mmap_write_lock(oldmm);
        RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-       up_write(&oldmm->mmap_sem);
+       mmap_write_unlock(oldmm);
        return 0;
 }
 #define mm_alloc_pgd(mm)       (0)
@@ -840,6 +825,8 @@ void __init fork_init(void)
                          NULL, free_vm_stack_cache);
 #endif
 
+       scs_init();
+
        lockdep_init_task(&init_task);
        uprobes_init();
 }
@@ -899,6 +886,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        if (err)
                goto free_stack;
 
+       err = scs_prepare(tsk, node);
+       if (err)
+               goto free_stack;
+
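
The shadow call stack hooks introduced here pair up across a task's lifetime: scs_init() once from fork_init(), scs_prepare() per task here, and scs_release() from free_task() above. A condensed, hedged view of that pairing (the real work lives in kernel/scs.c):

#include <linux/sched.h>
#include <linux/scs.h>

/* Lifetime of a task's shadow call stack, as wired up in this diff. */
static int scs_lifecycle_sketch(struct task_struct *tsk, int node)
{
	int err;

	err = scs_prepare(tsk, node);	/* allocate the shadow stack at fork */
	if (err)
		return err;

	/* ... task runs; return addresses are spilled to the shadow stack ... */

	scs_release(tsk);		/* paired release from free_task() */
	return 0;
}
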
 #ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
@@ -1014,7 +1005,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        mm->vmacache_seqnum = 0;
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
-       init_rwsem(&mm->mmap_sem);
+       mmap_init_lock(mm);
        INIT_LIST_HEAD(&mm->mmlist);
        mm->core_state = NULL;
        mm_pgtables_bytes_init(mm);
@@ -1466,7 +1457,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
                goto out;
        }
 
-       newf = dup_fd(oldf, &error);
+       newf = dup_fd(oldf, NR_OPEN_MAX, &error);
        if (!newf)
                goto out;
 
@@ -1683,6 +1674,11 @@ static inline void rcu_copy_process(struct task_struct *p)
        INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
        p->rcu_tasks_idle_cpu = -1;
 #endif /* #ifdef CONFIG_TASKS_RCU */
+#ifdef CONFIG_TASKS_TRACE_RCU
+       p->trc_reader_nesting = 0;
+       p->trc_reader_special.s = 0;
+       INIT_LIST_HEAD(&p->trc_holdout_list);
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 }
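
The new trc_* fields initialise the child's state for the RCU Tasks Trace flavour (CONFIG_TASKS_TRACE_RCU). A hedged reader-side sketch using the rcupdate_trace.h API, with an assumed shared pointer; updaters would free old objects via call_rcu_tasks_trace():

#include <linux/rcupdate_trace.h>

struct cfg { int val; };
static struct cfg *shared_cfg;

static int peek_cfg(void)
{
	struct cfg *p;
	int v = -1;

	rcu_read_lock_trace();		/* bumps current->trc_reader_nesting */
	p = READ_ONCE(shared_cfg);
	if (p)
		v = p->val;
	rcu_read_unlock_trace();

	return v;
}
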
 
 struct pid *pidfd_pid(const struct file *file)
@@ -1745,7 +1741,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
        pid_t nr = -1;
 
        if (likely(pid_has_task(pid, PIDTYPE_PID))) {
-               ns = proc_pid_ns(file_inode(m->file));
+               ns = proc_pid_ns(file_inode(m->file)->i_sb);
                nr = pid_nr_ns(pid, ns);
        }
 
@@ -1774,22 +1770,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
  */
 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 {
-       struct task_struct *task;
        struct pid *pid = file->private_data;
        __poll_t poll_flags = 0;
 
        poll_wait(file, &pid->wait_pidfd, pts);
 
-       rcu_read_lock();
-       task = pid_task(pid, PIDTYPE_PID);
        /*
         * Inform pollers only when the whole thread group exits.
         * If the thread group leader exits before all other threads in the
         * group, then poll(2) should block, similar to the wait(2) family.
         */
-       if (!task || (task->exit_state && thread_group_empty(task)))
+       if (thread_group_exited(pid))
                poll_flags = EPOLLIN | EPOLLRDNORM;
-       rcu_read_unlock();
 
        return poll_flags;
 }
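
thread_group_exited() lets pidfd_poll() drop the explicit RCU-protected task lookup; the userspace-visible semantics are unchanged, as in this hedged sketch (the fallback syscall number is the generic one and is an assumption for older headers):

#define _GNU_SOURCE
#include <poll.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434		/* asm-generic syscall number */
#endif

/* Block until every thread in the target thread group has exited. */
static int wait_for_thread_group(pid_t pid)
{
	int pidfd = syscall(SYS_pidfd_open, pid, 0);
	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
	int ret;

	if (pidfd < 0)
		return -1;

	ret = poll(&pfd, 1, -1);	/* readable only once the group is gone */
	close(pidfd);
	return ret > 0 ? 0 : -1;
}
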
@@ -1941,8 +1933,8 @@ static __latent_entropy struct task_struct *copy_process(
 
        rt_mutex_init_task(p);
 
+       lockdep_assert_irqs_enabled();
 #ifdef CONFIG_PROVE_LOCKING
-       DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
        retval = -EAGAIN;
@@ -1964,7 +1956,7 @@ static __latent_entropy struct task_struct *copy_process(
         * to stop root fork bombs.
         */
        retval = -EAGAIN;
-       if (nr_threads >= max_threads)
+       if (data_race(nr_threads >= max_threads))
                goto bad_fork_cleanup_count;
 
        delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
@@ -2019,22 +2011,14 @@ static __latent_entropy struct task_struct *copy_process(
 #ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
-       seqcount_init(&p->mems_allowed_seq);
+       seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
-       p->irq_events = 0;
-       p->hardirqs_enabled = 0;
-       p->hardirq_enable_ip = 0;
-       p->hardirq_enable_event = 0;
-       p->hardirq_disable_ip = _THIS_IP_;
-       p->hardirq_disable_event = 0;
-       p->softirqs_enabled = 1;
-       p->softirq_enable_ip = _THIS_IP_;
-       p->softirq_enable_event = 0;
-       p->softirq_disable_ip = 0;
-       p->softirq_disable_event = 0;
-       p->hardirq_context = 0;
-       p->softirq_context = 0;
+       memset(&p->irqtrace, 0, sizeof(p->irqtrace));
+       p->irqtrace.hardirq_disable_ip  = _THIS_IP_;
+       p->irqtrace.softirq_enable_ip   = _THIS_IP_;
+       p->softirqs_enabled             = 1;
+       p->softirq_context              = 0;
 #endif
 
        p->pagefault_disabled = 0;
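
mems_allowed_seq becomes a seqcount_spinlock_t, i.e. a sequence counter with an associated lock (p->alloc_lock) that lockdep can check is held on the write side, while the hand-rolled irq-tracing fields collapse into the irqtrace struct. A hedged sketch of the seqcount_spinlock_t pattern with invented names:

#include <linux/seqlock.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(cfg_lock);
static seqcount_spinlock_t cfg_seq = SEQCNT_SPINLOCK_ZERO(cfg_seq, &cfg_lock);
static int cfg_value;

static void cfg_write(int v)
{
	spin_lock(&cfg_lock);		/* lockdep verifies this is held */
	write_seqcount_begin(&cfg_seq);
	cfg_value = v;
	write_seqcount_end(&cfg_seq);
	spin_unlock(&cfg_lock);
}

static int cfg_read(void)
{
	unsigned int seq;
	int v;

	do {
		seq = read_seqcount_begin(&cfg_seq);
		v = cfg_value;
	} while (read_seqcount_retry(&cfg_seq, seq));

	return v;
}
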
@@ -2091,8 +2075,7 @@ static __latent_entropy struct task_struct *copy_process(
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
-       retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
-                                args->tls);
+       retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
        if (retval)
                goto bad_fork_cleanup_io;
 
@@ -2291,6 +2274,7 @@ static __latent_entropy struct task_struct *copy_process(
        write_unlock_irq(&tasklist_lock);
 
        proc_fork_connector(p);
+       sched_post_fork(p);
        cgroup_post_fork(p, args);
        perf_event_fork(p);
 
@@ -2409,6 +2393,20 @@ long _do_fork(struct kernel_clone_args *args)
        int trace = 0;
        long nr;
 
+       /*
+        * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
+        * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
+        * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
+        * field in struct clone_args and it still doesn't make sense to have
+        * them both point at the same memory location. Performing this check
+        * here has the advantage that we don't need to have a separate helper
+        * to check for legacy clone().
+        */
+       if ((args->flags & CLONE_PIDFD) &&
+           (args->flags & CLONE_PARENT_SETTID) &&
+           (args->pidfd == args->parent_tid))
+               return -EINVAL;
+
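
The folded-in check rejects legacy clone() callers that set both CLONE_PIDFD and CLONE_PARENT_SETTID, since both would write through the same parent_tid pointer. With clone3() the two destinations are distinct struct clone_args fields, as in this hedged userspace sketch (the fallback syscall number is the generic one and is an assumption for older headers):

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_* flags */
#include <linux/types.h>
#include <signal.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SYS_clone3
#define SYS_clone3 435			/* asm-generic syscall number */
#endif

static pid_t fork_with_pidfd(int *pidfd, pid_t *parent_tid)
{
	struct clone_args args = {
		.flags		= CLONE_PIDFD | CLONE_PARENT_SETTID,
		.pidfd		= (__u64)(uintptr_t)pidfd,	/* separate field... */
		.parent_tid	= (__u64)(uintptr_t)parent_tid,	/* ...from this one */
		.exit_signal	= SIGCHLD,
	};

	return syscall(SYS_clone3, &args, sizeof(args));
}
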
        /*
         * Determine whether and which event to report to ptracer.  When
         * called from kernel_thread or CLONE_UNTRACED is explicitly
@@ -2466,42 +2464,6 @@ long _do_fork(struct kernel_clone_args *args)
        return nr;
 }
 
-bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
-{
-       /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
-       if ((kargs->flags & CLONE_PIDFD) &&
-           (kargs->flags & CLONE_PARENT_SETTID))
-               return false;
-
-       return true;
-}
-
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-/* For compatibility with architectures that call do_fork directly rather than
- * using the syscall entry points below. */
-long do_fork(unsigned long clone_flags,
-             unsigned long stack_start,
-             unsigned long stack_size,
-             int __user *parent_tidptr,
-             int __user *child_tidptr)
-{
-       struct kernel_clone_args args = {
-               .flags          = (lower_32_bits(clone_flags) & ~CSIGNAL),
-               .pidfd          = parent_tidptr,
-               .child_tid      = child_tidptr,
-               .parent_tid     = parent_tidptr,
-               .exit_signal    = (lower_32_bits(clone_flags) & CSIGNAL),
-               .stack          = stack_start,
-               .stack_size     = stack_size,
-       };
-
-       if (!legacy_clone_args_valid(&args))
-               return -EINVAL;
-
-       return _do_fork(&args);
-}
-#endif
-
 /*
  * Create a kernel thread.
  */
@@ -2580,24 +2542,12 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                .tls            = tls,
        };
 
-       if (!legacy_clone_args_valid(&args))
-               return -EINVAL;
-
        return _do_fork(&args);
 }
 #endif
 
 #ifdef __ARCH_WANT_SYS_CLONE3
 
-/*
- * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
- * the registers containing the syscall arguments for clone. This doesn't work
- * with clone3 since the TLS value is passed in clone_args instead.
- */
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-#error clone3 requires copy_thread_tls support in arch
-#endif
-
 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
                                              struct clone_args __user *uargs,
                                              size_t usize)
@@ -2894,14 +2844,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 /*
  * Unshare file descriptor table if it is being shared
  */
-static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
+              struct files_struct **new_fdp)
 {
        struct files_struct *fd = current->files;
        int error = 0;
 
        if ((unshare_flags & CLONE_FILES) &&
            (fd && atomic_read(&fd->count) > 1)) {
-               *new_fdp = dup_fd(fd, &error);
+               *new_fdp = dup_fd(fd, max_fds, &error);
                if (!*new_fdp)
                        return error;
        }
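
dup_fd() and unshare_fd() gain a max_fds argument so a caller can duplicate only the low part of the descriptor table; the callers in this file pass NR_OPEN_MAX to keep the old behaviour, while the close_range(CLOSE_RANGE_UNSHARE) path added alongside this change can pass a smaller bound so higher descriptors are never copied. A hedged userspace sketch of that path (the constants are assumptions for pre-5.9 headers, where <linux/close_range.h> is not yet available):

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SYS_close_range
#define SYS_close_range 436		/* asm-generic syscall number */
#endif
#ifndef CLOSE_RANGE_UNSHARE
#define CLOSE_RANGE_UNSHARE (1U << 1)	/* from <linux/close_range.h> on 5.9+ */
#endif

/* Unshare the fd table and close every descriptor >= 3 in the private copy. */
static int drop_high_fds(void)
{
	return syscall(SYS_close_range, 3U, ~0U, CLOSE_RANGE_UNSHARE);
}
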
@@ -2912,7 +2863,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
 /*
  * unshare allows a process to 'unshare' part of the process
  * context which was originally shared using clone.  copy_*
- * functions used by do_fork() cannot be used here directly
+ * functions used by _do_fork() cannot be used here directly
  * because they modify an inactive task_struct that is being
  * constructed. Here we are modifying the current, active,
  * task_struct.
@@ -2961,7 +2912,7 @@ int ksys_unshare(unsigned long unshare_flags)
        err = unshare_fs(unshare_flags, &new_fs);
        if (err)
                goto bad_unshare_out;
-       err = unshare_fd(unshare_flags, &new_fd);
+       err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
        if (err)
                goto bad_unshare_cleanup_fs;
        err = unshare_userns(unshare_flags, &new_cred);
@@ -3050,7 +3001,7 @@ int unshare_files(struct files_struct **displaced)
        struct files_struct *copy = NULL;
        int error;
 
-       error = unshare_fd(CLONE_FILES, &copy);
+       error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
        if (error || !copy) {
                *displaced = NULL;
                return error;