Merge tag 'hwlock-v5.9' of git://git.kernel.org/pub/scm/linux/kernel/git/andersson...
diff --git a/kernel/fork.c b/kernel/fork.c
index 48ed227..4d32190 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -94,8 +94,8 @@
 #include <linux/thread_info.h>
 #include <linux/stackleak.h>
 #include <linux/kasan.h>
+#include <linux/scs.h>
 
-#include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
 #include <asm/mmu_context.h>
@@ -261,7 +261,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
                                             THREAD_SIZE_ORDER);
 
        if (likely(page)) {
-               tsk->stack = page_address(page);
+               tsk->stack = kasan_reset_tag(page_address(page));
                return tsk->stack;
        }
        return NULL;
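
Both kasan_reset_tag() calls added in this file (here and in the kmem_cache-backed allocator below) untag the freshly allocated stack pointer so tag-based KASAN treats it as a native kernel pointer. A minimal sketch of the idea, assuming arm64-style top-byte tags; with generic KASAN the real helper is simply a no-op:

#include <stdint.h>

#define KASAN_TAG_SHIFT		56
#define KASAN_TAG_KERNEL	0xffUL	/* native tag: accesses match any memory tag */

/* Return the pointer with its tag byte reset to the native kernel tag. */
static inline void *reset_tag_sketch(const void *addr)
{
	uintptr_t v = (uintptr_t)addr;

	v &= ~(0xffUL << KASAN_TAG_SHIFT);
	v |= KASAN_TAG_KERNEL << KASAN_TAG_SHIFT;
	return (void *)v;
}
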
@@ -276,13 +276,8 @@ static inline void free_thread_stack(struct task_struct *tsk)
        if (vm) {
                int i;
 
-               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
-                       mod_memcg_page_state(vm->pages[i],
-                                            MEMCG_KERNEL_STACK_KB,
-                                            -(int)(PAGE_SIZE / 1024));
-
+               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
                        memcg_kmem_uncharge_page(vm->pages[i], 0);
-               }
 
                for (i = 0; i < NR_CACHED_STACKS; i++) {
                        if (this_cpu_cmpxchg(cached_stacks[i],
@@ -307,6 +302,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
 {
        unsigned long *stack;
        stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
+       stack = kasan_reset_tag(stack);
        tsk->stack = stack;
        return stack;
 }
@@ -359,7 +355,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
        struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
        if (new) {
-               *new = *orig;
+               ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+               ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+               /*
+                * orig->shared.rb may be modified concurrently, but the clone
+                * will be reinitialized.
+                */
+               *new = data_race(*orig);
                INIT_LIST_HEAD(&new->anon_vma_chain);
                new->vm_next = new->vm_prev = NULL;
        }
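
ASSERT_EXCLUSIVE_WRITER() and data_race() are KCSAN annotations: the former asks the race detector to complain if any concurrent writer touches the named member while the copy runs, the latter marks a racy access as intentional so it is not reported. The same data_race() idiom guards the lockless nr_threads >= max_threads check further down. A hedged sketch with made-up struct and field names:

#include <linux/compiler.h>		/* data_race() */
#include <linux/kcsan-checks.h>		/* ASSERT_EXCLUSIVE_WRITER() */

struct stats {
	unsigned long committed;	/* written only under stats_lock */
	unsigned long approx;		/* read locklessly for reporting */
};

static unsigned long report_stats(struct stats *s)
{
	/* KCSAN flags a bug if ->committed is written concurrently. */
	ASSERT_EXCLUSIVE_WRITER(s->committed);

	/* A stale or torn value of ->approx is tolerated here. */
	return s->committed + data_race(s->approx);
}
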
@@ -376,31 +378,14 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
        void *stack = task_stack_page(tsk);
        struct vm_struct *vm = task_stack_vm_area(tsk);
 
-       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
-
-       if (vm) {
-               int i;
-
-               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
-               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
-                       mod_zone_page_state(page_zone(vm->pages[i]),
-                                           NR_KERNEL_STACK_KB,
-                                           PAGE_SIZE / 1024 * account);
-               }
-       } else {
-               /*
-                * All stack pages are in the same zone and belong to the
-                * same memcg.
-                */
-               struct page *first_page = virt_to_page(stack);
-
-               mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
-                                   THREAD_SIZE / 1024 * account);
 
-               mod_memcg_obj_state(stack, MEMCG_KERNEL_STACK_KB,
-                                   account * (THREAD_SIZE / 1024));
-       }
+       /* All stack pages are in the same node. */
+       if (vm)
+               mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB,
+                                     account * (THREAD_SIZE / 1024));
+       else
+               mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB,
+                                     account * (THREAD_SIZE / 1024));
 }
 
 static int memcg_charge_kernel_stack(struct task_struct *tsk)
@@ -409,24 +394,23 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk)
        struct vm_struct *vm = task_stack_vm_area(tsk);
        int ret;
 
+       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+
        if (vm) {
                int i;
 
+               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
                for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
                        /*
                         * If memcg_kmem_charge_page() fails, page->mem_cgroup
-                        * pointer is NULL, and both memcg_kmem_uncharge_page()
-                        * and mod_memcg_page_state() in free_thread_stack()
-                        * will ignore this page. So it's safe.
+                        * pointer is NULL, and memcg_kmem_uncharge_page() in
+                        * free_thread_stack() will ignore this page.
                         */
                        ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
                                                     0);
                        if (ret)
                                return ret;
-
-                       mod_memcg_page_state(vm->pages[i],
-                                            MEMCG_KERNEL_STACK_KB,
-                                            PAGE_SIZE / 1024);
                }
        }
 #endif
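
The dropped mod_memcg_page_state() calls are no longer needed because per-memcg kernel-stack bytes are now accounted through the lruvec counters in account_kernel_stack(). The charge/uncharge pairing the remaining comment relies on looks roughly like this (illustrative helpers, not functions from this file):

#include <linux/gfp.h>
#include <linux/memcontrol.h>

/*
 * Charge nr stack pages to the current memcg.  If a charge fails the
 * caller frees the whole stack, and memcg_kmem_uncharge_page() simply
 * skips any page that never had a memcg attached.
 */
static int charge_stack_pages(struct page **pages, int nr)
{
	int i, ret;

	for (i = 0; i < nr; i++) {
		ret = memcg_kmem_charge_page(pages[i], GFP_KERNEL, 0);
		if (ret)
			return ret;
	}
	return 0;
}

static void uncharge_stack_pages(struct page **pages, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		memcg_kmem_uncharge_page(pages[i], 0);	/* no-op for uncharged pages */
}
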
@@ -456,6 +440,8 @@ void put_task_stack(struct task_struct *tsk)
 
 void free_task(struct task_struct *tsk)
 {
+       scs_release(tsk);
+
 #ifndef CONFIG_THREAD_INFO_IN_TASK
        /*
         * The task is finally done with both the stack and thread_info,
@@ -471,7 +457,6 @@ void free_task(struct task_struct *tsk)
 #endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
-       put_seccomp_filter(tsk);
        arch_release_task_struct(tsk);
        if (tsk->flags & PF_KTHREAD)
                free_kthread_struct(tsk);
@@ -490,7 +475,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
        LIST_HEAD(uf);
 
        uprobe_start_dup_mmap();
-       if (down_write_killable(&oldmm->mmap_sem)) {
+       if (mmap_write_lock_killable(oldmm)) {
                retval = -EINTR;
                goto fail_uprobe_end;
        }
@@ -499,7 +484,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
        /*
         * Not linked in yet - no deadlock potential:
         */
-       down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+       mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);
 
        /* No ordering required: file already has been exposed. */
        RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
@@ -615,9 +600,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
        /* a new mm has just been created */
        retval = arch_dup_mmap(oldmm, mm);
 out:
-       up_write(&mm->mmap_sem);
+       mmap_write_unlock(mm);
        flush_tlb_mm(oldmm);
-       up_write(&oldmm->mmap_sem);
+       mmap_write_unlock(oldmm);
        dup_userfaultfd_complete(&uf);
 fail_uprobe_end:
        uprobe_end_dup_mmap();
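
All of the mmap_sem operations in this diff now go through the mmap locking API from include/linux/mmap_lock.h (mmap_write_lock(), mmap_write_lock_killable(), mmap_write_lock_nested(), mmap_write_unlock(), mmap_init_lock(), plus the read-side variants). A hedged sketch of a typical read-side user, with an invented helper name:

#include <linux/mmap_lock.h>

static int inspect_own_mm(struct mm_struct *mm)
{
	if (mmap_read_lock_killable(mm))	/* interruptible; fails with -EINTR */
		return -EINTR;

	/* ... walk mm->mmap / mm->map_count under the lock ... */

	mmap_read_unlock(mm);
	return 0;
}
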
@@ -647,9 +632,9 @@ static inline void mm_free_pgd(struct mm_struct *mm)
 #else
 static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-       down_write(&oldmm->mmap_sem);
+       mmap_write_lock(oldmm);
        RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));
-       up_write(&oldmm->mmap_sem);
+       mmap_write_unlock(oldmm);
        return 0;
 }
 #define mm_alloc_pgd(mm)       (0)
@@ -840,6 +825,8 @@ void __init fork_init(void)
                          NULL, free_vm_stack_cache);
 #endif
 
+       scs_init();
+
        lockdep_init_task(&init_task);
        uprobes_init();
 }
@@ -899,6 +886,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        if (err)
                goto free_stack;
 
+       err = scs_prepare(tsk, node);
+       if (err)
+               goto free_stack;
+
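
The shadow call stack hooks introduced here pair up across a task's lifetime: scs_init() once from fork_init(), scs_prepare() per task here, and scs_release() from free_task() above. A condensed, hedged view of that pairing (the real work lives in kernel/scs.c):

#include <linux/sched.h>
#include <linux/scs.h>

/* Lifetime of a task's shadow call stack, as wired up in this diff. */
static int scs_lifecycle_sketch(struct task_struct *tsk, int node)
{
	int err;

	err = scs_prepare(tsk, node);	/* allocate the shadow stack at fork */
	if (err)
		return err;

	/* ... task runs; return addresses are spilled to the shadow stack ... */

	scs_release(tsk);		/* paired release from free_task() */
	return 0;
}
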
 #ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
@@ -1014,7 +1005,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        mm->vmacache_seqnum = 0;
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
-       init_rwsem(&mm->mmap_sem);
+       mmap_init_lock(mm);
        INIT_LIST_HEAD(&mm->mmlist);
        mm->core_state = NULL;
        mm_pgtables_bytes_init(mm);
@@ -1466,7 +1457,7 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
                goto out;
        }
 
-       newf = dup_fd(oldf, &error);
+       newf = dup_fd(oldf, NR_OPEN_MAX, &error);
        if (!newf)
                goto out;
 
@@ -1683,6 +1674,11 @@ static inline void rcu_copy_process(struct task_struct *p)
        INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
        p->rcu_tasks_idle_cpu = -1;
 #endif /* #ifdef CONFIG_TASKS_RCU */
+#ifdef CONFIG_TASKS_TRACE_RCU
+       p->trc_reader_nesting = 0;
+       p->trc_reader_special.s = 0;
+       INIT_LIST_HEAD(&p->trc_holdout_list);
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 }
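
The new trc_* fields initialise the child's state for the RCU Tasks Trace flavour (CONFIG_TASKS_TRACE_RCU). A hedged reader-side sketch using the rcupdate_trace.h API, with an assumed shared pointer; updaters would free old objects via call_rcu_tasks_trace():

#include <linux/rcupdate_trace.h>

struct cfg { int val; };
static struct cfg *shared_cfg;

static int peek_cfg(void)
{
	struct cfg *p;
	int v = -1;

	rcu_read_lock_trace();		/* bumps current->trc_reader_nesting */
	p = READ_ONCE(shared_cfg);
	if (p)
		v = p->val;
	rcu_read_unlock_trace();

	return v;
}
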
 
 struct pid *pidfd_pid(const struct file *file)
@@ -1745,7 +1741,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
        pid_t nr = -1;
 
        if (likely(pid_has_task(pid, PIDTYPE_PID))) {
-               ns = proc_pid_ns(file_inode(m->file));
+               ns = proc_pid_ns(file_inode(m->file)->i_sb);
                nr = pid_nr_ns(pid, ns);
        }
 
@@ -1774,22 +1770,18 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
  */
 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 {
-       struct task_struct *task;
        struct pid *pid = file->private_data;
        __poll_t poll_flags = 0;
 
        poll_wait(file, &pid->wait_pidfd, pts);
 
-       rcu_read_lock();
-       task = pid_task(pid, PIDTYPE_PID);
        /*
         * Inform pollers only when the whole thread group exits.
         * If the thread group leader exits before all other threads in the
         * group, then poll(2) should block, similar to the wait(2) family.
         */
-       if (!task || (task->exit_state && thread_group_empty(task)))
+       if (thread_group_exited(pid))
                poll_flags = EPOLLIN | EPOLLRDNORM;
-       rcu_read_unlock();
 
        return poll_flags;
 }
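
thread_group_exited() lets pidfd_poll() drop the explicit RCU-protected task lookup; the userspace-visible semantics are unchanged, as in this hedged sketch (the fallback syscall number is the generic one and is an assumption for older headers):

#define _GNU_SOURCE
#include <poll.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SYS_pidfd_open
#define SYS_pidfd_open 434		/* asm-generic syscall number */
#endif

/* Block until every thread in the target thread group has exited. */
static int wait_for_thread_group(pid_t pid)
{
	int pidfd = syscall(SYS_pidfd_open, pid, 0);
	struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
	int ret;

	if (pidfd < 0)
		return -1;

	ret = poll(&pfd, 1, -1);	/* readable only once the group is gone */
	close(pidfd);
	return ret > 0 ? 0 : -1;
}
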
@@ -1941,8 +1933,8 @@ static __latent_entropy struct task_struct *copy_process(
 
        rt_mutex_init_task(p);
 
+       lockdep_assert_irqs_enabled();
 #ifdef CONFIG_PROVE_LOCKING
-       DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
        retval = -EAGAIN;
@@ -1964,7 +1956,7 @@ static __latent_entropy struct task_struct *copy_process(
         * to stop root fork bombs.
         */
        retval = -EAGAIN;
-       if (nr_threads >= max_threads)
+       if (data_race(nr_threads >= max_threads))
                goto bad_fork_cleanup_count;
 
        delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
@@ -2019,22 +2011,14 @@ static __latent_entropy struct task_struct *copy_process(
 #ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
-       seqcount_init(&p->mems_allowed_seq);
+       seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
-       p->irq_events = 0;
-       p->hardirqs_enabled = 0;
-       p->hardirq_enable_ip = 0;
-       p->hardirq_enable_event = 0;
-       p->hardirq_disable_ip = _THIS_IP_;
-       p->hardirq_disable_event = 0;
-       p->softirqs_enabled = 1;
-       p->softirq_enable_ip = _THIS_IP_;
-       p->softirq_enable_event = 0;
-       p->softirq_disable_ip = 0;
-       p->softirq_disable_event = 0;
-       p->hardirq_context = 0;
-       p->softirq_context = 0;
+       memset(&p->irqtrace, 0, sizeof(p->irqtrace));
+       p->irqtrace.hardirq_disable_ip  = _THIS_IP_;
+       p->irqtrace.softirq_enable_ip   = _THIS_IP_;
+       p->softirqs_enabled             = 1;
+       p->softirq_context              = 0;
 #endif
 
        p->pagefault_disabled = 0;
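
mems_allowed_seq becomes a seqcount_spinlock_t, i.e. a sequence counter with an associated lock (p->alloc_lock) that lockdep can check is held on the write side, while the hand-rolled irq-tracing fields collapse into the irqtrace struct. A hedged sketch of the seqcount_spinlock_t pattern with invented names:

#include <linux/seqlock.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(cfg_lock);
static seqcount_spinlock_t cfg_seq = SEQCNT_SPINLOCK_ZERO(cfg_seq, &cfg_lock);
static int cfg_value;

static void cfg_write(int v)
{
	spin_lock(&cfg_lock);		/* lockdep verifies this is held */
	write_seqcount_begin(&cfg_seq);
	cfg_value = v;
	write_seqcount_end(&cfg_seq);
	spin_unlock(&cfg_lock);
}

static int cfg_read(void)
{
	unsigned int seq;
	int v;

	do {
		seq = read_seqcount_begin(&cfg_seq);
		v = cfg_value;
	} while (read_seqcount_retry(&cfg_seq, seq));

	return v;
}
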
@@ -2091,8 +2075,7 @@ static __latent_entropy struct task_struct *copy_process(
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
-       retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
-                                args->tls);
+       retval = copy_thread(clone_flags, args->stack, args->stack_size, p, args->tls);
        if (retval)
                goto bad_fork_cleanup_io;
 
@@ -2291,6 +2274,7 @@ static __latent_entropy struct task_struct *copy_process(
        write_unlock_irq(&tasklist_lock);
 
        proc_fork_connector(p);
+       sched_post_fork(p);
        cgroup_post_fork(p, args);
        perf_event_fork(p);
 
@@ -2409,6 +2393,20 @@ long _do_fork(struct kernel_clone_args *args)
        int trace = 0;
        long nr;
 
+       /*
+        * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
+        * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
+        * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
+        * field in struct clone_args and it still doesn't make sense to have
+        * them both point at the same memory location. Performing this check
+        * here has the advantage that we don't need to have a separate helper
+        * to check for legacy clone().
+        */
+       if ((args->flags & CLONE_PIDFD) &&
+           (args->flags & CLONE_PARENT_SETTID) &&
+           (args->pidfd == args->parent_tid))
+               return -EINVAL;
+
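
The folded-in check rejects legacy clone() callers that set both CLONE_PIDFD and CLONE_PARENT_SETTID, since both would write through the same parent_tid pointer. With clone3() the two destinations are distinct struct clone_args fields, as in this hedged userspace sketch (the fallback syscall number is the generic one and is an assumption for older headers):

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_* flags */
#include <linux/types.h>
#include <signal.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SYS_clone3
#define SYS_clone3 435			/* asm-generic syscall number */
#endif

static pid_t fork_with_pidfd(int *pidfd, pid_t *parent_tid)
{
	struct clone_args args = {
		.flags		= CLONE_PIDFD | CLONE_PARENT_SETTID,
		.pidfd		= (__u64)(uintptr_t)pidfd,	/* separate field... */
		.parent_tid	= (__u64)(uintptr_t)parent_tid,	/* ...from this one */
		.exit_signal	= SIGCHLD,
	};

	return syscall(SYS_clone3, &args, sizeof(args));
}
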
        /*
         * Determine whether and which event to report to ptracer.  When
         * called from kernel_thread or CLONE_UNTRACED is explicitly
@@ -2466,42 +2464,6 @@ long _do_fork(struct kernel_clone_args *args)
        return nr;
 }
 
-bool legacy_clone_args_valid(const struct kernel_clone_args *kargs)
-{
-       /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
-       if ((kargs->flags & CLONE_PIDFD) &&
-           (kargs->flags & CLONE_PARENT_SETTID))
-               return false;
-
-       return true;
-}
-
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-/* For compatibility with architectures that call do_fork directly rather than
- * using the syscall entry points below. */
-long do_fork(unsigned long clone_flags,
-             unsigned long stack_start,
-             unsigned long stack_size,
-             int __user *parent_tidptr,
-             int __user *child_tidptr)
-{
-       struct kernel_clone_args args = {
-               .flags          = (lower_32_bits(clone_flags) & ~CSIGNAL),
-               .pidfd          = parent_tidptr,
-               .child_tid      = child_tidptr,
-               .parent_tid     = parent_tidptr,
-               .exit_signal    = (lower_32_bits(clone_flags) & CSIGNAL),
-               .stack          = stack_start,
-               .stack_size     = stack_size,
-       };
-
-       if (!legacy_clone_args_valid(&args))
-               return -EINVAL;
-
-       return _do_fork(&args);
-}
-#endif
-
 /*
  * Create a kernel thread.
  */
@@ -2580,24 +2542,12 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                .tls            = tls,
        };
 
-       if (!legacy_clone_args_valid(&args))
-               return -EINVAL;
-
        return _do_fork(&args);
 }
 #endif
 
 #ifdef __ARCH_WANT_SYS_CLONE3
 
-/*
- * copy_thread implementations handle CLONE_SETTLS by reading the TLS value from
- * the registers containing the syscall arguments for clone. This doesn't work
- * with clone3 since the TLS value is passed in clone_args instead.
- */
-#ifndef CONFIG_HAVE_COPY_THREAD_TLS
-#error clone3 requires copy_thread_tls support in arch
-#endif
-
 noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
                                              struct clone_args __user *uargs,
                                              size_t usize)
@@ -2894,14 +2844,15 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 /*
  * Unshare file descriptor table if it is being shared
  */
-static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
+              struct files_struct **new_fdp)
 {
        struct files_struct *fd = current->files;
        int error = 0;
 
        if ((unshare_flags & CLONE_FILES) &&
            (fd && atomic_read(&fd->count) > 1)) {
-               *new_fdp = dup_fd(fd, &error);
+               *new_fdp = dup_fd(fd, max_fds, &error);
                if (!*new_fdp)
                        return error;
        }
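
dup_fd() and unshare_fd() gain a max_fds argument so a caller can duplicate only the low part of the descriptor table; the callers in this file pass NR_OPEN_MAX to keep the old behaviour, while the close_range(CLOSE_RANGE_UNSHARE) path added alongside this change can pass a smaller bound so higher descriptors are never copied. A hedged userspace sketch of that path (the constants are assumptions for pre-5.9 headers, where <linux/close_range.h> is not yet available):

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SYS_close_range
#define SYS_close_range 436		/* asm-generic syscall number */
#endif
#ifndef CLOSE_RANGE_UNSHARE
#define CLOSE_RANGE_UNSHARE (1U << 1)	/* from <linux/close_range.h> on 5.9+ */
#endif

/* Unshare the fd table and close every descriptor >= 3 in the private copy. */
static int drop_high_fds(void)
{
	return syscall(SYS_close_range, 3U, ~0U, CLOSE_RANGE_UNSHARE);
}
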
@@ -2912,7 +2863,7 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
 /*
  * unshare allows a process to 'unshare' part of the process
  * context which was originally shared using clone.  copy_*
- * functions used by do_fork() cannot be used here directly
+ * functions used by _do_fork() cannot be used here directly
  * because they modify an inactive task_struct that is being
  * constructed. Here we are modifying the current, active,
  * task_struct.
@@ -2961,7 +2912,7 @@ int ksys_unshare(unsigned long unshare_flags)
        err = unshare_fs(unshare_flags, &new_fs);
        if (err)
                goto bad_unshare_out;
-       err = unshare_fd(unshare_flags, &new_fd);
+       err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
        if (err)
                goto bad_unshare_cleanup_fs;
        err = unshare_userns(unshare_flags, &new_cred);
@@ -3050,7 +3001,7 @@ int unshare_files(struct files_struct **displaced)
        struct files_struct *copy = NULL;
        int error;
 
-       error = unshare_fd(CLONE_FILES, &copy);
+       error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
        if (error || !copy) {
                *displaced = NULL;
                return error;