diff --git a/kernel/fork.c b/kernel/fork.c
index 2b6bd51..08969f5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
 #include <linux/fdtable.h>
 #include <linux/iocontext.h>
 #include <linux/key.h>
+#include <linux/kmsan.h>
 #include <linux/binfmts.h>
 #include <linux/mman.h>
 #include <linux/mmu_notifier.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/mm_inline.h>
-#include <linux/vmacache.h>
 #include <linux/nsproxy.h>
 #include <linux/capability.h>
 #include <linux/cpu.h>
@@ -97,7 +97,6 @@
 #include <linux/scs.h>
 #include <linux/io_uring.h>
 #include <linux/bpf.h>
-#include <linux/sched/mm.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -475,7 +474,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
                 */
                *new = data_race(*orig);
                INIT_LIST_HEAD(&new->anon_vma_chain);
-               new->vm_next = new->vm_prev = NULL;
                dup_anon_vma_name(orig, new);
        }
        return new;
@@ -580,11 +578,12 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
 static __latent_entropy int dup_mmap(struct mm_struct *mm,
                                        struct mm_struct *oldmm)
 {
-       struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
-       struct rb_node **rb_link, *rb_parent;
+       struct vm_area_struct *mpnt, *tmp;
        int retval;
-       unsigned long charge;
+       unsigned long charge = 0;
        LIST_HEAD(uf);
+       MA_STATE(old_mas, &oldmm->mm_mt, 0, 0);
+       MA_STATE(mas, &mm->mm_mt, 0, 0);
 
        uprobe_start_dup_mmap();
        if (mmap_write_lock_killable(oldmm)) {
@@ -606,16 +605,16 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
        mm->exec_vm = oldmm->exec_vm;
        mm->stack_vm = oldmm->stack_vm;
 
-       rb_link = &mm->mm_rb.rb_node;
-       rb_parent = NULL;
-       pprev = &mm->mmap;
        retval = ksm_fork(mm, oldmm);
        if (retval)
                goto out;
        khugepaged_fork(mm, oldmm);
 
-       prev = NULL;
-       for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+       retval = mas_expected_entries(&mas, oldmm->map_count);
+       if (retval)
+               goto out;
+
+       mas_for_each(&old_mas, mpnt, ULONG_MAX) {
                struct file *file;
 
                if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -629,7 +628,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
                 */
                if (fatal_signal_pending(current)) {
                        retval = -EINTR;
-                       goto out;
+                       goto loop_out;
                }
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned long len = vma_pages(mpnt);
@@ -675,24 +674,17 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
                }
 
                /*
-                * Clear hugetlb-related page reserves for children. This only
-                * affects MAP_PRIVATE mappings. Faults generated by the child
-                * are not guaranteed to succeed, even if read-only
+                * Copy/update hugetlb private vma information.
                 */
                if (is_vm_hugetlb_page(tmp))
-                       reset_vma_resv_huge_pages(tmp);
-
-               /*
-                * Link in the new vma and copy the page table entries.
-                */
-               *pprev = tmp;
-               pprev = &tmp->vm_next;
-               tmp->vm_prev = prev;
-               prev = tmp;
+                       hugetlb_dup_vma_private(tmp);
 
-               __vma_link_rb(mm, tmp, rb_link, rb_parent);
-               rb_link = &tmp->vm_rb.rb_right;
-               rb_parent = &tmp->vm_rb;
+               /* Link the vma into the MT */
+               mas.index = tmp->vm_start;
+               mas.last = tmp->vm_end - 1;
+               mas_store(&mas, tmp);
+               if (mas_is_err(&mas))
+                       goto fail_nomem_mas_store;
 
                mm->map_count++;
                if (!(tmp->vm_flags & VM_WIPEONFORK))
@@ -702,10 +694,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
                        tmp->vm_ops->open(tmp);
 
                if (retval)
-                       goto out;
+                       goto loop_out;
        }
        /* a new mm has just been created */
        retval = arch_dup_mmap(oldmm, mm);
+loop_out:
+       mas_destroy(&mas);
 out:
        mmap_write_unlock(mm);
        flush_tlb_mm(oldmm);
@@ -714,6 +708,9 @@ out:
 fail_uprobe_end:
        uprobe_end_dup_mmap();
        return retval;
+
+fail_nomem_mas_store:
+       unlink_anon_vmas(tmp);
 fail_nomem_anon_vma_fork:
        mpol_put(vma_policy(tmp));
 fail_nomem_policy:
@@ -721,7 +718,7 @@ fail_nomem_policy:
 fail_nomem:
        retval = -ENOMEM;
        vm_unacct_memory(charge);
-       goto out;
+       goto loop_out;
 }
 
 static inline int mm_alloc_pgd(struct mm_struct *mm)
@@ -925,13 +922,13 @@ void __init fork_init(void)
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];
 
-       for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
+       for (i = 0; i < UCOUNT_COUNTS; i++)
                init_user_ns.ucount_max[i] = max_threads/2;
 
-       set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC,      RLIM_INFINITY);
-       set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE,   RLIM_INFINITY);
-       set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
-       set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK,    RLIM_INFINITY);
+       set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC,      RLIM_INFINITY);
+       set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE,   RLIM_INFINITY);
+       set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
+       set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK,    RLIM_INFINITY);
 
 #ifdef CONFIG_VMAP_STACK
        cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
@@ -1026,6 +1023,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        tsk->worker_private = NULL;
 
        kcov_task_init(tsk);
+       kmsan_task_create(tsk);
        kmap_local_fork(tsk);
 
 #ifdef CONFIG_FAULT_INJECTION
@@ -1109,9 +1107,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        struct user_namespace *user_ns)
 {
-       mm->mmap = NULL;
-       mm->mm_rb = RB_ROOT;
-       mm->vmacache_seqnum = 0;
+       mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
+       mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        seqcount_init(&mm->write_protect_seq);
@@ -1152,6 +1149,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
                goto fail_nocontext;
 
        mm->user_ns = get_user_ns(user_ns);
+       lru_gen_init_mm(mm);
        return mm;
 
 fail_nocontext:
@@ -1194,6 +1192,7 @@ static inline void __mmput(struct mm_struct *mm)
        }
        if (mm->binfmt)
                module_put(mm->binfmt->module);
+       lru_gen_del_mm(mm);
        mmdrop(mm);
 }
 
@@ -1285,13 +1284,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
        /* Forbid mm->exe_file change if old file still mapped. */
        old_exe_file = get_mm_exe_file(mm);
        if (old_exe_file) {
+               VMA_ITERATOR(vmi, mm, 0);
                mmap_read_lock(mm);
-               for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) {
+               for_each_vma(vmi, vma) {
                        if (!vma->vm_file)
                                continue;
                        if (path_equal(&vma->vm_file->f_path,
-                                      &old_exe_file->f_path))
+                                      &old_exe_file->f_path)) {
                                ret = -EBUSY;
+                               break;
+                       }
                }
                mmap_read_unlock(mm);
                fput(old_exe_file);
@@ -1421,13 +1423,12 @@ static void complete_vfork_done(struct task_struct *tsk)
 static int wait_for_vfork_done(struct task_struct *child,
                                struct completion *vfork)
 {
+       unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE;
        int killed;
 
-       freezer_do_not_count();
        cgroup_enter_frozen();
-       killed = wait_for_completion_killable(vfork);
+       killed = wait_for_completion_state(vfork, state);
        cgroup_leave_frozen(false);
-       freezer_count();
 
        if (killed) {
                task_lock(child);
@@ -1567,9 +1568,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
        if (!oldmm)
                return 0;
 
-       /* initialize the new vmacache entries */
-       vmacache_flush(tsk);
-
        if (clone_flags & CLONE_VM) {
                mmget(oldmm);
                mm = oldmm;
@@ -1693,6 +1691,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
                return -ENOMEM;
 
        sig->nr_threads = 1;
+       sig->quick_threads = 1;
        atomic_set(&sig->live, 1);
        refcount_set(&sig->sigcnt, 1);
 
@@ -2116,7 +2115,7 @@ static __latent_entropy struct task_struct *copy_process(
                goto bad_fork_free;
 
        retval = -EAGAIN;
-       if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
+       if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                if (p->real_cred->user != INIT_USER &&
                    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                        goto bad_fork_cleanup_count;
@@ -2460,6 +2459,8 @@ static __latent_entropy struct task_struct *copy_process(
                        __this_cpu_inc(process_counts);
                } else {
                        current->signal->nr_threads++;
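+                       /* like nr_threads; lets do_exit() detect the last exiting thread */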
+                       current->signal->quick_threads++;
                        atomic_inc(&current->signal->live);
                        refcount_inc(&current->signal->sigcnt);
                        task_join_group_stop(p);
@@ -2692,6 +2693,14 @@ pid_t kernel_clone(struct kernel_clone_args *args)
                get_task_struct(p);
        }
 
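+       /* a CLONE_VM child shares an mm already on the MGLRU mm_list */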
+       if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+               /* lock the task to synchronize with memcg migration */
+               task_lock(p);
+               lru_gen_add_mm(p->mm);
+               task_unlock(p);
+       }
+
        wake_up_new_task(p);
 
        /* forking complete and child started to run, tell ptracer */