mm/madvise: introduce process_madvise() syscall: an external memory hinting API
[linux-2.6-microblaze.git] / mm / madvise.c
index fd1f448..416a56b 100644 (file)
@@ -17,6 +17,8 @@
 #include <linux/falloc.h>
 #include <linux/fadvise.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/uio.h>
 #include <linux/ksm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
@@ -27,7 +29,6 @@
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
 #include <linux/mmu_notifier.h>
-#include <linux/sched/mm.h>
 
 #include <asm/tlb.h>
 
@@ -258,6 +259,7 @@ static long madvise_willneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
 {
+       struct mm_struct *mm = vma->vm_mm;
        struct file *file = vma->vm_file;
        loff_t offset;
 
@@ -294,10 +296,10 @@ static long madvise_willneed(struct vm_area_struct *vma,
        get_file(file);
        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
-       mmap_read_unlock(current->mm);
+       mmap_read_unlock(mm);
        vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
        fput(file);
-       mmap_read_lock(current->mm);
+       mmap_read_lock(mm);
        return 0;
 }
 
@@ -766,6 +768,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
                                  unsigned long start, unsigned long end,
                                  int behavior)
 {
+       struct mm_struct *mm = vma->vm_mm;
+
        *prev = vma;
        if (!can_madv_lru_vma(vma))
                return -EINVAL;
@@ -773,8 +777,8 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
        if (!userfaultfd_remove(vma, start, end)) {
                *prev = NULL; /* mmap_lock has been dropped, prev is stale */
 
-               mmap_read_lock(current->mm);
-               vma = find_vma(current->mm, start);
+               mmap_read_lock(mm);
+               vma = find_vma(mm, start);
                if (!vma)
                        return -ENOMEM;
                if (start < vma->vm_start) {
@@ -828,6 +832,7 @@ static long madvise_remove(struct vm_area_struct *vma,
        loff_t offset;
        int error;
        struct file *f;
+       struct mm_struct *mm = vma->vm_mm;
 
        *prev = NULL;   /* tell sys_madvise we drop mmap_lock */
 
@@ -855,13 +860,13 @@ static long madvise_remove(struct vm_area_struct *vma,
        get_file(f);
        if (userfaultfd_remove(vma, start, end)) {
                /* mmap_lock was not released by userfaultfd_remove() */
-               mmap_read_unlock(current->mm);
+               mmap_read_unlock(mm);
        }
        error = vfs_fallocate(f,
                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                                offset, end - start);
        fput(f);
-       mmap_read_lock(current->mm);
+       mmap_read_lock(mm);
        return error;
 }
 
@@ -984,6 +989,18 @@ madvise_behavior_valid(int behavior)
        }
 }
 
+static bool
+process_madvise_behavior_valid(int behavior)
+{
+       switch (behavior) {
+       case MADV_COLD:
+       case MADV_PAGEOUT:
+               return true;
+       default:
+               return false;
+       }
+}
+
 /*
  * The madvise(2) system call.
  *
@@ -1031,6 +1048,11 @@ madvise_behavior_valid(int behavior)
  *  MADV_DONTDUMP - the application wants to prevent pages in the given range
  *             from being included in its core dump.
  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
+ *  MADV_COLD - the application is not expected to use this memory soon,
+ *             deactivate pages in this range so that they can be reclaimed
 *             easily if memory pressure happens.
+ *  MADV_PAGEOUT - the application is not expected to use this memory soon,
+ *             page out the pages in this range immediately.
  *
  * return values:
  *  zero    - success
@@ -1045,7 +1067,7 @@ madvise_behavior_valid(int behavior)
  *  -EBADF  - map exists, but area maps something that isn't a file.
  *  -EAGAIN - a kernel resource was temporarily unavailable.
  */
-int do_madvise(unsigned long start, size_t len_in, int behavior)
+int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
 {
        unsigned long end, tmp;
        struct vm_area_struct *vma, *prev;
@@ -1083,10 +1105,10 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
 
        write = madvise_need_mmap_write(behavior);
        if (write) {
-               if (mmap_write_lock_killable(current->mm))
+               if (mmap_write_lock_killable(mm))
                        return -EINTR;
        } else {
-               mmap_read_lock(current->mm);
+               mmap_read_lock(mm);
        }
 
        /*
@@ -1094,7 +1116,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
-       vma = find_vma_prev(current->mm, start, &prev);
+       vma = find_vma_prev(mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;
 
@@ -1131,19 +1153,92 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
                if (prev)
                        vma = prev->vm_next;
                else    /* madvise_remove dropped mmap_lock */
-                       vma = find_vma(current->mm, start);
+                       vma = find_vma(mm, start);
        }
 out:
        blk_finish_plug(&plug);
        if (write)
-               mmap_write_unlock(current->mm);
+               mmap_write_unlock(mm);
        else
-               mmap_read_unlock(current->mm);
+               mmap_read_unlock(mm);
 
        return error;
 }
 
 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
-       return do_madvise(start, len_in, behavior);
+       return do_madvise(current->mm, start, len_in, behavior);
+}
+
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+               size_t, vlen, int, behavior, unsigned int, flags)
+{
+       ssize_t ret;
+       struct iovec iovstack[UIO_FASTIOV], iovec;
+       struct iovec *iov = iovstack;
+       struct iov_iter iter;
+       struct pid *pid;
+       struct task_struct *task;
+       struct mm_struct *mm;
+       size_t total_len;
+       unsigned int f_flags;
+
+       if (flags != 0) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+       if (ret < 0)
+               goto out;
+
+       pid = pidfd_get_pid(pidfd, &f_flags);
+       if (IS_ERR(pid)) {
+               ret = PTR_ERR(pid);
+               goto free_iov;
+       }
+
+       task = get_pid_task(pid, PIDTYPE_PID);
+       if (!task) {
+               ret = -ESRCH;
+               goto put_pid;
+       }
+
+       if (task->mm != current->mm &&
+                       !process_madvise_behavior_valid(behavior)) {
+               ret = -EINVAL;
+               goto release_task;
+       }
+
+       mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
+       if (IS_ERR_OR_NULL(mm)) {
+               ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+               goto release_task;
+       }
+
+       total_len = iov_iter_count(&iter);
+
+       while (iov_iter_count(&iter)) {
+               iovec = iov_iter_iovec(&iter);
+               ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+                                       iovec.iov_len, behavior);
+               if (ret < 0)
+                       break;
+               iov_iter_advance(&iter, iovec.iov_len);
+       }
+
+       if (ret == 0)
+               ret = total_len - iov_iter_count(&iter);
+
+       mmput(mm);
+
+       /* fall through: drop the task/pid references and free iov on success too */
+release_task:
+       put_task_struct(task);
+put_pid:
+       put_pid(pid);
+free_iov:
+       kfree(iov);
+out:
+       return ret;
+}