mm/madvise: introduce process_madvise() syscall: an external memory hinting API
[linux-2.6-microblaze.git] / mm / madvise.c
index d550ef0..416a56b 100644 (file)
@@ -17,6 +17,8 @@
 #include <linux/falloc.h>
 #include <linux/fadvise.h>
 #include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/uio.h>
 #include <linux/ksm.h>
 #include <linux/fs.h>
 #include <linux/file.h>
@@ -27,7 +29,6 @@
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
 #include <linux/mmu_notifier.h>
-#include <linux/sched/mm.h>
 
 #include <asm/tlb.h>
 
@@ -988,6 +989,18 @@ madvise_behavior_valid(int behavior)
        }
 }
 
+/*
+ * Tell whether @behavior is a hint process_madvise() may apply to an address
+ * space other than the caller's: true only for MADV_COLD and MADV_PAGEOUT.
+ */
+static bool
+process_madvise_behavior_valid(int behavior)
+{
+       if (behavior == MADV_COLD || behavior == MADV_PAGEOUT)
+               return true;
+       return false;
+}
+
 /*
  * The madvise(2) system call.
  *
@@ -1035,6 +1048,11 @@ madvise_behavior_valid(int behavior)
  *  MADV_DONTDUMP - the application wants to prevent pages in the given range
  *             from being included in its core dump.
  *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
+ *  MADV_COLD - the application is not expected to use this memory soon,
+ *             deactivate pages in this range so that they can be reclaimed
+ *             easily if memory pressure happens.
+ *  MADV_PAGEOUT - the application is not expected to use this memory soon,
+ *             page out the pages in this range immediately.
  *
  * return values:
  *  zero    - success
@@ -1151,3 +1169,76 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
        return do_madvise(current->mm, start, len_in, behavior);
 }
+
+/*
+ * The process_madvise(2) system call.
+ *
+ * Applies the madvise hint @behavior, via do_madvise(), to every address
+ * range listed in the user-supplied iovec array, operating on the address
+ * space of the process that @pidfd refers to.
+ *
+ *  pidfd    - PID file descriptor identifying the target process.
+ *  vec      - user pointer to an array of iovecs; each entry's iov_base and
+ *             iov_len are handed to do_madvise() as start address and length.
+ *  vlen     - number of entries in @vec.
+ *  behavior - madvise hint; if the target's mm is not the caller's mm, only
+ *             hints accepted by process_madvise_behavior_valid() are allowed.
+ *  flags    - must be 0.
+ *
+ * return values:
+ *  >= 0    - success; once every range has been advised this is the total
+ *            byte count described by @vec.
+ *  -EINVAL - @flags is non-zero, or @behavior is not allowed on a foreign mm.
+ *  -ESRCH  - no task could be found for @pidfd, or mm_access() returned NULL.
+ *  other   - negative errno propagated from import_iovec(), pidfd_get_pid(),
+ *            mm_access() or do_madvise().
+ */
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+               size_t, vlen, int, behavior, unsigned int, flags)
+{
+       ssize_t ret;
+       struct iovec iovstack[UIO_FASTIOV], iovec;
+       struct iovec *iov = iovstack;
+       struct iov_iter iter;
+       struct pid *pid;
+       struct task_struct *task;
+       struct mm_struct *mm;
+       size_t total_len;
+       unsigned int f_flags;
+
+       if (flags != 0) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
+       if (ret < 0)
+               goto out;
+
+       pid = pidfd_get_pid(pidfd, &f_flags);
+       if (IS_ERR(pid)) {
+               ret = PTR_ERR(pid);
+               goto free_iov;
+       }
+
+       task = get_pid_task(pid, PIDTYPE_PID);
+       if (!task) {
+               ret = -ESRCH;
+               goto put_pid;
+       }
+
+       /* Restrict the set of hints only when advising a foreign mm. */
+       if (task->mm != current->mm &&
+                       !process_madvise_behavior_valid(behavior)) {
+               ret = -EINVAL;
+               goto release_task;
+       }
+
+       mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
+       if (IS_ERR_OR_NULL(mm)) {
+               ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
+               goto release_task;
+       }
+
+       total_len = iov_iter_count(&iter);
+
+       /* Advise one iovec segment at a time; stop at the first failure. */
+       while (iov_iter_count(&iter)) {
+               iovec = iov_iter_iovec(&iter);
+               ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+                                       iovec.iov_len, behavior);
+               if (ret < 0)
+                       break;
+               iov_iter_advance(&iter, iovec.iov_len);
+       }
+
+       /* On success report how many bytes of the iovec were consumed. */
+       if (ret == 0)
+               ret = total_len - iov_iter_count(&iter);
+
+       mmput(mm);
+       /*
+        * Deliberately fall through into the unwind labels: the task and pid
+        * references and the imported iovec must be released on the success
+        * path too, otherwise every successful call leaks them.
+        */
+release_task:
+       put_task_struct(task);
+put_pid:
+       put_pid(pid);
+free_iov:
+       /* NOTE(review): relies on import_iovec() leaving iov NULL when iovstack sufficed. */
+       kfree(iov);
+out:
+       return ret;
+}