userfaultfd: move userfaultfd_ctx struct to header file
author	Lokesh Gidra <lokeshgidra@google.com>
Thu, 15 Feb 2024 18:27:53 +0000 (10:27 -0800)
committer	Andrew Morton <akpm@linux-foundation.org>
Thu, 22 Feb 2024 23:27:20 +0000 (15:27 -0800)
Patch series "per-vma locks in userfaultfd", v7.

Performing userfaultfd operations (copy, move, etc.) in the critical
section of mmap_lock (read-mode) causes significant contention on the
lock when operations requiring the lock in write-mode are taking place
concurrently.  We can use per-vma locks instead to significantly reduce
this contention.
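
As a rough illustration (a hedged sketch, not code from this series;
uffd_lock_dst_vma() is a hypothetical helper name), the per-vma locking
pattern tries the lockless per-vma fast path first and falls back to
mmap_lock in read mode only when that fails:

	#include <linux/mm.h>

	/*
	 * Minimal sketch built on the existing per-vma lock primitives
	 * lock_vma_under_rcu()/vma_end_read(); the fallback path keeps
	 * the old mmap_lock read-mode behaviour.
	 */
	static struct vm_area_struct *uffd_lock_dst_vma(struct mm_struct *mm,
							unsigned long addr)
	{
		struct vm_area_struct *vma;

		/* Fast path: per-vma read lock, no mmap_lock involved. */
		vma = lock_vma_under_rcu(mm, addr);
		if (vma)
			return vma;	/* release with vma_end_read() */

		/* Slow path: classic mmap_lock read-mode lookup. */
		mmap_read_lock(mm);
		vma = find_vma(mm, addr);
		if (!vma)
			mmap_read_unlock(mm);
		return vma;		/* release with mmap_read_unlock() */
	}

(In this sketch the caller must remember which path succeeded and
release the matching lock; one could instead take the per-vma lock
under mmap_lock and drop mmap_lock so the release is uniform.)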

Android runtime's Garbage Collector uses userfaultfd for concurrent
compaction.  mmap_lock contention during compaction can cause a jittery
experience for the user.  In one such reproducible scenario, we observed
the following improvements with this patch-set:

- Wall clock time of compaction phase came down from ~3s to <500ms
- Uninterruptible sleep time (across all threads in the process) was
  ~10ms (none in mmap_lock) during compaction, instead of >20s

This patch (of 4):

Move the struct to userfaultfd_k.h so that it is accessible from
mm/userfaultfd.c.  There are no other changes to the struct.

This is required to prepare for using per-vma locks in userfaultfd
operations.
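
As a hedged sketch (not part of this patch; mfill_atomic_copy_sketch()
is a hypothetical name), making the definition visible via
userfaultfd_k.h lets mm/userfaultfd.c take the context directly and
derive mm and mmap_changing from it, instead of receiving them as
separate arguments:

	#include <linux/userfaultfd_k.h>

	static ssize_t mfill_atomic_copy_sketch(struct userfaultfd_ctx *ctx,
						unsigned long dst_start,
						unsigned long src_start,
						unsigned long len)
	{
		struct mm_struct *dst_mm = ctx->mm;	/* was a separate arg */

		/* Bail out while a non-cooperative event changes mappings. */
		if (atomic_read(&ctx->mmap_changing))
			return -EAGAIN;

		/* ... perform the copy against dst_mm ... */
		return len;
	}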

Link: https://lkml.kernel.org/r/20240215182756.3448972-1-lokeshgidra@google.com
Link: https://lkml.kernel.org/r/20240215182756.3448972-2-lokeshgidra@google.com
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tim Murray <timmurray@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/userfaultfd.c
include/linux/userfaultfd_k.h

index 05c8e8a..58331b8 100644
@@ -50,45 +50,6 @@ static struct ctl_table vm_userfaultfd_table[] = {
 
 static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
 
-/*
- * Start with fault_pending_wqh and fault_wqh so they're more likely
- * to be in the same cacheline.
- *
- * Locking order:
- *     fd_wqh.lock
- *             fault_pending_wqh.lock
- *                     fault_wqh.lock
- *             event_wqh.lock
- *
- * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
- * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
- * also taken in IRQ context.
- */
-struct userfaultfd_ctx {
-       /* waitqueue head for the pending (i.e. not read) userfaults */
-       wait_queue_head_t fault_pending_wqh;
-       /* waitqueue head for the userfaults */
-       wait_queue_head_t fault_wqh;
-       /* waitqueue head for the pseudo fd to wakeup poll/read */
-       wait_queue_head_t fd_wqh;
-       /* waitqueue head for events */
-       wait_queue_head_t event_wqh;
-       /* a refile sequence protected by fault_pending_wqh lock */
-       seqcount_spinlock_t refile_seq;
-       /* pseudo fd refcounting */
-       refcount_t refcount;
-       /* userfaultfd syscall flags */
-       unsigned int flags;
-       /* features requested from the userspace */
-       unsigned int features;
-       /* released */
-       bool released;
-       /* memory mappings are changing because of non-cooperative event */
-       atomic_t mmap_changing;
-       /* mm with one or more vmas attached to this userfaultfd_ctx */
-       struct mm_struct *mm;
-};
-
 struct userfaultfd_fork_ctx {
        struct userfaultfd_ctx *orig;
        struct userfaultfd_ctx *new;
index e405654..691d928 100644
 #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
 
+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ *
+ * Locking order:
+ *     fd_wqh.lock
+ *             fault_pending_wqh.lock
+ *                     fault_wqh.lock
+ *             event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
+ */
+struct userfaultfd_ctx {
+       /* waitqueue head for the pending (i.e. not read) userfaults */
+       wait_queue_head_t fault_pending_wqh;
+       /* waitqueue head for the userfaults */
+       wait_queue_head_t fault_wqh;
+       /* waitqueue head for the pseudo fd to wakeup poll/read */
+       wait_queue_head_t fd_wqh;
+       /* waitqueue head for events */
+       wait_queue_head_t event_wqh;
+       /* a refile sequence protected by fault_pending_wqh lock */
+       seqcount_spinlock_t refile_seq;
+       /* pseudo fd refcounting */
+       refcount_t refcount;
+       /* userfaultfd syscall flags */
+       unsigned int flags;
+       /* features requested from the userspace */
+       unsigned int features;
+       /* released */
+       bool released;
+       /* memory mappings are changing because of non-cooperative event */
+       atomic_t mmap_changing;
+       /* mm with one or more vmas attached to this userfaultfd_ctx */
+       struct mm_struct *mm;
+};
+
 extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
 
 /* A combined operation mode + behavior flags. */
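
As a final illustration of the locking-order comment above (a hedged
sketch, not code from this patch; uffd_refile_sketch() is a
hypothetical name), a refile from fault_pending_wqh to fault_wqh takes
the outermost lock with IRQs disabled and nests fault_wqh.lock inside
it, with refile_seq updated under fault_pending_wqh.lock as documented:

	#include <linux/wait.h>
	#include <linux/list.h>
	#include <linux/userfaultfd_k.h>

	static void uffd_refile_sketch(struct userfaultfd_ctx *ctx,
				       wait_queue_entry_t *wq)
	{
		/* Outermost of the two: take it with IRQs disabled. */
		spin_lock_irq(&ctx->fault_pending_wqh.lock);

		/* refile_seq is protected by fault_pending_wqh.lock. */
		write_seqcount_begin(&ctx->refile_seq);

		/* Nested lock; IRQs already off, so plain spin_lock. */
		spin_lock(&ctx->fault_wqh.lock);
		list_del(&wq->entry);
		__add_wait_queue(&ctx->fault_wqh, wq);
		spin_unlock(&ctx->fault_wqh.lock);

		write_seqcount_end(&ctx->refile_seq);
		spin_unlock_irq(&ctx->fault_pending_wqh.lock);
	}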