fs/epoll: use a per-cpu counter for user's watches count
author Nicholas Piggin <npiggin@gmail.com>
Wed, 8 Sep 2021 03:00:00 +0000 (20:00 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 8 Sep 2021 18:50:27 +0000 (11:50 -0700)
This counter tracks the number of watches a user has, to compare against
the 'max_user_watches' limit. It becomes a scalability bottleneck on
SPECjbb2015 on large systems: there is only one user, so every CPU
contends on the same atomic counter. Changing to a per-cpu counter
increases throughput of the benchmark by about 30% on a 16-socket,
> 1000 thread system.
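
To illustrate the idea, here is a minimal user-space sketch, not the
kernel's percpu_counter implementation: each CPU increments its own
cache-line-padded slot, and the limit check sums the slots, so
concurrent updaters no longer contend on one cache line. The names
(NR_SLOTS, watch_inc, watch_dec, watch_sum) and the fixed slot count
are assumptions made for this example only.

#include <stdatomic.h>
#include <stdio.h>

#define NR_SLOTS 64			/* assumption: one slot per CPU */

struct slot {
	atomic_long count;
	char pad[64 - sizeof(atomic_long)];	/* avoid false sharing */
};

static struct slot watches[NR_SLOTS];

/* Fast path: touch only this CPU's slot, like percpu_counter_inc(). */
static void watch_inc(int cpu)
{
	atomic_fetch_add_explicit(&watches[cpu % NR_SLOTS].count, 1,
				  memory_order_relaxed);
}

static void watch_dec(int cpu)
{
	atomic_fetch_sub_explicit(&watches[cpu % NR_SLOTS].count, 1,
				  memory_order_relaxed);
}

/* Slow path: a limit check has to sum every slot. */
static long watch_sum(void)
{
	long sum = 0;

	for (int i = 0; i < NR_SLOTS; i++)
		sum += atomic_load_explicit(&watches[i].count,
					    memory_order_relaxed);
	return sum;
}

int main(void)
{
	watch_inc(0);
	watch_inc(3);
	watch_dec(3);
	printf("watches: %ld\n", watch_sum());	/* prints "watches: 1" */
	return 0;
}

The kernel's percpu_counter additionally batches per-cpu deltas into a
shared sum, and percpu_counter_compare() only performs the expensive
precise sum when the approximate value is within the maximum error of
the limit, so the check in ep_insert() stays cheap in the common case.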

[rdunlap@infradead.org: fix build errors in kernel/user.c when CONFIG_EPOLL=n]
[npiggin@gmail.com: move ifdefs into wrapper functions, slightly improve panic message]
Link: https://lkml.kernel.org/r/1628051945.fens3r99ox.astroid@bobo.none
[akpm@linux-foundation.org: tweak user_epoll_alloc(), per Guenter]
Link: https://lkml.kernel.org/r/20210804191421.GA1900577@roeck-us.net
Link: https://lkml.kernel.org/r/20210802032013.2751916-1-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reported-by: Anton Blanchard <anton@ozlabs.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/eventpoll.c
include/linux/sched/user.h
kernel/user.c

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1e596e1..648ed77 100644
@@ -723,7 +723,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
         */
        call_rcu(&epi->rcu, epi_rcu_free);
 
-       atomic_long_dec(&ep->user->epoll_watches);
+       percpu_counter_dec(&ep->user->epoll_watches);
 
        return 0;
 }
@@ -1439,7 +1439,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 {
        int error, pwake = 0;
        __poll_t revents;
-       long user_watches;
        struct epitem *epi;
        struct ep_pqueue epq;
        struct eventpoll *tep = NULL;
@@ -1449,11 +1448,15 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
 
        lockdep_assert_irqs_enabled();
 
-       user_watches = atomic_long_read(&ep->user->epoll_watches);
-       if (unlikely(user_watches >= max_user_watches))
+       if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
+                                           max_user_watches) >= 0))
                return -ENOSPC;
-       if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL)))
+       percpu_counter_inc(&ep->user->epoll_watches);
+
+       if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
+               percpu_counter_dec(&ep->user->epoll_watches);
                return -ENOMEM;
+       }
 
        /* Item initialization follow here ... */
        INIT_LIST_HEAD(&epi->rdllink);
@@ -1466,17 +1469,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
                mutex_lock_nested(&tep->mtx, 1);
        /* Add the current item to the list of active epoll hook for this file */
        if (unlikely(attach_epitem(tfile, epi) < 0)) {
-               kmem_cache_free(epi_cache, epi);
                if (tep)
                        mutex_unlock(&tep->mtx);
+               kmem_cache_free(epi_cache, epi);
+               percpu_counter_dec(&ep->user->epoll_watches);
                return -ENOMEM;
        }
 
        if (full_check && !tep)
                list_file(tfile);
 
-       atomic_long_inc(&ep->user->epoll_watches);
-
        /*
         * Add the current item to the RB tree. All RB tree operations are
         * protected by "mtx", and ep_insert() is called with "mtx" held.
diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 2462f7d..00ed419 100644
@@ -4,6 +4,7 @@
 
 #include <linux/uidgid.h>
 #include <linux/atomic.h>
+#include <linux/percpu_counter.h>
 #include <linux/refcount.h>
 #include <linux/ratelimit.h>
 
@@ -13,7 +14,7 @@
 struct user_struct {
        refcount_t __count;     /* reference count */
 #ifdef CONFIG_EPOLL
-       atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
+       struct percpu_counter epoll_watches; /* The number of file descriptors currently watched */
 #endif
        unsigned long unix_inflight;    /* How many files in flight in unix sockets */
        atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */
diff --git a/kernel/user.c b/kernel/user.c
index c82399c..e2cf8c2 100644
@@ -129,6 +129,22 @@ static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
        return NULL;
 }
 
+static int user_epoll_alloc(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+       return percpu_counter_init(&up->epoll_watches, 0, GFP_KERNEL);
+#else
+       return 0;
+#endif
+}
+
+static void user_epoll_free(struct user_struct *up)
+{
+#ifdef CONFIG_EPOLL
+       percpu_counter_destroy(&up->epoll_watches);
+#endif
+}
+
 /* IRQs are disabled and uidhash_lock is held upon function entry.
  * IRQ state (as stored in flags) is restored and uidhash_lock released
  * upon function exit.
@@ -138,6 +154,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
 {
        uid_hash_remove(up);
        spin_unlock_irqrestore(&uidhash_lock, flags);
+       user_epoll_free(up);
        kmem_cache_free(uid_cachep, up);
 }
 
@@ -185,6 +202,10 @@ struct user_struct *alloc_uid(kuid_t uid)
 
                new->uid = uid;
                refcount_set(&new->__count, 1);
+               if (user_epoll_alloc(new)) {
+                       kmem_cache_free(uid_cachep, new);
+                       return NULL;
+               }
                ratelimit_state_init(&new->ratelimit, HZ, 100);
                ratelimit_set_flags(&new->ratelimit, RATELIMIT_MSG_ON_RELEASE);
 
@@ -195,6 +216,7 @@ struct user_struct *alloc_uid(kuid_t uid)
                spin_lock_irq(&uidhash_lock);
                up = uid_hash_find(uid, hashent);
                if (up) {
+                       user_epoll_free(new);
                        kmem_cache_free(uid_cachep, new);
                } else {
                        uid_hash_insert(new, hashent);
@@ -216,6 +238,9 @@ static int __init uid_cache_init(void)
        for(n = 0; n < UIDHASH_SZ; ++n)
                INIT_HLIST_HEAD(uidhash_table + n);
 
+       if (user_epoll_alloc(&root_user))
+               panic("root_user epoll percpu counter alloc failed");
+
        /* Insert the root user immediately (init already runs as root) */
        spin_lock_irq(&uidhash_lock);
        uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));