Merge tag 'vfio-v6.0-rc1pt2' of https://github.com/awilliam/linux-vfio

[linux-2.6-microblaze.git] / fs / eventpoll.c
diff --git a/fs/eventpoll.c b/fs/eventpoll.c

index 06f4c5a..8b56b94 100644 (file)
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -307,7 +307,7 @@ static void unlist_file(struct epitems_head *head)
  static long long_zero;
  static long long_max = LONG_MAX;
  
-struct ctl_table epoll_table[] = {
+static struct ctl_table epoll_table[] = {
         {
                 .procname       = "max_user_watches",
                 .data           = &max_user_watches,
@@ -319,6 +319,13 @@ struct ctl_table epoll_table[] = {
         },
         { }
  };
+
+static void __init epoll_sysctls_init(void)
+{
+       register_sysctl("fs/epoll", epoll_table); /* e.g. fs/epoll/max_user_watches */
+}
+#else
+#define epoll_sysctls_init() do { } while (0)
  #endif /* CONFIG_SYSCTL */
  
  static const struct file_operations eventpoll_fops;
@@ -1740,6 +1747,21 @@ static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
         return to;
  }
  
+/*
+ * autoremove_wake_function, but remove even on failure to wake up, because
+ * default_wake_function/ttwu only fails if the thread is already woken, and
+ * then ep_poll removes the entry anyway instead of reusing it. Unlink with
+ * list_del_init_careful() to pair with lockless list_empty_careful() checks.
+ */
+static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
+                                      unsigned int mode, int sync, void *key)
+{
+       int ret = default_wake_function(wq_entry, mode, sync, key);
+
+       list_del_init_careful(&wq_entry->entry);
+       return ret;
+}
+
  /**
   * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
   *           event buffer.
@@ -1821,8 +1843,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                  * normal wakeup path no need to call __remove_wait_queue()
                  * explicitly, thus ep->lock is not taken, which halts the
                  * event delivery.
+                *
+                * In fact, we now use an even more aggressive function that
+                * unconditionally removes, because we don't reuse the wait
+                * entry between loop iterations. This lets us also avoid the
+                * performance issue if a process is killed, causing all of its
+                * threads to wake up without being removed normally.
                  */
                 init_wait(&wait);
+               wait.func = ep_autoremove_wake_function;
  
                 write_lock_irq(&ep->lock);
                 /*
@@ -2378,6 +2407,7 @@ static int __init eventpoll_init(void)
         /* Allocates slab cache used to allocate "struct eppoll_entry" */
         pwq_cache = kmem_cache_create("eventpoll_pwq",
                 sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
+       epoll_sysctls_init();
  
         ephead_cache = kmem_cache_create("ep_head",
                 sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);