fanotify: support limited functionality for unprivileged users

[linux-2.6-microblaze.git] / fs / notify / fanotify / fanotify_user.c
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c

index 9e0c1af..65142b1 100644 (file)
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -27,8 +27,61 @@
  #include "fanotify.h"
  
  #define FANOTIFY_DEFAULT_MAX_EVENTS    16384
-#define FANOTIFY_DEFAULT_MAX_MARKS     8192
-#define FANOTIFY_DEFAULT_MAX_LISTENERS 128
+#define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192
+#define FANOTIFY_DEFAULT_MAX_GROUPS    128
+
+/*
+ * The legacy fanotify marks limit (8192) is per group; we introduced a tunable
+ * limit of marks per user, similar to inotify.  Effectively, the legacy limit
+ * of fanotify marks per user is <max marks per group> * <max groups per user>.
+ * This default limit (1M) also happens to match the increased limit of inotify
+ * max_user_watches since v5.10.
+ */
+#define FANOTIFY_DEFAULT_MAX_USER_MARKS        \
+       (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)
+
+/*
+ * Most of the memory cost of adding an inode mark is pinning the marked inode.
+ * The size of the filesystem inode struct is not uniform across filesystems,
+ * so double the size of a VFS inode is used as a conservative approximation.
+ */
+#define INODE_MARK_COST        (2 * sizeof(struct inode))
+
+/* configurable via /proc/sys/fs/fanotify/ */
+static int fanotify_max_queued_events __read_mostly;
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+struct ctl_table fanotify_table[] = {
+       {       /* limit on fanotify groups, stored in init_user_ns ucount_max */
+               .procname       = "max_user_groups",
+               .data   = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO,  /* minimum accepted value */
+       },
+       {       /* limit on fanotify marks, stored in init_user_ns ucount_max */
+               .procname       = "max_user_marks",
+               .data   = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO,  /* minimum accepted value */
+       },
+       {       /* backs fanotify_max_queued_events, read by fanotify_init() */
+               .procname       = "max_queued_events",
+               .data           = &fanotify_max_queued_events,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO   /* minimum accepted value */
+       },
+       { }     /* empty entry marks the end of the table */
+};
+#endif /* CONFIG_SYSCTL */
  
  /*
   * All flags that may be specified in parameter event_f_flags of fanotify_init.
@@ -89,6 +142,23 @@ static int fanotify_event_info_len(unsigned int fid_mode,
         return info_len;
  }
  
+/*
+ * Remove a hashed event from the merge hash table (notification_lock held).
+ */
+static void fanotify_unhash_event(struct fsnotify_group *group,
+                                 struct fanotify_event *event)
+{
+       assert_spin_locked(&group->notification_lock);
+
+       pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
+                group, event, fanotify_event_hash_bucket(group, event));
+
+       if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
+               return; /* already unhashed: warn once and leave it alone */
+
+       hlist_del_init(&event->merge_list);
+}
+
  /*
   * Get an fanotify notification event if one exists and is small
   * enough to fit in "count". Return an error pointer if the count
@@ -100,26 +170,34 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
  {
         size_t event_size = FAN_EVENT_METADATA_LEN;
         struct fanotify_event *event = NULL;
+       struct fsnotify_event *fsn_event;
         unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
  
         pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
  
         spin_lock(&group->notification_lock);
-       if (fsnotify_notify_queue_is_empty(group))
+       fsn_event = fsnotify_peek_first_event(group);
+       if (!fsn_event)
                 goto out;
  
-       if (fid_mode) {
-               event_size += fanotify_event_info_len(fid_mode,
-                       FANOTIFY_E(fsnotify_peek_first_event(group)));
-       }
+       event = FANOTIFY_E(fsn_event);
+       if (fid_mode)
+               event_size += fanotify_event_info_len(fid_mode, event);
  
         if (event_size > count) {
                 event = ERR_PTR(-EINVAL);
                 goto out;
         }
-       event = FANOTIFY_E(fsnotify_remove_first_event(group));
+
+       /*
+        * Held the notification_lock the whole time, so this is the
+        * same event we peeked above.
+        */
+       fsnotify_remove_first_event(group);
         if (fanotify_is_perm_event(event->mask))
                 FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
+       if (fanotify_is_hashed_event(event->mask))
+               fanotify_unhash_event(group, event);
  out:
         spin_unlock(&group->notification_lock);
         return event;
@@ -341,6 +419,14 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
         metadata.reserved = 0;
         metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
         metadata.pid = pid_vnr(event->pid);
+       /*
+        * For an unprivileged listener, event->pid can be used to identify the
+        * events generated by the listener process itself, without disclosing
+        * the pids of other processes.
+        */
+       if (!capable(CAP_SYS_ADMIN) &&
+           task_tgid(current) != event->pid)
+               metadata.pid = 0;
  
         if (path && path->mnt && path->dentry) {
                 fd = create_fd(group, path, &f);
@@ -573,6 +659,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
  static int fanotify_release(struct inode *ignored, struct file *file)
  {
         struct fsnotify_group *group = file->private_data;
+       struct fsnotify_event *fsn_event;
  
         /*
          * Stop new events from arriving in the notification queue. since
@@ -601,13 +688,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
          * dequeue them and set the response. They will be freed once the
          * response is consumed and fanotify_get_response() returns.
          */
-       while (!fsnotify_notify_queue_is_empty(group)) {
-               struct fanotify_event *event;
+       while ((fsn_event = fsnotify_remove_first_event(group))) {
+               struct fanotify_event *event = FANOTIFY_E(fsn_event);
  
-               event = FANOTIFY_E(fsnotify_remove_first_event(group));
                 if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
                         spin_unlock(&group->notification_lock);
-                       fsnotify_destroy_event(group, &event->fse);
+                       fsnotify_destroy_event(group, fsn_event);
                 } else {
                         finish_permission_event(group, FANOTIFY_PERM(event),
                                                 FAN_ALLOW);
@@ -822,24 +908,38 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
                                                    unsigned int type,
                                                    __kernel_fsid_t *fsid)
  {
+       struct ucounts *ucounts = group->fanotify_data.ucounts;
         struct fsnotify_mark *mark;
         int ret;
  
-       if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
+       /*
+        * Enforce per-user marks limits in all containing user namespaces.
+        * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
+        * in the limited groups account.
+        */
+       if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
+           !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
                 return ERR_PTR(-ENOSPC);
  
         mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
-       if (!mark)
-               return ERR_PTR(-ENOMEM);
+       if (!mark) {
+               ret = -ENOMEM;
+               goto out_dec_ucounts;
+       }
  
         fsnotify_init_mark(mark, group);
         ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
         if (ret) {
                 fsnotify_put_mark(mark);
-               return ERR_PTR(ret);
+               goto out_dec_ucounts;
         }
  
         return mark;
+
+out_dec_ucounts:
+       if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
+               dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
+       return ERR_PTR(ret);
  }
  
  
@@ -919,20 +1019,41 @@ static struct fsnotify_event *fanotify_alloc_overflow_event(void)
         return &oevent->fse;
  }
  
+static struct hlist_head *fanotify_alloc_merge_hash(void)
+{
+       struct hlist_head *hash;        /* bucket array handed back to caller */
+
+       hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
+                      GFP_KERNEL_ACCOUNT);     /* 2^BITS bucket heads */
+       if (!hash)
+               return NULL;    /* allocation failed */
+
+       __hash_init(hash, FANOTIFY_HTABLE_SIZE); /* NOTE(review): assumes SIZE == 1 << BITS */
+
+       return hash;
+}
+
  /* fanotify syscalls */
  SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
  {
         struct fsnotify_group *group;
         int f_flags, fd;
-       struct user_struct *user;
         unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
         unsigned int class = flags & FANOTIFY_CLASS_BITS;
  
         pr_debug("%s: flags=%x event_f_flags=%x\n",
                  __func__, flags, event_f_flags);
  
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
+       if (!capable(CAP_SYS_ADMIN)) {
+               /*
+                * An unprivileged user can set up an fanotify group with
+                * limited functionality - an unprivileged group is limited to
+                * notification events with file handles and it cannot use
+                * unlimited queue/marks.
+                */
+               if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
+                       return -EPERM;
+       }
  
  #ifdef CONFIG_AUDITSYSCALL
         if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
@@ -963,12 +1084,6 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
         if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
                 return -EINVAL;
  
-       user = get_current_user();
-       if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
-               free_uid(user);
-               return -EMFILE;
-       }
-
         f_flags = O_RDWR | FMODE_NONOTIFY;
         if (flags & FAN_CLOEXEC)
                 f_flags |= O_CLOEXEC;
@@ -978,15 +1093,27 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
         /* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
         group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops);
         if (IS_ERR(group)) {
-               free_uid(user);
                 return PTR_ERR(group);
         }
  
-       group->fanotify_data.user = user;
+       /* Enforce groups limits per user in all containing user ns */
+       group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
+                                                 current_euid(),
+                                                 UCOUNT_FANOTIFY_GROUPS);
+       if (!group->fanotify_data.ucounts) {
+               fd = -EMFILE;
+               goto out_destroy_group;
+       }
+
         group->fanotify_data.flags = flags;
-       atomic_inc(&user->fanotify_listeners);
         group->memcg = get_mem_cgroup_from_mm(current->mm);
  
+       group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
+       if (!group->fanotify_data.merge_hash) {
+               fd = -ENOMEM;
+               goto out_destroy_group;
+       }
+
         group->overflow_event = fanotify_alloc_overflow_event();
         if (unlikely(!group->overflow_event)) {
                 fd = -ENOMEM;
@@ -1019,16 +1146,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
                         goto out_destroy_group;
                 group->max_events = UINT_MAX;
         } else {
-               group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+               group->max_events = fanotify_max_queued_events;
         }
  
         if (flags & FAN_UNLIMITED_MARKS) {
                 fd = -EPERM;
                 if (!capable(CAP_SYS_ADMIN))
                         goto out_destroy_group;
-               group->fanotify_data.max_marks = UINT_MAX;
-       } else {
-               group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
         }
  
         if (flags & FAN_ENABLE_AUDIT) {
@@ -1180,6 +1304,15 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
                 goto fput_and_out;
         group = f.file->private_data;
  
+       /*
+        * An unprivileged user is not allowed to watch a mount point nor
+        * a filesystem.
+        */
+       ret = -EPERM;
+       if (!capable(CAP_SYS_ADMIN) &&
+           mark_type != FAN_MARK_INODE)
+               goto fput_and_out;
+
         /*
          * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
          * allowed to set permissions events.
@@ -1312,6 +1445,21 @@ SYSCALL32_DEFINE6(fanotify_mark,
   */
  static int __init fanotify_user_setup(void)
  {
+       struct sysinfo si;
+       int max_marks;
+
+       si_meminfo(&si);
+       /*
+        * Allow up to 1% of addressable memory to be accounted for per-user
+        * marks, clamped to the range [8192, 1048576]. Mount and sb marks are
+        * a lot cheaper than inode marks, but there is no reason for a user
+        * to have many of those, so calculate by the cost of inode marks.
+        */
+       max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
+                   INODE_MARK_COST;
+       max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
+                                    FANOTIFY_DEFAULT_MAX_USER_MARKS);
+
         BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
         BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
  
@@ -1326,6 +1474,11 @@ static int __init fanotify_user_setup(void)
                         KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
         }
  
+       fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
+       init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
+                                       FANOTIFY_DEFAULT_MAX_GROUPS;
+       init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
+
         return 0;
  }
  device_initcall(fanotify_user_setup);