Merge tag 'fs.rt.v5.18' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner...

[linux-2.6-microblaze.git] / fs / namespace.c
diff --git a/fs/namespace.c b/fs/namespace.c

index 3ab45b4..6e9844b 100644 (file)
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -31,12 +31,13 @@
  #include <uapi/linux/mount.h>
  #include <linux/fs_context.h>
  #include <linux/shmem_fs.h>
+#include <linux/mnt_idmapping.h>
  
  #include "pnode.h"
  #include "internal.h"
  
  /* Maximum number of mounts in a mount namespace */
-unsigned int sysctl_mount_max __read_mostly = 100000;
+static unsigned int sysctl_mount_max __read_mostly = 100000;
  
  static unsigned int m_hash_mask __read_mostly;
  static unsigned int m_hash_shift __read_mostly;
@@ -484,6 +485,24 @@ void mnt_drop_write_file(struct file *file)
  }
  EXPORT_SYMBOL(mnt_drop_write_file);
  
+/**
+ * mnt_hold_writers - prevent write access to the given mount
+ * @mnt: mnt to prevent write access to
+ *
+ * Prevents write access to @mnt if there are no active writers for @mnt.
+ * This function needs to be called and return successfully before changing
+ * properties of @mnt that need to remain stable for callers with write access
+ * to @mnt.
+ *
+ * After this function has been called successfully, callers must pair it with
+ * a call to mnt_unhold_writers() in order to stop preventing write access to
+ * @mnt.
+ *
+ * Context: This function expects lock_mount_hash() to be held serializing
+ *          setting MNT_WRITE_HOLD.
+ * Return: On success 0 is returned.
+ *        On error, -EBUSY is returned.
+ */
  static inline int mnt_hold_writers(struct mount *mnt)
  {
         mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -515,6 +534,18 @@ static inline int mnt_hold_writers(struct mount *mnt)
         return 0;
  }
  
+/**
+ * mnt_unhold_writers - stop preventing write access to the given mount
+ * @mnt: mnt to stop preventing write access to
+ *
+ * Stop preventing write access to @mnt allowing callers to gain write access
+ * to @mnt again.
+ *
+ * This function can only be called after a successful call to
+ * mnt_hold_writers().
+ *
+ * Context: This function expects lock_mount_hash() to be held.
+ */
  static inline void mnt_unhold_writers(struct mount *mnt)
  {
         /*
@@ -548,12 +579,9 @@ int sb_prepare_remount_readonly(struct super_block *sb)
         lock_mount_hash();
         list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                 if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
-                       mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
-                       smp_mb();
-                       if (mnt_get_writers(mnt) > 0) {
-                               err = -EBUSY;
+                       err = mnt_hold_writers(mnt);
+                       if (err)
                                 break;
-                       }
                 }
         }
         if (!err && atomic_long_read(&sb->s_remove_count))
@@ -577,7 +605,7 @@ static void free_vfsmnt(struct mount *mnt)
         struct user_namespace *mnt_userns;
  
         mnt_userns = mnt_user_ns(&mnt->mnt);
-       if (mnt_userns != &init_user_ns)
+       if (!initial_idmapping(mnt_userns))
                 put_user_ns(mnt_userns);
         kfree_const(mnt->mnt_devname);
  #ifdef CONFIG_SMP
@@ -981,6 +1009,7 @@ static struct mount *skip_mnt_tree(struct mount *p)
  struct vfsmount *vfs_create_mount(struct fs_context *fc)
  {
         struct mount *mnt;
+       struct user_namespace *fs_userns;
  
         if (!fc->root)
                 return ERR_PTR(-EINVAL);
@@ -998,6 +1027,10 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
         mnt->mnt_mountpoint     = mnt->mnt.mnt_root;
         mnt->mnt_parent         = mnt;
  
+       fs_userns = mnt->mnt.mnt_sb->s_user_ns;
+       if (!initial_idmapping(fs_userns))
+               mnt->mnt.mnt_userns = get_user_ns(fs_userns);
+
         lock_mount_hash();
         list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
         unlock_mount_hash();
@@ -1088,7 +1121,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
  
         atomic_inc(&sb->s_active);
         mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt);
-       if (mnt->mnt.mnt_userns != &init_user_ns)
+       if (!initial_idmapping(mnt->mnt.mnt_userns))
                 mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns);
         mnt->mnt.mnt_sb = sb;
         mnt->mnt.mnt_root = dget(root);
@@ -2577,6 +2610,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
         struct super_block *sb = mnt->mnt_sb;
  
         if (!__mnt_is_readonly(mnt) &&
+          (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
            (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
                 char *buf = (char *)__get_free_page(GFP_KERNEL);
                 char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM);
@@ -2591,6 +2625,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *
                         tm.tm_year+1900, (unsigned long long)sb->s_time_max);
  
                 free_page((unsigned long)buf);
+               sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
         }
  }
  
@@ -3943,28 +3978,32 @@ static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
  static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
  {
         struct vfsmount *m = &mnt->mnt;
+       struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
  
         if (!kattr->mnt_userns)
                 return 0;
  
+       /*
+        * Creating an idmapped mount with the filesystem wide idmapping
+        * doesn't make sense so block that. We don't allow mushy semantics.
+        */
+       if (kattr->mnt_userns == fs_userns)
+               return -EINVAL;
+
         /*
          * Once a mount has been idmapped we don't allow it to change its
          * mapping. It makes things simpler and callers can just create
          * another bind-mount they can idmap if they want to.
          */
-       if (mnt_user_ns(m) != &init_user_ns)
+       if (is_idmapped_mnt(m))
                 return -EPERM;
  
         /* The underlying filesystem doesn't support idmapped mounts yet. */
         if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
                 return -EINVAL;
  
-       /* Don't yet support filesystem mountable in user namespaces. */
-       if (m->mnt_sb->s_user_ns != &init_user_ns)
-               return -EINVAL;
-
         /* We're not controlling the superblock. */
-       if (!capable(CAP_SYS_ADMIN))
+       if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
                 return -EPERM;
  
         /* Mount has already been visible in the filesystem hierarchy. */
@@ -3974,102 +4013,110 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
         return 0;
  }
  
-static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
-                                          struct mount *mnt, int *err)
+/**
+ * mnt_allow_writers() - check whether the attribute change allows writers
+ * @kattr: the new mount attributes
+ * @mnt: the mount to which @kattr will be applied
+ *
+ * Check whether the new mount attributes in @kattr allow concurrent writers.
+ *
+ * Return: true if writers are allowed, false if they need to be held
+ */
+static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
+                                    const struct mount *mnt)
  {
-       struct mount *m = mnt, *last = NULL;
+       return !(kattr->attr_set & MNT_READONLY) ||
+              (mnt->mnt.mnt_flags & MNT_READONLY);
+}
  
-       if (!is_mounted(&m->mnt)) {
-               *err = -EINVAL;
-               goto out;
-       }
+static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
+{
+       struct mount *m;
+       int err;
  
-       if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) {
-               *err = -EINVAL;
-               goto out;
-       }
+       for (m = mnt; m; m = next_mnt(m, mnt)) {
+               if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
+                       err = -EPERM;
+                       break;
+               }
  
-       do {
-               unsigned int flags;
+               err = can_idmap_mount(kattr, m);
+               if (err)
+                       break;
  
-               flags = recalc_flags(kattr, m);
-               if (!can_change_locked_flags(m, flags)) {
-                       *err = -EPERM;
-                       goto out;
+               if (!mnt_allow_writers(kattr, m)) {
+                       err = mnt_hold_writers(m);
+                       if (err)
+                               break;
                 }
  
-               *err = can_idmap_mount(kattr, m);
-               if (*err)
-                       goto out;
+               if (!kattr->recurse)
+                       return 0;
+       }
  
-               last = m;
+       if (err) {
+               struct mount *p;
  
-               if ((kattr->attr_set & MNT_READONLY) &&
-                   !(m->mnt.mnt_flags & MNT_READONLY)) {
-                       *err = mnt_hold_writers(m);
-                       if (*err)
-                               goto out;
+               for (p = mnt; p != m; p = next_mnt(p, mnt)) {
+                       /* If we had to hold writers unblock them. */
+                       if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
+                               mnt_unhold_writers(p);
                 }
-       } while (kattr->recurse && (m = next_mnt(m, mnt)));
-
-out:
-       return last;
+       }
+       return err;
  }
  
  static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
  {
-       struct user_namespace *mnt_userns;
+       struct user_namespace *mnt_userns, *old_mnt_userns;
  
         if (!kattr->mnt_userns)
                 return;
  
+       /*
+        * We're the only ones able to change the mount's idmapping. So
+        * mnt->mnt.mnt_userns is stable and we can retrieve it directly.
+        */
+       old_mnt_userns = mnt->mnt.mnt_userns;
+
         mnt_userns = get_user_ns(kattr->mnt_userns);
         /* Pairs with smp_load_acquire() in mnt_user_ns(). */
         smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
+
+       /*
+        * If this is an idmapped filesystem drop the reference we've taken
+        * in vfs_create_mount() before.
+        */
+       if (!initial_idmapping(old_mnt_userns))
+               put_user_ns(old_mnt_userns);
  }
  
-static void mount_setattr_commit(struct mount_kattr *kattr,
-                                struct mount *mnt, struct mount *last,
-                                int err)
+static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
  {
-       struct mount *m = mnt;
+       struct mount *m;
  
-       do {
-               if (!err) {
-                       unsigned int flags;
+       for (m = mnt; m; m = next_mnt(m, mnt)) {
+               unsigned int flags;
  
-                       do_idmap_mount(kattr, m);
-                       flags = recalc_flags(kattr, m);
-                       WRITE_ONCE(m->mnt.mnt_flags, flags);
-               }
+               do_idmap_mount(kattr, m);
+               flags = recalc_flags(kattr, m);
+               WRITE_ONCE(m->mnt.mnt_flags, flags);
  
-               /*
-                * We either set MNT_READONLY above so make it visible
-                * before ~MNT_WRITE_HOLD or we failed to recursively
-                * apply mount options.
-                */
-               if ((kattr->attr_set & MNT_READONLY) &&
-                   (m->mnt.mnt_flags & MNT_WRITE_HOLD))
+               /* If we had to hold writers unblock them. */
+               if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
                         mnt_unhold_writers(m);
  
-               if (!err && kattr->propagation)
+               if (kattr->propagation)
                         change_mnt_propagation(m, kattr->propagation);
-
-               /*
-                * On failure, only cleanup until we found the first mount
-                * we failed to handle.
-                */
-               if (err && m == last)
+               if (!kattr->recurse)
                         break;
-       } while (kattr->recurse && (m = next_mnt(m, mnt)));
-
-       if (!err)
-               touch_mnt_namespace(mnt->mnt_ns);
+       }
+       touch_mnt_namespace(mnt->mnt_ns);
  }
  
  static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
  {
-       struct mount *mnt = real_mount(path->mnt), *last = NULL;
+       struct mount *mnt = real_mount(path->mnt);
         int err = 0;
  
         if (path->dentry != mnt->mnt.mnt_root)
@@ -4090,16 +4137,32 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
                 }
         }
  
+       err = -EINVAL;
         lock_mount_hash();
  
+       /* Ensure that this isn't anything purely vfs internal. */
+       if (!is_mounted(&mnt->mnt))
+               goto out;
+
         /*
-        * Get the mount tree in a shape where we can change mount
-        * properties without failure.
+        * If this is an attached mount make sure it's located in the caller's
+        * mount namespace. If it's not, don't let the caller interact with it.
+        * If this is a detached mount make sure it has an anonymous mount
+        * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
          */
-       last = mount_setattr_prepare(kattr, mnt, &err);
-       if (last) /* Commit all changes or revert to the old state. */
-               mount_setattr_commit(kattr, mnt, last, err);
+       if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
+               goto out;
  
+       /*
+        * First, we get the mount tree in a shape where we can change mount
+        * properties without failure. If we succeed in doing so, we commit all
+        * changes; if we fail, we clean up.
+        */
+       err = mount_setattr_prepare(kattr, mnt);
+       if (!err)
+               mount_setattr_commit(kattr, mnt);
+
+out:
         unlock_mount_hash();
  
         if (kattr->propagation) {
@@ -4149,13 +4212,15 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
         }
  
         /*
-        * The init_user_ns is used to indicate that a vfsmount is not idmapped.
-        * This is simpler than just having to treat NULL as unmapped. Users
-        * wanting to idmap a mount to init_user_ns can just use a namespace
-        * with an identity mapping.
+        * The initial idmapping cannot be used to create an idmapped
+        * mount. We use the initial idmapping as an indicator of a mount
+        * that is not idmapped. It can simply be passed into helpers that
+        * are aware of idmapped mounts as a convenient shortcut. A user
+        * can just create a dedicated identity mapping to achieve the same
+        * result.
          */
         mnt_userns = container_of(ns, struct user_namespace, ns);
-       if (mnt_userns == &init_user_ns) {
+       if (initial_idmapping(mnt_userns)) {
                 err = -EPERM;
                 goto out_fput;
         }
@@ -4279,12 +4344,11 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
                 return err;
  
         err = user_path_at(dfd, path, kattr.lookup_flags, &target);
-       if (err)
-               return err;
-
-       err = do_mount_setattr(&target, &kattr);
+       if (!err) {
+               err = do_mount_setattr(&target, &kattr);
+               path_put(&target);
+       }
         finish_mount_kattr(&kattr);
-       path_put(&target);
         return err;
  }
  
@@ -4612,3 +4676,25 @@ const struct proc_ns_operations mntns_operations = {
         .install        = mntns_install,
         .owner          = mntns_owner,
  };
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table fs_namespace_sysctls[] = {
+       {
+               .procname       = "mount-max",
+               .data           = &sysctl_mount_max,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ONE,
+       },
+       { }
+};
+
+static int __init init_fs_namespace_sysctls(void)
+{
+       register_sysctl_init("fs", fs_namespace_sysctls);
+       return 0;
+}
+fs_initcall(init_fs_namespace_sysctls);
+
+#endif /* CONFIG_SYSCTL */