Merge tag 'fs.move_mount.move_mount_set_group.v5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
author: Linus Torvalds <torvalds@linux-foundation.org>
        Tue, 31 Aug 2021 18:54:02 +0000 (11:54 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
        Tue, 31 Aug 2021 18:54:02 +0000 (11:54 -0700)
Pull move_mount updates from Christian Brauner:
 "This contains an extension to the move_mount() syscall making it
  possible to add a single private mount into an existing propagation
  tree.

  The use-case comes from the criu folks which have been struggling with
  restoring complex mount trees for a long time. Variations of this work
  have been discussed at Plumbers before, e.g.

      https://www.linuxplumbersconf.org/event/7/contributions/640/

  The extension to move_mount() enables criu to restore any set of mount
  namespaces, mount trees and sharing group trees without introducing
  yet more complexity into mount propagation itself.

  The changes required to criu to make use of this and restore complex
  propagation trees are available at

      https://github.com/Snorch/criu/commits/mount-v2-poc

  A cleaned-up version of this will go up for merging into the main criu
  repo after this lands"

* tag 'fs.move_mount.move_mount_set_group.v5.15' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  tests: add move_mount(MOVE_MOUNT_SET_GROUP) selftest
  move_mount: allow to add a mount into an existing group

fs/namespace.c
include/uapi/linux/mount.h
tools/testing/selftests/Makefile
tools/testing/selftests/move_mount_set_group/.gitignore [new file with mode: 0644]
tools/testing/selftests/move_mount_set_group/Makefile [new file with mode: 0644]
tools/testing/selftests/move_mount_set_group/config [new file with mode: 0644]
tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c [new file with mode: 0644]

index 20caa4b..1285236 100644 (file)
@@ -2694,6 +2694,78 @@ out:
        return ret;
 }
 
+/*
+ * do_set_group - implement move_mount(MOVE_MOUNT_SET_GROUP)
+ * @from_path: path whose mount supplies the propagation state
+ * @to_path:   path whose mount receives that state
+ *
+ * Copies the propagation membership of the "from" mount onto the "to"
+ * mount instead of moving anything: if "from" is a slave, "to" is added
+ * to the slave list of the same master; if "from" is shared, "to" takes
+ * the same group id and is linked into the share list of "from". The two
+ * cases are tested independently, so both are applied when "from" is
+ * both a slave and shared.
+ *
+ * All checks and the update itself run with the namespace lock held.
+ *
+ * Return: 0 on success; -EINVAL when one of the structural checks below
+ * fails; -EPERM when the caller lacks CAP_SYS_ADMIN in the user namespace
+ * owning the mount namespace of either mount.
+ */
+static int do_set_group(struct path *from_path, struct path *to_path)
+{
+       struct mount *from, *to;
+       int err;
+
+       from = real_mount(from_path->mnt);
+       to = real_mount(to_path->mnt);
+
+       namespace_lock();
+
+       err = -EINVAL;
+       /* To and From must be mounted */
+       if (!is_mounted(&from->mnt))
+               goto out;
+       if (!is_mounted(&to->mnt))
+               goto out;
+
+       err = -EPERM;
+       /* We should be allowed to modify mount namespaces of both mounts */
+       if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
+               goto out;
+       if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
+               goto out;
+
+       /* Every remaining check fails with -EINVAL */
+       err = -EINVAL;
+       /* To and From paths should be mount roots */
+       if (from_path->dentry != from_path->mnt->mnt_root)
+               goto out;
+       if (to_path->dentry != to_path->mnt->mnt_root)
+               goto out;
+
+       /* Setting sharing groups is only allowed across same superblock */
+       if (from->mnt.mnt_sb != to->mnt.mnt_sb)
+               goto out;
+
+       /* From mount root should be wider than To mount root */
+       if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
+               goto out;
+
+       /* From mount should not have locked children in place of To's root */
+       if (has_locked_children(from, to->mnt.mnt_root))
+               goto out;
+
+       /* Setting sharing groups is only allowed on private mounts */
+       if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
+               goto out;
+
+       /* From should not be private */
+       if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
+               goto out;
+
+       /* Slave "from": make "to" a slave of the very same master */
+       if (IS_MNT_SLAVE(from)) {
+               struct mount *m = from->mnt_master;
+
+               list_add(&to->mnt_slave, &m->mnt_slave_list);
+               to->mnt_master = m;
+       }
+
+       /* Shared "from": give "to" the same group id and link the two */
+       if (IS_MNT_SHARED(from)) {
+               to->mnt_group_id = from->mnt_group_id;
+               list_add(&to->mnt_share, &from->mnt_share);
+               /* The shared flag itself is flipped under the mount hash lock */
+               lock_mount_hash();
+               set_mnt_shared(to);
+               unlock_mount_hash();
+       }
+
+       err = 0;
+out:
+       namespace_unlock();
+       return err;
+}
+
 static int do_move_mount(struct path *old_path, struct path *new_path)
 {
        struct mnt_namespace *ns;
@@ -3678,7 +3750,10 @@ SYSCALL_DEFINE5(move_mount,
        if (ret < 0)
                goto out_to;
 
-       ret = do_move_mount(&from_path, &to_path);
+       if (flags & MOVE_MOUNT_SET_GROUP)
+               ret = do_set_group(&from_path, &to_path);
+       else
+               ret = do_move_mount(&from_path, &to_path);
 
 out_to:
        path_put(&to_path);
index dd7a166..4d93967 100644 (file)
@@ -73,7 +73,8 @@
 #define MOVE_MOUNT_T_SYMLINKS          0x00000010 /* Follow symlinks on to path */
 #define MOVE_MOUNT_T_AUTOMOUNTS                0x00000020 /* Follow automounts on to path */
 #define MOVE_MOUNT_T_EMPTY_PATH                0x00000040 /* Empty to path permitted */
-#define MOVE_MOUNT__MASK               0x00000077
+#define MOVE_MOUNT_SET_GROUP           0x00000100 /* Set sharing group instead */
+#define MOVE_MOUNT__MASK               0x00000177
 
 /*
  * fsopen() flags.
index fb010a3..dd0388e 100644 (file)
@@ -35,6 +35,7 @@ TARGETS += memory-hotplug
 TARGETS += mincore
 TARGETS += mount
 TARGETS += mount_setattr
+TARGETS += move_mount_set_group
 TARGETS += mqueue
 TARGETS += nci
 TARGETS += net
diff --git a/tools/testing/selftests/move_mount_set_group/.gitignore b/tools/testing/selftests/move_mount_set_group/.gitignore
new file mode 100644 (file)
index 0000000..f5e3392
--- /dev/null
@@ -0,0 +1 @@
+move_mount_set_group_test
diff --git a/tools/testing/selftests/move_mount_set_group/Makefile b/tools/testing/selftests/move_mount_set_group/Makefile
new file mode 100644 (file)
index 0000000..80c2d86
--- /dev/null
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for move_mount_set_group selftests.
+CFLAGS = -g -I../../../../usr/include/ -Wall -O2
+
+TEST_GEN_FILES += move_mount_set_group_test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/move_mount_set_group/config b/tools/testing/selftests/move_mount_set_group/config
new file mode 100644 (file)
index 0000000..416bd53
--- /dev/null
@@ -0,0 +1 @@
+CONFIG_USER_NS=y
diff --git a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c
new file mode 100644 (file)
index 0000000..860198f
--- /dev/null
@@ -0,0 +1,375 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <sys/syscall.h>
+
+#include "../kselftest_harness.h"
+
+#ifndef CLONE_NEWNS
+#define CLONE_NEWNS 0x00020000
+#endif
+
+#ifndef CLONE_NEWUSER
+#define CLONE_NEWUSER 0x10000000
+#endif
+
+#ifndef MS_SHARED
+#define MS_SHARED (1 << 20)
+#endif
+
+#ifndef MS_PRIVATE
+#define MS_PRIVATE (1<<18)
+#endif
+
+#ifndef MOVE_MOUNT_SET_GROUP
+#define MOVE_MOUNT_SET_GROUP 0x00000100
+#endif
+
+#ifndef MOVE_MOUNT_F_EMPTY_PATH
+#define MOVE_MOUNT_F_EMPTY_PATH 0x00000004
+#endif
+
+#ifndef MOVE_MOUNT_T_EMPTY_PATH
+#define MOVE_MOUNT_T_EMPTY_PATH 0x00000040
+#endif
+
+/*
+ * write(2) wrapper that restarts the call whenever it fails with EINTR.
+ * Returns whatever the first non-interrupted write() returns; a short
+ * write is passed back to the caller unchanged.
+ */
+static ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+       for (;;) {
+               ssize_t written = write(fd, buf, count);
+
+               if (written >= 0 || errno != EINTR)
+                       return written;
+       }
+}
+
+/*
+ * Write @count bytes from @buf to the existing file at @path using a
+ * single write_nointr() call. The file is opened write-only and without
+ * following a trailing symlink.
+ *
+ * Returns 0 only when exactly @count bytes were written, -1 otherwise
+ * (open failure, write error or short write).
+ */
+static int write_file(const char *path, const void *buf, size_t count)
+{
+       ssize_t written;
+       int fd = open(path, O_WRONLY | O_CLOEXEC | O_NOCTTY | O_NOFOLLOW);
+
+       if (fd < 0)
+               return -1;
+
+       written = write_nointr(fd, buf, count);
+       close(fd);
+
+       return (written >= 0 && (size_t)written == count) ? 0 : -1;
+}
+
+/*
+ * Put the calling process into a new user namespace in which its current
+ * uid and gid are each mapped to id 0, then switch to gid 0 and uid 0
+ * inside that namespace.
+ *
+ * Returns 0 on success and -1 as soon as any step fails.
+ */
+static int create_and_enter_userns(void)
+{
+       uid_t uid;
+       gid_t gid;
+       char map[100];
+
+       /* Read the ids before unshare(); they form the outer side of the maps */
+       uid = getuid();
+       gid = getgid();
+
+       if (unshare(CLONE_NEWUSER))
+               return -1;
+
+       /* A missing setgroups file (ENOENT) is tolerated, other errors are not */
+       if (write_file("/proc/self/setgroups", "deny", sizeof("deny") - 1) &&
+           errno != ENOENT)
+               return -1;
+
+       /*
+        * Cast explicitly so the conversion specifier matches the argument
+        * type no matter how uid_t/gid_t are defined.
+        */
+       snprintf(map, sizeof(map), "0 %u 1", (unsigned int)uid);
+       if (write_file("/proc/self/uid_map", map, strlen(map)))
+               return -1;
+
+       snprintf(map, sizeof(map), "0 %u 1", (unsigned int)gid);
+       if (write_file("/proc/self/gid_map", map, strlen(map)))
+               return -1;
+
+       if (setgid(0))
+               return -1;
+
+       if (setuid(0))
+               return -1;
+
+       return 0;
+}
+
+/*
+ * Enter a new user namespace (via create_and_enter_userns()) and a new
+ * mount namespace, then mark "/" recursively private.
+ *
+ * Returns 0 on success, -1 if any of the three steps fails; later steps
+ * are not attempted once one has failed.
+ */
+static int prepare_unpriv_mountns(void)
+{
+       if (create_and_enter_userns() != 0 ||
+           unshare(CLONE_NEWNS) != 0 ||
+           mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) != 0)
+               return -1;
+
+       return 0;
+}
+
+/*
+ * Step over @nfields space/tab separated fields of @src and return a
+ * pointer to what follows. Each single separator character ends one
+ * field. If the string ends first, the pointer to its terminating NUL is
+ * returned; the result is never NULL.
+ */
+static char *get_field(char *src, int nfields)
+{
+       char *cur = src;
+       int skipped = 0;
+
+       while (skipped < nfields) {
+               /* Jump to the next separator, or to the end of the string */
+               cur += strcspn(cur, " \t");
+               if (*cur == '\0')
+                       break;
+
+               cur++;
+               skipped++;
+       }
+
+       return cur;
+}
+
+/*
+ * Terminate @word in place at its first space or tab. A string without
+ * either is left as it is (its NUL is rewritten with NUL).
+ */
+static void null_endofword(char *word)
+{
+       word[strcspn(word, " \t")] = '\0';
+}
+
+/*
+ * Report whether /proc/self/mountinfo lists a mount whose mount point
+ * (5th field) equals @path and whose 7th field contains "shared:".
+ *
+ * Scanning stops at the first such line. Returns false when no line
+ * qualifies or when mountinfo cannot be opened. The getline() buffer and
+ * the stream are released on every path.
+ */
+static bool is_shared_mount(const char *path)
+{
+       size_t len = 0;
+       char *line = NULL;
+       bool shared = false;
+       FILE *f;
+
+       f = fopen("/proc/self/mountinfo", "re");
+       if (!f)
+               return false;
+
+       while (getline(&line, &len, f) != -1) {
+               char *opts, *target;
+
+               /*
+                * get_field() never returns NULL. Locate both fields
+                * before terminating either one, since cutting the line
+                * at target would hide everything behind it.
+                */
+               target = get_field(line, 4);
+               opts = get_field(target, 2);
+
+               null_endofword(target);
+               if (strcmp(target, path) != 0)
+                       continue;
+
+               null_endofword(opts);
+               if (strstr(opts, "shared:")) {
+                       shared = true;
+                       break;
+               }
+       }
+
+       free(line);
+       fclose(f);
+
+       return shared;
+}
+
+/* Attempt to de-conflict with the selftests tree. */
+#ifndef SKIP
+#define SKIP(s, ...)   XFAIL(s, ##__VA_ARGS__)
+#endif
+
+#define SET_GROUP_FROM "/tmp/move_mount_set_group_supported_from"
+#define SET_GROUP_TO   "/tmp/move_mount_set_group_supported_to"
+
+/*
+ * Probe whether move_mount(MOVE_MOUNT_SET_GROUP) works on this kernel.
+ *
+ * Builds a scratch tree on a private tmpfs mounted over /tmp: a tmpfs on
+ * SET_GROUP_FROM, a bind mount of it on SET_GROUP_TO, and SET_GROUP_FROM
+ * made shared; then tries to copy the sharing group from one to the
+ * other.
+ *
+ * Returns 1 if the move_mount() call succeeded, 0 if it failed, and -1
+ * if the scratch tree could not be set up. Once the tmpfs on /tmp has
+ * been mounted it is lazily detached again on every path, including the
+ * setup failures.
+ */
+static int move_mount_set_group_supported(void)
+{
+       int ret = -1;
+
+       if (mount("testing", "/tmp", "tmpfs", MS_NOATIME | MS_NODEV,
+                 "size=100000,mode=700"))
+               return -1;
+
+       if (mount(NULL, "/tmp", NULL, MS_PRIVATE, 0))
+               goto out_umount;
+
+       if (mkdir(SET_GROUP_FROM, 0777))
+               goto out_umount;
+
+       if (mkdir(SET_GROUP_TO, 0777))
+               goto out_umount;
+
+       if (mount("testing", SET_GROUP_FROM, "tmpfs", MS_NOATIME | MS_NODEV,
+                 "size=100000,mode=700"))
+               goto out_umount;
+
+       if (mount(SET_GROUP_FROM, SET_GROUP_TO, NULL, MS_BIND, NULL))
+               goto out_umount;
+
+       if (mount(NULL, SET_GROUP_FROM, NULL, MS_SHARED, 0))
+               goto out_umount;
+
+       /* Any failure of the syscall itself is reported as "not supported" */
+       if (syscall(SYS_move_mount, AT_FDCWD, SET_GROUP_FROM,
+                   AT_FDCWD, SET_GROUP_TO, MOVE_MOUNT_SET_GROUP) < 0)
+               ret = 0;
+       else
+               ret = 1;
+
+out_umount:
+       umount2("/tmp", MNT_DETACH);
+
+       return ret;
+}
+
+/* Test fixture for the suite; it carries no per-test data. */
+FIXTURE(move_mount_set_group) {
+};
+
+#define SET_GROUP_A "/tmp/A"
+
+/*
+ * Fixture setup: enter a fresh user and mount namespace, skip when the
+ * kernel does not support MOVE_MOUNT_SET_GROUP, then mount a tmpfs on
+ * /tmp and a second tmpfs on SET_GROUP_A inside it.
+ */
+FIXTURE_SETUP(move_mount_set_group)
+{
+       int ret;
+
+       ASSERT_EQ(prepare_unpriv_mountns(), 0);
+
+       /* ret < 0: the probe could not be set up; ret == 0: unsupported */
+       ret = move_mount_set_group_supported();
+       ASSERT_GE(ret, 0);
+       if (!ret)
+               SKIP(return, "move_mount(MOVE_MOUNT_SET_GROUP) is not supported");
+
+       /*
+        * NOTE(review): the probe already detaches its own /tmp mount on
+        * this path, so this presumably acts as a safety net - confirm.
+        * The return value is deliberately not checked.
+        */
+       umount2("/tmp", MNT_DETACH);
+
+       ASSERT_EQ(mount("testing", "/tmp", "tmpfs", MS_NOATIME | MS_NODEV,
+                       "size=100000,mode=700"), 0);
+
+       ASSERT_EQ(mkdir(SET_GROUP_A, 0777), 0);
+
+       ASSERT_EQ(mount("testing", SET_GROUP_A, "tmpfs", MS_NOATIME | MS_NODEV,
+                       "size=100000,mode=700"), 0);
+}
+
+/*
+ * Fixture teardown: repeat the support probe (skipping when the feature
+ * is absent) and lazily detach whatever is mounted on top of /tmp.
+ */
+FIXTURE_TEARDOWN(move_mount_set_group)
+{
+       int ret;
+
+       ret = move_mount_set_group_supported();
+       ASSERT_GE(ret, 0);
+       if (!ret)
+               SKIP(return, "move_mount(MOVE_MOUNT_SET_GROUP) is not supported");
+
+       /* Best effort: the result of the detach is not checked */
+       umount2("/tmp", MNT_DETACH);
+}
+
+#define __STACK_SIZE (8 * 1024 * 1024)
+/*
+ * Run @fn(@arg) in a child created with clone(2) on a freshly allocated
+ * stack of __STACK_SIZE bytes. SIGCHLD is always added to @flags so the
+ * child can be reaped with waitpid().
+ *
+ * Returns the child's pid, -ENOMEM if the stack cannot be allocated, or
+ * the negative value returned by clone() on failure (errno preserved).
+ *
+ * The stack is released again whenever the parent can know the child no
+ * longer needs this copy of it: clone() failed, the child does not share
+ * our address space (no CLONE_VM), or CLONE_VFORK kept us suspended until
+ * the child exited or exec'ed. Only a CLONE_VM child without CLONE_VFORK
+ * keeps the allocation, since it may still be running on it.
+ */
+static pid_t do_clone(int (*fn)(void *), void *arg, int flags)
+{
+       char *stack;
+       pid_t pid;
+
+       stack = malloc(__STACK_SIZE);
+       if (!stack)
+               return -ENOMEM;
+
+#ifdef __ia64__
+       pid = __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg, NULL);
+#else
+       /* clone() wants the topmost address of the stack area */
+       pid = clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg, NULL);
+#endif
+
+       if (pid < 0 || (flags & CLONE_VFORK) || !(flags & CLONE_VM)) {
+               int saved_errno = errno;
+
+               free(stack);
+               errno = saved_errno;
+       }
+
+       return pid;
+}
+
+/*
+ * Reap child @pid, retrying waitpid() for as long as it is interrupted
+ * by a signal.
+ *
+ * Returns the child's exit status if it exited normally, and -1 if
+ * waitpid() failed for any other reason or the child did not terminate
+ * through a normal exit.
+ */
+static int wait_for_pid(pid_t pid)
+{
+       int status;
+
+       while (waitpid(pid, &status, 0) == -1) {
+               if (errno != EINTR)
+                       return -1;
+       }
+
+       return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
+}
+
+/* Arguments for, and results from, get_nestedns_mount_cb(). */
+struct child_args {
+       int unsfd;      /* out: fd opened on the child's /proc/self/ns/user */
+       int mntnsfd;    /* out: fd opened on the child's /proc/self/ns/mnt */
+       bool shared;    /* in: make SET_GROUP_A shared before opening it */
+       int mntfd;      /* out: fd opened on SET_GROUP_A inside the child */
+};
+
+/*
+ * clone() callback: enter a nested user and mount namespace, optionally
+ * make SET_GROUP_A shared, and store fds for the user namespace, the
+ * mount namespace and SET_GROUP_A in the struct child_args passed as
+ * @data. Each field is written only after its open() succeeded.
+ *
+ * Returns 0 on success and 1 on the first failure; fds opened before a
+ * failure are left open.
+ */
+static int get_nestedns_mount_cb(void *data)
+{
+       struct child_args *args = data;
+       int fd;
+
+       if (prepare_unpriv_mountns())
+               return 1;
+
+       if (args->shared && mount(NULL, SET_GROUP_A, NULL, MS_SHARED, 0))
+               return 1;
+
+       fd = open("/proc/self/ns/user", O_RDONLY);
+       if (fd < 0)
+               return 1;
+       args->unsfd = fd;
+
+       fd = open("/proc/self/ns/mnt", O_RDONLY);
+       if (fd < 0)
+               return 1;
+       args->mntnsfd = fd;
+
+       fd = open(SET_GROUP_A, O_RDONLY);
+       if (fd < 0)
+               return 1;
+       args->mntfd = fd;
+
+       return 0;
+}
+
+/*
+ * Copy a sharing group between mounts that live in two different nested
+ * user + mount namespaces.
+ *
+ * Two helper children each enter their own namespaces and hand back an
+ * fd opened on SET_GROUP_A; only the first one makes SET_GROUP_A shared.
+ * The parent then calls move_mount(MOVE_MOUNT_SET_GROUP) from the first
+ * fd to the second and checks, from inside the second child's mount
+ * namespace, that SET_GROUP_A is now reported as shared.
+ */
+TEST_F(move_mount_set_group, complex_sharing_copying)
+{
+       struct child_args ca_from = {
+               .shared = true,
+       };
+       struct child_args ca_to = {
+               .shared = false,
+       };
+       pid_t pid;
+       int ret;
+
+       ret = move_mount_set_group_supported();
+       ASSERT_GE(ret, 0);
+       if (!ret)
+               SKIP(return, "move_mount(MOVE_MOUNT_SET_GROUP) is not supported");
+
+       /*
+        * The children share our memory and fd table (CLONE_VM |
+        * CLONE_FILES), so the fds they store in ca_from / ca_to are
+        * usable here once each child has been waited for.
+        */
+       pid = do_clone(get_nestedns_mount_cb, (void *)&ca_from, CLONE_VFORK |
+                      CLONE_VM | CLONE_FILES); ASSERT_GT(pid, 0);
+       ASSERT_EQ(wait_for_pid(pid), 0);
+
+       pid = do_clone(get_nestedns_mount_cb, (void *)&ca_to, CLONE_VFORK |
+                      CLONE_VM | CLONE_FILES); ASSERT_GT(pid, 0);
+       ASSERT_EQ(wait_for_pid(pid), 0);
+
+       /* Both mounts are addressed purely by fd, hence the empty paths */
+       ASSERT_EQ(syscall(SYS_move_mount, ca_from.mntfd, "",
+                         ca_to.mntfd, "", MOVE_MOUNT_SET_GROUP
+                         | MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH),
+                 0);
+
+       /* Enter the second child's mount namespace before reading mountinfo */
+       ASSERT_EQ(setns(ca_to.mntnsfd, CLONE_NEWNS), 0);
+       ASSERT_EQ(is_shared_mount(SET_GROUP_A), 1);
+}
+
+TEST_HARNESS_MAIN