Commit b364f4fd authored by Pavel Tikhomirov, committed by Andrei Vagin

mount: make open_mountpoint handle overmounts properly

dump of VZ7 ct fails, if we have overmounted tmpfs inside:

[root@silo ~]# prlctl enter su-test-2
entered into CT
CT-829e7b28 /# mkdir /mnt/overmntedtmp
CT-829e7b28 /# mount -t tmpfs tmpfs /mnt/overmntedtmp/
CT-829e7b28 /# mount -t tmpfs tmpfs /mnt
CT-829e7b28 /# logout

[root@silo ~]# prlctl suspend su-test-2
Suspending the CT...
Failed to suspend the CT: PRL_ERR_VZCTL_OPERATION_FAILED (Details: Will skip in-flight TCP connections
(01.657913) Error (criu/mount.c:1202): mnt: Can't open ./mnt/overmntedtmp: No such file or directory
(01.662528) Error (criu/util.c:709): exited, status=1
(01.664329) Error (criu/util.c:709): exited, status=1
(01.664694) Error (criu/cr-dump.c:2005): Dumping FAILED.
Failed to checkpoint the Container
All dump files and logs were saved to /vz/private/829e7b28-f204-4bce-b09f-d203b99befd4/dump/Dump.fail
Checkpointing failed
)

Criu wants to dump the contents of /mnt/overmntedtmp/ mount but it is
unavailable. So we copy the mount namespace in such a case and unmount
overmounts to access what we want to dump.

The actual use case here is dumping a CT with an active mariadb and an ssh
connection. Together they happen to create such an overmount: by default
systemd creates a separate mount namespace for mysql and also mounts
tmpfs on /run/user in it, and when ssh (root) is connected, systemd also
mounts tmpfs on /run/user/0 in the container's root mount namespace for user
files. As /run is a slave mount, /run/user/0 also propagates to mysql's
mount namespace and initially becomes overmounted by /run/user.

https://jira.sw.ru/browse/PSBM-57362

remove __maybe_unused for mnt_is_overmounted and umount_overmounts

changes in v2:
1) Use clone not fork, share resources with parent same as in
call_in_child_process.
2) Do not enter userns (create helper) for non-overmounted mounts. Thus
bring back the setns/restore_ns logic.
3) Helper opens fd for parent directly due to CLONE_FILES, remove futex.
4) Check helper exit status properly.
5) Add get_clean_fd helper.
6) Add better comments.

changes in v3:
1) Pass fd from helper through args instead of ret code, fix ret code
checking.
2) Add \n to pr_err in open_mountpoint

changes in v5:
Make comments even better.
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
parent 83df8649
...@@ -387,7 +387,7 @@ static int tmpfs_dump(struct mount_info *pm) ...@@ -387,7 +387,7 @@ static int tmpfs_dump(struct mount_info *pm)
fd = open_mountpoint(pm); fd = open_mountpoint(pm);
if (fd < 0) if (fd < 0)
return fd; return MNT_UNREACHABLE;
/* if fd happens to be 0 here, we need to move it to something /* if fd happens to be 0 here, we need to move it to something
* non-zero, because cr_system_userns closes STDIN_FILENO as we are not * non-zero, because cr_system_userns closes STDIN_FILENO as we are not
......
...@@ -12,6 +12,8 @@ struct ns_id; ...@@ -12,6 +12,8 @@ struct ns_id;
#define MOUNT_INVALID_DEV (0) #define MOUNT_INVALID_DEV (0)
#define MNT_UNREACHABLE INT_MIN
struct mount_info { struct mount_info {
int mnt_id; int mnt_id;
int parent_mnt_id; int parent_mnt_id;
......
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include "path.h" #include "path.h"
#include "files-reg.h" #include "files-reg.h"
#include "external.h" #include "external.h"
#include "clone-noasan.h"
#include "images/mnt.pb-c.h" #include "images/mnt.pb-c.h"
...@@ -1096,6 +1097,19 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_ ...@@ -1096,6 +1097,19 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
return mnt_path; return mnt_path;
} }
/*
 * Obtain a file descriptor for the path that get_clean_mnt() produces
 * for @mi and hand that path over to open_detach_mount().
 *
 * Two mkdtemp-style templates are offered to get_clean_mnt(): one under
 * /tmp and one directly under the root.
 * NOTE(review): presumably get_clean_mnt() falls back to the second
 * template when the first cannot be used - confirm against its definition.
 *
 * Returns whatever open_detach_mount() returns, or -1 when
 * get_clean_mnt() does not yield a path.
 */
static int get_clean_fd(struct mount_info *mi)
{
	char tmp_template[] = "/tmp/cr-tmpfs.XXXXXX";
	char root_template[] = "/cr-tmpfs.XXXXXX";
	char *clean_path;

	clean_path = get_clean_mnt(mi, tmp_template, root_template);

	return clean_path ? open_detach_mount(clean_path) : -1;
}
/* /*
* Our children mount can have same mountpoint as it's parent, * Our children mount can have same mountpoint as it's parent,
* call these - children-overmount. * call these - children-overmount.
...@@ -1105,7 +1119,7 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_ ...@@ -1105,7 +1119,7 @@ static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_
* root of our mount namespace as it is covered by other mount. * root of our mount namespace as it is covered by other mount.
* mnt_is_overmounted() checks if mount is not visible. * mnt_is_overmounted() checks if mount is not visible.
*/ */
static __maybe_unused bool mnt_is_overmounted(struct mount_info *mi) static bool mnt_is_overmounted(struct mount_info *mi)
{ {
struct mount_info *t, *c, *m = mi; struct mount_info *t, *c, *m = mi;
...@@ -1222,7 +1236,7 @@ next: ...@@ -1222,7 +1236,7 @@ next:
} }
/* Make our mountpoint fully visible */ /* Make our mountpoint fully visible */
static __maybe_unused int umount_overmounts(struct mount_info *m) static int umount_overmounts(struct mount_info *m)
{ {
if (__umount_overmounts(m)) if (__umount_overmounts(m))
return -1; return -1;
...@@ -1233,40 +1247,79 @@ static __maybe_unused int umount_overmounts(struct mount_info *m) ...@@ -1233,40 +1247,79 @@ static __maybe_unused int umount_overmounts(struct mount_info *m)
return 0; return 0;
} }
#define MNT_UNREACHABLE INT_MIN struct clone_arg {
int open_mountpoint(struct mount_info *pm) struct mount_info *mi;
int *fd;
};
/*
* Get access to the mountpoint covered by overmounts
* and open it's cleaned copy (without children mounts).
*/
int ns_open_mountpoint(void *arg)
{ {
struct mount_info *c; struct clone_arg *ca = arg;
int fd = -1, ns_old = -1; struct mount_info *mi = ca->mi;
char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX"; int *fd = ca->fd;
char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
char *mnt_path = mnt_path_tmp;
int cwd_fd;
/* /*
* If a mount doesn't have children, we can open a mount point, * We should enter user namespace owning mount namespace of our mount
* otherwise we need to create a "private" copy. * before creating helper mount namespace. Else all mounts in helper
* mount namespace will be locked (MNT_LOCKED) and we won't be able to
* unmount them (see CL_UNPRIVILEGED in sys_umount(), clone_mnt() and
* copy_mnt_ns() in linux kernel code).
*/ */
if (list_empty(&pm->children)) if ((root_ns_mask & CLONE_NEWUSER) &&
return __open_mountpoint(pm, -1); switch_ns(root_item->pid->real, &user_ns_desc, NULL) < 0)
goto err;
pr_info("Something is mounted on top of %s\n", pm->mountpoint);
list_for_each_entry(c, &pm->children, siblings) { /*
if (!strcmp(c->mountpoint, pm->mountpoint)) { * Create a helper mount namespace in which we can safely do unmounts
pr_debug("%d:%s is overmounted\n", pm->mnt_id, pm->mountpoint); * without breaking dumping process' environment.
return MNT_UNREACHABLE; */
} if (unshare(CLONE_NEWNS)) {
pr_perror("Unable to unshare a mount namespace");
goto err;
} }
/* Remount all mounts as private to disable propagation */
if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL))
goto err;
if (umount_overmounts(mi))
goto err;
/* Save fd which we opened for parent due to CLONE_FILES flag */
*fd = get_clean_fd(mi);
if (*fd < 0)
goto err;
return 0;
err:
return 1;
}
int open_mountpoint(struct mount_info *pm)
{
int fd, cwd_fd, ns_old = -1;
/* No overmounts and children - the entire mount is visible */
if (list_empty(&pm->children) && !mnt_is_overmounted(pm))
return __open_mountpoint(pm, -1);
pr_info("Mount is not fully visible %s\n", pm->mountpoint);
/* /*
* To create a "private" copy, the target mount is bind-mounted * We do two things below:
* in a temporary place w/o MS_REC (non-recursively). * a) If mount has children mounts in it which partially cover it's
* A mount point can't be bind-mounted in criu's namespace, it will be * content, to get access to the content we create a "private" copy of
* mounted in a target namespace. The sequence of actions is * such a mount, bind-mounting mount w/o MS_REC in a temporary place.
* mkdtemp, setns(tgt), mount, open, detach, setns(old). * b) If mount is overmounted we create a private copy of it's mount
* namespace so that we can safely get rid of overmounts and get an
* access to the mount.
* In both cases we can't do the thing from criu's mount namespace, so
* we need to switch to mount's mount namespace, and later swtich back.
*/ */
cwd_fd = open(".", O_DIRECTORY); cwd_fd = open(".", O_DIRECTORY);
if (cwd_fd < 0) { if (cwd_fd < 0) {
pr_perror("Unable to open cwd"); pr_perror("Unable to open cwd");
...@@ -1274,33 +1327,54 @@ int open_mountpoint(struct mount_info *pm) ...@@ -1274,33 +1327,54 @@ int open_mountpoint(struct mount_info *pm)
} }
if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0) if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
goto out; goto err;
if (!mnt_is_overmounted(pm)) {
pr_info("\tmount has children %s\n", pm->mountpoint);
fd = get_clean_fd(pm);
if (fd < 0)
goto err;
} else {
int pid, status;
struct clone_arg ca = {
.mi = pm,
.fd = &fd
};
pr_info("\tmount is overmounted %s\n", pm->mountpoint);
mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
if (mnt_path == NULL) {
/* /*
* We probably can't create a temporary direcotry, * We are overmounted - not accessible in a regular way. We
* so we can try to clone the mount namespace, open * need to clone "private" copy of mount's monut namespace and
* the required mount and destroy this mount namespace * unmount all covering overmounts in it. We also need to enter
* by calling restore_ns() below in this function. * user namespace owning these mount namespace just before that
* (see explanation in ns_open_mountpoint). Thus we also have
* to create helper process here as entering user namespace is
* irreversible operation.
*/ */
if (unshare(CLONE_NEWNS)) { pid = clone_noasan(ns_open_mountpoint, CLONE_VFORK | CLONE_VM
pr_perror("Unable to clone a mount namespace"); | CLONE_FILES | CLONE_IO | CLONE_SIGHAND
goto out; | CLONE_SYSVSEM, &ca);
if (pid == -1) {
pr_perror("Can't clone helper process");
return -1;
} }
fd = open(pm->mountpoint, O_RDONLY | O_DIRECTORY, 0); errno = 0;
if (fd < 0) if (waitpid(pid, &status, __WALL) != pid || !WIFEXITED(status)
pr_perror("Can't open directory %s: %d", pm->mountpoint, fd); || WEXITSTATUS(status)) {
} else pr_err("Can't wait or bad status: errno=%d, status=%d\n",
fd = open_detach_mount(mnt_path); errno, status);
if (fd < 0) return -1;
goto out; }
}
if (restore_ns(ns_old, &mnt_ns_desc)) { if (restore_ns(ns_old, &mnt_ns_desc)) {
ns_old = -1; ns_old = -1;
goto out; goto err;
} }
if (fchdir(cwd_fd)) { if (fchdir(cwd_fd)) {
pr_perror("Unable to restore cwd"); pr_perror("Unable to restore cwd");
close(cwd_fd); close(cwd_fd);
...@@ -1310,9 +1384,9 @@ int open_mountpoint(struct mount_info *pm) ...@@ -1310,9 +1384,9 @@ int open_mountpoint(struct mount_info *pm)
close(cwd_fd); close(cwd_fd);
return __open_mountpoint(pm, fd); return __open_mountpoint(pm, fd);
out: err:
if (ns_old >= 0) if (ns_old >= 0)
restore_ns(ns_old, &mnt_ns_desc); restore_ns(ns_old, &mnt_ns_desc);
close_safe(&fd); close_safe(&fd);
if (fchdir(cwd_fd)) if (fchdir(cwd_fd))
pr_perror("Unable to restore cwd"); pr_perror("Unable to restore cwd");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment