restore: correctly restore cgroup mounts inside a container

Before the nsroot= mount option, we were just getting lucky because the cgroup superblocks "matched" when inspecting them from userspace, so we were actually getting a bind mount from the host when migrating from within cgroup namespaces. Instead, let's actually do a new mount (i.e. not a bind mount) for cgroup namespaces. For this, we need two things: 1. to prepare the cgroup namespace (and thus the cgroups) before the mount ns, so when the mount() occurs it is relative to the right cgroup path. 2. not reject cgroup filesystems with no root. A cgroup ns mount looks like: 223 222 0:22 /lxc/unpriv /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd,nsroot=/lxc/unpriv i.e. it has /lxc/unpriv as its root, and thus doesn't look rooted to CRIU. We use the fstype->parse hook to rewrite this root to /, since it is handled by the cgroup ns infrastructure. v2: add new fstype->munge hook, allowing fstypes to munge their parsed mountinfo entries if they want to. This allows us to get rid of the ugly hacks with FSTYPE__CGROUP everywhere in the patch. v3: s/fstype->munge/fstype->parse for FSTYPE__CGROUP Signed-off-by: Tycho Andersen <tycho.andersen@canonical.com> Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>

restore: correctly restore cgroup mounts inside a container
Before the nsroot= mount option, we were just getting lucky because the cgroup superblocks "matched" when inspecting them from userspace, so we were actually getting a bind mount from the host when migrating from within cgroup namespaces. Instead, let's actually do a new mount (i.e. not a bind mount) for cgroup namespaces. For this, we need two things: 1. to prepare the cgroup namespace (and thus the cgroups) before the mount ns, so when the mount() occurs it is relative to the right cgroup path. 2. not reject cgroup filesystems with no root. A cgroup ns mount looks like: 223 222 0:22 /lxc/unpriv /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd,nsroot=/lxc/unpriv i.e. it has /lxc/unpriv as its root, and thus doesn't look rooted to CRIU. We use the fstype->parse hook to rewrite this root to /, since it is handled by the cgroup ns infrastructure. v2: add new fstype->munge hook, allowing fstypes to munge their parsed mountinfo entries if they want to. This allows us to get rid of the ugly hacks with FSTYPE__CGROUP everywhere in the patch. v3: s/fstype->munge/fstype->parse for FSTYPE__CGROUP Signed-off-by: Tycho Andersen <tycho.andersen@canonical.com> Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
84a89b46 · Tycho Andersen · Pavel Emelyanov · 870089d1 · 84a89b46 · 84a89b46
Commit 84a89b46 authored Mar 28, 2016 by Tycho Andersen Committed by Pavel Emelyanov Apr 01, 2016
Hide whitespace changes
Inline Side-by-side

Showing with 26 additions and 9 deletions

cr-restore.c criu/cr-restore.c +9 -9

mount.c criu/mount.c +17 -0

No files found.
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -1564,6 +1564,15 @@ static int restore_task_with_children(void *_arg)
 			goto err;
 	}

+	/*
+	 * Call this _before_ forking to optimize cgroups
+	 * restore -- if all tasks live in one set of cgroups
+	 * we will only move the root one there, others will
+	 * just have it inherited.
+	 */
+	if (prepare_task_cgroup(current) < 0)
+		goto err;
+
 	/* Restore root task */
 	if (current->parent == NULL) {
 		if (restore_finish_stage(CR_STATE_RESTORE_NS) < 0)
@@ -1596,15 +1605,6 @@ static int restore_task_with_children(void *_arg)
 	if (prepare_mappings())
 		goto err;

-	/*
-	 * Call this _before_ forking to optimize cgroups
-	 * restore -- if all tasks live in one set of cgroups
-	 * we will only move the root one there, others will
-	 * just have it inherited.
-	 */
-	if (prepare_task_cgroup(current) < 0)
-		goto err;
-
 	if (prepare_sigactions() < 0)
 		goto err;


--- a/criu/mount.c
+++ b/criu/mount.c
@@ -1632,6 +1632,22 @@ out:
 	return ret;
 }

+static int cgroup_parse(struct mount_info *pm)
+{
+	if (!(root_ns_mask & CLONE_NEWCGROUP))
+		return 0; /* CLONE_NEWCGROUP not set in root_ns_mask: leave pm untouched */
+
+	/* A cgroup-namespaced mount reports its nsroot= path (e.g. /lxc/unpriv) as
+	 * its root, so it doesn't look rooted to CRIU; replace the root with "/".
+	 */
+	xfree(pm->root);
+	pm->root = xstrdup("/");
+	if (!pm->root)
+		return -1; /* xstrdup() returned NULL */
+
+	return 0;
+}
+
 static int dump_empty_fs(struct mount_info *pm)
 {
 	int fd, ret = -1;
@@ -1717,6 +1733,7 @@ static struct fstype fstypes[32] = {
 	}, {
 		.name = "cgroup",
 		.code = FSTYPE__CGROUP,
+		.parse = cgroup_parse,
 	}, {
 		.name = "aufs",
 		.code = FSTYPE__AUFS,