Commit 203c2914 authored by Pavel Emelyanov's avatar Pavel Emelyanov

cg: Restore tasks into proper cgroups

On restore find out in which sets tasks live in and move
them there.

Optimization note -- move tasks into cgroups _before_ fork
kids to make them inherit cgroups if required. This saves
a lot of time.

Accessibility note -- when moving tasks into cgroups don't
search for existing host mounts (they may be not available)
and don't mount temporary ones (may be impossible due to
user namespaces). Instead introduce service fd with a yard
of mounts.
Signed-off-by: 's avatarPavel Emelyanov <xemul@parallels.com>
parent 1ba9d2ca
...@@ -30,10 +30,23 @@ struct cg_set { ...@@ -30,10 +30,23 @@ struct cg_set {
static LIST_HEAD(cg_sets); static LIST_HEAD(cg_sets);
static unsigned int n_sets; static unsigned int n_sets;
static CgSetEntry **rst_sets;
static char *cg_yard;
static struct cg_set *root_cgset; /* Set root item lives in */ static struct cg_set *root_cgset; /* Set root item lives in */
static struct cg_set *criu_cgset; /* Set criu process lives in */ static struct cg_set *criu_cgset; /* Set criu process lives in */
static u32 cg_set_ids = 1; static u32 cg_set_ids = 1;
static CgSetEntry *find_rst_set_by_id(u32 id)
{
int i;
for (i = 0; i < n_sets; i++)
if (rst_sets[i]->id == id)
return rst_sets[i];
return NULL;
}
#define CGCMP_MATCH 1 /* check for exact match */ #define CGCMP_MATCH 1 /* check for exact match */
#define CGCMP_ISSUB 2 /* check set is subset of ctls */ #define CGCMP_ISSUB 2 /* check set is subset of ctls */
...@@ -233,3 +246,191 @@ int dump_cgroups(void) ...@@ -233,3 +246,191 @@ int dump_cgroups(void)
pr_info("Writing CG image\n"); pr_info("Writing CG image\n");
return pb_write_one(fdset_fd(glob_fdset, CR_FD_CGROUP), &cg, PB_CGROUP); return pb_write_one(fdset_fd(glob_fdset, CR_FD_CGROUP), &cg, PB_CGROUP);
} }
static int move_in_cgroup(CgSetEntry *se)
{
int cg, i;
pr_info("Move into %d\n", se->id);
cg = get_service_fd(CGROUP_YARD);
for (i = 0; i < se->n_ctls; i++) {
char aux[1024];
int fd, err;
ControllerEntry *ce = se->ctls[i];
sprintf(aux, "%s/%s/tasks", ce->name, ce->path);
pr_debug(" `-> %s\n", aux);
err = fd = openat(cg, aux, O_WRONLY);
if (fd >= 0) {
/*
* Writing zero into this file moves current
* task w/o any permissions checks :)
*/
err = write(fd, "0", 1);
close(fd);
}
if (err < 0) {
pr_perror("Can't move into %s (%d/%d)\n",
aux, err, fd);
return -1;
}
}
close_service_fd(CGROUP_YARD);
return 0;
}
int prepare_task_cgroup(struct pstree_item *me)
{
CgSetEntry *se;
u32 current_cgset;
if (!me->rst->cg_set)
return 0;
if (me->parent)
current_cgset = me->parent->rst->cg_set;
else
current_cgset = root_cg_set;
if (me->rst->cg_set == current_cgset) {
pr_info("Cgroups %d inherited from parent\n", current_cgset);
close_service_fd(CGROUP_YARD);
return 0;
}
se = find_rst_set_by_id(me->rst->cg_set);
if (!se) {
pr_err("No set %d found\n", me->rst->cg_set);
return -1;
}
return move_in_cgroup(se);
}
void fini_cgroup(void)
{
if (!cg_yard)
return;
close_service_fd(CGROUP_YARD);
umount2(cg_yard, MNT_DETACH);
rmdir(cg_yard);
xfree(cg_yard);
}
/*
* Prepare the CGROUP_YARD service descriptor. This guy is
* tmpfs mount with the set of ctl->name directories each
* one having the respective cgroup mounted.
*
* It's required for two reasons.
*
* First, if we move more than one task into cgroups it's
* faster to have cgroup tree visible by them all in sime
* single place. Searching for this thing existing in the
* criu's space is not nice, as parsing /proc/mounts is not
* very fast, other than this not all cgroups may be mounted.
*
* Second, when we have user-namespaces support we will
* loose the ability to mount cgroups on-demand, so prepare
* them in advance.
*/
static int prepare_cgroup_sfd(CgSetEntry *root_set)
{
int off, i;
char paux[PATH_MAX], aux[128];
pr_info("Preparing cgroups yard\n");
off = sprintf(paux, ".criu.cgyard.XXXXXX");
if (mkdtemp(paux) == NULL) {
pr_perror("Can't make temp cgyard dir");
return -1;
}
cg_yard = xstrdup(paux);
if (!cg_yard) {
rmdir(paux);
return -1;
}
if (mount("none", cg_yard, "tmpfs", 0, NULL)) {
pr_perror("Can't mount tmpfs in cgyard");
goto err;
}
for (i = 0; i < root_set->n_ctls; i++) {
ControllerEntry *ce = root_set->ctls[i];
char *opt = ce->name;
sprintf(paux + off, "/%s", ce->name);
if (strstartswith(ce->name, "name=")) {
sprintf(aux, "none,%s", ce->name);
opt = aux;
}
if (mkdir(paux, 0700)) {
pr_perror("Can't make cgyard subdir");
goto err;
}
if (mount("none", paux, "cgroup", 0, opt) < 0) {
pr_perror("Can't mount %s cgyard", ce->name);
goto err;
}
}
pr_debug("Opening %s as cg yard\n", cg_yard);
i = open(cg_yard, O_DIRECTORY);
if (i < 0) {
pr_perror("Can't open cgyard");
goto err;
}
off = install_service_fd(CGROUP_YARD, i);
close(i);
if (off < 0)
goto err;
return 0;
err:
fini_cgroup();
return -1;
}
int prepare_cgroup(void)
{
int fd, ret;
CgroupEntry *ce;
fd = open_image(CR_FD_CGROUP, O_RSTR | O_OPT);
if (fd < 0) {
if (errno == ENOENT) /* backward compatibility */
return 0;
else
return fd;
}
ret = pb_read_one_eof(fd, &ce, PB_CGROUP);
close(fd);
if (ret <= 0) /* Zero is OK -- no sets there. */
return ret;
n_sets = ce->n_sets;
rst_sets = ce->sets;
if (n_sets)
/*
* We rely on the fact that all sets contain the same
* set of controllers. This is checked during dump
* with cg_set_compare(CGCMP_ISSUB) call.
*/
ret = prepare_cgroup_sfd(rst_sets[0]);
else
ret = 0;
return ret;
}
...@@ -117,6 +117,9 @@ static int crtools_prepare_shared(void) ...@@ -117,6 +117,9 @@ static int crtools_prepare_shared(void)
if (tty_prep_fds()) if (tty_prep_fds())
return -1; return -1;
if (prepare_cgroup())
return -1;
return 0; return 0;
} }
...@@ -905,6 +908,7 @@ static inline int fork_with_pid(struct pstree_item *item) ...@@ -905,6 +908,7 @@ static inline int fork_with_pid(struct pstree_item *item)
return -1; return -1;
item->state = ca.core->tc->task_state; item->state = ca.core->tc->task_state;
item->rst->cg_set = ca.core->tc->cg_set;
switch (item->state) { switch (item->state) {
case TASK_ALIVE: case TASK_ALIVE:
...@@ -917,8 +921,14 @@ static inline int fork_with_pid(struct pstree_item *item) ...@@ -917,8 +921,14 @@ static inline int fork_with_pid(struct pstree_item *item)
pr_err("Unknown task state %d\n", item->state); pr_err("Unknown task state %d\n", item->state);
return -1; return -1;
} }
} else } else {
/*
* Helper entry will not get moved around and thus
* will live in the parent's cgset.
*/
item->rst->cg_set = item->parent->rst->cg_set;
ca.core = NULL; ca.core = NULL;
}
ret = -1; ret = -1;
...@@ -1281,6 +1291,15 @@ static int restore_task_with_children(void *_arg) ...@@ -1281,6 +1291,15 @@ static int restore_task_with_children(void *_arg)
exit(1); exit(1);
} }
/*
* Call this _before_ forking to optimize cgroups
* restore -- if all tasks live in one set of cgroups
* we will only move the root one there, others will
* just have it inherited.
*/
if (prepare_task_cgroup(current) < 0)
return -1;
if (create_children_and_session()) if (create_children_and_session())
goto err; goto err;
...@@ -1645,6 +1664,8 @@ int cr_restore_tasks(void) ...@@ -1645,6 +1664,8 @@ int cr_restore_tasks(void)
goto err; goto err;
ret = restore_root_task(root_item); ret = restore_root_task(root_item);
fini_cgroup();
err: err:
cr_plugin_fini(); cr_plugin_fini();
return ret; return ret;
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
bool fdinfo_per_id = false; bool fdinfo_per_id = false;
bool ns_per_id = false; bool ns_per_id = false;
TaskKobjIdsEntry *root_ids; TaskKobjIdsEntry *root_ids;
u32 root_cg_set;
int check_img_inventory(void) int check_img_inventory(void)
{ {
...@@ -39,6 +40,15 @@ int check_img_inventory(void) ...@@ -39,6 +40,15 @@ int check_img_inventory(void)
memcpy(root_ids, he->root_ids, sizeof(*root_ids)); memcpy(root_ids, he->root_ids, sizeof(*root_ids));
} }
if (he->has_root_cg_set) {
if (he->root_cg_set == 0) {
pr_err("Corrupted root cgset\n");
goto out_err;
}
root_cg_set = he->root_cg_set;
}
if (he->img_version != CRTOOLS_IMAGES_V1) { if (he->img_version != CRTOOLS_IMAGES_V1) {
pr_err("Not supported images version %u\n", he->img_version); pr_err("Not supported images version %u\n", he->img_version);
goto out_err; goto out_err;
......
...@@ -2,6 +2,10 @@ ...@@ -2,6 +2,10 @@
#define __CR_CGROUP_H__ #define __CR_CGROUP_H__
#include "asm/int.h" #include "asm/int.h"
struct pstree_item; struct pstree_item;
extern u32 root_cg_set;
int dump_task_cgroup(struct pstree_item *, u32 *); int dump_task_cgroup(struct pstree_item *, u32 *);
int dump_cgroups(void); int dump_cgroups(void);
int prepare_task_cgroup(struct pstree_item *);
int prepare_cgroup(void);
void fini_cgroup(void);
#endif /* __CR_CGROUP_H__ */ #endif /* __CR_CGROUP_H__ */
...@@ -43,6 +43,8 @@ struct rst_info { ...@@ -43,6 +43,8 @@ struct rst_info {
struct vm_area_list vmas; struct vm_area_list vmas;
struct _MmEntry *mm; struct _MmEntry *mm;
u32 cg_set;
union { union {
struct pstree_item *pgrp_leader; struct pstree_item *pgrp_leader;
futex_t pgrp_set; futex_t pgrp_set;
......
...@@ -16,6 +16,7 @@ enum sfd_type { ...@@ -16,6 +16,7 @@ enum sfd_type {
* For restore -- CRIU ns' proc * For restore -- CRIU ns' proc
*/ */
ROOT_FD_OFF, /* Root of the namespace we dump/restore */ ROOT_FD_OFF, /* Root of the namespace we dump/restore */
CGROUP_YARD,
SERVICE_FD_MAX SERVICE_FD_MAX
}; };
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment