Commit 84eb0a19 authored by Pavel Emelyanov's avatar Pavel Emelyanov

criu: Restore tasks as siblings in swrk

Andrey validly pointed out, that restoring pdeath_sig is not
compatible with criu_restore_child() call -- after criu restore
children, it will exit and fire the pdeath_sig into restored
tree root, potentially killing it.

The fix for that could be -- when started in swrk more, criu can
restore tree not as children tasks, but as siblings, using the
CLONE_PARENT flag when fork()-ing the root task.

With this we should also take care about errors handing -- right
now criu catches the SIGCHILD from dying children tasks, and
since we plan to create them be children of the criu parent (the
library caller) we will not be able to catch them. To do so we
SEIZE the root task in advance thus causing all SIGCHLD-s go to
criu, not to its parent.

Having this done we no longer need the SUBREAPER trick in the
library call -- tasks get restored right as callers kids :)

Some thoughts for future -- using this trick we can finally make
"natural" restoration of shell jobs. I.e. -- make criu restore
some subtree right under bash, w/o leaving itself as intermediate
task and w/o re-parenting the subtree to init after restore.
Signed-off-by: 's avatarPavel Emelyanov <xemul@parallels.com>
Acked-by: 's avatarAndrey Vagin <avagin@parallels.com>
parent fba8aae3
...@@ -1374,7 +1374,7 @@ static int restore_switch_stage(int next_stage) ...@@ -1374,7 +1374,7 @@ static int restore_switch_stage(int next_stage)
return restore_wait_inprogress_tasks(); return restore_wait_inprogress_tasks();
} }
static int attach_to_tasks() static int attach_to_tasks(bool root_seized)
{ {
struct pstree_item *item; struct pstree_item *item;
...@@ -1394,9 +1394,21 @@ static int attach_to_tasks() ...@@ -1394,9 +1394,21 @@ static int attach_to_tasks()
for (i = 0; i < item->nr_threads; i++) { for (i = 0; i < item->nr_threads; i++) {
pid = item->threads[i].real; pid = item->threads[i].real;
if (ptrace(PTRACE_ATTACH, pid, 0, 0)) { if (item != root_item || !root_seized) {
pr_perror("Can't attach to %d", pid); if (ptrace(PTRACE_ATTACH, pid, 0, 0)) {
return -1; pr_perror("Can't attach to %d", pid);
return -1;
}
} else {
/*
* Root item is SEIZE-d, so we only need
* to stop one (INTERRUPT) to make wait4
* and SYSCALL below work.
*/
if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) {
pr_perror("Can't interrupt task");
return -1;
}
} }
if (wait4(pid, &status, __WALL, NULL) != pid) { if (wait4(pid, &status, __WALL, NULL) != pid) {
...@@ -1513,10 +1525,43 @@ static int restore_root_task(struct pstree_item *init) ...@@ -1513,10 +1525,43 @@ static int restore_root_task(struct pstree_item *init)
futex_set(&task_entries->nr_in_progress, futex_set(&task_entries->nr_in_progress,
stage_participants(CR_STATE_RESTORE_NS)); stage_participants(CR_STATE_RESTORE_NS));
/*
* This means we're called from lib's criu_restore_child().
* In that case create the root task as the child one to+
* the caller. This is the only way to correctly restore the
* pdeath_sig of the root task. But also looks nice.
*/
if (opts.swrk_restore)
init->rst->clone_flags |= CLONE_PARENT;
ret = fork_with_pid(init); ret = fork_with_pid(init);
if (ret < 0) if (ret < 0)
return -1; return -1;
if (opts.swrk_restore) {
/*
* Root task is not our sibling. This means, that
* we will not notice when (if) it dies in SIGCHLD
* handler, but we should. To do this -- attach to
* the guy with ptrace and (!) make the kernel
* deliver us the signal when it will get stopped.
* It will in case of e.g. segfault before handling
* the signal.
*/
act.sa_flags &= ~SA_NOCLDSTOP;
ret = sigaction(SIGCHLD, &act, NULL);
if (ret < 0) {
pr_perror("sigaction() failed");
goto out;
}
if (ptrace(PTRACE_SEIZE, init->pid.real, 0, 0)) {
pr_perror("Can't attach to init");
goto out;
}
}
pr_info("Wait until namespaces are created\n"); pr_info("Wait until namespaces are created\n");
ret = restore_wait_inprogress_tasks(); ret = restore_wait_inprogress_tasks();
if (ret) if (ret)
...@@ -1570,7 +1615,7 @@ static int restore_root_task(struct pstree_item *init) ...@@ -1570,7 +1615,7 @@ static int restore_root_task(struct pstree_item *init)
timing_stop(TIME_RESTORE); timing_stop(TIME_RESTORE);
ret = attach_to_tasks(); ret = attach_to_tasks(opts.swrk_restore);
pr_info("Restore finished successfully. Resuming tasks.\n"); pr_info("Restore finished successfully. Resuming tasks.\n");
futex_set_and_wake(&task_entries->start, CR_STATE_COMPLETE); futex_set_and_wake(&task_entries->start, CR_STATE_COMPLETE);
......
...@@ -183,14 +183,16 @@ int main(int argc, char *argv[]) ...@@ -183,14 +183,16 @@ int main(int argc, char *argv[])
if (init_service_fd()) if (init_service_fd())
return 1; return 1;
if (!strcmp(argv[1], "swrk")) if (!strcmp(argv[1], "swrk")) {
/* /*
* This is to start criu service worker from libcriu calls. * This is to start criu service worker from libcriu calls.
* The usage is "criu swrk <fd>" and is not for CLI/scripts. * The usage is "criu swrk <fd>" and is not for CLI/scripts.
* The arguments semantics can change at any tyme with the * The arguments semantics can change at any tyme with the
* corresponding lib call change. * corresponding lib call change.
*/ */
opts.swrk_restore = true;
return cr_service_work(atoi(argv[2])); return cr_service_work(atoi(argv[2]));
}
while (1) { while (1) {
idx = -1; idx = -1;
......
...@@ -34,6 +34,7 @@ struct cr_options { ...@@ -34,6 +34,7 @@ struct cr_options {
bool link_remap_ok; bool link_remap_ok;
unsigned int rst_namespaces_flags; unsigned int rst_namespaces_flags;
bool log_file_per_pid; bool log_file_per_pid;
bool swrk_restore;
char *output; char *output;
char *root; char *root;
char *pidfile; char *pidfile;
......
...@@ -15,10 +15,6 @@ ...@@ -15,10 +15,6 @@
#include "rpc.pb-c.h" #include "rpc.pb-c.h"
#include "cr-service-const.h" #include "cr-service-const.h"
#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER 36
#endif
const char *criu_lib_version = CRIU_VERSION; const char *criu_lib_version = CRIU_VERSION;
static char *service_address = CR_DEFAULT_SERVICE_ADDRESS; static char *service_address = CR_DEFAULT_SERVICE_ADDRESS;
...@@ -582,15 +578,6 @@ int criu_restore_child(void) ...@@ -582,15 +578,6 @@ int criu_restore_child(void)
if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sks)) if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sks))
goto out; goto out;
/*
* Set us as child subreaper so that after the swrk
* finishes restore and exits the restored subtree
* gets reparented to us.
*/
if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0))
goto err;
pid = fork(); pid = fork();
if (pid < 0) if (pid < 0)
goto err; goto err;
...@@ -633,8 +620,6 @@ int criu_restore_child(void) ...@@ -633,8 +620,6 @@ int criu_restore_child(void)
close(sks[0]); close(sks[0]);
waitpid(pid, NULL, 0); waitpid(pid, NULL, 0);
/* Drop the subreaper role _after_ swrk exits */
prctl(PR_SET_CHILD_SUBREAPER, 0, 0, 0);
if (!ret) { if (!ret) {
ret = resp->success ? resp->restore->pid : -EBADE; ret = resp->success ? resp->restore->pid : -EBADE;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment