Commit 2f545730 authored by Cyrill Gorcunov, committed by Pavel Emelyanov

seccomp: Add engine to checkpoint per-thread seccomp chains

To checkpoint per-thread seccomp filters we need
a significant rework of the dumping code. The general
idea is the following:

 - Each thread is tracked by its tid inside a global
   seccomp rbtree, thus we can easily add entries
   there or look them up on demand.

 - When we collect threads into pstree entries we fetch
   each thread's seccomp mode from the procfs parsing routine and
   allocate a new entry inside the rbtree to remember the seccomp mode.
   Note that at this moment we're not dumping real filters yet
   (because the filter data image is a single one for all consumers).

 - Once all tids are collected and our tree is complete we call the
   seccomp_collect_dump_filters helper, which walks every pstree entry
   and iterates over each tid inside the thread group calling
   seccomp_dump_thread, which in turn uses the ptrace engine to fetch
   filters and keep this data in memory.

   To optimize data usage we figure out if we can use the TSYNC flag
   on restore by calling the try_use_tsync helper: with the TSYNC flag
   the kernel automatically propagates the filter to all threads, thus
   we need to compare all filters inside the thread group for identity,
   since there is no other way to figure out if the user passed the
   TSYNC flag when creating the filters.

  - Finally dump_seccomp_filters is called, which does the actual write
    of the seccomp filter data into an image file.
Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
parent 0f5cce7a
...@@ -716,6 +716,8 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread ...@@ -716,6 +716,8 @@ int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread
tc->pdeath_sig = ti->pdeath_sig; tc->pdeath_sig = ti->pdeath_sig;
} }
} }
if (!ret)
ret = seccomp_dump_thread(pid, tc);
return ret; return ret;
} }
...@@ -729,7 +731,6 @@ static int dump_task_core_all(struct parasite_ctl *ctl, ...@@ -729,7 +731,6 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
CoreEntry *core = item->core[0]; CoreEntry *core = item->core[0];
pid_t pid = item->pid->real; pid_t pid = item->pid->real;
int ret = -1; int ret = -1;
struct proc_status_creds *creds;
struct parasite_dump_cgroup_args cgroup_args, *info = NULL; struct parasite_dump_cgroup_args cgroup_args, *info = NULL;
BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN); BUILD_BUG_ON(sizeof(cgroup_args) < PARASITE_ARG_SIZE_MIN);
...@@ -742,18 +743,6 @@ static int dump_task_core_all(struct parasite_ctl *ctl, ...@@ -742,18 +743,6 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
if (ret < 0) if (ret < 0)
goto err; goto err;
creds = dmpi(item)->pi_creds;
if (creds->s.seccomp_mode != SECCOMP_MODE_DISABLED) {
pr_info("got seccomp mode %d for %d\n", creds->s.seccomp_mode, vpid(item));
core->tc->has_old_seccomp_mode = true;
core->tc->old_seccomp_mode = creds->s.seccomp_mode;
if (creds->s.seccomp_mode == SECCOMP_MODE_FILTER) {
core->tc->has_old_seccomp_filter = true;
core->tc->old_seccomp_filter = creds->last_filter;
}
}
strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN);
core->tc->flags = stat->flags; core->tc->flags = stat->flags;
core->tc->task_state = item->pid->state; core->tc->task_state = item->pid->state;
...@@ -1521,6 +1510,7 @@ static int cr_pre_dump_finish(int ret) ...@@ -1521,6 +1510,7 @@ static int cr_pre_dump_finish(int ret)
} }
free_pstree(root_item); free_pstree(root_item);
seccomp_free_entries();
if (irmap_predump_run()) { if (irmap_predump_run()) {
ret = -1; ret = -1;
...@@ -1718,6 +1708,7 @@ static int cr_dump_finish(int ret) ...@@ -1718,6 +1708,7 @@ static int cr_dump_finish(int ret)
TASK_ALIVE : opts.final_state); TASK_ALIVE : opts.final_state);
timing_stop(TIME_FROZEN); timing_stop(TIME_FROZEN);
free_pstree(root_item); free_pstree(root_item);
seccomp_free_entries();
free_file_locks(); free_file_locks();
free_link_remaps(); free_link_remaps();
free_aufs_branches(); free_aufs_branches();
...@@ -1822,7 +1813,7 @@ int cr_dump_tasks(pid_t pid) ...@@ -1822,7 +1813,7 @@ int cr_dump_tasks(pid_t pid)
if (!glob_imgset) if (!glob_imgset)
goto err; goto err;
if (collect_seccomp_filters() < 0) if (seccomp_collect_dump_filters() < 0)
goto err; goto err;
/* Errors handled later in detect_pid_reuse */ /* Errors handled later in detect_pid_reuse */
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
#include "common/list.h" #include "common/list.h"
#include "common/lock.h" #include "common/lock.h"
#include "pid.h" #include "pid.h"
#include "xmalloc.h"
#include "images/core.pb-c.h" #include "images/core.pb-c.h"
/* /*
...@@ -53,11 +54,6 @@ static inline struct rst_info *rsti(struct pstree_item *i) ...@@ -53,11 +54,6 @@ static inline struct rst_info *rsti(struct pstree_item *i)
struct ns_id; struct ns_id;
struct dmp_info { struct dmp_info {
struct ns_id *netns; struct ns_id *netns;
/*
* We keep the creds here so that we can compare creds while seizing
* threads. Dumping tasks with different creds is not supported.
*/
struct proc_status_creds *pi_creds;
struct page_pipe *mem_pp; struct page_pipe *mem_pp;
struct parasite_ctl *parasite_ctl; struct parasite_ctl *parasite_ctl;
}; };
......
...@@ -29,15 +29,43 @@ ...@@ -29,15 +29,43 @@
struct thread_restore_args; struct thread_restore_args;
struct task_restore_args; struct task_restore_args;
struct pstree_item;
struct rb_node;
struct seccomp_info { /*
struct seccomp_info *prev; * seccomp filters are bound to @current->seccomp.filter
int id; * in the kernel, ie they are per thread structures.
SeccompFilter filter; *
* If filter is assigned then every subsequent call
* to fork() makes a copy of this @current->seccomp.filter
* pointer into child process.
*
* The thread group can share a filter if the filter
* is assigned with SECCOMP_FILTER_FLAG_TSYNC on group
* which has no filters yet.
*/
struct seccomp_filter_chain {
struct seccomp_filter_chain *prev;
SeccompFilter filter;
}; };
extern int collect_seccomp_filters(void); struct seccomp_entry {
extern int prepare_seccomp_filters(void); struct rb_node node;
struct seccomp_entry *next;
pid_t tid_real;
size_t img_filter_pos;
unsigned int mode;
struct seccomp_filter_chain *chain;
size_t nr_chains;
};
extern struct seccomp_entry *seccomp_lookup(pid_t tid_real, bool create, bool mandatory);
#define seccomp_find_entry(tid_real) seccomp_lookup(tid_real, false, true)
extern int seccomp_collect_entry(pid_t tid_real, unsigned int mode);
extern void seccomp_free_entries(void);
extern int seccomp_dump_thread(pid_t tid_real, ThreadCoreEntry *thread_core);
extern int seccomp_collect_dump_filters(void);
extern int seccomp_read_image(void); extern int seccomp_read_image(void);
extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta); extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta);
......
This diff is collapsed.
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "criu-log.h" #include "criu-log.h"
#include <compel/ptrace.h> #include <compel/ptrace.h>
#include "proc_parse.h" #include "proc_parse.h"
#include "seccomp.h"
#include "seize.h" #include "seize.h"
#include "stats.h" #include "stats.h"
#include "xmalloc.h" #include "xmalloc.h"
...@@ -458,7 +459,7 @@ static int collect_children(struct pstree_item *item) ...@@ -458,7 +459,7 @@ static int collect_children(struct pstree_item *item)
nr_inprogress = 0; nr_inprogress = 0;
for (i = 0; i < nr_children; i++) { for (i = 0; i < nr_children; i++) {
struct pstree_item *c; struct pstree_item *c;
struct proc_status_creds *creds; struct proc_status_creds creds;
pid_t pid = ch[i]; pid_t pid = ch[i];
/* Is it already frozen? */ /* Is it already frozen? */
...@@ -484,13 +485,7 @@ static int collect_children(struct pstree_item *item) ...@@ -484,13 +485,7 @@ static int collect_children(struct pstree_item *item)
/* fails when meets a zombie */ /* fails when meets a zombie */
compel_interrupt_task(pid); compel_interrupt_task(pid);
creds = xzalloc(sizeof(*creds)); ret = compel_wait_task(pid, item->pid->real, parse_pid_status, NULL, &creds.s, NULL);
if (!creds) {
ret = -1;
goto free;
}
ret = compel_wait_task(pid, item->pid->real, parse_pid_status, NULL, &creds->s, NULL);
if (ret < 0) { if (ret < 0) {
/* /*
* Here is a race window between parse_children() and seize(), * Here is a race window between parse_children() and seize(),
...@@ -501,7 +496,6 @@ static int collect_children(struct pstree_item *item) ...@@ -501,7 +496,6 @@ static int collect_children(struct pstree_item *item)
*/ */
ret = 0; ret = 0;
xfree(c); xfree(c);
xfree(creds);
continue; continue;
} }
...@@ -510,12 +504,15 @@ static int collect_children(struct pstree_item *item) ...@@ -510,12 +504,15 @@ static int collect_children(struct pstree_item *item)
else else
processes_to_wait--; processes_to_wait--;
dmpi(c)->pi_creds = creds;
c->pid->real = pid; c->pid->real = pid;
c->parent = item; c->parent = item;
c->pid->state = ret; c->pid->state = ret;
list_add_tail(&c->sibling, &item->children); list_add_tail(&c->sibling, &item->children);
ret = seccomp_collect_entry(pid, creds.s.seccomp_mode);
if (ret < 0)
goto free;
/* Here is a recursive call (Depth-first search) */ /* Here is a recursive call (Depth-first search) */
ret = collect_task(c); ret = collect_task(c);
if (ret < 0) if (ret < 0)
...@@ -626,51 +623,16 @@ static inline bool thread_collected(struct pstree_item *i, pid_t tid) ...@@ -626,51 +623,16 @@ static inline bool thread_collected(struct pstree_item *i, pid_t tid)
return false; return false;
} }
static bool creds_dumpable(struct proc_status_creds *parent,
struct proc_status_creds *child)
{
/*
* - seccomp filters should be passed via
* semantic comparison (FIXME) but for
* now we require them to be exactly
* identical
*/
if (parent->s.seccomp_mode != child->s.seccomp_mode ||
parent->last_filter != child->last_filter) {
if (!pr_quelled(LOG_DEBUG)) {
pr_debug("Creds undumpable (parent:child)\n"
" uids: %d:%d %d:%d %d:%d %d:%d\n"
" gids: %d:%d %d:%d %d:%d %d:%d\n"
" state: %d:%d"
" ppid: %d:%d\n"
" shdpnd: %llu:%llu\n"
" seccomp_mode: %d:%d\n"
" last_filter: %u:%u\n",
parent->uids[0], child->uids[0],
parent->uids[1], child->uids[1],
parent->uids[2], child->uids[2],
parent->uids[3], child->uids[3],
parent->gids[0], child->gids[0],
parent->gids[1], child->gids[1],
parent->gids[2], child->gids[2],
parent->gids[3], child->gids[3],
parent->s.state, child->s.state,
parent->s.ppid, child->s.ppid,
parent->s.shdpnd, child->s.shdpnd,
parent->s.seccomp_mode, child->s.seccomp_mode,
parent->last_filter, child->last_filter);
}
return false;
}
return true;
}
static int collect_threads(struct pstree_item *item) static int collect_threads(struct pstree_item *item)
{ {
struct seccomp_entry *task_seccomp_entry;
struct pid *threads = NULL; struct pid *threads = NULL;
int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0; int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;
task_seccomp_entry = seccomp_find_entry(item->pid->real);
if (!task_seccomp_entry)
goto err;
ret = parse_threads(item->pid->real, &threads, &nr_threads); ret = parse_threads(item->pid->real, &threads, &nr_threads);
if (ret < 0) if (ret < 0)
goto err; goto err;
...@@ -734,7 +696,7 @@ static int collect_threads(struct pstree_item *item) ...@@ -734,7 +696,7 @@ static int collect_threads(struct pstree_item *item)
goto err; goto err;
} }
if (!creds_dumpable(dmpi(item)->pi_creds, &t_creds)) if (seccomp_collect_entry(pid, t_creds.s.seccomp_mode))
goto err; goto err;
if (ret == TASK_STOPPED) { if (ret == TASK_STOPPED) {
...@@ -823,7 +785,7 @@ int collect_pstree(void) ...@@ -823,7 +785,7 @@ int collect_pstree(void)
{ {
pid_t pid = root_item->pid->real; pid_t pid = root_item->pid->real;
int ret = -1; int ret = -1;
struct proc_status_creds *creds; struct proc_status_creds creds;
timing_start(TIME_FREEZING); timing_start(TIME_FREEZING);
...@@ -842,11 +804,7 @@ int collect_pstree(void) ...@@ -842,11 +804,7 @@ int collect_pstree(void)
goto err; goto err;
} }
creds = xzalloc(sizeof(*creds)); ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds.s, NULL);
if (!creds)
goto err;
ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds->s, NULL);
if (ret < 0) if (ret < 0)
goto err; goto err;
...@@ -857,7 +815,10 @@ int collect_pstree(void) ...@@ -857,7 +815,10 @@ int collect_pstree(void)
pr_info("Seized task %d, state %d\n", pid, ret); pr_info("Seized task %d, state %d\n", pid, ret);
root_item->pid->state = ret; root_item->pid->state = ret;
dmpi(root_item)->pi_creds = creds;
ret = seccomp_collect_entry(pid, creds.s.seccomp_mode);
if (ret < 0)
goto err;
ret = collect_task(root_item); ret = collect_task(root_item);
if (ret < 0) if (ret < 0)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment