Commit 0d8aec0c authored by Tycho Andersen's avatar Tycho Andersen Committed by Pavel Emelyanov

seccomp: add initial support for SECCOMP_MODE_STRICT

Unfortunately, SECCOMP_MODE_FILTER is not currently exposed to userspace,
so we can't checkpoint that. In any case, this is what we need to do for
SECCOMP_MODE_STRICT, so let's do it.

This patch works by first disabling seccomp for any processes who are going
to have seccomp filters restored, then restoring the process (including the
seccomp filters), and finally resuming the seccomp filters before detaching
from the process.

v2 changes:

* update for kernel patch v2
* use protobuf enum for seccomp type
* don't parse /proc/pid/status twice

v3 changes:

* get rid of extra CR_STAGE_SECCOMP_SUSPEND stage
* only suspend seccomp in finalize_restore(), just before the unmap
* restore the (same) seccomp state in threads too; also add a note about
  how this is slightly wrong, and that we should at least check for a
  mismatch
Signed-off-by: 's avatarTycho Andersen <tycho.andersen@canonical.com>
Signed-off-by: 's avatarPavel Emelyanov <xemul@parallels.com>
parent bf4243e3
...@@ -35,6 +35,9 @@ endif ...@@ -35,6 +35,9 @@ endif
ifeq ($(call try-cc,$(PTRACE_PEEKSIGINFO_TEST),),y) ifeq ($(call try-cc,$(PTRACE_PEEKSIGINFO_TEST),),y)
$(Q) @echo '#define CONFIG_HAS_PEEKSIGINFO_ARGS' >> $@ $(Q) @echo '#define CONFIG_HAS_PEEKSIGINFO_ARGS' >> $@
endif endif
ifeq ($(call try-cc,$(PTRACE_SUSPEND_SECCOMP_TEST),),y)
$(Q) @echo '#define CONFIG_HAS_SUSPEND_SECCOMP' >> $@
endif
ifeq ($(VDSO),y) ifeq ($(VDSO),y)
$(Q) @echo '#define CONFIG_VDSO' >> $@ $(Q) @echo '#define CONFIG_VDSO' >> $@
endif endif
......
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
#include <sched.h> #include <sched.h>
#include <sys/resource.h> #include <sys/resource.h>
#include <linux/seccomp.h>
#include "protobuf.h" #include "protobuf.h"
#include "protobuf/fdinfo.pb-c.h" #include "protobuf/fdinfo.pb-c.h"
#include "protobuf/fs.pb-c.h" #include "protobuf/fs.pb-c.h"
...@@ -672,6 +674,12 @@ static int dump_task_core_all(struct pstree_item *item, ...@@ -672,6 +674,12 @@ static int dump_task_core_all(struct pstree_item *item,
if (ret < 0) if (ret < 0)
goto err; goto err;
if (item->seccomp_mode != SECCOMP_MODE_DISABLED) {
pr_info("got seccomp mode %d for %d\n", item->seccomp_mode, item->pid.virt);
core->tc->has_seccomp_mode = true;
core->tc->seccomp_mode = item->seccomp_mode;
}
strncpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN); strncpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN);
core->tc->flags = stat->flags; core->tc->flags = stat->flags;
core->tc->task_state = item->state; core->tc->task_state = item->state;
...@@ -801,7 +809,7 @@ static int collect_children(struct pstree_item *item) ...@@ -801,7 +809,7 @@ static int collect_children(struct pstree_item *item)
goto free; goto free;
} }
ret = seize_task(pid, item->pid.real); ret = seize_task(pid, item->pid.real, &c->seccomp_mode);
if (ret < 0) { if (ret < 0) {
/* /*
* Here is a race window between parse_children() and seize(), * Here is a race window between parse_children() and seize(),
...@@ -913,7 +921,14 @@ static int collect_threads(struct pstree_item *item) ...@@ -913,7 +921,14 @@ static int collect_threads(struct pstree_item *item)
pr_info("\tSeizing %d's %d thread\n", pr_info("\tSeizing %d's %d thread\n",
item->pid.real, pid); item->pid.real, pid);
ret = seize_task(pid, item_ppid(item)); /*
* FIXME: The NULL here is wrong; we really want to get the
* seccomp state of this thread, but we have nowhere to put it,
* so for now we ignore it. We should at least check to see
* that the seccomp state is the same for all threads; we'll do
* this in a future series.
*/
ret = seize_task(pid, item_ppid(item), NULL);
if (ret < 0) { if (ret < 0) {
/* /*
* Here is a race window between parse_threads() and seize(), * Here is a race window between parse_threads() and seize(),
...@@ -1063,7 +1078,7 @@ static int collect_pstree(pid_t pid) ...@@ -1063,7 +1078,7 @@ static int collect_pstree(pid_t pid)
return -1; return -1;
root_item->pid.real = pid; root_item->pid.real = pid;
ret = seize_task(pid, -1); ret = seize_task(pid, -1, &root_item->seccomp_mode);
if (ret < 0) if (ret < 0)
goto err; goto err;
pr_info("Seized task %d, state %d\n", pid, ret); pr_info("Seized task %d, state %d\n", pid, ret);
......
...@@ -129,7 +129,7 @@ int cr_exec(int pid, char **opt) ...@@ -129,7 +129,7 @@ int cr_exec(int pid, char **opt)
goto out; goto out;
} }
prev_state = ret = seize_task(pid, -1); prev_state = ret = seize_task(pid, -1, NULL);
if (ret < 0) { if (ret < 0) {
pr_err("Can't seize task %d\n", pid); pr_err("Can't seize task %d\n", pid);
goto out; goto out;
......
...@@ -24,6 +24,8 @@ ...@@ -24,6 +24,8 @@
#include <sys/sendfile.h> #include <sys/sendfile.h>
#include <linux/seccomp.h>
#include "ptrace.h" #include "ptrace.h"
#include "compiler.h" #include "compiler.h"
#include "asm/types.h" #include "asm/types.h"
...@@ -1060,6 +1062,7 @@ static inline int fork_with_pid(struct pstree_item *item) ...@@ -1060,6 +1062,7 @@ static inline int fork_with_pid(struct pstree_item *item)
item->state = ca.core->tc->task_state; item->state = ca.core->tc->task_state;
rsti(item)->cg_set = ca.core->tc->cg_set; rsti(item)->cg_set = ca.core->tc->cg_set;
item->seccomp_mode = ca.core->tc->seccomp_mode;
if (item->state == TASK_DEAD) if (item->state == TASK_DEAD)
rsti(item->parent)->nr_zombies++; rsti(item->parent)->nr_zombies++;
...@@ -1682,6 +1685,17 @@ static void finalize_restore(int status) ...@@ -1682,6 +1685,17 @@ static void finalize_restore(int status)
goto detach; goto detach;
/* Unmap the restorer blob */ /* Unmap the restorer blob */
/*
* Suspend seccomp if necessary. We need to do this because
* although seccomp is restored at the very end of the
* restorer blob (and the final sigreturn is ok), here we're
* doing an munmap in the process, which may be blocked by
* seccomp and cause the task to be killed.
*/
if (item->seccomp_mode != SECCOMP_MODE_DISABLED && suspend_seccomp(pid) < 0)
pr_err("failed to suspend seccomp, restore will probably fail...\n");
ctl = parasite_prep_ctl(pid, NULL); ctl = parasite_prep_ctl(pid, NULL);
if (ctl == NULL) if (ctl == NULL)
goto detach; goto detach;
...@@ -2886,6 +2900,8 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core) ...@@ -2886,6 +2900,8 @@ static int sigreturn_restore(pid_t pid, CoreEntry *core)
#undef remap_array #undef remap_array
task_args->seccomp_mode = core->tc->seccomp_mode;
if (lsm) if (lsm)
task_args->creds.lsm_profile = rst_mem_remap_ptr(lsm_pos, RM_PRIVATE); task_args->creds.lsm_profile = rst_mem_remap_ptr(lsm_pos, RM_PRIVATE);
else else
......
...@@ -9,6 +9,9 @@ ...@@ -9,6 +9,9 @@
#ifndef PR_GET_NAME #ifndef PR_GET_NAME
# define PR_GET_NAME 16 # define PR_GET_NAME 16
#endif #endif
#ifndef PR_SET_SECCOMP
# define PR_SET_SECCOMP 22
#endif
#ifndef PR_CAPBSET_READ #ifndef PR_CAPBSET_READ
# define PR_CAPBSET_READ 23 # define PR_CAPBSET_READ 23
#endif #endif
......
...@@ -84,6 +84,8 @@ struct proc_status_creds { ...@@ -84,6 +84,8 @@ struct proc_status_creds {
char state; char state;
int ppid; int ppid;
int seccomp_mode;
}; };
struct mount_info; struct mount_info;
......
...@@ -24,6 +24,14 @@ struct pstree_item { ...@@ -24,6 +24,14 @@ struct pstree_item {
int state; /* TASK_XXX constants */ int state; /* TASK_XXX constants */
/*
* We keep the seccomp mode here temporarily between seizing and
* dumping the task to avoid parsing /proc/pid/status twice. We also
* use it on restore to hold the seccomp mode so that we don't have to
* keep track of each task's core entry in the main criu process.
*/
int seccomp_mode;
int nr_threads; /* number of threads */ int nr_threads; /* number of threads */
struct pid *threads; /* array of threads */ struct pid *threads; /* array of threads */
CoreEntry **core; CoreEntry **core;
......
...@@ -11,6 +11,10 @@ ...@@ -11,6 +11,10 @@
# define PTRACE_SEIZE 0x4206 # define PTRACE_SEIZE 0x4206
#endif #endif
#ifndef PTRACE_O_SUSPEND_SECCOMP
# define PTRACE_O_SUSPEND_SECCOMP (1 << 21)
#endif
#ifndef PTRACE_INTERRUPT #ifndef PTRACE_INTERRUPT
# define PTRACE_INTERRUPT 0x4207 # define PTRACE_INTERRUPT 0x4207
#endif #endif
...@@ -62,7 +66,8 @@ struct ptrace_peeksiginfo_args { ...@@ -62,7 +66,8 @@ struct ptrace_peeksiginfo_args {
#define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8) #define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8)
extern int seize_task(pid_t pid, pid_t ppid); extern int seize_task(pid_t pid, pid_t ppid, int *seccomp_mode);
extern int suspend_seccomp(pid_t pid);
extern int unseize_task(pid_t pid, int orig_state, int state); extern int unseize_task(pid_t pid, int orig_state, int state);
extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes);
......
...@@ -165,6 +165,8 @@ struct task_restore_args { ...@@ -165,6 +165,8 @@ struct task_restore_args {
*/ */
int proc_fd; int proc_fd;
int seccomp_mode;
#ifdef CONFIG_VDSO #ifdef CONFIG_VDSO
unsigned long vdso_rt_size; unsigned long vdso_rt_size;
struct vdso_symtable vdso_sym_rt; /* runtime vdso symbols */ struct vdso_symtable vdso_sym_rt; /* runtime vdso symbols */
......
...@@ -40,6 +40,18 @@ ...@@ -40,6 +40,18 @@
#define PR_SET_PDEATHSIG 1 #define PR_SET_PDEATHSIG 1
#endif #endif
#ifndef SECCOMP_MODE_DISABLED
#define SECCOMP_MODE_DISABLED 0
#endif
#ifndef SECCOMP_MODE_STRICT
#define SECCOMP_MODE_STRICT 1
#endif
#ifndef SECCOMP_MODE_FILTER
#define SECCOMP_MODE_FILTER 2
#endif
#define sys_prctl_safe(opcode, val1, val2, val3) \ #define sys_prctl_safe(opcode, val1, val2, val3) \
({ \ ({ \
long __ret = sys_prctl(opcode, val1, val2, val3, 0); \ long __ret = sys_prctl(opcode, val1, val2, val3, 0); \
...@@ -339,6 +351,32 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group) ...@@ -339,6 +351,32 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
return 0; return 0;
} }
static void restore_seccomp(int seccomp_mode)
{
switch (seccomp_mode) {
case SECCOMP_MODE_DISABLED:
return;
case SECCOMP_MODE_STRICT:
if (sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0))
goto die;
return;
case SECCOMP_MODE_FILTER:
goto die;
default:
goto die;
}
die:
/*
* If preparing any seccomp state failed, we should make sure this
* process doesn't continue so that it can't do things outside the
* sandbox. Unfortunately, the rest of the restore has to continue
* since we're too late in the process to stop it and have unlocked the
* network.
*/
sys_exit_group(1);
}
static int restore_thread_common(struct rt_sigframe *sigframe, static int restore_thread_common(struct rt_sigframe *sigframe,
struct thread_restore_args *args) struct thread_restore_args *args)
{ {
...@@ -417,9 +455,15 @@ long __export_restore_thread(struct thread_restore_args *args) ...@@ -417,9 +455,15 @@ long __export_restore_thread(struct thread_restore_args *args)
restore_finish_stage(CR_STATE_RESTORE_SIGCHLD); restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);
restore_pdeath_sig(args); restore_pdeath_sig(args);
if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
pr_info("restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
restore_finish_stage(CR_STATE_RESTORE_CREDS); restore_finish_stage(CR_STATE_RESTORE_CREDS);
futex_dec_and_wake(&thread_inprogress); futex_dec_and_wake(&thread_inprogress);
restore_seccomp(args->ta->seccomp_mode);
new_sp = (long)rt_sigframe + SIGFRAME_OFFSET; new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;
rst_sigreturn(new_sp); rst_sigreturn(new_sp);
...@@ -1199,6 +1243,13 @@ long __export_restore_task(struct task_restore_args *args) ...@@ -1199,6 +1243,13 @@ long __export_restore_task(struct task_restore_args *args)
futex_set_and_wake(&thread_inprogress, args->nr_threads); futex_set_and_wake(&thread_inprogress, args->nr_threads);
/*
* We have to close the log before restoring seccomp, because
* SECCOMP_MODE_STRICT blocks close().
*/
if (args->seccomp_mode != SECCOMP_MODE_DISABLED)
pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
restore_finish_stage(CR_STATE_RESTORE_CREDS); restore_finish_stage(CR_STATE_RESTORE_CREDS);
if (ret) if (ret)
...@@ -1230,6 +1281,8 @@ long __export_restore_task(struct task_restore_args *args) ...@@ -1230,6 +1281,8 @@ long __export_restore_task(struct task_restore_args *args)
sys_munmap(args->rst_mem, args->rst_mem_size); sys_munmap(args->rst_mem, args->rst_mem_size);
restore_seccomp(args->seccomp_mode);
/* /*
* Sigframe stack. * Sigframe stack.
*/ */
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include <string.h> #include <string.h>
#include <ctype.h> #include <ctype.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/seccomp.h>
#include "asm/types.h" #include "asm/types.h"
#include "list.h" #include "list.h"
...@@ -779,7 +780,7 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr) ...@@ -779,7 +780,7 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr)
if (bfdopenr(&f)) if (bfdopenr(&f))
return -1; return -1;
while (done < 8) { while (done < 9) {
str = breadline(&f); str = breadline(&f);
if (str == NULL) if (str == NULL)
break; break;
...@@ -840,9 +841,22 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr) ...@@ -840,9 +841,22 @@ int parse_pid_status(pid_t pid, struct proc_status_creds *cr)
done++; done++;
} }
if (!strncmp(str, "Seccomp:", 8)) {
if (sscanf(str + 9, "%d", &cr->seccomp_mode) != 1) {
goto err_parse;
}
if (cr->seccomp_mode == SECCOMP_MODE_FILTER) {
pr_err("SECCOMP_MODE_FILTER not currently supported\n");
goto err_parse;
}
done++;
}
} }
if (done == 8) if (done == 9)
ret = 0; ret = 0;
err_parse: err_parse:
......
...@@ -10,6 +10,15 @@ import "siginfo.proto"; ...@@ -10,6 +10,15 @@ import "siginfo.proto";
import "opts.proto"; import "opts.proto";
/*
* These match the SECCOMP_MODE_* flags from <linux/seccomp.h>.
*/
enum seccomp_mode {
disabled = 0;
strict = 1;
filter = 2;
};
message task_core_entry { message task_core_entry {
required uint32 task_state = 1; required uint32 task_state = 1;
required uint32 exit_code = 2; required uint32 exit_code = 2;
...@@ -26,6 +35,8 @@ message task_core_entry { ...@@ -26,6 +35,8 @@ message task_core_entry {
optional uint32 cg_set = 9; optional uint32 cg_set = 9;
optional signal_queue_entry signals_s = 10; optional signal_queue_entry signals_s = 10;
optional seccomp_mode seccomp_mode = 11;
} }
message task_kobj_ids_entry { message task_kobj_ids_entry {
......
...@@ -8,11 +8,14 @@ ...@@ -8,11 +8,14 @@
#include <limits.h> #include <limits.h>
#include <signal.h> #include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/time.h> #include <sys/time.h>
#include <sys/resource.h> #include <sys/resource.h>
#include <sys/wait.h> #include <sys/wait.h>
#include <linux/seccomp.h>
#include "compiler.h" #include "compiler.h"
#include "asm/types.h" #include "asm/types.h"
#include "util.h" #include "util.h"
...@@ -30,14 +33,32 @@ int unseize_task(pid_t pid, int orig_st, int st) ...@@ -30,14 +33,32 @@ int unseize_task(pid_t pid, int orig_st, int st)
else if (st == TASK_STOPPED) { else if (st == TASK_STOPPED) {
if (orig_st == TASK_ALIVE) if (orig_st == TASK_ALIVE)
kill(pid, SIGSTOP); kill(pid, SIGSTOP);
} else if (st == TASK_ALIVE) } else if (st == TASK_ALIVE) {
/* do nothing */ ; /* do nothing */ ;
else } else
pr_err("Unknown final state %d\n", st); pr_err("Unknown final state %d\n", st);
return ptrace(PTRACE_DETACH, pid, NULL, NULL); return ptrace(PTRACE_DETACH, pid, NULL, NULL);
} }
#ifdef CONFIG_HAS_SUSPEND_SECCOMP
int suspend_seccomp(pid_t pid)
{
if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) {
pr_perror("suspending seccomp failed");
return -1;
}
return 0;
}
#else
int suspend_seccomp(pid_t pid)
{
pr_err("seccomp enabled and seccomp suspending not supported\n");
return -1;
}
#endif
/* /*
* This routine seizes task putting it into a special * This routine seizes task putting it into a special
* state where we can manipulate the task via ptrace * state where we can manipulate the task via ptrace
...@@ -46,7 +67,7 @@ int unseize_task(pid_t pid, int orig_st, int st) ...@@ -46,7 +67,7 @@ int unseize_task(pid_t pid, int orig_st, int st)
* up with someone else. * up with someone else.
*/ */
int seize_task(pid_t pid, pid_t ppid) int seize_task(pid_t pid, pid_t ppid, int *seccomp_mode)
{ {
siginfo_t si; siginfo_t si;
int status; int status;
...@@ -90,6 +111,9 @@ try_again: ...@@ -90,6 +111,9 @@ try_again:
if (ret2) if (ret2)
goto err; goto err;
if (seccomp_mode)
*seccomp_mode = cr.seccomp_mode;
if (!may_dump(&cr)) { if (!may_dump(&cr)) {
pr_err("Check uid (pid: %d) failed\n", pid); pr_err("Check uid (pid: %d) failed\n", pid);
goto err; goto err;
...@@ -142,6 +166,9 @@ try_again: ...@@ -142,6 +166,9 @@ try_again:
goto try_again; goto try_again;
} }
if (cr.seccomp_mode != SECCOMP_MODE_DISABLED && suspend_seccomp(pid) < 0)
goto err_stop;
if (si.si_signo == SIGTRAP) if (si.si_signo == SIGTRAP)
return TASK_ALIVE; return TASK_ALIVE;
else if (si.si_signo == SIGSTOP) { else if (si.si_signo == SIGSTOP) {
......
...@@ -92,3 +92,14 @@ int main(int argc, char *argv[], char *envp[]) ...@@ -92,3 +92,14 @@ int main(int argc, char *argv[], char *envp[])
} }
endef endef
define PTRACE_SUSPEND_SECCOMP_TEST
#include <linux/ptrace.h>
int main(void)
{
return PTRACE_O_SUSPEND_SECCOMP;
}
endef
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment