Commit c7b5efd0 authored by Pavel Emelyanov's avatar Pavel Emelyanov

ipc: Restore mprotected sysvshmems

Imagine a process has done shmget(2 pages), then shmat(), then
mprotect(1 page, ro). In this case criu will dump 1 shmem
segment 2 pages long and 2 vmas 1 page each.

But on restore time we'll call shmat() for _each_ vma and the
very first one will occupy the whole 2 pages space in vm
(there's no size argument for shmat, only for shmget) thus
blocking the 2nd vma from shmat()-in again.

The solution is:

1. check that each shmem segment is attached by
   a sequence of vmas that cover it without holes

2. shmat() only the first one

3. mprotect() all of them if needed (there are no hunks
   for this step in this patch; mprotect is already
   called in pie/restorer.c and does things well)

v2:

* List can contain anon shmems (caught by zdtm)
* There can be many attachments of a segment (caught by transition/ipc)
Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
parent 5a44eca1
...@@ -71,6 +71,7 @@ ...@@ -71,6 +71,7 @@
#include "timerfd.h" #include "timerfd.h"
#include "file-lock.h" #include "file-lock.h"
#include "action-scripts.h" #include "action-scripts.h"
#include "shmem.h"
#include "aio.h" #include "aio.h"
#include "lsm.h" #include "lsm.h"
#include "seccomp.h" #include "seccomp.h"
...@@ -683,7 +684,7 @@ static int open_vmas(int pid) ...@@ -683,7 +684,7 @@ static int open_vmas(int pid)
vma->e->pgoff, vma->e->status); vma->e->pgoff, vma->e->status);
if (vma_area_is(vma, VMA_AREA_SYSVIPC)) if (vma_area_is(vma, VMA_AREA_SYSVIPC))
ret = vma->e->shmid; ret = get_sysv_shmem_fd(vma->e);
else if (vma_area_is(vma, VMA_ANON_SHARED)) else if (vma_area_is(vma, VMA_ANON_SHARED))
ret = get_shmem_fd(pid, vma->e); ret = get_shmem_fd(pid, vma->e);
else if (vma_area_is(vma, VMA_FILE_SHARED)) else if (vma_area_is(vma, VMA_FILE_SHARED))
...@@ -922,6 +923,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) ...@@ -922,6 +923,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
if (open_vmas(pid)) if (open_vmas(pid))
return -1; return -1;
if (fixup_sysv_shmems())
return -1;
if (open_cores(pid, core)) if (open_cores(pid, core))
return -1; return -1;
......
...@@ -6,10 +6,14 @@ ...@@ -6,10 +6,14 @@
struct _VmaEntry; struct _VmaEntry;
extern int collect_shmem(int pid, struct _VmaEntry *vi); extern int collect_shmem(int pid, struct _VmaEntry *vi);
extern int collect_sysv_shmem(unsigned long shmid, unsigned long size);
extern void show_saved_shmems(void); extern void show_saved_shmems(void);
extern int get_shmem_fd(int pid, VmaEntry *vi); extern int get_shmem_fd(int pid, VmaEntry *vi);
extern int get_sysv_shmem_fd(struct _VmaEntry *vi);
extern int cr_dump_shmem(void); extern int cr_dump_shmem(void);
extern int add_shmem_area(pid_t pid, VmaEntry *vma); extern int add_shmem_area(pid_t pid, VmaEntry *vma);
extern int fixup_sysv_shmems(void);
#define SYSV_SHMEM_SKIP_FD (0x7fffffff)
#endif /* __CR_SHMEM_H__ */ #endif /* __CR_SHMEM_H__ */
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "namespaces.h" #include "namespaces.h"
#include "sysctl.h" #include "sysctl.h"
#include "ipc_ns.h" #include "ipc_ns.h"
#include "shmem.h"
#include "protobuf.h" #include "protobuf.h"
#include "images/ipc-var.pb-c.h" #include "images/ipc-var.pb-c.h"
...@@ -759,6 +760,9 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm) ...@@ -759,6 +760,9 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm)
}; };
struct shmid_ds shmid; struct shmid_ds shmid;
if (collect_sysv_shmem(shm->desc->id, shm->size))
return -1;
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC); ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
if (ret < 0) { if (ret < 0) {
pr_err("Failed to set desired IPC shm ID\n"); pr_err("Failed to set desired IPC shm ID\n");
......
...@@ -37,6 +37,7 @@ ...@@ -37,6 +37,7 @@
#include "images/creds.pb-c.h" #include "images/creds.pb-c.h"
#include "images/mm.pb-c.h" #include "images/mm.pb-c.h"
#include "shmem.h"
#include "asm/restorer.h" #include "asm/restorer.h"
#ifndef PR_SET_PDEATHSIG #ifndef PR_SET_PDEATHSIG
...@@ -510,9 +511,18 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry) ...@@ -510,9 +511,18 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
int flags = vma_entry->flags | MAP_FIXED; int flags = vma_entry->flags | MAP_FIXED;
unsigned long addr; unsigned long addr;
if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC)) if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC)) {
/*
* See comment in get_sysv_shmem_fd() for what SYSV_SHMEM_SKIP_FD
* means and why we check for PROT_EXEC few lines below.
*/
if (vma_entry->fd == SYSV_SHMEM_SKIP_FD)
return vma_entry->start;
pr_info("Attach SYSV shmem %d at %"PRIx64"\n", (int)vma_entry->fd, vma_entry->start);
return sys_shmat(vma_entry->fd, decode_pointer(vma_entry->start), return sys_shmat(vma_entry->fd, decode_pointer(vma_entry->start),
(vma_entry->prot & PROT_WRITE) ? 0 : SHM_RDONLY); vma_entry->prot & PROT_EXEC ? 0 : SHM_RDONLY);
}
/* /*
* Restore or shared mappings are tricky, since * Restore or shared mappings are tricky, since
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
* it's opened in cr-restor, because pgoff may be non zero * it's opened in cr-restor, because pgoff may be non zero
*/ */
struct shmem_info { struct shmem_info {
struct list_head l;
unsigned long shmid; unsigned long shmid;
unsigned long size; unsigned long size;
int pid; int pid;
...@@ -46,10 +47,32 @@ struct shmem_info { ...@@ -46,10 +47,32 @@ struct shmem_info {
*/ */
int count; /* the number of regions */ int count; /* the number of regions */
int self_count; /* the number of regions, which belongs to "pid" */ int self_count; /* the number of regions, which belongs to "pid" */
};
/*
* Entries collected while creating the sysv shmem
* segment. As they sit in the same list with shmem_info-s
* their initial fields should match!
*/
struct shmem_sysv_info {
struct list_head l;
unsigned long shmid;
unsigned long size;
int pid;
struct list_head att; /* list of shmem_sysv_att-s */
int want_write;
};
struct shmem_sysv_att {
struct list_head l; struct list_head l;
VmaEntry *first;
unsigned long prev_end;
}; };
/* This is the "pid that will restore shmem" value for sysv */
#define SYSVIPC_SHMEM_PID (-1)
/* /*
* This list is filled with shared objects before we fork * This list is filled with shared objects before we fork
* any tasks. Thus the head is private (COW-ed) and the * any tasks. Thus the head is private (COW-ed) and the
...@@ -77,6 +100,151 @@ static struct shmem_info *find_shmem_by_id(unsigned long shmid) ...@@ -77,6 +100,151 @@ static struct shmem_info *find_shmem_by_id(unsigned long shmid)
return NULL; return NULL;
} }
/*
 * Remember a sysv shmem segment (id and size) in the common
 * shmems list, so that later vma attaches can be matched
 * against it in get_sysv_shmem_fd(). Returns 0 on success,
 * -1 if the descriptor cannot be allocated.
 */
int collect_sysv_shmem(unsigned long shmid, unsigned long size)
{
	struct shmem_sysv_info *info;

	/*
	 * Tasks will not modify this object, so don't
	 * shmalloc() as we do it for anon shared mem
	 */
	info = malloc(sizeof(*info));
	if (info == NULL)
		return -1;

	INIT_LIST_HEAD(&info->att);
	info->want_write = 0;
	info->size = size;
	info->shmid = shmid;
	/* Sentinel pid marks this entry as sysv, not anon shmem */
	info->pid = SYSVIPC_SHMEM_PID;
	list_add_tail(&info->l, &shmems);

	pr_info("Collected SysV shmem %lx, size %ld\n", info->shmid, info->size);
	return 0;
}
/*
 * Post-scan pass over all collected sysv shmem segments, run after
 * open_vmas() has walked every vma (see get_sysv_shmem_fd()):
 *
 * 1. Verify each attach chain covers its segment to the very end
 *    (intermediate holes are already rejected in get_sysv_shmem_fd()).
 * 2. If any vma of the segment needs write access, mark the chain's
 *    first vma with PROT_EXEC -- the restorer uses this bit to decide
 *    between rw and ro shmat() (see comment in get_sysv_shmem_fd()).
 *
 * Returns 0 on success, -1 on an unsupported layout.
 */
int fixup_sysv_shmems(void)
{
	struct shmem_sysv_info *si;
	struct shmem_sysv_att *att;

	/*
	 * NOTE: shmems holds both anon shmem_info-s and sysv
	 * shmem_sysv_info-s; their initial fields match, so the
	 * pid check below safely tells them apart.
	 */
	list_for_each_entry(si, &shmems, l) {
		/* It can be anon shmem */
		if (si->pid != SYSVIPC_SHMEM_PID)
			continue;

		list_for_each_entry(att, &si->att, l) {
			/*
			 * Same thing is checked in get_sysv_shmem_fd() for
			 * intermediate holes.
			 */
			if (att->first->start + si->size != att->prev_end) {
				pr_err("Sysv shmem %lx with tail hole not supported\n", si->shmid);
				return -1;
			}

			/*
			 * See comment in get_sysv_shmem_fd() about this PROT_EXEC
			 */
			if (si->want_write)
				att->first->prot |= PROT_EXEC;
		}
	}

	return 0;
}
/*
 * Called from open_vmas() for each VMA_AREA_SYSVIPC vma. Returns the
 * value to put into vme->fd: the shmem id for the first vma of an
 * attachment (the restorer will shmat() it), SYSV_SHMEM_SKIP_FD for
 * the follow-up vmas of the same attachment, or -1 on error.
 */
int get_sysv_shmem_fd(VmaEntry *vme)
{
	struct shmem_sysv_info *si;
	struct shmem_sysv_att *att;
	int ret_fd;

	/*
	 * The shmems list mixes anon and sysv entries with matching
	 * initial fields, hence the cast and the pid check below.
	 */
	si = (struct shmem_sysv_info *)find_shmem_by_id(vme->shmid);
	if (!si) {
		pr_err("Can't find sysv shmem for %lx\n", vme->shmid);
		return -1;
	}

	if (si->pid != SYSVIPC_SHMEM_PID) {
		pr_err("SysV shmem vma %lx points to anon vma %lx\n",
				vme->start, si->shmid);
		return -1;
	}

	/*
	 * We can have a chain of VMAs belonging to the same
	 * sysv shmem segment all with different access rights
	 * (ro and rw). But single shmat() system call attaches
	 * the whole segment regardless of the actual mapping
	 * size. This can be achieved by attaching a segment
	 * and then write-protecting its parts.
	 *
	 * So, to restore this thing we note the very first
	 * area of the segment and make it restore the whole
	 * thing. All the subsequent ones will carry the sign
	 * telling the restorer to omit shmat and only do the
	 * ro protection. Yes, it may happen that some sysv
	 * shmem vma-s sit in the list (and restorer's array)
	 * for no use.
	 *
	 * Holes in between are not handled now, as well as
	 * the hole at the end (see fixup_sysv_shmems).
	 *
	 * One corner case. At shmat() time we need to know
	 * whether to create the segment rw or ro, but the
	 * first vma can have different protection. So the
	 * segment ro-ness is marked with PROT_EXEC bit in
	 * the first vma. Unfortunately, we only know this
	 * after we scan all the vmas, so this bit is set
	 * at the end in fixup_sysv_shmems().
	 */

	if (vme->pgoff == 0) {
		/* Head of a new attachment: the restorer shmat()-s this one */
		att = xmalloc(sizeof(*att));
		if (!att)
			return -1;

		att->first = vme;
		list_add(&att->l, &si->att);

		ret_fd = si->shmid;
	} else {
		/*
		 * Continuation vma: must directly follow the previous
		 * chunk of the most recently started attachment.
		 */
		att = list_first_entry(&si->att, struct shmem_sysv_att, l);
		if (att->prev_end != vme->start) {
			pr_err("Sysv shmem %lx with a hole not supported\n", si->shmid);
			return -1;
		}
		if (vme->pgoff != att->prev_end - att->first->start) {
			pr_err("Sysv shmem %lx with misordered attach chunks\n", si->shmid);
			return -1;
		}

		/*
		 * Value that doesn't (shouldn't) match with any real
		 * sysv shmem ID (thus it cannot be 0, as shmem id can)
		 * and still is not negative to prevent open_vmas() from
		 * treating it as error.
		 */
		ret_fd = SYSV_SHMEM_SKIP_FD;
	}

	pr_info("Note 0x%"PRIx64"-0x%"PRIx64" as %lx sysvshmem\n", vme->start, vme->end, si->shmid);

	/* Advance the chain so the next continuation vma can be validated */
	att->prev_end = vme->end;
	if (!vme->has_fdflags || vme->fdflags == O_RDWR)
		/*
		 * We can't look at vma->prot & PROT_WRITE as all this stuff
		 * can be read-protected. If !has_fdflags these are old images
		 * and ... we have no other choice other than make it with
		 * maximum access :(
		 */
		si->want_write = 1;

	return ret_fd;
}
int collect_shmem(int pid, VmaEntry *vi) int collect_shmem(int pid, VmaEntry *vi)
{ {
unsigned long size = vi->pgoff + vi->end - vi->start; unsigned long size = vi->pgoff + vi->end - vi->start;
...@@ -84,6 +252,10 @@ int collect_shmem(int pid, VmaEntry *vi) ...@@ -84,6 +252,10 @@ int collect_shmem(int pid, VmaEntry *vi)
si = find_shmem_by_id(vi->shmid); si = find_shmem_by_id(vi->shmid);
if (si) { if (si) {
if (si->pid == SYSVIPC_SHMEM_PID) {
pr_err("Shmem %lx already collected as SYSVIPC\n", vi->shmid);
return -1;
}
if (si->size < size) if (si->size < size)
si->size = size; si->size = size;
...@@ -210,6 +382,8 @@ int get_shmem_fd(int pid, VmaEntry *vi) ...@@ -210,6 +382,8 @@ int get_shmem_fd(int pid, VmaEntry *vi)
return -1; return -1;
} }
BUG_ON(si->pid == SYSVIPC_SHMEM_PID);
if (si->pid != pid) if (si->pid != pid)
return shmem_wait_and_open(pid, si); return shmem_wait_and_open(pid, si);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment