Commit 8801f596 authored by Pavel Emelyanov

mem: Protobuf format for page dumps

Since now we drain pages out of parasite, we can invent any format for
page dumps. Let it be ... a protobuf one! :)

Another thing to keep in mind, is that we're about to use splices and
implement iterative migration, so it's better to have actual pages be
page-aligned in the image.

And -- backward compatibility. That said the new format is:

1. pagemap-... file which contains a header (currently with an ID of
   the image with pages, see below) and an array of <nr_pages:vaddr>
   pairs. The first value means "how many pages to take from the
   file with pages (see below)" and the second -- where in the task
   address space to put them. Simple.

2. pages-... file which contains only pages one by one (thus aligned
   as we want).

This patch breaks backward compatibility (old images with pages will
be restored and then crash). Need to do it before v0.5 release.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
parent a59fd353
......@@ -1367,7 +1367,7 @@ static int dump_one_task(struct pstree_item *item)
}
}
ret = parasite_dump_pages_seized(parasite_ctl, &vmas, cr_fdset);
ret = parasite_dump_pages_seized(parasite_ctl, item->pid.virt, &vmas, cr_fdset);
if (ret) {
pr_err("Can't dump pages (pid: %d) with parasite\n", pid);
goto err_cure;
......
......@@ -61,6 +61,7 @@
#include "protobuf/itimer.pb-c.h"
#include "protobuf/vma.pb-c.h"
#include "protobuf/rlimit.pb-c.h"
#include "protobuf/pagemap.pb-c.h"
#include "asm/restore.h"
......@@ -260,7 +261,7 @@ static int map_private_vma(pid_t pid, struct vma_area *vma, void *tgt_addr,
static int restore_priv_vma_content(pid_t pid)
{
struct vma_area *vma;
int fd, ret = 0;
int fd, fd_pg, ret = 0;
unsigned int nr_restored = 0;
unsigned int nr_shared = 0;
......@@ -268,26 +269,29 @@ static int restore_priv_vma_content(pid_t pid)
vma = list_first_entry(&rst_vmas.h, struct vma_area, list);
fd = open_image_ro(CR_FD_PAGES, pid);
fd = open_image_ro(CR_FD_PAGEMAP, (long)pid);
if (fd < 0)
return -1;
fd_pg = open_pages_image(O_RSTR, fd);
if (fd_pg < 0) {
close(fd);
return -1;
}
/*
* Read page contents.
*/
while (1) {
uint64_t va, page_offset;
char buf[PAGE_SIZE];
void *p;
PagemapEntry *pe;
unsigned long off, i;
unsigned long va;
ret = read(fd, &va, sizeof(va));
if (!ret)
ret = pb_read_one_eof(fd, &pe, PB_PAGEMAP);
if (ret <= 0)
break;
if (ret != sizeof(va)) {
pr_err("Bad mapping page size %d\n", ret);
return -1;
}
va = (unsigned long)decode_pointer(pe->vaddr);
BUG_ON(va < vma->vma.start);
......@@ -296,19 +300,22 @@ static int restore_priv_vma_content(pid_t pid)
vma = list_entry(vma->list.next, struct vma_area, list);
}
page_offset = (va - vma->vma.start) / PAGE_SIZE;
off = (va - vma->vma.start) / PAGE_SIZE;
for (i = 0; i < pe->nr_pages; i++) {
unsigned char buf[PAGE_SIZE];
void *p;
set_bit(page_offset, vma->page_bitmap);
set_bit(off + i, vma->page_bitmap);
if (vma->ppage_bitmap)
clear_bit(page_offset, vma->ppage_bitmap);
clear_bit(off + i, vma->ppage_bitmap);
ret = read(fd, buf, PAGE_SIZE);
ret = read(fd_pg, buf, PAGE_SIZE);
if (ret != PAGE_SIZE) {
pr_err("Can'r read mapping page %d\n", ret);
return -1;
}
p = decode_pointer(va - vma->vma.start +
p = (void *)((off + i) * PAGE_SIZE +
vma_premmaped_start(&vma->vma));
if (memcmp(p, buf, PAGE_SIZE) == 0) {
nr_shared++;
......@@ -318,7 +325,13 @@ static int restore_priv_vma_content(pid_t pid)
memcpy(p, buf, PAGE_SIZE);
nr_restored++;
}
pagemap_entry__free_unpacked(pe, NULL);
}
close(fd_pg);
close(fd);
if (ret < 0)
return ret;
/* Remove pages, which were not shared with a child */
list_for_each_entry(vma, &rst_vmas.h, list) {
......
......@@ -42,6 +42,7 @@
#include "protobuf/creds.pb-c.h"
#include "protobuf/core.pb-c.h"
#include "protobuf/tty.pb-c.h"
#include "protobuf/pagemap.pb-c.h"
#define DEF_PAGES_PER_LINE 6
......@@ -211,40 +212,15 @@ void print_image_data(int fd, unsigned int length, int show)
xfree(data);
}
void show_pages(int fd_pages, struct cr_options *o)
void show_pagemap(int fd, struct cr_options *o)
{
pr_img_head(CR_FD_PAGES);
PagemapHead *h;
if (o->show_pages_content) {
while (1) {
struct page_entry e;
if (read_img_eof(fd_pages, &e) <= 0)
break;
print_data(e.va, e.data, PAGE_IMAGE_SIZE);
pr_msg("\n --- End of page ---\n\n");
}
} else {
while (1) {
struct page_entry e;
int i;
pr_msg("\t");
for (i = 0; i < DEF_PAGES_PER_LINE; i++) {
if (read_img_eof(fd_pages, &e) <= 0) {
pr_msg("\n");
goto out;
}
pr_msg("0x%16"PRIx64" ", e.va);
}
pr_msg("\n");
}
}
out:
pr_img_tail(CR_FD_PAGES);
if (pb_read_one(fd, &h, PB_PAGEMAP_HEAD) < 0)
return;
pr_msg("Pages id: %u\n", h->pages_id);
pagemap_head__free_unpacked(h, NULL);
return pb_show_plain(fd, PB_PAGEMAP);
}
void show_sigacts(int fd_sigacts, struct cr_options *o)
......
......@@ -15,6 +15,7 @@
#include "pstree.h"
#include "protobuf.h"
#include "protobuf/inventory.pb-c.h"
#include "protobuf/pagemap.pb-c.h"
bool fdinfo_per_id = false;
TaskKobjIdsEntry *root_ids;
......@@ -108,8 +109,8 @@ static void show_raw_image(int fd, struct cr_options *opt) {};
struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX] = {
FD_ENTRY(INVENTORY, "inventory", show_inventory),
FD_ENTRY(FDINFO, "fdinfo-%d", show_files),
FD_ENTRY(PAGES, "pages-%d", show_pages),
FD_ENTRY(SHMEM_PAGES, "pages-shmem-%ld", show_pages),
FD_ENTRY(PAGEMAP, "pagemap-%ld", show_pagemap),
FD_ENTRY(SHMEM_PAGEMAP, "pagemap-shmem-%ld", show_pagemap),
FD_ENTRY(REG_FILES, "reg-files", show_reg_files),
FD_ENTRY(EVENTFD, "eventfd", show_eventfds),
FD_ENTRY(EVENTPOLL, "eventpoll", show_eventpoll),
......@@ -153,6 +154,7 @@ struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX] = {
FD_ENTRY(TTY_INFO, "tty-info", show_tty_info),
FD_ENTRY(FILE_LOCKS, "filelocks-%d", show_file_locks),
FD_ENTRY(RLIMIT, "rlimit", show_rlimit),
FD_ENTRY(PAGES, "pages-%u", NULL),
};
static struct cr_fdset *alloc_cr_fdset(int nr)
......@@ -319,3 +321,25 @@ void close_image_dir(void)
{
close_service_fd(IMG_FD_OFF);
}
static unsigned long page_ids = 1;
/*
 * Open the pages image that is paired with the pagemap image @pm_fd.
 *
 * @flags: open flags, passed through to open_image(). When they equal
 *         O_RDONLY the pagemap head is read from @pm_fd to learn the
 *         pages image id; otherwise a fresh id is taken from the
 *         page_ids counter and a head carrying it is written to @pm_fd.
 * @pm_fd: descriptor of the pagemap image, positioned at its head.
 *
 * Returns whatever open_image() returns for CR_FD_PAGES with that id,
 * or -1 if the head could not be read or written.
 */
int open_pages_image(unsigned long flags, int pm_fd)
{
	unsigned id;

	if (flags != O_RDONLY) {
		/* Writer side: allocate the next id and record it in the head */
		PagemapHead head = PAGEMAP_HEAD__INIT;

		head.pages_id = page_ids++;
		id = head.pages_id;

		if (pb_write_one(pm_fd, &head, PB_PAGEMAP_HEAD) < 0)
			return -1;
	} else {
		/* Reader side: the head tells which pages image to open */
		PagemapHead *head;

		if (pb_read_one(pm_fd, &head, PB_PAGEMAP_HEAD) < 0)
			return -1;

		id = head->pages_id;
		pagemap_head__free_unpacked(head, NULL);
	}

	return open_image(CR_FD_PAGES, flags, id);
}
......@@ -22,7 +22,6 @@ enum {
_CR_FD_TASK_FROM,
CR_FD_FILE_LOCKS,
CR_FD_PAGES,
CR_FD_CORE,
CR_FD_IDS,
CR_FD_MM,
......@@ -34,6 +33,8 @@ enum {
CR_FD_RLIMIT,
_CR_FD_TASK_TO,
CR_FD_PAGEMAP,
/*
* NS entries
*/
......@@ -51,7 +52,7 @@ enum {
_CR_FD_NS_TO,
CR_FD_PSTREE,
CR_FD_SHMEM_PAGES,
CR_FD_SHMEM_PAGEMAP,
CR_FD_GHOST_FILE,
CR_FD_TCP_STREAM,
CR_FD_FDINFO,
......@@ -80,6 +81,7 @@ enum {
_CR_FD_GLOB_TO,
CR_FD_TMPFS,
CR_FD_PAGES,
CR_FD_MAX
};
......@@ -141,7 +143,7 @@ struct cr_fd_desc_tmpl {
};
void show_files(int fd_files, struct cr_options *o);
void show_pages(int fd_pages, struct cr_options *o);
void show_pagemap(int fd, struct cr_options *o);
void show_reg_files(int fd_reg_files, struct cr_options *o);
void show_core(int fd_core, struct cr_options *o);
void show_ids(int fd_ids, struct cr_options *o);
......@@ -177,6 +179,7 @@ extern void close_image_dir(void);
int open_image(int type, unsigned long flags, ...);
#define open_image_ro(type, ...) open_image(type, O_RDONLY, ##__VA_ARGS__)
int open_pages_image(unsigned long flags, int pm_fd);
#define LAST_PID_PATH "/proc/sys/kernel/ns_last_pid"
#define LAST_PID_PERM 0666
......@@ -209,6 +212,7 @@ int cr_exec(int pid, char **opts);
#define O_DUMP (O_RDWR | O_CREAT | O_EXCL)
#define O_SHOW (O_RDONLY)
#define O_RSTR (O_RDONLY)
struct cr_fdset *cr_task_fdset_open(int pid, int mode);
struct cr_fdset *cr_ns_fdset_open(int pid, int mode);
......
......@@ -22,8 +22,9 @@
#define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */
#define PSTREE_MAGIC 0x50273030 /* Kyiv */
#define FDINFO_MAGIC 0x56213732 /* Dmitrov */
#define PAGES_MAGIC 0x56084025 /* Vladimir */
#define SHMEM_PAGES_MAGIC PAGES_MAGIC
#define PAGEMAP_MAGIC 0x56084025 /* Vladimir */
#define SHMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC
#define PAGES_MAGIC RAW_IMAGE_MAGIC
#define CORE_MAGIC 0x55053847 /* Kolomna */
#define IDS_MAGIC 0x54432030 /* Konigsberg */
#define VMAS_MAGIC 0x54123737 /* Tula */
......
......@@ -23,6 +23,4 @@ struct page_pipe {
struct page_pipe *create_page_pipe(unsigned int nr, struct iovec *);
void destroy_page_pipe(struct page_pipe *p);
int page_pipe_add_page(struct page_pipe *p, unsigned long addr);
int page_pipe_iterate_pages(struct page_pipe *p,
int (*fn)(int rpipe, unsigned long addr, void *), void *);
#endif
......@@ -34,7 +34,7 @@ struct vm_area_list;
extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc);
struct _CredsEntry;
extern int parasite_dump_creds(struct parasite_ctl *ctl, struct _CredsEntry *ce);
extern int parasite_dump_pages_seized(struct parasite_ctl *ctl,
extern int parasite_dump_pages_seized(struct parasite_ctl *ctl, int vpid,
struct vm_area_list *vma_area_list,
struct cr_fdset *cr_fdset);
struct parasite_dump_thread;
......
......@@ -48,6 +48,8 @@ enum {
PB_FILE_LOCK,
PB_RLIMIT,
PB_IDS,
PB_PAGEMAP_HEAD,
PB_PAGEMAP,
PB_MAX
};
......
......@@ -135,55 +135,3 @@ int page_pipe_add_page(struct page_pipe *pp, unsigned long addr)
BUG_ON(ret > 0);
return ret;
}
static int page_buf_iterate(struct page_pipe_buf *ppb,
int (*fn)(int rpipe, unsigned long addr, void *), void *a)
{
unsigned int pg;
unsigned long addr, seg_end;
struct iovec *iov;
pr_debug("Iterate ppb of %u pages, %u segs\n", ppb->pages_in, ppb->nr_segs);
iov = &ppb->iov[0];
addr = (unsigned long)iov->iov_base;
seg_end = addr + iov->iov_len;
for (pg = 0; pg < ppb->pages_in; pg++) {
int ret;
if (addr >= seg_end) {
iov++;
BUG_ON(iov - ppb->iov >= ppb->nr_segs);
addr = (unsigned long)iov->iov_base;
seg_end = addr + iov->iov_len;
}
ret = fn(ppb->p[0], addr, a);
if (ret)
return ret;
addr += PAGE_SIZE;
}
return 0;
}
int page_pipe_iterate_pages(struct page_pipe *pp,
int (*fn)(int rpipe, unsigned long addr, void *), void *a)
{
int ret = 0;
struct page_pipe_buf *ppb;
pr_debug("Iterate pp\n");
list_for_each_entry(ppb, &pp->bufs, l) {
ret = page_buf_iterate(ppb, fn, a);
if (ret)
break;
}
pr_debug("Done iteration\n");
return ret;
}
......@@ -10,6 +10,7 @@
#include "protobuf/itimer.pb-c.h"
#include "protobuf/creds.pb-c.h"
#include "protobuf/core.pb-c.h"
#include "protobuf/pagemap.pb-c.h"
#include "syscall.h"
#include "ptrace.h"
......@@ -543,38 +544,12 @@ static int generate_iovs(struct vma_area *vma, int pagemap, struct page_pipe *pp
return 0;
}
static int dump_one_page(int pipe, unsigned long addr, void *arg)
{
int fd = *(int *)arg;
u64 iaddr;
iaddr = encode_pointer((void *)addr);
if (write_img(fd, &iaddr))
return -1;
if (splice(pipe, NULL, fd, NULL, PAGE_SIZE,
SPLICE_F_MOVE | SPLICE_F_NONBLOCK) != PAGE_SIZE) {
pr_perror("Can't splice page from page-pipe");
return -1;
}
return 0;
}
static int dump_pages_to_image(struct page_pipe *pp, struct cr_fdset *fds)
{
int fd;
fd = fdset_fd(fds, CR_FD_PAGES);
return page_pipe_iterate_pages(pp, dump_one_page, &fd);
}
int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct vm_area_list *vma_area_list,
struct cr_fdset *cr_fdset)
int parasite_dump_pages_seized(struct parasite_ctl *ctl, int vpid,
struct vm_area_list *vma_area_list, struct cr_fdset *cr_fdset)
{
struct parasite_dump_pages_args *args;
u64 *map;
int pagemap;
int pagemap, fd, fd_pg;
struct page_pipe *pp;
struct page_pipe_buf *ppb;
struct vma_area *vma_area;
......@@ -629,8 +604,44 @@ int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct vm_area_list *vm
args->off += args->nr;
}
ret = dump_pages_to_image(pp, cr_fdset);
fd = open_image(CR_FD_PAGEMAP, O_DUMP, (long)vpid);
if (fd < 0)
goto out_pp;
fd_pg = open_pages_image(O_DUMP, fd);
if (fd_pg < 0)
goto out_fd;
ret = -1;
list_for_each_entry(ppb, &pp->bufs, l) {
int i;
pr_debug("Dump pages %d/%d\n", ppb->pages_in, ppb->nr_segs);
for (i = 0; i < ppb->nr_segs; i++) {
PagemapEntry pe = PAGEMAP_ENTRY__INIT;
struct iovec *iov = &ppb->iov[i];
pe.vaddr = encode_pointer(iov->iov_base);
pe.nr_pages = iov->iov_len / PAGE_SIZE;
pr_debug("\t%p [%u]\n", iov->iov_base,
(unsigned int)(iov->iov_len / PAGE_SIZE));
if (pb_write_one(fd, &pe, PB_PAGEMAP) < 0)
break;
if (splice(ppb->p[0], NULL, fd_pg, NULL, iov->iov_len,
SPLICE_F_MOVE) != iov->iov_len)
break;
}
if (i != ppb->nr_segs)
goto out_fds;
}
ret = 0;
out_fds:
close(fd_pg);
out_fd:
close(fd);
out_pp:
destroy_page_pipe(pp);
out_close:
......
......@@ -51,6 +51,7 @@
#include "protobuf/tty.pb-c.h"
#include "protobuf/file-lock.pb-c.h"
#include "protobuf/rlimit.pb-c.h"
#include "protobuf/pagemap.pb-c.h"
typedef size_t (*pb_getpksize_t)(void *obj);
typedef size_t (*pb_pack_t)(void *obj, void *where);
......@@ -132,6 +133,8 @@ void cr_pb_init(void)
CR_PB_DESC(TTY_INFO, TtyInfo, tty_info);
CR_PB_DESC(FILE_LOCK, FileLock, file_lock);
CR_PB_DESC(RLIMIT, Rlimit, rlimit);
CR_PB_MDESC_INIT(cr_pb_descs[PB_PAGEMAP_HEAD], PagemapHead, pagemap_head);
CR_PB_DESC(PAGEMAP, Pagemap, pagemap);
}
/*
......
......@@ -50,6 +50,7 @@ proto-obj-y += netdev.o
proto-obj-y += tty.o
proto-obj-y += file-lock.o
proto-obj-y += rlimit.o
proto-obj-y += pagemap.o
proto := $(proto-obj-y:.o=)
proto-c := $(proto-obj-y:.o=.pb-c.c)
......
// Header stored at the start of a pagemap image.
message pagemap_head {
// Id of the pages image that holds the page contents for this pagemap.
required uint32 pages_id = 1;
}
// One run of consecutive pages described by a pagemap image.
message pagemap_entry {
// Where the run goes: an encoded address in the task address space for
// task dumps, or a byte offset into the segment for shared memory dumps.
required uint64 vaddr = 1;
// How many pages to take from the pages image for this run.
required uint32 nr_pages = 2;
}
......@@ -6,8 +6,8 @@
#include "image.h"
#include "crtools.h"
#include "restorer.h"
#include "protobuf.h"
#include "protobuf/pagemap.pb-c.h"
struct shmems *rst_shmems;
......@@ -134,30 +134,44 @@ static int shmem_wait_and_open(int pid, struct shmem_info *si)
static int restore_shmem_content(void *addr, struct shmem_info *si)
{
u64 offset;
int fd, ret = 0;
int fd, fd_pg, ret = 0;
fd = open_image_ro(CR_FD_SHMEM_PAGES, si->shmid);
if (fd < 0) {
munmap(addr, si->size);
return -1;
}
fd = open_image_ro(CR_FD_SHMEM_PAGEMAP, si->shmid);
if (fd < 0)
goto err_unmap;
fd_pg = open_pages_image(O_RSTR, fd);
if (fd_pg < 0)
goto out_close;
while (1) {
ret = read_img_buf_eof(fd, &offset, sizeof(offset));
PagemapEntry *pe;
ret = pb_read_one_eof(fd, &pe, PB_PAGEMAP);
if (ret <= 0)
break;
if (offset + PAGE_SIZE > si->size)
if (pe->vaddr + pe->nr_pages * PAGE_SIZE > si->size)
break;
ret = read_img_buf(fd, addr + offset, PAGE_SIZE);
if (ret < 0)
ret = read(fd_pg, addr + pe->vaddr, pe->nr_pages * PAGE_SIZE);
if (ret != pe->nr_pages * PAGE_SIZE) {
ret = -1;
break;
}
pagemap_entry__free_unpacked(pe, NULL);
}
close(fd_pg);
close(fd);
return ret;
out_close:
close(fd);
err_unmap:
munmap(addr, si->size);
return -1;
}
int get_shmem_fd(int pid, VmaEntry *vi)
......@@ -283,13 +297,15 @@ int add_shmem_area(pid_t pid, VmaEntry *vma)
int cr_dump_shmem(void)
{
int i, err, fd;
int i, err, fd, fd_pg;
unsigned char *map = NULL;
void *addr = NULL;
struct shmem_info_dump *si;
unsigned long pfn, nrpages;
for_each_shmem_dump (i, si) {
PagemapEntry pe = PAGEMAP_ENTRY__INIT;
pr_info("Dumping shared memory 0x%lx\n", si->shmid);
nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
......@@ -320,25 +336,42 @@ int cr_dump_shmem(void)
if (err)
goto err_unmap;
fd = open_image(CR_FD_SHMEM_PAGES, O_DUMP, si->shmid);
fd = open_image(CR_FD_SHMEM_PAGEMAP, O_DUMP, si->shmid);
if (fd < 0)
goto err_unmap;
fd_pg = open_pages_image(O_DUMP, fd);
if (fd_pg < 0)
goto err_close;
pe.nr_pages = 0;
for (pfn = 0; pfn < nrpages; pfn++) {
u64 offset = pfn * PAGE_SIZE;
if (!(map[pfn] & PAGE_RSS))
if (map[pfn] & PAGE_RSS) {
if (!pe.nr_pages)
pe.vaddr = offset;
pe.nr_pages++;
if (pfn + 1 < nrpages)
continue;
}
if (!pe.nr_pages)
continue;
if (write_img_buf(fd, &offset, sizeof(offset)))
if (pb_write_one(fd, &pe, PB_PAGEMAP))
break;
if (write_img_buf(fd, addr + offset, PAGE_SIZE))
if (write(fd_pg, addr + pe.vaddr, pe.nr_pages * PAGE_SIZE) !=
pe.nr_pages * PAGE_SIZE)
break;
pe.nr_pages = 0;
}
if (pfn != nrpages)
goto err_close;
goto err_close2;
close(fd_pg);
close(fd);
munmap(addr, si->size);
xfree(map);
......@@ -346,6 +379,8 @@ int cr_dump_shmem(void)
return 0;
err_close2:
close(fd_pg);
err_close:
close(fd);
err_unmap:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment