Commit 5ad71a89 authored by Mike Rapoport, committed by Andrei Vagin

criu: lazy-pages: enable remoting of lazy pages

The remote lazy pages variant can be run as follows:

src# criu dump -t <pid> --lazy-pages --port 9876 -D /tmp/1 &
src# while ! sudo fuser 9876/tcp ; do sleep 1; done
src# scp -r /tmp/1/ dst:/tmp/

dst# criu lazy-pages --page-server --address dst --port 9876 -D /tmp/1 &
dst# criu restore --lazy-pages -D /tmp/1

In a nutshell, this implementation of remote lazy pages does the following:

- dump collects the process memory into the pipes, transfers non-lazy pages
  to the images or to the page-server on the restore side. The lazy pages
  are kept in pipes for later transfer
- when the dump creates the page_pipe_bufs, it marks the buffers containing
potentially lazy pages with PPB_LAZY
- at the dump_finish stage, the dump side starts TCP server that will
handle page requests from the restore side
- the checkpoint directory is transferred to the restore side
- on the restore side lazy-pages daemon is started, it creates UNIX socket
to receive uffd's from the restore and a TCP socket to forward page
requests to the dump side
- restore creates memory mappings and fills the VMAs that cannot be handled
by uffd with the contents of the pages*img.
- restore registers lazy VMAs with uffd and sends the userfault file
descriptors to the lazy-pages daemon
- when a #PF occurs, the lazy-pages daemon sends PS_IOV_GET command to the dump
side; the command contains PID, the faulting address and amount of pages
(always 1 at the moment)
- the dump side extracts the requested pages from the pipe and splices them
into the TCP socket.
- the lazy-pages daemon copies the received pages into the restored process
address space
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
parent 9982fc9e
......@@ -1187,6 +1187,7 @@ static int pre_dump_one_task(struct pstree_item *item)
item->pid->ns[0].virt = misc.pid;
mdc.pre_dump = true;
mdc.lazy = false;
ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
if (ret)
......@@ -1346,6 +1347,7 @@ static int dump_one_task(struct pstree_item *item)
}
mdc.pre_dump = false;
mdc.lazy = opts.lazy_pages;
ret = parasite_dump_pages_seized(item, &vmas, &mdc, parasite_ctl);
if (ret)
......@@ -1387,7 +1389,10 @@ static int dump_one_task(struct pstree_item *item)
goto err;
}
ret = compel_cure(parasite_ctl);
if (opts.lazy_pages)
ret = compel_cure_remote(parasite_ctl);
else
ret = compel_cure(parasite_ctl);
if (ret) {
pr_err("Can't cure (pid: %d) from parasite\n", pid);
goto err;
......@@ -1591,6 +1596,27 @@ err:
return cr_pre_dump_finish(ret);
}
/*
 * Serve the lazy pages that dump left sitting in the per-task page
 * pipes to the restore side, then tear the pipes and the parasite
 * control areas down.  Returns the page server's status (0 on success).
 */
static int cr_lazy_mem_dump(void)
{
	struct pstree_item *iter;
	int ret;

	pr_info("Starting lazy pages server\n");
	/*
	 * NOTE(review): presumably runs the transfer to completion before
	 * returning — the success/failure report below assumes so.
	 */
	ret = cr_page_server(false, -1);

	/* The pipes were kept past dump solely for this transfer */
	for_each_pstree_item(iter) {
		destroy_page_pipe(dmpi(iter)->mem_pp);
		compel_cure_local(dmpi(iter)->parasite_ctl);
	}

	if (!ret)
		pr_info("Lazy pages transfer finished successfully\n");
	else
		pr_err("Lazy pages transfer FAILED.\n");

	return ret;
}
static int cr_dump_finish(int ret)
{
int post_dump_ret = 0;
......@@ -1650,6 +1676,10 @@ static int cr_dump_finish(int ret)
delete_link_remaps();
clean_cr_time_mounts();
}
if (opts.lazy_pages)
ret = cr_lazy_mem_dump();
arch_set_thread_regs(root_item);
pstree_switch_state(root_item,
(ret || post_dump_ret) ?
......
......@@ -13,6 +13,7 @@ struct vma_area;
/*
 * Control knobs passed to parasite_dump_pages_seized() selecting how
 * the collected memory pages are handled.
 */
struct mem_dump_ctl {
	bool pre_dump;	/* pre-dump pass: keep the page pipe for the final dump */
	bool lazy;	/* --lazy-pages: keep lazy pages in the pipe for later transfer */
};
extern bool vma_has_guard_gap_hidden(struct vma_area *vma);
......
......@@ -267,7 +267,7 @@ static int drain_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
return 0;
}
static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer, bool lazy)
{
int ret;
......@@ -276,7 +276,7 @@ static int xfer_pages(struct page_pipe *pp, struct page_xfer *xfer)
* pre-dump action (see pre_dump_one_task)
*/
timing_start(TIME_MEMWRITE);
ret = page_xfer_dump_pages(xfer, pp, 0, true);
ret = page_xfer_dump_pages(xfer, pp, 0, !lazy);
timing_stop(TIME_MEMWRITE);
return ret;
......@@ -316,7 +316,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
return -1;
ret = -1;
if (!mdc->pre_dump)
if (!(mdc->pre_dump || mdc->lazy))
/*
* Chunk mode pushes pages portion by portion. This mode
* only works when we don't need to keep pp for later
......@@ -324,7 +324,8 @@ static int __parasite_dump_pages_seized(struct pstree_item *item,
*/
cpp_flags |= PP_CHUNK_MODE;
pp = create_page_pipe(vma_area_list->priv_size,
pargs_iovs(args), cpp_flags);
mdc->lazy ? NULL : pargs_iovs(args),
cpp_flags);
if (!pp)
goto out;
......@@ -378,7 +379,7 @@ again:
ret = drain_pages(pp, ctl, args);
if (!ret)
ret = xfer_pages(pp, &xfer);
ret = xfer_pages(pp, &xfer, mdc->lazy /* false actually */);
if (!ret) {
page_pipe_reinit(pp);
goto again;
......@@ -389,9 +390,12 @@ again:
goto out_xfer;
}
if (mdc->lazy)
memcpy(pargs_iovs(args), pp->iovs,
sizeof(struct iovec) * pp->nr_iovs);
ret = drain_pages(pp, ctl, args);
if (!ret && !mdc->pre_dump)
ret = xfer_pages(pp, &xfer);
ret = xfer_pages(pp, &xfer, mdc->lazy);
if (ret)
goto out_xfer;
......@@ -406,7 +410,7 @@ out_xfer:
if (!mdc->pre_dump)
xfer.close(&xfer);
out_pp:
if (ret || !mdc->pre_dump)
if (ret || !(mdc->pre_dump || mdc->lazy))
destroy_page_pipe(pp);
else
dmpi(item)->mem_pp = pp;
......
......@@ -35,6 +35,7 @@
#include "xmalloc.h"
#include <compel/plugins/std/syscall-codes.h>
#include "restorer.h"
#include "page-xfer.h"
#undef LOG_PREFIX
#define LOG_PREFIX "lazy-pages: "
......@@ -366,7 +367,10 @@ static int uffd_copy_page(struct lazy_pages_info *lpi, __u64 address,
struct uffdio_copy uffdio_copy;
int rc;
rc = get_page(lpi, address, dest);
if (opts.use_page_server)
rc = get_remote_pages(lpi->pid, address, 1, dest);
else
rc = get_page(lpi, address, dest);
if (rc <= 0)
return rc;
......@@ -854,6 +858,9 @@ int cr_lazy_pages()
if (prepare_uffds(epollfd))
return -1;
if (connect_to_page_server())
return -1;
ret = handle_requests(epollfd, events);
lpi_hash_fini();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment