Commit 700419bc authored by Mike Rapoport, committed by Andrei Vagin

criu: lazy-pages: replace page list with IOVs list

Instead of tracking memory handled by userfaultfd on a per-page basis, we can
use IOVs for contiguous chunks.

travis-ci: success for uffd: A new set of improvements
Signed-off-by: Mike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
parent 0086dca4
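
To make the change concrete before reading the diff, here is a minimal
standalone sketch (stand-in struct and PAGE_SZ constant, not CRIU code): a run
of contiguous lazy pages that previously required one uffd_pages_struct list
node per page is described by a single IOV.

/* Standalone sketch: per-page tracking vs. one IOV per contiguous chunk.
 * Not CRIU code; PAGE_SZ and struct iov are local stand-ins. */
#include <stdio.h>

#define PAGE_SZ 4096UL

struct iov { unsigned long base, len; };

int main(void)
{
	/* Three contiguous lazy pages starting at an arbitrary address. */
	unsigned long pages[] = { 0x7f0000000000, 0x7f0000001000, 0x7f0000002000 };

	/* Old scheme: one list node per page -> 3 entries to allocate and scan.
	 * New scheme: a single IOV covers the whole run. */
	struct iov iov = { .base = pages[0], .len = 3 * PAGE_SZ };

	printf("one IOV: base 0x%lx, len %lu pages\n", iov.base, iov.len / PAGE_SZ);
	return 0;
}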
@@ -47,11 +47,17 @@
 static mutex_t *lazy_sock_mutex;
 
+struct lazy_iovec {
+	struct list_head l;
+	unsigned long base;
+	unsigned long len;
+};
+
 struct lazy_pages_info {
 	int pid;
 	int uffd;
 
-	struct list_head pages;
+	struct list_head iovs;
 	struct page_read pr;
@@ -72,7 +78,7 @@ static struct lazy_pages_info *lpi_init(void)
 		return NULL;
 
 	memset(lpi, 0, sizeof(*lpi));
-	INIT_LIST_HEAD(&lpi->pages);
+	INIT_LIST_HEAD(&lpi->iovs);
 	INIT_LIST_HEAD(&lpi->l);
 
 	return lpi;
@@ -80,8 +86,12 @@ static struct lazy_pages_info *lpi_init(void)
 static void lpi_fini(struct lazy_pages_info *lpi)
 {
+	struct lazy_iovec *p, *n;
+
 	if (!lpi)
 		return;
+	list_for_each_entry_safe(p, n, &lpi->iovs, l)
+		xfree(p);
 	if (lpi->uffd > 0)
 		close(lpi->uffd);
 	if (lpi->pr.close)
@@ -288,118 +298,135 @@ out:
 	return -1;
 }
 
-#define UFFD_FLAG_SENT	0x1
+static MmEntry *init_mm_entry(struct lazy_pages_info *lpi)
+{
+	struct cr_img *img;
+	MmEntry *mm;
+	int ret;
 
-struct uffd_pages_struct {
-	struct list_head list;
-	unsigned long addr;
-	int flags;
-};
+	img = open_image(CR_FD_MM, O_RSTR, lpi->pid);
+	if (!img)
+		return NULL;
+
+	ret = pb_read_one_eof(img, &mm, PB_MM);
+	close_image(img);
+	if (ret == -1)
+		return NULL;
+
+	pr_debug("Found %zd VMAs in image\n", mm->n_vmas);
+
+	return mm;
+}
 
-static int collect_uffd_pages(struct lazy_pages_info *lpi, MmEntry *mm)
+static int update_lazy_iovecs(struct lazy_pages_info *lpi, unsigned long addr,
+			      int len)
 {
-	unsigned long base;
-	int i, j;
-	struct iovec iov;
-	unsigned long nr_pages;
-	unsigned long ps;
-	int rc;
-	struct uffd_pages_struct *uffd_pages;
-	struct page_read *pr = &lpi->pr;
+	struct lazy_iovec *lazy_iov, *n;
 
-	rc = pr->get_pagemap(pr, &iov);
-	if (rc <= 0)
-		return 0;
+	list_for_each_entry_safe(lazy_iov, n, &lpi->iovs, l) {
+		unsigned long start = lazy_iov->base;
+		unsigned long end = start + lazy_iov->len;
 
-	ps = page_size();
-	nr_pages = iov.iov_len / ps;
-	base = (unsigned long) iov.iov_base;
-	pr_debug("iov.iov_base 0x%lx (%ld pages)\n", base, nr_pages);
+		if (len <= 0)
+			break;
 
-	for (i = 0; i < nr_pages; i++) {
-		bool uffd_page = false;
-		base = (unsigned long) iov.iov_base + (i * ps);
-		/*
-		 * Only pages which are MAP_ANONYMOUS and MAP_PRIVATE
-		 * are relevant for userfaultfd handling.
-		 * Loop over all VMAs to see if the flags matching.
-		 */
-		for (j = 0; j < mm->n_vmas; j++) {
-			VmaEntry *vma = mm->vmas[j];
-			/*
-			 * This loop assumes that base can actually be found
-			 * in the VMA list.
-			 */
-			if (base >= vma->start && base < vma->end) {
-				if (vma_entry_can_be_lazy(vma)) {
-					if(!pagemap_in_parent(pr->pe))
-						uffd_page = true;
-					break;
-				}
-			}
-		}
+		if (addr < start || addr >= end)
+			continue;
 
-		/* This is not a page we are looking for. Move along */
-		if (!uffd_page)
-			continue;
+		if (addr + len < end) {
+			if (addr == start) {
+				lazy_iov->base += len;
+				lazy_iov->len -= len;
+			} else {
+				struct lazy_iovec *new_iov;
 
-		pr_debug("Adding 0x%lx to our list\n", base);
+				lazy_iov->len -= (end - addr);
 
-		uffd_pages = xzalloc(sizeof(struct uffd_pages_struct));
-		if (!uffd_pages)
-			return -1;
-		uffd_pages->addr = base;
-		list_add(&uffd_pages->list, &lpi->pages);
-	}
+				new_iov = xzalloc(sizeof(*new_iov));
+				if (!new_iov)
+					return -1;
+
+				new_iov->base = addr + len;
+				new_iov->len = end - (addr + len);
+
+				list_add(&new_iov->l, &lazy_iov->l);
+			}
+			break;
+		}
+
+		if (addr == start) {
+			list_del(&lazy_iov->l);
+			xfree(lazy_iov);
+		} else {
+			lazy_iov->len -= (end - addr);
+		}
+
+		len -= (end - addr);
+		addr = end;
+	}
 
-	return 1;
+	return 0;
 }
 
 /*
- * Setting up criu infrastructure and scan for VMAs.
+ * Create a list of IOVs that can be handled using userfaultfd. The
+ * IOVs generally correspond to lazy pagemap entries, except the cases
+ * when a single pagemap entry covers several VMAs. In those cases
+ * IOVs are split at VMA boundaries because UFFDIO_COPY may be done
+ * only inside a single VMA.
+ * We assume here that pagemaps and VMAs are sorted.
  */
-static int find_vmas(struct lazy_pages_info *lpi)
+static int collect_lazy_iovecs(struct lazy_pages_info *lpi)
 {
-	struct cr_img *img;
-	int ret;
-	MmEntry *mm;
-	struct uffd_pages_struct *uffd_pages;
+	struct page_read *pr = &lpi->pr;
+	struct lazy_iovec *lazy_iov, *n;
+	MmEntry *mm;
+	int nr_pages = 0, n_vma = 0;
+	int ret = -1;
+	unsigned long start, end, len;
 
-	img = open_image(CR_FD_MM, O_RSTR, lpi->pid);
-	if (!img)
-		return -1;
+	mm = init_mm_entry(lpi);
+	if (!mm)
+		return -1;
 
-	ret = pb_read_one_eof(img, &mm, PB_MM);
-	close_image(img);
-	if (ret == -1)
-		return -1;
+	while (pr->advance(pr)) {
+		if (!pagemap_lazy(pr->pe))
+			continue;
 
-	pr_debug("Found %zd VMAs in image\n", mm->n_vmas);
+		start = pr->pe->vaddr;
+		end = start + pr->pe->nr_pages * page_size();
+		nr_pages += pr->pe->nr_pages;
 
-	ret = open_page_read(lpi->pid, &lpi->pr, PR_TASK);
-	if (ret <= 0) {
-		ret = -1;
-		goto out;
-	}
-	/*
-	 * This puts all pages which should be handled by userfaultfd
-	 * in the list uffd_list. This list is later used to detect if
-	 * a page has already been transferred or if it needs to be
-	 * pushed into the process using userfaultfd.
-	 */
-	do {
-		ret = collect_uffd_pages(lpi, mm);
-		if (ret == -1) {
-			goto out;
-		}
-	} while (ret);
+		for (; n_vma < mm->n_vmas; n_vma++) {
+			VmaEntry *vma = mm->vmas[n_vma];
 
-	/* Count detected pages */
-	list_for_each_entry(uffd_pages, &lpi->pages, list)
-		ret++;
+			if (start >= vma->end)
+				continue;
 
-	pr_debug("Found %d pages to be handled by UFFD\n", ret);
+			lazy_iov = xzalloc(sizeof(*lazy_iov));
+			if (!lazy_iov)
+				goto free_iovs;
 
-out:
+			len = min_t(uint64_t, end, vma->end) - start;
+			lazy_iov->base = start;
+			lazy_iov->len = len;
+			list_add_tail(&lazy_iov->l, &lpi->iovs);
+
+			if (end <= vma->end)
+				break;
+
+			start = vma->end;
+		}
+	}
+
+	ret = nr_pages;
+	goto free_mm;
+
+free_iovs:
+	list_for_each_entry_safe(lazy_iov, n, &lpi->iovs, l)
+		xfree(lazy_iov);
+free_mm:
+	mm_entry__free_unpacked(mm, NULL);
+
 	return ret;
 }
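
The trimming logic in update_lazy_iovecs() above is easier to see in
isolation. Below is a minimal standalone sketch (a hypothetical remove_range()
helper over a single IOV, not the CRIU function itself; the real code walks a
linked list and also frees an IOV that the range covers entirely): a range at
the head of an IOV advances the base, one at the tail shrinks the length, and
one in the middle splits the IOV in two.

/* Standalone sketch of the three trim cases when a page range
 * [addr, addr+len) is removed from an IOV [start, end). */
#include <stdio.h>

struct iov { unsigned long base, len; };

static void remove_range(struct iov *iov, unsigned long addr, unsigned long len)
{
	unsigned long start = iov->base, end = iov->base + iov->len;

	if (addr == start) {
		/* case 1: range at the head -> advance the base */
		iov->base += len;
		iov->len -= len;
	} else if (addr + len == end) {
		/* case 2: range at the tail -> shorten the IOV */
		iov->len -= len;
	} else {
		/* case 3: range in the middle -> the real code allocates a
		 * second IOV for [addr+len, end); here we just report it */
		iov->len = addr - start;
		printf("split off [0x%lx, 0x%lx)\n", addr + len, end);
	}
}

int main(void)
{
	struct iov iov = { .base = 0x1000, .len = 0x5000 };

	remove_range(&iov, 0x3000, 0x1000);	/* middle: splits the IOV */
	printf("remaining [0x%lx, 0x%lx)\n", iov.base, iov.base + iov.len);
	return 0;
}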
@@ -441,12 +468,22 @@ static int ud_open(int client, struct lazy_pages_info **_lpi)
 	uffd_flags = fcntl(lpi->uffd, F_GETFD, NULL);
 	pr_debug("uffd_flags are 0x%x\n", uffd_flags);
 
+	ret = open_page_read(lpi->pid, &lpi->pr, PR_TASK);
+	if (ret <= 0) {
+		ret = -1;
+		goto out;
+	}
+
 	/*
 	 * Find the memory pages belonging to the restored process
	 * so that it is trackable when all pages have been transferred.
	 */
-	if ((lpi->total_pages = find_vmas(lpi)) == -1)
+	ret = collect_lazy_iovecs(lpi);
+	if (ret < 0)
 		goto out;
+	lpi->total_pages = ret;
+
+	pr_debug("Found %ld pages to be handled by UFFD\n", lpi->total_pages);
 
 	list_add_tail(&lpi->l, &lpis);
 	*_lpi = lpi;
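
One design note on the hunk above: open_page_read() now runs in ud_open()
before the IOV scan, because collect_lazy_iovecs() walks the pagemaps through
lpi->pr, whereas previously find_vmas() opened the page reader itself.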
@@ -556,33 +593,33 @@ static int uffd_handle_page(struct lazy_pages_info *lpi, __u64 address,
 
 static int handle_remaining_pages(struct lazy_pages_info *lpi, void *dest)
 {
-	struct uffd_pages_struct *uffd_pages;
-	int rc;
+	struct lazy_iovec *lazy_iov;
+	int nr_pages, i, err;
+	unsigned long addr;
+
+	lpi->pr.reset(&lpi->pr);
 
-	list_for_each_entry(uffd_pages, &lpi->pages, list) {
-		pr_debug("Checking remaining pages 0x%lx (flags 0x%x)\n",
-			 uffd_pages->addr, uffd_pages->flags);
-		if (uffd_pages->flags & UFFD_FLAG_SENT)
-			continue;
+	list_for_each_entry(lazy_iov, &lpi->iovs, l) {
+		nr_pages = lazy_iov->len / PAGE_SIZE;
 
-		rc = uffd_handle_page(lpi, uffd_pages->addr, dest);
-		if (rc < 0) {
-			pr_err("Error during UFFD copy\n");
-			return -1;
-		}
+		for (i = 0; i < nr_pages; i++) {
+			addr = lazy_iov->base + i * PAGE_SIZE;
 
-		uffd_pages->flags |= UFFD_FLAG_SENT;
+			err = uffd_handle_page(lpi, addr, dest);
+			if (err < 0) {
+				pr_err("Error during UFFD copy\n");
+				return -1;
+			}
+		}
 	}
 
 	return 0;
 }
 
 static int handle_regular_pages(struct lazy_pages_info *lpi, void *dest,
 				__u64 address)
 {
 	int rc;
-	struct uffd_pages_struct *uffd_pages;
 
 	rc = uffd_handle_page(lpi, address, dest);
 	if (rc < 0) {
@@ -590,14 +627,9 @@ static int handle_regular_pages(struct lazy_pages_info *lpi, void *dest,
 		return -1;
 	}
 
-	/*
-	 * Mark this page as having been already transferred, so
-	 * that it has not to be copied again later.
-	 */
-	list_for_each_entry(uffd_pages, &lpi->pages, list) {
-		if (uffd_pages->addr == address)
-			uffd_pages->flags |= UFFD_FLAG_SENT;
-	}
+	rc = update_lazy_iovecs(lpi, address, PAGE_SIZE);
+	if (rc < 0)
+		return -1;
 
 	return 0;
 }
@@ -607,7 +639,6 @@ static int handle_user_fault(struct lazy_pages_info *lpi, void *dest)
 	struct uffd_msg msg;
 	__u64 flags;
 	__u64 address;
-	struct uffd_pages_struct *uffd_pages;
 	int ret;
 
 	ret = read(lpi->uffd, &msg, sizeof(msg));
@@ -632,11 +663,6 @@ static int handle_user_fault(struct lazy_pages_info *lpi, void *dest)
 	address = msg.arg.pagefault.address & ~(page_size() - 1);
 	pr_debug("msg.arg.pagefault.address 0x%llx\n", address);
 
-	/* Make sure to not transfer a page twice */
-	list_for_each_entry(uffd_pages, &lpi->pages, list)
-		if ((uffd_pages->addr == address) && (uffd_pages->flags & UFFD_FLAG_SENT))
-			return 0;
-
 	/* Now handle the pages actually requested. */
 	flags = msg.arg.pagefault.flags;
 	pr_debug("msg.arg.pagefault.flags 0x%llx\n", flags);