Commit 52242be3 authored by Pavel Emelyanov

dump: Stop using mincore and switch to /proc/pid/pagemap

We use mincore to check which pages we should take with us into
the image. Anonymous pages that are present or swapped should go; file
pages that are present but not yet COW-ed should not.

The mincore syscall wasn't very helpful with this -- it didn't detect
swap, but did detect some non present pages (pagecache). Plus it
didn't know anything about cow-ing filemaps.

Andrey Morton suggested using the pagemap file in proc, but it lacked
the important stuff -- the COW filemap bit. Now it's there and we
can switch to using it.

The mincore usage for shared memory is still there, as for _that_ case
it's correct.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
parent cee5e817
...@@ -1819,6 +1819,13 @@ static int cr_dump_shmem(void) ...@@ -1819,6 +1819,13 @@ static int cr_dump_shmem(void)
goto err; goto err;
} }
/*
* We can't use pagemap here, because this vma is
* not mapped to us at all, but mincore reports the
* pagecache status of a file, which is correct in
* this case.
*/
err = mincore(addr, si->size, map); err = mincore(addr, si->size, map);
if (err) if (err)
goto err_unmap; goto err_unmap;
......
...@@ -101,16 +101,18 @@ static void sys_write_msg(const char *msg) ...@@ -101,16 +101,18 @@ static void sys_write_msg(const char *msg)
sys_write(logfd, msg, size); sys_write(logfd, msg, size);
} }
static inline int should_dump_page(struct vma_entry *vmae, unsigned char mincore_flags) #define PME_PRESENT (1ULL << 63)
#define PME_SWAP (1ULL << 62)
#define PME_FILE (1ULL << 61)
static inline int should_dump_page(struct vma_entry *vmae, u64 pme)
{ {
#ifdef PAGE_ANON return (pme & (PME_PRESENT | PME_SWAP)) &&
if (vma_entry_is(vmae, VMA_FILE_PRIVATE)) /*
return mincore_flags & PAGE_ANON; * Optimisation for private mapping pages, that haven't
else * yet been COW-ed
return mincore_flags & PAGE_RSS; */
#else !(vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE));
return (mincore_flags & PAGE_RSS);
#endif
} }
static int fd_pages = -1; static int fd_pages = -1;
...@@ -137,29 +139,52 @@ static int dump_pages(struct parasite_dump_pages_args *args) ...@@ -137,29 +139,52 @@ static int dump_pages(struct parasite_dump_pages_args *args)
parasite_status_t *st = &args->status; parasite_status_t *st = &args->status;
unsigned long nrpages, pfn, length; unsigned long nrpages, pfn, length;
unsigned long prot_old, prot_new; unsigned long prot_old, prot_new;
unsigned char *map; u64 *map, off;
int ret = -1, fd; int ret = -1, fd;
args->nrpages_dumped = 0; args->nrpages_dumped = 0;
args->nrpages_skipped = 0; args->nrpages_skipped = 0;
prot_old = prot_new = 0; prot_old = prot_new = 0;
fd = fd_pages; pfn = args->vma_entry.start / PAGE_SIZE;
nrpages = (args->vma_entry.end - args->vma_entry.start) / PAGE_SIZE;
length = args->vma_entry.end - args->vma_entry.start;
nrpages = length / PAGE_SIZE;
args->nrpages_total = nrpages; args->nrpages_total = nrpages;
length = nrpages * sizeof(*map);
/* /*
* brk should allow us to handle up to 128M of memory, * Up to 10M of pagemap will handle 5G mapping.
* otherwise call for mmap.
*/ */
map = brk_alloc(nrpages); map = brk_alloc(length);
if (!map) { if (!map) {
SET_PARASITE_RET(st, -ENOMEM); SET_PARASITE_RET(st, -ENOMEM);
goto err; goto err;
} }
fd = sys_open("/proc/self/pagemap", O_RDONLY, 0);
if (fd < 0) {
sys_write_msg("Can't open self pagemap");
SET_PARASITE_RET(st, fd);
goto err_free;
}
off = pfn * sizeof(*map);
off = sys_lseek(fd, off, SEEK_SET);
if (off != pfn * sizeof(*map)) {
sys_write_msg("Can't seek pagemap");
SET_PARASITE_RET(st, off);
goto err_close;
}
ret = sys_read(fd, map, length);
if (ret != length) {
sys_write_msg("Can't read self pagemap");
SET_PARASITE_RET(st, ret);
goto err_free;
}
sys_close(fd);
fd = fd_pages;
/* /*
* Try to change page protection if needed so we would * Try to change page protection if needed so we would
* be able to dump contents. * be able to dump contents.
...@@ -177,18 +202,6 @@ static int dump_pages(struct parasite_dump_pages_args *args) ...@@ -177,18 +202,6 @@ static int dump_pages(struct parasite_dump_pages_args *args)
} }
} }
/*
* Dumping the whole VMA range is not a common operation
* so stick for mincore as a basis.
*/
ret = sys_mincore((void *)args->vma_entry.start, length, map);
if (ret) {
sys_write_msg("sys_mincore failed\n");
SET_PARASITE_RET(st, ret);
goto err_free;
}
ret = 0; ret = 0;
for (pfn = 0; pfn < nrpages; pfn++) { for (pfn = 0; pfn < nrpages; pfn++) {
unsigned long vaddr, written; unsigned long vaddr, written;
...@@ -210,7 +223,7 @@ static int dump_pages(struct parasite_dump_pages_args *args) ...@@ -210,7 +223,7 @@ static int dump_pages(struct parasite_dump_pages_args *args)
} }
args->nrpages_dumped++; args->nrpages_dumped++;
} else if (map[pfn] & PAGE_RSS) } else if (map[pfn] & PME_PRESENT)
args->nrpages_skipped++; args->nrpages_skipped++;
} }
...@@ -230,9 +243,13 @@ static int dump_pages(struct parasite_dump_pages_args *args) ...@@ -230,9 +243,13 @@ static int dump_pages(struct parasite_dump_pages_args *args)
ret = 0; ret = 0;
err_free: err_free:
brk_free(nrpages); brk_free(length);
err: err:
return ret; return ret;
err_close:
sys_close(fd);
goto err_free;
} }
static int dump_pages_fini(void) static int dump_pages_fini(void)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment