Commit 9bb44e6f, authored by Pawel Stradomski, committed by Andrei Vagin

Punch holes in input files when restoring anonymous non-shared memory if --auto-dedup is enabled.

This reduces memory usage if image files are stored on tmpfs.
Signed-off-by: Pawel Stradomski <pstradomski@google.com>
Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
parent 69663091
...@@ -109,3 +109,4 @@ seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs) ...@@ -109,3 +109,4 @@ seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs)
gettimeofday 169 78 (struct timeval *tv, struct timezone *tz) gettimeofday 169 78 (struct timeval *tv, struct timezone *tz)
preadv_raw 69 361 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) preadv_raw 69 361 (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
userfaultfd 282 388 (int flags) userfaultfd 282 388 (int flags)
fallocate 47 352 (int fd, int mode, loff_t offset, loff_t len)
...@@ -89,6 +89,7 @@ __NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, si ...@@ -89,6 +89,7 @@ __NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, si
__NR_get_robust_list 299 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_get_robust_list 299 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
__NR_vmsplice 285 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) __NR_vmsplice 285 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
__NR_openat 286 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_openat 286 sys_openat (int dfd, const char *filename, int flags, int mode)
__NR_fallocate 309 sys_fallocate (int fd, int mode, loff_t offset, loff_t len)
__NR_timerfd_settime 311 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_timerfd_settime 311 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
__NR_signalfd4 313 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) __NR_signalfd4 313 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
__NR_rt_tgsigqueueinfo 322 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) __NR_rt_tgsigqueueinfo 322 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
......
...@@ -89,6 +89,7 @@ __NR_set_robust_list 304 sys_set_robust_list (struct robust_list_head *head, si ...@@ -89,6 +89,7 @@ __NR_set_robust_list 304 sys_set_robust_list (struct robust_list_head *head, si
__NR_get_robust_list 305 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_get_robust_list 305 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
__NR_vmsplice 309 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) __NR_vmsplice 309 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
__NR_openat 288 sys_openat (int dfd, const char *filename, int flags, int mode) __NR_openat 288 sys_openat (int dfd, const char *filename, int flags, int mode)
__NR_fallocate 314 sys_fallocate (int fd, int mode, loff_t offset, loff_t len)
__NR_timerfd_settime 320 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_timerfd_settime 320 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
__NR_signalfd4 322 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) __NR_signalfd4 322 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
__NR_rt_tgsigqueueinfo 330 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info) __NR_rt_tgsigqueueinfo 330 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
......
...@@ -83,6 +83,7 @@ __NR_set_robust_list 311 sys_set_robust_list (struct robust_list_head *head, si ...@@ -83,6 +83,7 @@ __NR_set_robust_list 311 sys_set_robust_list (struct robust_list_head *head, si
__NR_get_robust_list 312 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_get_robust_list 312 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
__NR_vmsplice 316 sys_vmsplice (int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags) __NR_vmsplice 316 sys_vmsplice (int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags)
__NR_signalfd 321 sys_signalfd (int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize) __NR_signalfd 321 sys_signalfd (int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize)
__NR_fallocate 324 sys_fallocate (int fd, int mode, loff_t offset, loff_t len)
__NR_timerfd_settime 325 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_timerfd_settime 325 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
__NR_preadv 333 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_preadv 333 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
__NR_rt_tgsigqueueinfo 335 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo) __NR_rt_tgsigqueueinfo 335 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo)
......
...@@ -94,6 +94,7 @@ __NR_set_robust_list 273 sys_set_robust_list (struct robust_list_head *head, s ...@@ -94,6 +94,7 @@ __NR_set_robust_list 273 sys_set_robust_list (struct robust_list_head *head, s
__NR_get_robust_list 274 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr) __NR_get_robust_list 274 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
__NR_seccomp 317 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs) __NR_seccomp 317 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs)
__NR_vmsplice 278 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags) __NR_vmsplice 278 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
__NR_fallocate 285 sys_fallocate (int fd, int mode, loff_t offset, loff_t len)
__NR_timerfd_settime 286 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr) __NR_timerfd_settime 286 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
__NR_signalfd4 289 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags) __NR_signalfd4 289 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
__NR_preadv 295 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_preadv 295 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h)
......
...@@ -3495,6 +3495,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns ...@@ -3495,6 +3495,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
task_args->nr_threads = current->nr_threads; task_args->nr_threads = current->nr_threads;
task_args->thread_args = thread_args; task_args->thread_args = thread_args;
task_args->auto_dedup = opts.auto_dedup;
/* /*
* Make root and cwd restore _that_ late not to break any * Make root and cwd restore _that_ late not to break any
* attempts to open files by paths above (e.g. /proc). * attempts to open files by paths above (e.g. /proc).
......
...@@ -202,6 +202,7 @@ struct task_restore_args { ...@@ -202,6 +202,7 @@ struct task_restore_args {
bool compatible_mode; bool compatible_mode;
bool can_map_vdso; bool can_map_vdso;
bool auto_dedup;
#ifdef CONFIG_VDSO #ifdef CONFIG_VDSO
unsigned long vdso_rt_size; unsigned long vdso_rt_size;
struct vdso_maps vdso_maps_rt; /* runtime vdso symbols */ struct vdso_maps vdso_maps_rt; /* runtime vdso symbols */
......
...@@ -1271,7 +1271,11 @@ static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta) ...@@ -1271,7 +1271,11 @@ static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
{ {
struct cr_img *pages; struct cr_img *pages;
pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id); /* if auto-dedup is on we need RDWR mode to be able to punch holes
* in the input files (in restorer.c)
*/
pages = open_image(CR_FD_PAGES, opts.auto_dedup ? O_RDWR : O_RSTR,
rsti(t)->pages_img_id);
if (!pages) if (!pages)
return -1; return -1;
......
...@@ -51,6 +51,15 @@ ...@@ -51,6 +51,15 @@
#define PR_SET_PDEATHSIG 1 #define PR_SET_PDEATHSIG 1
#endif #endif
#ifndef FALLOC_FL_KEEP_SIZE
#define FALLOC_FL_KEEP_SIZE 0x01
#endif
#ifndef FALLOC_FL_PUNCH_HOLE
#define FALLOC_FL_PUNCH_HOLE 0x02
#endif
#define sys_prctl_safe(opcode, val1, val2, val3) \ #define sys_prctl_safe(opcode, val1, val2, val3) \
({ \ ({ \
long __ret = sys_prctl(opcode, val1, val2, val3, 0); \ long __ret = sys_prctl(opcode, val1, val2, val3, 0); \
...@@ -647,6 +656,14 @@ static unsigned long restore_mapping(VmaEntry *vma_entry) ...@@ -647,6 +656,14 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
!(vma_entry->status & VMA_NO_PROT_WRITE)) !(vma_entry->status & VMA_NO_PROT_WRITE))
prot |= PROT_WRITE; prot |= PROT_WRITE;
/* TODO: Drop MAP_LOCKED bit and restore it after reading memory.
*
* Code below tries to limit memory usage by running fallocate()
* after each preadv() to avoid doubling memory usage (once in
* image files, once in process). Unfortunately, MAP_LOCKED defeats
* that mechanism as it causes the process to be charged for memory
* immediately upon mmap, not later upon preadv().
*/
pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n", pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
vma_entry->start, vma_entry->end, vma_entry->start, vma_entry->end,
prot, flags, (int)vma_entry->fd); prot, flags, (int)vma_entry->fd);
...@@ -1368,6 +1385,15 @@ long __export_restore_task(struct task_restore_args *args) ...@@ -1368,6 +1385,15 @@ long __export_restore_task(struct task_restore_args *args)
} }
pr_debug("`- returned %ld\n", (long)r); pr_debug("`- returned %ld\n", (long)r);
/* If auto-dedup is enabled, the pages image was opened read-write, so
 * punch a hole over the range that was just read. */
if (r > 0 && args->auto_dedup) {
int fr = sys_fallocate(args->vma_ios_fd, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE,
rio->off, r);
if (fr < 0) {
pr_debug("Failed to punch holes with fallocate: %d\n", fr);
}
}
rio->off += r; rio->off += r;
/* Advance the iovecs */ /* Advance the iovecs */
do { do {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment