Commit 3a7968d4 authored by Kirill Tkhai, committed by Pavel Emelyanov

aio: Restore aio ring content

1)Dump/restore mmapped aio ring like any other private vma entry,
  with the only exception that we do not pre-dump it.
2)Create io context, set head and tail using write to /dev/null.
3)Copy aio ring restored in (1) to created in (2).
4)Remap (2) to address of (1).
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com>
parent 868f24e6
...@@ -102,4 +102,5 @@ __NR_seccomp 358 sys_seccomp (unsigned int op, unsigned int flags, const char ...@@ -102,4 +102,5 @@ __NR_seccomp 358 sys_seccomp (unsigned int op, unsigned int flags, const char
__NR_memfd_create 360 sys_memfd_create (const char *name, unsigned int flags) __NR_memfd_create 360 sys_memfd_create (const char *name, unsigned int flags)
__NR_io_setup 227 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp) __NR_io_setup 227 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp)
__NR_io_getevents 229 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) __NR_io_getevents 229 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
__NR_io_submit 230 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp)
__NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth) __NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth)
...@@ -66,6 +66,7 @@ __NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info) ...@@ -66,6 +66,7 @@ __NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info)
__NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info) __NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info)
__NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p) __NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p)
__NR_io_getevents 247 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout) __NR_io_getevents 247 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
__NR_io_submit 248 sys_io_submit (aio_context_t ctx_id, long nr, struct iocb **iocbpp)
__NR_exit_group 252 sys_exit_group (int error_code) __NR_exit_group 252 sys_exit_group (int error_code)
__NR_set_tid_address 258 sys_set_tid_address (int *tid_addr) __NR_set_tid_address 258 sys_set_tid_address (int *tid_addr)
__NR_timer_create 259 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id) __NR_timer_create 259 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
......
...@@ -74,6 +74,7 @@ __NR_futex 202 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utim ...@@ -74,6 +74,7 @@ __NR_futex 202 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utim
__NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info) __NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info)
__NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx) __NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx)
__NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) __NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
__NR_io_submit 209 sys_io_submit (aio_context_t ctx, long nr, struct iocb **iocbpp)
__NR_get_thread_area 211 sys_get_thread_area (user_desc_t *info) __NR_get_thread_area 211 sys_get_thread_area (user_desc_t *info)
__NR_set_tid_address 218 sys_set_tid_address (int *tid_addr) __NR_set_tid_address 218 sys_set_tid_address (int *tid_addr)
__NR_restart_syscall 219 sys_restart_syscall (void) __NR_restart_syscall 219 sys_restart_syscall (void)
......
...@@ -341,6 +341,7 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr, ...@@ -341,6 +341,7 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
size = vma_entry_len(vma->e); size = vma_entry_len(vma->e);
if (paddr == NULL) { if (paddr == NULL) {
int flag = 0;
/* /*
* The respective memory area was NOT found in the parent. * The respective memory area was NOT found in the parent.
* Map a new one. * Map a new one.
...@@ -348,9 +349,16 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr, ...@@ -348,9 +349,16 @@ static int map_private_vma(struct vma_area *vma, void **tgt_addr,
pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n", pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
vma->e->start, vma->e->end, vma->e->pgoff); vma->e->start, vma->e->end, vma->e->pgoff);
/*
* Restore AIO ring buffer content to temporary anonymous area.
* This will be placed in io_setup'ed AIO in restore_aio_ring().
*/
if (vma_entry_is(vma->e, VMA_AREA_AIORING))
flag |= MAP_ANONYMOUS;
addr = mmap(*tgt_addr, size, addr = mmap(*tgt_addr, size,
vma->e->prot | PROT_WRITE, vma->e->prot | PROT_WRITE,
vma->e->flags | MAP_FIXED, vma->e->flags | MAP_FIXED | flag,
vma->e->fd, vma->e->pgoff); vma->e->fd, vma->e->pgoff);
if (addr == MAP_FAILED) { if (addr == MAP_FAILED) {
......
...@@ -31,6 +31,7 @@ struct rusage; ...@@ -31,6 +31,7 @@ struct rusage;
struct file_handle; struct file_handle;
struct robust_list_head; struct robust_list_head;
struct io_event; struct io_event;
struct iocb;
struct timespec; struct timespec;
typedef unsigned long aio_context_t; typedef unsigned long aio_context_t;
......
...@@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr) ...@@ -95,10 +95,11 @@ static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
static inline bool vma_entry_is_private(VmaEntry *entry, static inline bool vma_entry_is_private(VmaEntry *entry,
unsigned long task_size) unsigned long task_size)
{ {
return vma_entry_is(entry, VMA_AREA_REGULAR) && return (vma_entry_is(entry, VMA_AREA_REGULAR) &&
(vma_entry_is(entry, VMA_ANON_PRIVATE) || (vma_entry_is(entry, VMA_ANON_PRIVATE) ||
vma_entry_is(entry, VMA_FILE_PRIVATE)) && vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
(entry->end <= task_size); (entry->end <= task_size)) ||
vma_entry_is(entry, VMA_AREA_AIORING);
} }
static inline bool vma_area_is_private(struct vma_area *vma, static inline bool vma_area_is_private(struct vma_area *vma,
......
...@@ -190,6 +190,12 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl ...@@ -190,6 +190,12 @@ static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl
list_for_each_entry(vma, &vma_area_list->h, list) { list_for_each_entry(vma, &vma_area_list->h, list) {
if (!vma_area_is_private(vma, kdat.task_size)) if (!vma_area_is_private(vma, kdat.task_size))
continue; continue;
/*
* Kernel write to aio ring is not soft-dirty tracked,
* so we ignore them at pre-dump.
*/
if (vma_entry_is(vma->e, VMA_AREA_AIORING) && pp_ret)
continue;
if (vma->e->prot & PROT_READ) if (vma->e->prot & PROT_READ)
continue; continue;
...@@ -303,6 +309,8 @@ static int __parasite_dump_pages_seized(struct parasite_ctl *ctl, ...@@ -303,6 +309,8 @@ static int __parasite_dump_pages_seized(struct parasite_ctl *ctl,
if (!vma_area_is_private(vma_area, kdat.task_size)) if (!vma_area_is_private(vma_area, kdat.task_size))
continue; continue;
if (vma_entry_is(vma_area->e, VMA_AREA_AIORING) && pp_ret)
continue;
map = pmc_get_map(&pmc, vma_area); map = pmc_get_map(&pmc, vma_area);
if (!map) if (!map)
......
...@@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args) ...@@ -410,14 +410,7 @@ static int parasite_check_aios(struct parasite_check_aios_args *args)
return -1; return -1;
} }
/* /* XXX: wait aio completion */
* XXX what else can we do if there are requests
* in the ring?
*/
if (ring->head != ring->tail) {
pr_err("Pending AIO requests in ring #%d\n", i);
return -1;
}
args->ring[i].max_reqs = ring->nr; args->ring[i].max_reqs = ring->nr;
} }
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#include <linux/securebits.h> #include <linux/securebits.h>
#include <linux/capability.h> #include <linux/capability.h>
#include <linux/aio_abi.h>
#include <sys/types.h> #include <sys/types.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/stat.h> #include <sys/stat.h>
...@@ -16,6 +17,7 @@ ...@@ -16,6 +17,7 @@
#include <signal.h> #include <signal.h>
#include "compiler.h" #include "compiler.h"
#include "asm/string.h"
#include "asm/types.h" #include "asm/types.h"
#include "syscall.h" #include "syscall.h"
#include "config.h" #include "config.h"
...@@ -545,10 +547,23 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry) ...@@ -545,10 +547,23 @@ static unsigned long restore_mapping(const VmaEntry *vma_entry)
return addr; return addr;
} }
/*
* This restores aio ring header, content, head and in-kernel position
* of tail. To set tail, we write to /dev/null and use the fact this
 * operation is synchronous for the device. Also, we unmap temporary
* anonymous area, used to store content of ring buffer during restore
* and mapped in map_private_vma().
*/
static int restore_aio_ring(struct rst_aio_ring *raio) static int restore_aio_ring(struct rst_aio_ring *raio)
{ {
struct aio_ring *ring = (void *)raio->addr;
int i, maxr, count, fd, ret;
unsigned head = ring->head;
unsigned tail = ring->tail;
struct iocb *iocb, **iocbp;
unsigned long ctx = 0; unsigned long ctx = 0;
int ret; unsigned size;
char buf[1];
ret = sys_io_setup(raio->nr_req, &ctx); ret = sys_io_setup(raio->nr_req, &ctx);
if (ret < 0) { if (ret < 0) {
...@@ -556,8 +571,80 @@ static int restore_aio_ring(struct rst_aio_ring *raio) ...@@ -556,8 +571,80 @@ static int restore_aio_ring(struct rst_aio_ring *raio)
return -1; return -1;
} }
if (ctx == raio->addr) /* Lucky bastards we are! */ if (tail == 0 && head == 0)
return 0; goto populate;
fd = sys_open("/dev/null", O_WRONLY, 0);
if (fd < 0) {
pr_err("Can't open /dev/null for aio\n");
return -1;
}
/*
* If tail < head, we have to do full turn and then submit
 * tail more requests, i.e. ring->nr + tail.
 * If we do not do a full turn, the in-kernel completed_events
 * will be initialized incorrectly.
*
* Maximum number reqs to submit at once are ring->nr-1,
* so we won't allocate more.
*/
if (tail < head)
count = ring->nr + tail;
else
count = tail;
maxr = min_t(unsigned, count, ring->nr-1);
/*
 * Since we are only interested in moving the tail, the requests
 * may be anything. We submit count identical requests.
*/
size = sizeof(struct iocb) + maxr * sizeof(struct iocb *);
iocb = (void *)sys_mmap(NULL, size, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
iocbp = (void *)iocb + sizeof(struct iocb);
if (iocb == MAP_FAILED) {
pr_err("Can't mmap aio tmp buffer\n");
return -1;
}
iocb->aio_fildes = fd;
iocb->aio_buf = (unsigned long)buf;
iocb->aio_nbytes = 1;
iocb->aio_lio_opcode = IOCB_CMD_PWRITE; /* Write is nop, read populates buf */
for (i = 0; i < maxr; i++)
iocbp[i] = iocb;
i = 0;
do {
ret = sys_io_submit(ctx, count - i, iocbp);
if (ret < 0) {
pr_err("Can't submit aio iocbs: ret=%d\n", ret);
return -1;
}
i += ret;
/*
* We may submit less than requested, because of too big
* count OR behaviour of get_reqs_available(), which
* takes available requests only if their number is
 * a multiple of kioctx::req_batch. Free part of buffer
* for next iteration.
*
* Direct set of head is equal to sys_io_getevents() call,
* and faster. See kernel for the details.
*/
((struct aio_ring *)ctx)->head = i < head ? i : head;
} while (i < count);
sys_munmap(iocb, size);
sys_close(fd);
populate:
i = offsetof(struct aio_ring, io_events);
builtin_memcpy((void *)ctx + i, (void *)ring + i, raio->len - i);
/* /*
* If we failed to get the proper nr_req right and * If we failed to get the proper nr_req right and
...@@ -567,6 +654,8 @@ static int restore_aio_ring(struct rst_aio_ring *raio) ...@@ -567,6 +654,8 @@ static int restore_aio_ring(struct rst_aio_ring *raio)
* *
* This is not great, but anyway better than putting * This is not great, but anyway better than putting
* a ring of wrong size into correct place. * a ring of wrong size into correct place.
*
* Also, this unmaps temporary anonymous area on raio->addr.
*/ */
ctx = sys_mremap(ctx, raio->len, raio->len, ctx = sys_mremap(ctx, raio->len, raio->len,
...@@ -576,23 +665,6 @@ static int restore_aio_ring(struct rst_aio_ring *raio) ...@@ -576,23 +665,6 @@ static int restore_aio_ring(struct rst_aio_ring *raio)
pr_err("Ring remap failed with %ld\n", ctx); pr_err("Ring remap failed with %ld\n", ctx);
return -1; return -1;
} }
/*
* Now check that kernel not just remapped the
* ring into new place, but updated the internal
* context state respectively.
*/
ret = sys_io_getevents(ctx, 0, 1, NULL, NULL);
if (ret != 0) {
if (ret < 0)
pr_err("Kernel doesn't remap AIO rings\n");
else
pr_err("AIO context screwed up\n");
return -1;
}
return 0; return 0;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment