Commit 08c20482 authored by Pavel Emelyanov

aio: Dump AIO rings

When an AIO context is set up, the kernel does two things:

1. creates an in-kernel aioctx object
2. maps a ring into process memory

The second one gives us all the information we need
about how the AIO context was set up. So, in order to
dump a context, we locate its ring in memory and read
everything we need from it.

One thing to note -- we cannot dump a task if it has
any AIO requests pending. So we also need to go to
the parasite and check that the ring is empty.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
parent 80cf0426
......@@ -61,6 +61,7 @@ obj-y += kerndat.o
obj-y += stats.o
obj-y += cgroup.o
obj-y += timerfd.o
obj-y += aio.o
obj-y += string.o
obj-y += sigframe.o
ifeq ($(VDSO),y)
......
#include <unistd.h>
#include <stdio.h>
#include <stdbool.h>
#include "vma.h"
#include "xmalloc.h"
#include "aio.h"
#include "parasite.h"
#include "parasite-syscall.h"
#include "protobuf/mm.pb-c.h"
/*
 * Append a description of one AIO ring mapping to the mm image entry.
 *
 * @mme: mm entry whose aios[] array gets one more element
 * @vma: the ring's VMA; its start address, length and aio_nr_req
 *       are what gets recorded
 *
 * Returns 0 on success and -1 when an allocation fails. On failure
 * mme->aios and mme->n_aios still describe only fully initialised
 * entries, so free_aios() can be used to release them.
 */
int dump_aio_ring(MmEntry *mme, struct vma_area *vma)
{
	int nr = mme->n_aios;
	AioRingEntry **aios;
	AioRingEntry *re;

	pr_info("Dumping AIO ring @%#lx, %u reqs\n",
			vma->e->start, vma->aio_nr_req);

	/*
	 * Grow the array through a temporary. Storing the result straight
	 * into mme->aios would overwrite the only reference to the old
	 * array (and the entries hanging off it) with NULL on failure.
	 * NOTE(review): relies on xrealloc() having realloc() semantics,
	 * i.e. the old block stays valid when it returns NULL.
	 */
	aios = xrealloc(mme->aios, (nr + 1) * sizeof(*aios));
	if (!aios)
		return -1;
	mme->aios = aios;

	re = xmalloc(sizeof(*re));
	if (!re)
		return -1;

	aio_ring_entry__init(re);
	re->id = vma->e->start;
	re->nr_req = vma->aio_nr_req;
	re->ring_len = vma->e->end - vma->e->start;

	/* Publish the entry only once it is completely filled in */
	mme->aios[nr] = re;
	mme->n_aios = nr + 1;

	return 0;
}
/*
 * Release the AIO ring entries attached to @mme: every element of
 * the aios[] array first, then the array itself. Does nothing when
 * no array was ever allocated.
 */
void free_aios(MmEntry *mme)
{
	int idx;

	if (!mme->aios)
		return;

	for (idx = 0; idx < mme->n_aios; idx++)
		xfree(mme->aios[idx]);

	xfree(mme->aios);
}
/*
 * Estimate the nr_reqs value that was originally requested when the
 * ring was created, given the number of events the kernel reports
 * for the ring.
 *
 * @k_max_reqs: event count read from the ring header
 *
 * Returns the estimated request count, or 0 when @k_max_reqs is too
 * small to undo the kernel's "* 2 + 2" adjustment.
 */
static unsigned int aio_estimate_nr_reqs(unsigned int k_max_reqs)
{
	/*
	 * Kernel does
	 *
	 * nr_reqs = max(nr_reqs, nr_cpus * 4)
	 * nr_reqs *= 2
	 * nr_reqs += 2
	 * ring = roundup(sizeof(head) + nr_reqs * sizeof(req))
	 * nr_reqs = (ring - sizeof(head)) / sizeof(req)
	 *
	 * And the k_max_reqs here is the resulting value.
	 *
	 * We need to get the initial nr_reqs that would grow
	 * up back to the k_max_reqs.
	 */

	/*
	 * The subtraction below is unsigned: without this guard a value
	 * below 2 would wrap around and yield a huge bogus estimate.
	 */
	if (k_max_reqs < 2)
		return 0;

	return (k_max_reqs - 2) / 2;
}
/*
 * Size in bytes of the argument area needed to check all AIO rings
 * of a task: the fixed header followed by one struct parasite_aio
 * per ring counted in @vmas->nr_aios.
 */
unsigned long aio_rings_args_size(struct vm_area_list *vmas)
{
	unsigned long rings_size = vmas->nr_aios * sizeof(struct parasite_aio);

	return sizeof(struct parasite_check_aios_args) + rings_size;
}
/*
 * Ask the parasite to inspect every AIO ring of the task and use the
 * result to fill in aio_nr_req of the corresponding VMAs.
 *
 * @ctl:  parasite control block of the task being dumped
 * @vmas: the task's VMA list; only VMA_AREA_AIORING areas are used
 *
 * Returns 0 on success (including when there are no rings at all)
 * and -1 when the parasite command fails.
 */
int parasite_check_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas)
{
	struct parasite_check_aios_args *args;
	struct parasite_aio *ring;
	struct vma_area *vma;
	int idx;

	if (vmas->nr_aios == 0)
		return 0;

	pr_info("Checking AIO rings\n");

	/*
	 * Go to parasite and
	 * a) check that no requests are currently pending
	 * b) get the maximum number of requests kernel handles
	 *    to estimate what was the user request on ring
	 *    creation.
	 */

	args = parasite_args_s(ctl, aio_rings_args_size(vmas));

	/* Describe each ring VMA in the next free slot of the args array */
	ring = &args->ring[0];
	list_for_each_entry(vma, &vmas->h, list) {
		if (!vma_area_is(vma, VMA_AREA_AIORING))
			continue;

		pr_debug(" `- Ring #%ld @%#lx\n",
				ring - &args->ring[0], vma->e->start);

		ring->ctx = vma->e->start;
		ring->max_reqs = 0;
		/* Remember where the estimate for this ring has to be stored */
		ring->vma_nr_reqs = &vma->aio_nr_req;
		ring++;
	}
	args->nr_rings = vmas->nr_aios;

	if (parasite_execute_daemon(PARASITE_CMD_CHECK_AIOS, ctl))
		return -1;

	/* The parasite filled in max_reqs; turn it into the estimate */
	for (idx = 0; idx < vmas->nr_aios; idx++) {
		ring = &args->ring[idx];
		*ring->vma_nr_reqs = aio_estimate_nr_reqs(ring->max_reqs);
		pr_debug(" `- Ring #%d has %u reqs, estimated to %u\n", idx,
				ring->max_reqs, *ring->vma_nr_reqs);
	}

	return 0;
}
......@@ -72,6 +72,7 @@
#include "irmap.h"
#include "sysfs_parse.h"
#include "action-scripts.h"
#include "aio.h"
#include "asm/dump.h"
......@@ -464,6 +465,12 @@ static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat,
goto err;
mme.vmas[i++] = vma;
if (vma_entry_is(vma, VMA_AREA_AIORING)) {
ret = dump_aio_ring(&mme, vma_area);
if (ret)
goto err;
}
}
mme.mm_start_code = stat->start_code;
......@@ -496,6 +503,7 @@ static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat,
ret = pb_write_one(img_from_set(imgset, CR_FD_MM), &mme, PB_MM);
xfree(mme.mm_saved_auxv);
free_aios(&mme);
err:
return ret;
}
......@@ -1566,6 +1574,12 @@ static int dump_one_task(struct pstree_item *item)
goto err_cure_imgset;
}
ret = parasite_check_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */
if (ret) {
pr_err("Failed to check aio rings (pid: %d)\n", pid);
goto err_cure_imgset;
}
ret = parasite_dump_misc_seized(parasite_ctl, &misc);
if (ret) {
pr_err("Can't dump misc (pid: %d)\n", pid);
......
#ifndef __CR_AIO_H__
#define __CR_AIO_H__

#include "protobuf/mm.pb-c.h"

/*
 * Only pointers to these structures appear in the prototypes below,
 * so forward declarations are sufficient. Without them a struct tag
 * first seen inside a parameter list is scoped to that prototype
 * only, which makes the header depend on what was included before it.
 */
struct vma_area;
struct vm_area_list;
struct parasite_ctl;

/* Add one AIO ring VMA to the mm image entry; 0 on success, -1 on error */
int dump_aio_ring(MmEntry *mme, struct vma_area *vma);
/* Free the ring entries that dump_aio_ring() attached to @mme */
void free_aios(MmEntry *mme);
/* Check the task's AIO rings via the parasite; 0 on success, -1 on error */
int parasite_check_aios(struct parasite_ctl *, struct vm_area_list *);
/* Size of the parasite argument area needed for the rings in the list */
unsigned long aio_rings_args_size(struct vm_area_list *);

#endif /* __CR_AIO_H__ */
......@@ -54,6 +54,7 @@
#define VMA_AREA_SYSVIPC (1 << 10)
#define VMA_AREA_SOCKET (1 << 11)
#define VMA_AREA_VVAR (1 << 12)
#define VMA_AREA_AIORING (1 << 13)
#define VMA_UNSUPP (1 << 31) /* Unsupported VMA */
......
......@@ -47,6 +47,7 @@ enum {
PARASITE_CMD_GET_PROC_FD,
PARASITE_CMD_DUMP_TTY,
PARASITE_CMD_CHECK_VDSO_MARK,
PARASITE_CMD_CHECK_AIOS,
PARASITE_CMD_MAX,
};
......@@ -133,6 +134,17 @@ struct parasite_dump_posix_timers_args {
struct posix_timer timer[0];
};
/*
 * Per-ring slot in the PARASITE_CMD_CHECK_AIOS argument area.
 */
struct parasite_aio {
/* in: start address of the ring mapping; the parasite reads the ring header there */
unsigned long ctx;
/* out: zeroed by the dumper, set by the parasite to the ring header's nr field */
unsigned int max_reqs;
/*
 * Dumper-side pointer to the VMA's aio_nr_req. The parasite command
 * does not touch it; the dumper dereferences it after the command
 * returns to store the estimated request count.
 */
unsigned int *vma_nr_reqs;
};
/*
 * Argument area for PARASITE_CMD_CHECK_AIOS: a ring count followed by
 * that many struct parasite_aio slots (see aio_rings_args_size()).
 */
struct parasite_check_aios_args {
/* number of entries in ring[] the parasite has to check */
unsigned nr_rings;
/* variable-length tail, one slot per AIO ring */
struct parasite_aio ring[0];
};
static inline int posix_timers_dump_size(int timer_n)
{
return sizeof(int) + sizeof(struct posix_timer) * timer_n;
......
......@@ -7,6 +7,7 @@
/*
 * List of a task's VMAs together with counters collected while
 * parsing them.
 */
struct vm_area_list {
struct list_head h;
unsigned nr;
/* nr of VMAs marked VMA_AREA_AIORING; reset and counted in parse_smaps() */
unsigned int nr_aios;
unsigned long priv_size; /* nr of pages in private VMAs */
unsigned long longest; /* nr of pages in longest VMA */
};
......@@ -35,9 +36,12 @@ struct vma_area {
* The file_fd is an fd for a regular file and
* the socket_id is the inode number of the
* mapped (PF_PACKET) socket.
*
* The aio_nr_req is only for aio rings.
*/
int vm_file_fd;
int vm_socket_id;
unsigned int aio_nr_req;
};
char *aufs_rpath; /* path from aufs root */
......
......@@ -28,6 +28,7 @@
#include "mem.h"
#include "vma.h"
#include "proc_parse.h"
#include "aio.h"
#include <string.h>
#include <stdlib.h>
......@@ -1193,6 +1194,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item,
return NULL;
parasite_ensure_args_size(dump_pages_args_size(vma_area_list));
parasite_ensure_args_size(aio_rings_args_size(vma_area_list));
/*
* Inject a parasite engine. Ie allocate memory inside alien
......
......@@ -330,6 +330,73 @@ static inline int tty_ioctl(int fd, int cmd, int *arg)
return 0;
}
/*
* Stolen from kernel/fs/aio.c
*
* Is it valid to go to memory and check it? Should be,
* as libaio does the same.
*/
#define AIO_RING_MAGIC 0xa10a10a1
#define AIO_RING_COMPAT_FEATURES 1
#define AIO_RING_INCOMPAT_FEATURES 0
struct aio_ring {
unsigned id; /* kernel internal index number */
unsigned nr; /* number of io_events */
unsigned head; /* Written to by userland or under ring_lock
* mutex by aio_read_events_ring(). */
unsigned tail;
unsigned magic;
unsigned compat_features;
unsigned incompat_features;
unsigned header_length; /* size of aio_ring */
/* struct io_event io_events[0]; */
};
/*
 * Check that @ring carries the header we know how to read: expected
 * magic, expected compat/incompat feature words, and a header_length
 * equal to our struct aio_ring.
 *
 * Returns 1 when every field matches, 0 otherwise.
 */
static int sane_ring(struct aio_ring *ring)
{
	if (ring->magic != AIO_RING_MAGIC)
		return 0;
	if (ring->compat_features != AIO_RING_COMPAT_FEATURES)
		return 0;
	if (ring->incompat_features != AIO_RING_INCOMPAT_FEATURES)
		return 0;
	if (ring->header_length != sizeof(struct aio_ring))
		return 0;

	return 1;
}
static int parasite_check_aios(struct parasite_check_aios_args *args)
{
int i;
for (i = 0; i < args->nr_rings; i++) {
struct aio_ring *ring;
ring = (struct aio_ring *)args->ring[i].ctx;
if (!sane_ring(ring)) {
pr_err("Not valid ring #%d\n", i);
pr_info(" `- magic %x\n", ring->magic);
pr_info(" `- cf %d\n", ring->compat_features);
pr_info(" `- if %d\n", ring->incompat_features);
pr_info(" `- size %d (%ld)\n", ring->header_length, sizeof(struct aio_ring));
return -1;
}
/*
* XXX what else can we do if there are requests
* in the ring?
*/
if (ring->head != ring->tail) {
pr_err("Pending AIO requests in ring #%d\n", i);
return -1;
}
args->ring[i].max_reqs = ring->nr;
}
return 0;
}
static int parasite_dump_tty(struct parasite_tty_args *args)
{
int ret;
......@@ -541,6 +608,9 @@ static noinline __used int noinline parasite_daemon(void *args)
case PARASITE_CMD_DUMP_TTY:
ret = parasite_dump_tty(args);
break;
case PARASITE_CMD_CHECK_AIOS:
ret = parasite_check_aios(args);
break;
#ifdef CONFIG_VDSO
case PARASITE_CMD_CHECK_VDSO_MARK:
ret = parasite_check_vdso_mark(args);
......
......@@ -43,6 +43,12 @@ static char *buf = __buf.buf;
#define BUF_SIZE sizeof(__buf.buf)
/*
 * This is what AIO ring buffers look like in proc
*/
#define AIO_FNAME "/[aio]"
int parse_cpuinfo_features(int (*handler)(char *tok))
{
FILE *cpuinfo;
......@@ -191,7 +197,7 @@ static inline int vfi_equal(struct vma_file_info *a, struct vma_file_info *b)
(a->dev_min ^ b->dev_min)) == 0;
}
static int vma_get_mapfile(struct vma_area *vma, DIR *mfd,
static int vma_get_mapfile(char *fname, struct vma_area *vma, DIR *mfd,
struct vma_file_info *vfi, struct vma_file_info *prev_vfi)
{
char path[32];
......@@ -244,13 +250,22 @@ static int vma_get_mapfile(struct vma_area *vma, DIR *mfd,
if (fstatat(dirfd(mfd), path, &buf, 0))
return -1;
if (!S_ISSOCK(buf.st_mode))
return -1;
if (S_ISSOCK(buf.st_mode)) {
pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start);
vma->vm_socket_id = buf.st_ino;
vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
return 0;
}
vma->vm_socket_id = buf.st_ino;
pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start);
vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
return 0;
if ((buf.st_mode & S_IFMT) == 0 && !strcmp(fname, AIO_FNAME)) {
/* AIO ring, let's try */
close(vma->vm_file_fd);
vma->aio_nr_req = -1;
vma->e->status = VMA_AREA_AIORING;
return 0;
}
pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname);
}
return -1;
......@@ -325,6 +340,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list)
struct bfd f;
vma_area_list->nr = 0;
vma_area_list->nr_aios = 0;
vma_area_list->longest = 0;
vma_area_list->priv_size = 0;
INIT_LIST_HEAD(&vma_area_list->h);
......@@ -417,7 +433,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list)
vma_area->e->pgoff = pgoff;
vma_area->e->prot = PROT_NONE;
if (vma_get_mapfile(vma_area, map_files_dir, &vfi, &prev_vfi))
if (vma_get_mapfile(file_path, vma_area, map_files_dir, &vfi, &prev_vfi))
goto err_bogus_mapfile;
if (r == 'r')
......@@ -437,6 +453,8 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list)
}
if (vma_area->e->status != 0) {
if (vma_area->e->status & VMA_AREA_AIORING)
vma_area_list->nr_aios++;
continue;
} else if (!strcmp(file_path, "[vsyscall]") ||
!strcmp(file_path, "[vectors]")) {
......
import "vma.proto";
// One AIO ring found in the task's address space.
message aio_ring_entry {
// start address of the ring mapping
required uint64 id = 1;
// estimated number of requests asked for when the ring was created
required uint32 nr_req = 2;
// length of the ring mapping in bytes (end - start of the VMA)
required uint32 ring_len = 3;
}
message mm_entry {
required uint64 mm_start_code = 1;
required uint64 mm_end_code = 2;
......@@ -19,4 +25,5 @@ message mm_entry {
repeated vma_entry vmas = 14;
optional int32 dumpable = 15;
repeated aio_ring_entry aios = 16;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment