pipe: Rework pipe data restore to use vmsplice

This serves two purposes: it kills the annoying data offset management, which makes the protobuf switch too ugly, and it removes the need to seek the pipe data image, which in turn is a prerequisite for streaming migration. Memory usage is still optimal, since we still use splice on the dump path, and on the restore stage we gift the pipe buffers to the kernel with SPLICE_F_GIFT. Signed-off-by: Pavel Emelyanov <xemul@parallels.com>

pipe: Rework pipe data restore to use vmsplice
This serves two purposes: it kills the annoying data offset management, which makes the protobuf switch too ugly, and it removes the need to seek the pipe data image, which in turn is a prerequisite for streaming migration. Memory usage is still optimal, since we still use splice on the dump path, and on the restore stage we gift the pipe buffers to the kernel with SPLICE_F_GIFT. Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
237ef3c8 · Pavel Emelyanov · c69d18ea · 237ef3c8 · 237ef3c8 · 237ef3c8
Commit 237ef3c8 authored Jul 12, 2012 by Pavel Emelyanov
Show whitespace changes
Inline Side-by-side

Showing with 74 additions and 47 deletions

cr-check.c cr-check.c +20 -0

cr-show.c cr-show.c +2 -3

image.h include/image.h +0 -9

pipes.h include/pipes.h +1 -0

pipes.c pipes.c +51 -35

No files found.
--- a/cr-check.c
+++ b/cr-check.c
@@ -279,6 +279,25 @@ static int check_fdinfo_ext(void)
 	return ret;
 }

+static int check_unaligned_vmsplice(void)
+{
+	int p[2], ret;
+	char buf; /* :) */
+	struct iovec iov;
+
+	pipe(p);
+	iov.iov_base = &buf;
+	iov.iov_len = sizeof(buf);
+	ret = vmsplice(p[1], &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
+	if (ret < 0) {
+		pr_perror("Unaligned vmsplice doesn't work");
+		return -1;
+	}
+
+	pr_info("Unaligned vmsplice works OK\n");
+	return 0;
+}
+
 int cr_check(void)
 {
 	int ret = 0;
@@ -293,6 +312,7 @@ int cr_check(void)
 	ret |= check_proc_stat();
 	ret |= check_tcp_repair();
 	ret |= check_fdinfo_ext();
+	ret |= check_unaligned_vmsplice();

 	if (!ret)
 		pr_msg("Looks good.\n");

--- a/cr-show.c
+++ b/cr-show.c
@@ -183,9 +183,8 @@ void __show_pipes_data(int fd, struct cr_options *o)
 	while (1) {
 		if (read_img_eof(fd, &e) <= 0)
 			break;
-		pr_msg("pipeid: 0x%8x bytes: 0x%8x off: 0x%8x\n",
-		       e.pipe_id, e.bytes, e.off);
-		lseek(fd, e.off + e.bytes, SEEK_CUR);
+		pr_msg("pipeid: 0x%8x bytes: 0x%8x\n", e.pipe_id, e.bytes);
+		lseek(fd, e.bytes, SEEK_CUR);
 	}
 }


--- a/include/image.h
+++ b/include/image.h
@@ -161,7 +161,6 @@ struct pipe_entry {
 struct pipe_data_entry {
 	u32	pipe_id;
 	u32	bytes;
-	u32	off;
 	u8	data[0];
 } __packed;

@@ -170,14 +169,6 @@ struct fifo_entry {
 	u32	pipe_id;
 } __packed;

-/*
- * splice() connect cache pages to pipe buffer, so
- * some part of pages may be loosed if data are not
- * aligned in a file.
- */
-#define PIPE_DEF_BUFFERS	16
-#define PIPE_MAX_NONALIG_SIZE	((PIPE_DEF_BUFFERS - 1) * PAGE_SIZE)
-
 #define USK_EXTERN	(1 << 0)

 struct sk_opts_entry {

--- a/include/pipes.h
+++ b/include/pipes.h
@@ -22,6 +22,7 @@ extern int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct f

 struct pipe_data_rst {
 	struct pipe_data_entry	*pde;
+	void *data;
 	struct pipe_data_rst	*next;
 };


--- a/pipes.c
+++ b/pipes.c
@@ -3,6 +3,7 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <stdlib.h>
+#include <sys/mman.h>

 #include "crtools.h"
 #include "image.h"
@@ -37,6 +38,27 @@ static void show_saved_pipe_fds(struct pipe_info *pi)
 		pr_info("   `- FD %d pid %d\n", fle->fe->fd, fle->pid);
 }

+static int pipe_data_read(int fd, struct pipe_data_rst *r)
+{
+	/*
+	 * We potentially allocate more memory than required for data,
+	 * but this is OK. Look at restore_pipe_data -- it vmsplice-s
+	 * this into the kernel with F_GIFT flag (since some time it
+	 * works on non-aligned data), thus just giving this page to
+	 * pipe buffer. And since kernel allocates pipe buffers in pages
+	 * anyway we don't increase memory consumption :)
+	 */
+
+	r->data = mmap(NULL, r->pde->bytes, PROT_READ | PROT_WRITE,
+			MAP_SHARED | MAP_ANON, 0, 0);
+	if (r->data == MAP_FAILED) {
+		pr_perror("Can't map mem for pipe buffers");
+		return -1;
+	}
+
+	return read_img_buf(fd, r->data, r->pde->bytes);
+}
+
 int collect_pipe_data(int img_type, struct pipe_data_rst **hash)
 {
 	int fd, ret;
@@ -47,8 +69,6 @@ int collect_pipe_data(int img_type, struct pipe_data_rst **hash)
 		return -1;

 	while (1) {
-		u32 off;
-
 		ret = -1;
 		r = xmalloc(sizeof(*r));
 		if (!r)
@@ -61,9 +81,9 @@ int collect_pipe_data(int img_type, struct pipe_data_rst **hash)
 		if (ret <= 0)
 			break;

-		off = r->pde->off + lseek(fd, 0, SEEK_CUR);
-		lseek(fd, r->pde->bytes + r->pde->off, SEEK_CUR);
-		r->pde->off = off;
+		ret = pipe_data_read(fd, r);
+		if (ret < 0)
+			break;

 		ret = r->pde->pipe_id & PIPE_DATA_HASH_MASK;
 		r->next = hash[ret];
@@ -132,8 +152,9 @@ static struct pipe_data_rst *pd_hash_pipes[PIPE_DATA_HASH_SIZE];

 int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash)
 {
-	int img, size = 0, ret;
+	int ret;
 	struct pipe_data_rst *pd;
+	struct iovec iov;

 	for (pd = hash[id & PIPE_DATA_HASH_MASK]; pd != NULL; pd = pd->next)
 		if (pd->pde->pipe_id == id)
@@ -144,33 +165,46 @@ int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash
 		return 0;
 	}

-	img = open_image_ro(img_type);
-	if (img < 0)
+	if (!pd->data) {
+		pr_err("Double data restore occurred on %#x\n", id);
 		return -1;
+	}

-	pr_info("\t\tSplicing data size=%u off=%u\n", pd->pde->bytes, pd->pde->off);
-	lseek(img, pd->pde->off, SEEK_SET);
+	iov.iov_base = pd->data;
+	iov.iov_len = pd->pde->bytes;

-	while (size != pd->pde->bytes) {
-		ret = splice(img, NULL, pfd, NULL, pd->pde->bytes - size, 0);
+	while (iov.iov_len > 0) {
+		ret = vmsplice(pfd, &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
 		if (ret < 0) {
 			pr_perror("%#x: Error splicing data", id);
 			goto err;
 		}

-		if (ret == 0) {
-			pr_err("%#x: Wanted to restore %d bytes, but got %d\n",
-			       id, pd->pde->bytes, size);
+		if (ret == 0 || ret > iov.iov_len /* sanity */) {
+			pr_err("%#x: Wanted to restore %lu bytes, but got %d\n", id,
+					iov.iov_len, ret);
 			ret = -1;
 			goto err;
 		}

-		size += ret;
+		iov.iov_base += ret;
+		iov.iov_len -= ret;
 	}

+	/*
+	 * 3 reasons for killing the buffer from our address space:
+	 *
+	 * 1. We gifted the pages to the kernel to optimize memory usage, thus
+	 *    accidental memory corruption can change the pipe buffer.
+	 * 2. This will make the vmas restoration a bit faster due to less self
+	 *    mappings to be unmapped.
+	 * 3. We can catch bugs with double pipe data restore.
+	 */
+
+	munmap(pd->data, pd->pde->bytes);
+	pd->data = NULL;
 	ret = 0;
 err:
-	close(img);
 	return ret;
 }

@@ -362,28 +396,10 @@ int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms

 		pde.pipe_id	= pipe_id(p);
 		pde.bytes	= bytes;
-		pde.off		= 0;
-
-		if (bytes > PIPE_MAX_NONALIG_SIZE) {
-			off_t off;
-
-			off  = lseek(img, 0, SEEK_CUR);
-			off += sizeof(pde);
-			off &= ~PAGE_MASK;
-
-			if (off)
-				pde.off = PAGE_SIZE - off;
-
-			pr_info("\toff %#lx %#x bytes %#x\n", off, pde.off, bytes);
-		}

 		if (write_img(img, &pde))
 			goto err_close;

-		/* Don't forget to advance position if a hole needed */
-		if (pde.off)
-			lseek(img, pde.off, SEEK_CUR);
-
 		wrote = splice(steal_pipe[0], NULL, img, NULL, bytes, 0);
 		if (wrote < 0) {
 			pr_perror("Can't push pipe data");