Commit afa9d8df authored by Mike Rapoport's avatar Mike Rapoport Committed by Pavel Emelyanov

criu: page-xfer: move code aroud

to remove forward declarations and create better grouping
Signed-off-by: 's avatarMike Rapoport <rppt@linux.vnet.ibm.com>
Signed-off-by: 's avatarPavel Emelyanov <xemul@virtuozzo.com>
parent 673ffade
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
#include "protobuf.h" #include "protobuf.h"
#include "images/pagemap.pb-c.h" #include "images/pagemap.pb-c.h"
static int page_server_sk = -1;
struct page_server_iov { struct page_server_iov {
u32 cmd; u32 cmd;
u32 nr_pages; u32 nr_pages;
...@@ -36,8 +38,6 @@ static void iovec2psi(struct iovec *iov, struct page_server_iov *ps) ...@@ -36,8 +38,6 @@ static void iovec2psi(struct iovec *iov, struct page_server_iov *ps)
ps->nr_pages = iov->iov_len / PAGE_SIZE; ps->nr_pages = iov->iov_len / PAGE_SIZE;
} }
static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id);
#define PS_IOV_ADD 1 #define PS_IOV_ADD 1
#define PS_IOV_HOLE 2 #define PS_IOV_HOLE 2
#define PS_IOV_OPEN 3 #define PS_IOV_OPEN 3
...@@ -65,723 +65,720 @@ static long decode_pm_id(u64 dst_id) ...@@ -65,723 +65,720 @@ static long decode_pm_id(u64 dst_id)
return (long)(dst_id >> PS_TYPE_BITS); return (long)(dst_id >> PS_TYPE_BITS);
} }
struct page_xfer_job { /* page-server xfer */
u64 dst_id; static int write_pagemap_to_server(struct page_xfer *xfer,
int p[2]; struct iovec *iov)
unsigned pipe_size; {
struct page_xfer loc_xfer; struct page_server_iov pi;
};
static struct page_xfer_job cxfer = { pi.cmd = PS_IOV_ADD;
.dst_id = ~0, pi.dst_id = xfer->dst_id;
}; iovec2psi(iov, &pi);
static void page_server_close(void) if (write(xfer->sk, &pi, sizeof(pi)) != sizeof(pi)) {
{ pr_perror("Can't write pagemap to server");
if (cxfer.dst_id != ~0) return -1;
cxfer.loc_xfer.close(&cxfer.loc_xfer); }
return 0;
} }
static void close_page_xfer(struct page_xfer *xfer); static int write_pages_to_server(struct page_xfer *xfer,
static int page_server_open(int sk, struct page_server_iov *pi) int p, unsigned long len)
{ {
int type; pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE);
long id;
type = decode_pm_type(pi->dst_id);
id = decode_pm_id(pi->dst_id);
pr_info("Opening %d/%ld\n", type, id);
page_server_close();
if (open_page_local_xfer(&cxfer.loc_xfer, type, id)) if (splice(p, NULL, xfer->sk, NULL, len, SPLICE_F_MOVE) != len) {
pr_perror("Can't write pages to socket");
return -1; return -1;
}
cxfer.dst_id = pi->dst_id; return 0;
}
if (sk >= 0) { static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov)
char has_parent = !!cxfer.loc_xfer.parent; {
struct page_server_iov pi;
if (write(sk, &has_parent, 1) != 1) { pi.cmd = PS_IOV_HOLE;
pr_perror("Unable to send reponse"); pi.dst_id = xfer->dst_id;
close_page_xfer(&cxfer.loc_xfer); iovec2psi(iov, &pi);
return -1;
} if (write(xfer->sk, &pi, sizeof(pi)) != sizeof(pi)) {
pr_perror("Can't write pagehole to server");
return -1;
} }
return 0; return 0;
} }
static int prep_loc_xfer(struct page_server_iov *pi) static void close_server_xfer(struct page_xfer *xfer)
{ {
if (cxfer.dst_id != pi->dst_id) { xfer->sk = -1;
pr_warn("Deprecated IO w/o open\n");
return page_server_open(-1, pi);
} else
return 0;
} }
static int page_server_add(int sk, struct page_server_iov *pi) static int open_page_server_xfer(struct page_xfer *xfer, int fd_type, long id)
{ {
size_t len; struct page_server_iov pi;
struct page_xfer *lxfer = &cxfer.loc_xfer; char has_parent;
struct iovec iov;
pr_debug("Adding %"PRIx64"/%u\n", pi->vaddr, pi->nr_pages); xfer->sk = page_server_sk;
xfer->write_pagemap = write_pagemap_to_server;
xfer->write_pages = write_pages_to_server;
xfer->write_hole = write_hole_to_server;
xfer->close = close_server_xfer;
xfer->dst_id = encode_pm_id(fd_type, id);
xfer->parent = NULL;
if (prep_loc_xfer(pi)) pi.cmd = PS_IOV_OPEN2;
return -1; pi.dst_id = xfer->dst_id;
pi.vaddr = 0;
pi.nr_pages = 0;
psi2iovec(pi, &iov); if (write(xfer->sk, &pi, sizeof(pi)) != sizeof(pi)) {
if (lxfer->write_pagemap(lxfer, &iov)) pr_perror("Can't write to page server");
return -1; return -1;
}
len = iov.iov_len; /* Push the command NOW */
while (len > 0) { tcp_nodelay(xfer->sk, true);
ssize_t chunk;
chunk = len;
if (chunk > cxfer.pipe_size)
chunk = cxfer.pipe_size;
chunk = splice(sk, NULL, cxfer.p[1], NULL, chunk, SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
if (chunk < 0) {
pr_perror("Can't read from socket");
return -1;
}
if (lxfer->write_pages(lxfer, cxfer.p[0], chunk))
return -1;
len -= chunk; if (read(xfer->sk, &has_parent, 1) != 1) {
pr_perror("The page server doesn't answer");
return -1;
} }
if (has_parent)
xfer->parent = (void *) 1; /* This is required for generate_iovs() */
return 0; return 0;
} }
static int page_server_hole(int sk, struct page_server_iov *pi) /* local xfer */
static int write_pagemap_loc(struct page_xfer *xfer,
struct iovec *iov)
{ {
struct page_xfer *lxfer = &cxfer.loc_xfer; int ret;
struct iovec iov; PagemapEntry pe = PAGEMAP_ENTRY__INIT;
pr_debug("Adding %"PRIx64"/%u hole\n", pi->vaddr, pi->nr_pages); iovec2pagemap(iov, &pe);
if (opts.auto_dedup && xfer->parent != NULL) {
ret = dedup_one_iovec(xfer->parent, iov);
if (ret == -1) {
pr_perror("Auto-deduplication failed");
return ret;
}
}
return pb_write_one(xfer->pmi, &pe, PB_PAGEMAP);
}
if (prep_loc_xfer(pi)) static int write_pages_loc(struct page_xfer *xfer,
return -1; int p, unsigned long len)
{
ssize_t ret;
psi2iovec(pi, &iov); ret = splice(p, NULL, img_raw_fd(xfer->pi), NULL, len, SPLICE_F_MOVE);
if (lxfer->write_hole(lxfer, &iov)) if (ret == -1) {
pr_perror("Unable to spice data");
return -1;
}
if (ret != len) {
pr_err("Only %zu of %lu bytes have been spliced\n", ret, len);
return -1; return -1;
}
return 0; return 0;
} }
static int page_server_check_parent(int sk, struct page_server_iov *pi); static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov)
static int page_server_serve(int sk)
{ {
int ret = -1; int ret;
bool flushed = false; unsigned long off, end;
/* /*
* This socket only accepts data except one thing -- it * Try to find pagemap entry in parent, from which
* writes back the has_parent bit from time to time, so * the data will be read on restore.
* make it NODELAY all the time. *
* This is the optimized version of the page-by-page
* read_pagemap_page routine.
*/ */
tcp_nodelay(sk, true);
if (pipe(cxfer.p)) { pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len);
pr_perror("Can't make pipe for xfer"); off = (unsigned long)iov->iov_base;
close(sk); end = off + iov->iov_len;
return -1; while (1) {
} struct iovec piov;
unsigned long pend;
cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0); ret = p->seek_page(p, off, true);
pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size); if (ret <= 0 || !p->pe)
return -1;
while (1) { pagemap2iovec(p->pe, &piov);
struct page_server_iov pi; pr_debug("\tFound %p/%zu\n", piov.iov_base, piov.iov_len);
ret = recv(sk, &pi, sizeof(pi), MSG_WAITALL); /*
if (!ret) * The pagemap entry in parent may heppen to be
break; * shorter, than the hole we write. In this case
* we should go ahead and check the remainder.
*/
if (ret != sizeof(pi)) { pend = (unsigned long)piov.iov_base + piov.iov_len;
pr_perror("Can't read pagemap from socket"); if (end <= pend)
ret = -1; return 0;
break;
}
flushed = false; pr_debug("\t\tcontinue on %lx\n", pend);
off = pend;
}
}
switch (pi.cmd) { static int write_pagehole_loc(struct page_xfer *xfer, struct iovec *iov)
case PS_IOV_OPEN: {
ret = page_server_open(-1, &pi); PagemapEntry pe = PAGEMAP_ENTRY__INIT;
break;
case PS_IOV_OPEN2:
ret = page_server_open(sk, &pi);
break;
case PS_IOV_PARENT:
ret = page_server_check_parent(sk, &pi);
break;
case PS_IOV_ADD:
ret = page_server_add(sk, &pi);
break;
case PS_IOV_HOLE:
ret = page_server_hole(sk, &pi);
break;
case PS_IOV_FLUSH:
case PS_IOV_FLUSH_N_CLOSE:
{
int32_t status = 0;
ret = 0; if (xfer->parent != NULL) {
int ret;
/* ret = check_pagehole_in_parent(xfer->parent, iov);
* An answer must be sent back to inform another side, if (ret) {
* that all data were received pr_err("Hole %p/%zu not found in parent\n",
*/ iov->iov_base, iov->iov_len);
if (write(sk, &status, sizeof(status)) != sizeof(status)) { return -1;
pr_perror("Can't send the final package"); }
ret = -1; }
}
flushed = true; iovec2pagemap(iov, &pe);
break; pe.has_in_parent = true;
} pe.in_parent = true;
default:
pr_err("Unknown command %u\n", pi.cmd);
ret = -1;
break;
}
if (ret || (pi.cmd == PS_IOV_FLUSH_N_CLOSE)) if (pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0)
break; return -1;
return 0;
}
static void close_page_xfer(struct page_xfer *xfer)
{
if (xfer->parent != NULL) {
xfer->parent->close(xfer->parent);
xfree(xfer->parent);
xfer->parent = NULL;
} }
close_image(xfer->pi);
close_image(xfer->pmi);
}
if (!ret && !flushed) { static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
pr_err("The data were not flushed\n"); {
ret = -1; xfer->pmi = open_image(fd_type, O_DUMP, id);
if (!xfer->pmi)
return -1;
xfer->pi = open_pages_image(O_DUMP, xfer->pmi);
if (!xfer->pi) {
close_image(xfer->pmi);
return -1;
} }
if (ret == 0 && opts.ps_socket == -1) { /*
char c; * Open page-read for parent images (if it exists). It will
* be used for two things:
* 1) when writing a page, those from parent will be dedup-ed
* 2) when writing a hole, the respective place would be checked
* to exist in parent (either pagemap or hole)
*/
xfer->parent = NULL;
if (fd_type == CR_FD_PAGEMAP) {
int ret;
int pfd;
/* pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
* Wait when a remote side closes the connection if (pfd < 0 && errno == ENOENT)
* to avoid TIME_WAIT bucket goto out;
*/
if (read(sk, &c, sizeof(c)) != 0) { xfer->parent = xmalloc(sizeof(*xfer->parent));
pr_perror("Unexpected data"); if (!xfer->parent) {
ret = -1; close(pfd);
return -1;
}
ret = open_page_read_at(pfd, id, xfer->parent, PR_TASK);
if (ret <= 0) {
pr_perror("No parent image found, though parent directory is set");
xfree(xfer->parent);
xfer->parent = NULL;
close(pfd);
goto out;
} }
close(pfd);
} }
page_server_close(); out:
pr_info("Session over\n"); xfer->write_pagemap = write_pagemap_loc;
xfer->write_pages = write_pages_loc;
xfer->write_hole = write_pagehole_loc;
xfer->close = close_page_xfer;
return 0;
}
close(sk); int open_page_xfer(struct page_xfer *xfer, int fd_type, long id)
return ret; {
if (opts.use_page_server)
return open_page_server_xfer(xfer, fd_type, id);
else
return open_page_local_xfer(xfer, fd_type, id);
} }
int cr_page_server(bool daemon_mode, int cfd) int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp,
unsigned long off)
{ {
int ask = -1; struct page_pipe_buf *ppb;
int sk = -1; struct iovec *hole = NULL;
int ret;
up_page_ids_base(); pr_debug("Transfering pages:\n");
if (opts.ps_socket != -1) { if (pp->free_hole)
ret = 0; hole = &pp->holes[0];
ask = opts.ps_socket;
pr_info("Re-using ps socket %d\n", ask);
goto no_server;
}
sk = setup_tcp_server("page"); list_for_each_entry(ppb, &pp->bufs, l) {
if (sk == -1) int i;
return -1;
no_server:
ret = run_tcp_server(daemon_mode, &ask, cfd, sk);
if (ret != 0)
return ret;
if (ask >= 0) pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
ret = page_server_serve(ask);
if (daemon_mode) for (i = 0; i < ppb->nr_segs; i++) {
exit(ret); struct iovec *iov = &ppb->iov[i];
return ret; while (hole && (hole->iov_base < iov->iov_base)) {
BUG_ON(hole->iov_base < (void *)off);
hole->iov_base -= off;
pr_debug("\th %p [%u]\n", hole->iov_base,
(unsigned int)(hole->iov_len / PAGE_SIZE));
if (xfer->write_hole(xfer, hole))
return -1;
} hole++;
if (hole >= &pp->holes[pp->free_hole])
hole = NULL;
}
static int page_server_sk = -1; BUG_ON(iov->iov_base < (void *)off);
iov->iov_base -= off;
pr_debug("\tp %p [%u]\n", iov->iov_base,
(unsigned int)(iov->iov_len / PAGE_SIZE));
int connect_to_page_server(void) if (xfer->write_pagemap(xfer, iov))
{ return -1;
if (!opts.use_page_server) if (xfer->write_pages(xfer, ppb->p[0], iov->iov_len))
return 0; return -1;
}
}
if (opts.ps_socket != -1) { while (hole) {
page_server_sk = opts.ps_socket; BUG_ON(hole->iov_base < (void *)off);
pr_info("Re-using ps socket %d\n", page_server_sk); hole->iov_base -= off;
goto out; pr_debug("\th* %p [%u]\n", hole->iov_base,
(unsigned int)(hole->iov_len / PAGE_SIZE));
if (xfer->write_hole(xfer, hole))
return -1;
hole++;
if (hole >= &pp->holes[pp->free_hole])
hole = NULL;
} }
page_server_sk = setup_tcp_client(opts.addr);
if (page_server_sk == -1)
return -1;
out:
/*
* CORK the socket at the very beginning. As per ANK
* the corked by default socket with sporadic NODELAY-s
* on urgent data is the smartest mode ever.
*/
tcp_cork(page_server_sk, true);
return 0; return 0;
} }
int disconnect_from_page_server(void) /*
* Return:
* 1 - if a parent image exists
* 0 - if a parent image doesn't exist
* -1 - in error cases
*/
int check_parent_local_xfer(int fd_type, int id)
{ {
struct page_server_iov pi = { }; char path[PATH_MAX];
int32_t status = -1; struct stat st;
int ret = -1; int ret, pfd;
if (!opts.use_page_server)
return 0;
if (page_server_sk == -1) pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
if (pfd < 0 && errno == ENOENT)
return 0; return 0;
pr_info("Disconnect from the page server %s:%u\n", snprintf(path, sizeof(path), imgset_template[fd_type].fmt, id);
opts.addr, (int)ntohs(opts.port)); ret = fstatat(pfd, path, &st, 0);
if (ret == -1 && errno != ENOENT) {
if (opts.ps_socket != -1) pr_perror("Unable to stat %s", path);
/* close(pfd);
* The socket might not get closed (held by return -1;
* the parent process) so we must order the
* page-server to terminate itself.
*/
pi.cmd = PS_IOV_FLUSH_N_CLOSE;
else
pi.cmd = PS_IOV_FLUSH;
if (write(page_server_sk, &pi, sizeof(pi)) != sizeof(pi)) {
pr_perror("Can't write the fini command to server");
goto out;
}
if (read(page_server_sk, &status, sizeof(status)) != sizeof(status)) {
pr_perror("The page server doesn't answer");
goto out;
} }
ret = 0; close(pfd);
out: return (ret == 0);
close_safe(&page_server_sk);
return ret ? : status;
} }
static int write_pagemap_to_server(struct page_xfer *xfer, /* page server */
struct iovec *iov) static int page_server_check_parent(int sk, struct page_server_iov *pi)
{ {
struct page_server_iov pi; int type, ret;
long id;
pi.cmd = PS_IOV_ADD; type = decode_pm_type(pi->dst_id);
pi.dst_id = xfer->dst_id; id = decode_pm_id(pi->dst_id);
iovec2psi(iov, &pi);
if (write(xfer->sk, &pi, sizeof(pi)) != sizeof(pi)) { ret = check_parent_local_xfer(type, id);
pr_perror("Can't write pagemap to server"); if (ret < 0)
return -1;
if (write(sk, &ret, sizeof(ret)) != sizeof(ret)) {
pr_perror("Unable to send reponse");
return -1; return -1;
} }
return 0; return 0;
} }
static int write_pages_to_server(struct page_xfer *xfer, static int check_parent_server_xfer(int fd_type, long id)
int p, unsigned long len)
{ {
pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE); struct page_server_iov pi = {};
int has_parent;
if (splice(p, NULL, xfer->sk, NULL, len, SPLICE_F_MOVE) != len) { pi.cmd = PS_IOV_PARENT;
pr_perror("Can't write pages to socket"); pi.dst_id = encode_pm_id(fd_type, id);
if (write(page_server_sk, &pi, sizeof(pi)) != sizeof(pi)) {
pr_perror("Can't write to page server");
return -1; return -1;
} }
return 0; tcp_nodelay(page_server_sk, true);
}
static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov)
{
struct page_server_iov pi;
pi.cmd = PS_IOV_HOLE;
pi.dst_id = xfer->dst_id;
iovec2psi(iov, &pi);
if (write(xfer->sk, &pi, sizeof(pi)) != sizeof(pi)) { if (read(page_server_sk, &has_parent, sizeof(int)) != sizeof(int)) {
pr_perror("Can't write pagehole to server"); pr_perror("The page server doesn't answer");
return -1; return -1;
} }
return 0; return has_parent;
} }
static void close_server_xfer(struct page_xfer *xfer) int check_parent_page_xfer(int fd_type, long id)
{ {
xfer->sk = -1; if (opts.use_page_server)
return check_parent_server_xfer(fd_type, id);
else
return check_parent_local_xfer(fd_type, id);
} }
static int open_page_server_xfer(struct page_xfer *xfer, int fd_type, long id) struct page_xfer_job {
u64 dst_id;
int p[2];
unsigned pipe_size;
struct page_xfer loc_xfer;
};
static struct page_xfer_job cxfer = {
.dst_id = ~0,
};
static void page_server_close(void)
{ {
struct page_server_iov pi; if (cxfer.dst_id != ~0)
char has_parent; cxfer.loc_xfer.close(&cxfer.loc_xfer);
}
xfer->sk = page_server_sk; static int page_server_open(int sk, struct page_server_iov *pi)
xfer->write_pagemap = write_pagemap_to_server; {
xfer->write_pages = write_pages_to_server; int type;
xfer->write_hole = write_hole_to_server; long id;
xfer->close = close_server_xfer;
xfer->dst_id = encode_pm_id(fd_type, id);
xfer->parent = NULL;
pi.cmd = PS_IOV_OPEN2; type = decode_pm_type(pi->dst_id);
pi.dst_id = xfer->dst_id; id = decode_pm_id(pi->dst_id);
pi.vaddr = 0; pr_info("Opening %d/%ld\n", type, id);
pi.nr_pages = 0;
if (write(xfer->sk, &pi, sizeof(pi)) != sizeof(pi)) { page_server_close();
pr_perror("Can't write to page server");
if (open_page_local_xfer(&cxfer.loc_xfer, type, id))
return -1; return -1;
}
/* Push the command NOW */ cxfer.dst_id = pi->dst_id;
tcp_nodelay(xfer->sk, true);
if (read(xfer->sk, &has_parent, 1) != 1) { if (sk >= 0) {
pr_perror("The page server doesn't answer"); char has_parent = !!cxfer.loc_xfer.parent;
return -1;
}
if (has_parent) if (write(sk, &has_parent, 1) != 1) {
xfer->parent = (void *) 1; /* This is required for generate_iovs() */ pr_perror("Unable to send reponse");
close_page_xfer(&cxfer.loc_xfer);
return -1;
}
}
return 0; return 0;
} }
static int write_pagemap_loc(struct page_xfer *xfer, static int prep_loc_xfer(struct page_server_iov *pi)
struct iovec *iov)
{ {
int ret; if (cxfer.dst_id != pi->dst_id) {
PagemapEntry pe = PAGEMAP_ENTRY__INIT; pr_warn("Deprecated IO w/o open\n");
return page_server_open(-1, pi);
iovec2pagemap(iov, &pe); } else
if (opts.auto_dedup && xfer->parent != NULL) { return 0;
ret = dedup_one_iovec(xfer->parent, iov);
if (ret == -1) {
pr_perror("Auto-deduplication failed");
return ret;
}
}
return pb_write_one(xfer->pmi, &pe, PB_PAGEMAP);
} }
static int write_pages_loc(struct page_xfer *xfer, static int page_server_add(int sk, struct page_server_iov *pi)
int p, unsigned long len)
{ {
ssize_t ret; size_t len;
struct page_xfer *lxfer = &cxfer.loc_xfer;
struct iovec iov;
ret = splice(p, NULL, img_raw_fd(xfer->pi), NULL, len, SPLICE_F_MOVE); pr_debug("Adding %"PRIx64"/%u\n", pi->vaddr, pi->nr_pages);
if (ret == -1) {
pr_perror("Unable to spice data");
return -1;
}
if (ret != len) {
pr_err("Only %zu of %lu bytes have been spliced\n", ret, len);
return -1;
}
return 0; if (prep_loc_xfer(pi))
} return -1;
static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov) psi2iovec(pi, &iov);
{ if (lxfer->write_pagemap(lxfer, &iov))
int ret; return -1;
unsigned long off, end;
/* len = iov.iov_len;
* Try to find pagemap entry in parent, from which while (len > 0) {
* the data will be read on restore. ssize_t chunk;
*
* This is the optimized version of the page-by-page
* read_pagemap_page routine.
*/
pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len); chunk = len;
off = (unsigned long)iov->iov_base; if (chunk > cxfer.pipe_size)
end = off + iov->iov_len; chunk = cxfer.pipe_size;
while (1) {
struct iovec piov;
unsigned long pend;
ret = p->seek_page(p, off, true); chunk = splice(sk, NULL, cxfer.p[1], NULL, chunk, SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
if (ret <= 0 || !p->pe) if (chunk < 0) {
pr_perror("Can't read from socket");
return -1; return -1;
}
pagemap2iovec(p->pe, &piov); if (lxfer->write_pages(lxfer, cxfer.p[0], chunk))
pr_debug("\tFound %p/%zu\n", piov.iov_base, piov.iov_len); return -1;
/*
* The pagemap entry in parent may heppen to be
* shorter, than the hole we write. In this case
* we should go ahead and check the remainder.
*/
pend = (unsigned long)piov.iov_base + piov.iov_len;
if (end <= pend)
return 0;
pr_debug("\t\tcontinue on %lx\n", pend); len -= chunk;
off = pend;
} }
return 0;
} }
static int write_pagehole_loc(struct page_xfer *xfer, struct iovec *iov) static int page_server_hole(int sk, struct page_server_iov *pi)
{ {
PagemapEntry pe = PAGEMAP_ENTRY__INIT; struct page_xfer *lxfer = &cxfer.loc_xfer;
struct iovec iov;
if (xfer->parent != NULL) {
int ret;
ret = check_pagehole_in_parent(xfer->parent, iov); pr_debug("Adding %"PRIx64"/%u hole\n", pi->vaddr, pi->nr_pages);
if (ret) {
pr_err("Hole %p/%zu not found in parent\n",
iov->iov_base, iov->iov_len);
return -1;
}
}
iovec2pagemap(iov, &pe); if (prep_loc_xfer(pi))
pe.has_in_parent = true; return -1;
pe.in_parent = true;
if (pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0) psi2iovec(pi, &iov);
if (lxfer->write_hole(lxfer, &iov))
return -1; return -1;
return 0; return 0;
} }
static void close_page_xfer(struct page_xfer *xfer) static int page_server_serve(int sk)
{
if (xfer->parent != NULL) {
xfer->parent->close(xfer->parent);
xfree(xfer->parent);
xfer->parent = NULL;
}
close_image(xfer->pi);
close_image(xfer->pmi);
}
int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp,
unsigned long off)
{ {
struct page_pipe_buf *ppb; int ret = -1;
struct iovec *hole = NULL; bool flushed = false;
pr_debug("Transfering pages:\n"); /*
* This socket only accepts data except one thing -- it
* writes back the has_parent bit from time to time, so
* make it NODELAY all the time.
*/
tcp_nodelay(sk, true);
if (pp->free_hole) if (pipe(cxfer.p)) {
hole = &pp->holes[0]; pr_perror("Can't make pipe for xfer");
close(sk);
return -1;
}
list_for_each_entry(ppb, &pp->bufs, l) { cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0);
int i; pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size);
pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs); while (1) {
struct page_server_iov pi;
for (i = 0; i < ppb->nr_segs; i++) { ret = recv(sk, &pi, sizeof(pi), MSG_WAITALL);
struct iovec *iov = &ppb->iov[i]; if (!ret)
break;
while (hole && (hole->iov_base < iov->iov_base)) { if (ret != sizeof(pi)) {
BUG_ON(hole->iov_base < (void *)off); pr_perror("Can't read pagemap from socket");
hole->iov_base -= off; ret = -1;
pr_debug("\th %p [%u]\n", hole->iov_base, break;
(unsigned int)(hole->iov_len / PAGE_SIZE));
if (xfer->write_hole(xfer, hole))
return -1;
hole++;
if (hole >= &pp->holes[pp->free_hole])
hole = NULL;
}
BUG_ON(iov->iov_base < (void *)off);
iov->iov_base -= off;
pr_debug("\tp %p [%u]\n", iov->iov_base,
(unsigned int)(iov->iov_len / PAGE_SIZE));
if (xfer->write_pagemap(xfer, iov))
return -1;
if (xfer->write_pages(xfer, ppb->p[0], iov->iov_len))
return -1;
} }
}
while (hole) { flushed = false;
BUG_ON(hole->iov_base < (void *)off);
hole->iov_base -= off;
pr_debug("\th* %p [%u]\n", hole->iov_base,
(unsigned int)(hole->iov_len / PAGE_SIZE));
if (xfer->write_hole(xfer, hole))
return -1;
hole++; switch (pi.cmd) {
if (hole >= &pp->holes[pp->free_hole]) case PS_IOV_OPEN:
hole = NULL; ret = page_server_open(-1, &pi);
} break;
case PS_IOV_OPEN2:
ret = page_server_open(sk, &pi);
break;
case PS_IOV_PARENT:
ret = page_server_check_parent(sk, &pi);
break;
case PS_IOV_ADD:
ret = page_server_add(sk, &pi);
break;
case PS_IOV_HOLE:
ret = page_server_hole(sk, &pi);
break;
case PS_IOV_FLUSH:
case PS_IOV_FLUSH_N_CLOSE:
{
int32_t status = 0;
return 0; ret = 0;
}
static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id) /*
{ * An answer must be sent back to inform another side,
xfer->pmi = open_image(fd_type, O_DUMP, id); * that all data were received
if (!xfer->pmi) */
return -1; if (write(sk, &status, sizeof(status)) != sizeof(status)) {
pr_perror("Can't send the final package");
ret = -1;
}
xfer->pi = open_pages_image(O_DUMP, xfer->pmi); flushed = true;
if (!xfer->pi) { break;
close_image(xfer->pmi); }
return -1; default:
pr_err("Unknown command %u\n", pi.cmd);
ret = -1;
break;
}
if (ret || (pi.cmd == PS_IOV_FLUSH_N_CLOSE))
break;
} }
/* if (!ret && !flushed) {
* Open page-read for parent images (if it exists). It will pr_err("The data were not flushed\n");
* be used for two things: ret = -1;
* 1) when writing a page, those from parent will be dedup-ed }
* 2) when writing a hole, the respective place would be checked
* to exist in parent (either pagemap or hole)
*/
xfer->parent = NULL;
if (fd_type == CR_FD_PAGEMAP) {
int ret;
int pfd;
pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); if (ret == 0 && opts.ps_socket == -1) {
if (pfd < 0 && errno == ENOENT) char c;
goto out;
xfer->parent = xmalloc(sizeof(*xfer->parent)); /*
if (!xfer->parent) { * Wait when a remote side closes the connection
close(pfd); * to avoid TIME_WAIT bucket
return -1; */
}
ret = open_page_read_at(pfd, id, xfer->parent, PR_TASK); if (read(sk, &c, sizeof(c)) != 0) {
if (ret <= 0) { pr_perror("Unexpected data");
pr_perror("No parent image found, though parent directory is set"); ret = -1;
xfree(xfer->parent);
xfer->parent = NULL;
close(pfd);
goto out;
} }
close(pfd);
} }
out: page_server_close();
xfer->write_pagemap = write_pagemap_loc; pr_info("Session over\n");
xfer->write_pages = write_pages_loc;
xfer->write_hole = write_pagehole_loc;
xfer->close = close_page_xfer;
return 0;
}
int open_page_xfer(struct page_xfer *xfer, int fd_type, long id) close(sk);
{ return ret;
if (opts.use_page_server)
return open_page_server_xfer(xfer, fd_type, id);
else
return open_page_local_xfer(xfer, fd_type, id);
} }
/* int cr_page_server(bool daemon_mode, int cfd)
* Return:
* 1 - if a parent image exists
* 0 - if a parent image doesn't exist
* -1 - in error cases
*/
int check_parent_local_xfer(int fd_type, int id)
{ {
char path[PATH_MAX]; int ask = -1;
struct stat st; int sk = -1;
int ret, pfd; int ret;
pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); up_page_ids_base();
if (pfd < 0 && errno == ENOENT)
return 0;
snprintf(path, sizeof(path), imgset_template[fd_type].fmt, id); if (opts.ps_socket != -1) {
ret = fstatat(pfd, path, &st, 0); ret = 0;
if (ret == -1 && errno != ENOENT) { ask = opts.ps_socket;
pr_perror("Unable to stat %s", path); pr_info("Re-using ps socket %d\n", ask);
close(pfd); goto no_server;
return -1;
} }
close(pfd); sk = setup_tcp_server("page");
return (ret == 0); if (sk == -1)
} return -1;
no_server:
ret = run_tcp_server(daemon_mode, &ask, cfd, sk);
if (ret != 0)
return ret;
static int page_server_check_parent(int sk, struct page_server_iov *pi) if (ask >= 0)
{ ret = page_server_serve(ask);
int type, ret;
long id;
type = decode_pm_type(pi->dst_id); if (daemon_mode)
id = decode_pm_id(pi->dst_id); exit(ret);
ret = check_parent_local_xfer(type, id); return ret;
if (ret < 0) }
return -1;
if (write(sk, &ret, sizeof(ret)) != sizeof(ret)) { int connect_to_page_server(void)
pr_perror("Unable to send reponse"); {
return -1; if (!opts.use_page_server)
return 0;
if (opts.ps_socket != -1) {
page_server_sk = opts.ps_socket;
pr_info("Re-using ps socket %d\n", page_server_sk);
goto out;
} }
page_server_sk = setup_tcp_client(opts.addr);
if (page_server_sk == -1)
return -1;
out:
/*
* CORK the socket at the very beginning. As per ANK
* the corked by default socket with sporadic NODELAY-s
* on urgent data is the smartest mode ever.
*/
tcp_cork(page_server_sk, true);
return 0; return 0;
} }
static int check_parent_server_xfer(int fd_type, long id) int disconnect_from_page_server(void)
{ {
struct page_server_iov pi = {}; struct page_server_iov pi = { };
int has_parent; int32_t status = -1;
int ret = -1;
pi.cmd = PS_IOV_PARENT; if (!opts.use_page_server)
pi.dst_id = encode_pm_id(fd_type, id); return 0;
if (page_server_sk == -1)
return 0;
pr_info("Disconnect from the page server %s:%u\n",
opts.addr, (int)ntohs(opts.port));
if (opts.ps_socket != -1)
/*
* The socket might not get closed (held by
* the parent process) so we must order the
* page-server to terminate itself.
*/
pi.cmd = PS_IOV_FLUSH_N_CLOSE;
else
pi.cmd = PS_IOV_FLUSH;
if (write(page_server_sk, &pi, sizeof(pi)) != sizeof(pi)) { if (write(page_server_sk, &pi, sizeof(pi)) != sizeof(pi)) {
pr_perror("Can't write to page server"); pr_perror("Can't write the fini command to server");
return -1; goto out;
} }
tcp_nodelay(page_server_sk, true); if (read(page_server_sk, &status, sizeof(status)) != sizeof(status)) {
if (read(page_server_sk, &has_parent, sizeof(int)) != sizeof(int)) {
pr_perror("The page server doesn't answer"); pr_perror("The page server doesn't answer");
return -1; goto out;
} }
return has_parent; ret = 0;
} out:
close_safe(&page_server_sk);
int check_parent_page_xfer(int fd_type, long id) return ret ? : status;
{
if (opts.use_page_server)
return check_parent_server_xfer(fd_type, id);
else
return check_parent_local_xfer(fd_type, id);
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment