Commit d6a1cd0f, authored by Andrey Vagin, committed by Cyrill Gorcunov

restore: Learn to work with shared struct file-s

Several processes can share a single struct file; such files are identified by their "object IDs".
The file descriptor is opened in one process and sent to the others via a unix socket.

The procedure for restoring files consists of four stages.
* Collect data about all file descriptors
  At this stage we find the process which will restore a file descriptor and
  create a list of the processes that should receive this descriptor.

* Create datagram unix sockets
  If a file descriptor is to be received, a unix socket is created
  in its place.

* Open file descriptors
  The process with the lowest pid opens the file and sends the file
  descriptor to every process waiting for it.

* Receive file descriptors.

When designing this algorithm, we wanted to minimize the number
of context switches. The number of context switches is proportional to the
number of processes.
Signed-off-by: Andrey Vagin <avagin@openvz.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
parent b5cc5fc3
......@@ -94,7 +94,7 @@ err_bogus_mapping:
static int dump_one_reg_file(int type, unsigned long fd_name, int lfd,
bool do_close, unsigned long pos, unsigned int flags,
struct cr_fdset *cr_fdset)
char *id, struct cr_fdset *cr_fdset)
{
struct fdinfo_entry e;
char fd_str[128];
......@@ -120,6 +120,8 @@ static int dump_one_reg_file(int type, unsigned long fd_name, int lfd,
e.flags = flags;
e.pos = pos;
e.addr = fd_name;
if (id)
memcpy(e.id, id, FD_ID_SIZE);
pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8x addr: %16lx\n",
type, len, flags, pos, fd_name);
......@@ -143,7 +145,7 @@ static int dump_cwd(char *path, struct cr_fdset *cr_fdset)
return -1;
}
return dump_one_reg_file(FDINFO_FD, ~0L, fd, 1, 0, 0, cr_fdset);
return dump_one_reg_file(FDINFO_FD, ~0L, fd, 1, 0, 0, NULL, cr_fdset);
}
......@@ -231,7 +233,7 @@ err:
}
static int dump_one_fd(char *pid_fd_dir, int dir, char *fd_name, unsigned long pos,
unsigned int flags, struct cr_fdset *cr_fdset)
unsigned int flags, char *id, struct cr_fdset *cr_fdset)
{
struct statfs stfs_buf;
struct stat st_buf;
......@@ -269,7 +271,7 @@ static int dump_one_fd(char *pid_fd_dir, int dir, char *fd_name, unsigned long p
S_ISDIR(st_buf.st_mode) ||
(S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR))
return dump_one_reg_file(FDINFO_FD, atol(fd_name),
fd, 1, pos, flags, cr_fdset);
fd, 1, pos, flags, id, cr_fdset);
if (S_ISFIFO(st_buf.st_mode)) {
if (fstatfs(fd, &stfs_buf) < 0) {
......@@ -290,9 +292,11 @@ out_close:
return err;
}
static int read_fd_params(pid_t pid, char *fd, unsigned long *pos, unsigned int *flags)
static int read_fd_params(pid_t pid, char *fd, unsigned long *pos,
unsigned int *flags, char *id)
{
FILE *file;
unsigned int f;
file = fopen_proc("%d/fdinfo/%s", "r", pid, fd);
if (!file) {
......@@ -300,10 +304,11 @@ static int read_fd_params(pid_t pid, char *fd, unsigned long *pos, unsigned int
return -1;
}
fscanf(file, "pos:\t%li\nflags:\t%o\n", pos, flags);
fscanf(file, "pos:\t%li\nflags:\t%o\nid:\t%s\n", pos, flags, id);
fclose(file);
pr_info("%d fdinfo %s: pos: %16lx flags: %16lx\n", pid, fd, *pos, *flags);
pr_info("%d fdinfo %s: pos: %16lx flags: %16o id %s\n",
pid, fd, *pos, *flags, id);
return 0;
}
......@@ -314,6 +319,7 @@ static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset)
struct dirent *de;
unsigned long pos;
unsigned int flags;
char id[FD_ID_SIZE];
DIR *fd_dir;
pr_info("\n");
......@@ -336,9 +342,10 @@ static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset)
while ((de = readdir(fd_dir))) {
if (de->d_name[0] == '.')
continue;
if (read_fd_params(pid, de->d_name, &pos, &flags))
if (read_fd_params(pid, de->d_name, &pos, &flags, id))
return -1;
if (dump_one_fd(pid_fd_dir, dirfd(fd_dir), de->d_name, pos, flags, cr_fdset))
if (dump_one_fd(pid_fd_dir, dirfd(fd_dir), de->d_name,
pos, flags, id, cr_fdset))
return -1;
}
......@@ -393,7 +400,7 @@ static int dump_task_mappings(pid_t pid, struct list_head *vma_area_list, struct
ret = dump_one_reg_file(FDINFO_MAP,
vma->start,
vma_area->vm_file_fd,
0, 0, flags,
0, 0, flags, NULL,
cr_fdset);
if (ret)
goto err;
......
......@@ -382,6 +382,9 @@ static int prepare_shared(int ps_fd)
return -1;
}
if (prepare_fdinfo_global())
return -1;
while (1) {
struct pstree_entry e;
int ret;
......@@ -401,6 +404,9 @@ static int prepare_shared(int ps_fd)
if (prepare_pipes_pid(e.pid))
return -1;
if (prepare_fd_pid(e.pid))
return -1;
lseek(ps_fd, e.nr_children * sizeof(u32) + e.nr_threads * sizeof(u32), SEEK_CUR);
}
......
#include <unistd.h>
#include <fcntl.h>
#include <errno.h>
#include <linux/limits.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
#include "crtools.h"
#include "files.h"
......@@ -11,6 +18,17 @@
#include "util.h"
#include "lock.h"
/*
 * Capacity, in bytes, of sockaddr_un.sun_path.
 *
 * Taken with sizeof on the member so the null pointer is only named in an
 * unevaluated context, instead of computing the member offset by casting a
 * pointer derived from a null struct pointer.
 */
#define UNIX_PATH_MAX	(sizeof(((struct sockaddr_un *)0)->sun_path))
/*
 * Passes made over a task's fdinfo image while restoring its files.
 * prepare_fds() iterates from the first value up to (not including)
 * FD_STATE_MAX, so the numeric order is the execution order.
 */
enum fdinfo_states {
	FD_STATE_PREP	= 0,	/* Create unix sockets */
	FD_STATE_CREATE	= 1,	/* Create and send fd */
	FD_STATE_RECV	= 2,	/* Receive fd */

	FD_STATE_MAX	= 3	/* Number of passes, not a pass itself */
};
struct fmap_fd {
struct fmap_fd *next;
unsigned long start;
......@@ -18,8 +36,44 @@ struct fmap_fd {
int fd;
};
/*
 * One record per distinct file "object id" seen while collecting fdinfo
 * images. Records live in the fdinfo_descs table.
 */
struct fdinfo_desc {
char id[FD_ID_SIZE]; /* object id; matched with strncmp() over FD_ID_SIZE bytes */
u64 addr; /* fd number used by the process chosen to open the file */
int pid; /* lowest pid seen referencing this id; that process opens the file */
u32 real_pid; /* futex */ /* set to getpid() by the opener in open_fd() */
u32 users; /* futex */ /* number of list entries attached; counted down in open_fd() */
struct list_head list; /* head of the fdinfo_list_entry records sharing this id */
};
/*
 * One record per (pid, fd) pair that refers to a collected file.
 * Records live in the fdinfo_list table and are chained onto the
 * matching fdinfo_desc.
 */
struct fdinfo_list_entry {
struct list_head list; /* link in fdinfo_desc.list */
int fd; /* fd number inside the owning process */
int pid; /* pid taken from the image this entry was read from */
u32 real_pid; /* zeroed on collect; set to getpid() by the receiver once its
	       * transport socket is in place, and waited on by the sender */
};
/* Table of per-file records; mapped by prepare_fdinfo_global(). */
static struct fdinfo_desc *fdinfo_descs;
/* Number of used slots in fdinfo_descs. */
static int nr_fdinfo_descs;
/* Table of per-(pid, fd) records; mapped by prepare_fdinfo_global(). */
static struct fdinfo_list_entry *fdinfo_list;
/* Number of used slots in fdinfo_list. */
static int nr_fdinfo_list;
/* Head of the fmap_fd chain (linked through fmap_fd.next); its users are
 * outside the visible part of this file. */
static struct fmap_fd *fmap_fds;
/*
 * Look up the collected record whose object id equals @id.
 *
 * At most FD_ID_SIZE bytes of the ids are compared. Records are scanned
 * in insertion order and the first match wins.
 *
 * Returns the matching record, or NULL when none matches.
 */
static struct fdinfo_desc *find_fd(char *id)
{
	int idx = 0;

	while (idx < nr_fdinfo_descs) {
		struct fdinfo_desc *cand = &fdinfo_descs[idx++];

		if (strncmp(cand->id, id, FD_ID_SIZE) == 0)
			return cand;
	}

	return NULL;
}
static int get_file_path(char *path, struct fdinfo_entry *fe, int fd)
{
if (read(fd, path, fe->len) != fe->len) {
......@@ -32,6 +86,110 @@ static int get_file_path(char *path, struct fdinfo_entry *fe, int fd)
return 0;
}
/*
 * Map the two tables used to track shared file descriptors: fdinfo_descs
 * and fdinfo_list. Each is a 4096-byte anonymous MAP_SHARED region.
 *
 * Returns 0 on success, -1 if either mapping fails.
 */
int prepare_fdinfo_global(void)
{
	/* fd is -1: portable code must not pass a real fd with MAP_ANON */
	fdinfo_descs = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
	if (fdinfo_descs == MAP_FAILED) {
		pr_perror("Can't map fdinfo_descs\n");
		/* fail here; calling ourselves again would recurse without bound */
		return -1;
	}

	fdinfo_list = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
	if (fdinfo_list == MAP_FAILED) {
		pr_perror("Can't map fdinfo_list\n");
		return -1;
	}

	return 0;
}
static int collect_fd(int pid, struct fdinfo_entry *e)
{
int i;
struct fdinfo_list_entry *le = &fdinfo_list[nr_fdinfo_list];
struct fdinfo_desc *desc;
pr_info("Collect fdinfo pid=%d fd=%d id=%s\n", pid, e->addr, e->id);
nr_fdinfo_list++;
le->pid = pid;
le->fd = e->addr;
le->real_pid = 0;
for (i = 0; i < nr_fdinfo_descs; i++) {
desc = &fdinfo_descs[i];
if (strncmp(desc->id, e->id, FD_ID_SIZE))
continue;
fdinfo_descs[i].users++;
list_add(&le->list, &desc->list);
if (fdinfo_descs[i].pid < pid)
return 0;
desc->pid = pid;
desc->addr = e->addr;
return 0;
}
if ((nr_fdinfo_descs + 1) * sizeof(struct fdinfo_desc) >= 4096) {
pr_panic("OOM storing pipes\n");
return -1;
}
desc = &fdinfo_descs[nr_fdinfo_descs];
memset(desc, 0, sizeof(fdinfo_descs[nr_fdinfo_descs]));
memcpy(desc->id, e->id, FD_ID_SIZE);
desc->addr= e->addr;
desc->pid = pid;
desc->users = 1;
INIT_LIST_HEAD(&desc->list);
list_add(&le->list, &desc->list);
nr_fdinfo_descs++;
return 0;
}
int prepare_fd_pid(int pid)
{
int fdinfo_fd;
u32 type = 0;
fdinfo_fd = open_image_ro(CR_FD_FDINFO, pid);
if (fdinfo_fd < 0) {
pr_perror("%d: Can't open fdinfo image\n", pid);
return -1;
}
while (1) {
int ret;
struct fdinfo_entry e;
ret = read(fdinfo_fd, &e, sizeof(e));
if (ret == 0)
break;
if (ret != sizeof(e)) {
pr_perror("%d: Read fdinfo failed %d (expected %li)\n",
pid, ret, sizeof(e));
return -1;
}
if (e.len)
lseek(fdinfo_fd, e.len, SEEK_CUR);
if (e.type == FDINFO_MAP)
continue;
if (e.addr == -1)
continue;
if (collect_fd(pid, &e))
return -1;
}
close(fdinfo_fd);
return 0;
}
static int open_fe_fd(struct fdinfo_entry *fe, int fd)
{
char path[PATH_MAX];
......@@ -69,20 +227,214 @@ static int restore_cwd(struct fdinfo_entry *fe, int fd)
return 0;
}
static int open_fd(int pid, struct fdinfo_entry *fe, int *cfd)
struct fdinfo_list_entry *find_fdinfo_list_entry(int pid, int fd, struct fdinfo_desc *fi)
{
int fd, tmp;
struct fdinfo_list_entry *fle;
int found = 0;
list_for_each_entry(fle, &fi->list, list) {
if (fle->fd == fd && fle->pid == pid) {
found = 1;
break;
}
}
BUG_ON(found == 0);
return fle;
}
/*
 * FD_STATE_PREP step for a descriptor this process has to receive.
 *
 * Creates a datagram unix socket bound to the abstract name
 * "crtools-fd-<getpid()>-<fd>", installs it at fd number fe->addr and then
 * publishes getpid() in the list entry's real_pid via cr_wait_set().
 * open_fd() builds the same name from real_pid to address its message.
 *
 * Nothing is done when @pid is the process that opens the file itself
 * (fi->pid == pid). @fdinfo_fd is unused here.
 *
 * Returns 0 on success, -1 on failure.
 */
static int open_transport_fd(int pid, struct fdinfo_entry *fe,
			     struct fdinfo_desc *fi, int *fdinfo_fd)
{
	struct fdinfo_list_entry *fle;
	struct sockaddr_un saddr;
	int sock;
	int ret, sun_len;

	pr_info("\t%d: Got fd for %lx type %d namelen %d users %d\n", pid,
		(unsigned long)fe->addr, fe->type, fe->len, fi->users);

	if (fi->pid == pid)
		return 0;

	saddr.sun_family = AF_UNIX;
	snprintf(saddr.sun_path, UNIX_PATH_MAX,
		 "X/crtools-fd-%d-%ld", getpid(), (long)fe->addr);

	/* length is taken with the 'X' placeholder, which then becomes the
	 * leading NUL of an abstract socket name */
	sun_len = SUN_LEN(&saddr);
	*saddr.sun_path = '\0';

	fle = find_fdinfo_list_entry(pid, fe->addr, fi);

	sock = socket(PF_UNIX, SOCK_DGRAM, 0);
	if (sock < 0) {
		pr_perror("Can't create socket");
		return -1;
	}

	ret = bind(sock, (struct sockaddr *)&saddr, sun_len);
	if (ret < 0) {
		pr_perror("Can't bind unix socket %s\n", saddr.sun_path + 1);
		close(sock);
		return -1;
	}

	ret = reopen_fd_as((int)fe->addr, sock);
	if (ret < 0)
		return -1;

	pr_info("Wake up fdinfo pid=%d fd=%d\n", fle->pid, fle->fd);
	cr_wait_set(&fle->real_pid, getpid());

	return 0;
}
/*
 * FD_STATE_CREATE step: open the file described by @fe, install it at fd
 * number fe->addr, and pass it to every other process that shares it.
 *
 * When the file has a single user nothing has to be sent. Otherwise, for
 * each list entry belonging to another pid, wait (cr_wait_while) until that
 * entry's real_pid becomes non-zero, then send the descriptor as SCM_RIGHTS
 * to the abstract socket "crtools-fd-<real_pid>-<fd>". fi->users is counted
 * down once per list entry and must reach zero.
 *
 * Returns 0 on success, -1 on failure.
 */
static int open_fd(int pid, struct fdinfo_entry *fe,
		   struct fdinfo_desc *fi, int *fdinfo_fd)
{
	struct fdinfo_list_entry *fle;
	struct sockaddr_un saddr;
	int sent_fd = (int)fe->addr;
	int sock;
	int tmp;

	tmp = open_fe_fd(fe, *fdinfo_fd);
	if (tmp < 0)
		return -1;

	if (reopen_fd_as(sent_fd, tmp))
		return -1;

	/* sole user: nobody to send to ("!users == 1" tested users == 0) */
	if (fi->users == 1)
		return 0;

	sock = socket(PF_UNIX, SOCK_DGRAM, 0);
	if (sock < 0) {
		pr_perror("Can't create socket");
		return -1;
	}

	cr_wait_set(&fi->real_pid, getpid());

	pr_info("\t%d: Got fd for %lx type %d namelen %d users %d\n", pid,
		(unsigned long)fe->addr, fe->type, fe->len, fi->users);

	list_for_each_entry(fle, &fi->list, list) {
		/* union forces cmsghdr alignment on the control buffer */
		union {
			char buf[CMSG_SPACE(sizeof(int))];
			struct cmsghdr align;
		} ctl;
		struct msghdr hdr;
		struct iovec data;
		struct cmsghdr *cmsg;
		char dummy = '*';

		fi->users--;

		/* entries of our own pid are not sent over a socket */
		if (pid == fle->pid)
			continue;

		pr_info("Wait fdinfo pid=%d fd=%d\n", fle->pid, fle->fd);
		cr_wait_while(&fle->real_pid, 0);

		saddr.sun_family = AF_UNIX;
		snprintf(saddr.sun_path, UNIX_PATH_MAX,
			 "X/crtools-fd-%d-%d", fle->real_pid, fle->fd);

		pr_info("Send fd %d to %s\n", sent_fd, saddr.sun_path + 1);

		data.iov_base = &dummy;
		data.iov_len = sizeof(dummy);

		memset(&hdr, 0, sizeof(hdr));
		memset(&ctl, 0, sizeof(ctl));

		hdr.msg_name = (struct sockaddr *)&saddr;
		hdr.msg_namelen = SUN_LEN(&saddr);
		/* 'X' placeholder becomes the abstract-namespace NUL */
		*saddr.sun_path = '\0';

		hdr.msg_iov = &data;
		hdr.msg_iovlen = 1;
		hdr.msg_flags = 0;
		hdr.msg_control = ctl.buf;
		hdr.msg_controllen = CMSG_LEN(sizeof(int));

		cmsg = CMSG_FIRSTHDR(&hdr);
		cmsg->cmsg_len = hdr.msg_controllen;
		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_type = SCM_RIGHTS;
		memcpy(CMSG_DATA(cmsg), &sent_fd, sizeof(sent_fd));

		tmp = sendmsg(sock, &hdr, 0);
		if (tmp < 0) {
			pr_perror("Can't send file descriptor");
			close(sock);
			return -1;
		}
	}

	BUG_ON(fi->users);
	close(sock);

	return 0;
}
/*
 * Receive one file descriptor sent as SCM_RIGHTS ancillary data on @sock.
 *
 * One byte of ordinary payload is read alongside the control message.
 *
 * Returns the received descriptor, or -1 if recvmsg() fails or the message
 * does not carry an SCM_RIGHTS control message holding an int.
 */
static int recv_fd(int sock)
{
	/* union forces cmsghdr alignment on the control buffer */
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} ccmsg;
	struct msghdr msg;
	struct iovec iov;
	struct cmsghdr *cmsg;
	char buf[1];
	int ret;
	int fd;

	iov.iov_base = buf;
	iov.iov_len = 1;

	memset(&msg, 0, sizeof(msg));
	msg.msg_name = 0;
	msg.msg_namelen = 0;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = ccmsg.buf;
	msg.msg_controllen = sizeof(ccmsg.buf);

	ret = recvmsg(sock, &msg, 0);
	if (ret == -1) {
		pr_perror("recvmsg");
		return -1;
	}

	/* NULL when the datagram carried no ancillary data at all */
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_len < CMSG_LEN(sizeof(int))) {
		pr_err("got no control message with a file descriptor\n");
		return -1;
	}

	/* "!type == SCM_RIGHTS" parsed as "(!type) == SCM_RIGHTS" */
	if (cmsg->cmsg_level != SOL_SOCKET || cmsg->cmsg_type != SCM_RIGHTS) {
		pr_err("got control message of unknown type %d\n",
		       cmsg->cmsg_type);
		return -1;
	}

	memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
	return fd;
}
/*
 * FD_STATE_RECV step for a descriptor this process does not open itself.
 *
 * If @pid is the process that opened the file (fi->pid == pid), this fd is
 * just another number for the same file in the same process, so it is
 * produced with dup2() from fi->addr. Otherwise the descriptor is read from
 * the transport socket sitting at fd number fe->addr, the socket is closed
 * and the received descriptor is moved to fe->addr.
 *
 * @fdinfo_fd is unused here. Returns 0 on success, -1 on failure.
 */
static int receive_fd(int pid, struct fdinfo_entry *fe, struct fdinfo_desc *fi, int *fdinfo_fd)
{
	int target = (int)fe->addr;
	int tmp;

	if (fi->pid == pid) {
		tmp = dup2((int)fi->addr, target);
		if (tmp < 0) {
			pr_perror("Can't duplicate fd %d %d\n", (int)fi->addr, target);
			return -1;
		}

		return 0;
	}

	/* only the lookup's built-in existence check is wanted here */
	(void)find_fdinfo_list_entry(pid, target, fi);

	pr_info("\t%d: Got fd for %lx type %d namelen %d users %d\n", pid,
		(unsigned long)fe->addr, fe->type, fe->len, fi->users);

	tmp = recv_fd(target);
	if (tmp < 0) {
		pr_err("Can't get fd\n");
		return -1;
	}
	close(target);

	return reopen_fd_as(target, tmp);
}
......@@ -108,52 +460,93 @@ static int open_fmap(int pid, struct fdinfo_entry *fe, int fd)
return 0;
}
int prepare_fds(int pid)
static int open_fdinfo(int pid, struct fdinfo_entry *fe, int *fdinfo_fd, int state)
{
int fdinfo_fd;
u32 mag;
int ret;
pr_info("%d: Opening files img\n", pid);
struct fdinfo_desc *fi = find_fd(fe->id);
fdinfo_fd = open_image_ro(CR_FD_FDINFO, pid);
if (fdinfo_fd < 0)
if (move_img_fd(fdinfo_fd, (int)fe->addr))
return -1;
while (1) {
int ret;
struct fdinfo_entry fe;
pr_info("\t%d: Got fd for %lx type %d namelen %d users %d\n", pid,
(unsigned long)fe->addr, fe->type, fe->len, fi->users);
ret = read(fdinfo_fd, &fe, sizeof(fe));
if (ret == 0) {
close(fdinfo_fd);
return 0;
}
BUG_ON(fe->type != FDINFO_FD);
if (ret < 0) {
pr_perror("Error reading %d fdinfo\n", pid);
return -1;
}
if (ret != sizeof(fe)) {
pr_err("Corrupted %d fdinfo\n", pid);
if (pid == fi->pid && fe->addr == fi->addr) {
if (state == FD_STATE_CREATE)
ret = open_fd(pid, fe, fi, fdinfo_fd);
} else {
if (state == FD_STATE_PREP)
ret = open_transport_fd(pid, fe, fi, fdinfo_fd);
else if (state == FD_STATE_RECV)
ret = receive_fd(pid, fe, fi, fdinfo_fd);
}
return ret;
}
/*
 * Restore the file descriptors of process @pid.
 *
 * The fdinfo image is read once per pass, for every value of
 * enum fdinfo_states below FD_STATE_MAX. File mappings (FDINFO_MAP) and the
 * cwd entry (addr of all-ones) are handled only in the FD_STATE_RECV pass
 * and skipped in the others; every other entry goes through open_fdinfo().
 *
 * Returns 0 on success, -1 on failure. The image descriptor is closed on
 * every path after it has been opened.
 */
int prepare_fds(int pid)
{
	struct fdinfo_entry fe;
	off_t offset;
	ssize_t ret;
	int fdinfo_fd;
	int state;

	pr_info("%d: Opening fdinfo-s\n", pid);

	for (state = 0; state < FD_STATE_MAX; state++) {
		fdinfo_fd = open_image_ro(CR_FD_FDINFO, pid);
		if (fdinfo_fd < 0) {
			pr_perror("%d: Can't open fdinfo img\n", pid);
			return -1;
		}

		while (1) {
			ret = read(fdinfo_fd, &fe, sizeof(fe));
			if (ret == 0)
				break;

			if (ret != (ssize_t)sizeof(fe)) {
				pr_perror("%d: Bad fdinfo entry\n", pid);
				goto err;
			}

			if (state == FD_STATE_RECV) {
				if (fe.type == FDINFO_MAP) {
					if (open_fmap(pid, &fe, fdinfo_fd))
						goto err;
					continue;
				} else if (fe.addr == ~0L) {
					if (restore_cwd(&fe, fdinfo_fd))
						goto err;
					continue;
				}
			} else if (fe.type == FDINFO_MAP || fe.addr == ~0L) {
				/* not this pass: step over the entry's payload */
				lseek(fdinfo_fd, fe.len, SEEK_CUR);
				continue;
			}

			/*
			 * Remember where the payload starts, then reposition
			 * just past it once the entry has been handled.
			 */
			offset = lseek(fdinfo_fd, 0, SEEK_CUR);

			if (open_fdinfo(pid, &fe, &fdinfo_fd, state))
				goto err;

			lseek(fdinfo_fd, offset + fe.len, SEEK_SET);
		}

		close(fdinfo_fd);
	}

	return 0;

err:
	close(fdinfo_fd);
	return -1;
}
static struct fmap_fd *pop_fmap_fd(int pid, unsigned long start)
......
......@@ -2,6 +2,8 @@
#define FILES_H_
extern int prepare_fds(int pid);
extern int prepare_fd_pid(int pid);
extern int prepare_fdinfo_global(void);
extern int try_fixup_file_map(int pid, struct vma_entry *vma_entry, int fd);
#endif /* FILES_H_ */
......@@ -21,12 +21,15 @@
#define PAGE_RSS 1
#define PAGE_ANON 2
#define FD_ID_SIZE 50
/*
 * Fixed-size header of one record in an fdinfo image. The header is
 * followed in the image by len bytes of variable-length data.
 */
struct fdinfo_entry {
u8 type; /* FDINFO_FD or FDINFO_MAP */
u8 len; /* number of bytes of name[] following this header */
u16 flags; /* flags value read from /proc/<pid>/fdinfo at dump time */
u32 pos; /* pos value read from /proc/<pid>/fdinfo at dump time */
u64 addr; /* fd number for FDINFO_FD (all-ones marks the cwd entry);
	   * mapping start address for FDINFO_MAP */
char id[FD_ID_SIZE]; /* object id; equal ids mean the same shared struct file */
u8 name[0]; /* variable-length tail, len bytes */
} __packed;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment