Commit c5eb61e8, authored by Pavel Emelyanov, committed by Cyrill Gorcunov

Unix sockets initial support

Currently it can only work with stream sockets, which have no skbs in queues
(listening or established -- both work OK).

The cpt part uses the sock_diag engine that was merged to Dave recently to
collect sockets. Then it dumps sockets by checking the filesystem ID of
descriptors that failed to open through /proc/pid/fd (sockets do not allow for
such tricks with opens through proc) against SOCKFS_TYPE.

The rst part is more tricky. Listen sockets are just restored, this is simple.
Connected sockets are restored like this:

1. One end establishes a listening anon socket at the desired descriptor;
2. The other end just creates a socket at the desired descriptor;
3. All sockets, that are to be connect()-ed call connect. Unix sockets
   do not block connect() till the accept() time and thus we continue with...
4. ... all listening sockets call accept() and ... dup2 the new fd into the
   accepting end.

There's a problem with this approach -- socket names are not preserved, but
looking into our OpenVZ implementation I think this is OK for existing apps.

What should be done next is:

1. Need to merge the file IDs patches into our tree and have Andrey
   support file sharing. This will solve the

	sk = socket();
	fork();

   case. Currently it simply doesn't work :(

2. Need to add support for DGRAM sockets -- I wrote a comment on how to do it
   in can_dump_unix_sk()

3. Need to add support for in-flight connections

4. Implement support for UDP sockets (quite simple)

5. Implement support for listening TCP sockets (also not very complex)

6. Implement support for connected TCP sockets (hard one, Tejun's patches are not
   very good for this from my POV)

Cyrill, plz, apply this patch and put the above descriptions onto wiki docs (do we
have the plans page yet?).

Andrey, plz, take care of unix sockets tests in zdtm. Most likely it won't work till
you do the shared files support for sockets.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
parent bf7a74d4
...@@ -67,6 +67,8 @@ OBJS += util.o ...@@ -67,6 +67,8 @@ OBJS += util.o
OBJS += ptrace.o OBJS += ptrace.o
OBJS += restorer.o OBJS += restorer.o
OBJS += log.o OBJS += log.o
OBJS += libnetlink.o
OBJS += sockets.o
DEPS := $(patsubst %.o,%.d,$(OBJS)) DEPS := $(patsubst %.o,%.d,$(OBJS))
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "syscall.h" #include "syscall.h"
#include "ptrace.h" #include "ptrace.h"
#include "util.h" #include "util.h"
#include "sockets.h"
#include "image.h" #include "image.h"
...@@ -239,6 +240,10 @@ static int dump_one_fd(char *pid_fd_dir, int dir, char *fd_name, unsigned long p ...@@ -239,6 +240,10 @@ static int dump_one_fd(char *pid_fd_dir, int dir, char *fd_name, unsigned long p
fd = openat(dir, fd_name, O_RDONLY); fd = openat(dir, fd_name, O_RDONLY);
if (fd < 0) { if (fd < 0) {
err = __try_dump_socket(pid_fd_dir, fd_name, cr_fdset);
if (err != 1)
return err;
pr_perror("Failed to openat %s/%d %s\n", pid_fd_dir, dir, fd_name); pr_perror("Failed to openat %s/%d %s\n", pid_fd_dir, dir, fd_name);
return -1; return -1;
} }
...@@ -1179,6 +1184,9 @@ int cr_dump_tasks(pid_t pid, struct cr_options *opts) ...@@ -1179,6 +1184,9 @@ int cr_dump_tasks(pid_t pid, struct cr_options *opts)
if (collect_pstree(pid, &pstree_list)) if (collect_pstree(pid, &pstree_list))
goto err; goto err;
if (collect_sockets())
goto err;
/* /*
* Since ptrace-seize doesn't work on frozen tasks * Since ptrace-seize doesn't work on frozen tasks
* we stick with explicit tasks stopping via stop * we stick with explicit tasks stopping via stop
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include "log.h" #include "log.h"
#include "syscall.h" #include "syscall.h"
#include "restorer.h" #include "restorer.h"
#include "sockets.h"
#include "crtools.h" #include "crtools.h"
...@@ -1251,6 +1252,9 @@ static int restore_one_task(int pid) ...@@ -1251,6 +1252,9 @@ static int restore_one_task(int pid)
if (prepare_pipes(pid)) if (prepare_pipes(pid))
return -1; return -1;
if (prepare_sockets(pid))
return -1;
if (prepare_fds(pid)) if (prepare_fds(pid))
return -1; return -1;
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include "compiler.h" #include "compiler.h"
#include "crtools.h" #include "crtools.h"
#include "util.h" #include "util.h"
#include "sockets.h"
#include "image.h" #include "image.h"
#define DEF_PAGES_PER_LINE 6 #define DEF_PAGES_PER_LINE 6
...@@ -449,6 +449,9 @@ static int cr_parse_file(struct cr_options *opts) ...@@ -449,6 +449,9 @@ static int cr_parse_file(struct cr_options *opts)
case SIGACT_MAGIC: case SIGACT_MAGIC:
show_sigacts(opts->show_dump_file, fd, true); show_sigacts(opts->show_dump_file, fd, true);
break; break;
case UNIXSK_MAGIC:
show_unixsk(opts->show_dump_file, fd, true);
break;
default: default:
pr_err("Unknown magic %x on %s\n", opts->show_dump_file); pr_err("Unknown magic %x on %s\n", opts->show_dump_file);
goto err; goto err;
...@@ -617,6 +620,9 @@ static int cr_show_all(unsigned long pid, struct cr_options *opts) ...@@ -617,6 +620,9 @@ static int cr_show_all(unsigned long pid, struct cr_options *opts)
show_sigacts(cr_fdset->desc[CR_FD_SIGACT].path, show_sigacts(cr_fdset->desc[CR_FD_SIGACT].path,
cr_fdset->desc[CR_FD_SIGACT].fd, true); cr_fdset->desc[CR_FD_SIGACT].fd, true);
show_unixsk(cr_fdset->desc[CR_FD_UNIXSK].path,
cr_fdset->desc[CR_FD_UNIXSK].fd, true);
close_cr_fdset(cr_fdset); close_cr_fdset(cr_fdset);
free_cr_fdset(&cr_fdset); free_cr_fdset(&cr_fdset);
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "crtools.h" #include "crtools.h"
#include "util.h" #include "util.h"
#include "log.h" #include "log.h"
#include "sockets.h"
static struct cr_options opts; static struct cr_options opts;
struct page_entry zero_page_entry; struct page_entry zero_page_entry;
...@@ -78,6 +79,12 @@ struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX] = { ...@@ -78,6 +79,12 @@ struct cr_fd_desc_tmpl fdset_template[CR_FD_MAX] = {
.fmt = FMT_FNAME_SIGACTS, .fmt = FMT_FNAME_SIGACTS,
.magic = SIGACT_MAGIC, .magic = SIGACT_MAGIC,
}, },
/* info about unix sockets */
[CR_FD_UNIXSK] = {
.fmt = FMT_FNAME_UNIXSK,
.magic = UNIXSK_MAGIC,
},
}; };
struct cr_fdset *alloc_cr_fdset(pid_t pid) struct cr_fdset *alloc_cr_fdset(pid_t pid)
......
...@@ -23,6 +23,7 @@ enum { ...@@ -23,6 +23,7 @@ enum {
CR_FD_PSTREE, CR_FD_PSTREE,
CR_FD_SHMEM, CR_FD_SHMEM,
CR_FD_SIGACT, CR_FD_SIGACT,
CR_FD_UNIXSK,
CR_FD_MAX CR_FD_MAX
}; };
...@@ -57,6 +58,7 @@ struct cr_fd_desc_tmpl { ...@@ -57,6 +58,7 @@ struct cr_fd_desc_tmpl {
#define FMT_FNAME_SHMEM "shmem-%d.img" #define FMT_FNAME_SHMEM "shmem-%d.img"
#define FMT_FNAME_VMAS "vmas-%d.img" #define FMT_FNAME_VMAS "vmas-%d.img"
#define FMT_FNAME_SIGACTS "sigacts-%d.img" #define FMT_FNAME_SIGACTS "sigacts-%d.img"
#define FMT_FNAME_UNIXSK "unixsk-%d.img"
extern int get_image_path(char *path, int size, const char *fmt, int pid); extern int get_image_path(char *path, int size, const char *fmt, int pid);
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#define PSTREE_MAGIC 0x40044004 #define PSTREE_MAGIC 0x40044004
#define PIPES_MAGIC 0x05055050 #define PIPES_MAGIC 0x05055050
#define SIGACT_MAGIC 0x60606060 #define SIGACT_MAGIC 0x60606060
#define UNIXSK_MAGIC 0x07070707
#define FDINFO_FD 1 #define FDINFO_FD 1
#define FDINFO_MAP 2 #define FDINFO_MAP 2
...@@ -52,6 +53,18 @@ struct pipe_entry { ...@@ -52,6 +53,18 @@ struct pipe_entry {
u8 data[0]; u8 data[0];
} __packed; } __packed;
/*
 * One record of the unix sockets image (the file tagged with UNIXSK_MAGIC).
 * The structure is packed, so the fixed part is 16 bytes and is followed
 * directly by the trailing name[] bytes.
 *
 * NOTE(review): field meanings below are inferred from names and the commit
 * description; the code that fills them is not visible here -- confirm
 * against sockets.c.
 */
struct unix_sk_entry {
u32 fd; /* presumably the descriptor number the socket occupies in the task */
u32 id; /* presumably the socket's own identifier (peer refers to one of these) */
u8 type; /* presumably the socket type (SOCK_STREAM, ...) -- TODO confirm */
u8 state; /* presumably listening vs. established -- TODO confirm */
u8 namelen; /* fits UNIX_PATH_MAX */
u8 pad; /* explicit padding byte */
u32 backlog; /* presumably the listen() backlog -- TODO confirm */
u32 peer; /* presumably the id of the connected peer -- TODO confirm */
u8 name[0]; /* variable-length tail; presumably namelen bytes of socket name */
} __packed;
struct vma_entry { struct vma_entry {
u64 start; u64 start;
u64 end; u64 end;
......
#ifndef __LINUX_RTNETLINK_H
#define __LINUX_RTNETLINK_H
/*
 * Header that precedes every netlink attribute. The RTA_* macros in this
 * header walk a buffer of such attributes.
 */
struct rtattr {
unsigned short rta_len; /* total attribute length, header included (see RTA_PAYLOAD) */
unsigned short rta_type; /* attribute type; parse_rtattr() uses it as an index into tb[] */
};
/*
 * Macros to handle rtattributes
 *
 * RTA_ALIGNTO    - attribute alignment, in bytes.
 * RTA_ALIGN(len) - round len up to a multiple of RTA_ALIGNTO.
 * RTA_OK(rta,len) - true when at least a whole header fits in the len bytes
 *                  left and rta_len is both >= the header size and <= len.
 * RTA_NEXT(rta,attrlen) - subtract the aligned size of rta from attrlen
 *                  (attrlen is modified in place) and evaluate to a pointer
 *                  to the attribute that follows. rta is evaluated twice.
 * RTA_LENGTH(len) - value for rta_len of an attribute with len payload bytes.
 * RTA_SPACE(len) - RTA_LENGTH(len) rounded up to the alignment.
 * RTA_DATA(rta)  - pointer to the payload, right after the aligned header.
 * RTA_PAYLOAD(rta) - payload size: rta_len minus the aligned header size.
 */
#define RTA_ALIGNTO 4
#define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) )
#define RTA_OK(rta,len) ((len) >= (int)sizeof(struct rtattr) && \
(rta)->rta_len >= sizeof(struct rtattr) && \
(rta)->rta_len <= (len))
#define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \
(struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len)))
#define RTA_LENGTH(len) (RTA_ALIGN(sizeof(struct rtattr)) + (len))
#define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len))
#define RTA_DATA(rta) ((void*)(((char*)(rta)) + RTA_LENGTH(0)))
#define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0))
/*
 * Netlink message layout helpers.
 *
 * NLMSG_ALIGNTO and struct nlmsghdr are not defined in this header; both
 * must already be in scope wherever these macros are expanded.
 *
 * NLMSG_ALIGN(len)   - round len up to a multiple of NLMSG_ALIGNTO.
 * NLMSG_HDRLEN       - aligned size of struct nlmsghdr.
 * NLMSG_LENGTH(len)  - value for nlmsg_len of a message with len payload bytes.
 * NLMSG_SPACE(len)   - NLMSG_LENGTH(len) rounded up to the alignment.
 * NLMSG_DATA(nlh)    - pointer to the payload, right after the aligned header.
 * NLMSG_NEXT(nlh,len) - subtract the aligned size of nlh from len (len is
 *                      modified in place) and evaluate to the next message.
 *                      nlh is evaluated twice.
 * NLMSG_OK(nlh,len)  - true when a whole header fits in the len bytes left
 *                      and nlmsg_len is both >= the header size and <= len.
 * NLMSG_PAYLOAD(nlh,len) - bytes of payload left after the first len bytes.
 */
#define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) )
#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr)))
#define NLMSG_LENGTH(len) ((len)+NLMSG_ALIGN(NLMSG_HDRLEN))
#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len))
/*
 * The argument is parenthesised before the cast so that an expression such
 * as NLMSG_DATA(hdr + 1) advances by whole headers rather than by one byte.
 */
#define NLMSG_DATA(nlh) ((void*)(((char*)(nlh)) + NLMSG_LENGTH(0)))
#define NLMSG_NEXT(nlh,len) ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \
	(struct nlmsghdr*)(((char*)(nlh)) + NLMSG_ALIGN((nlh)->nlmsg_len)))
#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \
	(nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \
	(nlh)->nlmsg_len <= (len))
#define NLMSG_PAYLOAD(nlh,len) ((nlh)->nlmsg_len - NLMSG_SPACE((len)))
/*
 * Control values of nlmsghdr.nlmsg_type. nlmsg_receive() stops successfully
 * on NLMSG_DONE and fails on NLMSG_ERROR; every other type is passed to the
 * caller's callback.
 */
#define NLMSG_NOOP 0x1 /* Nothing. */
#define NLMSG_ERROR 0x2 /* Error */
#define NLMSG_DONE 0x3 /* End of a dump */
#define NLMSG_OVERRUN 0x4 /* Data lost */
#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */
/*
 * Index the attributes found in the len bytes starting at rta by type.
 * tb[] must have room for max + 1 pointers; it is zeroed first, then
 * tb[type] is set to the first attribute of each type <= max. A warning is
 * logged when bytes are left over. Always returns 0.
 */
int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len);
/*
 * Walk the netlink messages held in the first len bytes of buf, calling cb
 * on each one. Messages whose nlmsg_seq is not 24680 are skipped.
 * Returns 0 when NLMSG_DONE is seen, -1 on NLMSG_ERROR or when cb returns
 * non-zero, and 1 when the buffer is exhausted without seeing NLMSG_DONE.
 */
int nlmsg_receive(char *buf, int len, int (*cb)(struct nlmsghdr *));
#endif
#ifndef __CRTOOLS_SOCKETS_H__
#define __CRTOOLS_SOCKETS_H__
/*
 * Gather the sockets to be dumped. cr_dump_tasks() treats a non-zero return
 * as failure.
 */
int collect_sockets(void);
struct cr_fdset;
/*
 * Called by dump_one_fd() after openat() on a /proc fd entry has failed.
 * A return of 1 makes the caller report the openat failure itself
 * (presumably meaning "not a socket" -- confirm in sockets.c); any other
 * value is returned by the caller unchanged.
 */
int __try_dump_socket(char *dir_name, char *fd_name, struct cr_fdset *cr_fdset);
/*
 * Restore-side socket setup for task pid. restore_one_task() calls it after
 * prepare_pipes() and before prepare_fds(), and fails on a non-zero return.
 */
int prepare_sockets(int pid);
/*
 * Print the contents of a unix sockets image. The callers pass the image
 * path as name and its open descriptor as fd.
 * NOTE(review): this header uses bool without including a definition for
 * it -- confirm every includer has one in scope.
 */
void show_unixsk(char *name, int fd, bool show_header);
#endif
#ifndef __UNIX_DIAG_H__
#define __UNIX_DIAG_H__
/*
 * Request structure for the unix sockets diag interface.
 * NOTE(review): the code that fills and sends it is not visible here; the
 * field notes below are inferred from names -- confirm against the kernel's
 * unix_diag definitions.
 */
struct unix_diag_req {
__u8 sdiag_family; /* presumably the address family being queried */
__u8 sdiag_protocol;
__u16 pad; /* explicit padding */
__u32 udiag_states; /* presumably a mask of socket states to report */
__u32 udiag_ino; /* presumably an inode number selecting one socket */
__u32 udiag_show; /* presumably a mask of UDIAG_SHOW_* bits */
__u32 udiag_cookie[2];
};
/*
 * Single-bit flags selecting which pieces of information to report
 * (presumably OR-ed into unix_diag_req.udiag_show -- confirm). Their order
 * matches the UNIX_DIAG_* attribute types.
 */
#define UDIAG_SHOW_NAME 0x00000001 /* show name (not path) */
#define UDIAG_SHOW_VFS 0x00000002 /* show VFS inode info */
#define UDIAG_SHOW_PEER 0x00000004 /* show peer socket info */
#define UDIAG_SHOW_ICONS 0x00000008 /* show pending connections */
#define UDIAG_SHOW_RQLEN 0x00000010 /* show skb receive queue len */
/*
 * Fixed part of a unix diag reply describing one socket.
 * NOTE(review): presumably followed by rtattr-encoded attributes of the
 * UNIX_DIAG_* types -- confirm against the reply parser in sockets.c.
 */
struct unix_diag_msg {
__u8 udiag_family;
__u8 udiag_type; /* presumably the socket type */
__u8 udiag_state; /* presumably the socket state */
__u8 pad; /* explicit padding */
__u32 udiag_ino; /* presumably the socket inode number */
__u32 udiag_cookie[2];
};
/*
 * Attribute type numbers, listed in the same order as the UDIAG_SHOW_* bits
 * (NAME, VFS, PEER, ICONS, RQLEN). UNIX_DIAG_MAX is one past the last type.
 * NOTE(review): presumably these are the rta_type values in a unix diag
 * reply -- confirm.
 */
enum {
UNIX_DIAG_NAME,
UNIX_DIAG_VFS,
UNIX_DIAG_PEER,
UNIX_DIAG_ICONS,
UNIX_DIAG_RQLEN,
UNIX_DIAG_MAX,
};
/*
 * VFS identity of a socket.
 * NOTE(review): presumably the payload of a UNIX_DIAG_VFS attribute --
 * confirm.
 */
struct unix_diag_vfs {
__u32 udiag_vfs_ino; /* presumably an inode number */
__u32 udiag_vfs_dev; /* presumably a device number */
};
#endif
#include <linux/types.h>
#include <linux/netlink.h>
#include <string.h>
#include "libnetlink.h"
#include "util.h"
/*
 * parse_rtattr - index a run of netlink attributes by type.
 * @tb:  output table with room for max + 1 pointers; zeroed first, then
 *       tb[type] is set to the first attribute seen with that type.
 * @max: highest attribute type to record; higher types are skipped.
 * @rta: first attribute in the buffer.
 * @len: number of bytes available starting at @rta.
 *
 * Bytes left over after the last well-formed attribute are reported with a
 * warning. Always returns 0.
 */
int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len)
{
	memset(tb, 0, sizeof(struct rtattr *) * (max + 1));

	while (RTA_OK(rta, len)) {
		/* Only the first occurrence of each type is kept. */
		if ((rta->rta_type <= max) && (!tb[rta->rta_type]))
			tb[rta->rta_type] = rta;
		rta = RTA_NEXT(rta, len);
	}

	if (len) {
		/*
		 * rta now points at the leftover bytes. Its header may only be
		 * read when a whole struct rtattr still lies inside the buffer;
		 * len can also be negative here, because RTA_NEXT subtracts the
		 * aligned attribute length.
		 */
		if (len >= (int)sizeof(struct rtattr))
			pr_warning("Trimmed RTA: len %d, rta_len %d\n", len, rta->rta_len);
		else
			pr_warning("Trimmed RTA: len %d\n", len);
	}

	return 0;
}
/*
 * nlmsg_receive - dispatch the netlink messages held in one receive buffer.
 * @buf: buffer holding zero or more netlink messages.
 * @len: number of valid bytes in @buf.
 * @cb:  called on every message that is neither NLMSG_DONE nor NLMSG_ERROR.
 *
 * Returns 0 once NLMSG_DONE is seen, -1 on NLMSG_ERROR or when @cb returns
 * non-zero, and 1 when the buffer is exhausted without seeing NLMSG_DONE.
 */
int nlmsg_receive(char *buf, int len, int (*cb)(struct nlmsghdr *))
{
	struct nlmsghdr *hdr;

	for (hdr = (struct nlmsghdr *)buf; NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
		/*
		 * Skip messages that carry a different sequence number.
		 * NOTE(review): 24680 presumably matches the nlmsg_seq the
		 * requester put into its dump request -- confirm in sockets.c.
		 */
		if (hdr->nlmsg_seq != 24680)
			continue;

		if (hdr->nlmsg_type == NLMSG_DONE)
			return 0;

		if (hdr->nlmsg_type == NLMSG_ERROR) {
			struct nlmsgerr *err = NLMSG_DATA(hdr);

			/* Read the error code only when the payload is complete. */
			if (hdr->nlmsg_len < NLMSG_LENGTH(sizeof(*err)))
				pr_err("Error getting sockets list (truncated error message)\n");
			else
				pr_err("Error getting sockets list: %d\n", err->error);
			return -1;
		}

		if (cb(hdr))
			return -1;
	}

	return 1;
}
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment