Commit 2acc741a authored by Cyrill Gorcunov's avatar Cyrill Gorcunov

files: Use sys_kcmp to find file descriptor duplicates v4

We switch generic-object-id concept with sys_kcmp approach,
which implies changes of image format a bit (and since it's
early time for project overall, we're allowed to).

In short -- previously every file descriptor had an ID
generated by a kernel and exported via procfs. If the
appropriate file descriptors were the same objects in
kernel memory -- the IDs did match up to bit. It allows
us to figure out which files were actually the identical
ones and should be restored in a special way.

Once sys_kcmp system call was merged into the kernel,
we've got a new opprotunity -- to use this syscall instead.
The syscall basically compares kernel objects and returns
ordered results suitable for objects sorting in a userspace.

For us it means -- we treat every file descriptor as a combination
of 'genid' and 'subid'. While 'genid' serves for fast comparison
between fds, the 'subid' is kind of a second key, which guarantees
uniqueness of genid+subid tuple over all file descritors found
in a process (or group of processes).

To be able to find and dump file descriptors in a single pass we
collect every fd into a global rbtree, where (!) each node might
become a root for a subtree as well.

The main tree carries only non-equal genid. If we find genid which
is already in tree, we need to make sure that it's either indeed
a duplicate or not. For this we use sys_kcmp syscall and if we
find that file descriptors are different -- we simply put new
fd into a subtree.
Signed-off-by: 's avatarCyrill Gorcunov <gorcunov@openvz.org>
Acked-by: 's avatarPavel Emelyanov <xemul@parallels.com>
parent 8b187454
...@@ -41,6 +41,7 @@ OBJS += log.o ...@@ -41,6 +41,7 @@ OBJS += log.o
OBJS += libnetlink.o OBJS += libnetlink.o
OBJS += sockets.o OBJS += sockets.o
OBJS += files.o OBJS += files.o
OBJS += file-ids.o
OBJS += namespaces.o OBJS += namespaces.o
OBJS += uts_ns.o OBJS += uts_ns.o
OBJS += ipc_ns.o OBJS += ipc_ns.o
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "types.h" #include "types.h"
#include "list.h" #include "list.h"
#include "file-ids.h"
#include "compiler.h" #include "compiler.h"
#include "crtools.h" #include "crtools.h"
...@@ -31,6 +32,7 @@ ...@@ -31,6 +32,7 @@
#include "image.h" #include "image.h"
#include "proc_parse.h" #include "proc_parse.h"
#include "parasite-syscall.h" #include "parasite-syscall.h"
#include "file-ids.h"
#ifndef CONFIG_X86_64 #ifndef CONFIG_X86_64
# error No x86-32 support yet # error No x86-32 support yet
...@@ -89,7 +91,9 @@ struct fd_parms { ...@@ -89,7 +91,9 @@ struct fd_parms {
unsigned long fd_name; unsigned long fd_name;
unsigned long pos; unsigned long pos;
unsigned int flags; unsigned int flags;
char *id;
u64 id;
pid_t pid;
}; };
static int dump_one_reg_file(int type, struct fd_parms *p, int lfd, static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
...@@ -120,10 +124,24 @@ static int dump_one_reg_file(int type, struct fd_parms *p, int lfd, ...@@ -120,10 +124,24 @@ static int dump_one_reg_file(int type, struct fd_parms *p, int lfd,
e.flags = p->flags; e.flags = p->flags;
e.pos = p->pos; e.pos = p->pos;
e.addr = p->fd_name; e.addr = p->fd_name;
if (p->id)
memcpy(e.id, p->id, FD_ID_SIZE); if (likely(!fd_is_special(&e))) {
else struct fd_id_entry *fd_id_entry;
memzero(e.id, FD_ID_SIZE);
/*
* Make sure the union is still correlate with structure
* we write to disk.
*/
BUILD_BUG_ON(sizeof(fd_id_entry->u.key) != sizeof(e.id));
fd_id_entry = fd_id_entry_collect((u32)p->id, p->pid, p->fd_name);
if (!fd_id_entry)
goto err;
/* Now it might have completely new ID here */
e.id = fd_id_entry->u.id;
} else
e.id = FD_ID_INVALID;
pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8lx addr: %16lx\n", pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8lx addr: %16lx\n",
type, len, p->flags, p->pos, p->fd_name); type, len, p->flags, p->pos, p->fd_name);
...@@ -144,7 +162,12 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset) ...@@ -144,7 +162,12 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
int fd, ret; int fd, ret;
/* Dump /proc/pid/cwd */ /* Dump /proc/pid/cwd */
params = (struct fd_parms){ .fd_name = FDINFO_CWD, }; params = (struct fd_parms) {
.fd_name = FDINFO_CWD,
.id = FD_ID_INVALID,
.pid = FD_PID_INVALID,
};
fd = open_proc(pid, "cwd"); fd = open_proc(pid, "cwd");
if (fd < 0) if (fd < 0)
return -1; return -1;
...@@ -153,7 +176,12 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset) ...@@ -153,7 +176,12 @@ static int dump_task_special_files(pid_t pid, struct cr_fdset *cr_fdset)
return ret; return ret;
/* Dump /proc/pid/exe */ /* Dump /proc/pid/exe */
params = (struct fd_parms){ .fd_name = FDINFO_EXE, }; params = (struct fd_parms) {
.fd_name = FDINFO_EXE,
.id = FD_ID_INVALID,
.pid = FD_PID_INVALID,
};
fd = open_proc(pid, "exe"); fd = open_proc(pid, "exe");
if (fd < 0) if (fd < 0)
return -1; return -1;
...@@ -281,8 +309,12 @@ static int dump_one_fd(pid_t pid, int pid_fd_dir, int lfd, ...@@ -281,8 +309,12 @@ static int dump_one_fd(pid_t pid, int pid_fd_dir, int lfd,
if (S_ISREG(st_buf.st_mode) || if (S_ISREG(st_buf.st_mode) ||
S_ISDIR(st_buf.st_mode) || S_ISDIR(st_buf.st_mode) ||
(S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR)) (S_ISCHR(st_buf.st_mode) && major(st_buf.st_rdev) == MEM_MAJOR)) {
p->id = MAKE_FD_GENID(st_buf.st_dev, st_buf.st_ino, p->pos);
return dump_one_reg_file(FDINFO_FD, p, lfd, cr_fdset, 1); return dump_one_reg_file(FDINFO_FD, p, lfd, cr_fdset, 1);
}
if (S_ISFIFO(st_buf.st_mode)) { if (S_ISFIFO(st_buf.st_mode)) {
if (fstatfs(lfd, &stfs_buf) < 0) { if (fstatfs(lfd, &stfs_buf) < 0) {
...@@ -312,16 +344,19 @@ static int read_fd_params(pid_t pid, char *fd, struct fd_parms *p) ...@@ -312,16 +344,19 @@ static int read_fd_params(pid_t pid, char *fd, struct fd_parms *p)
return -1; return -1;
p->fd_name = atoi(fd); p->fd_name = atoi(fd);
ret = fscanf(file, "pos:\t%li\nflags:\t%o\nid:\t%s\n", &p->pos, &p->flags, p->id); ret = fscanf(file, "pos:\t%li\nflags:\t%o\n", &p->pos, &p->flags);
fclose(file); fclose(file);
if (ret != 3) { if (ret != 2) {
pr_err("Bad format of fdinfo file (%d items, want 3)\n", ret); pr_err("Bad format of fdinfo file (%d items, want 2)\n", ret);
return -1; return -1;
} }
pr_info("%d fdinfo %s: pos: %16lx flags: %16o id %s\n", pr_info("%d fdinfo %s: pos: %16lx flags: %16o\n",
pid, fd, p->pos, p->flags, p->id); pid, fd, p->pos, p->flags);
p->pid = pid;
p->id = FD_ID_INVALID;
return 0; return 0;
} }
...@@ -352,8 +387,7 @@ static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset) ...@@ -352,8 +387,7 @@ static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset)
return -1; return -1;
while ((de = readdir(fd_dir))) { while ((de = readdir(fd_dir))) {
char id[FD_ID_SIZE]; struct fd_parms p;
struct fd_parms p = { .id = id };
int lfd; int lfd;
if (de->d_name[0] == '.') if (de->d_name[0] == '.')
...@@ -407,9 +441,9 @@ static int dump_task_mappings(pid_t pid, struct list_head *vma_area_list, struct ...@@ -407,9 +441,9 @@ static int dump_task_mappings(pid_t pid, struct list_head *vma_area_list, struct
} else if (vma_entry_is(vma, VMA_FILE_PRIVATE) || } else if (vma_entry_is(vma, VMA_FILE_PRIVATE) ||
vma_entry_is(vma, VMA_FILE_SHARED)) { vma_entry_is(vma, VMA_FILE_SHARED)) {
struct fd_parms p = { struct fd_parms p = {
.fd_name = vma->start, .fd_name = vma->start,
.pos = 0, .id = FD_ID_INVALID,
.id = NULL, .pid = pid,
}; };
if (vma->prot & PROT_WRITE && if (vma->prot & PROT_WRITE &&
......
...@@ -85,8 +85,9 @@ static void show_files(int fd_files) ...@@ -85,8 +85,9 @@ static void show_files(int fd_files)
if (ret <= 0) if (ret <= 0)
goto out; goto out;
pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx id: %s", pr_info("type: %02x len: %02x flags: %4x pos: %8x "
e.type, e.len, e.flags, e.pos, e.addr, e.id); "addr: %16lx id: %16lx",
e.type, e.len, e.flags, e.pos, e.addr, e.id);
if (e.len) { if (e.len) {
int ret = read(fd_files, local_buf, e.len); int ret = read(fd_files, local_buf, e.len);
......
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <signal.h>
#include <limits.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include "types.h"
#include "file-ids.h"
#include "rbtree.h"
#include "compiler.h"
#include "syscall.h"
#include "image.h"
#include "util.h"
/*
* We track shared files by global rbtree, where each node might
* be a root for subtree. The reason for that is the nature of data
* we obtain from operating system.
*
* Basically OS provides us two ways to distinguish files
*
* - information obtained from fstat call
* - shiny new sys_kcmp system call (which may compare the file descriptor
* pointers inside the kernel and provide us order info)
*
* So, to speedup procedure of searching for shared file descriptors
* we use both techniques. From fstat call we get that named general file
* IDs (genid) which are carried in the main rbtree.
*
* In case if two genid are the same -- we need to use a second way and
* call for sys_kcmp. Thus, if kernel tells us that files have identical
* genid but in real they are different from kernel point of view -- we assign
* a second unique key (subid) to such file descriptor and put it into a subtree.
*
* So the tree will look like
*
* (root)
* genid-1
* / \
* genid-2 genid-3
* / \ / \
*
* Where each genid node might be a sub-rbtree as well
*
* (genid-N)
* / \
* subid-1 subid-2
* / \ / \
*
* Carrying two rbtree at once allow us to minimize the number
* of sys_kcmp syscalls, also to collect and dump file descriptors
* in one pass.
*/
struct rb_root fd_id_root = RB_ROOT;
static unsigned long fd_id_entries_subid = 1;
static struct fd_id_entry *alloc_fd_id_entry(u32 genid, pid_t pid, int fd)
{
struct fd_id_entry *e;
e = xmalloc(sizeof(*e));
if (!e)
goto err;
e->u.key.subid = fd_id_entries_subid++;
e->u.key.genid = genid;
e->pid = pid;
e->fd = fd;
/* Make sure no overflow here */
BUG_ON(!e->u.key.subid);
rb_init_node(&e->node);
rb_init_node(&e->subtree_node);
rb_attach_node(&e->subtree_root, &e->subtree_node);
err:
return e;
}
static struct fd_id_entry *
lookup_alloc_subtree(struct fd_id_entry *e, u32 genid, pid_t pid, int fd)
{
struct rb_node *node = e->subtree_root.rb_node;
struct fd_id_entry *sub = NULL;
struct rb_node **new = &e->subtree_root.rb_node;
struct rb_node *parent = NULL;
while (node) {
struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, subtree_node);
int ret = sys_kcmp(this->pid, pid, KCMP_FILE, this->fd, fd);
parent = *new;
if (ret < 0)
node = node->rb_left, new = &((*new)->rb_left);
else if (ret > 0)
node = node->rb_right, new = &((*new)->rb_right);
else
return this;
}
sub = alloc_fd_id_entry(genid, pid, fd);
if (!sub)
goto err;
rb_link_and_balance(&e->subtree_root, &sub->subtree_node, parent, new);
err:
return sub;
}
struct fd_id_entry *fd_id_entry_collect(u32 genid, pid_t pid, int fd)
{
struct rb_node *node = fd_id_root.rb_node;
struct fd_id_entry *e = NULL;
struct rb_node **new = &fd_id_root.rb_node;
struct rb_node *parent = NULL;
while (node) {
struct fd_id_entry *this = rb_entry(node, struct fd_id_entry, node);
parent = *new;
if (genid < this->u.key.genid)
node = node->rb_left, new = &((*new)->rb_left);
else if (genid > this->u.key.genid)
node = node->rb_right, new = &((*new)->rb_right);
else
return lookup_alloc_subtree(this, genid, pid, fd);
}
e = alloc_fd_id_entry(genid, pid, fd);
if (!e)
goto err;
rb_link_and_balance(&fd_id_root, &e->node, parent, new);
err:
return e;
}
...@@ -44,14 +44,14 @@ int prepare_shared_fdinfo(void) ...@@ -44,14 +44,14 @@ int prepare_shared_fdinfo(void)
return 0; return 0;
} }
static struct fdinfo_desc *find_fd(char *id) static struct fdinfo_desc *find_fd(u64 id)
{ {
struct fdinfo_desc *fi; struct fdinfo_desc *fi;
int i; int i;
for (i = 0; i < nr_fdinfo_descs; i++) { for (i = 0; i < nr_fdinfo_descs; i++) {
fi = fdinfo_descs + i; fi = fdinfo_descs + i;
if (!strncmp(fi->id, id, FD_ID_SIZE)) if (fi->id == id)
return fi; return fi;
} }
...@@ -74,9 +74,10 @@ static int collect_fd(int pid, struct fdinfo_entry *e) ...@@ -74,9 +74,10 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
{ {
int i; int i;
struct fdinfo_list_entry *le = &fdinfo_list[nr_fdinfo_list]; struct fdinfo_list_entry *le = &fdinfo_list[nr_fdinfo_list];
struct fdinfo_desc *desc; struct fdinfo_desc *desc;
pr_info("Collect fdinfo pid=%d fd=%ld id=%s\n", pid, e->addr, e->id); pr_info("Collect fdinfo pid=%d fd=%ld id=%16lx\n",
pid, e->addr, e->id);
nr_fdinfo_list++; nr_fdinfo_list++;
if ((nr_fdinfo_list) * sizeof(struct fdinfo_list_entry) >= 4096) { if ((nr_fdinfo_list) * sizeof(struct fdinfo_list_entry) >= 4096) {
...@@ -90,7 +91,8 @@ static int collect_fd(int pid, struct fdinfo_entry *e) ...@@ -90,7 +91,8 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
for (i = 0; i < nr_fdinfo_descs; i++) { for (i = 0; i < nr_fdinfo_descs; i++) {
desc = &fdinfo_descs[i]; desc = &fdinfo_descs[i];
if (strncmp(desc->id, (char *) e->id, FD_ID_SIZE))
if (desc->id != e->id)
continue; continue;
fdinfo_descs[i].users++; fdinfo_descs[i].users++;
...@@ -111,13 +113,14 @@ static int collect_fd(int pid, struct fdinfo_entry *e) ...@@ -111,13 +113,14 @@ static int collect_fd(int pid, struct fdinfo_entry *e)
} }
desc = &fdinfo_descs[nr_fdinfo_descs]; desc = &fdinfo_descs[nr_fdinfo_descs];
memset(desc, 0, sizeof(fdinfo_descs[nr_fdinfo_descs])); memzero(desc, sizeof(*desc));
memcpy(desc->id, e->id, FD_ID_SIZE); desc->id = e->id;
desc->addr= e->addr; desc->addr = e->addr;
desc->pid = pid; desc->pid = pid;
desc->users = 1; desc->users = 1;
INIT_LIST_HEAD(&desc->list); INIT_LIST_HEAD(&desc->list);
list_add(&le->list, &desc->list); list_add(&le->list, &desc->list);
nr_fdinfo_descs++; nr_fdinfo_descs++;
...@@ -392,7 +395,7 @@ static int open_fdinfo(int pid, struct fdinfo_entry *fe, int *fdinfo_fd, int sta ...@@ -392,7 +395,7 @@ static int open_fdinfo(int pid, struct fdinfo_entry *fe, int *fdinfo_fd, int sta
u32 mag; u32 mag;
int ret = 0; int ret = 0;
struct fdinfo_desc *fi = find_fd((char *)fe->id); struct fdinfo_desc *fi = find_fd(fe->id);
if (move_img_fd(fdinfo_fd, (int)fe->addr)) if (move_img_fd(fdinfo_fd, (int)fe->addr))
return -1; return -1;
......
...@@ -16,6 +16,8 @@ ...@@ -16,6 +16,8 @@
#define NORETURN __attribute__((__noreturn__)) #define NORETURN __attribute__((__noreturn__))
#define __packed __attribute__((__packed__)) #define __packed __attribute__((__packed__))
#define __used __attribute__((__used__)) #define __used __attribute__((__used__))
#define __maybe_unused __attribute__((unused))
#define __always_unused __attribute__((unused))
#define __section(S) __attribute__ ((__section__(#S))) #define __section(S) __attribute__ ((__section__(#S)))
......
#ifndef FILE_IDS_H__
#define FILE_IDS_H__
#include "compiler.h"
#include "types.h"
#include "rbtree.h"
#define FD_ID_INVALID (-1UL)
#define FD_PID_INVALID ((int)-2UL)
struct fd_id_entry {
struct rb_node node;
struct rb_root subtree_root;
struct rb_node subtree_node;
union {
struct {
u32 genid; /* generic id, may have duplicates */
u32 subid; /* subid is always unique */
} key;
u64 id;
} u;
pid_t pid;
int fd;
} __aligned(sizeof(long));
#define MAKE_FD_GENID(dev, ino, pos) \
(((u32)(dev) ^ (u32)(ino) ^ (u32)(pos)))
extern struct fd_id_entry *fd_id_entry_collect(u32 genid, pid_t pid, int fd);
#endif /* FILE_IDS_H__ */
...@@ -22,7 +22,7 @@ struct fmap_fd { ...@@ -22,7 +22,7 @@ struct fmap_fd {
}; };
struct fdinfo_desc { struct fdinfo_desc {
char id[FD_ID_SIZE]; u64 id;
u64 addr; u64 addr;
int pid; int pid;
u32 real_pid; /* futex */ u32 real_pid; /* futex */
......
...@@ -39,15 +39,13 @@ ...@@ -39,15 +39,13 @@
#define PAGE_RSS 1 #define PAGE_RSS 1
#define PAGE_ANON 2 #define PAGE_ANON 2
#define FD_ID_SIZE 50
struct fdinfo_entry { struct fdinfo_entry {
u8 type; u8 type;
u8 len; u8 len;
u16 flags; u16 flags;
u32 pos; u32 pos;
u64 addr; u64 addr;
u8 id[FD_ID_SIZE]; u64 id;
u8 name[0]; u8 name[0];
} __packed; } __packed;
......
...@@ -82,4 +82,13 @@ static void rb_link_node(struct rb_node *node, struct rb_node *parent, ...@@ -82,4 +82,13 @@ static void rb_link_node(struct rb_node *node, struct rb_node *parent,
*rb_link = node; *rb_link = node;
} }
static void rb_link_and_balance(struct rb_root *root,
struct rb_node *node,
struct rb_node *parent,
struct rb_node **rb_link)
{
rb_link_node(node, parent, rb_link);
rb_insert_color(node, root);
}
#endif /* RBTREE_H__ */ #endif /* RBTREE_H__ */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment