Commit b50ee4a1 authored by Cyrill Gorcunov

Update kernel area

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
parent 83c209ef
crtools
=======
An utility to to checkpoint/restore tasks.
A utility to checkpoint/restore tasks.
Some code snippets are borrowed from
......@@ -13,3 +13,20 @@ Some code snippets are borrowed from
Many thanks to these projects.
Licensed under GPLv2 (http://www.gnu.org/licenses/gpl-2.0.txt)
Kernel patching
===============
To have crtools up and running either
1) use patches from kernel/ directory
2) or clone git://github.com/cyrillos/linux-2.6.git
and switch to branch "crtools".
It's based on Linux
| commit 1ea6b8f48918282bdca0b32a34095504ee65bab5
| Author: Linus Torvalds <torvalds@linux-foundation.org>
| Date: Mon Nov 7 16:16:02 2011 -0800
|
| Linux 3.2-rc1
fs, proc: Make proc_get_link to use dentry instead of inode
From fc4504ee8f471ac1ac8162ec68e98f2c09d53411 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Tue, 8 Nov 2011 14:57:10 +0400
Subject: [PATCH 1/4] fs, proc: Make proc_get_link to use dentry instead of
inode
This patch prepares the ground for the next "map_files"
patch which needs a name of a link file to analyse.
......@@ -16,11 +20,11 @@ CC: Andrew Morton <akpm@linux-foundation.org>
include/linux/proc_fs.h | 2 +-
2 files changed, 11 insertions(+), 11 deletions(-)
Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -165,9 +165,9 @@ static int get_task_root(struct task_str
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2db1bd3..93c81aa 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -165,9 +165,9 @@ static int get_task_root(struct task_struct *task, struct path *root)
return result;
}
......@@ -32,7 +36,7 @@ Index: linux-2.6.git/fs/proc/base.c
int result = -ENOENT;
if (task) {
@@ -182,9 +182,9 @@ static int proc_cwd_link(struct inode *i
@@ -182,9 +182,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path)
return result;
}
......@@ -44,7 +48,7 @@ Index: linux-2.6.git/fs/proc/base.c
int result = -ENOENT;
if (task) {
@@ -1580,13 +1580,13 @@ static const struct file_operations proc
@@ -1567,13 +1567,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
.release = single_release,
};
......@@ -60,7 +64,7 @@ Index: linux-2.6.git/fs/proc/base.c
if (!task)
return -ENOENT;
mm = get_task_mm(task);
@@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct
@@ -1603,7 +1603,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
if (!proc_fd_access_allowed(inode))
goto out;
......@@ -69,7 +73,7 @@ Index: linux-2.6.git/fs/proc/base.c
out:
return ERR_PTR(error);
}
@@ -1655,7 +1655,7 @@ static int proc_pid_readlink(struct dent
@@ -1642,7 +1642,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
if (!proc_fd_access_allowed(inode))
goto out;
......@@ -78,7 +82,7 @@ Index: linux-2.6.git/fs/proc/base.c
if (error)
goto out;
@@ -1959,9 +1959,9 @@ out_task:
@@ -1980,9 +1980,9 @@ out_task:
return rc;
}
......@@ -90,11 +94,11 @@ Index: linux-2.6.git/fs/proc/base.c
}
static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
Index: linux-2.6.git/include/linux/proc_fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/proc_fs.h
+++ linux-2.6.git/include/linux/proc_fs.h
@@ -253,7 +253,7 @@ extern const struct proc_ns_operations u
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 643b96c..c3d11ff 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -253,7 +253,7 @@ extern const struct proc_ns_operations utsns_operations;
extern const struct proc_ns_operations ipcns_operations;
union proc_op {
......@@ -103,3 +107,6 @@ Index: linux-2.6.git/include/linux/proc_fs.h
int (*proc_read)(struct task_struct *task, char *page);
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
--
1.7.6.4
fs, proc: Introduce the /proc/<pid>/map_files/ directory v14
From d23bde31590a7679aa2be7960848b0fedd0ce032 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Tue, 8 Nov 2011 14:58:01 +0400
Subject: [PATCH 2/4] fs, proc: Introduce the /proc/<pid>/map_files/ directory
v14
This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
......@@ -115,14 +117,14 @@ CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Pavel Machek <pavel@ucw.cz>
---
fs/proc/base.c | 345 +++++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/mm.h | 12 +
2 files changed, 357 insertions(+)
fs/proc/base.c | 345 ++++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/mm.h | 12 ++
2 files changed, 357 insertions(+), 0 deletions(-)
Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 93c81aa..9b7a9cd 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,6 +83,7 @@
#include <linux/pid_namespace.h>
#include <linux/fs_struct.h>
......@@ -140,7 +142,7 @@ Index: linux-2.6.git/fs/proc/base.c
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
@@ -2201,6 +2204,347 @@ static const struct file_operations proc
@@ -2217,6 +2220,347 @@ static const struct file_operations proc_fd_operations = {
};
/*
......@@ -488,7 +490,7 @@ Index: linux-2.6.git/fs/proc/base.c
* /proc/pid/fd needs a special permission handler so that a process can still
* access /proc/self/fd after it has executed a setuid().
*/
@@ -2815,6 +3159,7 @@ static const struct inode_operations pro
@@ -2832,6 +3176,7 @@ static const struct inode_operations proc_task_inode_operations;
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
......@@ -496,11 +498,11 @@ Index: linux-2.6.git/fs/proc/base.c
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/mm.h
===================================================================
--- linux-2.6.git.orig/include/linux/mm.h
+++ linux-2.6.git/include/linux/mm.h
@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3dc3a8c..14159d3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
......@@ -519,3 +521,6 @@ Index: linux-2.6.git/include/linux/mm.h
#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
#else
--
1.7.6.4
proc: Introduce the Children: line in /proc/<pid>/status
From 9e489dbc4f796b76adb4440ccf4888d934ede61d Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@parallels.com>
Date: Tue, 8 Nov 2011 14:59:40 +0400
Subject: [PATCH 3/4] fs, proc: Introduce the Children: line in
/proc/<pid>/status
Although we can already get the pids of a task's children, it is just
more convenient to have them this way.
......@@ -10,13 +12,13 @@ Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
fs/proc/array.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
1 files changed, 14 insertions(+), 0 deletions(-)
Index: linux-2.6.git/fs/proc/array.c
===================================================================
--- linux-2.6.git.orig/fs/proc/array.c
+++ linux-2.6.git/fs/proc/array.c
@@ -158,6 +158,18 @@ static inline const char *get_task_state
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 3a1dafd..8f33329 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -158,6 +158,18 @@ static inline const char *get_task_state(struct task_struct *tsk)
return *p;
}
......@@ -35,7 +37,7 @@ Index: linux-2.6.git/fs/proc/array.c
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
@@ -192,6 +204,8 @@ static inline void task_state(struct seq
@@ -192,6 +204,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
cred->uid, cred->euid, cred->suid, cred->fsuid,
cred->gid, cred->egid, cred->sgid, cred->fsgid);
......@@ -44,3 +46,6 @@ Index: linux-2.6.git/fs/proc/array.c
task_lock(p);
if (p->files)
fdt = files_fdtable(p->files);
--
1.7.6.4
fs, proc: Add start_data, end_data, start_brk members to /proc/$pid/stat
From e46fc1fa01faea36ad4c5608436f5900e66c9529 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Tue, 8 Nov 2011 15:00:56 +0400
Subject: [PATCH 4/4] fs, proc: Add start_data, end_data, start_brk members to
/proc/$pid/stat
It helps to dump and restore these mm_struct members at checkpoint/restore time.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
fs/proc/array.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
1 files changed, 5 insertions(+), 2 deletions(-)
Index: linux-2.6.git/fs/proc/array.c
===================================================================
--- linux-2.6.git.orig/fs/proc/array.c
+++ linux-2.6.git/fs/proc/array.c
@@ -478,7 +478,7 @@ static int do_task_stat(struct seq_file
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 8f33329..8248682 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -478,7 +478,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
......@@ -20,7 +24,7 @@ Index: linux-2.6.git/fs/proc/array.c
pid_nr_ns(pid, ns),
tcomm,
state,
@@ -525,7 +525,10 @@ static int do_task_stat(struct seq_file
@@ -525,7 +525,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
task->policy,
(unsigned long long)delayacct_blkio_ticks(task),
cputime_to_clock_t(gtime),
......@@ -32,3 +36,6 @@ Index: linux-2.6.git/fs/proc/array.c
if (mm)
mmput(mm);
return 0;
--
1.7.6.4
This diff is collapsed.
clone: Introduce the CLONE_CHILD_USEPID functionality
From: Pavel Emelyanov <xemul@openvz.org>
When restoring a task (or a set of tasks) we need to recreate them with
exactly the same pid as they had before. Thus we need the ability to create
a task with specified pid.
The proposal is to reuse the already free CLONE_STOPPED clone flag.
About the security implication - this can create some problems with pids
wraparound and similar, so this approach can be restricted with the "don't
allow for CLONE_CHILD_USEPID when the current pid namespace has ever done
real pid allocation". This will work perfectly for checkpoint-restore and
will not give anyone chances for screwing pids up on a living system.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
include/linux/pid.h | 2 -
include/linux/sched.h | 1
kernel/fork.c | 10 ++++++-
kernel/pid.c | 70 ++++++++++++++++++++++++++++++++++++--------------
4 files changed, 62 insertions(+), 21 deletions(-)
Index: linux-2.6.git/include/linux/pid.h
===================================================================
--- linux-2.6.git.orig/include/linux/pid.h
+++ linux-2.6.git/include/linux/pid.h
@@ -119,7 +119,7 @@ extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns, int pid);
extern void free_pid(struct pid *pid);
/*
Index: linux-2.6.git/include/linux/sched.h
===================================================================
--- linux-2.6.git.orig/include/linux/sched.h
+++ linux-2.6.git/include/linux/sched.h
@@ -23,6 +23,7 @@
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
and is now available for re-use. */
+#define CLONE_CHILD_USEPID 0x02000000 /* use the given pid */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
Index: linux-2.6.git/kernel/fork.c
===================================================================
--- linux-2.6.git.orig/kernel/fork.c
+++ linux-2.6.git/kernel/fork.c
@@ -1253,8 +1253,16 @@ static struct task_struct *copy_process(
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
+ int want_pid = 0;
+
+ if (clone_flags & CLONE_CHILD_USEPID) {
+ retval = get_user(want_pid, child_tidptr);
+ if (retval)
+ goto bad_fork_cleanup_io;
+ }
+
retval = -ENOMEM;
- pid = alloc_pid(p->nsproxy->pid_ns);
+ pid = alloc_pid(p->nsproxy->pid_ns, want_pid);
if (!pid)
goto bad_fork_cleanup_io;
}
Index: linux-2.6.git/kernel/pid.c
===================================================================
--- linux-2.6.git.orig/kernel/pid.c
+++ linux-2.6.git/kernel/pid.c
@@ -159,11 +159,55 @@ static void set_last_pid(struct pid_name
} while ((prev != last_write) && (pid_before(base, last_write, pid)));
}
-static int alloc_pidmap(struct pid_namespace *pid_ns)
+static int alloc_pidmap_page(struct pidmap *map)
+{
+ if (unlikely(!map->page)) {
+ void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ /*
+ * Free the page if someone raced with us
+ * installing it:
+ */
+ spin_lock_irq(&pidmap_lock);
+ if (!map->page) {
+ map->page = page;
+ page = NULL;
+ }
+ spin_unlock_irq(&pidmap_lock);
+ kfree(page);
+ if (unlikely(!map->page))
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int set_pidmap(struct pid_namespace *pid_ns, int pid)
+{
+ int offset;
+ struct pidmap *map;
+
+ offset = pid & BITS_PER_PAGE_MASK;
+ map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
+
+ if (alloc_pidmap_page(map) < 0)
+ return -ENOMEM;
+
+ if (!test_and_set_bit(offset, map->page)) {
+ atomic_dec(&map->nr_free);
+ return pid;
+ }
+
+ return -EBUSY;
+}
+
+static int alloc_pidmap(struct pid_namespace *pid_ns, int desired_pid)
{
int i, offset, max_scan, pid, last = pid_ns->last_pid;
struct pidmap *map;
+ if (desired_pid)
+ return set_pidmap(pid_ns, desired_pid);
+
pid = last + 1;
if (pid >= pid_max)
pid = RESERVED_PIDS;
@@ -176,22 +220,9 @@ static int alloc_pidmap(struct pid_names
*/
max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- break;
- }
+ if (alloc_pidmap_page(map) < 0)
+ break;
+
if (likely(atomic_read(&map->nr_free))) {
do {
if (!test_and_set_bit(offset, map->page)) {
@@ -277,7 +308,7 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, int this_ns_pid)
{
struct pid *pid;
enum pid_type type;
@@ -291,13 +322,14 @@ struct pid *alloc_pid(struct pid_namespa
tmp = ns;
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ nr = alloc_pidmap(tmp, this_ns_pid);
if (nr < 0)
goto out_free;
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
+ this_ns_pid = 0;
}
get_pid_ns(ns);
vfs: Add ->statfs callback for pipefs
From: Pavel Emelyanov <xemul@parallels.com>
This is done to make it possible to distinguish pipes
from fifos when opening one via /proc/<pid>/fd/ link.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
fs/pipe.c | 1 +
1 file changed, 1 insertion(+)
Index: linux-2.6.git/fs/pipe.c
===================================================================
--- linux-2.6.git.orig/fs/pipe.c
+++ linux-2.6.git/fs/pipe.c
@@ -1254,6 +1254,7 @@ out:
static const struct super_operations pipefs_ops = {
.destroy_inode = free_inode_nonrcu,
+ .statfs = simple_statfs,
};
/*
fs: Add do_close helper
To be able to close file descriptors right from inside
kernel space, a do_close() helper is added.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
fs/open.c | 32 ++++++++++++++++++++------------
include/linux/fs.h | 1 +
2 files changed, 21 insertions(+), 12 deletions(-)
Index: linux-2.6.git/fs/open.c
===================================================================
--- linux-2.6.git.orig/fs/open.c
+++ linux-2.6.git/fs/open.c
@@ -1056,17 +1056,11 @@ int filp_close(struct file *filp, fl_own
EXPORT_SYMBOL(filp_close);
-/*
- * Careful here! We test whether the file pointer is NULL before
- * releasing the fd. This ensures that one clone task can't release
- * an fd while another clone is opening it.
- */
-SYSCALL_DEFINE1(close, unsigned int, fd)
+int do_close(unsigned int fd)
{
struct file * filp;
struct files_struct *files = current->files;
struct fdtable *fdt;
- int retval;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
@@ -1079,7 +1073,25 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
FD_CLR(fd, fdt->close_on_exec);
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
- retval = filp_close(filp, files);
+
+ return filp_close(filp, files);
+
+out_unlock:
+ spin_unlock(&files->file_lock);
+ return -EBADF;
+}
+EXPORT_SYMBOL_GPL(do_close);
+
+/*
+ * Careful here! We test whether the file pointer is NULL before
+ * releasing the fd. This ensures that one clone task can't release
+ * an fd while another clone is opening it.
+ */
+SYSCALL_DEFINE1(close, unsigned int, fd)
+{
+ int retval;
+
+ retval = do_close(fd);
/* can't restart close syscall because file table entry was cleared */
if (unlikely(retval == -ERESTARTSYS ||
@@ -1089,10 +1101,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
retval = -EINTR;
return retval;
-
-out_unlock:
- spin_unlock(&files->file_lock);
- return -EBADF;
}
EXPORT_SYMBOL(sys_close);
Index: linux-2.6.git/include/linux/fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/fs.h
+++ linux-2.6.git/include/linux/fs.h
@@ -2025,6 +2025,7 @@ extern struct file *file_open_root(struc
extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
const struct cred *);
extern int filp_close(struct file *, fl_owner_t id);
+extern int do_close(unsigned int fd);
extern char * getname(const char __user *);
/* fs/ioctl.c */
fs, proc: Add /proc/$pid/tls entry
To be able to restart checkpointed tasks we need
to know TLS status at dumping time. Export this
information by /proc/$pid/tls entry.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
fs/proc/base.c | 20 ++++++++++++++++++++
1 file changed, 20 insertions(+)
Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -3150,6 +3150,23 @@ static int proc_pid_personality(struct s
return err;
}
+#ifdef CONFIG_X86
+static int proc_pid_tls(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *task)
+{
+ int err = lock_trace(task);
+ if (!err) {
+ int i;
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ seq_printf(m, "%x %x\n",
+ task->thread.tls_array[i].a,
+ task->thread.tls_array[i].b);
+ unlock_trace(task);
+ }
+ return err;
+}
+#endif
+
/*
* Thread groups
*/
@@ -3169,6 +3186,9 @@ static const struct pid_entry tgid_base_
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUGO, proc_pid_personality),
+#ifdef CONFIG_X86
+ ONE("tls", S_IRUGO, proc_pid_tls),
+#endif
INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
From: Vasiliy Kulikov <segooon@gmail.com>
In the patch "proc: fix races against execve() of /proc/PID/fd**"
proc_pid_fd_link_getattr() leaked task_struct if ptrace check fails.
Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Reported-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/proc/base.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff -puN fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd-fix fs/proc/base.c
--- a/fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd-fix
+++ a/fs/proc/base.c
@@ -1681,9 +1681,9 @@ static int proc_pid_fd_link_getattr(stru
generic_fillattr(inode, stat);
unlock_trace(task);
- put_task_struct(task);
rc = 0;
out_task:
+ put_task_struct(task);
return rc;
}
_
From: Vasiliy Kulikov <segoon@openwall.com>
fd* files are restricted to the task's owner, and other users may not get
direct access to them. But one may open any of these files and run any
setuid program, keeping opened file descriptors. As there are permission
checks on open(), but not on readdir() and read(), operations on the kept
file descriptors will not be checked. It makes it possible to violate
procfs permission model.
Reading fdinfo/* may disclose current fds' position and flags, reading
directory contents of fdinfo/ and fd/ may disclose the number of opened
files by the target task. This information is not sensitive per se, but it
can reveal some private information (like length of a password stored in a
file) under certain conditions.
Used existing (un)lock_trace functions to check for ptrace_may_access(),
but instead of using EPERM return code from it use EACCES to be consistent
with existing proc_pid_follow_link()/proc_pid_readlink() return code. If
they differ, attacker can guess what fds exist by analyzing stat() return
code. Patched handlers: stat() for fd/*, stat() and read() for fdinfo/*,
readdir() and lookup() for fd/ and fdinfo/.
Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/proc/base.c | 146 ++++++++++++++++++++++++++++++++++++++++-----------------
1 file changed, 103 insertions(+), 43 deletions(-)
Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -1665,12 +1665,46 @@ out:
return error;
}
+static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ struct task_struct *task = get_proc_task(inode);
+ int rc;
+
+ if (task == NULL)
+ return -ESRCH;
+
+ rc = -EACCES;
+ if (lock_trace(task))
+ goto out_task;
+
+ generic_fillattr(inode, stat);
+ unlock_trace(task);
+ put_task_struct(task);
+ rc = 0;
+out_task:
+ return rc;
+}
+
static const struct inode_operations proc_pid_link_inode_operations = {
.readlink = proc_pid_readlink,
.follow_link = proc_pid_follow_link,
.setattr = proc_setattr,
};
+static const struct inode_operations proc_fdinfo_link_inode_operations = {
+ .setattr = proc_setattr,
+ .getattr = proc_pid_fd_link_getattr,
+};
+
+static const struct inode_operations proc_fd_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_pid_follow_link,
+ .setattr = proc_setattr,
+ .getattr = proc_pid_fd_link_getattr,
+};
+
/* building an inode */
@@ -1902,49 +1936,61 @@ out:
static int proc_fd_info(struct inode *inode, struct path *path, char *info)
{
- struct task_struct *task = get_proc_task(inode);
- struct files_struct *files = NULL;
+ struct task_struct *task;
+ struct files_struct *files;
struct file *file;
int fd = proc_fd(inode);
+ int rc;
- if (task) {
- files = get_files_struct(task);
- put_task_struct(task);
- }
- if (files) {
- /*
- * We are not taking a ref to the file structure, so we must
- * hold ->file_lock.
- */
- spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
- if (file) {
- unsigned int f_flags;
- struct fdtable *fdt;
-
- fdt = files_fdtable(files);
- f_flags = file->f_flags & ~O_CLOEXEC;
- if (FD_ISSET(fd, fdt->close_on_exec))
- f_flags |= O_CLOEXEC;
-
- if (path) {
- *path = file->f_path;
- path_get(&file->f_path);
- }
- if (info)
- snprintf(info, PROC_FDINFO_MAX,
- "pos:\t%lli\n"
- "flags:\t0%o\n",
- (long long) file->f_pos,
- f_flags);
- spin_unlock(&files->file_lock);
- put_files_struct(files);
- return 0;
+ task = get_proc_task(inode);
+ if (!task)
+ return -ENOENT;
+
+ rc = -EACCES;
+ if (lock_trace(task))
+ goto out_task;
+
+ rc = -ENOENT;
+ files = get_files_struct(task);
+ if (files == NULL)
+ goto out_unlock;
+
+ /*
+ * We are not taking a ref to the file structure, so we must
+ * hold ->file_lock.
+ */
+ spin_lock(&files->file_lock);
+ file = fcheck_files(files, fd);
+ if (file) {
+ unsigned int f_flags;
+ struct fdtable *fdt;
+
+ fdt = files_fdtable(files);
+ f_flags = file->f_flags & ~O_CLOEXEC;
+ if (FD_ISSET(fd, fdt->close_on_exec))
+ f_flags |= O_CLOEXEC;
+
+ if (path) {
+ *path = file->f_path;
+ path_get(&file->f_path);
}
- spin_unlock(&files->file_lock);
- put_files_struct(files);
- }
- return -ENOENT;
+ if (info)
+ snprintf(info, PROC_FDINFO_MAX,
+ "pos:\t%lli\n"
+ "flags:\t0%o\n",
+ (long long) file->f_pos,
+ f_flags);
+ rc = 0;
+ } else
+ rc = -ENOENT;
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+
+out_unlock:
+ unlock_trace(task);
+out_task:
+ put_task_struct(task);
+ return rc;
}
static int proc_fd_link(struct inode *inode, struct path *path)
@@ -2039,7 +2085,7 @@ static struct dentry *proc_fd_instantiat
spin_unlock(&files->file_lock);
put_files_struct(files);
- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_fd_link_inode_operations;
inode->i_size = 64;
ei->op.proc_get_link = proc_fd_link;
d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2071,7 +2117,12 @@ static struct dentry *proc_lookupfd_comm
if (fd == ~0U)
goto out;
+ result = ERR_PTR(-EACCES);
+ if (lock_trace(task))
+ goto out;
+
result = instantiate(dir, dentry, task, &fd);
+ unlock_trace(task);
out:
put_task_struct(task);
out_no_task:
@@ -2091,23 +2142,28 @@ static int proc_readfd_common(struct fil
retval = -ENOENT;
if (!p)
goto out_no_task;
+
+ retval = -EACCES;
+ if (lock_trace(p))
+ goto out;
+
retval = 0;
fd = filp->f_pos;
switch (fd) {
case 0:
if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
- goto out;
+ goto out_unlock;
filp->f_pos++;
case 1:
ino = parent_ino(dentry);
if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
- goto out;
+ goto out_unlock;
filp->f_pos++;
default:
files = get_files_struct(p);
if (!files)
- goto out;
+ goto out_unlock;
rcu_read_lock();
for (fd = filp->f_pos-2;
fd < files_fdtable(files)->max_fds;
@@ -2131,6 +2187,9 @@ static int proc_readfd_common(struct fil
rcu_read_unlock();
put_files_struct(files);
}
+
+out_unlock:
+ unlock_trace(p);
out:
put_task_struct(p);
out_no_task:
@@ -2208,6 +2267,7 @@ static struct dentry *proc_fdinfo_instan
ei->fd = fd;
inode->i_mode = S_IFREG | S_IRUSR;
inode->i_fop = &proc_fdinfo_file_operations;
+ inode->i_op = &proc_fdinfo_link_inode_operations;
d_set_d_op(dentry, &tid_fd_dentry_operations);
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
From: Vasiliy Kulikov <segoon@openwall.com>
The patch "proc: fix races against execve() of /proc/PID/fd**" is still a
partial fix for a setxid problem. link(2) is a yet another way to
identify whether a specific fd is opened by a privileged process. By
calling link(2) against /proc/PID/fd/* an attacker may identify whether
the fd number is valid for PID by analysing link(2) return code.
Both getattr() and link() can be used by the attacker iff the dentry is
present in the dcache. In this case ->lookup() is not called and the only
way to check ptrace permissions is either operation handler or
->revalidate(). The easiest solution to prevent any unauthorized access
to /proc/PID/fd*/ files is to force the dentry drop on each unauthorized
access attempt.
If an attacker keeps opened fd of /proc/PID/fd/ and dcache contains a
specific dentry for some /proc/PID/fd/XXX, any future attempt to use the
dentry by the attacker would lead to the dentry drop as a result of a
failed ptrace check in ->revalidate(). Then the attacker cannot spawn a
dentry for the specific fd number because of ptrace check in ->lookup().
The dentry drop can be still observed by an attacker by analysing
information from /proc/slabinfo, which is addressed in the successive
patch.
Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/proc/base.c | 42 ++++++------------------------------------
1 file changed, 6 insertions(+), 36 deletions(-)
Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -1665,46 +1665,12 @@ out:
return error;
}
-static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat)
-{
- struct inode *inode = dentry->d_inode;
- struct task_struct *task = get_proc_task(inode);
- int rc;
-
- if (task == NULL)
- return -ESRCH;
-
- rc = -EACCES;
- if (lock_trace(task))
- goto out_task;
-
- generic_fillattr(inode, stat);
- unlock_trace(task);
- rc = 0;
-out_task:
- put_task_struct(task);
- return rc;
-}
-
static const struct inode_operations proc_pid_link_inode_operations = {
.readlink = proc_pid_readlink,
.follow_link = proc_pid_follow_link,
.setattr = proc_setattr,
};
-static const struct inode_operations proc_fdinfo_link_inode_operations = {
- .setattr = proc_setattr,
- .getattr = proc_pid_fd_link_getattr,
-};
-
-static const struct inode_operations proc_fd_link_inode_operations = {
- .readlink = proc_pid_readlink,
- .follow_link = proc_pid_follow_link,
- .setattr = proc_setattr,
- .getattr = proc_pid_fd_link_getattr,
-};
-
/* building an inode */
@@ -2013,6 +1979,11 @@ static int tid_fd_revalidate(struct dent
task = get_proc_task(inode);
fd = proc_fd(inode);
+ if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+ put_task_struct(task);
+ task = NULL;
+ }
+
if (task) {
files = get_files_struct(task);
if (files) {
@@ -2085,7 +2056,7 @@ static struct dentry *proc_fd_instantiat
spin_unlock(&files->file_lock);
put_files_struct(files);
- inode->i_op = &proc_fd_link_inode_operations;
+ inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
ei->op.proc_get_link = proc_fd_link;
d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2267,7 +2238,6 @@ static struct dentry *proc_fdinfo_instan
ei->fd = fd;
inode->i_mode = S_IFREG | S_IRUSR;
inode->i_fop = &proc_fdinfo_file_operations;
- inode->i_op = &proc_fdinfo_link_inode_operations;
d_set_d_op(dentry, &tid_fd_dentry_operations);
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
From: Pavel Emelyanov <xemul@openvz.org>
On reading sysctl dirs we should return -EISDIR instead of -EINVAL.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/proc/proc_sysctl.c | 1 +
1 file changed, 1 insertion(+)
Index: linux-2.6.git/fs/proc/proc_sysctl.c
===================================================================
--- linux-2.6.git.orig/fs/proc/proc_sysctl.c
+++ linux-2.6.git/fs/proc/proc_sysctl.c
@@ -370,6 +370,7 @@ static const struct file_operations proc
};
static const struct file_operations proc_sys_dir_file_operations = {
+ .read = generic_read_dir,
.readdir = proc_sys_readdir,
.llseek = generic_file_llseek,
};
The kernel patch series. See the "series" file for the
order of application. Not all patches address C/R directly,
but some of them are needed due to dependencies.
The following patches are known to be in -mm tree already
procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch
proc-fix-races-against-execve-of-proc-pid-fd.patch
proc-fix-races-against-execve-of-proc-pid-fd-fix.patch
proc-force-dcache-drop-on-unauthorized-access.patch
cr-statfs-callback-for-pipefs
Has been tested on Linux 3.1-rc3.
procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch
proc-fix-races-against-execve-of-proc-pid-fd.patch
proc-fix-races-against-execve-of-proc-pid-fd-fix.patch
proc-force-dcache-drop-on-unauthorized-access.patch
cr-statfs-callback-for-pipefs
fs-proc-switch-to-dentry
cr-proc-map-files-21
cr-clone-with-pid-support
cr-proc-add-children
fs-add-do-close
fs-proc-add-tls
fs-proc-add-mm-task-stat
binfmt-elf-for-cr-5
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment