Commit a4d37dcf authored by Cyrill Gorcunov

kernel: Add patches needed

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
parent 76df6d9e
From 110e1f59ca03d4f0aca9c42eb466870920e3d0c5 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 8 Nov 2011 17:07:58 +0400
Subject: [PATCH 5/6] clone: Introduce the CLONE_CHILD_USEPID functionality
When restoring a task (or a set of tasks) we need to recreate them with
exactly the same pid as they had before. Thus we need the ability to create
a task with specified pid.
The proposal is to reuse the bit of the CLONE_STOPPED clone flag, which is
no longer used. The desired pid is read from user space through the
child_tidptr argument.
About the security implications: this can cause problems with pid
wraparound and the like, so the feature can be restricted with a rule such
as "don't allow CLONE_CHILD_USEPID once the current pid namespace has ever
done a real pid allocation". This would work perfectly for checkpoint-restore
and would not give anyone a chance to screw up pids on a live system.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
include/linux/pid.h | 2 +-
include/linux/sched.h | 1 +
kernel/fork.c | 10 ++++++-
kernel/pid.c | 70 +++++++++++++++++++++++++++++++++++-------------
4 files changed, 62 insertions(+), 21 deletions(-)
diff --git a/include/linux/pid.h b/include/linux/pid.h
index b152d44..6bfe317 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -119,7 +119,7 @@ extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns, int pid);
extern void free_pid(struct pid *pid);
/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68daf4f..389068d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -23,6 +23,7 @@
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
and is now available for re-use. */
+#define CLONE_CHILD_USEPID 0x02000000 /* use the given pid */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
diff --git a/kernel/fork.c b/kernel/fork.c
index ba0d172..0c67c63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1250,8 +1250,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
+ int want_pid = 0;
+
+ if (clone_flags & CLONE_CHILD_USEPID) {
+ retval = get_user(want_pid, child_tidptr);
+ if (retval)
+ goto bad_fork_cleanup_io;
+ }
+
retval = -ENOMEM;
- pid = alloc_pid(p->nsproxy->pid_ns);
+ pid = alloc_pid(p->nsproxy->pid_ns, want_pid);
if (!pid)
goto bad_fork_cleanup_io;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index fa5f722..b69f6a3 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -159,11 +159,55 @@ static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
} while ((prev != last_write) && (pid_before(base, last_write, pid)));
}
-static int alloc_pidmap(struct pid_namespace *pid_ns)
+/*
+ * Make sure @map has a bitmap page attached, allocating one on demand.
+ *
+ * Returns 0 when map->page is populated on exit, or -ENOMEM when it is
+ * still missing after the allocation attempt.
+ */
+static int alloc_pidmap_page(struct pidmap *map)
+{
+	void *fresh;
+
+	/* Common case: the page has been installed already. */
+	if (likely(map->page))
+		return 0;
+
+	fresh = kzalloc(PAGE_SIZE, GFP_KERNEL);
+
+	/*
+	 * Publish the page under pidmap_lock.  If somebody else installed
+	 * one in the meantime, keep theirs and release ours below.
+	 */
+	spin_lock_irq(&pidmap_lock);
+	if (!map->page) {
+		map->page = fresh;
+		fresh = NULL;
+	}
+	spin_unlock_irq(&pidmap_lock);
+
+	/* NULL here if the page was handed over or was never allocated. */
+	kfree(fresh);
+
+	return likely(map->page) ? 0 : -ENOMEM;
+}
+
+/*
+ * Reserve one specific pid number in @pid_ns.
+ *
+ * @pid originates from user space (CLONE_CHILD_USEPID), so it is range
+ * checked before being used to index pid_ns->pidmap[].
+ *
+ * Returns @pid on success, -EINVAL if @pid is not inside (0, pid_max),
+ * -ENOMEM if the bitmap page could not be set up, or -EBUSY if the pid
+ * is already marked as in use.
+ */
+static int set_pidmap(struct pid_namespace *pid_ns, int pid)
+{
+	int offset;
+	struct pidmap *map;
+
+	/* A negative or too large value would index outside pidmap[]. */
+	if (pid <= 0 || pid >= pid_max)
+		return -EINVAL;
+
+	offset = pid & BITS_PER_PAGE_MASK;
+	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
+
+	if (alloc_pidmap_page(map) < 0)
+		return -ENOMEM;
+
+	/* Claim the bit atomically; if it was already set the pid is taken. */
+	if (!test_and_set_bit(offset, map->page)) {
+		atomic_dec(&map->nr_free);
+		return pid;
+	}
+
+	return -EBUSY;
+}
+
+static int alloc_pidmap(struct pid_namespace *pid_ns, int desired_pid)
{
int i, offset, max_scan, pid, last = pid_ns->last_pid;
struct pidmap *map;
+ if (desired_pid)
+ return set_pidmap(pid_ns, desired_pid);
+
pid = last + 1;
if (pid >= pid_max)
pid = RESERVED_PIDS;
@@ -176,22 +220,9 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
*/
max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- break;
- }
+ if (alloc_pidmap_page(map) < 0)
+ break;
+
if (likely(atomic_read(&map->nr_free))) {
do {
if (!test_and_set_bit(offset, map->page)) {
@@ -277,7 +308,7 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, int this_ns_pid)
{
struct pid *pid;
enum pid_type type;
@@ -291,13 +322,14 @@ struct pid *alloc_pid(struct pid_namespace *ns)
tmp = ns;
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ nr = alloc_pidmap(tmp, this_ns_pid);
if (nr < 0)
goto out_free;
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
+ this_ns_pid = 0;
}
get_pid_ns(ns);
--
1.7.6.4
From 8b85d835bca59de5b5c318f6492be0c73d2da313 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Tue, 8 Nov 2011 17:45:52 +0400
Subject: [PATCH 6/6] prctl: Add PR_CKPT_CTL helper
For the sake of checkpoint/restore we need a number
of things to be tuned at restore time, such as the
vDSO address, the task flags and the mm code, data,
stack and brk boundaries.
This is done via the prctl interface.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
arch/x86/vdso/vma.c | 33 +++++++++++++++++++++++++++++++++
include/linux/prctl.h | 12 ++++++++++++
kernel/sys.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 89 insertions(+), 0 deletions(-)
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 153407c..e0b974f 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -137,6 +137,39 @@ up_fail:
return ret;
}
+/*
+ * Move the vDSO of the current task to @addr.
+ *
+ * Any existing vDSO mapping recorded in mm->context.vdso is unmapped first,
+ * then the vDSO pages are installed at @addr as a special mapping.
+ *
+ * @addr is supplied by user space (through prctl()), so it is validated
+ * before any mapping is built from it.
+ *
+ * Returns 0 on success or when the vDSO is disabled, -EINVAL for an
+ * unaligned or out-of-range @addr, otherwise the error reported by
+ * do_munmap() or install_special_mapping().
+ */
+int arch_setup_additional_pages_at(void *addr)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long start = (unsigned long)addr;
+	int ret;
+
+	if (!vdso_enabled)
+		return 0;
+
+	/* Must be page aligned and leave room for the whole vDSO. */
+	if ((start & ~PAGE_MASK) || start > TASK_SIZE - vdso_size)
+		return -EINVAL;
+
+	down_write(&mm->mmap_sem);
+
+	/*
+	 * Unmap previous entry.
+	 */
+	if (mm->context.vdso) {
+		ret = do_munmap(mm, (unsigned long)mm->context.vdso, vdso_size);
+		if (ret)
+			goto err;
+	}
+
+	mm->context.vdso = addr;
+	ret = install_special_mapping(mm, start, vdso_size,
+				      VM_READ | VM_EXEC |
+				      VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC |
+				      VM_ALWAYSDUMP,
+				      vdso_pages);
+	/* The old mapping is gone by now, so do not keep a stale pointer. */
+	if (ret)
+		mm->context.vdso = NULL;
+
+err:
+	up_write(&mm->mmap_sem);
+	return ret;
+}
+
static __init int vdso_setup(char *s)
{
vdso_enabled = simple_strtoul(s, NULL, 0);
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index a3baeb2..2b460ac 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -102,4 +102,16 @@
#define PR_MCE_KILL_GET 34
+/* Checkpoint/restore specifics */
+#define PR_CKPT_CTL 35
+# define PR_CKPT_CTL_SETUP_VDSO_AT 1
+# define PR_CKPT_CTL_SET_TASK_FLAGS 2
+# define PR_CKPT_CTL_SET_MM_START_CODE 3
+# define PR_CKPT_CTL_SET_MM_END_CODE 4
+# define PR_CKPT_CTL_SET_MM_START_DATA 5
+# define PR_CKPT_CTL_SET_MM_END_DATA 6
+# define PR_CKPT_CTL_SET_MM_START_STACK 7
+# define PR_CKPT_CTL_SET_MM_START_BRK 8
+# define PR_CKPT_CTL_SET_MM_BRK 9
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 481611f..2bdb30c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -123,6 +123,12 @@ EXPORT_SYMBOL(cad_pid);
void (*pm_power_off_prepare)(void);
+/*
+ * Architectures that define ARCH_HAS_SETUP_ADDITIONAL_PAGES provide the
+ * real arch_setup_additional_pages_at().  Everywhere else a stub returning
+ * 0 is used; it is "static inline" so that it does not trigger an
+ * unused-function warning when its caller is compiled out under the same
+ * #ifdef.
+ */
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+extern int arch_setup_additional_pages_at(void *addr);
+#else
+static inline int arch_setup_additional_pages_at(void *addr) { return 0; }
+#endif
+
/*
* Returns true if current's euid is same as p's uid or euid,
* or has CAP_SYS_NICE to p's user_ns.
@@ -1841,6 +1847,44 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
else
error = PR_MCE_KILL_DEFAULT;
break;
+ case PR_CKPT_CTL:
+ if (arg4 | arg5)
+ return -EINVAL;
+ switch (arg2) {
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+ case PR_CKPT_CTL_SETUP_VDSO_AT:
+ error = arch_setup_additional_pages_at((void *)arg3);
+ break;
+#endif
+ case PR_CKPT_CTL_SET_TASK_FLAGS:
+ current->flags = arg3;
+ break;
+ case PR_CKPT_CTL_SET_MM_START_CODE:
+ current->mm->start_code = arg3;
+ break;
+ case PR_CKPT_CTL_SET_MM_END_CODE:
+ current->mm->end_code = arg3;
+ break;
+ case PR_CKPT_CTL_SET_MM_START_DATA:
+ current->mm->start_data = arg3;
+ break;
+ case PR_CKPT_CTL_SET_MM_END_DATA:
+ current->mm->end_data = arg3;
+ break;
+ case PR_CKPT_CTL_SET_MM_START_STACK:
+ current->mm->start_stack = arg3;
+ break;
+ case PR_CKPT_CTL_SET_MM_START_BRK:
+ current->mm->start_brk = arg3;
+ break;
+ case PR_CKPT_CTL_SET_MM_BRK:
+ current->mm->brk = arg3;
+ break;
+ default:
+ error = -EINVAL;
+ break;
+ }
+ break;
default:
error = -EINVAL;
break;
--
1.7.6.4
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment