Commit 7f01d691 authored by Laurent Dufour, committed by Pavel Emelyanov

vdso: Rework vdso processing files

There were multiple copies of the same vDSO-handling code spread over
the different architectures.

This patch merges the duplicated code from arch/*/vdso-pie.c and
arch/*/include/asm/vdso.h into common files, leaving only the
architecture-specific parts in the arch/*/* files.

The files are now organized this way:

include/asm-generic/vdso.h
	contains basic definitions which can be overridden by
	architectures.

arch/*/include/asm/vdso.h
	contains per-architecture definitions.
	It may include include/asm-generic/vdso.h.

pie/util-vdso.c
include/util-vdso.h
	These files contain code and definitions common to both criu and
	the parasite code.
	The file include/util-vdso.h includes arch/*/include/asm/vdso.h.

pie/parasite-vdso.c
include/parasite-vdso.h
	contain code and definitions specific to the parasite code handling
	the vDSO.
	The file include/parasite-vdso.h includes include/util-vdso.h.

arch/*/vdso-pie.c
	contains the architecture-specific code installing the vDSO
	trampoline.

vdso.c
include/vdso.h
	contain code and definitions specific to the criu code handling the
	vDSO.
	The file include/vdso.h includes include/util-vdso.h.
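
For illustration, the resulting include chain is (a sketch summarizing
the layout above, not itself part of the patch):

	include/asm-generic/vdso.h                   (generic defaults)
	  ^-- included by arch/*/include/asm/vdso.h  (per-arch definitions)
	        ^-- included by include/util-vdso.h  (criu + parasite common)
	              ^-- included by include/parasite-vdso.h (parasite side)
	              ^-- included by include/vdso.h          (criu side)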

CC: Christopher Covington <cov@codeaurora.org>
CC: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Acked-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
parent 6eaa4e92
@@ -234,15 +234,7 @@ lib: $(VERSION_HEADER) config built-in.o
$(Q) $(MAKE) $(build)=lib all
ifeq ($(VDSO),y)
$(ARCH_DIR)/vdso-pie.o: pie
$(Q) $(MAKE) $(build)=pie $(ARCH_DIR)/vdso-pie.o
PROGRAM-BUILTINS += $(ARCH_DIR)/vdso-pie.o
ifeq ($(SRCARCH),aarch64)
PROGRAM-BUILTINS += $(ARCH_DIR)/intraprocedure.o
endif
ifeq ($(SRCARCH),ppc64)
PROGRAM-BUILTINS += $(ARCH_DIR)/vdso-trampoline.o
endif
PROGRAM-BUILTINS += pie/util-vdso.o
endif
PROGRAM-BUILTINS += pie/util-fd.o
#ifndef __CR_ASM_VDSO_H__
#define __CR_ASM_VDSO_H__
#include <sys/types.h>
#include "asm/int.h"
#include "protobuf/vma.pb-c.h"
struct parasite_ctl;
struct vm_area_list;
#define VDSO_PROT (PROT_READ | PROT_EXEC)
#define VVAR_PROT (PROT_READ)
#define VDSO_BAD_ADDR (-1ul)
#define VVAR_BAD_ADDR VDSO_BAD_ADDR
#define VDSO_BAD_PFN (-1ull)
#define VVAR_BAD_PFN VDSO_BAD_PFN
struct vdso_symbol {
char name[32];
unsigned long offset;
};
#define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, }
/* Check if symbol present in symtable */
static inline bool vdso_symbol_empty(struct vdso_symbol *s)
{
return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0';
}
#include "asm-generic/vdso.h"
/*
 * This is the minimal set of symbols
 * we should support at the moment.
*/
enum {
VDSO_SYMBOL_CLOCK_GETRES,
VDSO_SYMBOL_CLOCK_GETTIME,
VDSO_SYMBOL_GETTIMEOFDAY,
VDSO_SYMBOL_RT_SIGRETURN,
VDSO_SYMBOL_MAX
};
struct vdso_symtable {
unsigned long vma_start;
unsigned long vma_end;
unsigned long vvar_start;
unsigned long vvar_end;
struct vdso_symbol symbols[VDSO_SYMBOL_MAX];
};
#define VDSO_SYMTABLE_INIT \
{ \
.vma_start = VDSO_BAD_ADDR, \
.vma_end = VDSO_BAD_ADDR, \
.vvar_start = VVAR_BAD_ADDR, \
.vvar_end = VVAR_BAD_ADDR, \
.symbols = { \
[0 ... VDSO_SYMBOL_MAX - 1] = \
(struct vdso_symbol)VDSO_SYMBOL_INIT, \
}, \
}
/* Size of VMA associated with vdso */
static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
{
return t->vma_end - t->vma_start;
}
static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
{
return t->vvar_end - t->vvar_start;
}
/*
 * Special mark which allows us to identify the runtime vdso, where
 * calls from the proxy vdso are redirected. This mark is usually
 * placed at the start of the vdso area, where the Elf header lives.
 * Since such a runtime vdso is solely used by the proxy and nobody
 * else is supposed to access it, it's more or less safe to screw
 * the Elf header with @signature and @proxy_vdso_addr.
 *
 * The @proxy_vdso_addr deserves a few comments. When we redirect
 * the calls from the proxy to the runtime vdso, at the next
 * checkpoint it won't be possible to find which VMA is the proxy,
 * so we save its address in this member.
 */
struct vdso_mark {
	u64 signature;
	unsigned long proxy_vdso_addr;
	unsigned long version;
	/*
	 * In case of the new vDSO format, the VVAR area address is
	 * needed for easier discovery of where it lives, without
	 * relying on procfs output.
	 */
	unsigned long proxy_vvar_addr;
};
#define VDSO_MARK_SIGNATURE (0x6f73647675697263ULL) /* Magic number (criuvdso) */
#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */
#define VDSO_MARK_CUR_VERSION (2)
static inline void vdso_put_mark(void *where, unsigned long proxy_vdso_addr, unsigned long proxy_vvar_addr)
{
struct vdso_mark *m = where;
m->signature = VDSO_MARK_SIGNATURE_V2;
m->proxy_vdso_addr = proxy_vdso_addr;
m->version = VDSO_MARK_CUR_VERSION;
m->proxy_vvar_addr = proxy_vvar_addr;
}
static inline bool is_vdso_mark(void *addr)
{
struct vdso_mark *m = addr;
if (m->signature == VDSO_MARK_SIGNATURE_V2) {
/*
* New format
*/
return true;
} else if (m->signature == VDSO_MARK_SIGNATURE) {
/*
* Old format -- simply extend the mark up
* to the version we support.
*/
vdso_put_mark(m, m->proxy_vdso_addr, VVAR_BAD_ADDR);
return true;
}
return false;
}
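/*
 * A sketch of the mark round-trip (illustration only, not part of this
 * patch; the real writer is vdso_proxify() and the reader is the
 * dump-time detection). Assuming @rt_vdso points at the parked runtime
 * vdso, made writable beforehand:
 *
 *	vdso_put_mark(rt_vdso, proxy_vdso_addr, proxy_vvar_addr);
 *	...
 *	if (is_vdso_mark(rt_vdso))
 *		;	// next checkpoint: skip dumping this vdso, it is
 *			// regenerated on every restore that needs a proxy
 */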
#define VDSO_SYMBOL_CLOCK_GETRES_NAME "__kernel_clock_getres"
#define VDSO_SYMBOL_CLOCK_GETTIME_NAME "__kernel_clock_gettime"
#define VDSO_SYMBOL_GETTIMEOFDAY_NAME "__kernel_gettimeofday"
#define VDSO_SYMBOL_RT_SIGRETURN_NAME "__kernel_rt_sigreturn"
extern struct vdso_symtable vdso_sym_rt;
extern u64 vdso_pfn;
extern int vdso_init(void);
extern int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size);
extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
unsigned long vdso_rt_parked_at, size_t index,
VmaEntry *vmas, size_t nr_vmas);
extern int vdso_redirect_calls(void *base_to, void *base_from, struct vdso_symtable *to, struct vdso_symtable *from);
extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
struct vm_area_list *vma_area_list);
extern void write_intraprocedure_branch(void *to, void *from);
#define VDSO_SYMBOL_MAX 4
#define ARCH_VDSO_SYMBOLS \
"__kernel_clock_getres", \
"__kernel_clock_gettime", \
"__kernel_gettimeofday", \
"__kernel_rt_sigreturn"
struct vdso_symtable;
extern int vdso_redirect_calls(unsigned long base_to,
unsigned long base_from,
struct vdso_symtable *to,
struct vdso_symtable *from);
#endif /* __CR_ASM_VDSO_H__ */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <elf.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include "asm/string.h"
#include "asm/types.h"
#include "compiler.h"
#include "syscall.h"
#include "image.h"
#include "vdso.h"
#include "vma.h"
#include "parasite-vdso.h"
#include "log.h"
#include "bug.h"
@@ -26,7 +13,7 @@
#endif
#define LOG_PREFIX "vdso: "
-int vdso_redirect_calls(void *base_to, void *base_from,
+int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
struct vdso_symtable *to,
struct vdso_symtable *from)
{
@@ -37,8 +24,8 @@ int vdso_redirect_calls(void *base_to, void *base_from,
continue;
pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n",
(unsigned long)base_from, from->symbols[i].offset,
(unsigned long)base_to, to->symbols[i].offset, i);
base_from, from->symbols[i].offset,
base_to, to->symbols[i].offset, i);
write_intraprocedure_branch(base_to + to->symbols[i].offset,
base_from + from->symbols[i].offset);
@@ -46,383 +33,3 @@ int vdso_redirect_calls(void *base_to, void *base_from,
return 0;
}
/* Check if pointer is out-of-bound */
static bool __ptr_oob(void *ptr, void *start, size_t size)
{
void *end = (void *)((unsigned long)start + size);
return ptr > end || ptr < start;
}
/*
* Elf hash, see format specification.
*/
static unsigned long elf_hash(const unsigned char *name)
{
unsigned long h = 0, g;
while (*name) {
h = (h << 4) + *name++;
g = h & 0xf0000000ul;
if (g)
h ^= g >> 24;
h &= ~g;
}
return h;
}
int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
{
Elf64_Phdr *dynamic = NULL, *load = NULL;
Elf64_Ehdr *ehdr = (void *)mem;
Elf64_Dyn *dyn_strtab = NULL;
Elf64_Dyn *dyn_symtab = NULL;
Elf64_Dyn *dyn_strsz = NULL;
Elf64_Dyn *dyn_syment = NULL;
Elf64_Dyn *dyn_hash = NULL;
Elf64_Word *hash = NULL;
Elf64_Phdr *phdr;
Elf64_Dyn *d;
Elf64_Word *bucket, *chain;
Elf64_Word nbucket, nchain;
/*
 * See Elf specification for these magic values.
*/
const char elf_ident[] = {
0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
const char *vdso_symbols[VDSO_SYMBOL_MAX] = {
[VDSO_SYMBOL_CLOCK_GETRES] = VDSO_SYMBOL_CLOCK_GETRES_NAME,
[VDSO_SYMBOL_CLOCK_GETTIME] = VDSO_SYMBOL_CLOCK_GETTIME_NAME,
[VDSO_SYMBOL_GETTIMEOFDAY] = VDSO_SYMBOL_GETTIMEOFDAY_NAME,
[VDSO_SYMBOL_RT_SIGRETURN] = VDSO_SYMBOL_RT_SIGRETURN_NAME,
};
char *dynsymbol_names;
unsigned int i, j, k;
BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident));
pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size);
/*
* Make sure it's a file we support.
*/
if (builtin_memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) {
pr_err("Elf header magic mismatch\n");
return -EINVAL;
}
/*
* We need PT_LOAD and PT_DYNAMIC here. Each once.
*/
phdr = (void *)&mem[ehdr->e_phoff];
for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
if (__ptr_oob(phdr, mem, size))
goto err_oob;
switch (phdr->p_type) {
case PT_DYNAMIC:
if (dynamic) {
pr_err("Second PT_DYNAMIC header\n");
return -EINVAL;
}
dynamic = phdr;
break;
case PT_LOAD:
if (load) {
pr_err("Second PT_LOAD header\n");
return -EINVAL;
}
load = phdr;
break;
}
}
if (!load || !dynamic) {
pr_err("One of obligated program headers is missed\n");
return -EINVAL;
}
pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr);
/*
 * Dynamic section tags should provide us the rest of the information
* needed. Note that we're interested in a small set of tags.
*/
d = (void *)&mem[dynamic->p_offset];
for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) {
if (__ptr_oob(d, mem, size))
goto err_oob;
if (d->d_tag == DT_NULL) {
break;
} else if (d->d_tag == DT_STRTAB) {
dyn_strtab = d;
pr_debug("DT_STRTAB: %p\n", (void *)d->d_un.d_ptr);
} else if (d->d_tag == DT_SYMTAB) {
dyn_symtab = d;
pr_debug("DT_SYMTAB: %p\n", (void *)d->d_un.d_ptr);
} else if (d->d_tag == DT_STRSZ) {
dyn_strsz = d;
pr_debug("DT_STRSZ: %lu\n", (unsigned long)d->d_un.d_val);
} else if (d->d_tag == DT_SYMENT) {
dyn_syment = d;
pr_debug("DT_SYMENT: %lu\n", (unsigned long)d->d_un.d_val);
} else if (d->d_tag == DT_HASH) {
dyn_hash = d;
pr_debug("DT_HASH: %p\n", (void *)d->d_un.d_ptr);
}
}
if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) {
pr_err("Not all dynamic entries are present\n");
return -EINVAL;
}
dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr];
if (__ptr_oob(dynsymbol_names, mem, size))
goto err_oob;
hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr];
if (__ptr_oob(hash, mem, size))
goto err_oob;
nbucket = hash[0];
nchain = hash[1];
bucket = &hash[2];
chain = &hash[nbucket + 2];
pr_debug("nbucket %lu nchain %lu bucket %p chain %p\n",
(long)nbucket, (long)nchain, bucket, chain);
for (i = 0; i < ARRAY_SIZE(vdso_symbols); i++) {
k = elf_hash((const unsigned char *)vdso_symbols[i]);
for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) {
Elf64_Sym *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr];
char *name;
sym = &sym[j];
if (__ptr_oob(sym, mem, size))
continue;
if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
ELF64_ST_BIND(sym->st_info) != STB_GLOBAL)
continue;
name = &dynsymbol_names[sym->st_name];
if (__ptr_oob(name, mem, size))
continue;
if (builtin_strcmp(name, vdso_symbols[i]))
continue;
builtin_memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name));
t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
break;
}
}
return 0;
err_oob:
pr_err("Corrupted Elf data\n");
return -EFAULT;
}
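/*
 * Sketch (not part of this patch): one way a caller can feed the parser,
 * assuming [start, end) of the "[vdso]" mapping were obtained from
 * /proc/<pid>/maps beforehand.
 */
static inline int vdso_fill_symtable_sketch(unsigned long start,
					    unsigned long end,
					    struct vdso_symtable *t)
{
	t->vma_start = start;
	t->vma_end = end;
	return vdso_fill_symtable((char *)start, end - start, t);
}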
static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
{
unsigned long addr;
pr_debug("Remap %s %lx -> %lx\n", who, from, to);
addr = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
if (addr != to) {
pr_err("Unable to remap %lx -> %lx %lx\n",
from, to, addr);
return -1;
}
return 0;
}
/* Park the runtime vDSO in some safe place where it can be accessed from the restorer */
int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
{
int ret;
BUG_ON((vdso_vma_size(sym_rt) + vvar_vma_size(sym_rt)) < park_size);
if (sym_rt->vvar_start != VDSO_BAD_ADDR) {
if (sym_rt->vma_start < sym_rt->vvar_start) {
ret = vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
park_at += vdso_vma_size(sym_rt);
ret |= vdso_remap("rt-vvar", sym_rt->vvar_start,
park_at, vvar_vma_size(sym_rt));
} else {
ret = vdso_remap("rt-vvar", sym_rt->vvar_start,
park_at, vvar_vma_size(sym_rt));
park_at += vvar_vma_size(sym_rt);
ret |= vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
}
} else
ret = vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
return ret;
}
int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
unsigned long vdso_rt_parked_at, size_t index,
VmaEntry *vmas, size_t nr_vmas)
{
VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
struct vdso_symtable s = VDSO_SYMTABLE_INIT;
bool remap_rt = false;
/*
 * Figure out which kind of vdso tuple we get.
*/
if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
vma_vdso = &vmas[index];
else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
vma_vvar = &vmas[index];
if (index < (nr_vmas - 1)) {
if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
vma_vdso = &vmas[index + 1];
else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
vma_vvar = &vmas[index + 1];
}
if (!vma_vdso) {
pr_err("Can't find vDSO area in image\n");
return -1;
}
/*
* vDSO mark overwrites Elf program header of proxy vDSO thus
* it must never ever be greater in size.
*/
BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
/*
* Find symbols in vDSO zone read from image.
*/
if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
return -1;
/*
* Proxification strategy
*
* - There might be two vDSO zones: vdso code and optionally vvar data
* - To be able to use in-place remapping we need
*
 * a) the size and order of the vDSO zones must match
 * b) the symbol offsets must match
 * c) the number of vDSO zones must be the same
*/
if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
size_t i;
for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
if (s.symbols[i].offset != sym_rt->symbols[i].offset)
break;
}
if (i == ARRAY_SIZE(s.symbols)) {
if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
if (remap_rt) {
long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
long delta_this = vma_vvar->start - vma_vdso->start;
remap_rt = (delta_rt ^ delta_this) < 0 ? false : true;
}
} else
remap_rt = true;
}
}
pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
vma_vdso->start, vma_vdso->end,
vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);
/*
 * Easy case -- the vdso from the image has the same offsets, order and
 * size as the runtime one, so we simply remap the runtime vdso to the
 * dumpee position without generating any proxy.
 *
 * Note we may remap the VVAR vdso as well, which might not have been
 * mapped yet by the caller code. So drop VMA_AREA_REGULAR from it and
 * the caller will not touch it anymore.
*/
if (remap_rt) {
int ret = 0;
pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
pr_err("Failed to unmap %s\n", who);
return -1;
}
if (vma_vvar) {
if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
pr_err("Failed to unmap %s\n", who);
return -1;
}
if (vma_vdso->start < vma_vvar->start) {
ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
vdso_rt_parked_at += vdso_vma_size(sym_rt);
ret |= vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
} else {
ret = vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
vdso_rt_parked_at += vvar_vma_size(sym_rt);
ret |= vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
}
} else
ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
return ret;
}
/*
 * Now the complex case -- we need to proxify calls. We redirect
 * calls from the dumpee vdso to the runtime vdso, making the dumpee
 * vdso operate as a proxy vdso.
*/
pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
/*
* Don't forget to shift if vvar is before vdso.
*/
if (sym_rt->vvar_start != VDSO_BAD_ADDR &&
sym_rt->vvar_start < sym_rt->vma_start)
vdso_rt_parked_at += vvar_vma_size(sym_rt);
if (vdso_redirect_calls((void *)vdso_rt_parked_at,
(void *)vma_vdso->start,
sym_rt, &s)) {
pr_err("Failed to proxify dumpee contents\n");
return -1;
}
/*
 * Put a special mark into the runtime vdso, so that at the next
 * checkpoint we can detect it and avoid dumping it, since it is
 * auto-generated on every session where a proxy is required.
*/
sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start, vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
return 0;
}
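/*
 * Sketch of a hypothetical vdso_proxify() call site (not part of this
 * patch; the real one lives in the restorer). It assumes the runtime
 * vdso was parked at @vdso_rt_parked_at by an earlier vdso_do_park().
 */
static int vdso_proxify_all_sketch(struct vdso_symtable *sym_rt,
				   unsigned long vdso_rt_parked_at,
				   VmaEntry *vmas, size_t nr_vmas)
{
	size_t i;

	for (i = 0; i < nr_vmas; i++) {
		if (!vma_entry_is(&vmas[i], VMA_AREA_VDSO))
			continue;
		return vdso_proxify("dumpee", sym_rt, vdso_rt_parked_at,
				    i, vmas, nr_vmas);
	}
	return 0;
}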
#ifndef __CR_ASM_VDSO_H__
#define __CR_ASM_VDSO_H__
#include <sys/types.h>
#include "asm/int.h"
#include "protobuf/vma.pb-c.h"
struct parasite_ctl;
struct vm_area_list;
#define VDSO_PROT (PROT_READ | PROT_EXEC)
#define VVAR_PROT (PROT_READ)
#define VDSO_BAD_ADDR (-1ul)
#define VVAR_BAD_ADDR VDSO_BAD_ADDR
#define VDSO_BAD_PFN (-1ull)
#define VVAR_BAD_PFN VDSO_BAD_PFN
struct vdso_symbol {
char name[32];
unsigned long offset;
};
#define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, }
/* Check if symbol present in symtable */
static inline bool vdso_symbol_empty(struct vdso_symbol *s)
{
return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0';
}
#include "asm-generic/vdso.h"
/*
 * This definition is used in pie/util-vdso.c to initialize the vdso symbol
 * name string table 'vdso_symbols'.
 *
 * Picked from the kernel file arch/powerpc/kernel/vdso64/vdso64.lds.S.
 *
 * Note that '__kernel_datapage_offset' is not a service but mostly data
 * inside the text page which should not be used as is from user space.
 */
enum {
VDSO_SYMBOL_CLOCK_GETRES,
VDSO_SYMBOL_CLOCK_GETTIME,
VDSO_SYMBOL_GET_SYSCALL_MAP,
VDSO_SYMBOL_GET_TBFREQ,
VDSO_SYMBOL_GETCPU,
VDSO_SYMBOL_GETTIMEOFDAY,
VDSO_SYMBOL_SIGTRAMP_RT64,
VDSO_SYMBOL_SYNC_DICACHE,
VDSO_SYMBOL_SYNC_DICACHE_P5,
VDSO_SYMBOL_TIME,
VDSO_SYMBOL_MAX
};
#define VDSO_SYMBOL_CLOCK_GETRES_NAME "__kernel_clock_getres"
#define VDSO_SYMBOL_CLOCK_GETTIME_NAME "__kernel_clock_gettime"
#define VDSO_SYMBOL_GET_SYSCALL_MAP_NAME "__kernel_get_syscall_map"
#define VDSO_SYMBOL_GET_TBFREQ_NAME "__kernel_get_tbfreq"
#define VDSO_SYMBOL_GETCPU_NAME "__kernel_getcpu"
#define VDSO_SYMBOL_GETTIMEOFDAY_NAME "__kernel_gettimeofday"
#define VDSO_SYMBOL_SIGTRAMP_RT64_NAME "__kernel_sigtramp_rt64"
#define VDSO_SYMBOL_SYNC_DICACHE_NAME "__kernel_sync_dicache"
#define VDSO_SYMBOL_SYNC_DICACHE_P5_NAME "__kernel_sync_dicache_p5"
#define VDSO_SYMBOL_TIME_NAME "__kernel_time"
struct vdso_symtable {
unsigned long vma_start;
unsigned long vma_end;
unsigned long vvar_start;
unsigned long vvar_end;
struct vdso_symbol symbols[VDSO_SYMBOL_MAX];
};
#define VDSO_SYMTABLE_INIT \
{ \
.vma_start = VDSO_BAD_ADDR, \
.vma_end = VDSO_BAD_ADDR, \
.vvar_start = VVAR_BAD_ADDR, \
.vvar_end = VVAR_BAD_ADDR, \
.symbols = { \
[0 ... VDSO_SYMBOL_MAX - 1] = \
(struct vdso_symbol)VDSO_SYMBOL_INIT, \
}, \
}
/* Size of VMA associated with vdso */
static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
{
return t->vma_end - t->vma_start;
}
static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
{
return t->vvar_end - t->vvar_start;
}
/*
 * Special mark which allows us to identify the runtime vdso, where
 * calls from the proxy vdso are redirected. This mark is usually
 * placed at the start of the vdso area, where the Elf header lives.
 * Since such a runtime vdso is solely used by the proxy and nobody
 * else is supposed to access it, it's more or less safe to screw
 * the Elf header with @signature and @proxy_vdso_addr.
 *
 * The @proxy_vdso_addr deserves a few comments. When we redirect
 * the calls from the proxy to the runtime vdso, at the next
 * checkpoint it won't be possible to find which VMA is the proxy,
 * so we save its address in this member.
 */
struct vdso_mark {
	u64 signature;
	unsigned long proxy_vdso_addr;
	unsigned long version;
	/*
	 * In case of the new vDSO format, the VVAR area address is
	 * needed for easier discovery of where it lives, without
	 * relying on procfs output.
	 */
	unsigned long proxy_vvar_addr;
};
#define VDSO_MARK_SIGNATURE (0x6f73647675697263ULL) /* Magic number (criuvdso) */
#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */
#define VDSO_MARK_CUR_VERSION (2)
static inline void vdso_put_mark(void *where, unsigned long proxy_vdso_addr, unsigned long proxy_vvar_addr)
{
struct vdso_mark *m = where;
m->signature = VDSO_MARK_SIGNATURE_V2;
m->proxy_vdso_addr = proxy_vdso_addr;
m->version = VDSO_MARK_CUR_VERSION;
m->proxy_vvar_addr = proxy_vvar_addr;
}
static inline bool is_vdso_mark(void *addr)
{
struct vdso_mark *m = addr;
if (m->signature == VDSO_MARK_SIGNATURE_V2) {
/*
* New format
*/
return true;
} else if (m->signature == VDSO_MARK_SIGNATURE) {
/*
* Old format -- simply extend the mark up
* to the version we support.
*/
vdso_put_mark(m, m->proxy_vdso_addr, VVAR_BAD_ADDR);
return true;
}
return false;
}
extern struct vdso_symtable vdso_sym_rt;
extern u64 vdso_pfn;
extern int vdso_init(void);
extern int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size);
extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
unsigned long vdso_rt_parked_at, size_t index,
VmaEntry *vmas, size_t nr_vmas);
extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
struct vm_area_list *vma_area_list);
extern void write_intraprocedure_branch(void *to, void *from);
#define VDSO_SYMBOL_MAX 10
#define ARCH_VDSO_SYMBOLS \
"__kernel_clock_getres", \
"__kernel_clock_gettime", \
"__kernel_get_syscall_map", \
"__kernel_get_tbfreq", \
"__kernel_getcpu", \
"__kernel_gettimeofday", \
"__kernel_sigtramp_rt64", \
"__kernel_sync_dicache", \
"__kernel_sync_dicache_p5", \
"__kernel_time"
struct vdso_symtable;
extern int vdso_redirect_calls(unsigned long base_to,
unsigned long base_from,
struct vdso_symtable *to,
struct vdso_symtable *from);
#endif /* __CR_ASM_VDSO_H__ */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <elf.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include "asm/string.h"
#include "asm/types.h"
#include "syscall.h"
#include "image.h"
#include "vdso.h"
#include "vma.h"
#include "parasite-vdso.h"
#include "log.h"
#include "bug.h"
@@ -139,10 +128,10 @@ static inline void put_trampoline_call(unsigned long at, unsigned long to,
invalidate_caches(at);
}
-static int vdso_redirect_calls(unsigned long base_to,
-			       unsigned long base_from,
-			       struct vdso_symtable *to,
-			       struct vdso_symtable *from)
+int vdso_redirect_calls(unsigned long base_to,
+			unsigned long base_from,
+			struct vdso_symtable *to,
+			struct vdso_symtable *from)
{
unsigned int i;
unsigned long trampoline;
@@ -167,391 +156,3 @@ static int vdso_redirect_calls(unsigned long base_to,
return 0;
}
/* Check if pointer is out-of-bound */
static bool __ptr_oob(void *ptr, void *start, size_t size)
{
void *end = (void *)((unsigned long)start + size);
return ptr > end || ptr < start;
}
/*
* Elf hash, see format specification.
*/
static unsigned long elf_hash(const unsigned char *name)
{
unsigned long h = 0, g;
while (*name) {
h = (h << 4) + *name++;
g = h & 0xf0000000ul;
if (g)
h ^= g >> 24;
h &= ~g;
}
return h;
}
#define SET_VDSO_SYM(s) [VDSO_SYMBOL_##s] = VDSO_SYMBOL_##s##_NAME
const char *vdso_symbols[VDSO_SYMBOL_MAX] = {
SET_VDSO_SYM(CLOCK_GETRES),
SET_VDSO_SYM(CLOCK_GETTIME),
SET_VDSO_SYM(GET_SYSCALL_MAP),
SET_VDSO_SYM(GET_TBFREQ),
SET_VDSO_SYM(GETCPU),
SET_VDSO_SYM(GETTIMEOFDAY),
SET_VDSO_SYM(SIGTRAMP_RT64),
SET_VDSO_SYM(SYNC_DICACHE),
SET_VDSO_SYM(SYNC_DICACHE_P5),
SET_VDSO_SYM(TIME)
};
#define VDSO_SYMBOL(i) vdso_symbols[i]
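/*
 * Illustrative expansion (not added code): SET_VDSO_SYM pastes the
 * enum entry and the matching name define together, so
 *
 *	SET_VDSO_SYM(GETTIMEOFDAY)
 *
 * expands to
 *
 *	[VDSO_SYMBOL_GETTIMEOFDAY] = VDSO_SYMBOL_GETTIMEOFDAY_NAME
 *
 * i.e. [VDSO_SYMBOL_GETTIMEOFDAY] = "__kernel_gettimeofday"
 */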
int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
{
Elf64_Phdr *dynamic = NULL, *load = NULL;
Elf64_Ehdr *ehdr = (void *)mem;
Elf64_Dyn *dyn_strtab = NULL;
Elf64_Dyn *dyn_symtab = NULL;
Elf64_Dyn *dyn_strsz = NULL;
Elf64_Dyn *dyn_syment = NULL;
Elf64_Dyn *dyn_hash = NULL;
Elf64_Word *hash = NULL;
Elf64_Phdr *phdr;
Elf64_Dyn *d;
Elf64_Word *bucket, *chain;
Elf64_Word nbucket, nchain;
/*
 * See Elf specification for these magic values.
*/
static const char elf_ident[] = {
0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
char *dynsymbol_names;
unsigned int i, j, k;
BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident));
pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size);
/*
* Make sure it's a file we support.
*/
if (builtin_memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) {
pr_err("Elf header magic mismatch\n");
return -EINVAL;
}
/*
* We need PT_LOAD and PT_DYNAMIC here. Each once.
*/
phdr = (void *)&mem[ehdr->e_phoff];
for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
if (__ptr_oob(phdr, mem, size))
goto err_oob;
switch (phdr->p_type) {
case PT_DYNAMIC:
if (dynamic) {
pr_err("Second PT_DYNAMIC header\n");
return -EINVAL;
}
dynamic = phdr;
break;
case PT_LOAD:
if (load) {
pr_err("Second PT_LOAD header\n");
return -EINVAL;
}
load = phdr;
break;
}
}
if (!load || !dynamic) {
pr_err("One of obligated program headers is missed\n");
return -EINVAL;
}
pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr);
/*
 * Dynamic section tags should provide us the rest of the information
* needed. Note that we're interested in a small set of tags.
*/
d = (void *)&mem[dynamic->p_offset];
for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) {
if (__ptr_oob(d, mem, size))
goto err_oob;
if (d->d_tag == DT_NULL) {
break;
} else if (d->d_tag == DT_STRTAB) {
dyn_strtab = d;
pr_debug("DT_STRTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
} else if (d->d_tag == DT_SYMTAB) {
dyn_symtab = d;
pr_debug("DT_SYMTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
} else if (d->d_tag == DT_STRSZ) {
dyn_strsz = d;
pr_debug("DT_STRSZ: %lx\n", (unsigned long)d->d_un.d_val);
} else if (d->d_tag == DT_SYMENT) {
dyn_syment = d;
pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val);
} else if (d->d_tag == DT_HASH) {
dyn_hash = d;
pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr);
}
}
if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) {
pr_err("Not all dynamic entries are present\n");
return -EINVAL;
}
dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr];
if (__ptr_oob(dynsymbol_names, mem, size))
goto err_oob;
hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr];
if (__ptr_oob(hash, mem, size))
goto err_oob;
nbucket = hash[0];
nchain = hash[1];
bucket = &hash[2];
chain = &hash[nbucket + 2];
pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n",
(long)nbucket, (long)nchain, (unsigned long)bucket, (unsigned long)chain);
for (i = 0; i < VDSO_SYMBOL_MAX; i++) {
const char * symbol = VDSO_SYMBOL(i);
k = elf_hash((const unsigned char *)symbol);
for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) {
Elf64_Sym *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr];
char *name;
sym = &sym[j];
if (__ptr_oob(sym, mem, size))
continue;
if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
ELF64_ST_BIND(sym->st_info) != STB_GLOBAL)
continue;
name = &dynsymbol_names[sym->st_name];
if (__ptr_oob(name, mem, size))
continue;
if (builtin_strcmp(name, symbol))
continue;
builtin_memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name));
t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
break;
}
}
return 0;
err_oob:
pr_err("Corrupted Elf data\n");
return -EFAULT;
}
static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
{
unsigned long addr;
pr_debug("Remap %s %lx -> %lx\n", who, from, to);
addr = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
if (addr != to) {
pr_err("Unable to remap %lx -> %lx %lx\n",
from, to, addr);
return -1;
}
return 0;
}
/* Park the runtime vDSO in some safe place where it can be accessed from the restorer */
int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
{
int ret;
BUG_ON((vdso_vma_size(sym_rt) + vvar_vma_size(sym_rt)) < park_size);
if (sym_rt->vvar_start != VDSO_BAD_ADDR) {
if (sym_rt->vma_start < sym_rt->vvar_start) {
ret = vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
park_at += vdso_vma_size(sym_rt);
ret |= vdso_remap("rt-vvar", sym_rt->vvar_start,
park_at, vvar_vma_size(sym_rt));
} else {
ret = vdso_remap("rt-vvar", sym_rt->vvar_start,
park_at, vvar_vma_size(sym_rt));
park_at += vvar_vma_size(sym_rt);
ret |= vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
}
} else
ret = vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
return ret;
}
int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
unsigned long vdso_rt_parked_at, size_t index,
VmaEntry *vmas, size_t nr_vmas)
{
VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
struct vdso_symtable s = VDSO_SYMTABLE_INIT;
bool remap_rt = false;
/*
* Figure out which kind of vdso tuple we get.
*/
if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
vma_vdso = &vmas[index];
else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
vma_vvar = &vmas[index];
if (index < (nr_vmas - 1)) {
if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
vma_vdso = &vmas[index + 1];
else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
vma_vvar = &vmas[index + 1];
}
if (!vma_vdso) {
pr_err("Can't find vDSO area in image\n");
return -1;
}
/*
* vDSO mark overwrites Elf program header of proxy vDSO thus
* it must never ever be greater in size.
*/
BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
/*
* Find symbols in vDSO zone read from image.
*/
if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
return -1;
/*
* Proxification strategy
*
* - There might be two vDSO zones: vdso code and optionally vvar data
* - To be able to use in-place remapping we need
*
 * a) the size and order of the vDSO zones must match
 * b) the symbol offsets must match
 * c) the number of vDSO zones must be the same
*/
if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
size_t i;
for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
if (s.symbols[i].offset != sym_rt->symbols[i].offset)
break;
}
if (i == ARRAY_SIZE(s.symbols)) {
if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
if (remap_rt) {
long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
long delta_this = vma_vvar->start - vma_vdso->start;
remap_rt = (delta_rt ^ delta_this) < 0 ? false : true;
}
} else
remap_rt = true;
}
}
pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
vma_vdso->start, vma_vdso->end,
vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);
/*
 * Easy case -- the vdso from the image has the same offsets, order and
 * size as the runtime one, so we simply remap the runtime vdso to the
 * dumpee position without generating any proxy.
 *
 * Note we may remap the VVAR vdso as well, which might not have been
 * mapped yet by the caller code. So drop VMA_AREA_REGULAR from it and
 * the caller will not touch it anymore.
*/
if (remap_rt) {
int ret = 0;
pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
pr_err("Failed to unmap %s\n", who);
return -1;
}
if (vma_vvar) {
if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
pr_err("Failed to unmap %s\n", who);
return -1;
}
if (vma_vdso->start < vma_vvar->start) {
ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
vdso_rt_parked_at += vdso_vma_size(sym_rt);
ret |= vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
} else {
ret = vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
vdso_rt_parked_at += vvar_vma_size(sym_rt);
ret |= vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
}
} else
ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
return ret;
}
/*
 * Now the complex case -- we need to proxify calls. We redirect
 * calls from the dumpee vdso to the runtime vdso, making the dumpee
 * vdso operate as a proxy vdso.
*/
pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
/*
* Don't forget to shift if vvar is before vdso.
*/
if (sym_rt->vvar_start != VDSO_BAD_ADDR &&
sym_rt->vvar_start < sym_rt->vma_start)
vdso_rt_parked_at += vvar_vma_size(sym_rt);
if (vdso_redirect_calls(vdso_rt_parked_at,
vma_vdso->start,
sym_rt, &s)) {
pr_err("Failed to proxify dumpee contents\n");
return -1;
}
/*
 * Put a special mark into the runtime vdso, so that at the next
 * checkpoint we can detect it and avoid dumping it, since it is
 * auto-generated on every session where a proxy is required.
*/
sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start, vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
return 0;
}
@@ -2,6 +2,7 @@
#include <unistd.h>
#include <elf.h>
#include <sys/user.h>
#include <sys/mman.h>
#include "asm/processor-flags.h"
#include "asm/restorer.h"
#ifndef __CR_ASM_VDSO_H__
#define __CR_ASM_VDSO_H__
#include <sys/types.h>
#include "asm/int.h"
#include "protobuf/vma.pb-c.h"
struct parasite_ctl;
struct vm_area_list;
#define VDSO_PROT (PROT_READ | PROT_EXEC)
#define VVAR_PROT (PROT_READ)
#define VDSO_BAD_ADDR (-1ul)
#define VVAR_BAD_ADDR VDSO_BAD_ADDR
#define VDSO_BAD_PFN (-1ull)
#define VVAR_BAD_PFN VDSO_BAD_PFN
struct vdso_symbol {
char name[32];
unsigned long offset;
};
#define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, }
#include "asm-generic/vdso.h"
/* Check if symbol present in symtable */
static inline bool vdso_symbol_empty(struct vdso_symbol *s)
{
return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0';
}
/* This definition is used in pie/util-vdso.c to initialize the vdso symbol
* name string table 'vdso_symbols'
*/
/*
 * This is the minimal set of symbols
 * we should support at the moment.
*/
enum {
VDSO_SYMBOL_CLOCK_GETTIME,
VDSO_SYMBOL_GETCPU,
VDSO_SYMBOL_GETTIMEOFDAY,
VDSO_SYMBOL_TIME,
VDSO_SYMBOL_MAX
};
struct vdso_symtable {
unsigned long vma_start;
unsigned long vma_end;
unsigned long vvar_start;
unsigned long vvar_end;
struct vdso_symbol symbols[VDSO_SYMBOL_MAX];
};
#define VDSO_SYMTABLE_INIT \
{ \
.vma_start = VDSO_BAD_ADDR, \
.vma_end = VDSO_BAD_ADDR, \
.vvar_start = VVAR_BAD_ADDR, \
.vvar_end = VVAR_BAD_ADDR, \
.symbols = { \
[0 ... VDSO_SYMBOL_MAX - 1] = \
(struct vdso_symbol)VDSO_SYMBOL_INIT, \
}, \
}
/* Size of VMA associated with vdso */
static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
{
return t->vma_end - t->vma_start;
}
static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
{
return t->vvar_end - t->vvar_start;
}
/*
 * Special mark which allows us to identify the runtime vdso, where
 * calls from the proxy vdso are redirected. This mark is usually
 * placed at the start of the vdso area, where the Elf header lives.
 * Since such a runtime vdso is solely used by the proxy and nobody
 * else is supposed to access it, it's more or less safe to screw
 * the Elf header with @signature and @proxy_vdso_addr.
 *
 * The @proxy_vdso_addr deserves a few comments. When we redirect
 * the calls from the proxy to the runtime vdso, at the next
 * checkpoint it won't be possible to find which VMA is the proxy,
 * so we save its address in this member.
 */
struct vdso_mark {
	u64 signature;
	unsigned long proxy_vdso_addr;
	unsigned long version;
	/*
	 * In case of the new vDSO format, the VVAR area address is
	 * needed for easier discovery of where it lives, without
	 * relying on procfs output.
	 */
	unsigned long proxy_vvar_addr;
};
#define VDSO_MARK_SIGNATURE (0x6f73647675697263ULL) /* Magic number (criuvdso) */
#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */
#define VDSO_MARK_CUR_VERSION (2)
static inline void vdso_put_mark(void *where, unsigned long proxy_vdso_addr, unsigned long proxy_vvar_addr)
{
struct vdso_mark *m = where;
m->signature = VDSO_MARK_SIGNATURE_V2;
m->proxy_vdso_addr = proxy_vdso_addr;
m->version = VDSO_MARK_CUR_VERSION;
m->proxy_vvar_addr = proxy_vvar_addr;
}
static inline bool is_vdso_mark(void *addr)
{
struct vdso_mark *m = addr;
if (m->signature == VDSO_MARK_SIGNATURE_V2) {
/*
* New format
*/
return true;
} else if (m->signature == VDSO_MARK_SIGNATURE) {
/*
* Old format -- simply extend the mark up
* to the version we support.
*/
vdso_put_mark(m, m->proxy_vdso_addr, VVAR_BAD_ADDR);
return true;
}
return false;
}
#define VDSO_SYMBOL_CLOCK_GETTIME_NAME "__vdso_clock_gettime"
#define VDSO_SYMBOL_GETCPU_NAME "__vdso_getcpu"
#define VDSO_SYMBOL_GETTIMEOFDAY_NAME "__vdso_gettimeofday"
#define VDSO_SYMBOL_TIME_NAME "__vdso_time"
#define VDSO_SYMBOL_MAX 4
extern struct vdso_symtable vdso_sym_rt;
extern u64 vdso_pfn;
#define ARCH_VDSO_SYMBOLS \
"__vdso_clock_gettime", \
"__vdso_getcpu", \
"__vdso_gettimeofday", \
"__vdso_time"
extern int vdso_init(void);
extern int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size);
extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
unsigned long vdso_rt_parked_at, size_t index,
VmaEntry *vmas, size_t nr_vmas);
extern int vdso_redirect_calls(void *base_to, void *base_from, struct vdso_symtable *to, struct vdso_symtable *from);
extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
struct vm_area_list *vma_area_list);
struct vdso_symtable;
extern int vdso_redirect_calls(unsigned long base_to,
unsigned long base_from,
struct vdso_symtable *to,
struct vdso_symtable *from);
#endif /* __CR_ASM_VDSO_H__ */
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
#include <elf.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include "asm/string.h"
#include "asm/types.h"
#include "compiler.h"
#include "syscall.h"
#include "image.h"
#include "vdso.h"
#include "vma.h"
#include "parasite-vdso.h"
#include "log.h"
#include "bug.h"
@@ -34,7 +21,7 @@ typedef struct {
u32 guards;
} __packed jmp_t;
-int vdso_redirect_calls(void *base_to, void *base_from,
+int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
struct vdso_symtable *to,
struct vdso_symtable *from)
{
@@ -50,420 +37,23 @@ int vdso_redirect_calls(void *base_to, void *base_from,
continue;
pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n",
(unsigned long)base_from, from->symbols[i].offset,
(unsigned long)base_to, to->symbols[i].offset, i);
base_from, from->symbols[i].offset,
base_to, to->symbols[i].offset, i);
jmp.imm64 = (unsigned long)base_to + to->symbols[i].offset;
jmp.imm64 = base_to + to->symbols[i].offset;
builtin_memcpy((void *)(base_from + from->symbols[i].offset), &jmp, sizeof(jmp));
}
return 0;
}
/* Check if pointer is out-of-bound */
static bool __ptr_oob(void *ptr, void *start, size_t size)
{
void *end = (void *)((unsigned long)start + size);
return ptr > end || ptr < start;
}
/*
* Elf hash, see format specification.
*/
static unsigned long elf_hash(const unsigned char *name)
{
unsigned long h = 0, g;
while (*name) {
h = (h << 4) + *name++;
g = h & 0xf0000000ul;
if (g)
h ^= g >> 24;
h &= ~g;
}
return h;
}
int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
{
Elf64_Phdr *dynamic = NULL, *load = NULL;
Elf64_Ehdr *ehdr = (void *)mem;
Elf64_Dyn *dyn_strtab = NULL;
Elf64_Dyn *dyn_symtab = NULL;
Elf64_Dyn *dyn_strsz = NULL;
Elf64_Dyn *dyn_syment = NULL;
Elf64_Dyn *dyn_hash = NULL;
Elf64_Word *hash = NULL;
Elf64_Phdr *phdr;
Elf64_Dyn *d;
Elf64_Word *bucket, *chain;
Elf64_Word nbucket, nchain;
/*
 * See Elf specification for these magic values.
*/
const char elf_ident[] = {
0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
const char *vdso_symbols[VDSO_SYMBOL_MAX] = {
[VDSO_SYMBOL_CLOCK_GETTIME] = VDSO_SYMBOL_CLOCK_GETTIME_NAME,
[VDSO_SYMBOL_GETCPU] = VDSO_SYMBOL_GETCPU_NAME,
[VDSO_SYMBOL_GETTIMEOFDAY] = VDSO_SYMBOL_GETTIMEOFDAY_NAME,
[VDSO_SYMBOL_TIME] = VDSO_SYMBOL_TIME_NAME,
};
char *dynsymbol_names;
unsigned int i, j, k;
BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident));
pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size);
/*
* Make sure it's a file we support.
*/
if (builtin_memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) {
pr_err("Elf header magic mismatch\n");
return -EINVAL;
}
/*
* We need PT_LOAD and PT_DYNAMIC here. Each once.
*/
phdr = (void *)&mem[ehdr->e_phoff];
for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
if (__ptr_oob(phdr, mem, size))
goto err_oob;
switch (phdr->p_type) {
case PT_DYNAMIC:
if (dynamic) {
pr_err("Second PT_DYNAMIC header\n");
return -EINVAL;
}
dynamic = phdr;
break;
case PT_LOAD:
if (load) {
pr_err("Second PT_LOAD header\n");
return -EINVAL;
}
load = phdr;
break;
}
}
if (!load || !dynamic) {
pr_err("One of obligated program headers is missed\n");
return -EINVAL;
}
pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr);
/*
 * Dynamic section tags should provide us the rest of the information
* needed. Note that we're interested in a small set of tags.
*/
d = (void *)&mem[dynamic->p_offset];
for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) {
if (__ptr_oob(d, mem, size))
goto err_oob;
if (d->d_tag == DT_NULL) {
break;
} else if (d->d_tag == DT_STRTAB) {
dyn_strtab = d;
pr_debug("DT_STRTAB: %p\n", (void *)d->d_un.d_ptr);
} else if (d->d_tag == DT_SYMTAB) {
dyn_symtab = d;
pr_debug("DT_SYMTAB: %p\n", (void *)d->d_un.d_ptr);
} else if (d->d_tag == DT_STRSZ) {
dyn_strsz = d;
pr_debug("DT_STRSZ: %lu\n", (unsigned long)d->d_un.d_val);
} else if (d->d_tag == DT_SYMENT) {
dyn_syment = d;
pr_debug("DT_SYMENT: %lu\n", (unsigned long)d->d_un.d_val);
} else if (d->d_tag == DT_HASH) {
dyn_hash = d;
pr_debug("DT_HASH: %p\n", (void *)d->d_un.d_ptr);
}
}
if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) {
pr_err("Not all dynamic entries are present\n");
return -EINVAL;
}
dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr];
if (__ptr_oob(dynsymbol_names, mem, size))
goto err_oob;
hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr];
if (__ptr_oob(hash, mem, size))
goto err_oob;
nbucket = hash[0];
nchain = hash[1];
bucket = &hash[2];
chain = &hash[nbucket + 2];
pr_debug("nbucket %lu nchain %lu bucket %p chain %p\n",
(long)nbucket, (long)nchain, bucket, chain);
for (i = 0; i < ARRAY_SIZE(vdso_symbols); i++) {
k = elf_hash((const unsigned char *)vdso_symbols[i]);
for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) {
Elf64_Sym *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr];
char *name;
sym = &sym[j];
if (__ptr_oob(sym, mem, size))
continue;
if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
ELF64_ST_BIND(sym->st_info) != STB_GLOBAL)
continue;
name = &dynsymbol_names[sym->st_name];
if (__ptr_oob(name, mem, size))
continue;
if (builtin_strcmp(name, vdso_symbols[i]))
continue;
builtin_memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name));
t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
break;
}
}
return 0;
err_oob:
pr_err("Corrupted Elf data\n");
return -EFAULT;
}
static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
{
unsigned long addr;
pr_debug("Remap %s %lx -> %lx\n", who, from, to);
addr = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
if (addr != to) {
pr_err("Unable to remap %lx -> %lx %lx\n",
from, to, addr);
return -1;
}
return 0;
}
/* Park the runtime vDSO in some safe place where it can be accessed from the restorer */
int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
{
int ret;
BUG_ON((vdso_vma_size(sym_rt) + vvar_vma_size(sym_rt)) < park_size);
if (sym_rt->vvar_start != VDSO_BAD_ADDR) {
if (sym_rt->vma_start < sym_rt->vvar_start) {
ret = vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
park_at += vdso_vma_size(sym_rt);
ret |= vdso_remap("rt-vvar", sym_rt->vvar_start,
park_at, vvar_vma_size(sym_rt));
} else {
ret = vdso_remap("rt-vvar", sym_rt->vvar_start,
park_at, vvar_vma_size(sym_rt));
park_at += vvar_vma_size(sym_rt);
ret |= vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
}
} else
ret = vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
return ret;
}
int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
unsigned long vdso_rt_parked_at, size_t index,
VmaEntry *vmas, size_t nr_vmas)
{
VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
struct vdso_symtable s = VDSO_SYMTABLE_INIT;
bool remap_rt = false;
/*
 * Figure out which kind of vdso tuple we get.
*/
if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
vma_vdso = &vmas[index];
else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
vma_vvar = &vmas[index];
if (index < (nr_vmas - 1)) {
if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
vma_vdso = &vmas[index + 1];
else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
vma_vvar = &vmas[index + 1];
}
if (!vma_vdso) {
pr_err("Can't find vDSO area in image\n");
return -1;
}
/*
* vDSO mark overwrites Elf program header of proxy vDSO thus
* it must never ever be greater in size.
*/
BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
/*
* Find symbols in vDSO zone read from image.
*/
if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
return -1;
/*
* Proxification strategy
*
* - There might be two vDSO zones: vdso code and optionally vvar data
* - To be able to use in-place remapping we need
*
 * a) the size and order of the vDSO zones must match
 * b) the symbol offsets must match
 * c) the number of vDSO zones must be the same
*/
if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
size_t i;
for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
if (s.symbols[i].offset != sym_rt->symbols[i].offset)
break;
}
if (i == ARRAY_SIZE(s.symbols)) {
if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
if (remap_rt) {
long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
long delta_this = vma_vvar->start - vma_vdso->start;
remap_rt = (delta_rt ^ delta_this) < 0 ? false : true;
}
} else
remap_rt = true;
}
}
pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
vma_vdso->start, vma_vdso->end,
vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);
/*
 * Easy case -- the vdso from the image has the same offsets, order and
 * size as the runtime one, so we simply remap the runtime vdso to the
 * dumpee position without generating any proxy.
 *
 * Note we may remap the VVAR vdso as well, which might not have been
 * mapped yet by the caller code. So drop VMA_AREA_REGULAR from it and
 * the caller will not touch it anymore.
*/
if (remap_rt) {
int ret = 0;
pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
pr_err("Failed to unmap %s\n", who);
return -1;
}
if (vma_vvar) {
if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
pr_err("Failed to unmap %s\n", who);
return -1;
}
if (vma_vdso->start < vma_vvar->start) {
ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
vdso_rt_parked_at += vdso_vma_size(sym_rt);
ret |= vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
} else {
ret = vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
vdso_rt_parked_at += vvar_vma_size(sym_rt);
ret |= vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
}
} else
ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
return ret;
}
/*
 * Now the complex case -- we need to proxify calls. We redirect
 * calls from the dumpee vdso to the runtime vdso, making the dumpee
 * vdso operate as a proxy vdso.
*/
pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
/*
* Don't forget to shift if vvar is before vdso.
*/
if (sym_rt->vvar_start != VDSO_BAD_ADDR &&
sym_rt->vvar_start < sym_rt->vma_start)
vdso_rt_parked_at += vvar_vma_size(sym_rt);
if (vdso_redirect_calls((void *)vdso_rt_parked_at,
(void *)vma_vdso->start,
sym_rt, &s)) {
pr_err("Failed to proxify dumpee contents\n");
return -1;
}
/*
 * Put a special mark into the runtime vdso, so that at the next
 * checkpoint we can detect it and avoid dumping it, since it is
 * auto-generated on every session where a proxy is required.
*/
sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start, vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
return 0;
}
#else /* CONFIG_X86_64 */
-int vdso_redirect_calls(void *base_to, void *base_from,
+int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
struct vdso_symtable *to,
struct vdso_symtable *from)
{
return 0;
}
int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
{
return 0;
}
int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
{
return 0;
}
int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
unsigned long vdso_rt_parked_at, size_t index,
VmaEntry *vmas, size_t nr_vmas)
{
return 0;
}
#endif /* CONFIG_X86_64 */
#ifndef __CR_ASM_GENERIC_VDSO_H__
#define __CR_ASM_GENERIC_VDSO_H__
#define VDSO_PROT (PROT_READ | PROT_EXEC)
#define VVAR_PROT (PROT_READ)
#define VDSO_BAD_ADDR (-1ul)
#define VVAR_BAD_ADDR VDSO_BAD_ADDR
#define VDSO_BAD_PFN (-1ull)
#define VVAR_BAD_PFN VDSO_BAD_PFN
#endif /* __CR_ASM_GENERIC_VDSO_H__ */
@@ -118,11 +118,6 @@ extern int __parasite_execute_syscall(struct parasite_ctl *ctl,
user_regs_struct_t *regs);
extern bool arch_can_dump_task(pid_t pid);
#ifdef CONFIG_VDSO
extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
struct vm_area_list *vma_area_list);
#endif
/*
* The PTRACE_SYSCALL will trap task twice -- on
* enter into and on exit from syscall. If we trace
#ifndef __CR_PARASITE_VDSO_H__
#define __CR_PARASITE_VDSO_H__
#include "config.h"
#ifdef CONFIG_VDSO
#include "util-vdso.h"
#include "protobuf/vma.pb-c.h"
struct parasite_ctl;
struct vm_area_list;
/* Check if symbol present in symtable */
static inline bool vdso_symbol_empty(struct vdso_symbol *s)
{
return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0';
}
/*
 * Special mark which allows us to identify the runtime vdso, where
 * calls from the proxy vdso are redirected. This mark is usually
 * placed at the start of the vdso area, where the Elf header lives.
 * Since such a runtime vdso is solely used by the proxy and nobody
 * else is supposed to access it, it's more or less safe to screw
 * the Elf header with @signature and @proxy_vdso_addr.
 *
 * The @proxy_vdso_addr deserves a few comments. When we redirect
 * the calls from the proxy to the runtime vdso, at the next
 * checkpoint it won't be possible to find which VMA is the proxy,
 * so we save its address in this member.
 */
struct vdso_mark {
	u64 signature;
	unsigned long proxy_vdso_addr;
	unsigned long version;
	/*
	 * In case of the new vDSO format, the VVAR area address is
	 * needed for easier discovery of where it lives, without
	 * relying on procfs output.
	 */
	unsigned long proxy_vvar_addr;
};
#define VDSO_MARK_SIGNATURE (0x6f73647675697263ULL) /* Magic number (criuvdso) */
#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */
#define VDSO_MARK_CUR_VERSION (2)
static inline void vdso_put_mark(void *where, unsigned long proxy_vdso_addr, unsigned long proxy_vvar_addr)
{
struct vdso_mark *m = where;
m->signature = VDSO_MARK_SIGNATURE_V2;
m->proxy_vdso_addr = proxy_vdso_addr;
m->version = VDSO_MARK_CUR_VERSION;
m->proxy_vvar_addr = proxy_vvar_addr;
}
static inline bool is_vdso_mark(void *addr)
{
struct vdso_mark *m = addr;
if (m->signature == VDSO_MARK_SIGNATURE_V2) {
/*
* New format
*/
return true;
} else if (m->signature == VDSO_MARK_SIGNATURE) {
/*
* Old format -- simply extend the mark up
* to the version we support.
*/
vdso_put_mark(m, m->proxy_vdso_addr, VVAR_BAD_ADDR);
return true;
}
return false;
}
extern int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size);
extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
unsigned long vdso_rt_parked_at, size_t index,
VmaEntry *vmas, size_t nr_vmas);
/* only used by aarch64 => to be moved to aarch64/include/asm/vdso.h */
extern void write_intraprocedure_branch(void *to, void *from);
#else /* CONFIG_VDSO */
#define vdso_do_park(sym_rt, park_at, park_size) (0)
#endif /* CONFIG_VDSO */
#endif /* __CR_PARASITE_VDSO_H__ */
@@ -19,7 +19,7 @@
#include "timerfd.h"
#include "shmem.h"
#include "sigframe.h"
#include "vdso.h"
#include "parasite-vdso.h"
#include <time.h>
#ifndef __CR_UTIL_VDSO_H__
#define __CR_UTIL_VDSO_H__
/*
* VDSO management common definitions.
*
* This header file is included by the criu main code and the parasite code.
 * It contains definitions shared by these two parts.
*
* This file should not be included except in pie/util-vdso.c, include/vdso.h
* and include/parasite-vdso.h
*/
#include <sys/types.h>
/*
* Each architecture must export:
* VDSO_SYMBOL_MAX, the number of vDSO symbols to manage
 * ARCH_VDSO_SYMBOLS, a table of strings containing the vDSO symbol names
* vdso_redirect_calls, a service called to redirect the vDSO symbols in
* the parasite code.
*/
#include "asm/vdso.h"
struct vdso_symbol {
char name[32];
unsigned long offset;
};
struct vdso_symtable {
unsigned long vma_start;
unsigned long vma_end;
unsigned long vvar_start;
unsigned long vvar_end;
struct vdso_symbol symbols[VDSO_SYMBOL_MAX];
};
#define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, }
#define VDSO_SYMTABLE_INIT \
{ \
.vma_start = VDSO_BAD_ADDR, \
.vma_end = VDSO_BAD_ADDR, \
.vvar_start = VVAR_BAD_ADDR, \
.vvar_end = VVAR_BAD_ADDR, \
.symbols = { \
[0 ... VDSO_SYMBOL_MAX - 1] = \
(struct vdso_symbol)VDSO_SYMBOL_INIT, \
}, \
}
/* Size of VMA associated with vdso */
static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
{
return t->vma_end - t->vma_start;
}
static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
{
return t->vvar_end - t->vvar_start;
}
extern const char *vdso_symbols[VDSO_SYMBOL_MAX];
extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
#endif /* __CR_UTIL_VDSO_H__ */
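/*
 * To make the per-architecture contract above concrete, a minimal sketch
 * of what a hypothetical new architecture's arch/foo/include/asm/vdso.h
 * would have to provide (the names and the two-symbol list below are
 * illustrative only, modeled on the x86 variant; not part of this patch):
 *
 *	#ifndef __CR_ASM_VDSO_H__
 *	#define __CR_ASM_VDSO_H__
 *
 *	#include "asm-generic/vdso.h"	// VDSO_BAD_ADDR, VDSO_PROT, ...
 *
 *	// Number of vDSO symbols this architecture manages
 *	#define VDSO_SYMBOL_MAX	2
 *
 *	// Used by pie/util-vdso.c to initialize the vdso_symbols[] table
 *	#define ARCH_VDSO_SYMBOLS	\
 *		"__vdso_clock_gettime",	\
 *		"__vdso_gettimeofday"
 *
 *	struct vdso_symtable;
 *	// Per-arch service redirecting vDSO calls in the parasite code
 *	extern int vdso_redirect_calls(unsigned long base_to,
 *				       unsigned long base_from,
 *				       struct vdso_symtable *to,
 *				       struct vdso_symtable *from);
 *	#endif
 */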
@@ -8,17 +8,19 @@
#ifdef CONFIG_VDSO
#include "asm/vdso.h"
#include "util-vdso.h"
extern struct vdso_symtable vdso_sym_rt;
extern int vdso_init(void);
extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
struct vm_area_list *vma_area_list);
#else /* CONFIG_VDSO */
#define vdso_init() (0)
#define parasite_fixup_vdso(ctl, pid, vma_area_list) (0)
#define vdso_vma_size(t) (0)
#define vdso_do_park(sym_rt, park_at, park_size) (0)
#define vdso_remap(who, from, to, size) (0)
#define vdso_proxify(who, sym_rt, vdso_rt_parked_at, \
index, vmas, nr_vmas) (0)
#endif /* CONFIG_VDSO */
@@ -6,6 +6,8 @@ obj-y += util.o
obj-y += util-fd.o
ifeq ($(VDSO),y)
obj-y += util-vdso.o
obj-y += parasite-vdso.o
obj-e += $(ARCH_DIR)/vdso-pie.o
ifeq ($(SRCARCH),aarch64)
asm-e += $(ARCH_DIR)/intraprocedure.o
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <elf.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include "asm/string.h"
#include "asm/types.h"
#include "syscall.h"
#include "image.h"
#include "parasite-vdso.h"
#include "vma.h"
#include "log.h"
#include "bug.h"
#ifdef LOG_PREFIX
# undef LOG_PREFIX
#endif
#define LOG_PREFIX "vdso: "
static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
{
unsigned long addr;
pr_debug("Remap %s %lx -> %lx\n", who, from, to);
addr = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
if (addr != to) {
pr_err("Unable to remap %lx -> %lx %lx\n",
from, to, addr);
return -1;
}
return 0;
}
/* Park the runtime vDSO in a safe place where the restorer can reach it */
int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
{
int ret;
BUG_ON((vdso_vma_size(sym_rt) + vvar_vma_size(sym_rt)) < park_size);
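/*
 * The vdso and vvar zones are parked back to back, preserving their
 * runtime order; vdso_proxify() relies on this layout when it computes
 * where the parked vdso starts.
 */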
if (sym_rt->vvar_start != VDSO_BAD_ADDR) {
if (sym_rt->vma_start < sym_rt->vvar_start) {
ret = vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
park_at += vdso_vma_size(sym_rt);
ret |= vdso_remap("rt-vvar", sym_rt->vvar_start,
park_at, vvar_vma_size(sym_rt));
} else {
ret = vdso_remap("rt-vvar", sym_rt->vvar_start,
park_at, vvar_vma_size(sym_rt));
park_at += vvar_vma_size(sym_rt);
ret |= vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
}
} else
ret = vdso_remap("rt-vdso", sym_rt->vma_start,
park_at, vdso_vma_size(sym_rt));
return ret;
}
int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
unsigned long vdso_rt_parked_at, size_t index,
VmaEntry *vmas, size_t nr_vmas)
{
VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
struct vdso_symtable s = VDSO_SYMTABLE_INIT;
bool remap_rt = false;
/*
* Figure out which kind of vdso tuple we get.
*/
if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
vma_vdso = &vmas[index];
else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
vma_vvar = &vmas[index];
if (index < (nr_vmas - 1)) {
if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
vma_vdso = &vmas[index + 1];
else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
vma_vvar = &vmas[index + 1];
}
if (!vma_vdso) {
pr_err("Can't find vDSO area in image\n");
return -1;
}
/*
 * The vDSO mark overwrites an ELF program header of the proxy vDSO,
 * so it must never be greater in size.
 */
BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
/*
 * Find the symbols in the vDSO zone read from the image.
 */
if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
return -1;
/*
 * Proxification strategy
 *
 * - There might be two vDSO zones: vdso code and, optionally, vvar data.
 * - To be able to use in-place remapping we need:
 *
 *   a) the size and order of the vDSO zones to match,
 *   b) the symbol offsets to match,
 *   c) the same number of vDSO zones.
 */
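/*
 * Below, the size comparison checks (a), the per-symbol loop checks (b),
 * and the vvar branch checks (c) plus the relative ordering of the zones.
 */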
if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
size_t i;
for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
if (s.symbols[i].offset != sym_rt->symbols[i].offset)
break;
}
if (i == ARRAY_SIZE(s.symbols)) {
if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
if (remap_rt) {
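/*
 * The XOR of the two deltas is negative iff their signs differ,
 * i.e. iff vvar sits on a different side of vdso in the image
 * than it does at runtime.
 */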
long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
long delta_this = vma_vvar->start - vma_vdso->start;
remap_rt = (delta_rt ^ delta_this) < 0 ? false : true;
}
} else
remap_rt = true;
}
}
pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
vma_vdso->start, vma_vdso->end,
vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);
/*
 * Easy case -- the vdso from the image has the same offsets, order and
 * size as the runtime one, so we simply remap the runtime vdso to the
 * dumpee's position without generating any proxy.
 *
 * Note that we may remap the vvar zone as well, even though it might
 * not have been mapped by the caller code yet. So we drop
 * VMA_AREA_REGULAR from it and the caller will not touch it anymore.
 */
if (remap_rt) {
int ret = 0;
pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
pr_err("Failed to unmap %s\n", who);
return -1;
}
if (vma_vvar) {
if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
pr_err("Failed to unmap %s\n", who);
return -1;
}
if (vma_vdso->start < vma_vvar->start) {
ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
vdso_rt_parked_at += vdso_vma_size(sym_rt);
ret |= vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
} else {
ret = vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
vdso_rt_parked_at += vvar_vma_size(sym_rt);
ret |= vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
}
} else
ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
return ret;
}
/*
 * Now the complex case -- we need to proxify the calls. We redirect
 * calls from the dumpee vdso to the runtime vdso, making the dumpee
 * vdso operate as a proxy.
 */
pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
/*
* Don't forget to shift if vvar is before vdso.
*/
if (sym_rt->vvar_start != VDSO_BAD_ADDR &&
sym_rt->vvar_start < sym_rt->vma_start)
vdso_rt_parked_at += vvar_vma_size(sym_rt);
if (vdso_redirect_calls(vdso_rt_parked_at,
vma_vdso->start,
sym_rt, &s)) {
pr_err("Failed to proxify dumpee contents\n");
return -1;
}
/*
 * Put a special mark into the runtime vdso so that at the next
 * checkpoint we can detect it and skip dumping it, since it is
 * auto-generated anew every session whenever a proxy is required.
 */
sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start, vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
return 0;
}
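The arch-specific half of this scheme, vdso_redirect_calls(), lives in arch/*/vdso-pie.c and is not shown in this hunk. As a rough sketch only -- modeled on the x86-64 idea of stamping a mov rax, imm64; jmp rax trampoline over each proxied entry point, and assuming the dumpee vdso pages are writable at this point -- it could look like:
/* Hedged sketch of an x86-64 style vdso_redirect_calls; not this patch's code. */
int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
			struct vdso_symtable *to, struct vdso_symtable *from)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
		/*
		 * 48 b8 <imm64>	mov rax, imm64
		 * ff e0		jmp rax
		 */
		unsigned char jmp[] = {
			0x48, 0xb8,
			0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
			0xff, 0xe0,
		};
		unsigned long addr;

		/* Skip symbols the image vdso does not carry. */
		if (vdso_symbol_empty(&from->symbols[i]))
			continue;

		/* Target is the parked runtime entry point... */
		addr = base_to + to->symbols[i].offset;
		builtin_memcpy(&jmp[2], &addr, sizeof(addr));
		/* ...stamped over the dumpee (proxy) entry point. */
		builtin_memcpy((void *)(base_from + from->symbols[i].offset),
			       jmp, sizeof(jmp));
	}
	return 0;
}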
......@@ -13,7 +13,7 @@
#include "fcntl.h"
#include "prctl.h"
#include "lock.h"
#include "vdso.h"
#include "parasite-vdso.h"
#include "log.h"
#include "tty.h"
......
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <elf.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include "asm/string.h"
#include "asm/types.h"
#include "syscall.h"
#include "image.h"
#include "util-vdso.h"
#include "vma.h"
#include "log.h"
#include "bug.h"
#ifdef LOG_PREFIX
# undef LOG_PREFIX
#endif
#define LOG_PREFIX "vdso: "
const char *vdso_symbols[VDSO_SYMBOL_MAX] = {
ARCH_VDSO_SYMBOLS
};
/* Check whether a pointer lies out of bounds */
static bool __ptr_oob(void *ptr, void *start, size_t size)
{
void *end = (void *)((unsigned long)start + size);
return ptr > end || ptr < start;
}
/*
 * The ELF hash function; see the ELF format specification.
 */
static unsigned long elf_hash(const unsigned char *name)
{
unsigned long h = 0, g;
while (*name) {
h = (h << 4) + *name++;
g = h & 0xf0000000ul;
if (g)
h ^= g >> 24;
h &= ~g;
}
return h;
}
int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
{
Elf64_Phdr *dynamic = NULL, *load = NULL;
Elf64_Ehdr *ehdr = (void *)mem;
Elf64_Dyn *dyn_strtab = NULL;
Elf64_Dyn *dyn_symtab = NULL;
Elf64_Dyn *dyn_strsz = NULL;
Elf64_Dyn *dyn_syment = NULL;
Elf64_Dyn *dyn_hash = NULL;
Elf64_Word *hash = NULL;
Elf64_Phdr *phdr;
Elf64_Dyn *d;
Elf64_Word *bucket, *chain;
Elf64_Word nbucket, nchain;
/*
 * See the ELF specification for these magic values.
 */
static const char elf_ident[] = {
0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
char *dynsymbol_names;
unsigned int i, j, k;
BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident));
pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size);
/*
* Make sure it's a file we support.
*/
if (builtin_memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) {
pr_err("Elf header magic mismatch\n");
return -EINVAL;
}
/*
 * We need PT_LOAD and PT_DYNAMIC here, exactly one of each.
 */
phdr = (void *)&mem[ehdr->e_phoff];
for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
if (__ptr_oob(phdr, mem, size))
goto err_oob;
switch (phdr->p_type) {
case PT_DYNAMIC:
if (dynamic) {
pr_err("Second PT_DYNAMIC header\n");
return -EINVAL;
}
dynamic = phdr;
break;
case PT_LOAD:
if (load) {
pr_err("Second PT_LOAD header\n");
return -EINVAL;
}
load = phdr;
break;
}
}
if (!load || !dynamic) {
pr_err("One of obligated program headers is missed\n");
return -EINVAL;
}
pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr);
/*
 * The dynamic section tags should provide us with the rest of the
 * information needed. Note that we are interested in a small set of
 * tags only.
 */
d = (void *)&mem[dynamic->p_offset];
for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) {
if (__ptr_oob(d, mem, size))
goto err_oob;
if (d->d_tag == DT_NULL) {
break;
} else if (d->d_tag == DT_STRTAB) {
dyn_strtab = d;
pr_debug("DT_STRTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
} else if (d->d_tag == DT_SYMTAB) {
dyn_symtab = d;
pr_debug("DT_SYMTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
} else if (d->d_tag == DT_STRSZ) {
dyn_strsz = d;
pr_debug("DT_STRSZ: %lx\n", (unsigned long)d->d_un.d_val);
} else if (d->d_tag == DT_SYMENT) {
dyn_syment = d;
pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val);
} else if (d->d_tag == DT_HASH) {
dyn_hash = d;
pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr);
}
}
if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) {
pr_err("Not all dynamic entries are present\n");
return -EINVAL;
}
dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr];
if (__ptr_oob(dynsymbol_names, mem, size))
goto err_oob;
hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr];
if (__ptr_oob(hash, mem, size))
goto err_oob;
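/*
 * DT_HASH layout: hash[0] = nbucket, hash[1] = nchain, then nbucket
 * bucket entries followed by nchain chain entries.
 */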
nbucket = hash[0];
nchain = hash[1];
bucket = &hash[2];
chain = &hash[nbucket + 2];
pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n",
(long)nbucket, (long)nchain, (unsigned long)bucket, (unsigned long)chain);
for (i = 0; i < VDSO_SYMBOL_MAX; i++) {
const char *symbol = vdso_symbols[i];
k = elf_hash((const unsigned char *)symbol);
for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) {
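/*
 * Walk this bucket's chain until STN_UNDEF terminates it or we
 * find a global function whose name matches.
 */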
Elf64_Sym *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr];
char *name;
sym = &sym[j];
if (__ptr_oob(sym, mem, size))
continue;
if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
ELF64_ST_BIND(sym->st_info) != STB_GLOBAL)
continue;
name = &dynsymbol_names[sym->st_name];
if (__ptr_oob(name, mem, size))
continue;
if (builtin_strcmp(name, symbol))
continue;
builtin_memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name));
t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
break;
}
}
return 0;
err_oob:
pr_err("Corrupted Elf data\n");
return -EFAULT;
}
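A minimal sketch (hypothetical, not part of this patch) of how the criu side could feed this parser with its own runtime vDSO. The runtime_vdso_symtable() helper below only illustrates the call; the real vdso_init() also detects the vvar zone and does more validation. Unlike the parasite, the criu side can use stdio:
/* Hypothetical caller sketch -- for illustration only. */
#include <stdio.h>
#include <string.h>

#include "util-vdso.h"

static int runtime_vdso_symtable(struct vdso_symtable *t)
{
	unsigned long start, end;
	char line[512];
	FILE *f;

	f = fopen("/proc/self/maps", "r");
	if (!f)
		return -1;

	*t = (struct vdso_symtable)VDSO_SYMTABLE_INIT;
	while (fgets(line, sizeof(line), f)) {
		/* The kernel tags the vDSO mapping with "[vdso]". */
		if (!strstr(line, "[vdso]"))
			continue;
		if (sscanf(line, "%lx-%lx", &start, &end) != 2)
			break;
		t->vma_start = start;
		t->vma_end = end;
		fclose(f);
		/* Parse the live mapping in place. */
		return vdso_fill_symtable((char *)start, end - start, t);
	}
	fclose(f);
	return -1;
}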