#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>
#include <sys/stat.h>
#include <string.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sched.h>

#include "cr_options.h"
#include "util.h"
#include "util-pie.h"
#include "log.h"
#include "plugin.h"
#include "filesystems.h"
#include "mount.h"
#include "pstree.h"
#include "image.h"
#include "namespaces.h"
#include "protobuf.h"
#include "fs-magic.h"
#include "path.h"
#include "files-reg.h"
#include "external.h"

#include "images/mnt.pb-c.h"

/*
 * Marker stored in mount_info->external for mounts that were
 * auto-detected as external (see resolve_external_mounts()).
 *
 * Put a : in here since those are invalid on
 * the cli, so we know it's autogenerated in
 * debugging.
 */
#define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED"
/* Mask of all the mount propagation type flags */
#define MS_PROPAGATE (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE | MS_SLAVE)

#undef	LOG_PREFIX
#define LOG_PREFIX "mnt: "

#define BINFMT_MISC_HOME "/proc/sys/fs/binfmt_misc"
/* mnt_id assigned to the synthetic cr-time mounts (see add_cr_time_mount()) */
#define CRTIME_MNT_ID 0

/*
 * Register an external mount mapping, encoded as "mnt[<key>]:<val>".
 *
 * On dump the key is the mountpoint as seen from the mount
 * namespace, the val is some name that will be put into image
 * instead of the mount point's root path.
 *
 * On restore the key is the name from the image (the one
 * mentioned above) and the val is the path in criu's mount
 * namespace that will become the mount point's root, i.e. --
 * be bind mounted to the respective mountpoint.
 *
 * Returns -1 if the string can't be allocated, otherwise whatever
 * add_external() returns.
 */
int ext_mount_add(char *key, char *val)
{
	/* "mnt[" + "]:" + terminating NUL take 7 bytes, 8 are reserved */
	char *mapping = xmalloc(strlen(key) + strlen(val) + 8);

	if (mapping == NULL)
		return -1;

	sprintf(mapping, "mnt[%s]:%s", key, val);

	return add_external(mapping);
}

/*
 * Turn on auto-detection of external mounts and parse the optional
 * ":<flags>" suffix: 'm' enables external masters, 's' enables
 * external sharing.
 *
 * Returns 0 on success, -1 on an unknown flag character. A key that
 * doesn't start with ':' is accepted as is.
 */
int ext_mount_parse_auto(char *key)
{
	opts.autodetect_ext_mounts = true;

	if (*key != ':')
		return 0;

	for (key++; *key != '\0'; key++) {
		switch (*key) {
		case 'm':
			opts.enable_external_masters = true;
			break;
		case 's':
			opts.enable_external_sharing = true;
			break;
		default:
			return -1;
		}
	}

	return 0;
}

/*
 * Look up the value registered for the "mnt[<key>]" external.
 * Returns NULL if external_lookup_by_key() reports an error.
 */
static char *ext_mount_lookup(char *key)
{
	char ext_key[strlen(key) + 8];
	char *found;

	sprintf(ext_key, "mnt[%s]", key);

	found = external_lookup_by_key(ext_key);
	return IS_ERR(found) ? NULL : found;
}

/*
 * Singly linked list (chained through ->next) of mount points
 * obtained from proc or from images.
 */
struct mount_info *mntinfo;

/*
 * Append @new to the tail of the global mntinfo list.
 * The walk is linear in the list length (FIXME -- make O(1) ).
 */
static void mntinfo_add_list(struct mount_info *new)
{
	struct mount_info **link = &mntinfo;

	/* Find the first empty ->next slot (or the empty head itself) */
	while (*link != NULL)
		link = &(*link)->next;

	*link = new;
}

/*
 * Walk every overlayfs entry of @list looking for the mount that
 * contains the file @rpath. A candidate matches when stat()-ing the
 * path under it yields the given @st_dev and @st_ino. If several
 * mounts match, the last one in the list is returned.
 *
 * Returns the matching mount, NULL if there is none, or an ERR_PTR()
 * when the mntns root fd can't be obtained (-ENOENT) or the combined
 * path doesn't fit into PATH_MAX (-ENOSPC).
 */
static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath,
						unsigned int st_dev, unsigned int st_ino,
						unsigned int mnt_id)
{
	struct mount_info *found = NULL;
	struct mount_info *cur;
	int root_fd = -1;

	for (cur = list; cur != NULL; cur = cur->next) {
		char joined[PATH_MAX];
		const char *probe;
		struct stat sb;

		if (cur->fstype->code != FSTYPE__OVERLAYFS)
			continue;

		/*
		 * The file has to be stat-ed relative to the root of the
		 * dumped task's mount namespace; fetch that fd lazily,
		 * only once an overlayfs mount is actually met.
		 */
		if (root_fd == -1) {
			root_fd = __mntns_get_root_fd(root_item->pid.real);
			if (root_fd < 0) {
				pr_err("Unable to get the root file descriptor of pid %d\n", root_item->pid.real);
				return ERR_PTR(-ENOENT);
			}
		}

		/* For a non-root mount the path is <mountpoint>/<rpath> */
		if (is_root_mount(cur)) {
			probe = rpath;
		} else {
			int len = snprintf(joined, PATH_MAX, "%s/%s", cur->mountpoint, rpath);

			if (len >= PATH_MAX) {
				pr_err("Not enough space to concatenate %s and %s\n", cur->mountpoint, rpath);
				return ERR_PTR(-ENOSPC);
			}
			probe = joined;
		}

		if (fstatat(root_fd, probe, &sb, 0) != 0)
			continue;

		if (st_dev == sb.st_dev && st_ino == sb.st_ino)
			found = cur;
	}

	return found;
}

/*
 * Looks up the mnt_id and path of a file in an overlayFS directory.
 *
 * This is useful in order to fix the OverlayFS bug present in the
 * Linux Kernel before version 4.2. See fixup_overlayfs for details.
 *
 * First the mountinfo table is scanned for an entry whose device and
 * mnt_id both match the given ones. If there is one, the mnt_id is
 * already correct and NULL is returned -- no fixup is needed.
 *
 * Otherwise the overlayFS mounts of the table are probed: the
 * mountpoint is concatenated with the name of the file and the result
 * is stat-ed to check the device id and inode number. The mount that
 * matches (if any) is returned.
 */
struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev,
					unsigned int st_ino, unsigned int mnt_id)
{
	struct mount_info *cur = mntinfo;

	while (cur != NULL) {
		/* An exact (device, mnt_id) hit means there is nothing to fix */
		if (st_dev == kdev_to_odev(cur->s_dev) && mnt_id == cur->mnt_id)
			return NULL;
		cur = cur->next;
	}

	return __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id);
}

/* Return the first entry of @list whose mnt_id equals @id, or NULL */
static struct mount_info *__lookup_mnt_id(struct mount_info *list, int id)
{
	struct mount_info *cur = list;

	while (cur != NULL && cur->mnt_id != id)
		cur = cur->next;

	return cur;
}

/* Find a mount by its mnt_id in the global mntinfo list; NULL if absent */
struct mount_info *lookup_mnt_id(unsigned int id)
{
	struct mount_info *found = __lookup_mnt_id(mntinfo, id);

	return found;
}

/* Return the first mount in the global mntinfo list living on @s_dev, or NULL */
struct mount_info *lookup_mnt_sdev(unsigned int s_dev)
{
	struct mount_info *cur = mntinfo;

	while (cur != NULL) {
		if (cur->s_dev == s_dev)
			return cur;
		cur = cur->next;
	}

	return NULL;
}

/*
 * Descend the mount tree from @mntinfo_tree and return the deepest
 * mount whose mountpoint (with its first character skipped) is a
 * path-component prefix of @path.
 */
static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, const char *path)
{
	struct mount_info *cur = mntinfo_tree;
	size_t pathlen = strlen(path);
	bool descended;

	do {
		struct mount_info *child;

		descended = false;

		list_for_each_entry(child, &cur->children, siblings) {
			const char *mp = child->mountpoint + 1;
			size_t mplen = strlen(mp);

			/* A mountpoint longer than the path can't contain it */
			if (mplen > pathlen)
				continue;
			if (strncmp(mp, path, mplen) != 0)
				continue;
			/* The prefix has to end on a path component boundary */
			if (mplen < pathlen && path[mplen] != '/')
				continue;

			cur = child;
			descended = true;
			break;
		}
	} while (descended);

	pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, cur->mountpoint);
	return cur;
}

/*
 * Translate the device seen by stat() on @path into the device id
 * format used in mountinfo.
 *
 * BTRFS returns subvolume dev-id instead of superblock dev-id, in
 * such case the device obtained from mountinfo (ie subvolume0) is
 * returned instead.
 */
dev_t phys_stat_resolve_dev(struct ns_id *ns, dev_t st_dev, const char *path)
{
	struct mount_info *mi = mount_resolve_path(ns->mnt.mntinfo_tree, path);
	bool on_btrfs = (strcmp(mi->fstype->name, "btrfs") == 0);

	return on_btrfs ? mi->s_dev : MKKDEV(major(st_dev), minor(st_dev));
}

/*
 * Check whether the stat() device @st_dev corresponds to @phys_dev:
 * either directly (after kdev_to_odev() conversion), or after
 * resolving @st_dev through the mount that contains @path.
 */
bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev,
		struct ns_id *ns, const char *path)
{
	if (st_dev != kdev_to_odev(phys_dev))
		return phys_dev == phys_stat_resolve_dev(ns, st_dev, path);

	return true;
}

/*
 * Compare super-blocks mounted at two places.
 *
 * Two mounts share a super-block when they have the same fstype, the
 * same options, the same s_dev and the same source. For cgroup mounts
 * the ->private strings (when both are set) have to match as well.
 *
 * Neither mount is modified.
 */
static bool mounts_sb_equal(struct mount_info *a, struct mount_info *b)
{
	if (a->fstype != b->fstype)
		return false;

	/* There is a btrfs bug where it doesn't emit subvol= correctly when
	 * files are bind mounted, so let's ignore it for now.
	 * https://marc.info/?l=linux-btrfs&m=145857372803614&w=2
	 */
	if (!strcmp(a->fstype->name, "btrfs")) {
		char *posa = strstr(a->options, "subvol="), *posb = strstr(b->options, "subvol=");
		size_t prefa, prefb;

		if (!posa || !posb) {
			pr_err("invalid btrfs options, no subvol argument\n");
			return false;
		}

		/*
		 * Everything in front of "subvol=" has to be identical.
		 * Compare lengths and bytes instead of temporarily
		 * cutting the strings in place.
		 */
		prefa = posa - a->options;
		prefb = posb - b->options;
		if (prefa != prefb || memcmp(a->options, b->options, prefa))
			return false;

		/* Skip the subvol value itself and compare what follows it */
		posa = strchr(posa, ',');
		posb = strchr(posb, ',');

		if ((posa && !posb) || (!posa && posb))
			return false;

		if (posa && strcmp(posa, posb))
			return false;
	} else {
		if (strcmp(a->options, b->options))
			return false;
	}

	if (a->fstype->code == FSTYPE__CGROUP &&
	    a->private && b->private &&
	    strcmp(a->private, b->private))
		return false;

	return a->s_dev == b->s_dev && !strcmp(a->source, b->source);
}

/*
 * Compare superblocks AND the way they are mounted: on top of
 * mounts_sb_equal() the ->root paths have to be the same too.
 */
static bool mounts_equal(struct mount_info *a, struct mount_info *b)
{
	return mounts_sb_equal(a, b) && strcmp(a->root, b->root) == 0;
}

/*
 * mnt_roots is a temporary directory for restoring sub-trees of
 * non-root namespaces.
 *
 * NOTE(review): nothing in this part of the file assigns or reads it;
 * presumably it is set up by the restore code further below -- confirm.
 */
static char *mnt_roots;

/*
 * Turn the flat @list of mounts into a tree by linking every entry to
 * its parent (->parent, ->siblings on the parent's ->children).
 *
 * @tmp_root_mount, when given, becomes the parent of the roots of
 * nested mount namespaces and is itself attached under the tree root.
 *
 * Returns the root of the tree or NULL on error (no root found, a
 * parentless mount that isn't a namespace root, or a nested namespace
 * whose root differs from the main one).
 */
static struct mount_info *mnt_build_ids_tree(struct mount_info *list, struct mount_info *tmp_root_mount)
{
	struct mount_info *m, *root = NULL;

	/*
	 * Just resolve the mnt_id:parent_mnt_id relations
	 */

	pr_debug("\tBuilding plain mount tree\n");
	for (m = list; m != NULL; m = m->next) {
		struct mount_info *parent;

		pr_debug("\t\tWorking on %d->%d\n", m->mnt_id, m->parent_mnt_id);

		if (m->mnt_id != m->parent_mnt_id)
			parent = __lookup_mnt_id(list, m->parent_mnt_id);
		else /* a circular mount reference. It's rootfs or smth like it. */
			parent = NULL;

		if (!parent) {
			/* This should be / -- the first parentless root mount wins */
			if (root == NULL && is_root_mount(m)) {
				root = m;
				continue;
			}

			pr_debug("Mountpoint %d (@%s) w/o parent %d\n",
				 m->mnt_id, m->mountpoint, m->parent_mnt_id);

			if (root && m->is_ns_root) {
				/* A nested namespace root must be the same mount as the main root */
				if (!mounts_sb_equal(root, m) ||
				    strcmp(root->root, m->root)) {
					pr_err("Nested mount namespaces with different "
					       "roots %d (@%s %s) %d (@%s %s) are not supported yet\n",
					       root->mnt_id, root->mountpoint, root->root,
					       m->mnt_id, m->mountpoint, m->root);
					return NULL;
				}

				/*
				 * A root of a sub mount namespace is
				 * mounted in a temporary directory in the
				 * root mount namespace, so its parent is
				 * the main root.
				 */
				parent = tmp_root_mount;
				if (unlikely(!tmp_root_mount)) {
					pr_err("Nested mount %d (@%s %s) w/o root insertion detected\n",
					       m->mnt_id, m->mountpoint, m->root);
					return NULL;
				}

				pr_debug("Mountpoint %d (@%s) get parent %d (@%s)\n",
					 m->mnt_id, m->mountpoint,
					 parent->mnt_id, parent->mountpoint);
			} else {
				pr_err("No root found for mountpoint %d (@%s)\n",
					m->mnt_id, m->mountpoint);
				return NULL;
			}
		}

		/* parent is never NULL here: every parentless path above continued or returned */
		m->parent = parent;
		list_add_tail(&m->siblings, &parent->children);
	}

	if (!root) {
		pr_err("No root found for tree\n");
		return NULL;
	}

	/* Hook the temporary roots holder under the real root */
	if (tmp_root_mount) {
		tmp_root_mount->parent = root;
		list_add_tail(&tmp_root_mount->siblings, &root->children);
	}

	return root;
}

/* Depth of a mount == the number of '/' characters in its mountpoint */
static unsigned int mnt_depth(struct mount_info *m)
{
	const char *pos = m->mountpoint;
	unsigned int slashes = 0;

	while ((pos = strchr(pos, '/')) != NULL) {
		slashes++;
		pos++;
	}

	return slashes;
}

/*
 * Recursively reorder the children of every node of @tree so that
 * siblings are sorted by the descending depth of their mountpoints.
 * Siblings of equal depth keep their original relative order.
 */
static void mnt_resort_siblings(struct mount_info *tree)
{
	struct mount_info *m, *p;
	LIST_HEAD(list);

	/*
	 * Put siblings of each node in an order they can be (u)mounted
	 * I.e. if we have mounts on foo/bar/, foo/bar/foobar/ and foo/
	 * we should put them in the foo/bar/foobar/, foo/bar/, foo/ order.
	 * Otherwise we will not be able to (u)mount them in a sequence.
	 *
	 * Funny, but all we need for this is to sort them in the descending
	 * order of the amount of /-s in a path =)
	 *
	 * Use stupid insertion sort here, we're not expecting mount trees
	 * to contain hundreds (or more) elements.
	 */

	pr_info("\tResorting siblings on %d\n", tree->mnt_id);
	while (!list_empty(&tree->children)) {
		unsigned int depth;

		m = list_first_entry(&tree->children, struct mount_info, siblings);
		list_del(&m->siblings);

		/*
		 * Find the first already sorted entry that is strictly
		 * shallower than m and put m right in front of it. If
		 * there's no such entry, p ends up being the list head
		 * and m goes to the very tail.
		 */
		depth = mnt_depth(m);
		list_for_each_entry(p, &list, siblings)
			if (mnt_depth(p) < depth)
				break;

		list_add_tail(&m->siblings, &p->siblings);
		mnt_resort_siblings(m);
	}

	list_splice(&list, &tree->children);
}

static void mnt_tree_show(struct mount_info *tree, int off)
{
	struct mount_info *m;

	pr_info("%*s[%s](%d->%d)\n", off, "",
			tree->mountpoint, tree->mnt_id, tree->parent_mnt_id);

	list_for_each_entry(m, &tree->children, siblings)
		mnt_tree_show(m, off + 1);

	pr_info("%*s<--\n", off, "");
}

/*
 * Try to resolve @info as an external mount.
 *
 * First a "mnt[<mountpoint>]" mapping is looked up. If there is none
 * and the file system type is unsupported, a "dev[<major>/<minor>]"
 * mapping is tried; on a hit the mount is switched to the auto fstype
 * and its source is replaced with "dev[<value>]".
 *
 * Returns -1 on error, 1 if external mount resolved, 0 otherwise
 */
static int try_resolve_ext_mount(struct mount_info *info)
{
	char devstr[64];
	char *mapping, *dev_val, *new_source;
	int source_len;

	mapping = ext_mount_lookup(info->mountpoint + 1 /* trim the . */);
	if (mapping != NULL) {
		pr_info("Found %s mapping for %s mountpoint\n",
				mapping, info->mountpoint);
		info->external = mapping;
		return 1;
	}

	/* The device mapping is only consulted for unsupported file systems */
	if (info->fstype->code != FSTYPE__UNSUPPORTED)
		return 0;

	snprintf(devstr, sizeof(devstr), "dev[%d/%d]",
			kdev_major(info->s_dev),  kdev_minor(info->s_dev));

	dev_val = external_lookup_by_key(devstr);
	if (IS_ERR_OR_NULL(dev_val))
		return 0;

	/* sizeof() accounts for the "dev[]" decoration and the NUL */
	source_len = strlen(dev_val) + sizeof("dev[]");
	new_source = xmalloc(source_len);
	if (new_source == NULL)
		return -1;

	snprintf(new_source, source_len, "dev[%s]", dev_val);

	info->fstype = fstype_auto();
	BUG_ON(info->fstype->code != FSTYPE__AUTO);

	xfree(info->source);
	info->source = new_source;

	return 1;
}

static struct mount_info *find_widest_shared(struct mount_info *m)
{
	struct mount_info *p;

	/*
	 * Try to find a mount, which is wider or equal.
	 * A is wider than B, if A->root is a subpath of B->root.
	 */
	list_for_each_entry(p, &m->mnt_share, mnt_share)
		if (issubpath(m->root, p->root))
			return p;

	return NULL;
}

/*
 * Among the children of @m find the one mounted at @ct_mountpoint.
 * Only the first child with that mountpoint is considered: it is
 * returned if it equals @ct (see mounts_equal()), otherwise the
 * search stops and NULL is returned.
 */
static struct mount_info *find_shared_peer(struct mount_info *m,
		struct mount_info *ct, char *ct_mountpoint)
{
	struct mount_info *child;

	list_for_each_entry(child, &m->children, siblings) {
		if (strcmp(ct_mountpoint, child->mountpoint) != 0)
			continue;

		return mounts_equal(child, ct) ? child : NULL;
	}

	return NULL;
}

/*
 * Verify that @m has the same set of (visible) children as a wider
 * mount from its shared group.
 *
 * Returns 0 if the sets match or if no wider peer exists, -1 otherwise.
 * The children of @m are kept, but their order on the list may change.
 */
static int validate_shared(struct mount_info *m)
{
	struct mount_info *t, *ct;
	char buf[PATH_MAX], *sibling_path;
	LIST_HEAD(children);

	/*
	 * Check that all mounts in one shared group have the same set of
	 * children. Only visible children are accounted. A non-root bind-mount
	 * doesn't see children out of its root and it's an expected case.
	 *
	 * Here are a few conditions:
	 * 1. t is wider than m
	 * 2. We search a wider mount in the same direction, so when we
	 *    enumerate all mounts, we can't be sure that all of them
	 *    have the same set of children.
	 */

	t = find_widest_shared(m);
	if (!t)
		/*
		 * The current mount is the widest one in its shared group,
		 * all others will be compared to it or with some other,
		 * which will be compared to it.
		 */
		return 0;

	/* Search a child, which is visible in both mounts. */
	list_for_each_entry(ct, &t->children, siblings) {
		struct mount_info *cm;

		/* Namespace roots and cr-time mounts don't take part in the check */
		if (ct->is_ns_root || ct->mnt_id == CRTIME_MNT_ID)
			continue;

		/*
		 * NOTE(review): presumably translates ct's mountpoint into
		 * the path it would have under m, returning NULL when ct is
		 * not visible from m -- confirm against mnt_get_sibling_path().
		 */
		sibling_path = mnt_get_sibling_path(ct, m, buf, sizeof(buf));
		if (sibling_path == NULL)
			continue;

		cm = find_shared_peer(m, ct, sibling_path);
		if (!cm)
			goto err;

		/*
		 * Keep this one aside. At the end of t's children scan we should
		 * move _all_ m's children here (the check in the loop below).
		 */
		list_move(&cm->siblings, &children);
	}

	/* Now all real mounts should be moved, only cr-time ones may remain */
	list_for_each_entry(ct, &m->children, siblings) {
		if (ct->mnt_id != CRTIME_MNT_ID)
			goto err;
	}

	/* Give the set-aside children back to m */
	list_splice(&children, &m->children);
	return 0;

err:
	/* Give the set-aside children back to m on failure too */
	list_splice(&children, &m->children);
	pr_err("%d:%s and %d:%s have different set of mounts\n",
			m->mnt_id, m->mountpoint, t->mnt_id, t->mountpoint);
	return -1;
}

/*
 * Find the mount_info from which the respective bind-mount
 * can be created. It can be either an FS-root mount, or the
 * root of the tree (the latter only if its root path is the
 * sub-path of the bind mount's root).
 */

static struct mount_info *find_fsroot_mount_for(struct mount_info *bm)
{
	struct mount_info *sm;

	list_for_each_entry(sm, &bm->mnt_bind, mnt_bind)
		if (fsroot_mounted(sm) ||
				(sm->parent == NULL &&
				 strstartswith(bm->root, sm->root)))
			return sm;

	return NULL;
}

/*
 * Tell whether some sibling of @m (another child of the same parent)
 * satisfies issubpath(sibling->mountpoint, m->mountpoint).
 * A mount without a parent never overmounts.
 */
static bool does_mnt_overmount(struct mount_info *m)
{
	struct mount_info *sib;

	if (m->parent == NULL)
		return false;

	list_for_each_entry(sib, &m->parent->children, siblings) {
		if (sib != m && issubpath(sib->mountpoint, m->mountpoint))
			return true;
	}

	return false;
}

/*
 * Check that every mount from the @info list can be handled:
 * shared groups are consistent, every non-external mount is either an
 * FS-root mount of a supported type, a bind-mount with a known source,
 * or is served by a plugin, and no mount overmounts a sibling.
 *
 * @for_dump selects between asking plugins (dump) and trusting the
 * need_plugin flag (restore).
 *
 * Returns 0 if everything is fine, -1 otherwise.
 */
static int validate_mounts(struct mount_info *info, bool for_dump)
{
	struct mount_info *m, *t;

	for (m = info; m; m = m->next) {
		if (m->parent == NULL || m->is_ns_root)
			/* root mount can be any */
			continue;

		if (m->shared_id && validate_shared(m))
			return -1;

		/* External mounts are bind-mounted from outside, their fstype doesn't matter */
		if (m->external)
			goto skip_fstype;

		/*
		 * Mountpoint can point to / of an FS. In that case this FS
		 * should be of some known type so that we can just mount one.
		 *
		 * Otherwise it's a bindmount mountpoint and we try to find
		 * what fsroot mountpoint it's bound to. If this point is the
		 * root mount, the path to bindmount root should be accessible
		 * from the rootmount path (the strstartswith check in
		 * find_fsroot_mount_for()).
		 */

		if (fsroot_mounted(m)) {
			if (m->fstype->code == FSTYPE__UNSUPPORTED) {
				pr_err("FS mnt %s dev %#x root %s unsupported id %d\n",
						m->mountpoint, m->s_dev, m->root, m->mnt_id);
				return -1;
			}
		} else {
			t = find_fsroot_mount_for(m);
			if (!t) {
				int ret;

				/*
				 * No root-mount found for this bind and it's neither
				 * marked nor auto-resolved as external one. So last
				 * chance not to fail is to talk to plugins.
				 */

				if (for_dump) {
					ret = run_plugins(DUMP_EXT_MOUNT, m->mountpoint, m->mnt_id);
					if (ret == 0)
						m->need_plugin = true;
				} else
					/*
					 * Plugin should take care of this one
					 * in restore_ext_mount, or do_bind_mount
					 * will mount it as external
					 */
					ret = m->need_plugin ? 0 : -ENOTSUP;

				if (ret < 0) {
					/* Other errors are expected to be reported by whoever returned them */
					if (ret == -ENOTSUP)
						pr_err("%d:%s doesn't have a proper root mount\n",
								m->mnt_id, m->mountpoint);
					return -1;
				}
			}
		}
skip_fstype:
		if (does_mnt_overmount(m)) {
			pr_err("Unable to handle mounts under %d:%s\n",
					m->mnt_id, m->mountpoint);
			return -1;
		}
	}

	return 0;
}

/*
 * Pick from @list (mounts outside the container) the mount that
 * @info is most likely bind-mounted from.
 *
 * A candidate has to share the super-block with @info and satisfy
 * issubpath(info->root, candidate->root). The root check matters for
 * a situation like:
 *
 * root@criu:~# mount --bind bind1/subdir/ bind2
 * root@criu:~# mount --bind bind1/ bind3
 *
 * outside the container, with bind1 directly bind mounted inside the
 * container: the super-blocks are equal, but the roots are different
 * and we want the one with the right root.
 *
 * Among the candidates an exact sharing match wins immediately.
 * Consider the case of:
 *
 * mount /xxx
 * mount --bind /xxx /yyy
 * mount --make-shared /yyy
 * mount --bind /xxx /zzz
 * mount --make-shared /zzz
 * bind mount a shared mount into the namespace
 *
 * Here, we want to return the /right/ mount, not just a mount
 * that's equal. However, in the case:
 *
 * bind mount a shared mount into the namespace
 * inside the namespace, remount MS_PRIVATE
 * inside the namespace, remount MS_SHARED
 *
 * there will be no external mount with matching sharing because the
 * sharing is only internal; we still want to bind mount from this
 * mountinfo so the last candidate seen is returned, and the sharing
 * should be made private after that bind mount.
 *
 * Returns NULL if there's no candidate at all.
 */
static struct mount_info *find_best_external_match(struct mount_info *list, struct mount_info *info)
{
	struct mount_info *cur, *best = NULL;

	for (cur = list; cur != NULL; cur = cur->next) {
		if (!mounts_sb_equal(info, cur) || !issubpath(info->root, cur->root))
			continue;

		best = cur;

		/* Below are the cases where we found an exact match. */
		if ((info->flags & MS_SHARED) && info->shared_id == cur->shared_id)
			return cur;

		if ((info->flags & MS_SLAVE) && info->master_id == cur->shared_id)
			return cur;
	}

	return best;
}

/*
 * Find criu's own mount namespace among ns_ids and make sure its
 * mountinfo is collected.
 *
 * Returns the namespace, or NULL (with an error logged) if it's not
 * on the list or its mountinfo can't be collected.
 */
static struct ns_id *find_ext_ns_id(void)
{
	struct ns_id *ns;

	/*
	 * Walk the whole list, the last entry included. Testing ns
	 * rather than ns->next also copes with an empty list.
	 */
	for (ns = ns_ids; ns; ns = ns->next) {
		if (ns->type != NS_CRIU || ns->nd != &mnt_ns_desc)
			continue;

		/* Collect the mountinfo lazily, on the first request */
		if (!ns->mnt.mntinfo_list &&
		    !collect_mntinfo(ns, true))
			break;
		return ns;
	}

	pr_err("Failed to find criu pid's mount ns\n");
	return NULL;
}

/*
 * Mark the mounts from @info that are external, i.e. come from
 * outside of the dumped mount tree.
 *
 * Explicit mappings are tried first (try_resolve_ext_mount()). If
 * auto-detection is enabled, the remaining mounts are matched against
 * the mounts of criu's own mount namespace; a match gets ->external
 * set to AUTODETECTED_MOUNT and ->source replaced with the guessed
 * path in criu's namespace.
 *
 * Returns 0 on success, negative value on error.
 */
static int resolve_external_mounts(struct mount_info *info)
{
	struct ns_id *ext_ns = NULL;
	struct mount_info *m;

	if (opts.autodetect_ext_mounts) {
		ext_ns = find_ext_ns_id();
		if (!ext_ns)
			return -1;
	}

	for (m = info; m; m = m->next) {
		int ret;
		char *p, *cut_root;
		struct mount_info *match;

		/* Tree and namespace roots are never external */
		if (m->parent == NULL || m->is_ns_root)
			continue;

		ret = try_resolve_ext_mount(m);
		if (ret < 0)
			return ret;
		/* Resolved explicitly, or auto-detection is off (ext_ns is NULL) */
		if (ret == 1 || !ext_ns)
			continue;

		match = find_best_external_match(ext_ns->mnt.mntinfo_list, m);
		if (!match)
			continue;

		if (m->flags & MS_SHARED) {
			if (!opts.enable_external_sharing)
				continue;

			/* The sharing group exists only inside the dumped tree */
			if (m->shared_id != match->shared_id)
				m->internal_sharing = true;
		}

		if (m->flags & MS_SLAVE) {
			if (!opts.enable_external_masters)
				continue;

			/*
			 * In order to support something like internal slavery,
			 * we need to teach can_mount_now and do_mount_one
			 * about slavery relationships in external mounts. This
			 * seems like an uncommon case, so we punt for now.
			 */
			if (m->master_id != match->shared_id && m->master_id != match->master_id)
				continue;
		}

		/*
		 * NOTE(review): presumably returns the part of m->root that
		 * lies below match->root -- confirm against cut_root_for_bind().
		 */
		cut_root = cut_root_for_bind(m->root, match->root);

		p = xsprintf("%s/%s", match->mountpoint + 1, cut_root);
		if (!p)
			return -1;

		m->external = AUTODETECTED_MOUNT;

		/*
		 * Put the guessed name in source. It will be picked up
		 * as auto-root in get_mp_root() on restore.
		 */
		xfree(m->source);
		m->source = p;

		pr_info("autodetected external mount %s for %s\n", p, m->mountpoint);
	}

	return 0;
}

/*
 * Link together the mounts from @info that are related by
 * propagation or by sharing a super-block:
 *  - a slave is put on its master's mnt_slave_list and gets ->mnt_master;
 *  - members of one shared group are linked via ->mnt_share;
 *  - mounts of the same super-block are linked via ->mnt_bind.
 *
 * @root_master_id, if non-zero, is the master_id that is ignored
 * (mounts having it get master_id reset to -1).
 *
 * Returns 0 on success, -1 if a non-external mount has a master that
 * is not in the list.
 */
static int resolve_shared_mounts(struct mount_info *info, int root_master_id)
{
	struct mount_info *m, *t;

	/*
	 * If we have shared mounts, both master and
	 * slave targets are to be present in the mount
	 * list, otherwise we can't be sure if we can
	 * recreate the scheme later on restore.
	 */
	for (m = info; m; m = m->next) {
		bool need_share, need_master;

		/* the root master_id can be ignored, because it's already created */
		if (root_master_id && root_master_id == m->master_id)
			m->master_id = -1;

		/* A non-empty mnt_share means the group was already collected via a peer */
		need_share = m->shared_id && list_empty(&m->mnt_share);
		need_master = m->master_id > 0;

		pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n",
			 m->mnt_id, m->shared_id, m->master_id, m->mountpoint);

		for (t = info; t && (need_share || need_master); t = t->next) {
			if (t == m)
				continue;
			/* The first mount of the master's group becomes the master */
			if (need_master && t->shared_id == m->master_id) {
				pr_debug("\tThe mount %3d is slave for %3d (@%s -> @%s)\n",
					 m->mnt_id, t->mnt_id,
					 m->mountpoint, t->mountpoint);
				list_add(&m->mnt_slave, &t->mnt_slave_list);
				m->mnt_master = t;
				need_master = false;
			}

			/* Collect all mounts from this group */
			if (need_share && t->shared_id == m->shared_id) {
				pr_debug("\tMount %3d is shared with %3d group %3d (@%s -> @%s)\n",
					 m->mnt_id, t->mnt_id, m->shared_id,
					 t->mountpoint, m->mountpoint);
				list_add(&t->mnt_share, &m->mnt_share);
			}
		}

		/*
		 * If we haven't already determined this mount is external,
		 * then we don't know where it came from.
		 */
		if (need_master && m->parent && !m->external) {
			pr_err("Mount %d %s (master_id: %d shared_id: %d) "
			       "has unreachable sharing. Try --enable-external-masters.\n", m->mnt_id,
				m->mountpoint, m->master_id, m->shared_id);
			return -1;
		}

		/* Search bind-mounts, unless m was already linked by an earlier mount */
		if (list_empty(&m->mnt_bind)) {
			/*
			 * A first mounted point will be set up as a source point
			 * for others. Look at propagate_mount()
			 */
			for (t = m->next; t; t = t->next) {
				if (mounts_sb_equal(m, t)) {
					list_add(&t->mnt_bind, &m->mnt_bind);
					pr_debug("\tThe mount %3d is bind for %3d (@%s -> @%s)\n",
						 t->mnt_id, m->mnt_id,
						 t->mountpoint, m->mountpoint);
				}
			}
		}
	}

	return 0;
}

/*
 * Build the mount tree out of @list: resolve the parent/child
 * relations, then organize the siblings in a sequence in which they
 * can be mounted/umounted, and log the result.
 *
 * Returns the root of the tree or NULL on error.
 */
static struct mount_info *mnt_build_tree(struct mount_info *list, struct mount_info *roots_mp)
{
	struct mount_info *root;

	pr_info("Building mountpoints tree\n");

	root = mnt_build_ids_tree(list, roots_mp);
	if (root != NULL) {
		mnt_resort_siblings(root);
		pr_info("Done:\n");
		mnt_tree_show(root, 0);
	}

	return root;
}

/*
 * Open the mountpoint of @pm and check that the opened directory
 * really lives on the expected device (pm->s_dev_rt).
 *
 * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case.
 * If mnt_fd is -1, the mountpoint will be opened by this function.
 *
 * Returns the descriptor on success, -1 on error.
 */
int __open_mountpoint(struct mount_info *pm, int mnt_fd)
{
	struct stat st;
	dev_t real_dev;

	if (mnt_fd == -1) {
		int root_fd = mntns_get_root_fd(pm->nsid);

		if (root_fd < 0)
			return -1;

		mnt_fd = openat(root_fd, pm->ns_mountpoint, O_RDONLY);
		if (mnt_fd < 0) {
			pr_perror("Can't open %s", pm->ns_mountpoint);
			return -1;
		}
	}

	if (fstat(mnt_fd, &st) < 0) {
		pr_perror("fstat(%s) failed", pm->ns_mountpoint);
		goto fail;
	}

	if (pm->s_dev_rt == MOUNT_INVALID_DEV) {
		pr_err("Resolving over unvalid device for %#x %s %s\n",
		       pm->s_dev, pm->fstype->name, pm->ns_mountpoint);
		goto fail;
	}

	/*
	 * Always check for @s_dev_rt here, because the @s_dev
	 * from the image (in case of restore) has all rights
	 * to not match the device (say it's migrated and kernel
	 * allocates new device ID).
	 */
	real_dev = phys_stat_resolve_dev(pm->nsid, st.st_dev, pm->ns_mountpoint + 1);
	if (real_dev == pm->s_dev_rt)
		return mnt_fd;

	pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n",
	       pm->s_dev, pm->s_dev_rt, (int)real_dev,
	       pm->fstype->name, pm->ns_mountpoint);
fail:
	close(mnt_fd);
	return -1;
}

/*
 * Open the mountpoint of the first known mount living on @s_dev.
 * Returns -ENOENT if there is no such mount, otherwise the result of
 * __open_mountpoint().
 */
int open_mount(unsigned int s_dev)
{
	struct mount_info *mi = lookup_mnt_sdev(s_dev);

	return mi ? __open_mountpoint(mi, -1) : -ENOENT;
}

/*
 * Bind-mount a mount point in a temporary place without children.
 *
 * The directory is created from the @mnt_path_tmp template; if that
 * fails with ENOENT, the @mnt_path_root template is tried instead.
 * Returns the path of the new mount (one of the two templates, filled
 * in by mkdtemp()) or NULL on error.
 */
static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_path_root)
{
	char *dir = mkdtemp(mnt_path_tmp);

	if (dir == NULL) {
		/* The parent of the first template may not exist, fall back */
		if (errno == ENOENT)
			dir = mkdtemp(mnt_path_root);

		if (dir == NULL) {
			pr_perror("Can't create a temporary directory");
			return NULL;
		}
	}

	/* No MS_REC, so the children don't come along */
	if (mount(mi->mountpoint, dir, NULL, MS_BIND, NULL) != 0) {
		pr_perror("Can't bind-mount %d:%s to %s",
				mi->mnt_id, mi->mountpoint, dir);
		rmdir(dir);
		return NULL;
	}

	return dir;
}

/*
 * Returned by open_mountpoint() when the mount is completely covered
 * by a child mounted right at the same mountpoint. dump_one_fs()
 * treats this value as "try another bind-mount of this file system".
 */
#define MNT_UNREACHABLE INT_MIN
/*
 * Open a descriptor on the root of mount @pm that doesn't see the
 * mounts placed on top of it.
 *
 * Returns the descriptor on success, MNT_UNREACHABLE if @pm is
 * overmounted at its own mountpoint, -1 on other errors.
 */
int open_mountpoint(struct mount_info *pm)
{
	struct mount_info *c;
	int fd = -1, ns_old = -1;
	/* mkdtemp() templates, see get_clean_mnt() */
	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
	char *mnt_path = mnt_path_tmp;
	int cwd_fd;

	/*
	 * If a mount doesn't have children, we can open a mount point,
	 * otherwise we need to create a "private" copy.
	 */
	if (list_empty(&pm->children))
		return __open_mountpoint(pm, -1);

	pr_info("Something is mounted on top of %s\n", pm->mountpoint);

	/* A child mounted at exactly the same path hides pm completely */
	list_for_each_entry(c, &pm->children, siblings) {
		if (!strcmp(c->mountpoint, pm->mountpoint)) {
			pr_debug("%d:%s is overmounted\n", pm->mnt_id, pm->mountpoint);
			return MNT_UNREACHABLE;
		}
	}

	/*
	 * To create a "private" copy, the target mount is bind-mounted
	 * in a temporary place w/o MS_REC (non-recursively).
	 * A mount point can't be bind-mounted in criu's namespace, it will be
	 * mounted in a target namespace. The sequence of actions is
	 * mkdtemp, setns(tgt), mount, open, detach, setns(old).
	 */

	/* Remember the cwd, it is restored with fchdir() on every path below */
	cwd_fd = open(".", O_DIRECTORY);
	if (cwd_fd < 0) {
		pr_perror("Unable to open cwd");
		return -1;
	}

	if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
		goto out;

	mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
	if (mnt_path == NULL)
		goto out;

	/*
	 * NOTE(review): presumably opens the temporary mount, then
	 * detaches it and removes the directory -- confirm against
	 * open_detach_mount().
	 */
	fd = open_detach_mount(mnt_path);
	if (fd < 0)
		goto out;

	if (restore_ns(ns_old, &mnt_ns_desc)) {
		/* Don't let the out path call restore_ns() once more */
		ns_old = -1;
		goto out;
	}
	if (fchdir(cwd_fd)) {
		pr_perror("Unable to restore cwd");
		close(cwd_fd);
		close(fd);
		return -1;
	}
	close(cwd_fd);

	/* Validate the device; fd is closed by the callee on failure */
	return __open_mountpoint(pm, fd);
out:
	if (ns_old >= 0)
		 restore_ns(ns_old, &mnt_ns_desc);
	close_safe(&fd);
	if (fchdir(cwd_fd))
		pr_perror("Unable to restore cwd");
	close(cwd_fd);
	return -1;
}

/*
 * Create a synthetic "cr-time" mount entry (mnt_id == CRTIME_MNT_ID)
 * for the file system @fsname mounted at @path on device @s_dev and
 * insert it into the mount tree rooted at @root, as well as into the
 * ->next list right after its parent.
 *
 * Returns 0 on success, -1 on allocation failure. A partially built
 * entry is released on failure.
 */
static __maybe_unused int add_cr_time_mount(struct mount_info *root, char *fsname, const char *path, unsigned int s_dev)
{
	struct mount_info *mi, *t, *parent;

	mi = mnt_entry_alloc();
	if (!mi)
		return -1;
	/* +2: the leading '.' and the terminating NUL */
	mi->mountpoint = xmalloc(strlen(path) + 2);
	if (!mi->mountpoint)
		goto err;
	mi->ns_mountpoint = mi->mountpoint;
	sprintf(mi->mountpoint, ".%s", path);
	mi->mnt_id = CRTIME_MNT_ID;
	mi->flags = mi->sb_flags = 0;
	mi->root = xstrdup("/");
	mi->fsname = xstrdup(fsname);
	mi->source = xstrdup(fsname);
	mi->options = xstrdup("");
	if (!mi->root || !mi->fsname || !mi->source || !mi->options)
		goto err;
	mi->fstype = find_fstype_by_name(fsname);

	mi->s_dev = mi->s_dev_rt = s_dev;

	/* Descend while some child's mountpoint is a prefix of the new one */
	parent = root;
	while (1) {
		list_for_each_entry(t, &parent->children, siblings) {
			if (strstartswith(mi->mountpoint, t->mountpoint)) {
				parent = t;
				break;
			}
		}
		/* The children scan ran to its end, parent is the deepest match */
		if (&t->siblings == &parent->children)
			break;
	}

	mi->nsid = parent->nsid;
	mi->parent = parent;
	mi->parent_mnt_id = parent->mnt_id;
	mi->next = parent->next;
	parent->next = mi;
	list_add(&mi->siblings, &parent->children);
	pr_info("Add cr-time mountpoint %s with parent %s(%u)\n",
		mi->mountpoint, parent->mountpoint, parent->mnt_id);
	return 0;
err:
	/*
	 * mnt_entry_free() is what free_mntinfo() uses for the entries
	 * of the ->next list this one would have joined.
	 * NOTE(review): assumes mnt_entry_alloc() zeroes the entry, so
	 * the not yet assigned strings are NULL here -- confirm.
	 */
	mnt_entry_free(mi);
	return -1;
}

/*
 * Mount @source of @type at @target inside the mount namespace @ns
 * and report the device of the new mount via @s_dev.
 *
 * Returns 1 in case of success, -errno in case of mount fail, and 0 on other errors
 * (namespace switching or stat() failures).
 */
static __maybe_unused int mount_cr_time_mount(struct ns_id *ns, unsigned int *s_dev, const char *source,
			       const char *target, const char *type)
{
	int saved_ns_fd, result;
	struct stat st;

	if (switch_ns(ns->ns_pid, &mnt_ns_desc, &saved_ns_fd) < 0) {
		pr_err("Can't switch mnt_ns\n");
		return 0;
	}

	if (mount(source, target, type, 0, NULL) < 0) {
		result = -errno;
	} else if (stat(target, &st) < 0) {
		pr_perror("Can't stat %s", target);
		result = 0;
	} else {
		*s_dev = MKKDEV(major(st.st_dev), minor(st.st_dev));
		result = 1;
	}

	/* Failing to get back to the original namespace overrides the result */
	if (restore_ns(saved_ns_fd, &mnt_ns_desc) < 0)
		return 0;

	return result;
}



/*
 * Dump the content of the file system @mi is mounted from.
 *
 * Nothing is done for namespace roots, plugin-served and external
 * mounts, and for file systems without a ->dump callback. Otherwise
 * the FS-root bind-mounts of the same super-block are tried one by
 * one, starting with @mi itself, until one of them can be dumped;
 * the ones reported as MNT_UNREACHABLE are skipped. After a
 * successful dump every mount on the ring is marked as dumped.
 *
 * Returns 0 on success, a negative value on error.
 */
static int dump_one_fs(struct mount_info *mi)
{
	struct mount_info *pm = mi;
	struct mount_info *t;
	bool first = true;

	if (mi->is_ns_root || mi->need_plugin || mi->external || !mi->fstype->dump)
		return 0;

	/* mnt_bind is a cycled list, so list_for_each can't be used here. */
	for (; &pm->mnt_bind != &mi->mnt_bind || first;
	     pm = list_entry(pm->mnt_bind.next, typeof(*pm), mnt_bind)) {
		int ret;

		first = false;

		if (!fsroot_mounted(pm))
			continue;

		ret = pm->fstype->dump(pm);
		if (ret == MNT_UNREACHABLE)
			continue;
		if (ret < 0)
			return ret;

		/*
		 * The walk below starts from pm and thus visits every
		 * mount on the ring except pm itself. Mark pm explicitly,
		 * otherwise (when pm != mi) the same file system would be
		 * dumped once more when pm's own turn comes.
		 */
		pm->dumped = true;
		list_for_each_entry(t, &pm->mnt_bind, mnt_bind)
			t->dumped = true;
		return 0;
	}

	pr_err("Unable to dump a file system for %d:%s\n",
				mi->mnt_id, mi->mountpoint);
	return -1;
}

static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img)
{
	MntEntry me = MNT_ENTRY__INIT;

	pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev,
			pm->root, pm->mountpoint);

	me.fstype		= pm->fstype->code;

	if (me.fstype == FSTYPE__AUTO)
		me.fsname = pm->fsname;


	if (!pm->dumped && dump_one_fs(pm))
		return -1;

	if (pm->mnt_id == CRTIME_MNT_ID) {
		pr_info("Skip dumping cr-time mountpoint: %s\n", pm->mountpoint);
		return 0;
	}

	me.mnt_id		= pm->mnt_id;
	me.root_dev		= pm->s_dev;
	me.parent_mnt_id	= pm->parent_mnt_id;
	me.flags		= pm->flags;
	me.sb_flags		= pm->sb_flags;
	me.has_sb_flags		= true;
	me.mountpoint		= pm->mountpoint + 1;
	me.source		= pm->source;
	me.options		= pm->options;
	me.shared_id		= pm->shared_id;
	me.has_shared_id	= true;
	me.master_id		= pm->master_id;
	me.has_master_id	= true;
	if (pm->need_plugin) {
		me.has_with_plugin = true;
		me.with_plugin = true;
	}
	if (pm->deleted) {
		me.has_deleted	= true;
		me.deleted	= true;
	}

	if (pm->internal_sharing) {
		me.has_internal_sharing = true;
		me.internal_sharing = true;
	}

	if (pm->external) {
		/*
		 * For external mount points dump the mapping's
		 * value instead of root. See collect_mnt_from_image
		 * for reverse mapping details.
		 */
		me.root	= pm->external;
		me.has_ext_mount = true;
		me.ext_mount = true;
	} else
		me.root = pm->root;

	if (pb_write_one(img, &me, PB_MNT))
		return -1;

	return 0;
}

/* Release every element of the ->next chained list that starts at @pms. */
static void free_mntinfo(struct mount_info *pms)
{
	struct mount_info *cur, *following;

	for (cur = pms; cur != NULL; cur = following) {
		/* Fetch the successor before the element goes away */
		following = cur->next;
		mnt_entry_free(cur);
	}
}

/*
 * Parse the mountinfo of @ns->ns_pid and build the mount tree for @ns.
 *
 * On success the list is stored in ns->mnt.mntinfo_list, the tree in
 * ns->mnt.mntinfo_tree, and the list head is returned. On failure NULL
 * is returned; if the tree could not be built the parsed list is freed.
 */
struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump)
{
	struct mount_info *list, *tree;

	list = parse_mountinfo(ns->ns_pid, ns, for_dump);
	if (list == NULL) {
		pr_err("Can't parse %d's mountinfo\n", ns->ns_pid);
		return NULL;
	}

	tree = mnt_build_tree(list, NULL);
	ns->mnt.mntinfo_tree = tree;
	if (tree == NULL) {
		free_mntinfo(list);
		return NULL;
	}

	ns->mnt.mntinfo_list = list;
	return list;
}

/*
 * Dump all mounts of namespace @ns into its CR_FD_MNTS image.
 *
 * @pms is walked through ->next for as long as the elements belong
 * to @ns. Returns 0 on success and -1 on failure.
 */
static int dump_mnt_ns(struct ns_id *ns, struct mount_info *pms)
{
	struct mount_info *cur;
	struct cr_img *img;
	int exit_code = 0;

	pr_info("Dumping mountpoints\n");
	img = open_image(CR_FD_MNTS, O_DUMP, ns->id);
	if (img == NULL)
		return -1;

	for (cur = pms; cur != NULL && cur->nsid == ns; cur = cur->next) {
		if (dump_one_mountpoint(cur, img)) {
			exit_code = -1;
			break;
		}
	}

	close_image(img);
	return exit_code;
}

/*
 * Walk the mount tree rooted at _r.
 *
 * _r     - the root of the walk
 * _el    - the list_head member to follow: "next" or "prev"
 * _fn_f  - pre-order traversal function, called before the children
 *	    of an element are visited
 * _fn_r  - post-order traversal function, called when the walk leaves
 *	    an element (this includes postponed elements)
 * _plist - a postpone list. _el is added to this list, if _fn_f returns
 *	    a positive value, and all lower elements are not enumerated.
 * _prgs  - a counter, incremented for every element on which _fn_f
 *	    returned zero
 *
 * NOTE: the macro contains "return -1" statements (taken when _fn_f
 * returns a negative value or _fn_r returns non-zero) and a label
 * named "up", so it can only be used in a function returning int and
 * only once per function.
 */
#define MNT_TREE_WALK(_r, _el, _fn_f, _fn_r, _plist, _prgs) do {		\
		struct mount_info *_mi = _r;					\
										\
		while (1) {							\
			int ret;						\
										\
			list_del_init(&_mi->postpone);				\
										\
			ret = _fn_f(_mi);					\
			if (ret < 0)						\
				return -1;					\
			else if (ret > 0) {					\
				list_add_tail(&_mi->postpone, _plist);		\
				goto up;					\
			}							\
										\
			_prgs++;					\
										\
			if (!list_empty(&_mi->children)) {			\
				_mi = list_entry(_mi->children._el,		\
						struct mount_info, siblings);	\
				continue;					\
			}							\
	up:									\
			if (_fn_r(_mi))						\
				return -1;					\
			if (_mi == _r)						\
				break;						\
			if (_mi->siblings._el == &_mi->parent->children) {	\
				_mi = _mi->parent;				\
				goto up;					\
			}							\
			_mi = list_entry(_mi->siblings._el,			\
					struct mount_info, siblings);		\
		}								\
	} while (0)

/*
 * A "do nothing" callback for MNT_TREE_WALK: MNT_WALK_NONE(_mi)
 * expands to "0 && (_mi)", which evaluates to 0 without calling
 * anything.
 */
#define MNT_WALK_NONE	0 &&


/*
 * Call @fn on every element of the tree rooted at @start, parents
 * before children.
 *
 * An element for which @fn returns a positive value is postponed
 * together with its subtree and retried on the next pass. The passes
 * repeat until nothing is postponed; a pass on which @fn never
 * returned zero is treated as a dead end and reported.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mnt_tree_for_each(struct mount_info *start,
		int (*fn)(struct mount_info *))
{
	LIST_HEAD(postpone);
	LIST_HEAD(postpone2);
	struct mount_info *nxt;
	int progress;

	pr_debug("Start with %d:%s\n", start->mnt_id, start->mountpoint);
	list_add(&start->postpone, &postpone);

	do {
		progress = 0;

		/* The walk unlinks each element from the postpone list itself */
		list_for_each_entry_safe(start, nxt, &postpone, postpone)
			MNT_TREE_WALK(start, next, fn, MNT_WALK_NONE, &postpone2, progress);

		if (progress == 0) {
			struct mount_info *stuck;

			pr_err("A few mount points can't be mounted\n");
			list_for_each_entry(stuck, &postpone2, postpone) {
				pr_err("%d:%d %s %s %s\n", stuck->mnt_id,
					stuck->parent_mnt_id, stuck->root,
					stuck->mountpoint, stuck->source);
			}
			return -1;
		}

		/* What was postponed on this pass is the input of the next one */
		list_splice_init(&postpone2, &postpone);
	} while (!list_empty(&postpone));

	return 0;
}

/*
 * Call @fn on every element of the tree rooted at @m, children before
 * parents, following the "prev" links of the sibling lists.
 *
 * No pre-order callback is used, so nothing is ever postponed and the
 * postpone list may be NULL. Returns 0 on success and -1 if @fn
 * returned non-zero.
 */
static int mnt_tree_for_each_reverse(struct mount_info *m,
		int (*fn)(struct mount_info *))
{
	/* Required by the macro, the value is not used here */
	int visited = 0;

	MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *) NULL, visited);

	return 0;
}

/*
 * Find out what to pass as the source of a new mount for @mi.
 *
 * Returns mi->source for mounts with major 0 in s_dev; for
 * FSTYPE__AUTO mounts, returns either the external mapping of
 * mi->source or mi->source itself when it is a block device with
 * matching numbers. Returns NULL (with an error logged) otherwise.
 */
static char *resolve_source(struct mount_info *mi)
{
	struct stat st;
	char *mapped;

	/*
	 * Anonymous block device. Kernel creates them for
	 * diskless mounts.
	 */
	if (kdev_major(mi->s_dev) == 0)
		return mi->source;

	if (mi->fstype->code != FSTYPE__AUTO)
		goto no_dev;

	mapped = external_lookup_by_key(mi->source);
	if (!IS_ERR_OR_NULL(mapped))
		return mapped;

	if (stat(mi->source, &st) == 0 && S_ISBLK(st.st_mode) &&
	    major(st.st_rdev) == kdev_major(mi->s_dev) &&
	    minor(st.st_rdev) == kdev_minor(mi->s_dev))
		return mi->source;

no_dev:
	pr_err("No device for %s mount\n", mi->mountpoint);
	return NULL;
}

/*
 * Apply propagation settings to the mount at mi->mountpoint.
 *
 * An MS_UNBINDABLE mount which is neither shared nor slave is made
 * unbindable and nothing else is done. Otherwise the requested
 * @private, @slave and @shared conversions are applied in this order.
 *
 * Returns 0 on success, -1 (or the mount() result for the unbindable
 * case) on failure.
 */
static int restore_shared_options(struct mount_info *mi, bool private, bool shared, bool slave)
{
	pr_debug("%d:%s private %d shared %d slave %d\n",
			mi->mnt_id, mi->mountpoint, private, shared, slave);

	if (mi->flags & MS_UNBINDABLE) {
		if (!shared && !slave)
			return mount(NULL, mi->mountpoint, NULL, MS_UNBINDABLE, NULL);

		pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", mi->mountpoint);
	}

	if (private) {
		if (mount(NULL, mi->mountpoint, NULL, MS_PRIVATE, NULL)) {
			pr_perror("Unable to make %s private", mi->mountpoint);
			return -1;
		}
	}

	if (slave) {
		if (mount(NULL, mi->mountpoint, NULL, MS_SLAVE, NULL)) {
			pr_perror("Unable to make %s slave", mi->mountpoint);
			return -1;
		}
	}

	if (shared) {
		if (mount(NULL, mi->mountpoint, NULL, MS_SHARED, NULL)) {
			pr_perror("Unable to make %s shared", mi->mountpoint);
			return -1;
		}
	}

	return 0;
}

/*
 * Umount the copies of @mi which were propagated into the mounted
 * slaves of its parent, because we can't be sure that they were
 * inherited in real life.
 *
 * Slaves for which no sibling path can be built are skipped.
 * Returns 0 on success and -1 if an umount fails.
 */
static int umount_from_slaves(struct mount_info *mi)
{
	struct mount_info *slave;
	char buf[PATH_MAX];

	list_for_each_entry(slave, &mi->parent->mnt_slave_list, mnt_slave) {
		char *target;

		if (!slave->mounted)
			continue;

		target = mnt_get_sibling_path(mi, slave, buf, sizeof(buf));
		if (!target)
			continue;

		pr_debug("\t\tUmount slave %s\n", target);
		if (umount(target) == -1) {
			pr_perror("Can't umount slave %s", target);
			return -1;
		}
	}

	return 0;
}

/*
 * If something is mounted in one shared point, it will be spread in
 * all other points from this shared group.
 *
 * Look at Documentation/filesystems/sharedsubtree.txt for more details
 *
 * This function only records, for the not yet mounted members of
 * @mi's share list and slave list, that they have to be bind-mounted
 * from @mi (->bind) and copies ->s_dev_rt to them. It always returns 0.
 */
static int propagate_siblings(struct mount_info *mi)
{
	struct mount_info *peer;

	/*
	 * Peers of the shared group: take over the ones which have no
	 * bind source from their own shared group yet.
	 */
	list_for_each_entry(peer, &mi->mnt_share, mnt_share) {
		if (!peer->mounted &&
		    !(peer->bind && peer->bind->shared_id == peer->shared_id)) {
			pr_debug("\t\tBind share %s\n", peer->mountpoint);
			peer->bind = mi;
			peer->s_dev_rt = mi->s_dev_rt;
		}
	}

	/* Slaves: take over only the ones without any bind source. */
	list_for_each_entry(peer, &mi->mnt_slave_list, mnt_slave) {
		if (!peer->mounted && !peer->bind) {
			pr_debug("\t\tBind slave %s\n", peer->mountpoint);
			peer->bind = mi;
			peer->s_dev_rt = mi->s_dev_rt;
		}
	}

	return 0;
}

/*
 * Update the bookkeeping after @mi has been mounted.
 *
 *  - siblings of @mi (shared peers and slaves) are pointed at @mi as
 *    their bind source;
 *  - copies of @mi propagated into the slaves of its parent are
 *    umounted;
 *  - for every peer of the parent's shared group the child matching
 *    @mi is marked as mounted (the kernel has propagated it);
 *  - other members of @mi's bind list get @mi as their bind source.
 *
 * Returns 0 on success and -1 if a propagated child can't be found.
 * NOTE(review): the results of propagate_siblings(),
 * umount_from_slaves() and restore_shared_options() are not checked
 * here.
 */
static int propagate_mount(struct mount_info *mi)
{
	struct mount_info *t;

	propagate_siblings(mi);

	/* The root mount has no parent group to propagate into */
	if (!mi->parent)
		goto skip_parent;

	umount_from_slaves(mi);

	/* Propagate this mount to everyone from a parent group */

	list_for_each_entry(t, &mi->parent->mnt_share, mnt_share) {
		struct mount_info *c;
		char path[PATH_MAX], *mp;
		bool found = false;

		/* Where @mi is expected to show up under the peer t */
		mp = mnt_get_sibling_path(mi, t, path, sizeof(path));
		if (mp == NULL)
			continue;

		list_for_each_entry(c, &t->children, siblings) {
			if (mounts_equal(mi, c) && !strcmp(mp, c->mountpoint)) {
				pr_debug("\t\tPropagate %s\n", c->mountpoint);

				/*
				 * When a mount is propagated, the result mount
				 * is always shared. If we want to get a private
				 * mount, we need to convert it.
				 */
				restore_shared_options(c, !c->shared_id, 0, 0);

				c->mounted = true;
				propagate_siblings(c);
				umount_from_slaves(c);
				found = true;
			}
		}
		if (!found) {
			pr_err("Unable to find %s\n", mp);
			return -1;
		}
	}

skip_parent:
	/*
	 * FIXME Currently non-root mounts can be restored
	 * only if a proper root mount exists
	 */
	if (fsroot_mounted(mi) || mi->parent == NULL) {
		list_for_each_entry(t, &mi->mnt_bind, mnt_bind) {
			/* Skip mounts which are done, claimed or slaves */
			if (t->mounted)
				continue;
			if (t->bind)
				continue;
			if (t->master_id > 0)
				continue;
			t->bind = mi;
			t->s_dev_rt = mi->s_dev_rt;
		}
	}

	return 0;
}

/*
 * Store the device number of @where, as seen at restore time, in
 * m->s_dev_rt. Returns 0 on success and -1 if stat() fails.
 */
static int fetch_rt_stat(struct mount_info *m, const char *where)
{
	struct stat sb;

	if (stat(where, &sb) == 0) {
		m->s_dev_rt = MKKDEV(major(sb.st_dev), minor(sb.st_dev));
		return 0;
	}

	pr_perror("Can't stat on %s", where);
	return -1;
}

/*
 * Here is the set of flags which we know how to handle in a single mount call.
 * All of them except MS_RDONLY are set only as mnt flags.
 * MS_RDONLY is set for both mnt and sb flags, so we can restore it in one
 * mount call only if it is set in both masks.
 */
#define MS_MNT_KNOWN_FLAGS (MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME | \
				MS_NODIRATIME | MS_RELATIME | MS_RDONLY)

/*
 * Default mount callback: a plain mount() of @src at mi->mountpoint
 * with mi->options as the data argument. Returns the mount() result.
 */
static int do_simple_mount(struct mount_info *mi, const char *src, const
			   char *fstype, unsigned long mountflags)
{
	const char *target = mi->mountpoint;
	const void *data = mi->options;

	return mount(src, target, fstype, mountflags, data);
}

static char *mnt_fsname(struct mount_info *mi)
{
	if (mi->fstype->code == FSTYPE__AUTO)
		return mi->fsname;
	return mi->fstype->name;
}

/*
 * Remount the mount behind @fd with the flags stored in @args.
 *
 * @args points to an int with the flags. The mount is addressed
 * through /proc/self/fd/@fd. If @pid is not the calling process,
 * the mount namespace of @pid is entered for the remount and left
 * again afterwards.
 *
 * Returns the mount() result, or -1 if a namespace switch fails.
 */
static int apply_sb_flags(void *args, int fd, pid_t pid)
{
	char path[PSFDS];
	long flags = *(int *) args;
	int saved_ns = -1, err;

	snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);

	if (pid != getpid()) {
		if (switch_ns(pid, &mnt_ns_desc, &saved_ns))
			return -1;
	}

	err = mount(NULL, path, NULL, MS_REMOUNT | flags, NULL);
	if (err)
		pr_perror("Unable to remount %s", path);

	/* Go back only if the namespace was actually switched */
	if (saved_ns >= 0) {
		if (restore_ns(saved_ns, &mnt_ns_desc))
			return -1;
	}

	return err;
}

static int do_new_mount(struct mount_info *mi)
{
	unsigned long sflags = mi->sb_flags;
	unsigned long mflags = mi->flags & (~MS_PROPAGATE);
	char *src;
	struct fstype *tp = mi->fstype;
	bool remount_ro = (tp->restore && mi->sb_flags & MS_RDONLY);
	mount_fn_t do_mount = (tp->mount) ? tp->mount : do_simple_mount;

	src = resolve_source(mi);
	if (!src)
		return -1;

	/* Merge superblock and mount flags if it's possible */
	if (!(mflags & ~MS_MNT_KNOWN_FLAGS) && !((sflags ^ mflags) & MS_RDONLY)) {
		sflags |= mflags;
		mflags = 0;
	}

	if (remount_ro)
		sflags &= ~MS_RDONLY;

	if (do_mount(mi, src, mnt_fsname(mi), sflags) < 0) {
		pr_perror("Can't mount at %s", mi->mountpoint);
		return -1;
	}

	if (tp->restore && tp->restore(mi))
		return -1;

	if (mi->mnt_id == CRTIME_MNT_ID) {
		/* C-r time mountpoint, umount it */
		if (umount(mi->mountpoint) < 0) {
			pr_perror("Can't umount %s", mi->mountpoint);
			return -1;
		}
		goto out;
	}

	if (!mi->is_ns_root && !mi->external && remount_ro) {
		int fd;

		fd = open(mi->mountpoint, O_PATH);
		if (fd < 0) {
			pr_perror("Unable to open %s", mi->mountpoint);
			return -1;
		}

		sflags |= MS_RDONLY;
		if (userns_call(apply_sb_flags, 0,
				&sflags, sizeof(int), fd)) {
			pr_perror("Unable to apply mount falgs %d for %s",
						mi->sb_flags, mi->mountpoint);
			close(fd);
			return -1;
		}
		close(fd);
	}

	if (mflags && mount(NULL, mi->mountpoint, NULL,
				MS_REMOUNT | MS_BIND | mflags, NULL)) {
		pr_perror("Unable to apply bind-mount options");
		return -1;
	}

	/*
	 * A slave should be mounted from do_bind_mount().
	 * Look at can_mount_now() for details.
	 */
	BUG_ON(mi->master_id);
	if (restore_shared_options(mi, !mi->shared_id, mi->shared_id, 0))
		return -1;
out:
	mi->mounted = true;

	return 0;
}

/*
 * Ask the plugins to restore the external mount @mi.
 * Returns the run_plugins() result, which is 0 on success.
 */
static int restore_ext_mount(struct mount_info *mi)
{
	int err;

	pr_debug("Restoring external bind mount %s\n", mi->mountpoint);

	err = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, mi->mountpoint, "/", NULL);
	if (err == 0)
		return 0;

	pr_err("Can't restore ext mount (%d)\n", err);
	return err;
}

/* Template for mkdtemp(); holds the real path once mount_clean_path() ran */
static char mnt_clean_path[] = "/tmp/cr-tmpfs.XXXXXX";

/*
 * Create the temporary directory mnt_clean_path, bind-mount it onto
 * itself and make that mount private.
 *
 * Returns 0 on success and -1 on failure.
 */
static int mount_clean_path()
{
	/*
	 * To make a bind mount, we need to have access to a source directory,
	 * which can be over-mounted. The idea is to mount a source mount in
	 * an intermediate place without MS_REC and then create a target mounts.
	 * This intermediate place should be a private mount to not affect
	 * properties of the source mount.
	 */
	if (mkdtemp(mnt_clean_path) == NULL)
		pr_perror("Unable to create a temporary directory");
	else if (mount(mnt_clean_path, mnt_clean_path, NULL, MS_BIND, NULL))
		pr_perror("Unable to mount tmpfs into %s", mnt_clean_path);
	else if (mount(NULL, mnt_clean_path, NULL, MS_PRIVATE, NULL))
		pr_perror("Unable to mark %s as private", mnt_clean_path);
	else
		return 0;

	return -1;
}

/*
 * Detach the mount at mnt_clean_path and remove the directory.
 *
 * Returns -1 if the umount fails. A failure to remove the directory
 * is only reported, the function still returns 0.
 */
static int umount_clean_path()
{
	int detached = umount2(mnt_clean_path, MNT_DETACH);

	if (detached != 0) {
		pr_perror("Unable to umount %s", mnt_clean_path);
		return -1;
	}

	if (rmdir(mnt_clean_path) != 0)
		pr_perror("Unable to remove %s", mnt_clean_path);

	return 0;
}

/*
 * Restore @mi as a bind mount.
 *
 * There are three flavours:
 *  - a plugin mount (mi->need_plugin) is handed over to the plugins;
 *  - an external mount (mi->external) is bound from the mapped path;
 *  - everything else is bound from mi->bind, using the part of
 *    mi->root which lies below mi->bind->root.
 *
 * If the source path inside mi->bind is hidden by an already mounted
 * child, mi->bind is first bind-mounted (without children) onto
 * mnt_clean_path and the source is taken from there. That temporary
 * mount is removed again on every way out through the "err" label.
 *
 * Returns 0 on success (mi->mounted is set) and -1 on failure.
 */
static int do_bind_mount(struct mount_info *mi)
{
	char mnt_fd_path[PSFDS];
	char *root, *cut_root, rpath[PATH_MAX];
	unsigned long mflags;
	int exit_code = -1, mp_len;
	bool shared = false;
	bool master = false;
	bool private = false;
	char *mnt_path = NULL;
	struct stat st;
	bool umount_mnt_path = false;
	struct mount_info *c;

	if (mi->need_plugin) {
		if (restore_ext_mount(mi))
			return -1;
		goto out;
	}

	if (mi->external) {
		/*
		 * We have / pointing to criu's ns root still,
		 * so just use the mapping's path. The mountpoint
		 * is tuned in collect_mnt_from_image to refer
		 * to proper location in the namespace we restore.
		 */
		root = mi->external;
		private = !mi->master_id && (mi->internal_sharing || !mi->shared_id);
		goto do_bind;
	}

	shared = mi->shared_id && mi->shared_id == mi->bind->shared_id;
	master = mi->master_id && mi->master_id == mi->bind->master_id;
	private = !mi->master_id && !shared;
	cut_root = cut_root_for_bind(mi->root, mi->bind->root);

	/* Mount private can be initialized on mount() callback, which is
	 * called only once.
	 * It has to be copied to all its sibling structures to provide users
	 * of it with actual data.
	 */
	mi->private = mi->bind->private;

	mnt_path = mi->bind->mountpoint;

	/* Access a mount by fd if mi->bind->mountpoint is overmounted */
	if (mi->bind->fd >= 0) {
		snprintf(mnt_fd_path, sizeof(mnt_fd_path),
					"/proc/self/fd/%d", mi->bind->fd);
		mnt_path = mnt_fd_path;
	}

	if (cut_root[0] == 0) /* This case is handled by mi->bind->fd */
		goto skip_overmount_check;

	/*
	 * The target path may be over-mounted by one of child mounts
	 * and we need to create a new bind-mount to get access to the path.
	 */
	mp_len = strlen(mi->bind->mountpoint);
	if (mp_len > 1) /* skip a joining / if mi->bind->mountpoint isn't "/" */
		mp_len++;

	list_for_each_entry(c, &mi->bind->children, siblings) {
		if (!c->mounted)
			continue;
		if (issubpath(cut_root, c->mountpoint + mp_len))
			break; /* a source path is overmounted */
	}

	/* The loop was left early, i.e. an overmounting child was found */
	if (&c->siblings != &mi->bind->children) {
		/* Get a copy of mi->bind without child mounts */
		if (mount(mnt_path, mnt_clean_path, NULL, MS_BIND, NULL)) {
			pr_perror("Unable to bind-mount %s to %s",
					mnt_path, mnt_clean_path);
			return -1;
		}
		mnt_path = mnt_clean_path;
		umount_mnt_path = true;
	}

	if (mnt_path == NULL)
		return -1;

skip_overmount_check:
	snprintf(rpath, sizeof(rpath), "%s/%s",
			mnt_path, cut_root);
	root = rpath;
do_bind:
	pr_info("\tBind %s to %s\n", root, mi->mountpoint);

	if (unlikely(mi->deleted)) {
		/*
		 * The source was deleted: re-create it with the type and
		 * mode of the mountpoint for the time of the bind mount.
		 */
		if (stat(mi->mountpoint, &st)) {
			pr_perror("Can't fetch stat on %s", mi->mountpoint);
			goto err;
		}

		if (S_ISDIR(st.st_mode)) {
			if (mkdir(root, (st.st_mode & ~S_IFMT))) {
				pr_perror("Can't re-create deleted directory %s", root);
				goto err;
			}
		} else if (S_ISREG(st.st_mode)) {
			int fd = open(root, O_WRONLY | O_CREAT | O_EXCL,
				      st.st_mode & ~S_IFMT);
			if (fd < 0) {
				pr_perror("Can't re-create deleted file %s", root);
				goto err;
			}
			close(fd);
		} else {
			pr_err("Unsupported st_mode 0%o deleted root %s\n",
			       (int)st.st_mode, root);
			goto err;
		}
	}

	if (mount(root, mi->mountpoint, NULL, MS_BIND | (mi->flags & MS_REC), NULL) < 0) {
		pr_perror("Can't mount at %s", mi->mountpoint);
		goto err;
	}

	/* A remount is needed only if the flags differ from the source's */
	mflags = mi->flags & (~MS_PROPAGATE);
	if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE)))
		if (mount(NULL, mi->mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) {
			pr_perror("Can't mount at %s", mi->mountpoint);
			goto err;
		}

	if (unlikely(mi->deleted)) {
		/* Delete the re-created source again */
		if (S_ISDIR(st.st_mode)) {
			if (rmdir(root)) {
				pr_perror("Can't remove deleted directory %s", root);
				goto err;
			}
		} else if (S_ISREG(st.st_mode)) {
			if (unlink(root)) {
				pr_perror("Can't unlink deleted file %s", root);
				goto err;
			}
		}
	}
out:
	/*
	 * shared - the mount is in the same shared group with mi->bind
	 * mi->shared_id && !shared - create a new shared group
	 *
	 * On failure leave through "err", so that a temporary mount on
	 * mnt_clean_path doesn't stay behind.
	 */
	if (restore_shared_options(mi, private,
	                           mi->shared_id && !shared,
	                           mi->master_id && !master))
		goto err;

	mi->mounted = true;
	exit_code = 0;
err:
	if (umount_mnt_path) {
		/*
		 * If mnt_path was shared, a new mount may be propagated
		 * into it.
		 */
		if (mount(NULL, mnt_path, NULL, MS_PRIVATE, NULL)) {
			pr_perror("Unable to make %s private", mnt_path);
			return -1;
		}
		if (umount2(mnt_path, MNT_DETACH)) {
			pr_perror("Unable to umount %s", mnt_path);
			return -1;
		}
	}
	return exit_code;
}

/*
 * Decide whether @mi can be mounted right now or has to be postponed
 * until other mounts are in place.
 *
 * Returns true if it can be mounted, false if it has to wait.
 */
static bool can_mount_now(struct mount_info *mi)
{
	/* The root mount */
	if (!mi->parent)
		return true;

	/* External mounts only depend on the state of the parent's group */
	if (mi->external)
		goto shared;

	/*
	 * We're the slave peer:
	 *   - Make sure the master peer is already mounted
	 *   - Make sure all children are mounted as well to
	 *     eliminate mount duplications
	 */
	if (mi->master_id > 0) {
		struct mount_info *c;

		if (mi->bind == NULL)
			return false;

		list_for_each_entry(c, &mi->bind->children, siblings) {
			if (!c->mounted)
				return false;
		}
	}

	/*
	 * A mount which isn't a root of its file system needs something
	 * to be bound from: a bind source, a plugin or an external mapping.
	 */
	if (!fsroot_mounted(mi) && (mi->bind == NULL && !mi->need_plugin && !mi->external))
		return false;

shared:
	if (mi->parent->shared_id) {
		struct mount_info *p = mi->parent, *n;

		if (mi->parent->shared_id == mi->shared_id) {
			/*
			 * Same group as the parent: wait only for the
			 * peers of the parent with a shorter root.
			 */
			int rlen = strlen(mi->root);
			list_for_each_entry(n, &p->mnt_share, mnt_share)
				if (strlen(n->root) < rlen && !n->mounted)
					return false;
		} else {
			/* Otherwise wait for all the peers of the parent */
			list_for_each_entry(n, &p->mnt_share, mnt_share)
				if (!n->mounted)
					return false;
		}
	}

	return true;
}

/*
 * Prepare the root mount @mi: restore its propagation settings and
 * fetch its restore-time device number.
 *
 * Returns 0 on success and -1 on failure.
 */
static int do_mount_root(struct mount_info *mi)
{
	bool want_private = !mi->shared_id && !mi->master_id;
	int err;

	err = restore_shared_options(mi, want_private,
					mi->shared_id, mi->master_id);
	if (err)
		return -1;

	return fetch_rt_stat(mi, mi->mountpoint);
}

/* Tree walk callback: close the fd kept in @mi. Always returns 0. */
static int do_close_one(struct mount_info *mi)
{
	int *fdp = &mi->fd;

	close_safe(fdp);
	return 0;
}

/*
 * Tree walk callback which mounts @mi.
 *
 * Returns:
 *   0 - the mount is in place (or was in place already);
 *   1 - the mount can't be done yet and has to be postponed;
 *  <0 - an error.
 */
static int do_mount_one(struct mount_info *mi)
{
	int ret;

	if (mi->mounted)
		return 0;

	if (!can_mount_now(mi)) {
		pr_debug("Postpone slave %s\n", mi->mountpoint);
		return 1;
	}

	/*
	 * @mi is going to cover its parent completely, so keep an fd
	 * on the parent to be able to reach it afterwards.
	 */
	if (mi->parent && !strcmp(mi->parent->mountpoint, mi->mountpoint)) {
		mi->parent->fd = open(mi->parent->mountpoint, O_PATH);
		if (mi->parent->fd < 0) {
			pr_perror("Unable to open %s", mi->mountpoint);
			return -1;
		}
	}

	pr_debug("\tMounting %s @%s (%d)\n", mi->fstype->name, mi->mountpoint, mi->need_plugin);

	if (!mi->parent) {
		/* do_mount_root() is called from populate_mnt_ns() */
		mi->mounted = true;
		ret = 0;
	} else if (!mi->bind && !mi->need_plugin && !mi->external)
		ret = do_new_mount(mi);
	else
		ret = do_bind_mount(mi);

	if (ret == 0 && fetch_rt_stat(mi, mi->mountpoint))
		return -1;

	if (ret == 0 && propagate_mount(mi))
		return -1;

	/*
	 * An unsupported file system which turns out to be btrfs gets
	 * the btrfs fstype. NOTE(review): this check is done regardless
	 * of the value of ret.
	 */
	if (mi->fstype->code == FSTYPE__UNSUPPORTED) {
		struct statfs st;

		if (statfs(mi->mountpoint, &st)) {
			pr_perror("Unable to statfs %s", mi->mountpoint);
			return -1;
		}
		if (st.f_type == BTRFS_SUPER_MAGIC)
			mi->fstype = find_fstype_by_name("btrfs");
	}

	return ret;
}

/*
 * Tree walk callback which umounts @mi.
 *
 * The root (a mount without a parent) is left alone. The parent's
 * mountpoint is recursively made private before the umount.
 * Returns 0 on success and -1 on failure.
 */
static int do_umount_one(struct mount_info *mi)
{
	struct mount_info *parent = mi->parent;

	if (parent == NULL)
		return 0;

	if (mount("none", parent->mountpoint, "none", MS_REC|MS_PRIVATE, NULL)) {
		pr_perror("Can't mark %s as private", parent->mountpoint);
		return -1;
	}

	if (umount(mi->mountpoint)) {
		pr_perror("Can't umount at %s", mi->mountpoint);
		return -1;
	}

	pr_info("Umounted at %s\n", mi->mountpoint);
	return 0;
}

/*
 * Make @root (or the current directory, if @root is NULL) the root of
 * the mount namespace with pivot_root().
 *
 * The old root is put on "tmp" inside the new root or, if there is no
 * such directory, on a temporary directory which is removed at the end.
 *
 * Returns 0 on success and -1 on failure.
 */
static int cr_pivot_root(char *root)
{
	char tmp_dir_tmpl[] = "crtools-put-root.XXXXXX";
	bool tmp_dir = false;
	char *put_root = "tmp";
	int exit_code = -1;
	struct stat st;

	pr_info("Move the root to %s\n", root ? : ".");

	if (root) {
		if (chdir(root)) {
			pr_perror("chdir(%s) failed", root);
			return -1;
		}
	}

	/* Paths below are relative to the new root */
	if (stat(put_root, &st) || !S_ISDIR(st.st_mode)) {
		put_root = mkdtemp(tmp_dir_tmpl);
		if (put_root == NULL) {
			pr_perror("Can't create a temporary directory");
			return -1;
		}
		tmp_dir = true;
	}

	/* Bind put_root onto itself and make this mount private */
	if (mount(put_root, put_root, NULL, MS_BIND, NULL)) {
		pr_perror("Unable to mount tmpfs in %s", put_root);
		goto err_root;
	}

	if (mount(NULL, put_root, NULL, MS_PRIVATE, NULL)) {
		pr_perror("Can't remount %s with MS_PRIVATE", put_root);
		goto err_tmpfs;
	}

	if (pivot_root(".", put_root)) {
		pr_perror("pivot_root(., %s) failed", put_root);
		goto err_tmpfs;
	}

	/* NOTE(review): the flag is MS_SLAVE, the message says MS_PRIVATE */
	if (mount("none", put_root, "none", MS_REC|MS_SLAVE, NULL)) {
		pr_perror("Can't remount root with MS_PRIVATE");
		return -1;
	}

	exit_code = 0;

	/*
	 * On success put_root is umounted twice: here and below at
	 * err_tmpfs. Presumably the first call detaches the old root
	 * which pivot_root() has put there and the second one removes
	 * the bind mount created above -- TODO confirm.
	 */
	if (umount2(put_root, MNT_DETACH)) {
		pr_perror("Can't umount %s", put_root);
		return -1;
	}

err_tmpfs:
	if (umount2(put_root, MNT_DETACH)) {
		pr_perror("Can't umount %s", put_root);
		return -1;
	}

err_root:
	if (tmp_dir && rmdir(put_root)) {
		pr_perror("Can't remove the directory %s", put_root);
		return -1;
	}

	return exit_code;
}

/*
 * Allocate a zeroed mount_info with fd set to -1 and all list heads
 * initialized. Returns NULL if the allocation fails.
 */
struct mount_info *mnt_entry_alloc()
{
	struct mount_info *mi;

	/*
	 * We rely on xzalloc here for MOUNT_INVALID_DEV.
	 */
	BUILD_BUG_ON(MOUNT_INVALID_DEV);

	mi = xzalloc(sizeof(*mi));
	if (!mi)
		return NULL;

	mi->fd = -1;

	INIT_LIST_HEAD(&mi->children);
	INIT_LIST_HEAD(&mi->siblings);
	INIT_LIST_HEAD(&mi->mnt_slave_list);
	INIT_LIST_HEAD(&mi->mnt_share);
	INIT_LIST_HEAD(&mi->mnt_bind);
	INIT_LIST_HEAD(&mi->postpone);

	return mi;
}

/*
 * Free @mi together with the strings it owns: root, mountpoint,
 * source, options and fsname. A NULL @mi is accepted.
 */
void mnt_entry_free(struct mount_info *mi)
{
	if (!mi)
		return;

	xfree(mi->root);
	xfree(mi->mountpoint);
	xfree(mi->source);
	xfree(mi->options);
	xfree(mi->fsname);

	xfree(mi);
}

/*
 * Helper for getting a path to where the namespace's root
 * is re-constructed: "<mnt_roots>/<ns id>".
 *
 * The path is printed into @buf of size @bs, the snprintf()
 * result is returned.
 */
static inline int print_ns_root(struct ns_id *ns, char *buf, int bs)
{
	int written;

	written = snprintf(buf, bs, "%s/%d", mnt_roots, ns->id);
	return written;
}

/*
 * Create the directory where the roots of mount namespaces are
 * re-constructed (the "roots yard") and remember its name in mnt_roots.
 *
 * The directory is created with mkdtemp() in opts.root (or in "/" if
 * no root is set); its name is relative to that directory. The current
 * working directory is preserved. Nothing is done if mnt_roots is set
 * already.
 *
 * Returns 0 on success and -1 on failure, in which case mnt_roots is
 * left NULL.
 */
static int create_mnt_roots(void)
{
	int exit_code = -1, cwd_fd;
	const char *root = opts.root ? : "/";

	if (mnt_roots)
		return 0;

	cwd_fd = open(".", O_DIRECTORY);
	if (cwd_fd < 0) {
		pr_perror("Unable to open cwd");
		return -1;
	}

	if (chdir(root)) {
		/* Print the directory actually used, opts.root may be NULL */
		pr_perror("Unable to change working directory on %s", root);
		goto out;
	}

	mnt_roots = xstrdup(".criu.mntns.XXXXXX");
	if (mnt_roots == NULL)
		goto out;

	if (mkdtemp(mnt_roots) == NULL) {
		pr_perror("Unable to create a temporary directory");
		/* Don't leak the template on failure */
		xfree(mnt_roots);
		mnt_roots = NULL;
		goto out;
	}

	exit_code = 0;
out:
	if (fchdir(cwd_fd)) {
		pr_perror("Unable to restore cwd");
		exit_code = -1;
	}
	close(cwd_fd);

	return exit_code;
}

/*
 * Register the mount namespace of the current process as a namespace
 * of type @typ, collect its mounts into mntinfo and mark the
 * namespace as populated.
 *
 * Returns 0 on success and -1 on failure.
 */
static int rst_collect_local_mntns(enum ns_type typ)
{
	struct ns_id *nsid = rst_new_ns_id(0, getpid(), &mnt_ns_desc, typ);

	if (nsid != NULL) {
		mntinfo = collect_mntinfo(nsid, false);
		if (mntinfo != NULL) {
			nsid->ns_populated = true;
			return 0;
		}
	}

	return -1;
}

/*
 * Fill in the root of @mi from the image entry @me.
 *
 * mi->root always gets a copy of me->root. For an external mount
 * mi->external is set in addition: to the user supplied mapping of
 * me->root or, with external mounts autodetection, to mi->source.
 *
 * Returns 0 on success and -1 on failure.
 */
static int get_mp_root(MntEntry *me, struct mount_info *mi)
{
	char *ext = NULL;

	mi->root = xstrdup(me->root);
	if (!mi->root)
		return -1;

	if (me->ext_mount) {
		/*
		 * External mount point -- get the reverse mapping
		 * from the command line and put into root's place
		 */
		ext = ext_mount_lookup(me->root);
		if (!ext) {
			if (!opts.autodetect_ext_mounts) {
				pr_err("No mapping for %s mountpoint\n", me->mountpoint);
				return -1;
			}

			/*
			 * Make up an external mount entry for this
			 * mount point, since we couldn't find a user
			 * supplied one.
			 *
			 * The 'val' was put into mi->source during
			 * dump by resolve_external_mounts().
			 */
			ext = mi->source;
		}

		mi->external = ext;
	}

	pr_debug("\t\tWill mount %d from %s%s\n",
			mi->mnt_id, ext ? : mi->root, ext ? " (E)" : "");
	return 0;
}

/*
 * Set mi->mountpoint to @root followed by @mountpoint, and
 * mi->ns_mountpoint to the @mountpoint part of that string.
 *
 * @root_len is the offset at which @mountpoint is placed.
 * Returns 0 on success and -1 if the allocation fails.
 */
static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root, int root_len)
{
	int len = strlen(mountpoint) + root_len + 1;
	char *full;

	full = xmalloc(len);
	mi->mountpoint = full;
	if (full == NULL)
		return -1;

	/*
	 * For bind-mounts we would also fix the root here
	 * too, but bind-mounts restore merges mountpoint
	 * and root paths together, so there's no need in
	 * that.
	 */

	strcpy(full, root);
	strcpy(full + root_len, mountpoint);

	mi->ns_mountpoint = full + root_len;

	pr_debug("\t\tWill mount %d @ %s\n", mi->mnt_id, mi->mountpoint);
	return 0;
}

/*
 * Read the mountpoints image of namespace @nsid and put a mount_info
 * for every entry at the head of the list *@pms.
 *
 * For namespaces of type NS_OTHER the mountpoints are prefixed with
 * the namespace's directory in the roots yard, for the others with ".".
 *
 * Elements which were linked into *@pms before a failure stay there.
 * Returns 0 on success and -1 on failure, including a failure to
 * read the image.
 */
static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid)
{
	MntEntry *me = NULL;
	int ret, root_len = 1, exit_code = -1;
	struct cr_img *img;
	char root[PATH_MAX] = ".";

	img = open_image(CR_FD_MNTS, O_RSTR, nsid->id);
	if (!img)
		return -1;

	if (nsid->type == NS_OTHER)
		root_len = print_ns_root(nsid, root, sizeof(root));

	pr_debug("Reading mountpoint images (id %d pid %d)\n",
		 nsid->id, (int)nsid->ns_pid);

	while (1) {
		struct mount_info *pm;

		ret = pb_read_one_eof(img, &me, PB_MNT);
		if (ret <= 0)
			break;

		pm = mnt_entry_alloc();
		if (!pm)
			goto err;

		/* Link it right away, so that the caller owns it on errors too */
		pm->nsid = nsid;
		pm->next = *pms;
		*pms = pm;

		pm->mnt_id		= me->mnt_id;
		pm->parent_mnt_id	= me->parent_mnt_id;
		pm->s_dev		= me->root_dev;
		pm->flags		= me->flags;
		pm->sb_flags		= me->sb_flags;
		if (!me->has_sb_flags) {
			const unsigned int mflags = MS_SHARED | MS_PRIVATE |
						MS_SLAVE | MS_UNBINDABLE |
						MS_NOSUID | MS_NODEV | MS_NOEXEC |
						MS_NOATIME | MS_NODIRATIME | MS_RELATIME;

			/*
			 * In old images mnt and sb flags are saved together.
			 * Here we separate them and save the old logic about MS_RDONLY.
			 */

			pm->sb_flags = pm->flags & ~mflags;
			pm->flags = pm->flags & mflags;
		}
		pm->shared_id		= me->shared_id;
		pm->master_id		= me->master_id;
		pm->need_plugin		= me->with_plugin;
		pm->deleted		= me->deleted;
		pm->is_ns_root		= is_root(me->mountpoint);
		if (me->has_internal_sharing)
			pm->internal_sharing = me->internal_sharing;

		pm->source = xstrdup(me->source);
		if (!pm->source)
			goto err;

		pm->options = xstrdup(me->options);
		if (!pm->options)
			goto err;

		if (me->fstype != FSTYPE__AUTO && me->fsname) {
			pr_err("fsname can be set only for FSTYPE__AUTO mounts\n");
			goto err;
		}

#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED
		if (me->fstype == FSTYPE__BINFMT_MISC)
			opts.has_binfmt_misc = true;
#endif

		/* FIXME: abort unsupported early */
		pm->fstype = decode_fstype(me->fstype);

		if (me->fsname) {
			pm->fsname = xstrdup(me->fsname);
			if (!pm->fsname)
				goto err;
		}

		if (get_mp_root(me, pm))
			goto err;

		if (get_mp_mountpoint(me->mountpoint, pm, root, root_len))
			goto err;

		pr_debug("\tRead %d mp @ %s\n", pm->mnt_id, pm->mountpoint);

		/*
		 * All the strings pm keeps are copies, so the entry can
		 * go before the next one is read into the same pointer.
		 * (The entry read last has always been freed while its
		 * mount_info stayed in use.)
		 */
		mnt_entry__free_unpacked(me, NULL);
		me = NULL;
	}

	/* Zero is the end of the image, a negative value is a read error */
	if (ret == 0)
		exit_code = 0;
err:
	if (me)
		mnt_entry__free_unpacked(me, NULL);

	close_image(img);

	return exit_code;
}

/*
 * Read the mountpoints images of all mount namespaces found in ns_ids
 * and store the resulting list in mntinfo.
 *
 * Returns 0 on success and -1 on failure; mntinfo is not touched on
 * failure.
 */
int read_mnt_ns_img(void)
{
	struct mount_info *collected = NULL;
	struct ns_id *nsid = ns_ids;

	while (nsid != NULL) {
		if (nsid->nd == &mnt_ns_desc &&
		    collect_mnt_from_image(&collected, nsid))
			return -1;

		nsid = nsid->next;
	}

	mntinfo = collected;
	return 0;
}

/*
 * Print into @path (of size @plen) the root under which the mount
 * @mnt_id is restored.
 *
 * It is the namespace's directory in the roots yard for mounts from
 * NS_OTHER namespaces, and "/" otherwise: when mount namespaces are
 * not restored, when @mnt_id is -1 or for other namespace types.
 *
 * Returns the length of the path or -1 if @mnt_id is unknown.
 */
int rst_get_mnt_root(int mnt_id, char *path, int plen)
{
	if ((root_ns_mask & CLONE_NEWNS) && mnt_id != -1) {
		struct mount_info *m = lookup_mnt_id(mnt_id);

		if (m == NULL)
			return -1;

		if (m->nsid->type == NS_OTHER)
			return print_ns_root(m->nsid, path, plen);
	}

	path[0] = '/';
	path[1] = '\0';
	return 1;
}

/*
 * Create the roots yard if mount namespaces are restored and there
 * is a mount namespace other than the root one.
 *
 * Returns 0 if nothing has to be done, the create_mnt_roots()
 * result otherwise.
 */
int mntns_maybe_create_roots(void)
{
	struct ns_id *ns;

	if (!(root_ns_mask & CLONE_NEWNS))
		return 0;

	for (ns = ns_ids; ns != NULL; ns = ns->next) {
		if (ns->nd != &mnt_ns_desc || ns->type == NS_ROOT)
			continue;

		BUG_ON(ns->type == NS_CRIU);

		/*
		 * If we have more than one (root) namespace,
		 * then we'll need the roots yard.
		 */
		return create_mnt_roots();
	}

	/* No "other" mntns found, just go ahead, we don't need roots yard. */
	return 0;
}

/*
 * Enter the mount namespace @nsid. The namespace is opened through
 * the fd table of the root task in /proc. @current is not used.
 *
 * Returns 0 on success and -1 on failure.
 */
static int do_restore_task_mnt_ns(struct ns_id *nsid, struct pstree_item *current)
{
	int fd, err;

	fd = open_proc(root_item->pid.virt, "fd/%d", nsid->mnt.ns_fd);
	if (fd < 0)
		return -1;

	err = setns(fd, CLONE_NEWNS);
	if (err)
		pr_perror("Can't restore mntns");

	close(fd);

	return err ? -1 : 0;
}

/*
 * Move the task @current into its mount namespace, if it differs
 * from the one of its parent.
 *
 * Returns 0 on success (also when nothing has to be done) and -1 on
 * failure.
 */
int restore_task_mnt_ns(struct pstree_item *current)
{
	struct ns_id *nsid;
	unsigned int id;

	if (!current->ids || !current->ids->has_mnt_ns_id)
		return 0;

	id = current->ids->mnt_ns_id;

	/*
	 * Regardless of the namespace a task wants to
	 * live in, by that point they all will live in
	 * root's one (see prepare_pstree_kobj_ids() +
	 * get_clone_mask()). So if the current task's
	 * target namespace is the root's one -- it's
	 * already there, otherwise it will have to do
	 * setns().
	 */
	if (!current->parent || id == current->parent->ids->mnt_ns_id)
		return 0;

	nsid = lookup_ns_by_id(id, &mnt_ns_desc);
	if (nsid == NULL) {
		pr_err("Can't find mount namespace %d\n", id);
		return -1;
	}

	BUG_ON(nsid->type == NS_CRIU);

	return do_restore_task_mnt_ns(nsid, current) ? -1 : 0;
}

/*
 * Finish the restore of mount namespaces: close the namespace fds
 * (and the root fds of non-root namespaces) and mark every mount
 * namespace as populated. Does nothing if mount namespaces are not
 * restored.
 */
void fini_restore_mntns(void)
{
	struct ns_id *nsid;

	if (!(root_ns_mask & CLONE_NEWNS))
		return;

	nsid = ns_ids;
	while (nsid != NULL) {
		if (nsid->nd == &mnt_ns_desc) {
			close_safe(&nsid->mnt.ns_fd);
			if (nsid->type != NS_ROOT)
				close_safe(&nsid->mnt.root_fd);
			nsid->ns_populated = true;
		}
		nsid = nsid->next;
	}
}

/*
 * All nested mount namespaces are restored as sub-trees of the root
 * namespace.
 *
 * Set up the roots yard at mnt_roots and create a directory in it
 * for every mount namespace. Nothing is done without a roots yard.
 * Returns 0 on success and -1 on failure.
 */
static int populate_roots_yard(void)
{
	struct ns_id *nsid;
	char path[PATH_MAX];

	if (!mnt_roots)
		return 0;

	if (make_yard(mnt_roots))
		return -1;

	nsid = ns_ids;
	while (nsid != NULL) {
		if (nsid->nd == &mnt_ns_desc) {
			print_ns_root(nsid, path, sizeof(path));
			if (mkdir(path, 0600)) {
				pr_perror("Unable to create %s", path);
				return -1;
			}
		}
		nsid = nsid->next;
	}

	return 0;
}

/*
 * Restore all the mounts collected in mntinfo.
 *
 * Builds one tree from the mounts of all the namespaces (with the
 * roots yard as an additional element, if there is one), resolves the
 * sharing, validates the mounts, and then mounts everything with
 * do_mount_one().
 *
 * Returns 0 on success and non-zero on failure.
 */
static int populate_mnt_ns(void)
{
	struct mount_info *pms;
	struct ns_id *nsid;
	struct mount_info *roots_mp = NULL;
	int ret;

	if (mnt_roots) {
		/* mnt_roots is a tmpfs mount and it's private */
		roots_mp = mnt_entry_alloc();
		if (!roots_mp)
			return -1;

		roots_mp->mountpoint = mnt_roots;
		roots_mp->mounted = true;
	}

	pms = mnt_build_tree(mntinfo, roots_mp);
	if (!pms)
		return -1;

#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED
	if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) {
		/* Add to mount tree. Generic code will mount it later */
		ret = add_cr_time_mount(pms, "binfmt_misc", BINFMT_MISC_HOME, 0);
		if (ret)
			return -1;
	}
#endif

	if (resolve_shared_mounts(mntinfo, pms->master_id))
		return -1;

	for (nsid = ns_ids; nsid; nsid = nsid->next) {
		if (nsid->nd != &mnt_ns_desc)
			continue;

		/*
		 * Make trees of all namespaces look the
		 * same, so that manual paths resolution
		 * works on them.
		 */
		nsid->mnt.mntinfo_tree = pms;
	}

	if (validate_mounts(mntinfo, false))
		return -1;

	/*
	 * Set properties for the root before mounting a root yard,
	 * otherwise the root yard can be propagated into the host
	 * mntns and remain there.
	 */
	if (do_mount_root(pms))
		return -1;

	if (populate_roots_yard())
		return -1;

	if (mount_clean_path())
		return -1;

	/*
	 * The fds opened while mounting are closed and the clean path
	 * is umounted regardless of whether the mounting succeeded.
	 */
	ret = mnt_tree_for_each(pms, do_mount_one);
	mnt_tree_for_each(pms, do_close_one);

	if (umount_clean_path())
		return -1;
	return ret;
}

/*
 * Remove the roots yard: make it recursively private, detach it and
 * remove its directory. Nothing is done without a roots yard.
 *
 * All the steps are tried even if an earlier one fails. Returns 0 on
 * success and non-zero if any step failed.
 */
static int __depopulate_roots_yard(void)
{
	int status = 0;

	if (!mnt_roots)
		return 0;

	if (mount("none", mnt_roots, "none", MS_REC|MS_PRIVATE, NULL) != 0) {
		pr_perror("Can't remount root with MS_PRIVATE");
		status = 1;
	}

	/*
	 * Don't exit after a first error, because this function
	 * can be used to rollback in a error case.
	 * Don't worry about MNT_DETACH, because files are restored after this
	 * and nobody will not be restored from a wrong mount namespace.
	 */
	if (umount2(mnt_roots, MNT_DETACH) != 0) {
		pr_perror("Can't unmount %s", mnt_roots);
		status = -1;
	}

	if (rmdir(mnt_roots) != 0) {
		pr_perror("Can't remove the directory %s", mnt_roots);
		status = -1;
	}

	return status;
}

/*
 * Remove the roots yard, optionally cleaning remaps first.
 *
 * @mntns_fd:     descriptor of the mount namespace holding the roots
 *                yard; a negative value means there is no namespace
 *                to enter, in which case only cleanup_mnt_ns() runs.
 * @clean_remaps: call try_clean_remaps() before the cleanup.
 *
 * With a valid @mntns_fd the caller's mount namespace and working
 * directory are saved, the cleanup is done inside @mntns_fd, and both
 * are restored afterwards.
 *
 * Returns 0 on success, -1 if any step failed.
 */
int depopulate_roots_yard(int mntns_fd, bool clean_remaps)
{
	int status = 0, cwd_fd, orig_ns_fd;

	if (mntns_fd < 0) {
		if (clean_remaps)
			try_clean_remaps();
		cleanup_mnt_ns();
		return 0;
	}

	pr_info("Switching to new ns to clean ghosts\n");

	/* Remember where we are so that we can come back later */
	cwd_fd = open(".", O_PATH);
	if (cwd_fd < 0) {
		pr_perror("Unable to open cwd");
		return -1;
	}

	orig_ns_fd = open_proc(PROC_SELF, "ns/mnt");
	if (orig_ns_fd < 0) {
		pr_perror("`- Can't keep old ns");
		goto drop_cwd;
	}

	if (setns(mntns_fd, CLONE_NEWNS) < 0) {
		pr_perror("`- Can't switch");
		goto drop_ns;
	}

	if (clean_remaps)
		try_clean_remaps();

	if (__depopulate_roots_yard() != 0)
		status = -1;

	/* Go back to the namespace and directory we started from */
	if (setns(orig_ns_fd, CLONE_NEWNS) < 0) {
		pr_perror("Fail to switch back!");
		status = -1;
	}
	close(orig_ns_fd);

	if (fchdir(cwd_fd) != 0) {
		pr_perror("Unable to restore cwd");
		status = -1;
	}
	close(cwd_fd);

	return status;

drop_ns:
	close(orig_ns_fd);
drop_cwd:
	close(cwd_fd);
	return -1;
}

/*
 * Remove the roots yard directory as seen from criu's own mount
 * namespace, i.e. "<opts.root or />/<mnt_roots>".
 *
 * Does nothing if no roots yard was created (mnt_roots is NULL).
 * Failures are only logged; there is no return value.
 */
void cleanup_mnt_ns(void)
{
	char path[PATH_MAX], *root = opts.root ? : "/";
	int len;

	if (mnt_roots == NULL)
		return;

	len = snprintf(path, sizeof(path), "%s/%s", root, mnt_roots);
	if (len < 0 || (size_t)len >= sizeof(path)) {
		/* Never call rmdir() on a truncated (i.e. different) path */
		pr_err("The path %s/%s is too long\n", root, mnt_roots);
		return;
	}

	/* Report the path that was actually passed to rmdir() */
	if (rmdir(path))
		pr_perror("Can't remove the directory %s", path);
}

/*
 * Restore the mount namespaces of the tasks being restored.
 *
 * If mount namespaces are not part of root_ns_mask, this only calls
 * rst_collect_local_mntns(NS_CRIU). Otherwise it:
 *  1. collects the mounts currently visible to criu;
 *  2. either unmounts them all (no opts.root) or prepares opts.root
 *     to become the new root;
 *  3. calls populate_mnt_ns() and, with opts.root, cr_pivot_root(NULL);
 *  4. walks ns_ids and, for every mount namespace entry, pins the
 *     namespace with a file descriptor, creating the non-root ones
 *     with unshare() + cr_pivot_root().
 *
 * Returns 0 on success and -1 on failure.
 */
int prepare_mnt_ns(void)
{
	int ret = -1, rst = -1;
	struct mount_info *old;
	/* Describes criu's own (current) mount namespace */
	struct ns_id ns = { .type = NS_CRIU, .ns_pid = PROC_SELF, .nd = &mnt_ns_desc };
	struct ns_id *nsid;

	if (!(root_ns_mask & CLONE_NEWNS))
		return rst_collect_local_mntns(NS_CRIU);

	pr_info("Restoring mount namespace\n");

	/*
	 * NOTE(review): the error returns between this call and
	 * free_mntinfo(old) below do not release 'old'.
	 */
	old = collect_mntinfo(&ns, false);
	if (old == NULL)
		return -1;

	if (!opts.root) {
		if (chdir("/")) {
			pr_perror("chdir(\"/\") failed");
			return -1;
		}

		/*
		 * The new mount namespace is filled with the mountpoint
		 * clones from the original one. We have to umount them
		 * prior to recreating new ones.
		 */
		pr_info("Cleaning mount namespace\n");
		if (mnt_tree_for_each_reverse(ns.mnt.mntinfo_tree, do_umount_one))
			return -1;
	} else {
		struct mount_info *mi;
		/* NB: shadows the function-level 'int ret' inside this branch */
		char *ret;
		char path[PATH_MAX];

		/*
		 * The whole tree of mountpoints is to be moved into one
		 * place with the pivot_root() call. Don't do manual
		 * umount (as we do above), all this stuff will go away
		 * with a single umount call later.
		 */

		ret = realpath(opts.root, path);
		if (!ret) {
			pr_err("Unable to find real path for %s\n", opts.root);
			return -1;
		}

		/* moving a mount residing under a shared mount is invalid. */
		mi = mount_resolve_path(ns.mnt.mntinfo_tree, path);
		if (mi == NULL) {
			pr_err("Unable to find mount point for %s\n", opts.root);
			return -1;
		}
		if (mi->parent == NULL) {
			pr_err("New root and old root are the same\n");
			return -1;
		}

		/* Our root is mounted over the parent (in the same directory) */
		if (!strcmp(mi->parent->mountpoint, mi->mountpoint)) {
			pr_err("The parent of the new root is unreachable\n");
			return -1;
		}

		/*
		 * NOTE(review): the "+ 1" skips the first character of the
		 * stored mountpoint -- presumably a leading '.'; confirm
		 * against how mountpoints are recorded.
		 */
		if (mount("none", mi->parent->mountpoint + 1, "none", MS_SLAVE, NULL)) {
			pr_perror("Can't remount the parent of the new root with MS_SLAVE");
			return -1;
		}

		/* Unprivileged users can't reveal what is under a mount */
		if (root_ns_mask & CLONE_NEWUSER) {
			if (mount(opts.root, opts.root, NULL, MS_BIND | MS_REC, NULL)) {
				pr_perror("Can't remount bind-mount %s into itself", opts.root);
				return -1;
			}
		}
		if (chdir(opts.root)) {
			pr_perror("chdir(%s) failed", opts.root ? : "/");
			return -1;
		}
	}

	free_mntinfo(old);

	ret = populate_mnt_ns();
	if (!ret && opts.root)
		ret = cr_pivot_root(NULL);
	if (ret)
		return -1;

	/* From here on ret == 0; it is the value returned on success */

	/* Keep a handle on the just populated namespace to come back to it */
	rst = open_proc(PROC_SELF, "ns/mnt");
	if (rst < 0)
		return -1;

	/* restore non-root namespaces */
	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
		char path[PATH_MAX];

		/* Only mount namespace entries are of interest here */
		if (nsid->nd != &mnt_ns_desc)
			continue;
		if (nsid->type == NS_ROOT) {
			/* Pin one with a file descriptor */
			nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
			if (nsid->mnt.ns_fd < 0)
				goto err;
			/* we set ns_populated so we don't need to open root_fd */
			nsid->ns_populated = true;
			continue;
		}

		/* Create the new mount namespace */
		if (unshare(CLONE_NEWNS)) {
			pr_perror("Unable to create a new mntns");
			goto err;
		}

		/* Set its root: '/' followed by whatever print_ns_root() emits */
		path[0] = '/';
		print_ns_root(nsid, path + 1, sizeof(path) - 1);
		if (cr_pivot_root(path))
			goto err;

		/* Pin one with a file descriptor */
		nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
		if (nsid->mnt.ns_fd < 0)
			goto err;

		/* root_fd is used to restore file mappings */
		nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
		if (nsid->mnt.root_fd < 0)
			goto err;

		/* And return back to regain the access to the roots yard */
		if (setns(rst, CLONE_NEWNS)) {
			pr_perror("Can't restore mntns back");
			goto err;
		}
	}
	close(rst);

	return ret;
err:
	/*
	 * NOTE(review): restore_ns() presumably switches back to 'rst'
	 * and closes it -- confirm. Descriptors already stored in
	 * nsid->mnt.* are left open on this path.
	 */
	if (rst >= 0)
		restore_ns(rst, &mnt_ns_desc);
	return -1;
}

/* Pid whose root is currently installed at ROOT_FD_OFF, -1 if none */
static int mntns_root_pid = -1;

/*
 * Install @fd as the ROOT_FD_OFF service descriptor and, on success,
 * remember that it belongs to @pid.
 *
 * @fd is always closed, whether or not the installation succeeded.
 * Returns the result of install_service_fd(); a negative value is
 * treated as failure.
 */
static int mntns_set_root_fd(pid_t pid, int fd)
{
	int sfd = install_service_fd(ROOT_FD_OFF, fd);

	if (sfd >= 0)
		mntns_root_pid = pid;

	/* The caller's descriptor is no longer needed */
	close(fd);
	return sfd;
}

/*
 * Get a descriptor for the root directory of the mount namespace
 * that task @pid lives in, via the ROOT_FD_OFF service descriptor.
 *
 * The result is cached: if the root of @pid is already installed,
 * the existing service descriptor is returned. Otherwise the old one
 * is closed and a new root is opened and installed.
 *
 * Returns the value of mntns_set_root_fd() / get_service_fd() on
 * success, and a negative value on failure.
 */
int __mntns_get_root_fd(pid_t pid)
{

	int fd, pfd;
	int ret;
	char path[PATH_MAX + 1];

	if (mntns_root_pid == pid) /* The required root is already opened */
		return get_service_fd(ROOT_FD_OFF);

	close_service_fd(ROOT_FD_OFF);
	/*
	 * The cached root is gone now. Forget its owner, otherwise a
	 * failure below would make the next call for the old pid
	 * return a descriptor that has already been closed.
	 */
	mntns_root_pid = -1;

	if (!(root_ns_mask & CLONE_NEWNS)) {
		/*
		 * If criu and tasks we dump live in the same mount
		 * namespace, we can just open the root directory.
		 * All paths resolution would occur relative to criu's
		 * root. Even if it is not namespace's root, provided
		 * file paths are resolved, we'd get consistent dump.
		 */
		fd = open("/", O_RDONLY | O_DIRECTORY);
		if (fd < 0) {
			pr_perror("Can't open root");
			return -1;
		}

		goto set_root;
	}

	/*
	 * If /proc/pid/root links on '/', it signs that a root of the task
	 * and a root of mntns is the same.
	 */

	pfd = open_pid_proc(pid);
	if (pfd < 0)
		/* Don't hand an invalid directory fd to the *at() calls */
		return -1;

	ret = readlinkat(pfd, "root", path, sizeof(path) - 1);
	if (ret < 0) {
		pr_perror("Can't read the root link of the task %d", pid);
		close_pid_proc();
		return -1;
	}

	/* readlinkat() doesn't terminate the string */
	path[ret] = '\0';

	if (ret != 1 || path[0] != '/') {
		pr_err("The root task has another root than mntns: %s\n", path);
		close_pid_proc();
		return -1;
	}

	fd = openat(pfd, "root", O_RDONLY | O_DIRECTORY, 0);
	close_pid_proc();
	if (fd < 0) {
		pr_perror("Can't open the task root");
		return -1;
	}

set_root:
	return mntns_set_root_fd(pid, fd);
}

/*
 * Get a descriptor for the root of the mount namespace @mntns.
 *
 * For a populated namespace the root is obtained through a task
 * living in it (mntns->ns_pid). For a namespace that is not populated
 * yet, the root descriptor kept by the root task is reopened via
 * its /proc "fd/<root_fd>" entry.
 *
 * Returns the root descriptor, or a negative value on failure.
 */
int mntns_get_root_fd(struct ns_id *mntns)
{
	int root_fd;

	if (mntns->ns_populated)
		return __mntns_get_root_fd(mntns->ns_pid);

	/*
	 * Background: all namespaces are restored from the root task.
	 * During the CR_STATE_FORKING stage it holds two descriptors
	 * per mntns: one for the namespace itself and one for its root.
	 *
	 * A forked non-root task enters its mount namespace, restores
	 * private mappings and forks children; some of those mappings
	 * may refer to files from other namespaces.
	 *
	 * Once CR_STATE_FORKING is over, the root task closes all mntns
	 * descriptors to restore its own ones, and by then every task
	 * already lives in its mount namespace.
	 *
	 * So a namespace that is not populated yet can only be reached
	 * through the root task's descriptor.
	 */
	root_fd = open_proc(root_item->pid.virt, "fd/%d", mntns->mnt.root_fd);
	if (root_fd < 0)
		return -1;

	return mntns_set_root_fd(mntns->ns_pid, root_fd);
}

/*
 * Find the mount namespace that the mount @mnt_id belongs to.
 *
 * An @mnt_id of -1 selects the head of the global mntinfo list.
 * Returns NULL when no matching mount is found.
 */
struct ns_id *lookup_nsid_by_mnt_id(int mnt_id)
{
	struct mount_info *found;

	/*
	 * Kernels before 3.15 don't show mnt_id for file descriptors,
	 * and mnt_id isn't saved for files when the mntns isn't dumped.
	 * In both cases there is only one root, so it doesn't matter
	 * which mount gets picked.
	 */
	found = (mnt_id == -1) ? mntinfo : lookup_mnt_id(mnt_id);
	if (!found)
		return NULL;

	return found->nsid;
}

/*
 * Get the root descriptor of the mount namespace owning @mnt_id.
 * It is a bug if no namespace can be found for @mnt_id.
 */
int mntns_get_root_by_mnt_id(int mnt_id)
{
	struct ns_id *owner = lookup_nsid_by_mnt_id(mnt_id);

	BUG_ON(owner == NULL);
	return mntns_get_root_fd(owner);
}

/* State shared between collect_mnt_namespaces() and collect_mntns() */
struct collect_mntns_arg {
	/* out: set once a non-NS_CRIU namespace is collected for dump */
	bool need_to_validate;
	/* in: forwarded to collect_mntinfo() */
	bool for_dump;
	/* out: master_id of the mount tree of the root task's mntns */
	int root_master_id;
};

/*
 * walk_namespaces() callback: collect the mounts of one namespace.
 *
 * @ns:    the namespace to collect.
 * @__arg: a struct collect_mntns_arg, updated in place.
 *
 * Returns 0 on success, -1 if the mounts can't be collected.
 */
static int collect_mntns(struct ns_id *ns, void *__arg)
{
	struct collect_mntns_arg *ctx = __arg;
	struct mount_info *tree = collect_mntinfo(ns, ctx->for_dump);

	if (tree == NULL)
		return -1;

	/* Anything but criu's own namespace has to be validated on dump */
	if (ctx->for_dump && ns->type != NS_CRIU)
		ctx->need_to_validate = true;

	mntinfo_add_list(tree);

	if (!ctx->need_to_validate)
		return 0;

	/* Remember the master of the root task's mount tree */
	if (ns->id == root_item->ids->mnt_ns_id)
		ctx->root_master_id = ns->mnt.mntinfo_tree->master_id;

	return 0;
}

int collect_mnt_namespaces(bool for_dump)
{
	struct collect_mntns_arg arg;
	int ret;

	arg.for_dump = for_dump;
	arg.need_to_validate = false;

	ret = walk_namespaces(&mnt_ns_desc, collect_mntns, &arg);
	if (ret)
		goto err;

#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED
	if (for_dump && !opts.has_binfmt_misc) {
		unsigned int s_dev = 0;
		struct ns_id *ns;

		for (ns = ns_ids; ns != NULL; ns = ns->next) {
			if (ns->type == NS_ROOT && ns->nd == &mnt_ns_desc)
				break;
		}

		if (ns) {
			ret = mount_cr_time_mount(ns, &s_dev, "binfmt_misc", BINFMT_MISC_HOME,
						  "binfmt_misc");
			if (ret == -EPERM)
				pr_info("Can't mount binfmt_misc: EPERM. Running in user_ns?\n");
			else if (ret < 0 && ret != -EBUSY && ret != -ENODEV && ret != -ENOENT) {
				pr_err("Can't mount binfmt_misc: %d %s", ret, strerror(-ret));
				goto err;
			} else if (ret == 0) {
				ret = -1;
				goto err;
			} else if (ret > 0 && add_cr_time_mount(ns->mnt.mntinfo_tree, "binfmt_misc",
								BINFMT_MISC_HOME, s_dev) < 0) {
				ret = -1;
				goto err;
			}
		}
	}
#endif

	ret = resolve_external_mounts(mntinfo);
	if (ret)
		goto err;

	if (arg.need_to_validate) {
		ret = -1;

		if (resolve_shared_mounts(mntinfo, arg.root_master_id))
			goto err;
		if (validate_mounts(mntinfo, true))
			goto err;
	}

	ret = 0;
err:
	return ret;
}

/*
 * Dump every collected mount namespace except criu's own one.
 *
 * Nothing is dumped when mount namespaces are not in root_ns_mask.
 * Returns 0 on success, -1 on failure.
 */
int dump_mnt_namespaces(void)
{
	struct ns_id *cur;

	if ((root_ns_mask & CLONE_NEWNS) == 0)
		return 0;

	for (cur = ns_ids; cur; cur = cur->next) {
		if (cur->nd != &mnt_ns_desc)
			continue;
		if (cur->type == NS_CRIU)
			continue;

		/* Nested namespaces can't be handled without mnt_id */
		if (cur->type == NS_OTHER) {
			if (check_mnt_id()) {
				pr_err("Nested mount namespaces are not supported "
					"without mnt_id in fdinfo\n");
				return -1;
			}
		}

		if (dump_mnt_ns(cur, cur->mnt.mntinfo_list))
			return -1;
	}

	return 0;
}

void clean_cr_time_mounts(void)
{
	struct mount_info *mi;
	int mnt_fd, ret;

	for (mi = mntinfo; mi; mi = mi->next) {
		if (mi->mnt_id != CRTIME_MNT_ID)
			continue;
		ret = switch_ns(mi->nsid->ns_pid, &mnt_ns_desc, &mnt_fd);
		if (ret) {
			pr_err("Can't switch to pid's %u mnt_ns\n", mi->nsid->ns_pid);
			continue;
		}

		if (umount(mi->mountpoint) < 0)
			pr_perror("Can't umount forced mount %s", mi->mountpoint);

		if (restore_ns(mnt_fd, &mnt_ns_desc)) {
			pr_err("cleanup_forced_mounts exiting with wrong mnt_ns\n");
			return;
		}
	}
}

/* Descriptor of the mount namespace kind: CLONE_NEWNS flag, "mnt" name */
struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");
