From 49cb2fc42ce4b7a656ee605e30c302efaa39c1a7 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 15 Nov 2019 13:36:20 +0100 Subject: [PATCH] fork: extend clone3() to support setting a PID The main motivation to add set_tid to clone3() is CRIU. To restore a process with the same PID/TID CRIU currently uses /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to ns_last_pid and then (quickly) does a clone(). This works most of the time, but it is racy. It is also slow as it requires multiple syscalls. Extending clone3() to support *set_tid makes it possible restore a process using CRIU without accessing /proc/sys/kernel/ns_last_pid and race free (as long as the desired PID/TID is available). This clone3() extension places the same restrictions (CAP_SYS_ADMIN) on clone3() with *set_tid as they are currently in place for ns_last_pid. The original version of this change was using a single value for set_tid. At the 2019 LPC, after presenting set_tid, it was, however, decided to change set_tid to an array to enable setting the PID of a process in multiple PID namespaces at the same time. If a process is created in a PID namespace it is possible to influence the PID inside and outside of the PID namespace. Details also in the corresponding selftest. To create a process with the following PIDs: PID NS level Requested PID 0 (host) 31496 1 42 2 1 For that example the two newly introduced parameters to struct clone_args (set_tid and set_tid_size) would need to be: set_tid[0] = 1; set_tid[1] = 42; set_tid[2] = 31496; set_tid_size = 3; If only the PIDs of the two innermost nested PID namespaces should be defined it would look like this: set_tid[0] = 1; set_tid[1] = 42; set_tid_size = 2; The PID of the newly created process would then be the next available free PID in the PID namespace level 0 (host) and 42 in the PID namespace at level 1 and the PID of the process in the innermost PID namespace would be 1. The set_tid array is used to specify the PID of a process starting from the innermost nested PID namespaces up to set_tid_size PID namespaces. set_tid_size cannot be larger then the current PID namespace level. Signed-off-by: Adrian Reber Reviewed-by: Christian Brauner Reviewed-by: Oleg Nesterov Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Acked-by: Andrei Vagin Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com Signed-off-by: Christian Brauner --- include/linux/pid.h | 3 +- include/linux/pid_namespace.h | 2 + include/linux/sched/task.h | 3 ++ include/uapi/linux/sched.h | 53 +++++++++++++++++--------- kernel/fork.c | 24 +++++++++++- kernel/pid.c | 70 ++++++++++++++++++++++++++++------- kernel/pid_namespace.c | 2 - 7 files changed, 121 insertions(+), 36 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index 034e3cd60dc0..998ae7d24450 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -124,7 +124,8 @@ extern struct pid *find_vpid(int nr); extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); -extern struct pid *alloc_pid(struct pid_namespace *ns); +extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, + size_t set_tid_size); extern void free_pid(struct pid *pid); extern void disable_pid_allocation(struct pid_namespace *ns); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 49538b172483..2ed6af88794b 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -12,6 +12,8 @@ #include #include +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 struct fs_pin; diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 4b1c3b664f51..f1879884238e 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -26,6 +26,9 @@ struct kernel_clone_args { unsigned long stack; unsigned long stack_size; unsigned long tls; + pid_t *set_tid; + /* Number of elements in *set_tid */ + size_t set_tid_size; }; /* diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 1d500ed03c63..a0b1c224c72b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -39,24 +39,38 @@ #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall - * @flags: Flags for the new process as listed above. - * All flags are valid except for CSIGNAL and - * CLONE_DETACHED. - * @pidfd: If CLONE_PIDFD is set, a pidfd will be - * returned in this argument. - * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the - * child process will be returned in the child's - * memory. - * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of - * the child process will be returned in the - * parent's memory. - * @exit_signal: The exit_signal the parent process will be - * sent when the child exits. - * @stack: Specify the location of the stack for the - * child process. - * @stack_size: The size of the stack for the child process. - * @tls: If CLONE_SETTLS is set, the tls descriptor - * is set to tls. + * @flags: Flags for the new process as listed above. + * All flags are valid except for CSIGNAL and + * CLONE_DETACHED. + * @pidfd: If CLONE_PIDFD is set, a pidfd will be + * returned in this argument. + * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the + * child process will be returned in the child's + * memory. + * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of + * the child process will be returned in the + * parent's memory. + * @exit_signal: The exit_signal the parent process will be + * sent when the child exits. + * @stack: Specify the location of the stack for the + * child process. + * @stack_size: The size of the stack for the child process. + * @tls: If CLONE_SETTLS is set, the tls descriptor + * is set to tls. + * @set_tid: Pointer to an array of type *pid_t. The size + * of the array is defined using @set_tid_size. + * This array is used to select PIDs/TIDs for + * newly created processes. The first element in + * this defines the PID in the most nested PID + * namespace. Each additional element in the array + * defines the PID in the parent PID namespace of + * the original PID namespace. If the array has + * less entries than the number of currently + * nested PID namespaces only the PIDs in the + * corresponding namespaces are set. + * @set_tid_size: This defines the size of the array referenced + * in @set_tid. This cannot be larger than the + * kernel's limit of nested PID namespaces. * * The structure is versioned by size and thus extensible. * New struct members must go at the end of the struct and @@ -71,10 +85,13 @@ struct clone_args { __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; }; #endif #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ /* * Scheduling policies diff --git a/kernel/fork.c b/kernel/fork.c index 954e875e72b1..417570263f1f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2087,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process( stackleak_task_init(p); if (pid != &init_struct_pid) { - pid = alloc_pid(p->nsproxy->pid_ns_for_children); + pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, + args->set_tid_size); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; @@ -2590,6 +2591,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, { int err; struct clone_args args; + pid_t *kset_tid = kargs->set_tid; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; @@ -2600,6 +2602,15 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, if (err) return err; + if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) + return -EINVAL; + + if (unlikely(!args.set_tid && args.set_tid_size > 0)) + return -EINVAL; + + if (unlikely(args.set_tid && args.set_tid_size == 0)) + return -EINVAL; + /* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal @@ -2617,8 +2628,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, .stack = args.stack, .stack_size = args.stack_size, .tls = args.tls, + .set_tid_size = args.set_tid_size, }; + if (args.set_tid && + copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), + (kargs->set_tid_size * sizeof(pid_t)))) + return -EFAULT; + + kargs->set_tid = kset_tid; + return 0; } @@ -2662,6 +2681,9 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) int err; struct kernel_clone_args kargs; + pid_t set_tid[MAX_PID_NS_LEVEL]; + + kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); if (err) diff --git a/kernel/pid.c b/kernel/pid.c index 7b5f6c963d72..2278e249141d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -157,7 +157,8 @@ void free_pid(struct pid *pid) call_rcu(&pid->rcu, delayed_put_pid); } -struct pid *alloc_pid(struct pid_namespace *ns) +struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, + size_t set_tid_size) { struct pid *pid; enum pid_type type; @@ -166,6 +167,17 @@ struct pid *alloc_pid(struct pid_namespace *ns) struct upid *upid; int retval = -ENOMEM; + /* + * set_tid_size contains the size of the set_tid array. Starting at + * the most nested currently active PID namespace it tells alloc_pid() + * which PID to set for a process in that most nested PID namespace + * up to set_tid_size PID namespaces. It does not have to set the PID + * for a process in all nested PID namespaces but set_tid_size must + * never be greater than the current ns->level + 1. + */ + if (set_tid_size > ns->level + 1) + return ERR_PTR(-EINVAL); + pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); @@ -174,24 +186,54 @@ struct pid *alloc_pid(struct pid_namespace *ns) pid->level = ns->level; for (i = ns->level; i >= 0; i--) { - int pid_min = 1; + int tid = 0; + + if (set_tid_size) { + tid = set_tid[ns->level - i]; + + retval = -EINVAL; + if (tid < 1 || tid >= pid_max) + goto out_free; + /* + * Also fail if a PID != 1 is requested and + * no PID 1 exists. + */ + if (tid != 1 && !tmp->child_reaper) + goto out_free; + retval = -EPERM; + if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN)) + goto out_free; + set_tid_size--; + } idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); - /* - * init really needs pid 1, but after reaching the maximum - * wrap back to RESERVED_PIDS - */ - if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) - pid_min = RESERVED_PIDS; + if (tid) { + nr = idr_alloc(&tmp->idr, NULL, tid, + tid + 1, GFP_ATOMIC); + /* + * If ENOSPC is returned it means that the PID is + * alreay in use. Return EEXIST in that case. + */ + if (nr == -ENOSPC) + nr = -EEXIST; + } else { + int pid_min = 1; + /* + * init really needs pid 1, but after reaching the + * maximum wrap back to RESERVED_PIDS + */ + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) + pid_min = RESERVED_PIDS; - /* - * Store a null pointer so find_pid_ns does not find - * a partially initialized PID (see below). - */ - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, - pid_max, GFP_ATOMIC); + /* + * Store a null pointer so find_pid_ns does not find + * a partially initialized PID (see below). + */ + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, + pid_max, GFP_ATOMIC); + } spin_unlock_irq(&pidmap_lock); idr_preload_end(); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a6a79f85c81a..d40017e79ebe 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -26,8 +26,6 @@ static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ -#define MAX_PID_NS_LEVEL 32 /* Write once array, filled from the beginning. */ static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];