From 6c9a27f5da9609fca46cb2b183724531b48f71ad Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura
Date: Tue, 10 Sep 2013 18:16:36 +0900
Subject: [PATCH 1/4] sched/fair: Fix small race where child->se.parent,cfs_rq might point to invalid ones

There is a small race between copy_process() and cgroup_attach_task()
where child->se.parent,cfs_rq points to invalid (old) ones.

        parent doing fork()      | someone moving the parent to another cgroup
  -------------------------------+---------------------------------------------
   copy_process()
     + dup_task_struct()
       -> parent->se is copied to child->se.
          se.parent,cfs_rq of them point to old ones.

                                        cgroup_attach_task()
                                          + cgroup_task_migrate()
                                            -> parent->cgroup is updated.
                                          + cpu_cgroup_attach()
                                            + sched_move_task()
                                              + task_move_group_fair()
                                                +- set_task_rq()
                                                   -> se.parent,cfs_rq of parent
                                                      are updated.

     + cgroup_fork()
       -> parent->cgroup is copied to child->cgroup. (*1)
     + sched_fork()
       + task_fork_fair()
         -> se.parent,cfs_rq of child are accessed
            while they point to old ones. (*2)

In the worst case, this bug can lead to a "use-after-free" and cause a
panic, because it is the new cgroup's refcount that is incremented at
(*1), so the old cgroup (and its related data) can be freed before (*2).

In fact, a panic caused by this bug was originally caught in RHEL6.4.

    BUG: unable to handle kernel NULL pointer dereference at (null)
    IP: [] sched_slice+0x6e/0xa0
    [...]
    Call Trace:
     [] place_entity+0x75/0xa0
     [] task_fork_fair+0xaa/0x160
     [] sched_fork+0x6b/0x140
     [] copy_process+0x5b2/0x1450
     [] ? wake_up_new_task+0xd9/0x130
     [] do_fork+0x94/0x460
     [] ? sys_wait4+0xae/0x100
     [] sys_clone+0x28/0x30
     [] stub_clone+0x13/0x20
     [] ? system_call_fastpath+0x16/0x1b

Signed-off-by: Daisuke Nishimura
Signed-off-by: Peter Zijlstra
Cc:
Link: http://lkml.kernel.org/r/039601ceae06$733d3130$59b79390$@mxp.nes.nec.co.jp
Signed-off-by: Ingo Molnar
---
 kernel/sched/fair.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9b3fe1cd8f40..11cd13667359 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5928,11 +5928,15 @@ static void task_fork_fair(struct task_struct *p)
 	cfs_rq = task_cfs_rq(current);
 	curr = cfs_rq->curr;
 
-	if (unlikely(task_cpu(p) != this_cpu)) {
-		rcu_read_lock();
-		__set_task_cpu(p, this_cpu);
-		rcu_read_unlock();
-	}
+	/*
+	 * Not only the cpu but also the task_group of the parent might have
+	 * been changed after parent->se.parent,cfs_rq were copied to
+	 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+	 * of child point to valid ones.
+	 */
+	rcu_read_lock();
+	__set_task_cpu(p, this_cpu);
+	rcu_read_unlock();
 
 	update_curr(cfs_rq);
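
For reference, calling __set_task_cpu() repairs the child's pointers because
it ends up in set_task_rq(), which re-reads the task's current task_group.
The following is a simplified sketch of that helper, paraphrased from the
kernel/sched/sched.h of this era rather than quoted verbatim (the RT-group
half and other details are elided):

  static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
  {
  #ifdef CONFIG_FAIR_GROUP_SCHED
  	struct task_group *tg = task_group(p);

  	/* Re-point the entity at its *current* group's per-cpu structures. */
  	p->se.cfs_rq = tg->cfs_rq[cpu];
  	p->se.parent = tg->se[cpu];
  #endif
  }

With the change above, task_fork_fair() always goes through __set_task_cpu()
(and therefore set_task_rq()), so the child's se.parent and se.cfs_rq are
refreshed from the parent's current task_group even when the CPU number
itself has not changed.
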
From fc840914e9b07ab4685c195e1e54e58de4f84c03 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 9 Sep 2013 13:01:41 +0200
Subject: [PATCH 2/4] sched/debug: Take PID namespace into account

Emmanuel reported that /proc/sched_debug didn't report the right PIDs
when using namespaces; cure this.

Reported-by: Emmanuel Deloget
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/20130909110141.GM31370@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar
---
 kernel/sched/debug.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e076bddd4c66..196559994f7c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -124,7 +124,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 		SEQ_printf(m, " ");
 
 	SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
-		p->comm, p->pid,
+		p->comm, task_pid_nr(p),
 		SPLIT_NS(p->se.vruntime),
 		(long long)(p->nvcsw + p->nivcsw),
 		p->prio);
@@ -289,7 +289,7 @@ do { \
 	P(nr_load_updates);
 	P(nr_uninterruptible);
 	PN(next_balance);
-	P(curr->pid);
+	SEQ_printf(m, "  .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
 	PN(clock);
 	P(cpu_load[0]);
 	P(cpu_load[1]);
@@ -492,7 +492,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 {
 	unsigned long nr_switches;
 
-	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
+	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
 		get_nr_threads(p));
 	SEQ_printf(m,
 		"---------------------------------------------------------"
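
The underlying issue is that a PID is a namespace-relative identifier: the
number a task is known by inside a PID namespace differs from the number the
rest of the system uses for it. A minimal userspace sketch of that mismatch
(background illustration only, not part of the patch; it assumes a kernel
with CONFIG_PID_NS and enough privilege to use CLONE_NEWPID):

  #define _GNU_SOURCE
  #include <sched.h>
  #include <signal.h>
  #include <stdio.h>
  #include <sys/wait.h>
  #include <unistd.h>

  static char child_stack[64 * 1024];

  static int child(void *arg)
  {
  	/* Inside the new PID namespace this prints 1. */
  	printf("child sees itself as pid %d\n", getpid());
  	return 0;
  }

  int main(void)
  {
  	/* The parent gets the child's PID as seen from its own namespace. */
  	pid_t pid = clone(child, child_stack + sizeof(child_stack),
  			  CLONE_NEWPID | SIGCHLD, NULL);
  	if (pid < 0) {
  		perror("clone");
  		return 1;
  	}
  	printf("parent sees the child as pid %d\n", (int)pid);
  	waitpid(pid, NULL, 0);
  	return 0;
  }

Inside the new namespace the child sees itself as PID 1, while the parent
sees a different number for the same task; that discrepancy is the kind of
confusion the report was about.
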
From 3b524d60943a2f9ee1194323ff9d5ee01a4d1ce1 Mon Sep 17 00:00:00 2001
From: Li Bin
Date: Mon, 9 Sep 2013 14:05:40 +0800
Subject: [PATCH 3/4] sched/Documentation: Update sched-design-CFS.txt documentation

Commit 2bd2d6f2dc ("sched: Replace use of entity_key()") removed
entity_key(), whose job was to subtract the cfs_rq's min_vruntime from a
task's vruntime to form the rbtree key.  The phrase "there is a subtraction
using rq->cfs.min_vruntime to account for possible wraparounds" in the
documentation therefore no longer describes the code, so remove it.

Signed-off-by: Li Bin
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1378706740-7500-1-git-send-email-huawei.libin@huawei.com
Signed-off-by: Ingo Molnar
---
 Documentation/scheduler/sched-design-CFS.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt
index d529e02d928d..f14f49304222 100644
--- a/Documentation/scheduler/sched-design-CFS.txt
+++ b/Documentation/scheduler/sched-design-CFS.txt
@@ -66,9 +66,7 @@ rq->cfs.load value, which is the sum of the weights of the tasks queued on the
 runqueue.
 
 CFS maintains a time-ordered rbtree, where all runnable tasks are sorted by the
-p->se.vruntime key (there is a subtraction using rq->cfs.min_vruntime to
-account for possible wraparounds). CFS picks the "leftmost" task from this
-tree and sticks to it.
+p->se.vruntime key. CFS picks the "leftmost" task from this tree and sticks to it.
 As the system progresses forwards, the executed tasks are put into the tree
 more and more to the right --- slowly but surely giving a chance for every task
 to become the "leftmost task" and thus get on the CPU within a deterministic

From 13b62e46d5407c7d619aea1dc9c3e0991b631b57 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin"
Date: Mon, 16 Sep 2013 11:30:36 +0300
Subject: [PATCH 4/4] sched: Fix comment for sched_info_depart

sched_info_depart() seems to be called only from sched_info_switch(), so
only on an involuntary task switch.  Fix the comment to match.

Signed-off-by: Michael S. Tsirkin
Cc: Peter Zijlstra
Cc: Frederic Weisbecker
Cc: KOSAKI Motohiro
Link: http://lkml.kernel.org/r/20130916083036.GA1113@redhat.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/stats.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 5aef494fc8b4..c7edee71bce8 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -104,8 +104,9 @@ static inline void sched_info_queued(struct task_struct *t)
 }
 
 /*
- * Called when a process ceases being the active-running process, either
- * voluntarily or involuntarily. Now we can calculate how long we ran.
+ * Called when a process ceases being the active-running process involuntarily
+ * due, typically, to expiring its time slice (this may also be called when
+ * switching to the idle task). Now we can calculate how long we ran.
  * Also, if the process is still in the TASK_RUNNING state, call
  * sched_info_queued() to mark that it has now again started waiting on
  * the runqueue.
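
The call structure behind that statement looks roughly like this; a
simplified paraphrase of the kernel/sched/stats.h of this era, not the
verbatim source:

  static inline void
  __sched_info_switch(struct task_struct *prev, struct task_struct *next)
  {
  	struct rq *rq = task_rq(prev);

  	/* Departure stats for the idle task are not interesting. */
  	if (prev != rq->idle)
  		sched_info_depart(prev);

  	if (next != rq->idle)
  		sched_info_arrive(next);
  }

  static inline void
  sched_info_switch(struct task_struct *prev, struct task_struct *next)
  {
  	if (unlikely(sched_info_on()))
  		__sched_info_switch(prev, next);
  }

sched_info_switch() is invoked from the context-switch path, and, as the
changelog notes, sched_info_depart() appears to have no other callers, which
is what the updated comment captures.
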