From 1ff6bbfd13ca2c114a5cb58e1a92d1e5d68ce0b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Jan 2014 18:10:37 -0500 Subject: [PATCH 01/10] arm, pm, vmpressure: add missing slab.h includes arch/arm/mach-tegra/pm.c, kernel/power/console.c and mm/vmpressure.c were somehow getting slab.h indirectly through cgroup.h which in turn was getting it indirectly through xattr.h. A scheduled cgroup change drops xattr.h inclusion from cgroup.h and breaks compilation of these three files. Add explicit slab.h includes to the three files. A pending cgroup patch depends on this change and it'd be great if this can be routed through cgroup/for-3.14-fixes branch. Signed-off-by: Tejun Heo Acked-by: Stephen Warren Cc: Thierry Reding Cc: linux-tegra@vger.kernel.org Cc: "Rafael J. Wysocki" Cc: linux-pm@vger.kernel.org Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: cgroups@vger.kernel.org --- arch/arm/mach-tegra/pm.c | 1 + kernel/power/console.c | 1 + mm/vmpressure.c | 1 + 3 files changed, 3 insertions(+) diff --git a/arch/arm/mach-tegra/pm.c b/arch/arm/mach-tegra/pm.c index 4ae0286b468d..f55b05a29b55 100644 --- a/arch/arm/mach-tegra/pm.c +++ b/arch/arm/mach-tegra/pm.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/kernel/power/console.c b/kernel/power/console.c index eacb8bd8cab4..aba9c545a0e3 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "power.h" #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 196970a4541f..d4042e75f7c7 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include From 230579d7e0c46fa078ba11899d9b091df689362f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 28 Jan 2014 11:29:09 +0800 Subject: [PATCH 02/10] cpuset: update MAINTAINERS entry Add mailing list and tree tag to the entry. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index b2cf5cfb4d29..41e52c57cd1a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2408,8 +2408,10 @@ F: tools/power/cpupower/ CPUSETS M: Li Zefan +L: cgroups@vger.kernel.org W: http://www.bullopensource.org/cpuset/ W: http://oss.sgi.com/projects/cpusets/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git S: Maintained F: Documentation/cgroups/cpusets.txt F: include/linux/cpuset.h From 0a6be6555302eebb14510fd6b35bb17e8dfa1386 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Feb 2014 14:31:07 -0500 Subject: [PATCH 03/10] nfs: include xattr.h from fs/nfs/nfs3proc.c fs/nfs/nfs3proc.c is making use of xattr but was getting linux/xattr.h indirectly through linux/cgroup.h, which will soon drop the inclusion of xattr.h. Explicitly include linux/xattr.h from nfs3proc.c so that compilation doesn't fail when linux/cgroup.h drops linux/xattr.h. As the following cgroup changes will depend on these changes, it probably would be easier to route this through cgroup branch. Would that be okay? Signed-off-by: Tejun Heo Acked-by: Trond Myklebust Cc: linux-nfs@vger.kernel.org --- fs/nfs/nfs3proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index aa9bc973f36a..a462ef0fb5d6 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "iostat.h" #include "internal.h" From ab3f5faa6255a0eb4f832675507d9e295ca7e9ba Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 6 Feb 2014 15:56:01 -0800 Subject: [PATCH 04/10] cgroup: use an ordered workqueue for cgroup destruction Sometimes the cleanup after memcg hierarchy testing gets stuck in mem_cgroup_reparent_charges(), unable to bring non-kmem usage down to 0. There may turn out to be several causes, but a major cause is this: the workitem to offline parent can get run before workitem to offline child; parent's mem_cgroup_reparent_charges() circles around waiting for the child's pages to be reparented to its lrus, but it's holding cgroup_mutex which prevents the child from reaching its mem_cgroup_reparent_charges(). Just use an ordered workqueue for cgroup_destroy_wq. tj: Committing as the temporary fix until the reverse dependency can be removed from memcg. Comment updated accordingly. Fixes: e5fca243abae ("cgroup: use a dedicated workqueue for cgroup destruction") Suggested-by: Filipe Brandenburger Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2f46ba37f72..aa95591c1430 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4845,12 +4845,16 @@ static int __init cgroup_wq_init(void) /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. - * Use 1 for @max_active. + * + * XXX: Must be ordered to make sure parent is offlined after + * children. The ordering requirement is for memcg where a + * parent's offline may wait for a child's leading to deadlock. In + * the long term, this should be fixed from memcg side. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); + cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0); BUG_ON(!cgroup_destroy_wq); /* From eb46bf89696972b856a9adb6aebd5c7b65c266e4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:33 -0500 Subject: [PATCH 05/10] cgroup: fix error return value in cgroup_mount() When cgroup_mount() fails to allocate an id for the root, it didn't set ret before jumping to unlock_drop ending up returning 0 after a failure. Fix it. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aa95591c1430..793f37176077 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1566,10 +1566,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); - root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, - 0, 1, GFP_KERNEL); - if (root_cgrp->id < 0) + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + if (ret < 0) goto unlock_drop; + root_cgrp->id = ret; /* Check for name clashes with existing mounts */ ret = -EBUSY; From b58c89986a77a23658682a100eb15d8edb571ebb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:33 -0500 Subject: [PATCH 06/10] cgroup: fix error return from cgroup_create() cgroup_create() was returning 0 after allocation failures. Fix it. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 793f37176077..0eb7b868e1ab 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4158,7 +4158,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; - int ssid, err = 0; + int ssid, err; struct cgroup_subsys *ss; struct super_block *sb = root->sb; @@ -4168,8 +4168,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return -ENOMEM; name = cgroup_alloc_name(dentry); - if (!name) + if (!name) { + err = -ENOMEM; goto err_free_cgrp; + } rcu_assign_pointer(cgrp->name, name); /* @@ -4177,8 +4179,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * a half-baked cgroup. */ cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) + if (cgrp->id < 0) { + err = -ENOMEM; goto err_free_name; + } /* * Only live parents can have children. Note that the liveliness From 48573a893303986e3b0b2974d6fb11f3d1bb7064 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Feb 2014 10:26:34 -0500 Subject: [PATCH 07/10] cgroup: fix locking in cgroup_cfts_commit() cgroup_cfts_commit() walks the cgroup hierarchy that the target subsystem is attached to and tries to apply the file changes. Due to the convolution with inode locking, it can't keep cgroup_mutex locked while iterating. It currently holds only RCU read lock around the actual iteration and then pins the found cgroup using dget(). Unfortunately, this is incorrect. Although the iteration does check cgroup_is_dead() before invoking dget(), there's nothing which prevents the dentry from going away inbetween. Note that this is different from the usual css iterations where css_tryget() is used to pin the css - css_tryget() tests whether the css can be pinned and fails if not. The problem can be solved by simply holding cgroup_mutex instead of RCU read lock around the iteration, which actually reduces LOC. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0eb7b868e1ab..3edf7163b84f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2763,10 +2763,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) */ update_before = cgroup_serial_nr_next; - mutex_unlock(&cgroup_mutex); - /* add/rm files for all cgroups created before */ - rcu_read_lock(); css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; @@ -2775,23 +2772,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) inode = cgrp->dentry->d_inode; dget(cgrp->dentry); - rcu_read_unlock(); - dput(prev); prev = cgrp->dentry; + mutex_unlock(&cgroup_mutex); mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) ret = cgroup_addrm_files(cgrp, cfts, is_add); - mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - - rcu_read_lock(); if (ret) break; } - rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); dput(prev); deactivate_super(sb); return ret; From 0ab02ca8f887908152d1a96db5130fc661d36a1e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 11 Feb 2014 16:05:46 +0800 Subject: [PATCH 08/10] cgroup: protect modifications to cgroup_idr with cgroup_mutex Setup cgroupfs like this: # mount -t cgroup -o cpuacct xxx /cgroup # mkdir /cgroup/sub1 # mkdir /cgroup/sub2 Then run these two commands: # for ((; ;)) { mkdir /cgroup/sub1/tmp && rmdir /mnt/sub1/tmp; } & # for ((; ;)) { mkdir /cgroup/sub2/tmp && rmdir /mnt/sub2/tmp; } & After seconds you may see this warning: ------------[ cut here ]------------ WARNING: CPU: 1 PID: 25243 at lib/idr.c:527 sub_remove+0x87/0x1b0() idr_remove called for id=6 which is not allocated. ... Call Trace: [] dump_stack+0x7a/0x96 [] warn_slowpath_common+0x8c/0xc0 [] warn_slowpath_fmt+0x46/0x50 [] sub_remove+0x87/0x1b0 [] ? css_killed_work_fn+0x32/0x1b0 [] idr_remove+0x25/0xd0 [] cgroup_destroy_css_killed+0x5b/0xc0 [] css_killed_work_fn+0x130/0x1b0 [] process_one_work+0x26c/0x550 [] worker_thread+0x12e/0x3b0 [] kthread+0xe6/0xf0 [] ret_from_fork+0x7c/0xb0 ---[ end trace 2d1577ec10cf80d0 ]--- It's because allocating/removing cgroup ID is not properly synchronized. The bug was introduced when we converted cgroup_ida to cgroup_idr. While synchronization is already done inside ida_simple_{get,remove}(), users are responsible for concurrent calls to idr_{alloc,remove}(). tj: Refreshed on top of b58c89986a77 ("cgroup: fix error return from cgroup_create()"). Fixes: 4e96ee8e981b ("cgroup: convert cgroup_ida to cgroup_idr") Cc: #3.12+ Reported-by: Michal Hocko Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 34 ++++++++++++++++++---------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5c097596104b..9450f025fe0c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -166,6 +166,8 @@ struct cgroup { * * The ID of the root cgroup is always 0, and a new cgroup * will be assigned with a smallest available ID. + * + * Allocating/Removing ID must be protected by cgroup_mutex. */ int id; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3edf7163b84f..52719ce55dd3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * per-subsystem and moved to css->id so that lookups are * successful until the target css is released. */ + mutex_lock(&cgroup_mutex); idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + mutex_unlock(&cgroup_mutex); cgrp->id = -1; call_rcu(&cgrp->rcu_head, cgroup_free_rcu); @@ -4167,16 +4169,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } rcu_assign_pointer(cgrp->name, name); - /* - * Temporarily set the pointer to NULL, so idr_find() won't return - * a half-baked cgroup. - */ - cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); - if (cgrp->id < 0) { - err = -ENOMEM; - goto err_free_name; - } - /* * Only live parents can have children. Note that the liveliness * check isn't strictly necessary because cgroup_mkdir() and @@ -4186,7 +4178,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_id; + goto err_free_name; + } + + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. + */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); + if (cgrp->id < 0) { + err = -ENOMEM; + goto err_unlock; } /* Grab a reference on the superblock so the hierarchy doesn't @@ -4218,7 +4220,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_unlock; + goto err_free_id; lockdep_assert_held(&dentry->d_inode->i_mutex); cgrp->serial_nr = cgroup_serial_nr_next++; @@ -4254,12 +4256,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return 0; -err_unlock: - mutex_unlock(&cgroup_mutex); - /* Release the reference count that we took on the superblock */ - deactivate_super(sb); err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); + /* Release the reference count that we took on the superblock */ + deactivate_super(sb); +err_unlock: + mutex_unlock(&cgroup_mutex); err_free_name: kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: From 1a11533fbd71792e8c5d36f6763fbce8df0d231d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 12 Feb 2014 19:06:19 -0500 Subject: [PATCH 09/10] Revert "cgroup: use an ordered workqueue for cgroup destruction" This reverts commit ab3f5faa6255a0eb4f832675507d9e295ca7e9ba. Explanation from Hugh: It's because more thorough testing, by others here, found that it wasn't always solving the problem: so I asked Tejun privately to hold off from sending it in, until we'd worked out why not. Most of our testing being on a v3,11-based kernel, it was perfectly possible that the problem was merely our own e.g. missing Tejun's 8a2b75384444 ("workqueue: fix ordered workqueues in NUMA setups"). But that turned out not to be enough to fix it either. Then Filipe pointed out how percpu_ref_kill_and_confirm() uses call_rcu_sched() before we ever get to put the offline on to the workqueue: by the time we get to the workqueue, the ordering has already been lost. So, thanks for the Acks, but I'm afraid that this ordered workqueue solution is just not good enough: we should simply forget that patch and provide a different answer." Signed-off-by: Tejun Heo Cc: Hugh Dickins --- kernel/cgroup.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 52719ce55dd3..68d87103b493 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4844,16 +4844,12 @@ static int __init cgroup_wq_init(void) /* * There isn't much point in executing destruction path in * parallel. Good chunk is serialized with cgroup_mutex anyway. - * - * XXX: Must be ordered to make sure parent is offlined after - * children. The ordering requirement is for memcg where a - * parent's offline may wait for a child's leading to deadlock. In - * the long term, this should be fixed from memcg side. + * Use 1 for @max_active. * * We would prefer to do this in cgroup_init() above, but that * is called before init_workqueues(): so leave this until after. */ - cgroup_destroy_wq = alloc_ordered_workqueue("cgroup_destroy", 0); + cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1); BUG_ON(!cgroup_destroy_wq); /* From 532de3fc72adc2a6525c4d53c07bf81e1732083d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Feb 2014 13:29:31 -0500 Subject: [PATCH 10/10] cgroup: update cgroup_enable_task_cg_lists() to grab siglock Currently, there's nothing preventing cgroup_enable_task_cg_lists() from missing set PF_EXITING and race against cgroup_exit(). Depending on the timing, cgroup_exit() may finish with the task still linked on css_set leading to list corruption. Fix it by grabbing siglock in cgroup_enable_task_cg_lists() so that PF_EXITING is guaranteed to be visible. This whole on-demand cg_list optimization is extremely fragile and has ample possibility to lead to bugs which can cause things like once-a-year oops during boot. I'm wondering whether the better approach would be just adding "cgroup_disable=all" handling which disables the whole cgroup rather than tempting fate with this on-demand craziness. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 68d87103b493..105f273b6f86 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2905,9 +2905,14 @@ static void cgroup_enable_task_cg_lists(void) * We should check if the process is exiting, otherwise * it will race with cgroup_exit() in that the list * entry won't be deleted though the process has exited. + * Do it while holding siglock so that we don't end up + * racing against cgroup_exit(). */ + spin_lock_irq(&p->sighand->siglock); if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) list_add(&p->cg_list, &task_css_set(p)->tasks); + spin_unlock_irq(&p->sighand->siglock); + task_unlock(p); } while_each_thread(g, p); read_unlock(&tasklist_lock);