BACKPORT: FROMGIT: cgroup: Use separate src/dst nodes when preloading css_sets for migration
Each cset (css_set) is pinned by its tasks. When we're moving tasks around
across csets for a migration, we need to hold the source and destination
csets to ensure that they don't go away while we're moving tasks about. This
is done by linking cset->mg_preload_node on either the
mgctx->preloaded_src_csets or mgctx->preloaded_dst_csets list. Using the
same cset->mg_preload_node for both the src and dst lists was deemed okay as
a cset can't be both the source and destination at the same time.
Unfortunately, this overloading becomes problematic when multiple tasks are
involved in a migration and some of them are identity noop migrations while
others are actually moving across cgroups. For example, this can happen with
the following sequence on cgroup1:
#1> mkdir -p /sys/fs/cgroup/misc/a/b
#2> echo $$ > /sys/fs/cgroup/misc/a/cgroup.procs
#3> RUN_A_COMMAND_WHICH_CREATES_MULTIPLE_THREADS &
#4> PID=$!
#5> echo $PID > /sys/fs/cgroup/misc/a/b/tasks
#6> echo $PID > /sys/fs/cgroup/misc/a/cgroup.procs
Step #5 moves only the group leader into a/b, and step #6 then moves the
whole process, including the group leader, back into a. In this final migration,
non-leader threads would be doing identity migration while the group leader
is doing an actual one.
After #3, let's say the whole process was in cset A, and that after #5, the
leader moves to cset B. Then, during #6, the following happens:
1. cgroup_migrate_add_src() is called on B for the leader.
2. cgroup_migrate_add_src() is called on A for the other threads.
3. cgroup_migrate_prepare_dst() is called. It scans the src list.
4. It notices that B wants to migrate to A, so it tries to add A to the dst
list but realizes that its ->mg_preload_node is already busy.
5. And then it notices A wants to migrate to A as it's an identity
migration, so it culls it by list_del_init()'ing its ->mg_preload_node and
putting references accordingly.
6. The rest of migration takes place with B on the src list but nothing on
the dst list.
This means that A isn't held while migration is in progress. If all tasks
leave A before the migration finishes and the incoming task pins it, the
cset will be destroyed leading to use-after-free.
This is caused by overloading cset->mg_preload_node for both src and dst
preload lists. We wanted to exclude the cset from the src list but ended up
inadvertently excluding it from the dst list too.
This patch fixes the issue by separating out cset->mg_preload_node into
->mg_src_preload_node and ->mg_dst_preload_node, so that the src and dst
preloadings don't interfere with each other.
Bug: 236582926
Change-Id: Ieaf1c0c8fc23753570897fd6e48a54335ab939ce
Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Mukesh Ojha <quic_mojha@quicinc.com>
Reported-by: shisiyuan <shisiyuan19870131@gmail.com>
Link: http://lkml.kernel.org/r/1654187688-27411-1-git-send-email-shisiyuan@xiaomi.com
Link: https://lore.kernel.org/lkml/Yh+RGIJ0f3nrqIiN@slm.duckdns.org/#t
Fixes: f817de9851 ("cgroup: prepare migration path for unified hierarchy")
Cc: stable@vger.kernel.org # v3.16+
(cherry picked from commit 07fd5b6cdf3cc30bfde8fe0f644771688be04447
https://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git for-5.19-fixes)
Signed-off-by: Elliot Berman <quic_eberman@quicinc.com>
Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
[mojha: Moved the two new list heads into a wrapper ext_css_set struct to ensure
the ABI doesn't break, and also defined a macro init_css_set which expands
to init_ext_css_set.cset to avoid too many code changes]
Signed-off-by: Srinivasarao Pathipati <quic_spathi@quicinc.com>
This commit is contained in:
parent
b62f2af118
commit
7cfbc7501b
@ -276,6 +276,13 @@ struct css_set {
|
||||
struct rcu_head rcu_head;
|
||||
};
|
||||
|
||||
struct ext_css_set {
|
||||
struct css_set cset;
|
||||
|
||||
struct list_head mg_src_preload_node;
|
||||
struct list_head mg_dst_preload_node;
|
||||
};
|
||||
|
||||
struct cgroup_base_stat {
|
||||
struct task_cputime cputime;
|
||||
};
|
||||
|
@ -70,7 +70,8 @@ struct css_task_iter {
|
||||
};
|
||||
|
||||
extern struct cgroup_root cgrp_dfl_root;
|
||||
extern struct css_set init_css_set;
|
||||
extern struct ext_css_set init_ext_css_set;
|
||||
#define init_css_set init_ext_css_set.cset
|
||||
|
||||
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
|
||||
#include <linux/cgroup_subsys.h>
|
||||
|
@ -735,25 +735,28 @@ EXPORT_SYMBOL_GPL(of_css);
|
||||
* reference-counted, to improve performance when child cgroups
|
||||
* haven't been created.
|
||||
*/
|
||||
struct css_set init_css_set = {
|
||||
.refcount = REFCOUNT_INIT(1),
|
||||
.dom_cset = &init_css_set,
|
||||
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
|
||||
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
|
||||
.dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
|
||||
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
|
||||
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
|
||||
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
|
||||
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
|
||||
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
|
||||
|
||||
/*
|
||||
* The following field is re-initialized when this cset gets linked
|
||||
* in cgroup_init(). However, let's initialize the field
|
||||
* statically too so that the default cgroup can be accessed safely
|
||||
* early during boot.
|
||||
*/
|
||||
.dfl_cgrp = &cgrp_dfl_root.cgrp,
|
||||
struct ext_css_set init_ext_css_set = {
|
||||
.cset = {
|
||||
.refcount = REFCOUNT_INIT(1),
|
||||
.dom_cset = &init_css_set,
|
||||
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
|
||||
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
|
||||
.dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
|
||||
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
|
||||
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
|
||||
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
|
||||
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
|
||||
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
|
||||
/*
|
||||
* The following field is re-initialized when this cset gets linked
|
||||
* in cgroup_init(). However, let's initialize the field
|
||||
* statically too so that the default cgroup can be accessed safely
|
||||
* early during boot.
|
||||
*/
|
||||
.dfl_cgrp = &cgrp_dfl_root.cgrp,
|
||||
},
|
||||
.mg_src_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_src_preload_node),
|
||||
.mg_dst_preload_node = LIST_HEAD_INIT(init_ext_css_set.mg_dst_preload_node),
|
||||
};
|
||||
|
||||
static int css_set_count = 1; /* 1 for init_css_set */
|
||||
@ -1181,6 +1184,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
|
||||
struct cgroup *cgrp)
|
||||
{
|
||||
struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
|
||||
struct ext_css_set *ext_cset;
|
||||
struct css_set *cset;
|
||||
struct list_head tmp_links;
|
||||
struct cgrp_cset_link *link;
|
||||
@ -1201,9 +1205,10 @@ static struct css_set *find_css_set(struct css_set *old_cset,
|
||||
if (cset)
|
||||
return cset;
|
||||
|
||||
cset = kzalloc(sizeof(*cset), GFP_KERNEL);
|
||||
if (!cset)
|
||||
ext_cset = kzalloc(sizeof(*ext_cset), GFP_KERNEL);
|
||||
if (!ext_cset)
|
||||
return NULL;
|
||||
cset = &ext_cset->cset;
|
||||
|
||||
/* Allocate all the cgrp_cset_link objects that we'll need */
|
||||
if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
|
||||
@ -1221,6 +1226,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
|
||||
INIT_HLIST_NODE(&cset->hlist);
|
||||
INIT_LIST_HEAD(&cset->cgrp_links);
|
||||
INIT_LIST_HEAD(&cset->mg_preload_node);
|
||||
INIT_LIST_HEAD(&ext_cset->mg_src_preload_node);
|
||||
INIT_LIST_HEAD(&ext_cset->mg_dst_preload_node);
|
||||
INIT_LIST_HEAD(&cset->mg_node);
|
||||
|
||||
/* Copy the set of subsystem state objects generated in
|
||||
@ -2671,22 +2678,28 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
|
||||
*/
|
||||
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
|
||||
{
|
||||
LIST_HEAD(preloaded);
|
||||
struct css_set *cset, *tmp_cset;
|
||||
struct ext_css_set *cset, *tmp_cset;
|
||||
|
||||
lockdep_assert_held(&cgroup_mutex);
|
||||
|
||||
spin_lock_irq(&css_set_lock);
|
||||
|
||||
list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
|
||||
list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
|
||||
list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
|
||||
mg_src_preload_node) {
|
||||
cset->cset.mg_src_cgrp = NULL;
|
||||
cset->cset.mg_dst_cgrp = NULL;
|
||||
cset->cset.mg_dst_cset = NULL;
|
||||
list_del_init(&cset->mg_src_preload_node);
|
||||
put_css_set_locked(&cset->cset);
|
||||
}
|
||||
|
||||
list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
|
||||
cset->mg_src_cgrp = NULL;
|
||||
cset->mg_dst_cgrp = NULL;
|
||||
cset->mg_dst_cset = NULL;
|
||||
list_del_init(&cset->mg_preload_node);
|
||||
put_css_set_locked(cset);
|
||||
list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
|
||||
mg_dst_preload_node) {
|
||||
cset->cset.mg_src_cgrp = NULL;
|
||||
cset->cset.mg_dst_cgrp = NULL;
|
||||
cset->cset.mg_dst_cset = NULL;
|
||||
list_del_init(&cset->mg_dst_preload_node);
|
||||
put_css_set_locked(&cset->cset);
|
||||
}
|
||||
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
@ -2713,6 +2726,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
|
||||
struct cgroup_mgctx *mgctx)
|
||||
{
|
||||
struct cgroup *src_cgrp;
|
||||
struct ext_css_set *ext_src_cset;
|
||||
|
||||
lockdep_assert_held(&cgroup_mutex);
|
||||
lockdep_assert_held(&css_set_lock);
|
||||
@ -2726,8 +2740,9 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
|
||||
return;
|
||||
|
||||
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
|
||||
ext_src_cset = container_of(src_cset, struct ext_css_set, cset);
|
||||
|
||||
if (!list_empty(&src_cset->mg_preload_node))
|
||||
if (!list_empty(&ext_src_cset->mg_src_preload_node))
|
||||
return;
|
||||
|
||||
WARN_ON(src_cset->mg_src_cgrp);
|
||||
@ -2738,7 +2753,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
|
||||
src_cset->mg_src_cgrp = src_cgrp;
|
||||
src_cset->mg_dst_cgrp = dst_cgrp;
|
||||
get_css_set(src_cset);
|
||||
list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
|
||||
list_add_tail(&ext_src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -2757,20 +2772,23 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
|
||||
*/
|
||||
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
|
||||
{
|
||||
struct css_set *src_cset, *tmp_cset;
|
||||
struct ext_css_set *ext_src_set, *tmp_cset;
|
||||
|
||||
lockdep_assert_held(&cgroup_mutex);
|
||||
|
||||
/* look up the dst cset for each src cset and link it to src */
|
||||
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
|
||||
mg_preload_node) {
|
||||
list_for_each_entry_safe(ext_src_set, tmp_cset, &mgctx->preloaded_src_csets,
|
||||
mg_src_preload_node) {
|
||||
struct css_set *src_cset = &ext_src_set->cset;
|
||||
struct css_set *dst_cset;
|
||||
struct ext_css_set *ext_dst_cset;
|
||||
struct cgroup_subsys *ss;
|
||||
int ssid;
|
||||
|
||||
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
|
||||
if (!dst_cset)
|
||||
return -ENOMEM;
|
||||
ext_dst_cset = container_of(dst_cset, struct ext_css_set, cset);
|
||||
|
||||
WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
|
||||
|
||||
@ -2782,7 +2800,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
|
||||
if (src_cset == dst_cset) {
|
||||
src_cset->mg_src_cgrp = NULL;
|
||||
src_cset->mg_dst_cgrp = NULL;
|
||||
list_del_init(&src_cset->mg_preload_node);
|
||||
list_del_init(&ext_src_set->mg_src_preload_node);
|
||||
put_css_set(src_cset);
|
||||
put_css_set(dst_cset);
|
||||
continue;
|
||||
@ -2790,8 +2808,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
|
||||
|
||||
src_cset->mg_dst_cset = dst_cset;
|
||||
|
||||
if (list_empty(&dst_cset->mg_preload_node))
|
||||
list_add_tail(&dst_cset->mg_preload_node,
|
||||
if (list_empty(&ext_dst_cset->mg_dst_preload_node))
|
||||
list_add_tail(&ext_dst_cset->mg_dst_preload_node,
|
||||
&mgctx->preloaded_dst_csets);
|
||||
else
|
||||
put_css_set(dst_cset);
|
||||
@ -3010,7 +3028,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
|
||||
DEFINE_CGROUP_MGCTX(mgctx);
|
||||
struct cgroup_subsys_state *d_css;
|
||||
struct cgroup *dsct;
|
||||
struct css_set *src_cset;
|
||||
struct ext_css_set *ext_src_set;
|
||||
bool has_tasks;
|
||||
int ret;
|
||||
|
||||
@ -3041,11 +3059,12 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
|
||||
goto out_finish;
|
||||
|
||||
spin_lock_irq(&css_set_lock);
|
||||
list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
|
||||
list_for_each_entry(ext_src_set, &mgctx.preloaded_src_csets,
|
||||
mg_src_preload_node) {
|
||||
struct task_struct *task, *ntask;
|
||||
|
||||
/* all tasks in src_csets need to be migrated */
|
||||
list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
|
||||
list_for_each_entry_safe(task, ntask, &ext_src_set->cset.tasks, cg_list)
|
||||
cgroup_migrate_add_task(task, &mgctx);
|
||||
}
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
|
Loading…
Reference in New Issue
Block a user