Skip to content

Commit 3d8b38e

Browse files
rgushchin authored and torvalds committed
mm, oom: introduce memory.oom.group
For some workloads an intervention from the OOM killer can be painful. Killing a random task can bring the workload into an inconsistent state. Historically, there are two common solutions for this problem: 1) enabling panic_on_oom, 2) using a userspace daemon to monitor OOMs and kill all outstanding processes. Both approaches have their downsides: rebooting on each OOM is an obvious waste of capacity, and handling all in userspace is tricky and requires a userspace agent, which will monitor all cgroups for OOMs. In most cases an in-kernel after-OOM cleaning-up mechanism can eliminate the necessity of enabling panic_on_oom. Also, it can simplify the cgroup management for userspace applications. This commit introduces a new knob for cgroup v2 memory controller: memory.oom.group. The knob determines whether the cgroup should be treated as an indivisible workload by the OOM killer. If set, all tasks belonging to the cgroup or to its descendants (if the memory cgroup is not a leaf cgroup) are killed together or not at all. To determine which cgroup has to be killed, we traverse the cgroup hierarchy from the victim task's cgroup up to the OOMing cgroup (or root), looking for the highest-level cgroup with memory.oom.group set. Tasks with the OOM protection (oom_score_adj set to -1000) are treated as an exception and are never killed. This patch doesn't change the OOM victim selection algorithm. Link: http://lkml.kernel.org/r/20180802003201.817-4-guro@fb.com Signed-off-by: Roman Gushchin <guro@fb.com> Acked-by: Michal Hocko <mhocko@suse.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: David Rientjes <rientjes@google.com> Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Cc: Tejun Heo <tj@kernel.org> Cc: Vladimir Davydov <vdavydov.dev@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 5989ad7 commit 3d8b38e

4 files changed

Lines changed: 159 additions & 0 deletions

File tree

Documentation/admin-guide/cgroup-v2.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1072,6 +1072,24 @@ PAGE_SIZE multiple when read back.
10721072
high limit is used and monitored properly, this limit's
10731073
utility is limited to providing the final safety net.
10741074

1075+
memory.oom.group
1076+
A read-write single value file which exists on non-root
1077+
cgroups. The default value is "0".
1078+
1079+
Determines whether the cgroup should be treated as
1080+
an indivisible workload by the OOM killer. If set,
1081+
all tasks belonging to the cgroup or to its descendants
1082+
(if the memory cgroup is not a leaf cgroup) are killed
1083+
together or not at all. This can be used to avoid
1084+
partial kills to guarantee workload integrity.
1085+
1086+
Tasks with the OOM protection (oom_score_adj set to -1000)
1087+
are treated as an exception and are never killed.
1088+
1089+
If the OOM killer is invoked in a cgroup, it's not going
to kill any tasks outside of this cgroup, regardless of
the memory.oom.group values of ancestor cgroups.
1092+
10751093
memory.events
10761094
A read-only flat-keyed file which exists on non-root cgroups.
10771095
The following entries are defined. Unless specified

include/linux/memcontrol.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,11 @@ struct mem_cgroup {
225225
*/
226226
bool use_hierarchy;
227227

228+
/*
229+
* Should the OOM killer kill all belonging tasks, had it kill one?
230+
*/
231+
bool oom_group;
232+
228233
/* protected by memcg_oom_lock */
229234
bool oom_lock;
230235
int under_oom;
@@ -542,6 +547,9 @@ static inline bool task_in_memcg_oom(struct task_struct *p)
542547
}
543548

544549
bool mem_cgroup_oom_synchronize(bool wait);
550+
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
551+
struct mem_cgroup *oom_domain);
552+
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);
545553

546554
#ifdef CONFIG_MEMCG_SWAP
547555
extern int do_swap_account;
@@ -1001,6 +1009,16 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
10011009
return false;
10021010
}
10031011

1012+
static inline struct mem_cgroup *mem_cgroup_get_oom_group(
1013+
struct task_struct *victim, struct mem_cgroup *oom_domain)
1014+
{
1015+
return NULL;
1016+
}
1017+
1018+
static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1019+
{
1020+
}
1021+
10041022
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
10051023
int idx)
10061024
{

mm/memcontrol.c

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1776,6 +1776,62 @@ bool mem_cgroup_oom_synchronize(bool handle)
17761776
return true;
17771777
}
17781778

1779+
/**
1780+
* mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
1781+
* @victim: task to be killed by the OOM killer
1782+
* @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
1783+
*
1784+
* Returns a pointer to a memory cgroup, which has to be cleaned up
1785+
* by killing all belonging OOM-killable tasks.
1786+
*
1787+
* Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
1788+
*/
1789+
struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1790+
struct mem_cgroup *oom_domain)
1791+
{
1792+
struct mem_cgroup *oom_group = NULL;
1793+
struct mem_cgroup *memcg;
1794+
1795+
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1796+
return NULL;
1797+
1798+
if (!oom_domain)
1799+
oom_domain = root_mem_cgroup;
1800+
1801+
rcu_read_lock();
1802+
1803+
memcg = mem_cgroup_from_task(victim);
1804+
if (memcg == root_mem_cgroup)
1805+
goto out;
1806+
1807+
/*
1808+
* Traverse the memory cgroup hierarchy from the victim task's
1809+
* cgroup up to the OOMing cgroup (or root) to find the
1810+
* highest-level memory cgroup with oom.group set.
1811+
*/
1812+
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1813+
if (memcg->oom_group)
1814+
oom_group = memcg;
1815+
1816+
if (memcg == oom_domain)
1817+
break;
1818+
}
1819+
1820+
if (oom_group)
1821+
css_get(&oom_group->css);
1822+
out:
1823+
rcu_read_unlock();
1824+
1825+
return oom_group;
1826+
}
1827+
1828+
/*
 * Log which cgroup is about to have all of its tasks killed because
 * memory.oom.group is set; the path is printed via pr_cont so the
 * whole announcement lands on one line.
 */
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
	pr_info("Tasks in ");
	pr_cont_cgroup_path(memcg->css.cgroup);
	pr_cont(" are going to be killed due to memory.oom.group set\n");
}
1834+
17791835
/**
17801836
* lock_page_memcg - lock a page->mem_cgroup binding
17811837
* @page: the page
@@ -5561,6 +5617,37 @@ static int memory_stat_show(struct seq_file *m, void *v)
55615617
return 0;
55625618
}
55635619

5620+
static int memory_oom_group_show(struct seq_file *m, void *v)
5621+
{
5622+
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5623+
5624+
seq_printf(m, "%d\n", memcg->oom_group);
5625+
5626+
return 0;
5627+
}
5628+
5629+
/*
 * Write handler for memory.oom.group. Accepts exactly "0" or "1"
 * (after whitespace stripping; kstrtoint with base 0 also allows
 * forms like "0x1" — preserved as-is); everything else is -EINVAL.
 */
static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
				      char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	int err, val;

	buf = strstrip(buf);
	if (!buf)
		return -EINVAL;

	err = kstrtoint(buf, 0, &val);
	if (err)
		return err;

	/* Only a boolean value makes sense here. */
	if (val != 0 && val != 1)
		return -EINVAL;

	memcg->oom_group = val;

	return nbytes;
}
5650+
55645651
static struct cftype memory_files[] = {
55655652
{
55665653
.name = "current",
@@ -5602,6 +5689,12 @@ static struct cftype memory_files[] = {
56025689
.flags = CFTYPE_NOT_ON_ROOT,
56035690
.seq_show = memory_stat_show,
56045691
},
5692+
{
5693+
.name = "oom.group",
5694+
.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
5695+
.seq_show = memory_oom_group_show,
5696+
.write = memory_oom_group_write,
5697+
},
56055698
{ } /* terminate */
56065699
};
56075700

mm/oom_kill.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -908,13 +908,27 @@ static void __oom_kill_process(struct task_struct *victim)
908908
}
909909
#undef K
910910

911+
/*
912+
* Kill provided task unless it's secured by setting
913+
* oom_score_adj to OOM_SCORE_ADJ_MIN.
914+
*/
915+
static int oom_kill_memcg_member(struct task_struct *task, void *unused)
916+
{
917+
if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
918+
get_task_struct(task);
919+
__oom_kill_process(task);
920+
}
921+
return 0;
922+
}
923+
911924
static void oom_kill_process(struct oom_control *oc, const char *message)
912925
{
913926
struct task_struct *p = oc->chosen;
914927
unsigned int points = oc->chosen_points;
915928
struct task_struct *victim = p;
916929
struct task_struct *child;
917930
struct task_struct *t;
931+
struct mem_cgroup *oom_group;
918932
unsigned int victim_points = 0;
919933
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
920934
DEFAULT_RATELIMIT_BURST);
@@ -968,7 +982,23 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
968982
}
969983
read_unlock(&tasklist_lock);
970984

985+
/*
986+
* Do we need to kill the entire memory cgroup?
987+
* Or even one of the ancestor memory cgroups?
988+
* Check this out before killing the victim task.
989+
*/
990+
oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
991+
971992
__oom_kill_process(victim);
993+
994+
/*
995+
* If necessary, kill all tasks in the selected memory cgroup.
996+
*/
997+
if (oom_group) {
998+
mem_cgroup_print_oom_group(oom_group);
999+
mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
1000+
mem_cgroup_put(oom_group);
1001+
}
9721002
}
9731003

9741004
/*

0 commit comments

Comments
 (0)