Skip to content

Commit 9c479db

Browse files
committed
mm: scrape LRU pages for offlined memcgs
jira KERNEL-172
feature Add ability to scrape LRU pages from offlined memcgs
commit-author Yu Zhao <[email protected]>
commit-source v1-0001-mm-scrape-LRU-pages-for-offlined-memcgs.patch
commit-source-path Provided by Google Engineering
upstream-diff A few tweaks to the original patch were necessary:
  * Removed unused nid variable from scrape_offlined_memcgs
  * Switched extra2 to 8 (otherwise 'echo 8 > /proc/sys/vm/drop_caches' would be rejected)
  * Renamed nr_pages_to_scrape to offlined_memcg_nr_pages in the !CONFIG_MEMCG case to match the CONFIG_MEMCG case
  * Added 'return 0' to scrape_offlined_memcgs in the !CONFIG_MEMCG case

For offlined memcgs, kmem (slab) is reparented so that it does not hold refcnts which would in turn prevent those memcgs from being released. However, reparenting does not apply to LRU pages (pagecache), and therefore they need to be scraped as well for offlined memcgs.

"echo 8 > /proc/sys/vm/drop_caches" was introduced for this reason. And unlike "echo 1", it does not have performance impact on online memcgs in terms of zapping pagecache.

Signed-off-by: Yu Zhao <[email protected]>
Signed-off-by: Brett Mastbergen <[email protected]>
1 parent ab07cdc commit 9c479db

File tree

6 files changed

+94
-1
lines changed

6 files changed

+94
-1
lines changed

Documentation/sysctl/vm.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,8 @@ To free reclaimable slab objects (includes dentries and inodes):
225225
echo 2 > /proc/sys/vm/drop_caches
226226
To free slab objects and pagecache:
227227
echo 3 > /proc/sys/vm/drop_caches
228+
To scrape LRU pages from offlined memcgs:
229+
echo 8 > /proc/sys/vm/drop_caches
228230

229231
This is a non-destructive operation and will not free any dirty objects.
230232
To increase the number of objects freed by this operation, the user may run
@@ -249,6 +251,14 @@ used:
249251
These are informational only. They do not mean that anything is wrong
250252
with your system. To disable them, echo 4 (bit 3) into drop_caches.
251253

254+
Note that for offlined memcgs, kmem (slab) is reparented so that it
255+
does not hold refcnts which would in turn prevent those memcgs from
256+
being released. However, reparenting does not apply to LRU pages
257+
(pagecache), and therefore they need to be scraped as well for
258+
offlined memcgs. "echo 8" was introduced for this reason. And unlike
259+
"echo 1", it does not have performance impact on online memcgs in
260+
terms of zapping pagecache.
261+
252262
==============================================================
253263

254264
extfrag_threshold

fs/drop_caches.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
#include <linux/writeback.h>
1111
#include <linux/sysctl.h>
1212
#include <linux/gfp.h>
13+
#include <linux/memcontrol.h>
14+
#include <linux/backing-dev.h>
1315
#include "internal.h"
1416

1517
/* A global variable is a bit ugly, but it keeps the code simple */
@@ -66,6 +68,24 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
6668
drop_slab();
6769
count_vm_event(DROP_SLAB);
6870
}
71+
if (sysctl_drop_caches & 8) {
72+
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
73+
unsigned long target = offlined_memcg_nr_pages();
74+
75+
while (nr_retries) {
76+
unsigned long progress = scrape_offlined_memcgs(target);
77+
78+
if (progress >= target)
79+
break;
80+
81+
if (!progress) {
82+
congestion_wait(BLK_RW_ASYNC, HZ / 10);
83+
nr_retries--;
84+
}
85+
86+
target -= progress;
87+
}
88+
}
6989
if (!stfu) {
7090
pr_info("%s (%d): drop_caches: %d\n",
7191
current->comm, task_pid_nr(current),

include/linux/memcontrol.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ struct mem_cgroup_reclaim_cookie {
7373
unsigned int generation;
7474
};
7575

76+
/*
 * Retry budget for memcg reclaim loops.  In this commit it bounds the
 * "echo 8 > drop_caches" loop in drop_caches_sysctl_handler(): each scrape
 * attempt that makes zero progress consumes one retry.
 * NOTE(review): defined outside the CONFIG_MEMCG guard so the handler can
 * use it in both configurations -- other users may exist elsewhere.
 */
#define MEM_CGROUP_RECLAIM_RETRIES 5
77+
7678
#ifdef CONFIG_MEMCG
7779

7880
#define MEM_CGROUP_ID_SHIFT 16
@@ -1150,6 +1152,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
11501152
gfp_t gfp_mask,
11511153
unsigned long *total_scanned);
11521154

1155+
/*
 * offlined_memcg_nr_pages - reclaim target for scraping offlined memcgs
 *
 * Returns the current value of nr_offlined_memcgs multiplied by
 * MEMCG_CHARGE_BATCH.  drop_caches_sysctl_handler() uses the result as the
 * initial number of pages to ask scrape_offlined_memcgs() for.
 *
 * The counter is read once without further synchronization, so the result
 * is only a snapshot.
 * NOTE(review): the block-scope extern keeps nr_offlined_memcgs out of the
 * header's file-scope namespace -- confirm this is the intended style.
 */
static inline unsigned long offlined_memcg_nr_pages(void)
1156+
{
1157+
extern atomic_t nr_offlined_memcgs;
1158+
1159+
return atomic_read(&nr_offlined_memcgs) * MEMCG_CHARGE_BATCH;
1160+
}
1161+
1162+
unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim);
1163+
11531164
#else /* CONFIG_MEMCG */
11541165

11551166
#define MEM_CGROUP_ID_SHIFT 0
@@ -1526,6 +1537,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
15261537
{
15271538
return 0;
15281539
}
1540+
1541+
/*
 * !CONFIG_MEMCG stub: always reports 0 pages to scrape.
 */
static inline unsigned long offlined_memcg_nr_pages(void)
1542+
{
1543+
return 0;
1544+
}
1545+
1546+
/*
 * !CONFIG_MEMCG stub: performs no reclaim and reports 0 pages of progress,
 * whatever @nr_to_reclaim is.
 */
static inline unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim)
1547+
{
1548+
return 0;
1549+
}
1550+
15291551
#endif /* CONFIG_MEMCG */
15301552

15311553
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)

kernel/sysctl.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ static int sixty = 60;
128128
static int __maybe_unused neg_one = -1;
129129
static int __maybe_unused two = 2;
130130
static int __maybe_unused four = 4;
131+
static int __maybe_unused eight = 8;
131132
static unsigned long zero_ul;
132133
static unsigned long one_ul = 1;
133134
static unsigned long long_max = LONG_MAX;
@@ -1483,7 +1484,7 @@ static struct ctl_table vm_table[] = {
14831484
.mode = 0644,
14841485
.proc_handler = drop_caches_sysctl_handler,
14851486
.extra1 = SYSCTL_ONE,
1486-
.extra2 = &four,
1487+
.extra2 = &eight,
14871488
},
14881489
#ifdef CONFIG_COMPACTION
14891490
{

mm/memcontrol.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5594,6 +5594,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
55945594
return 0;
55955595
}
55965596

5597+
/*
 * Count of memcgs that have been offlined but not yet released:
 * incremented at the end of mem_cgroup_css_offline() and decremented in
 * mem_cgroup_css_released().  Read by offlined_memcg_nr_pages() to size the
 * "echo 8 > drop_caches" reclaim target.
 */
atomic_t nr_offlined_memcgs = ATOMIC_INIT(0);
5598+
55975599
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
55985600
{
55995601
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5621,13 +5623,17 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
56215623
drain_all_stock(memcg);
56225624

56235625
memcg_percpu_stats_disable(memcg);
5626+
5627+
atomic_inc(&nr_offlined_memcgs);
56245628
}
56255629

56265630
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
56275631
{
56285632
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
56295633

56305634
invalidate_reclaim_iterators(memcg);
5635+
5636+
atomic_dec(&nr_offlined_memcgs);
56315637
}
56325638

56335639
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)

mm/vmscan.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,9 @@ struct scan_control {
123123
/* The file pages on the current node are dangerously low */
124124
unsigned int file_is_tiny:1;
125125

126+
/* Scrape LRU pages from offlined memcgs */
127+
unsigned int scrape_offlined_memcgs:1;
128+
126129
/* Allocation order */
127130
s8 order;
128131

@@ -3034,6 +3037,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
30343037
memcg_memory_event(memcg, MEMCG_LOW);
30353038
}
30363039

3040+
if (sc->scrape_offlined_memcgs && mem_cgroup_online(memcg))
3041+
continue;
3042+
30373043
reclaimed = sc->nr_reclaimed;
30383044
scanned = sc->nr_scanned;
30393045

@@ -4736,3 +4742,31 @@ void check_move_unevictable_pages(struct pagevec *pvec)
47364742
}
47374743
}
47384744
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
4745+
4746+
#ifdef CONFIG_MEMCG
4747+
/*
 * scrape_offlined_memcgs - reclaim LRU pages from offlined memcgs only
 * @nr_to_reclaim: number of pages requested; raised to at least
 *                 SWAP_CLUSTER_MAX before being handed to reclaim
 *
 * Runs do_try_to_free_pages() targeting root_mem_cgroup with
 * sc.scrape_offlined_memcgs set.  With that flag set, shrink_node_memcgs()
 * skips every memcg for which mem_cgroup_online() is true, so only
 * offlined memcgs are shrunk.
 *
 * Returns the value produced by do_try_to_free_pages(); the caller in
 * drop_caches_sysctl_handler() treats it as the number of pages of progress.
 */
unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim)
4748+
{
4749+
unsigned int flags;
4750+
unsigned long nr_reclaimed;
4751+
struct scan_control sc = {
4752+
.nr_to_reclaim = max(nr_to_reclaim, SWAP_CLUSTER_MAX),
4753+
.gfp_mask = GFP_KERNEL,
4754+
.target_mem_cgroup = root_mem_cgroup,
4755+
/* NOTE(review): presumably makes every zone eligible -- confirm */
.reclaim_idx = MAX_NR_ZONES - 1,
4756+
.may_writepage = true,
4757+
.may_unmap = true,
4758+
.scrape_offlined_memcgs = true,
4759+
};
4760+
/* Zonelist of the node this task is currently running on. */
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
4761+
4762+
/*
 * NOTE(review): presumably registers this task as a reclaimer and blocks
 * recursive reclaim from allocations made during the scan -- confirm.
 * Both are undone in reverse order after the reclaim call.
 */
set_task_reclaim_state(current, &sc.reclaim_state);
4763+
flags = memalloc_noreclaim_save();
4764+
4765+
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
4766+
4767+
memalloc_noreclaim_restore(flags);
4768+
set_task_reclaim_state(current, NULL);
4769+
4770+
return nr_reclaimed;
4771+
}
4772+
#endif

0 commit comments

Comments
 (0)