Skip to content

Commit b41f544

Browse files
committed
mm: scrape LRU pages for offlined memcgs
jira KERNEL-173
feature Add ability to scrape LRU pages from offlined memcgs
commit-author: Yu Zhao <[email protected]>
commit-source v1-0001-mm-scrape-LRU-pages-for-offlined-memcgs.patch
commit-source-path Provided by Google Engineering
upstream-diff A few tweaks to the original patch were necessary:
* Format changes because Documentation/sysctl/vm.txt has been changed to Documentation/admin-guide/sysctl/vm.rst
* Removed unused nid variable from scrape_offlined_memcgs
* Switched drop_caches_sysctl_handler to use SYSCTL_EIGHT (otherwise 'echo 8 > /proc/sys/vm/drop_caches' would be rejected)
* Renamed nr_pages_to_scrape to offlined_memcg_nr_pages in the !CONFIG_MEMCG case to match the CONFIG_MEMCG case
* Added 'return 0' to scrape_offlined_memcgs in the !CONFIG_MEMCG case

For offlined memcgs, kmem (slab) is reparented so that it does not hold refcnts which would in turn prevent those memcgs from being released. However, reparenting does not apply to LRU pages (pagecache), and therefore they need to be scraped as well for offlined memcgs.

"echo 8 > /proc/sys/vm/drop_caches" was introduced for this reason. And unlike "echo 1", it does not have performance impact on online memcgs in terms of zapping pagecache.

Signed-off-by: Yu Zhao <[email protected]>
Signed-off-by: Brett Mastbergen <[email protected]>
1 parent 6d8936c commit b41f544

File tree

8 files changed

+102
-7
lines changed

8 files changed

+102
-7
lines changed

Documentation/admin-guide/sysctl/vm.rst

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,10 @@ To free slab objects and pagecache::
243243

244244
echo 3 > /proc/sys/vm/drop_caches
245245

246+
To scrape LRU pages from offlined memcgs::
247+
248+
echo 8 > /proc/sys/vm/drop_caches
249+
246250
This is a non-destructive operation and will not free any dirty objects.
247251
To increase the number of objects freed by this operation, the user may run
248252
`sync` prior to writing to /proc/sys/vm/drop_caches. This will minimize the
@@ -266,6 +270,14 @@ used::
266270
These are informational only. They do not mean that anything is wrong
267271
with your system. To disable them, echo 4 (bit 2) into drop_caches.
268272

273+
Note that for offlined memcgs, kmem (slab) is reparented so that it
274+
does not hold refcnts which would in turn prevent those memcgs from
275+
being released. However, reparenting does not apply to LRU pages
276+
(pagecache), and therefore they need to be scraped as well for
277+
offlined memcgs. "echo 8" was introduced for this reason. And unlike
278+
"echo 1", it does not have a performance impact on online memcgs in
279+
terms of zapping pagecache.
280+
269281

270282
extfrag_threshold
271283
=================

fs/drop_caches.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
#include <linux/writeback.h>
1111
#include <linux/sysctl.h>
1212
#include <linux/gfp.h>
13+
#include <linux/memcontrol.h>
14+
#include <linux/backing-dev.h>
1315
#include "internal.h"
1416

1517
/* A global variable is a bit ugly, but it keeps the code simple */
@@ -66,6 +68,24 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write,
6668
drop_slab();
6769
count_vm_event(DROP_SLAB);
6870
}
71+
if (sysctl_drop_caches & 8) {
72+
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
73+
unsigned long target = offlined_memcg_nr_pages();
74+
75+
while (nr_retries) {
76+
unsigned long progress = scrape_offlined_memcgs(target);
77+
78+
if (progress >= target)
79+
break;
80+
81+
if (!progress) {
82+
congestion_wait(BLK_RW_ASYNC, HZ / 10);
83+
nr_retries--;
84+
}
85+
86+
target -= progress;
87+
}
88+
}
6989
if (!stfu) {
7090
pr_info("%s (%d): drop_caches: %d\n",
7191
current->comm, task_pid_nr(current),

fs/proc/proc_sysctl.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ static const struct file_operations proc_sys_dir_file_operations;
2626
static const struct inode_operations proc_sys_dir_operations;
2727

2828
/* shared constants to be used in various sysctls */
29-
const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 200, 1000, 3000, INT_MAX };
29+
const int sysctl_vals[] = { -1, 0, 1, 2, 4, 8, 100, 200, 1000, 3000, INT_MAX };
3030
EXPORT_SYMBOL(sysctl_vals);
3131

3232
/* Support for permanently empty directories */

include/linux/memcontrol.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ struct mem_cgroup_reclaim_cookie {
5858
unsigned int generation;
5959
};
6060

61+
#define MEM_CGROUP_RECLAIM_RETRIES 5
62+
6163
#ifdef CONFIG_MEMCG
6264

6365
#define MEM_CGROUP_ID_SHIFT 16
@@ -1137,6 +1139,15 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
11371139
gfp_t gfp_mask,
11381140
unsigned long *total_scanned);
11391141

1142+
static inline unsigned long offlined_memcg_nr_pages(void)
1143+
{
1144+
extern atomic_t nr_offlined_memcgs;
1145+
1146+
return atomic_read(&nr_offlined_memcgs) * MEMCG_CHARGE_BATCH;
1147+
}
1148+
1149+
unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim);
1150+
11401151
#else /* CONFIG_MEMCG */
11411152

11421153
#define MEM_CGROUP_ID_SHIFT 0
@@ -1545,6 +1556,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
15451556
{
15461557
return 0;
15471558
}
1559+
1560+
static inline unsigned long offlined_memcg_nr_pages(void)
1561+
{
1562+
return 0;
1563+
}
1564+
1565+
static inline unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim)
1566+
{
1567+
return 0;
1568+
}
1569+
15481570
#endif /* CONFIG_MEMCG */
15491571

15501572
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)

include/linux/sysctl.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,12 @@ struct ctl_dir;
4343
#define SYSCTL_ONE ((void *)&sysctl_vals[2])
4444
#define SYSCTL_TWO ((void *)&sysctl_vals[3])
4545
#define SYSCTL_FOUR ((void *)&sysctl_vals[4])
46-
#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5])
47-
#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[6])
48-
#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[7])
49-
#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[8])
50-
#define SYSCTL_INT_MAX ((void *)&sysctl_vals[9])
46+
#define SYSCTL_EIGHT ((void *)&sysctl_vals[5])
47+
#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[6])
48+
#define SYSCTL_TWO_HUNDRED ((void *)&sysctl_vals[7])
49+
#define SYSCTL_ONE_THOUSAND ((void *)&sysctl_vals[8])
50+
#define SYSCTL_THREE_THOUSAND ((void *)&sysctl_vals[9])
51+
#define SYSCTL_INT_MAX ((void *)&sysctl_vals[10])
5152

5253
extern const int sysctl_vals[];
5354

kernel/sysctl.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2627,7 +2627,7 @@ static struct ctl_table vm_table[] = {
26272627
.mode = 0200,
26282628
.proc_handler = drop_caches_sysctl_handler,
26292629
.extra1 = SYSCTL_ONE,
2630-
.extra2 = SYSCTL_FOUR,
2630+
.extra2 = SYSCTL_EIGHT,
26312631
},
26322632
#ifdef CONFIG_COMPACTION
26332633
{

mm/memcontrol.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5350,6 +5350,8 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
53505350
return -ENOMEM;
53515351
}
53525352

5353+
atomic_t nr_offlined_memcgs = ATOMIC_INIT(0);
5354+
53535355
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
53545356
{
53555357
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5377,13 +5379,17 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
53775379
drain_all_stock(memcg);
53785380

53795381
mem_cgroup_id_put(memcg);
5382+
5383+
atomic_inc(&nr_offlined_memcgs);
53805384
}
53815385

53825386
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
53835387
{
53845388
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
53855389

53865390
invalidate_reclaim_iterators(memcg);
5391+
5392+
atomic_dec(&nr_offlined_memcgs);
53875393
}
53885394

53895395
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)

mm/vmscan.c

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,9 @@ struct scan_control {
123123
/* The file pages on the current node are dangerously low */
124124
unsigned int file_is_tiny:1;
125125

126+
/* Scrape LRU pages from offlined memcgs */
127+
unsigned int scrape_offlined_memcgs:1;
128+
126129
/* Always discard instead of demoting to lower tier memory */
127130
unsigned int no_demotion:1;
128131

@@ -3092,6 +3095,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
30923095
memcg_memory_event(memcg, MEMCG_LOW);
30933096
}
30943097

3098+
if (sc->scrape_offlined_memcgs && mem_cgroup_online(memcg))
3099+
continue;
3100+
30953101
reclaimed = sc->nr_reclaimed;
30963102
scanned = sc->nr_scanned;
30973103

@@ -4816,3 +4822,31 @@ void check_move_unevictable_pages(struct pagevec *pvec)
48164822
}
48174823
}
48184824
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
4825+
4826+
#ifdef CONFIG_MEMCG
4827+
unsigned long scrape_offlined_memcgs(unsigned long nr_to_reclaim)
4828+
{
4829+
unsigned int flags;
4830+
unsigned long nr_reclaimed;
4831+
struct scan_control sc = {
4832+
.nr_to_reclaim = max(nr_to_reclaim, SWAP_CLUSTER_MAX),
4833+
.gfp_mask = GFP_KERNEL,
4834+
.target_mem_cgroup = root_mem_cgroup,
4835+
.reclaim_idx = MAX_NR_ZONES - 1,
4836+
.may_writepage = true,
4837+
.may_unmap = true,
4838+
.scrape_offlined_memcgs = true,
4839+
};
4840+
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
4841+
4842+
set_task_reclaim_state(current, &sc.reclaim_state);
4843+
flags = memalloc_noreclaim_save();
4844+
4845+
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
4846+
4847+
memalloc_noreclaim_restore(flags);
4848+
set_task_reclaim_state(current, NULL);
4849+
4850+
return nr_reclaimed;
4851+
}
4852+
#endif

0 commit comments

Comments
 (0)