Merge "vmscan: Support multiple kswapd threads per node"

commit 64f0825f01
Author: qctecmdr
Date: 2020-07-03 04:10:18 -07:00
Committed by: Gerrit - the friendly Code Review server
7 changed files with 209 additions and 4 deletions

Documentation/admin-guide/sysctl/vm.rst

@@ -39,6 +39,7 @@ Currently, these files are in /proc/sys/vm:
- extfrag_threshold
- extra_free_kbytes
- hugetlb_shm_group
- kswapd_threads
- laptop_mode
- legacy_va_layout
- lowmem_reserve_ratio
@@ -310,6 +311,25 @@ hugetlb_shm_group
hugetlb_shm_group contains group id that is allowed to create SysV
shared memory segment using hugetlb page.

kswapd_threads
==============

kswapd_threads allows you to control the number of kswapd threads per node
running on the system. This provides the ability to devote additional CPU
resources toward proactive page replacement, with the goal of reducing
direct reclaims. When direct reclaims are avoided, the CPU time they would
have consumed is avoided as well. Depending on the workload, the result can
cause aggregate CPU usage on the system to go up, down, or stay the same.

More aggressive page replacement can reduce direct reclaims, which cause
latency for tasks and decrease throughput when doing filesystem IO through
the pagecache. Direct reclaims are recorded using the allocstall counter
in /proc/vmstat.

The default value is 1 and the range of acceptable values is 1-16. When
raising the value, start low, in the 2-6 range; higher values should be
justified with testing. If direct reclaims still occur in spite of high
values, their latency cost can be higher due to increased lock contention.
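
Since this tuning advice hinges on the allocstall counter, the following is
a minimal userspace sketch (an editorial illustration, not part of this
commit) that sums the allocstall lines from /proc/vmstat. Matching on the
"allocstall" prefix is an assumption meant to cover both the single counter
on older kernels and the per-zone split (allocstall_normal,
allocstall_movable, ...) on newer ones.

/* allocstall_snapshot.c - snapshot direct reclaim stall counters */
#include <stdio.h>
#include <string.h>

static unsigned long long read_allocstalls(void)
{
	FILE *fp = fopen("/proc/vmstat", "r");
	char line[128];
	unsigned long long val, total = 0;

	if (!fp)
		return 0;
	while (fgets(line, sizeof(line), fp)) {
		/* counter may be single or split per zone; match prefix */
		if (!strncmp(line, "allocstall", 10) &&
		    sscanf(line, "%*s %llu", &val) == 1)
			total += val;
	}
	fclose(fp);
	return total;
}

int main(void)
{
	printf("direct reclaim stalls: %llu\n", read_allocstalls());
	return 0;
}

Comparing snapshots taken before and after a representative load run at each
kswapd_threads setting shows whether a higher thread count actually reduces
direct reclaims.
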
laptop_mode
===========

include/linux/mm.h

@@ -2309,6 +2309,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve);
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
enum memmap_context, struct vmem_altmap *);
extern void setup_per_zone_wmarks(void);
extern void update_kswapd_threads(void);
extern int __meminit init_per_zone_wmark_min(void);
extern void mem_init(void);
extern void __init mmap_init(void);
@@ -2329,6 +2330,7 @@ extern void zone_pcp_update(struct zone *zone);
extern void zone_pcp_reset(struct zone *zone);
/* page_alloc.c */
extern int kswapd_threads;
extern int min_free_kbytes;
extern int watermark_boost_factor;
extern int watermark_scale_factor;
@@ -3017,5 +3019,12 @@ static inline int pages_identical(struct page *page1, struct page *page2)
extern int want_old_faultaround_pte;
#ifndef CONFIG_MULTIPLE_KSWAPD
static inline void update_kswapd_threads_node(int nid) {}
static inline int multi_kswapd_run(int nid) { return 0; }
static inline void multi_kswapd_stop(int nid) {}
static inline void multi_kswapd_cpu_online(pg_data_t *pgdat,
const struct cpumask *mask) {}
#endif /* !CONFIG_MULTIPLE_KSWAPD */
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */

include/linux/mmzone.h

@@ -39,6 +39,8 @@
*/
#define PAGE_ALLOC_COSTLY_ORDER 3
#define MAX_KSWAPD_THREADS 16
enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
@@ -743,8 +745,13 @@ typedef struct pglist_data {
int node_id;
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
-struct task_struct *kswapd; /* Protected by
-mem_hotplug_begin/end() */
+struct task_struct *kswapd;
#ifdef CONFIG_MULTIPLE_KSWAPD
/*
* Protected by mem_hotplug_begin/end()
*/
struct task_struct *mkswapd[MAX_KSWAPD_THREADS];
#endif
int kswapd_order;
enum zone_type kswapd_classzone_idx;
@@ -957,6 +964,9 @@ static inline int is_highmem(struct zone *zone)
/* These two functions are used to setup the per zone pages min values */
struct ctl_table;
int kswapd_threads_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *pos);
int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
int watermark_boost_factor_sysctl_handler(struct ctl_table *, int,

kernel/sysctl.c

@@ -143,6 +143,8 @@ static int six_hundred_forty_kb = 640 * 1024;
static unsigned int __maybe_unused half_million = 500000;
static unsigned int __maybe_unused one_hundred_million = 100000000;
static unsigned int __maybe_unused one_million = 1000000;
static int __maybe_unused max_kswapd_threads = MAX_KSWAPD_THREADS;
#ifdef CONFIG_SCHED_WALT
static int neg_three = -3;
static int three = 3;
@@ -1825,6 +1827,17 @@ static struct ctl_table vm_table[] = {
.proc_handler = watermark_boost_factor_sysctl_handler,
.extra1 = SYSCTL_ZERO,
},
#ifdef CONFIG_MULTIPLE_KSWAPD
{
.procname = "kswapd_threads",
.data = &kswapd_threads,
.maxlen = sizeof(kswapd_threads),
.mode = 0644,
.proc_handler = kswapd_threads_sysctl_handler,
.extra1 = SYSCTL_ONE,
.extra2 = &max_kswapd_threads,
},
#endif
{
.procname = "watermark_scale_factor",
.data = &watermark_scale_factor,

mm/Kconfig

@@ -861,3 +861,16 @@ config PROCESS_RECLAIM
(addr, addr + size-bytes) of the process.
Any other value is ignored.
config MULTIPLE_KSWAPD
bool "Spawn multiple kswapd threads"
depends on QGKI
default y
help
kswapd_threads allows you to control the number of kswapd threads
per node running on the system. The default value is 1 and the
range of acceptable values is 1-16. The number of threads can be
controlled with the following command:
(echo <num> > /proc/sys/vm/kswapd_threads)
Values outside the 1..16 range are ignored.
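
As a companion to the echo command above, here is a hedged userspace sketch
in C (an editorial illustration, not part of this commit; needs root) that
sets the sysctl and reads back the accepted value. Because the handler uses
proc_dointvec_minmax() with extra1/extra2 bounds, an out-of-range write is
rejected rather than clamped, so the read-back confirms what took effect.

/* set_kswapd_threads.c - request N kswapd threads per node */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/vm/kswapd_threads";
	int val = -1;
	FILE *fp = fopen(path, "w");

	if (fp) {
		fprintf(fp, "%d\n", 4);	/* request 4 threads per node */
		fclose(fp);
	}
	fp = fopen(path, "r");	/* read back the value in effect */
	if (fp) {
		if (fscanf(fp, "%d", &val) == 1)
			printf("kswapd_threads is now %d\n", val);
		fclose(fp);
	}
	return 0;
}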

mm/page_alloc.c

@@ -8047,6 +8047,22 @@ int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
#ifdef CONFIG_MULTIPLE_KSWAPD
int kswapd_threads_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
int rc;
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
if (write)
update_kswapd_threads();
return 0;
}
#endif
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{

mm/vmscan.c

@@ -139,6 +139,13 @@ struct scan_control {
struct vm_area_struct *target_vma;
};
/*
* Number of active kswapd threads
*/
#define DEF_KSWAPD_THREADS_PER_NODE 1
int kswapd_threads = DEF_KSWAPD_THREADS_PER_NODE;
int kswapd_threads_current = DEF_KSWAPD_THREADS_PER_NODE;
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
do { \
@@ -4111,6 +4118,116 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
}
#endif /* CONFIG_HIBERNATION */
#ifdef CONFIG_MULTIPLE_KSWAPD
static void update_kswapd_threads_node(int nid)
{
pg_data_t *pgdat;
int drop, increase;
int last_idx, start_idx, hid;
int nr_threads = kswapd_threads_current;
pgdat = NODE_DATA(nid);
last_idx = nr_threads - 1;
if (kswapd_threads < nr_threads) {
drop = nr_threads - kswapd_threads;
for (hid = last_idx; hid > (last_idx - drop); hid--) {
if (pgdat->mkswapd[hid]) {
kthread_stop(pgdat->mkswapd[hid]);
pgdat->mkswapd[hid] = NULL;
}
}
} else {
increase = kswapd_threads - nr_threads;
start_idx = last_idx + 1;
for (hid = start_idx; hid < (start_idx + increase); hid++) {
pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat,
"kswapd%d:%d", nid, hid);
if (IS_ERR(pgdat->mkswapd[hid])) {
pr_err("Failed to start kswapd%d on node %d\n",
hid, nid);
pgdat->mkswapd[hid] = NULL;
/*
* We are out of resources. Do not start any
* more threads.
*/
break;
}
}
}
}
void update_kswapd_threads(void)
{
int nid;
if (kswapd_threads_current == kswapd_threads)
return;
/*
* Hold the memory hotplug lock to avoid racing with memory
* hotplug initiated updates
*/
mem_hotplug_begin();
for_each_node_state(nid, N_MEMORY)
update_kswapd_threads_node(nid);
pr_info("kswapd_thread count changed, old:%d new:%d\n",
kswapd_threads_current, kswapd_threads);
kswapd_threads_current = kswapd_threads;
mem_hotplug_done();
}
static int multi_kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
int hid, nr_threads = kswapd_threads;
int ret = 0;
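/* slot 0 mirrors the main kswapd thread started by kswapd_run() */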
pgdat->mkswapd[0] = pgdat->kswapd;
for (hid = 1; hid < nr_threads; ++hid) {
pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat, "kswapd%d:%d",
nid, hid);
if (IS_ERR(pgdat->mkswapd[hid])) {
/* unlike kswapd_run(), a failure at boot only warns here */
WARN_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd%d on node %d\n",
hid, nid);
ret = PTR_ERR(pgdat->mkswapd[hid]);
pgdat->mkswapd[hid] = NULL;
}
}
kswapd_threads_current = nr_threads;
return ret;
}
static void multi_kswapd_stop(int nid)
{
int hid = 0;
int nr_threads = kswapd_threads_current;
struct task_struct *kswapd;
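/* slot 0 aliases pgdat->kswapd, which kswapd_stop() stops itself */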
NODE_DATA(nid)->mkswapd[hid] = NULL;
for (hid = 1; hid < nr_threads; hid++) {
kswapd = NODE_DATA(nid)->mkswapd[hid];
if (kswapd) {
kthread_stop(kswapd);
NODE_DATA(nid)->mkswapd[hid] = NULL;
}
}
}
static void multi_kswapd_cpu_online(pg_data_t *pgdat,
const struct cpumask *mask)
{
int hid;
int nr_threads = kswapd_threads_current;
for (hid = 1; hid < nr_threads; hid++) {
/* skip slots left NULL by an earlier kthread_run() failure */
if (pgdat->mkswapd[hid])
set_cpus_allowed_ptr(pgdat->mkswapd[hid], mask);
}
}
#endif
/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
@@ -4125,9 +4242,11 @@ static int kswapd_cpu_online(unsigned int cpu)
mask = cpumask_of_node(pgdat->node_id);
-if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) {
/* One of our CPUs online: restore mask */
set_cpus_allowed_ptr(pgdat->kswapd, mask);
multi_kswapd_cpu_online(pgdat, mask);
}
}
return 0;
}
@@ -4144,14 +4263,17 @@ int kswapd_run(int nid)
if (pgdat->kswapd)
return 0;
-pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d:0", nid);
if (IS_ERR(pgdat->kswapd)) {
/* failure at boot is fatal */
BUG_ON(system_state < SYSTEM_RUNNING);
pr_err("Failed to start kswapd on node %d\n", nid);
ret = PTR_ERR(pgdat->kswapd);
pgdat->kswapd = NULL;
return ret;
}
ret = multi_kswapd_run(nid);
return ret;
}
@@ -4167,6 +4289,8 @@ void kswapd_stop(int nid)
kthread_stop(kswapd);
NODE_DATA(nid)->kswapd = NULL;
}
multi_kswapd_stop(nid);
}
static int __init kswapd_init(void)