Merge "vmscan: Support multiple kswapd threads per node"
This commit is contained in:
commit
64f0825f01
@ -39,6 +39,7 @@ Currently, these files are in /proc/sys/vm:
|
|||||||
- extfrag_threshold
|
- extfrag_threshold
|
||||||
- extra_free_kbytes
|
- extra_free_kbytes
|
||||||
- hugetlb_shm_group
|
- hugetlb_shm_group
|
||||||
|
- kswapd_threads
|
||||||
- laptop_mode
|
- laptop_mode
|
||||||
- legacy_va_layout
|
- legacy_va_layout
|
||||||
- lowmem_reserve_ratio
|
- lowmem_reserve_ratio
|
||||||
@ -310,6 +311,25 @@ hugetlb_shm_group
|
|||||||
hugetlb_shm_group contains group id that is allowed to create SysV
|
hugetlb_shm_group contains group id that is allowed to create SysV
|
||||||
shared memory segment using hugetlb page.
|
shared memory segment using hugetlb page.
|
||||||
|
|
||||||
|
kswapd_threads
|
||||||
|
==============
|
||||||
|
kswapd_threads allows you to control the number of kswapd threads per node
|
||||||
|
running on the system. This provides the ability to devote additional CPU
|
||||||
|
resources toward proactive page replacement with the goal of reducing
|
||||||
|
direct reclaims. When direct reclaims are prevented, the CPU consumed
|
||||||
|
by them is prevented as well. Depending on the workload, the result can
|
||||||
|
cause aggregate CPU usage on the system to go up, down or stay the same.
|
||||||
|
|
||||||
|
More aggressive page replacement can reduce direct reclaims which cause
|
||||||
|
latency for tasks and decrease throughput when doing filesystem IO through
|
||||||
|
the pagecache. Direct reclaims are recorded using the allocstall counter
|
||||||
|
in /proc/vmstat.
|
||||||
|
|
||||||
|
The default value is 1 and the range of acceptable values is 1-16.
|
||||||
|
Always start with lower values in the 2-6 range. Higher values should
|
||||||
|
be justified with testing. If direct reclaims occur in spite of high
|
||||||
|
values, the cost of direct reclaims (in latency) that occur can be
|
||||||
|
higher due to increased lock contention.
|
||||||
|
|
||||||
laptop_mode
|
laptop_mode
|
||||||
===========
|
===========
|
||||||
|
@ -2309,6 +2309,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve);
|
|||||||
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
|
extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
|
||||||
enum memmap_context, struct vmem_altmap *);
|
enum memmap_context, struct vmem_altmap *);
|
||||||
extern void setup_per_zone_wmarks(void);
|
extern void setup_per_zone_wmarks(void);
|
||||||
|
extern void update_kswapd_threads(void);
|
||||||
extern int __meminit init_per_zone_wmark_min(void);
|
extern int __meminit init_per_zone_wmark_min(void);
|
||||||
extern void mem_init(void);
|
extern void mem_init(void);
|
||||||
extern void __init mmap_init(void);
|
extern void __init mmap_init(void);
|
||||||
@ -2329,6 +2330,7 @@ extern void zone_pcp_update(struct zone *zone);
|
|||||||
extern void zone_pcp_reset(struct zone *zone);
|
extern void zone_pcp_reset(struct zone *zone);
|
||||||
|
|
||||||
/* page_alloc.c */
|
/* page_alloc.c */
|
||||||
|
extern int kswapd_threads;
|
||||||
extern int min_free_kbytes;
|
extern int min_free_kbytes;
|
||||||
extern int watermark_boost_factor;
|
extern int watermark_boost_factor;
|
||||||
extern int watermark_scale_factor;
|
extern int watermark_scale_factor;
|
||||||
@ -3017,5 +3019,12 @@ static inline int pages_identical(struct page *page1, struct page *page2)
|
|||||||
|
|
||||||
extern int want_old_faultaround_pte;
|
extern int want_old_faultaround_pte;
|
||||||
|
|
||||||
|
#ifndef CONFIG_MULTIPLE_KSWAPD
|
||||||
|
static inline void update_kswapd_threads_node(int nid) {}
|
||||||
|
static inline int multi_kswapd_run(int nid) { return 0; }
|
||||||
|
static inline void multi_kswapd_stop(int nid) {}
|
||||||
|
static inline void multi_kswapd_cpu_online(pg_data_t *pgdat,
|
||||||
|
const struct cpumask *mask) {}
|
||||||
|
#endif /* CONFIG_MULTIPLE_KSWAPD */
|
||||||
#endif /* __KERNEL__ */
|
#endif /* __KERNEL__ */
|
||||||
#endif /* _LINUX_MM_H */
|
#endif /* _LINUX_MM_H */
|
||||||
|
@ -39,6 +39,8 @@
|
|||||||
*/
|
*/
|
||||||
#define PAGE_ALLOC_COSTLY_ORDER 3
|
#define PAGE_ALLOC_COSTLY_ORDER 3
|
||||||
|
|
||||||
|
#define MAX_KSWAPD_THREADS 16
|
||||||
|
|
||||||
enum migratetype {
|
enum migratetype {
|
||||||
MIGRATE_UNMOVABLE,
|
MIGRATE_UNMOVABLE,
|
||||||
MIGRATE_MOVABLE,
|
MIGRATE_MOVABLE,
|
||||||
@ -743,8 +745,13 @@ typedef struct pglist_data {
|
|||||||
int node_id;
|
int node_id;
|
||||||
wait_queue_head_t kswapd_wait;
|
wait_queue_head_t kswapd_wait;
|
||||||
wait_queue_head_t pfmemalloc_wait;
|
wait_queue_head_t pfmemalloc_wait;
|
||||||
struct task_struct *kswapd; /* Protected by
|
struct task_struct *kswapd;
|
||||||
mem_hotplug_begin/end() */
|
#ifdef CONFIG_MULTIPLE_KSWAPD
|
||||||
|
/*
|
||||||
|
* Protected by mem_hotplug_begin/end()
|
||||||
|
*/
|
||||||
|
struct task_struct *mkswapd[MAX_KSWAPD_THREADS];
|
||||||
|
#endif
|
||||||
int kswapd_order;
|
int kswapd_order;
|
||||||
enum zone_type kswapd_classzone_idx;
|
enum zone_type kswapd_classzone_idx;
|
||||||
|
|
||||||
@ -957,6 +964,9 @@ static inline int is_highmem(struct zone *zone)
|
|||||||
|
|
||||||
/* These two functions are used to setup the per zone pages min values */
|
/* These two functions are used to setup the per zone pages min values */
|
||||||
struct ctl_table;
|
struct ctl_table;
|
||||||
|
int kswapd_threads_sysctl_handler(struct ctl_table *table, int write,
|
||||||
|
void __user *buffer, size_t *length,
|
||||||
|
loff_t *pos);
|
||||||
int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
|
int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
|
||||||
void __user *, size_t *, loff_t *);
|
void __user *, size_t *, loff_t *);
|
||||||
int watermark_boost_factor_sysctl_handler(struct ctl_table *, int,
|
int watermark_boost_factor_sysctl_handler(struct ctl_table *, int,
|
||||||
|
@ -143,6 +143,8 @@ static int six_hundred_forty_kb = 640 * 1024;
|
|||||||
static unsigned int __maybe_unused half_million = 500000;
|
static unsigned int __maybe_unused half_million = 500000;
|
||||||
static unsigned int __maybe_unused one_hundred_million = 100000000;
|
static unsigned int __maybe_unused one_hundred_million = 100000000;
|
||||||
static unsigned int __maybe_unused one_million = 1000000;
|
static unsigned int __maybe_unused one_million = 1000000;
|
||||||
|
static int __maybe_unused max_kswapd_threads = MAX_KSWAPD_THREADS;
|
||||||
|
|
||||||
#ifdef CONFIG_SCHED_WALT
|
#ifdef CONFIG_SCHED_WALT
|
||||||
static int neg_three = -3;
|
static int neg_three = -3;
|
||||||
static int three = 3;
|
static int three = 3;
|
||||||
@ -1825,6 +1827,17 @@ static struct ctl_table vm_table[] = {
|
|||||||
.proc_handler = watermark_boost_factor_sysctl_handler,
|
.proc_handler = watermark_boost_factor_sysctl_handler,
|
||||||
.extra1 = SYSCTL_ZERO,
|
.extra1 = SYSCTL_ZERO,
|
||||||
},
|
},
|
||||||
|
#ifdef CONFIG_MULTIPLE_KSWAPD
|
||||||
|
{
|
||||||
|
.procname = "kswapd_threads",
|
||||||
|
.data = &kswapd_threads,
|
||||||
|
.maxlen = sizeof(kswapd_threads),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = kswapd_threads_sysctl_handler,
|
||||||
|
.extra1 = SYSCTL_ONE,
|
||||||
|
.extra2 = &max_kswapd_threads,
|
||||||
|
},
|
||||||
|
#endif
|
||||||
{
|
{
|
||||||
.procname = "watermark_scale_factor",
|
.procname = "watermark_scale_factor",
|
||||||
.data = &watermark_scale_factor,
|
.data = &watermark_scale_factor,
|
||||||
|
13
mm/Kconfig
13
mm/Kconfig
@ -861,3 +861,16 @@ config PROCESS_RECLAIM
|
|||||||
(addr, addr + size-bytes) of the process.
|
(addr, addr + size-bytes) of the process.
|
||||||
|
|
||||||
Any other value is ignored.
|
Any other value is ignored.
|
||||||
|
|
||||||
|
config MULTIPLE_KSWAPD
|
||||||
|
bool "Spawn multiple kswapd threads"
|
||||||
|
depends on QGKI
|
||||||
|
default y
|
||||||
|
help
|
||||||
|
kswapd_threads allows you to control the number of kswapd threads
|
||||||
|
per node running on the system. The default value is 1 and the
|
||||||
|
range of acceptable values is 1-16. The number of threads can
|
||||||
|
be controlled by the following command:
|
||||||
|
(echo <num> > /proc/sys/vm/kswapd_threads)
|
||||||
|
|
||||||
|
Values not in the range of 1..16 are ignored.
|
||||||
|
@ -8047,6 +8047,22 @@ int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_MULTIPLE_KSWAPD
|
||||||
|
int kswapd_threads_sysctl_handler(struct ctl_table *table, int write,
|
||||||
|
void __user *buffer, size_t *length, loff_t *ppos)
|
||||||
|
{
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
|
||||||
|
if (rc)
|
||||||
|
return rc;
|
||||||
|
|
||||||
|
if (write)
|
||||||
|
update_kswapd_threads();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
|
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
|
||||||
void __user *buffer, size_t *length, loff_t *ppos)
|
void __user *buffer, size_t *length, loff_t *ppos)
|
||||||
{
|
{
|
||||||
|
128
mm/vmscan.c
128
mm/vmscan.c
@ -139,6 +139,13 @@ struct scan_control {
|
|||||||
struct vm_area_struct *target_vma;
|
struct vm_area_struct *target_vma;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Number of active kswapd threads
|
||||||
|
*/
|
||||||
|
#define DEF_KSWAPD_THREADS_PER_NODE 1
|
||||||
|
int kswapd_threads = DEF_KSWAPD_THREADS_PER_NODE;
|
||||||
|
int kswapd_threads_current = DEF_KSWAPD_THREADS_PER_NODE;
|
||||||
|
|
||||||
#ifdef ARCH_HAS_PREFETCH
|
#ifdef ARCH_HAS_PREFETCH
|
||||||
#define prefetch_prev_lru_page(_page, _base, _field) \
|
#define prefetch_prev_lru_page(_page, _base, _field) \
|
||||||
do { \
|
do { \
|
||||||
@ -4111,6 +4118,116 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
|
|||||||
}
|
}
|
||||||
#endif /* CONFIG_HIBERNATION */
|
#endif /* CONFIG_HIBERNATION */
|
||||||
|
|
||||||
|
#ifdef CONFIG_MULTIPLE_KSWAPD
|
||||||
|
static void update_kswapd_threads_node(int nid)
|
||||||
|
{
|
||||||
|
pg_data_t *pgdat;
|
||||||
|
int drop, increase;
|
||||||
|
int last_idx, start_idx, hid;
|
||||||
|
int nr_threads = kswapd_threads_current;
|
||||||
|
|
||||||
|
pgdat = NODE_DATA(nid);
|
||||||
|
last_idx = nr_threads - 1;
|
||||||
|
if (kswapd_threads < nr_threads) {
|
||||||
|
drop = nr_threads - kswapd_threads;
|
||||||
|
for (hid = last_idx; hid > (last_idx - drop); hid--) {
|
||||||
|
if (pgdat->mkswapd[hid]) {
|
||||||
|
kthread_stop(pgdat->mkswapd[hid]);
|
||||||
|
pgdat->mkswapd[hid] = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
increase = kswapd_threads - nr_threads;
|
||||||
|
start_idx = last_idx + 1;
|
||||||
|
for (hid = start_idx; hid < (start_idx + increase); hid++) {
|
||||||
|
pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat,
|
||||||
|
"kswapd%d:%d", nid, hid);
|
||||||
|
if (IS_ERR(pgdat->mkswapd[hid])) {
|
||||||
|
pr_err("Failed to start kswapd%d on node %d\n",
|
||||||
|
hid, nid);
|
||||||
|
pgdat->mkswapd[hid] = NULL;
|
||||||
|
/*
|
||||||
|
* We are out of resources. Do not start any
|
||||||
|
* more threads.
|
||||||
|
*/
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void update_kswapd_threads(void)
|
||||||
|
{
|
||||||
|
int nid;
|
||||||
|
|
||||||
|
if (kswapd_threads_current == kswapd_threads)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Hold the memory hotplug lock to avoid racing with memory
|
||||||
|
* hotplug initiated updates
|
||||||
|
*/
|
||||||
|
mem_hotplug_begin();
|
||||||
|
for_each_node_state(nid, N_MEMORY)
|
||||||
|
update_kswapd_threads_node(nid);
|
||||||
|
|
||||||
|
pr_info("kswapd_thread count changed, old:%d new:%d\n",
|
||||||
|
kswapd_threads_current, kswapd_threads);
|
||||||
|
kswapd_threads_current = kswapd_threads;
|
||||||
|
mem_hotplug_done();
|
||||||
|
}
|
||||||
|
|
||||||
|
static int multi_kswapd_run(int nid)
|
||||||
|
{
|
||||||
|
pg_data_t *pgdat = NODE_DATA(nid);
|
||||||
|
int hid, nr_threads = kswapd_threads;
|
||||||
|
int ret = 0;
|
||||||
|
|
||||||
|
pgdat->mkswapd[0] = pgdat->kswapd;
|
||||||
|
for (hid = 1; hid < nr_threads; ++hid) {
|
||||||
|
pgdat->mkswapd[hid] = kthread_run(kswapd, pgdat, "kswapd%d:%d",
|
||||||
|
nid, hid);
|
||||||
|
if (IS_ERR(pgdat->mkswapd[hid])) {
|
||||||
|
/* failure at boot is fatal */
|
||||||
|
WARN_ON(system_state < SYSTEM_RUNNING);
|
||||||
|
pr_err("Failed to start kswapd%d on node %d\n",
|
||||||
|
hid, nid);
|
||||||
|
ret = PTR_ERR(pgdat->mkswapd[hid]);
|
||||||
|
pgdat->mkswapd[hid] = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
kswapd_threads_current = nr_threads;
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void multi_kswapd_stop(int nid)
|
||||||
|
{
|
||||||
|
int hid = 0;
|
||||||
|
int nr_threads = kswapd_threads_current;
|
||||||
|
struct task_struct *kswapd;
|
||||||
|
|
||||||
|
NODE_DATA(nid)->mkswapd[hid] = NULL;
|
||||||
|
for (hid = 1; hid < nr_threads; hid++) {
|
||||||
|
kswapd = NODE_DATA(nid)->mkswapd[hid];
|
||||||
|
if (kswapd) {
|
||||||
|
kthread_stop(kswapd);
|
||||||
|
NODE_DATA(nid)->mkswapd[hid] = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Restore the CPU affinity of the additional kswapd threads of @pgdat to
 * @mask. The primary thread in slot 0 is handled by the caller through
 * pgdat->kswapd, so only slots 1..kswapd_threads_current-1 are visited.
 *
 * A slot may be NULL even though it lies below kswapd_threads_current:
 * both multi_kswapd_run() and update_kswapd_threads_node() clear the
 * slot when kthread_run() fails, yet the thread count is still advanced.
 * Such slots must be skipped, as set_cpus_allowed_ptr() needs a valid
 * task.
 */
static void multi_kswapd_cpu_online(pg_data_t *pgdat,
					const struct cpumask *mask)
{
	int hid;
	int nr_threads = kswapd_threads_current;

	for (hid = 1; hid < nr_threads; hid++) {
		struct task_struct *task = pgdat->mkswapd[hid];

		/* thread for this slot was never started or already stopped */
		if (!task)
			continue;

		set_cpus_allowed_ptr(task, mask);
	}
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/* It's optimal to keep kswapds on the same CPUs as their memory, but
|
/* It's optimal to keep kswapds on the same CPUs as their memory, but
|
||||||
not required for correctness. So if the last cpu in a node goes
|
not required for correctness. So if the last cpu in a node goes
|
||||||
away, we get changed to run anywhere: as the first one comes back,
|
away, we get changed to run anywhere: as the first one comes back,
|
||||||
@ -4125,9 +4242,11 @@ static int kswapd_cpu_online(unsigned int cpu)
|
|||||||
|
|
||||||
mask = cpumask_of_node(pgdat->node_id);
|
mask = cpumask_of_node(pgdat->node_id);
|
||||||
|
|
||||||
if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
|
if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) {
|
||||||
/* One of our CPUs online: restore mask */
|
/* One of our CPUs online: restore mask */
|
||||||
set_cpus_allowed_ptr(pgdat->kswapd, mask);
|
set_cpus_allowed_ptr(pgdat->kswapd, mask);
|
||||||
|
multi_kswapd_cpu_online(pgdat, mask);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -4144,14 +4263,17 @@ int kswapd_run(int nid)
|
|||||||
if (pgdat->kswapd)
|
if (pgdat->kswapd)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
|
pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d:0", nid);
|
||||||
if (IS_ERR(pgdat->kswapd)) {
|
if (IS_ERR(pgdat->kswapd)) {
|
||||||
/* failure at boot is fatal */
|
/* failure at boot is fatal */
|
||||||
BUG_ON(system_state < SYSTEM_RUNNING);
|
BUG_ON(system_state < SYSTEM_RUNNING);
|
||||||
pr_err("Failed to start kswapd on node %d\n", nid);
|
pr_err("Failed to start kswapd on node %d\n", nid);
|
||||||
ret = PTR_ERR(pgdat->kswapd);
|
ret = PTR_ERR(pgdat->kswapd);
|
||||||
pgdat->kswapd = NULL;
|
pgdat->kswapd = NULL;
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
ret = multi_kswapd_run(nid);
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -4167,6 +4289,8 @@ void kswapd_stop(int nid)
|
|||||||
kthread_stop(kswapd);
|
kthread_stop(kswapd);
|
||||||
NODE_DATA(nid)->kswapd = NULL;
|
NODE_DATA(nid)->kswapd = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
multi_kswapd_stop(nid);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int __init kswapd_init(void)
|
static int __init kswapd_init(void)
|
||||||
|
Loading…
Reference in New Issue
Block a user