From adf5091e6ccaa02905e7a28f9ff44f46c7f4c230 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 28 Jun 2012 11:20:21 -0700 Subject: [PATCH 01/16] rcu: New rcu_user_enter() and rcu_user_exit() APIs RCU currently insists that only idle tasks can enter RCU idle mode, which prohibits an adaptive tickless kernel (AKA nohz cpusets), which in turn would mean that usermode execution would always take scheduling-clock interrupts, even when there is only one task runnable on the CPU in question. This commit therefore adds rcu_user_enter() and rcu_user_exit(), which allow non-idle tasks to enter RCU idle mode. These are quite similar to rcu_idle_enter() and rcu_idle_exit(), respectively, except that they omit the idle-task checks. [ Updated to use "user" flag rather than separate check functions. ] [ paulmck: Updated to drop exports of new functions based on Josh's patch getting rid of the need for them. ] Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Daniel Lezcano Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 2 + kernel/rcutree.c | 141 +++++++++++++++++++++++++++++---------- 2 files changed, 106 insertions(+), 37 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0fbbd52e01f9..d8b20bfd4795 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -191,6 +191,8 @@ extern void rcu_idle_enter(void); extern void rcu_idle_exit(void); extern void rcu_irq_enter(void); extern void rcu_irq_exit(void); +extern void rcu_user_enter(void); +extern void rcu_user_exit(void); extern void exit_rcu(void); /** diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7387e46009d9..af0dc3472a4b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -322,16 +322,17 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) } /* - * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle + * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state * * If the new value of the ->dynticks_nesting counter now is zero, * we really have entered idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) +static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, + bool user) { trace_rcu_dyntick("Start", oldval, 0); - if (!is_idle_task(current)) { + if (!is_idle_task(current) && !user) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); @@ -348,7 +349,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); /* - * The idle task is not permitted to enter the idle loop while + * It is illegal to enter an extended quiescent state while * in an RCU read-side critical section. */ rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), @@ -359,6 +360,28 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) "Illegal idle entry in RCU-sched read-side critical section."); } +/* + * Enter an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. 
+ */ +static void rcu_eqs_enter(bool user) +{ + unsigned long flags; + long long oldval; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); + if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + rdtp->dynticks_nesting = 0; + else + rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; + rcu_eqs_enter_common(rdtp, oldval, user); + local_irq_restore(flags); +} + /** * rcu_idle_enter - inform RCU that current CPU is entering idle * @@ -373,23 +396,35 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) */ void rcu_idle_enter(void) { - unsigned long flags; - long long oldval; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); - if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) - rdtp->dynticks_nesting = 0; - else - rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - rcu_idle_enter_common(rdtp, oldval); - local_irq_restore(flags); + rcu_eqs_enter(0); } EXPORT_SYMBOL_GPL(rcu_idle_enter); +/** + * rcu_user_enter - inform RCU that we are resuming userspace. + * + * Enter RCU idle mode right before resuming userspace. No use of RCU + * is permitted between this call and rcu_user_exit(). This way the + * CPU doesn't need to maintain the tick for RCU maintenance purposes + * when the CPU runs in userspace. + */ +void rcu_user_enter(void) +{ + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + rcu_eqs_enter(1); +} + + /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * @@ -420,18 +455,19 @@ void rcu_irq_exit(void) if (rdtp->dynticks_nesting) trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); else - rcu_idle_enter_common(rdtp, oldval); + rcu_eqs_enter_common(rdtp, oldval, 1); local_irq_restore(flags); } /* - * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle + * rcu_eqs_exit_common - current CPU moving away from extended quiescent state * * If the new value of the ->dynticks_nesting counter was previously zero, * we really have exited idle, and must do the appropriate accounting. * The caller must have disabled interrupts. */ -static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) +static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, + int user) { smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. 
*/ atomic_inc(&rdtp->dynticks); @@ -440,7 +476,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (!is_idle_task(current)) { + if (!is_idle_task(current) && !user) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on exit: not idle task", @@ -452,6 +488,28 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) } } +/* + * Exit an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. + */ +static void rcu_eqs_exit(bool user) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + long long oldval; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + oldval = rdtp->dynticks_nesting; + WARN_ON_ONCE(oldval < 0); + if (oldval & DYNTICK_TASK_NEST_MASK) + rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; + else + rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rcu_eqs_exit_common(rdtp, oldval, user); + local_irq_restore(flags); +} + /** * rcu_idle_exit - inform RCU that current CPU is leaving idle * @@ -465,23 +523,32 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) */ void rcu_idle_exit(void) { - unsigned long flags; - struct rcu_dynticks *rdtp; - long long oldval; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE(oldval < 0); - if (oldval & DYNTICK_TASK_NEST_MASK) - rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - else - rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_idle_exit_common(rdtp, oldval); - local_irq_restore(flags); + rcu_eqs_exit(0); } EXPORT_SYMBOL_GPL(rcu_idle_exit); +/** + * rcu_user_exit - inform RCU that we are exiting userspace. + * + * Exit RCU idle mode while entering the kernel because it can + * run a RCU read side critical section anytime. + */ +void rcu_user_exit(void) +{ + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + rcu_eqs_exit(1); +} + /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * @@ -515,7 +582,7 @@ void rcu_irq_enter(void) if (oldval) trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); else - rcu_idle_exit_common(rdtp, oldval); + rcu_eqs_exit_common(rdtp, oldval, 1); local_irq_restore(flags); } From 19dd1591fc379f1d89f39cd99cbbe97433baa3c3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 4 Jun 2012 16:42:35 -0700 Subject: [PATCH 02/16] rcu: New rcu_user_enter_after_irq() and rcu_user_exit_after_irq() APIs In some cases, it is necessary to enter or exit userspace-RCU-idle mode from an interrupt handler, for example, if some other CPU sends this CPU a resched IPI. In this case, the current CPU would enter the IPI handler in userspace-RCU-idle mode, but would need to exit the IPI handler after having exited that mode. To allow this to work, this commit adds two new APIs to TREE_RCU: - rcu_user_enter_after_irq(). This must be called from an interrupt between rcu_irq_enter() and rcu_irq_exit(). 
After the irq calls rcu_irq_exit(), the irq handler will return into an RCU extended quiescent state. In theory, this interrupt is never a nested interrupt, but in practice it might interrupt softirq, which looks to RCU like a nested interrupt. - rcu_user_exit_after_irq(). This must be called from a non-nesting interrupt, interrupting an RCU extended quiescent state, also between rcu_irq_enter() and rcu_irq_exit(). After the irq calls rcu_irq_exit(), the irq handler will return in an RCU non-quiescent state. [ Combined with "Allow calls to rcu_exit_user_irq from nesting irqs." ] Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 2 ++ kernel/rcutree.c | 43 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d8b20bfd4795..f818dd165b44 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -193,6 +193,8 @@ extern void rcu_irq_enter(void); extern void rcu_irq_exit(void); extern void rcu_user_enter(void); extern void rcu_user_exit(void); +extern void rcu_user_enter_after_irq(void); +extern void rcu_user_exit_after_irq(void); extern void exit_rcu(void); /** diff --git a/kernel/rcutree.c b/kernel/rcutree.c index af0dc3472a4b..4138f59fa2f4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -425,6 +425,27 @@ void rcu_user_enter(void) } +/** + * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace + * after the current irq returns. + * + * This is similar to rcu_user_enter() but in the context of a non-nesting + * irq. After this call, RCU enters into idle mode when the interrupt + * returns. + */ +void rcu_user_enter_after_irq(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + /* Ensure this irq is interrupting a non-idle RCU state. */ + WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); + rdtp->dynticks_nesting = 1; + local_irq_restore(flags); +} + /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * @@ -549,6 +570,28 @@ void rcu_user_exit(void) rcu_eqs_exit(1); } +/** + * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace + * idle mode after the current non-nesting irq returns. + * + * This is similar to rcu_user_exit() but in the context of an irq. + * This is called when the irq has interrupted a userspace RCU idle mode + * context. When the current non-nesting interrupt returns after this call, + * the CPU won't restore the RCU idle mode. + */ +void rcu_user_exit_after_irq(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + /* Ensure we are interrupting an RCU idle mode. */ + WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); + rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; + local_irq_restore(flags); +} + /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * From 9a0c6fef423528ba5b62aa31b29aabf689eb8f70 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Jun 2012 12:33:51 -0700 Subject: [PATCH 03/16] rcu: Make RCU_FAST_NO_HZ handle adaptive ticks The current implementation of RCU_FAST_NO_HZ tries reasonably hard to rid the current CPU of RCU callbacks. 
This is appropriate when the CPU is entering idle, where it doesn't have much useful to do anyway, but is most definitely not what you want when transitioning to user-mode execution. This commit therefore detects the adaptive-tick case, and refrains from burning CPU time getting rid of RCU callbacks in that case. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 9c71c1b18e03..f92115488187 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1757,6 +1757,26 @@ static void rcu_prepare_for_idle(int cpu) if (!tne) return; + /* Adaptive-tick mode, where usermode execution is idle to RCU. */ + if (!is_idle_task(current)) { + rdtp->dyntick_holdoff = jiffies - 1; + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + trace_rcu_prep_idle("User dyntick with callbacks"); + rdtp->idle_gp_timer_expires = + round_up(jiffies + RCU_IDLE_GP_DELAY, + RCU_IDLE_GP_DELAY); + } else if (rcu_cpu_has_callbacks(cpu)) { + rdtp->idle_gp_timer_expires = + round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); + trace_rcu_prep_idle("User dyntick with lazy callbacks"); + } else { + return; + } + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + return; + } + /* * If this is an idle re-entry, for example, due to use of * RCU_NONIDLE() or the new idle-loop tracing API within the idle From 2b1d5024e17be459aa6385763ca3faa8f01c52d9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:30 +0200 Subject: [PATCH 04/16] rcu: Settle config for userspace extended quiescent state Create a new config option under the RCU menu that put CPUs under RCU extended quiescent state (as in dynticks idle mode) when they run in userspace. This require some contribution from architectures to hook into kernel and userspace boundaries. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/Kconfig | 10 ++++++++++ include/linux/rcupdate.h | 9 +++++++++ init/Kconfig | 10 ++++++++++ kernel/rcutree.c | 5 ++++- 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/Kconfig b/arch/Kconfig index 72f2fa189cc5..1401a7587973 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -281,4 +281,14 @@ config SECCOMP_FILTER See Documentation/prctl/seccomp_filter.txt for details. +config HAVE_RCU_USER_QS + bool + help + Provide kernel entry/exit hooks necessary for userspace + RCU extended quiescent state. Syscalls need to be wrapped inside + rcu_user_exit()-rcu_user_enter() through the slow path using + TIF_NOHZ flag. Exceptions handlers must be wrapped as well. Irqs + are already protected inside rcu_irq_enter/rcu_irq_exit() but + preemption or signal handling on irq exit still need to be protected. 
+ source "kernel/gcov/Kconfig" diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f818dd165b44..f5034f22e94b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -191,10 +191,19 @@ extern void rcu_idle_enter(void); extern void rcu_idle_exit(void); extern void rcu_irq_enter(void); extern void rcu_irq_exit(void); + +#ifdef CONFIG_RCU_USER_QS extern void rcu_user_enter(void); extern void rcu_user_exit(void); extern void rcu_user_enter_after_irq(void); extern void rcu_user_exit_after_irq(void); +#else +static inline void rcu_user_enter(void) { } +static inline void rcu_user_exit(void) { } +static inline void rcu_user_enter_after_irq(void) { } +static inline void rcu_user_exit_after_irq(void) { } +#endif /* CONFIG_RCU_USER_QS */ + extern void exit_rcu(void); /** diff --git a/init/Kconfig b/init/Kconfig index af6c7f8ba019..f6a1830165ce 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -441,6 +441,16 @@ config PREEMPT_RCU This option enables preemptible-RCU code that is common between the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations. +config RCU_USER_QS + bool "Consider userspace as in RCU extended quiescent state" + depends on HAVE_RCU_USER_QS && SMP + help + This option sets hooks on kernel / userspace boundaries and + puts RCU in extended quiescent state when the CPU runs in + userspace. It means that when a CPU runs in userspace, it is + excluded from the global RCU state machine and thus doesn't + to keep the timer tick on for RCU. + config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4138f59fa2f4..79fa2db1595b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -400,6 +400,7 @@ void rcu_idle_enter(void) } EXPORT_SYMBOL_GPL(rcu_idle_enter); +#ifdef CONFIG_RCU_USER_QS /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -424,7 +425,6 @@ void rcu_user_enter(void) rcu_eqs_enter(1); } - /** * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace * after the current irq returns. @@ -445,6 +445,7 @@ void rcu_user_enter_after_irq(void) rdtp->dynticks_nesting = 1; local_irq_restore(flags); } +#endif /* CONFIG_RCU_USER_QS */ /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -548,6 +549,7 @@ void rcu_idle_exit(void) } EXPORT_SYMBOL_GPL(rcu_idle_exit); +#ifdef CONFIG_RCU_USER_QS /** * rcu_user_exit - inform RCU that we are exiting userspace. * @@ -591,6 +593,7 @@ void rcu_user_exit_after_irq(void) rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; local_irq_restore(flags); } +#endif /* CONFIG_RCU_USER_QS */ /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle From c5d900bf676b1e2a61c44483932c8088651bbb4e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:31 +0200 Subject: [PATCH 05/16] rcu: Allow rcu_user_enter()/exit() to nest Allow calls to rcu_user_enter() even if we are already in userspace (as seen by RCU) and allow calls to rcu_user_exit() even if we are already in the kernel. This makes the APIs more flexible to be called from architectures. Exception entries for example won't need to know if they come from userspace before calling rcu_user_exit(). Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 41 +++++++++++++++++++++++++++++++++-------- kernel/rcutree.h | 3 +++ 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 79fa2db1595b..d62c04482228 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -366,11 +366,9 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, */ static void rcu_eqs_enter(bool user) { - unsigned long flags; long long oldval; struct rcu_dynticks *rdtp; - local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); @@ -379,7 +377,6 @@ static void rcu_eqs_enter(bool user) else rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; rcu_eqs_enter_common(rdtp, oldval, user); - local_irq_restore(flags); } /** @@ -396,7 +393,11 @@ static void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { + unsigned long flags; + + local_irq_save(flags); rcu_eqs_enter(0); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -411,6 +412,9 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ void rcu_user_enter(void) { + unsigned long flags; + struct rcu_dynticks *rdtp; + /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: @@ -422,7 +426,15 @@ void rcu_user_enter(void) if (in_interrupt()) return; - rcu_eqs_enter(1); + WARN_ON_ONCE(!current->mm); + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + if (!rdtp->in_user) { + rdtp->in_user = true; + rcu_eqs_enter(1); + } + local_irq_restore(flags); } /** @@ -516,11 +528,9 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, */ static void rcu_eqs_exit(bool user) { - unsigned long flags; struct rcu_dynticks *rdtp; long long oldval; - local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(oldval < 0); @@ -529,7 +539,6 @@ static void rcu_eqs_exit(bool user) else rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_eqs_exit_common(rdtp, oldval, user); - local_irq_restore(flags); } /** @@ -545,7 +554,11 @@ static void rcu_eqs_exit(bool user) */ void rcu_idle_exit(void) { + unsigned long flags; + + local_irq_save(flags); rcu_eqs_exit(0); + local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -558,6 +571,9 @@ EXPORT_SYMBOL_GPL(rcu_idle_exit); */ void rcu_user_exit(void) { + unsigned long flags; + struct rcu_dynticks *rdtp; + /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: @@ -569,7 +585,13 @@ void rcu_user_exit(void) if (in_interrupt()) return; - rcu_eqs_exit(1); + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + if (rdtp->in_user) { + rdtp->in_user = false; + rcu_eqs_exit(1); + } + local_irq_restore(flags); } /** @@ -2586,6 +2608,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); +#ifdef CONFIG_RCU_USER_QS + WARN_ON_ONCE(rdp->dynticks->in_user); +#endif rdp->cpu = cpu; rdp->rsp = rsp; raw_spin_unlock_irqrestore(&rnp->lock, flags); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7576fd4d8ce6..10cc2f9f8433 100644 --- 
a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -102,6 +102,9 @@ struct rcu_dynticks { /* idle-period nonlazy_posted snapshot. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ +#ifdef CONFIG_RCU_USER_QS + bool in_user; /* Is the CPU in userland from RCU POV? */ +#endif }; /* RCU's kthread states for tracing. */ From 1e1a689f10a27a4fe1ab9b4c6db04fa7232746a5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:32 +0200 Subject: [PATCH 06/16] rcu: Ignore userspace extended quiescent state by default By default we don't want to enter into RCU extended quiescent state while in userspace because doing this produces some overhead (eg: use of syscall slowpath). Set it off by default and ready to run when some feature like adaptive tickless need it. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 5 ++++- kernel/rcutree.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d62c04482228..6b82a9565149 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -206,6 +206,9 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), +#ifdef CONFIG_RCU_USER_QS + .ignore_user_qs = true, +#endif }; static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ @@ -430,7 +433,7 @@ void rcu_user_enter(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); - if (!rdtp->in_user) { + if (!rdtp->ignore_user_qs && !rdtp->in_user) { rdtp->in_user = true; rcu_eqs_enter(1); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 10cc2f9f8433..5faf05d68326 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -103,6 +103,7 @@ struct rcu_dynticks { int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ #ifdef CONFIG_RCU_USER_QS + bool ignore_user_qs; /* Treat userspace as extended QS or not */ bool in_user; /* Is the CPU in userland from RCU POV? */ #endif }; From 04e7e951532b390b16feb070be9972b8fad2fc57 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 16 Jul 2012 15:06:40 -0700 Subject: [PATCH 07/16] rcu: Switch task's syscall hooks on context switch Clear the syscalls hook of a task when it's scheduled out so that if the task migrates, it doesn't run the syscall slow path on a CPU that might not need it. Also set the syscalls hook on the next task if needed. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- arch/um/drivers/mconsole_kern.c | 1 + include/linux/rcupdate.h | 2 ++ include/linux/sched.h | 8 ++++++++ kernel/rcutree.c | 15 +++++++++++++++ kernel/sched/core.c | 1 + 5 files changed, 27 insertions(+) diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 664a60e8dfb4..c17de0db6736 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -705,6 +705,7 @@ static void stack_proc(void *arg) struct task_struct *from = current, *to = arg; to->thread.saved_task = from; + rcu_switch(from, to); switch_to(from, to, from); } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f5034f22e94b..7c968e4f929e 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -197,6 +197,8 @@ extern void rcu_user_enter(void); extern void rcu_user_exit(void); extern void rcu_user_enter_after_irq(void); extern void rcu_user_exit_after_irq(void); +extern void rcu_user_hooks_switch(struct task_struct *prev, + struct task_struct *next); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 23bddac4bad8..335720a1fc33 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1885,6 +1885,14 @@ static inline void rcu_copy_process(struct task_struct *p) #endif +static inline void rcu_switch(struct task_struct *prev, + struct task_struct *next) +{ +#ifdef CONFIG_RCU_USER_QS + rcu_user_hooks_switch(prev, next); +#endif +} + static inline void tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags) { diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6b82a9565149..d2e74c8d4b0e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -717,6 +717,21 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); +#ifdef CONFIG_RCU_USER_QS +void rcu_user_hooks_switch(struct task_struct *prev, + struct task_struct *next) +{ + struct rcu_dynticks *rdtp; + + /* Interrupts are disabled in context switch */ + rdtp = &__get_cpu_var(rcu_dynticks); + if (!rdtp->ignore_user_qs) { + clear_tsk_thread_flag(prev, TIF_NOHZ); + set_tsk_thread_flag(next, TIF_NOHZ); + } +} +#endif /* #ifdef CONFIG_RCU_USER_QS */ + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1a48cdbc8631..ea2213b07d9d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2081,6 +2081,7 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. */ + rcu_switch(prev, next); switch_to(prev, next, prev); barrier(); From bf5a3c13b939813d28ce26c01425054c740d6731 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:34 +0200 Subject: [PATCH 08/16] x86: Syscall hooks for userspace RCU extended QS Add syscall slow path hooks to notify syscall entry and exit on CPUs that want to support userspace RCU extended quiescent state. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- arch/x86/include/asm/thread_info.h | 10 +++++++--- arch/x86/kernel/ptrace.c | 5 +++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 89f794f007ec..c535d847e3b5 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -89,6 +89,7 @@ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_FORK 18 /* ret_from_fork */ +#define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -114,6 +115,7 @@ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) +#define _TIF_NOHZ (1 << TIF_NOHZ) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) @@ -126,12 +128,13 @@ struct thread_info { /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT | \ - _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_NOHZ) /* work to do in syscall_trace_leave() */ #define _TIF_WORK_SYSCALL_EXIT \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_TRACEPOINT) + _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ @@ -141,7 +144,8 @@ struct thread_info { /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ - ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT) + ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_NOHZ) /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index c4c6a5c2bf0f..9f94f8ec26e4 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -1463,6 +1464,8 @@ long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; + rcu_user_exit(); + /* * If we stepped into a sysenter/syscall insn, it trapped in * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. @@ -1526,4 +1529,6 @@ void syscall_trace_leave(struct pt_regs *regs) !test_thread_flag(TIF_SYSCALL_EMU); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) tracehook_report_syscall_exit(regs, step); + + rcu_user_enter(); } From ef3f628872c838933a279d0d7e63e707783c9710 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 24 Sep 2012 21:05:52 +0200 Subject: [PATCH 09/16] x86: Unspaghettize do_general_protection() There is some unnatural label based layout in this function. Convert the unnecessary goto to readable conditional blocks. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: H. 
Peter Anvin Cc: Ingo Molnar --- arch/x86/kernel/traps.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b481341c9369..74850e559449 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -258,13 +258,25 @@ do_general_protection(struct pt_regs *regs, long error_code) conditional_sti(regs); #ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto gp_in_vm86; + if (regs->flags & X86_VM_MASK) { + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + return; + } #endif tsk = current; - if (!user_mode(regs)) - goto gp_in_kernel; + if (!user_mode(regs)) { + if (fixup_exception(regs)) + return; + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = X86_TRAP_GP; + if (!notify_die(DIE_GPF, "general protection fault", regs, error_code, + X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) + die("general protection fault", regs, error_code); + return; + } tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; @@ -280,24 +292,6 @@ do_general_protection(struct pt_regs *regs, long error_code) force_sig(SIGSEGV, tsk); return; - -#ifdef CONFIG_X86_32 -gp_in_vm86: - local_irq_enable(); - handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - return; -#endif - -gp_in_kernel: - if (fixup_exception(regs)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = X86_TRAP_GP; - if (notify_die(DIE_GPF, "general protection fault", regs, error_code, - X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); } /* May run on IST stack. */ From 6ba3c97a38803883c2eee489505796cb0a727122 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:35 +0200 Subject: [PATCH 10/16] x86: Exception hooks for userspace RCU extended QS Add necessary hooks to x86 exception for userspace RCU extended quiescent state support. This includes traps, page fault, debug exceptions, etc... Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. 
McKenney --- arch/x86/include/asm/rcu.h | 20 ++++++++++ arch/x86/kernel/traps.c | 81 ++++++++++++++++++++++++++------------ arch/x86/mm/fault.c | 13 +++++- 3 files changed, 86 insertions(+), 28 deletions(-) create mode 100644 arch/x86/include/asm/rcu.h diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/rcu.h new file mode 100644 index 000000000000..439815b35ced --- /dev/null +++ b/arch/x86/include/asm/rcu.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_RCU_H +#define _ASM_X86_RCU_H + +#include +#include + +static inline void exception_enter(struct pt_regs *regs) +{ + rcu_user_exit(); +} + +static inline void exception_exit(struct pt_regs *regs) +{ +#ifdef CONFIG_RCU_USER_QS + if (user_mode(regs)) + rcu_user_enter(); +#endif +} + +#endif diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 74850e559449..378967578f22 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -55,6 +55,7 @@ #include #include #include +#include #include @@ -180,11 +181,15 @@ vm86_trap: #define DO_ERROR(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + exception_enter(regs); \ + if (notify_die(DIE_TRAP, str, regs, error_code, \ + trapnr, signr) == NOTIFY_STOP) { \ + exception_exit(regs); \ return; \ + } \ conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, NULL); \ + exception_exit(regs); \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ @@ -195,11 +200,15 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ + exception_enter(regs); \ + if (notify_die(DIE_TRAP, str, regs, error_code, \ + trapnr, signr) == NOTIFY_STOP) { \ + exception_exit(regs); \ return; \ + } \ conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, &info); \ + exception_exit(regs); \ } DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, @@ -222,12 +231,14 @@ DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, /* Runs on IST stack */ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) { + exception_enter(regs); if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - X86_TRAP_SS, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); + X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { + preempt_conditional_sti(regs); + do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); + } + exception_exit(regs); } dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) @@ -235,6 +246,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) static const char str[] = "double fault"; struct task_struct *tsk = current; + exception_enter(regs); /* Return not checked because double check cannot be ignored */ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -255,27 +267,28 @@ do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; + exception_enter(regs); conditional_sti(regs); #ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - return; + goto exit; } 
#endif tsk = current; if (!user_mode(regs)) { if (fixup_exception(regs)) - return; + goto exit; tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; - if (!notify_die(DIE_GPF, "general protection fault", regs, error_code, - X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) + if (notify_die(DIE_GPF, "general protection fault", regs, error_code, + X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) die("general protection fault", regs, error_code); - return; + goto exit; } tsk->thread.error_code = error_code; @@ -291,7 +304,8 @@ do_general_protection(struct pt_regs *regs, long error_code) } force_sig(SIGSEGV, tsk); - return; +exit: + exception_exit(regs); } /* May run on IST stack. */ @@ -306,15 +320,16 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co ftrace_int3_handler(regs)) return; #endif + exception_enter(regs); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) - return; + goto exit; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) - return; + goto exit; /* * Let others (NMI) know that the debug stack is in use @@ -325,6 +340,8 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); debug_stack_usage_dec(); +exit: + exception_exit(regs); } #ifdef CONFIG_X86_64 @@ -385,6 +402,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) unsigned long dr6; int si_code; + exception_enter(regs); + get_debugreg(dr6, 6); /* Filter out all the reserved bits which are preset to 1 */ @@ -400,7 +419,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) /* Catch kmemcheck conditions first of all! 
*/ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) - return; + goto exit; /* DR6 may or may not be cleared by the CPU */ set_debugreg(0, 6); @@ -415,7 +434,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, SIGTRAP) == NOTIFY_STOP) - return; + goto exit; /* * Let others (NMI) know that the debug stack is in use @@ -431,7 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) X86_TRAP_DB); preempt_conditional_cli(regs); debug_stack_usage_dec(); - return; + goto exit; } /* @@ -452,7 +471,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) preempt_conditional_cli(regs); debug_stack_usage_dec(); - return; +exit: + exception_exit(regs); } /* @@ -549,14 +569,17 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) #ifdef CONFIG_X86_32 ignore_fpu_irq = 1; #endif - + exception_enter(regs); math_error(regs, error_code, X86_TRAP_MF); + exception_exit(regs); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { + exception_enter(regs); math_error(regs, error_code, X86_TRAP_XF); + exception_exit(regs); } dotraplinkage void @@ -623,6 +646,7 @@ EXPORT_SYMBOL_GPL(math_state_restore); dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { + exception_enter(regs); #ifdef CONFIG_MATH_EMULATION if (read_cr0() & X86_CR0_EM) { struct math_emu_info info = { }; @@ -631,6 +655,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); + exception_exit(regs); return; } #endif @@ -638,12 +663,15 @@ do_device_not_available(struct pt_regs *regs, long error_code) #ifdef CONFIG_X86_32 conditional_sti(regs); #endif + exception_exit(regs); } #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; + + exception_enter(regs); local_irq_enable(); info.si_signo = SIGILL; @@ -651,10 +679,11 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) info.si_code = ILL_BADSTK; info.si_addr = NULL; if (notify_die(DIE_TRAP, "iret exception", regs, error_code, - X86_TRAP_IRET, SIGILL) == NOTIFY_STOP) - return; - do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, - &info); + X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { + do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, + &info); + } + exception_exit(regs); } #endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 76dcd9d8e0bc..7dde46d68a25 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,6 +18,7 @@ #include /* pgd_*(), ... */ #include /* kmemcheck_*(), ... */ #include /* VSYSCALL_START */ +#include /* exception_enter(), ... */ /* * Page fault error code bits: @@ -1000,8 +1001,8 @@ static int fault_in_kernel_space(unsigned long address) * and the problem, and then passes it off to one of the appropriate * routines. 
*/ -dotraplinkage void __kprobes -do_page_fault(struct pt_regs *regs, unsigned long error_code) +static void __kprobes +__do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -1209,3 +1210,11 @@ good_area: up_read(&mm->mmap_sem); } + +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + exception_enter(regs); + __do_page_fault(regs, error_code); + exception_exit(regs); +} From 90a340ed53f0f3bcc3fdf1b2cff56c0e4e911d01 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:36 +0200 Subject: [PATCH 11/16] rcu: Exit RCU extended QS on kernel preemption after irq/exception When an exception or an irq exits, and we are going to resume into interrupted kernel code, the low level architecture code calls preempt_schedule_irq() if there is a need to reschedule. If the interrupt/exception occured between a call to rcu_user_enter() (from syscall exit, exception exit, do_notify_resume exit, ...) and a real resume to userspace (iret,...), preempt_schedule_irq() can be called whereas RCU thinks we are in userspace. But preempt_schedule_irq() is going to run kernel code and may be some RCU read side critical section. We must exit the userspace extended quiescent state before we call it. To solve this, just call rcu_user_exit() in the beginning of preempt_schedule_irq(). Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ea2213b07d9d..4adcd237c545 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3570,6 +3570,7 @@ asmlinkage void __sched preempt_schedule_irq(void) /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); + rcu_user_exit(); do { add_preempt_count(PREEMPT_ACTIVE); local_irq_enable(); From 20ab65e33f469c35f3dabde3445b668aa9c943ee Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:37 +0200 Subject: [PATCH 12/16] rcu: Exit RCU extended QS on user preemption When exceptions or irq are about to resume userspace, if the task needs to be rescheduled, the arch low level code calls schedule() directly. If we call it, it is because we have the TIF_RESCHED flag: - It can be set after random local calls to set_need_resched() (RCU, drm, ...) - A wake up happened and the CPU needs preemption. This can happen in several ways: * Remotely: the remote waking CPU has set TIF_RESCHED and send the wakee an IPI to schedule the new task. * Remotely enqueued: the remote waking CPU sends an IPI to the target and the wake up is made by the target. * Locally: waking CPU == wakee CPU and the wakeup is done locally. set_need_resched() is called without IPI. In the case of local and remotely enqueued wake ups, the tick can be restarted when we enqueue the new task and RCU can exit the extended quiescent state at the same time. Then by the time we reach irq exit path and we call schedule, we are not in RCU user mode. 
But if we call schedule() only because something called set_need_resched(), RCU may still be in user mode when we reach schedule. Also if a wake up is done remotely, the CPU might see the TIF_RESCHED flag and call schedule while the IPI has not yet happen to restart the tick and exit RCU user mode. We need to manually protect against these corner cases. Create a new API schedule_user() that calls schedule() inside rcu_user_exit()-rcu_user_enter() in order to protect it. Archs will need to rely on it now to implement user preemption safely. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/sched/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4adcd237c545..3c4dec0594d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3469,6 +3469,21 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); +#ifdef CONFIG_RCU_USER_QS +asmlinkage void __sched schedule_user(void) +{ + /* + * If we come here after a random call to set_need_resched(), + * or we have been woken up remotely but the IPI has not yet arrived, + * we haven't yet exited the RCU idle mode. Do it here manually until + * we find a better solution. + */ + rcu_user_exit(); + schedule(); + rcu_user_enter(); +} +#endif + /** * schedule_preempt_disabled - called with preemption disabled * From 0430499ce9d78691f3985962021b16bf8f8a8048 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:38 +0200 Subject: [PATCH 13/16] x86: Use the new schedule_user API on userspace preemption This way we can exit the RCU extended quiescent state before we schedule a new task from irq/exception exit. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/x86/include/asm/rcu.h | 12 ++++++++++++ arch/x86/kernel/entry_64.S | 9 +++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/rcu.h index 439815b35ced..d1ac07a23979 100644 --- a/arch/x86/include/asm/rcu.h +++ b/arch/x86/include/asm/rcu.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_RCU_H #define _ASM_X86_RCU_H +#ifndef __ASSEMBLY__ + #include #include @@ -17,4 +19,14 @@ static inline void exception_exit(struct pt_regs *regs) #endif } +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_RCU_USER_QS +# define SCHEDULE_USER call schedule_user +#else +# define SCHEDULE_USER call schedule +#endif + +#endif /* !__ASSEMBLY__ */ + #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 69babd8c834f..1a8f3cbb6ee3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,6 +56,7 @@ #include #include #include +#include #include /* Avoid __ASSEMBLER__'ifying just for this. 
*/ @@ -565,7 +566,7 @@ sysret_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi - call schedule + SCHEDULE_USER popq_cfi %rdi jmp sysret_check @@ -678,7 +679,7 @@ int_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi - call schedule + SCHEDULE_USER popq_cfi %rdi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -974,7 +975,7 @@ retint_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) pushq_cfi %rdi - call schedule + SCHEDULE_USER popq_cfi %rdi GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) @@ -1449,7 +1450,7 @@ paranoid_userspace: paranoid_schedule: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) - call schedule + SCHEDULE_USER DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF jmp paranoid_userspace From edf55fda35c7dc7f2d9241c3abaddaf759b457c6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:39 +0200 Subject: [PATCH 14/16] x86: Exit RCU extended QS on notify resume do_notify_resume() may be called on irq or exception exit. But at that time the exception has already called rcu_user_enter() and the irq has already called rcu_irq_exit(). Since it can use RCU read side critical section, we must call rcu_user_exit() before doing anything there. Then we must call back rcu_user_enter() after this function because we know we are going to userspace from there. This complete support for userspace RCU extended quiescent state in x86-64. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/x86/Kconfig | 1 + arch/x86/kernel/signal.c | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 50a1d1f9b6d3..20c49b8450b8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -97,6 +97,7 @@ config X86 select KTIME_SCALAR if X86_32 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER + select HAVE_RCU_USER_QS if X86_64 config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS || UPROBES) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b280908a376e..bca0ab903e57 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -779,6 +779,8 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { + rcu_user_exit(); + #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) @@ -804,6 +806,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ + + rcu_user_enter(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) From 1fd2b4425a5702c112b441e20b250ac8833a9608 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Jul 2012 20:26:40 +0200 Subject: [PATCH 15/16] rcu: Userspace RCU extended QS selftest Provide a config option that enables the userspace RCU extended quiescent state on every CPUs by default. This is for testing purpose. Signed-off-by: Frederic Weisbecker Cc: Alessio Igor Bogani Cc: Andrew Morton Cc: Avi Kivity Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Josh Triplett Cc: Kevin Hilman Cc: Max Krasnyansky Cc: Peter Zijlstra Cc: Stephen Hemminger Cc: Steven Rostedt Cc: Sven-Thorsten Dietrich Cc: Thomas Gleixner Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- init/Kconfig | 8 ++++++++ kernel/rcutree.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index f6a1830165ce..c26b8a1d2b57 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -451,6 +451,14 @@ config RCU_USER_QS excluded from the global RCU state machine and thus doesn't to keep the timer tick on for RCU. +config RCU_USER_QS_FORCE + bool "Force userspace extended QS by default" + depends on RCU_USER_QS + help + Set the hooks in user/kernel boundaries by default in order to + test this feature that treats userspace as an extended quiescent + state until we have a real user like a full adaptive nohz option. + config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" range 2 64 if 64BIT diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d2e74c8d4b0e..812d04b6b395 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -206,7 +206,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), -#ifdef CONFIG_RCU_USER_QS +#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE) .ignore_user_qs = true, #endif }; From cb349ca95407cbc11424d5e9fc7c8e700709041b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 4 Sep 2012 17:35:31 -0700 Subject: [PATCH 16/16] rcu: Apply micro-optimization and int/bool fixes to RCU's idle handling Checking "user" before "is_idle_task()" allows better optimizations in cases where inlining is possible. Also, "bool" should be passed "true" or "false" rather than "1" or "0". This commit therefore makes these changes, as noted in Josh's review. Reported-by: Josh Triplett Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 812d04b6b395..4fb2376ddf06 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -335,7 +335,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { trace_rcu_dyntick("Start", oldval, 0); - if (!is_idle_task(current) && !user) { + if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); @@ -399,7 +399,7 @@ void rcu_idle_enter(void) unsigned long flags; local_irq_save(flags); - rcu_eqs_enter(0); + rcu_eqs_enter(false); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -435,7 +435,7 @@ void rcu_user_enter(void) rdtp = &__get_cpu_var(rcu_dynticks); if (!rdtp->ignore_user_qs && !rdtp->in_user) { rdtp->in_user = true; - rcu_eqs_enter(1); + rcu_eqs_enter(true); } local_irq_restore(flags); } @@ -492,7 +492,7 @@ void rcu_irq_exit(void) if (rdtp->dynticks_nesting) trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); else - rcu_eqs_enter_common(rdtp, oldval, 1); + rcu_eqs_enter_common(rdtp, oldval, true); local_irq_restore(flags); } @@ -513,7 +513,7 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); - if (!is_idle_task(current) && !user) { + if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); trace_rcu_dyntick("Error on exit: not idle task", @@ -560,7 +560,7 @@ void rcu_idle_exit(void) unsigned long flags; local_irq_save(flags); - rcu_eqs_exit(0); + rcu_eqs_exit(false); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -592,7 +592,7 @@ void rcu_user_exit(void) rdtp = &__get_cpu_var(rcu_dynticks); if (rdtp->in_user) { rdtp->in_user = false; - rcu_eqs_exit(1); + rcu_eqs_exit(true); } local_irq_restore(flags); } @@ -653,7 +653,7 @@ void rcu_irq_enter(void) if (oldval) trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); else - rcu_eqs_exit_common(rdtp, oldval, 1); + rcu_eqs_exit_common(rdtp, oldval, true); local_irq_restore(flags); }
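The nesting rules that patches 01, 02 and 05 rely on are spread across several hunks, so a compact illustration may help. The following is a minimal, single-CPU user-space sketch of that bookkeeping, not kernel code: the struct, the DYNTICK_TASK_EXIT_IDLE value, the irq_nesting stand-in for in_interrupt(), and the rcu_is_watching() test are all simplified assumptions made for this example. It omits the atomic ->dynticks counter, memory barriers, tracing and the real DYNTICK_TASK_NEST_* encoding; it only shows why rcu_user_enter()/rcu_user_exit() bail out when called from an interrupt and why the in_user flag makes redundant calls harmless.

/*
 * Illustrative single-CPU model of the dynticks-nesting bookkeeping in
 * rcu_eqs_enter()/rcu_eqs_exit(), rcu_user_enter()/rcu_user_exit() and
 * rcu_irq_enter()/rcu_irq_exit().  Simplified stand-ins only; the kernel
 * encodes task-level nesting in the upper bits of ->dynticks_nesting and
 * pairs it with the per-CPU atomic ->dynticks counter.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define DYNTICK_TASK_EXIT_IDLE	1	/* simplified: "task level is non-idle" */

struct rcu_dynticks {
	long long dynticks_nesting;	/* 0 == extended quiescent state */
	bool ignore_user_qs;		/* false here: userspace QS enabled, as with RCU_USER_QS_FORCE */
	bool in_user;			/* CPU in userland from RCU's point of view */
};

static struct rcu_dynticks rdtp = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE };
static int irq_nesting;			/* stand-in for in_interrupt() */

static bool rcu_is_watching(void)
{
	return rdtp.dynticks_nesting != 0;
}

static void rcu_user_enter(void)	/* about to resume userspace */
{
	if (irq_nesting)		/* exception nested in an irq: bail out */
		return;
	if (!rdtp.ignore_user_qs && !rdtp.in_user) {
		rdtp.in_user = true;
		rdtp.dynticks_nesting = 0;			/* rcu_eqs_enter() */
	}
}

static void rcu_user_exit(void)		/* entering the kernel */
{
	if (irq_nesting)
		return;
	if (rdtp.in_user) {
		rdtp.in_user = false;
		rdtp.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;	/* rcu_eqs_exit() */
	}
}

static void rcu_irq_enter(void)
{
	irq_nesting++;
	rdtp.dynticks_nesting++;	/* leaves the EQS if the counter was 0 */
}

static void rcu_irq_exit(void)
{
	irq_nesting--;
	rdtp.dynticks_nesting--;	/* re-enters the EQS iff the irq interrupted one */
}

int main(void)
{
	/* Task resumes userspace: tick can be stopped, RCU stops watching. */
	rcu_user_enter();
	assert(!rcu_is_watching());

	/* An irq arrives while in userspace; RCU must watch inside it. */
	rcu_irq_enter();
	assert(rcu_is_watching());

	/*
	 * An exception inside that irq calls rcu_user_exit()/rcu_user_enter();
	 * both return early because we are in_interrupt(), so the nesting
	 * count stays balanced - the scenario the rcu_user_enter() comment
	 * in patch 01 warns about.
	 */
	rcu_user_exit();
	rcu_user_enter();
	assert(rcu_is_watching());

	/* The irq returns to userspace: back to the extended quiescent state. */
	rcu_irq_exit();
	assert(!rcu_is_watching());

	/* A later syscall entry takes the TIF_NOHZ slow path and exits the EQS. */
	rcu_user_exit();
	assert(rcu_is_watching());

	/* Calling it again is harmless: the in_user flag makes it idempotent. */
	rcu_user_exit();
	assert(rcu_is_watching());

	printf("nesting=%lld in_user=%d\n", rdtp.dynticks_nesting, (int)rdtp.in_user);
	return 0;
}

Built with any C compiler, the asserts pass for the irq-nested-exception scenario described in the rcu_user_enter() comment, and the final state shows the CPU back in the kernel with a non-zero nesting count; the real kernel paths additionally order the transitions with the atomic ->dynticks counter and the barriers shown in the diffs above.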