Merge b34133fec8
("Merge tag 'perf-core-2020-08-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip") into android-mainline
Steps on the way to 5.9-rc1 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com> Change-Id: Ib901b7670a0313d91a57bce86c9587a25020cc3d
This commit is contained in:
commit
d332fba061
33
Documentation/ABI/testing/sysfs-devices-mapping
Normal file
33
Documentation/ABI/testing/sysfs-devices-mapping
Normal file
@ -0,0 +1,33 @@
|
||||
What: /sys/devices/uncore_iio_x/dieX
|
||||
Date: February 2020
|
||||
Contact: Roman Sudarikov <roman.sudarikov@linux.intel.com>
|
||||
Description:
|
||||
Each IIO stack (PCIe root port) has its own IIO PMON block, so
|
||||
each dieX file (where X is die number) holds "Segment:Root Bus"
|
||||
for PCIe root port, which can be monitored by that IIO PMON
|
||||
block.
|
||||
For example, on 4-die Xeon platform with up to 6 IIO stacks per
|
||||
die and, therefore, 6 IIO PMON blocks per die, the mapping of
|
||||
IIO PMON block 0 exposes as the following:
|
||||
|
||||
$ ls /sys/devices/uncore_iio_0/die*
|
||||
-r--r--r-- /sys/devices/uncore_iio_0/die0
|
||||
-r--r--r-- /sys/devices/uncore_iio_0/die1
|
||||
-r--r--r-- /sys/devices/uncore_iio_0/die2
|
||||
-r--r--r-- /sys/devices/uncore_iio_0/die3
|
||||
|
||||
$ tail /sys/devices/uncore_iio_0/die*
|
||||
==> /sys/devices/uncore_iio_0/die0 <==
|
||||
0000:00
|
||||
==> /sys/devices/uncore_iio_0/die1 <==
|
||||
0000:40
|
||||
==> /sys/devices/uncore_iio_0/die2 <==
|
||||
0000:80
|
||||
==> /sys/devices/uncore_iio_0/die3 <==
|
||||
0000:c0
|
||||
|
||||
Which means:
|
||||
IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000
|
||||
IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000
|
||||
IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000
|
||||
IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000
|
@ -463,7 +463,7 @@ again without disrupting RCU readers.
|
||||
This guarantee was only partially premeditated. DYNIX/ptx used an
|
||||
explicit memory barrier for publication, but had nothing resembling
|
||||
``rcu_dereference()`` for subscription, nor did it have anything
|
||||
resembling the ``smp_read_barrier_depends()`` that was later subsumed
|
||||
resembling the dependency-ordering barrier that was later subsumed
|
||||
into ``rcu_dereference()`` and later still into ``READ_ONCE()``. The
|
||||
need for these operations made itself known quite suddenly at a
|
||||
late-1990s meeting with the DEC Alpha architects, back in the days when
|
||||
@ -2583,7 +2583,12 @@ not work to have these markers in the trampoline itself, because there
|
||||
would need to be instructions following ``rcu_read_unlock()``. Although
|
||||
``synchronize_rcu()`` would guarantee that execution reached the
|
||||
``rcu_read_unlock()``, it would not be able to guarantee that execution
|
||||
had completely left the trampoline.
|
||||
had completely left the trampoline. Worse yet, in some situations
|
||||
the trampoline's protection must extend a few instructions *prior* to
|
||||
execution reaching the trampoline. For example, these few instructions
|
||||
might calculate the address of the trampoline, so that entering the
|
||||
trampoline would be pre-ordained a surprisingly long time before execution
|
||||
actually reached the trampoline itself.
|
||||
|
||||
The solution, in the form of `Tasks
|
||||
RCU <https://lwn.net/Articles/607117/>`__, is to have implicit read-side
|
||||
|
@ -1,4 +1,8 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
================================
|
||||
Review Checklist for RCU Patches
|
||||
================================
|
||||
|
||||
|
||||
This document contains a checklist for producing and reviewing patches
|
||||
@ -411,18 +415,21 @@ over a rather long period of time, but improvements are always welcome!
|
||||
__rcu sparse checks to validate your RCU code. These can help
|
||||
find problems as follows:
|
||||
|
||||
CONFIG_PROVE_LOCKING: check that accesses to RCU-protected data
|
||||
CONFIG_PROVE_LOCKING:
|
||||
check that accesses to RCU-protected data
|
||||
structures are carried out under the proper RCU
|
||||
read-side critical section, while holding the right
|
||||
combination of locks, or whatever other conditions
|
||||
are appropriate.
|
||||
|
||||
CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the
|
||||
CONFIG_DEBUG_OBJECTS_RCU_HEAD:
|
||||
check that you don't pass the
|
||||
same object to call_rcu() (or friends) before an RCU
|
||||
grace period has elapsed since the last time that you
|
||||
passed that same object to call_rcu() (or friends).
|
||||
|
||||
__rcu sparse checks: tag the pointer to the RCU-protected data
|
||||
__rcu sparse checks:
|
||||
tag the pointer to the RCU-protected data
|
||||
structure with __rcu, and sparse will warn you if you
|
||||
access that pointer without the services of one of the
|
||||
variants of rcu_dereference().
|
||||
@ -442,8 +449,8 @@ over a rather long period of time, but improvements are always welcome!
|
||||
|
||||
You instead need to use one of the barrier functions:
|
||||
|
||||
o call_rcu() -> rcu_barrier()
|
||||
o call_srcu() -> srcu_barrier()
|
||||
- call_rcu() -> rcu_barrier()
|
||||
- call_srcu() -> srcu_barrier()
|
||||
|
||||
However, these barrier functions are absolutely -not- guaranteed
|
||||
to wait for a grace period. In fact, if there are no call_rcu()
|
@ -1,3 +1,5 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
.. _rcu_concepts:
|
||||
|
||||
============
|
||||
@ -8,10 +10,17 @@ RCU concepts
|
||||
:maxdepth: 3
|
||||
|
||||
arrayRCU
|
||||
checklist
|
||||
lockdep
|
||||
lockdep-splat
|
||||
rcubarrier
|
||||
rcu_dereference
|
||||
whatisRCU
|
||||
rcu
|
||||
rculist_nulls
|
||||
rcuref
|
||||
torture
|
||||
stallwarn
|
||||
listRCU
|
||||
NMI-RCU
|
||||
UP
|
||||
|
@ -1,3 +1,9 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=================
|
||||
Lockdep-RCU Splat
|
||||
=================
|
||||
|
||||
Lockdep-RCU was added to the Linux kernel in early 2010
|
||||
(http://lwn.net/Articles/371986/). This facility checks for some common
|
||||
misuses of the RCU API, most notably using one of the rcu_dereference()
|
||||
@ -12,55 +18,54 @@ overwriting or worse. There can of course be false positives, this
|
||||
being the real world and all that.
|
||||
|
||||
So let's look at an example RCU lockdep splat from 3.0-rc5, one that
|
||||
has long since been fixed:
|
||||
has long since been fixed::
|
||||
|
||||
=============================
|
||||
WARNING: suspicious RCU usage
|
||||
-----------------------------
|
||||
block/cfq-iosched.c:2776 suspicious rcu_dereference_protected() usage!
|
||||
=============================
|
||||
WARNING: suspicious RCU usage
|
||||
-----------------------------
|
||||
block/cfq-iosched.c:2776 suspicious rcu_dereference_protected() usage!
|
||||
|
||||
other info that might help us debug this:
|
||||
other info that might help us debug this::
|
||||
|
||||
rcu_scheduler_active = 1, debug_locks = 0
|
||||
3 locks held by scsi_scan_6/1552:
|
||||
#0: (&shost->scan_mutex){+.+.}, at: [<ffffffff8145efca>]
|
||||
scsi_scan_host_selected+0x5a/0x150
|
||||
#1: (&eq->sysfs_lock){+.+.}, at: [<ffffffff812a5032>]
|
||||
elevator_exit+0x22/0x60
|
||||
#2: (&(&q->__queue_lock)->rlock){-.-.}, at: [<ffffffff812b6233>]
|
||||
cfq_exit_queue+0x43/0x190
|
||||
|
||||
rcu_scheduler_active = 1, debug_locks = 0
|
||||
3 locks held by scsi_scan_6/1552:
|
||||
#0: (&shost->scan_mutex){+.+.}, at: [<ffffffff8145efca>]
|
||||
scsi_scan_host_selected+0x5a/0x150
|
||||
#1: (&eq->sysfs_lock){+.+.}, at: [<ffffffff812a5032>]
|
||||
elevator_exit+0x22/0x60
|
||||
#2: (&(&q->__queue_lock)->rlock){-.-.}, at: [<ffffffff812b6233>]
|
||||
cfq_exit_queue+0x43/0x190
|
||||
stack backtrace:
|
||||
Pid: 1552, comm: scsi_scan_6 Not tainted 3.0.0-rc5 #17
|
||||
Call Trace:
|
||||
[<ffffffff810abb9b>] lockdep_rcu_dereference+0xbb/0xc0
|
||||
[<ffffffff812b6139>] __cfq_exit_single_io_context+0xe9/0x120
|
||||
[<ffffffff812b626c>] cfq_exit_queue+0x7c/0x190
|
||||
[<ffffffff812a5046>] elevator_exit+0x36/0x60
|
||||
[<ffffffff812a802a>] blk_cleanup_queue+0x4a/0x60
|
||||
[<ffffffff8145cc09>] scsi_free_queue+0x9/0x10
|
||||
[<ffffffff81460944>] __scsi_remove_device+0x84/0xd0
|
||||
[<ffffffff8145dca3>] scsi_probe_and_add_lun+0x353/0xb10
|
||||
[<ffffffff817da069>] ? error_exit+0x29/0xb0
|
||||
[<ffffffff817d98ed>] ? _raw_spin_unlock_irqrestore+0x3d/0x80
|
||||
[<ffffffff8145e722>] __scsi_scan_target+0x112/0x680
|
||||
[<ffffffff812c690d>] ? trace_hardirqs_off_thunk+0x3a/0x3c
|
||||
[<ffffffff817da069>] ? error_exit+0x29/0xb0
|
||||
[<ffffffff812bcc60>] ? kobject_del+0x40/0x40
|
||||
[<ffffffff8145ed16>] scsi_scan_channel+0x86/0xb0
|
||||
[<ffffffff8145f0b0>] scsi_scan_host_selected+0x140/0x150
|
||||
[<ffffffff8145f149>] do_scsi_scan_host+0x89/0x90
|
||||
[<ffffffff8145f170>] do_scan_async+0x20/0x160
|
||||
[<ffffffff8145f150>] ? do_scsi_scan_host+0x90/0x90
|
||||
[<ffffffff810975b6>] kthread+0xa6/0xb0
|
||||
[<ffffffff817db154>] kernel_thread_helper+0x4/0x10
|
||||
[<ffffffff81066430>] ? finish_task_switch+0x80/0x110
|
||||
[<ffffffff817d9c04>] ? retint_restore_args+0xe/0xe
|
||||
[<ffffffff81097510>] ? __kthread_init_worker+0x70/0x70
|
||||
[<ffffffff817db150>] ? gs_change+0xb/0xb
|
||||
|
||||
stack backtrace:
|
||||
Pid: 1552, comm: scsi_scan_6 Not tainted 3.0.0-rc5 #17
|
||||
Call Trace:
|
||||
[<ffffffff810abb9b>] lockdep_rcu_dereference+0xbb/0xc0
|
||||
[<ffffffff812b6139>] __cfq_exit_single_io_context+0xe9/0x120
|
||||
[<ffffffff812b626c>] cfq_exit_queue+0x7c/0x190
|
||||
[<ffffffff812a5046>] elevator_exit+0x36/0x60
|
||||
[<ffffffff812a802a>] blk_cleanup_queue+0x4a/0x60
|
||||
[<ffffffff8145cc09>] scsi_free_queue+0x9/0x10
|
||||
[<ffffffff81460944>] __scsi_remove_device+0x84/0xd0
|
||||
[<ffffffff8145dca3>] scsi_probe_and_add_lun+0x353/0xb10
|
||||
[<ffffffff817da069>] ? error_exit+0x29/0xb0
|
||||
[<ffffffff817d98ed>] ? _raw_spin_unlock_irqrestore+0x3d/0x80
|
||||
[<ffffffff8145e722>] __scsi_scan_target+0x112/0x680
|
||||
[<ffffffff812c690d>] ? trace_hardirqs_off_thunk+0x3a/0x3c
|
||||
[<ffffffff817da069>] ? error_exit+0x29/0xb0
|
||||
[<ffffffff812bcc60>] ? kobject_del+0x40/0x40
|
||||
[<ffffffff8145ed16>] scsi_scan_channel+0x86/0xb0
|
||||
[<ffffffff8145f0b0>] scsi_scan_host_selected+0x140/0x150
|
||||
[<ffffffff8145f149>] do_scsi_scan_host+0x89/0x90
|
||||
[<ffffffff8145f170>] do_scan_async+0x20/0x160
|
||||
[<ffffffff8145f150>] ? do_scsi_scan_host+0x90/0x90
|
||||
[<ffffffff810975b6>] kthread+0xa6/0xb0
|
||||
[<ffffffff817db154>] kernel_thread_helper+0x4/0x10
|
||||
[<ffffffff81066430>] ? finish_task_switch+0x80/0x110
|
||||
[<ffffffff817d9c04>] ? retint_restore_args+0xe/0xe
|
||||
[<ffffffff81097510>] ? __kthread_init_worker+0x70/0x70
|
||||
[<ffffffff817db150>] ? gs_change+0xb/0xb
|
||||
|
||||
Line 2776 of block/cfq-iosched.c in v3.0-rc5 is as follows:
|
||||
Line 2776 of block/cfq-iosched.c in v3.0-rc5 is as follows::
|
||||
|
||||
if (rcu_dereference(ioc->ioc_data) == cic) {
|
||||
|
||||
@ -70,7 +75,7 @@ case. Instead, we hold three locks, one of which might be RCU related.
|
||||
And maybe that lock really does protect this reference. If so, the fix
|
||||
is to inform RCU, perhaps by changing __cfq_exit_single_io_context() to
|
||||
take the struct request_queue "q" from cfq_exit_queue() as an argument,
|
||||
which would permit us to invoke rcu_dereference_protected as follows:
|
||||
which would permit us to invoke rcu_dereference_protected as follows::
|
||||
|
||||
if (rcu_dereference_protected(ioc->ioc_data,
|
||||
lockdep_is_held(&q->queue_lock)) == cic) {
|
||||
@ -85,7 +90,7 @@ On the other hand, perhaps we really do need an RCU read-side critical
|
||||
section. In this case, the critical section must span the use of the
|
||||
return value from rcu_dereference(), or at least until there is some
|
||||
reference count incremented or some such. One way to handle this is to
|
||||
add rcu_read_lock() and rcu_read_unlock() as follows:
|
||||
add rcu_read_lock() and rcu_read_unlock() as follows::
|
||||
|
||||
rcu_read_lock();
|
||||
if (rcu_dereference(ioc->ioc_data) == cic) {
|
||||
@ -102,7 +107,7 @@ above lockdep-RCU splat.
|
||||
But in this particular case, we don't actually dereference the pointer
|
||||
returned from rcu_dereference(). Instead, that pointer is just compared
|
||||
to the cic pointer, which means that the rcu_dereference() can be replaced
|
||||
by rcu_access_pointer() as follows:
|
||||
by rcu_access_pointer() as follows::
|
||||
|
||||
if (rcu_access_pointer(ioc->ioc_data) == cic) {
|
||||
|
@ -1,4 +1,8 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
========================
|
||||
RCU and lockdep checking
|
||||
========================
|
||||
|
||||
All flavors of RCU have lockdep checking available, so that lockdep is
|
||||
aware of when each task enters and leaves any flavor of RCU read-side
|
||||
@ -8,7 +12,7 @@ tracking to include RCU state, which can sometimes help when debugging
|
||||
deadlocks and the like.
|
||||
|
||||
In addition, RCU provides the following primitives that check lockdep's
|
||||
state:
|
||||
state::
|
||||
|
||||
rcu_read_lock_held() for normal RCU.
|
||||
rcu_read_lock_bh_held() for RCU-bh.
|
||||
@ -63,7 +67,7 @@ checking of rcu_dereference() primitives:
|
||||
The rcu_dereference_check() check expression can be any boolean
|
||||
expression, but would normally include a lockdep expression. However,
|
||||
any boolean expression can be used. For a moderately ornate example,
|
||||
consider the following:
|
||||
consider the following::
|
||||
|
||||
file = rcu_dereference_check(fdt->fd[fd],
|
||||
lockdep_is_held(&files->file_lock) ||
|
||||
@ -82,7 +86,7 @@ RCU read-side critical sections, in case (2) the ->file_lock prevents
|
||||
any change from taking place, and finally, in case (3) the current task
|
||||
is the only task accessing the file_struct, again preventing any change
|
||||
from taking place. If the above statement was invoked only from updater
|
||||
code, it could instead be written as follows:
|
||||
code, it could instead be written as follows::
|
||||
|
||||
file = rcu_dereference_protected(fdt->fd[fd],
|
||||
lockdep_is_held(&files->file_lock) ||
|
||||
@ -105,7 +109,7 @@ false and they are called from outside any RCU read-side critical section.
|
||||
|
||||
For example, the workqueue for_each_pwq() macro is intended to be used
|
||||
either within an RCU read-side critical section or with wq->mutex held.
|
||||
It is thus implemented as follows:
|
||||
It is thus implemented as follows::
|
||||
|
||||
#define for_each_pwq(pwq, wq)
|
||||
list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,
|
200
Documentation/RCU/rculist_nulls.rst
Normal file
200
Documentation/RCU/rculist_nulls.rst
Normal file
@ -0,0 +1,200 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=================================================
|
||||
Using RCU hlist_nulls to protect list and objects
|
||||
=================================================
|
||||
|
||||
This section describes how to use hlist_nulls to
|
||||
protect read-mostly linked lists and
|
||||
objects using SLAB_TYPESAFE_BY_RCU allocations.
|
||||
|
||||
Please read the basics in Documentation/RCU/listRCU.rst
|
||||
|
||||
Using 'nulls'
|
||||
=============
|
||||
|
||||
Using special makers (called 'nulls') is a convenient way
|
||||
to solve following problem :
|
||||
|
||||
A typical RCU linked list managing objects which are
|
||||
allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
|
||||
use following algos :
|
||||
|
||||
1) Lookup algo
|
||||
--------------
|
||||
|
||||
::
|
||||
|
||||
rcu_read_lock()
|
||||
begin:
|
||||
obj = lockless_lookup(key);
|
||||
if (obj) {
|
||||
if (!try_get_ref(obj)) // might fail for free objects
|
||||
goto begin;
|
||||
/*
|
||||
* Because a writer could delete object, and a writer could
|
||||
* reuse these object before the RCU grace period, we
|
||||
* must check key after getting the reference on object
|
||||
*/
|
||||
if (obj->key != key) { // not the object we expected
|
||||
put_ref(obj);
|
||||
goto begin;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
Beware that lockless_lookup(key) cannot use traditional hlist_for_each_entry_rcu()
|
||||
but a version with an additional memory barrier (smp_rmb())
|
||||
|
||||
::
|
||||
|
||||
lockless_lookup(key)
|
||||
{
|
||||
struct hlist_node *node, *next;
|
||||
for (pos = rcu_dereference((head)->first);
|
||||
pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
|
||||
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
|
||||
pos = rcu_dereference(next))
|
||||
if (obj->key == key)
|
||||
return obj;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb()::
|
||||
|
||||
struct hlist_node *node;
|
||||
for (pos = rcu_dereference((head)->first);
|
||||
pos && ({ prefetch(pos->next); 1; }) &&
|
||||
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
|
||||
pos = rcu_dereference(pos->next))
|
||||
if (obj->key == key)
|
||||
return obj;
|
||||
return NULL;
|
||||
|
||||
Quoting Corey Minyard::
|
||||
|
||||
"If the object is moved from one list to another list in-between the
|
||||
time the hash is calculated and the next field is accessed, and the
|
||||
object has moved to the end of a new list, the traversal will not
|
||||
complete properly on the list it should have, since the object will
|
||||
be on the end of the new list and there's not a way to tell it's on a
|
||||
new list and restart the list traversal. I think that this can be
|
||||
solved by pre-fetching the "next" field (with proper barriers) before
|
||||
checking the key."
|
||||
|
||||
2) Insert algo
|
||||
--------------
|
||||
|
||||
We need to make sure a reader cannot read the new 'obj->obj_next' value
|
||||
and previous value of 'obj->key'. Or else, an item could be deleted
|
||||
from a chain, and inserted into another chain. If new chain was empty
|
||||
before the move, 'next' pointer is NULL, and lockless reader can
|
||||
not detect it missed following items in original chain.
|
||||
|
||||
::
|
||||
|
||||
/*
|
||||
* Please note that new inserts are done at the head of list,
|
||||
* not in the middle or end.
|
||||
*/
|
||||
obj = kmem_cache_alloc(...);
|
||||
lock_chain(); // typically a spin_lock()
|
||||
obj->key = key;
|
||||
/*
|
||||
* we need to make sure obj->key is updated before obj->next
|
||||
* or obj->refcnt
|
||||
*/
|
||||
smp_wmb();
|
||||
atomic_set(&obj->refcnt, 1);
|
||||
hlist_add_head_rcu(&obj->obj_node, list);
|
||||
unlock_chain(); // typically a spin_unlock()
|
||||
|
||||
|
||||
3) Remove algo
|
||||
--------------
|
||||
Nothing special here, we can use a standard RCU hlist deletion.
|
||||
But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
|
||||
very very fast (before the end of RCU grace period)
|
||||
|
||||
::
|
||||
|
||||
if (put_last_reference_on(obj) {
|
||||
lock_chain(); // typically a spin_lock()
|
||||
hlist_del_init_rcu(&obj->obj_node);
|
||||
unlock_chain(); // typically a spin_unlock()
|
||||
kmem_cache_free(cachep, obj);
|
||||
}
|
||||
|
||||
|
||||
|
||||
--------------------------------------------------------------------------
|
||||
|
||||
Avoiding extra smp_rmb()
|
||||
========================
|
||||
|
||||
With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup()
|
||||
and extra smp_wmb() in insert function.
|
||||
|
||||
For example, if we choose to store the slot number as the 'nulls'
|
||||
end-of-list marker for each slot of the hash table, we can detect
|
||||
a race (some writer did a delete and/or a move of an object
|
||||
to another chain) checking the final 'nulls' value if
|
||||
the lookup met the end of chain. If final 'nulls' value
|
||||
is not the slot number, then we must restart the lookup at
|
||||
the beginning. If the object was moved to the same chain,
|
||||
then the reader doesn't care : It might eventually
|
||||
scan the list again without harm.
|
||||
|
||||
|
||||
1) lookup algo
|
||||
--------------
|
||||
|
||||
::
|
||||
|
||||
head = &table[slot];
|
||||
rcu_read_lock();
|
||||
begin:
|
||||
hlist_nulls_for_each_entry_rcu(obj, node, head, member) {
|
||||
if (obj->key == key) {
|
||||
if (!try_get_ref(obj)) // might fail for free objects
|
||||
goto begin;
|
||||
if (obj->key != key) { // not the object we expected
|
||||
put_ref(obj);
|
||||
goto begin;
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* if the nulls value we got at the end of this lookup is
|
||||
* not the expected one, we must restart lookup.
|
||||
* We probably met an item that was moved to another chain.
|
||||
*/
|
||||
if (get_nulls_value(node) != slot)
|
||||
goto begin;
|
||||
obj = NULL;
|
||||
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
|
||||
2) Insert function
|
||||
------------------
|
||||
|
||||
::
|
||||
|
||||
/*
|
||||
* Please note that new inserts are done at the head of list,
|
||||
* not in the middle or end.
|
||||
*/
|
||||
obj = kmem_cache_alloc(cachep);
|
||||
lock_chain(); // typically a spin_lock()
|
||||
obj->key = key;
|
||||
/*
|
||||
* changes to obj->key must be visible before refcnt one
|
||||
*/
|
||||
smp_wmb();
|
||||
atomic_set(&obj->refcnt, 1);
|
||||
/*
|
||||
* insert obj in RCU way (readers might be traversing chain)
|
||||
*/
|
||||
hlist_nulls_add_head_rcu(&obj->obj_node, list);
|
||||
unlock_chain(); // typically a spin_unlock()
|
@ -1,172 +0,0 @@
|
||||
Using hlist_nulls to protect read-mostly linked lists and
|
||||
objects using SLAB_TYPESAFE_BY_RCU allocations.
|
||||
|
||||
Please read the basics in Documentation/RCU/listRCU.rst
|
||||
|
||||
Using special makers (called 'nulls') is a convenient way
|
||||
to solve following problem :
|
||||
|
||||
A typical RCU linked list managing objects which are
|
||||
allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
|
||||
use following algos :
|
||||
|
||||
1) Lookup algo
|
||||
--------------
|
||||
rcu_read_lock()
|
||||
begin:
|
||||
obj = lockless_lookup(key);
|
||||
if (obj) {
|
||||
if (!try_get_ref(obj)) // might fail for free objects
|
||||
goto begin;
|
||||
/*
|
||||
* Because a writer could delete object, and a writer could
|
||||
* reuse these object before the RCU grace period, we
|
||||
* must check key after getting the reference on object
|
||||
*/
|
||||
if (obj->key != key) { // not the object we expected
|
||||
put_ref(obj);
|
||||
goto begin;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
Beware that lockless_lookup(key) cannot use traditional hlist_for_each_entry_rcu()
|
||||
but a version with an additional memory barrier (smp_rmb())
|
||||
|
||||
lockless_lookup(key)
|
||||
{
|
||||
struct hlist_node *node, *next;
|
||||
for (pos = rcu_dereference((head)->first);
|
||||
pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
|
||||
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
|
||||
pos = rcu_dereference(next))
|
||||
if (obj->key == key)
|
||||
return obj;
|
||||
return NULL;
|
||||
|
||||
And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb() :
|
||||
|
||||
struct hlist_node *node;
|
||||
for (pos = rcu_dereference((head)->first);
|
||||
pos && ({ prefetch(pos->next); 1; }) &&
|
||||
({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
|
||||
pos = rcu_dereference(pos->next))
|
||||
if (obj->key == key)
|
||||
return obj;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Quoting Corey Minyard :
|
||||
|
||||
"If the object is moved from one list to another list in-between the
|
||||
time the hash is calculated and the next field is accessed, and the
|
||||
object has moved to the end of a new list, the traversal will not
|
||||
complete properly on the list it should have, since the object will
|
||||
be on the end of the new list and there's not a way to tell it's on a
|
||||
new list and restart the list traversal. I think that this can be
|
||||
solved by pre-fetching the "next" field (with proper barriers) before
|
||||
checking the key."
|
||||
|
||||
2) Insert algo :
|
||||
----------------
|
||||
|
||||
We need to make sure a reader cannot read the new 'obj->obj_next' value
|
||||
and previous value of 'obj->key'. Or else, an item could be deleted
|
||||
from a chain, and inserted into another chain. If new chain was empty
|
||||
before the move, 'next' pointer is NULL, and lockless reader can
|
||||
not detect it missed following items in original chain.
|
||||
|
||||
/*
|
||||
* Please note that new inserts are done at the head of list,
|
||||
* not in the middle or end.
|
||||
*/
|
||||
obj = kmem_cache_alloc(...);
|
||||
lock_chain(); // typically a spin_lock()
|
||||
obj->key = key;
|
||||
/*
|
||||
* we need to make sure obj->key is updated before obj->next
|
||||
* or obj->refcnt
|
||||
*/
|
||||
smp_wmb();
|
||||
atomic_set(&obj->refcnt, 1);
|
||||
hlist_add_head_rcu(&obj->obj_node, list);
|
||||
unlock_chain(); // typically a spin_unlock()
|
||||
|
||||
|
||||
3) Remove algo
|
||||
--------------
|
||||
Nothing special here, we can use a standard RCU hlist deletion.
|
||||
But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
|
||||
very very fast (before the end of RCU grace period)
|
||||
|
||||
if (put_last_reference_on(obj) {
|
||||
lock_chain(); // typically a spin_lock()
|
||||
hlist_del_init_rcu(&obj->obj_node);
|
||||
unlock_chain(); // typically a spin_unlock()
|
||||
kmem_cache_free(cachep, obj);
|
||||
}
|
||||
|
||||
|
||||
|
||||
--------------------------------------------------------------------------
|
||||
With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup()
|
||||
and extra smp_wmb() in insert function.
|
||||
|
||||
For example, if we choose to store the slot number as the 'nulls'
|
||||
end-of-list marker for each slot of the hash table, we can detect
|
||||
a race (some writer did a delete and/or a move of an object
|
||||
to another chain) checking the final 'nulls' value if
|
||||
the lookup met the end of chain. If final 'nulls' value
|
||||
is not the slot number, then we must restart the lookup at
|
||||
the beginning. If the object was moved to the same chain,
|
||||
then the reader doesn't care : It might eventually
|
||||
scan the list again without harm.
|
||||
|
||||
|
||||
1) lookup algo
|
||||
|
||||
head = &table[slot];
|
||||
rcu_read_lock();
|
||||
begin:
|
||||
hlist_nulls_for_each_entry_rcu(obj, node, head, member) {
|
||||
if (obj->key == key) {
|
||||
if (!try_get_ref(obj)) // might fail for free objects
|
||||
goto begin;
|
||||
if (obj->key != key) { // not the object we expected
|
||||
put_ref(obj);
|
||||
goto begin;
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
/*
|
||||
* if the nulls value we got at the end of this lookup is
|
||||
* not the expected one, we must restart lookup.
|
||||
* We probably met an item that was moved to another chain.
|
||||
*/
|
||||
if (get_nulls_value(node) != slot)
|
||||
goto begin;
|
||||
obj = NULL;
|
||||
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
|
||||
2) Insert function :
|
||||
--------------------
|
||||
|
||||
/*
|
||||
* Please note that new inserts are done at the head of list,
|
||||
* not in the middle or end.
|
||||
*/
|
||||
obj = kmem_cache_alloc(cachep);
|
||||
lock_chain(); // typically a spin_lock()
|
||||
obj->key = key;
|
||||
/*
|
||||
* changes to obj->key must be visible before refcnt one
|
||||
*/
|
||||
smp_wmb();
|
||||
atomic_set(&obj->refcnt, 1);
|
||||
/*
|
||||
* insert obj in RCU way (readers might be traversing chain)
|
||||
*/
|
||||
hlist_nulls_add_head_rcu(&obj->obj_node, list);
|
||||
unlock_chain(); // typically a spin_unlock()
|
@ -1,4 +1,8 @@
|
||||
Reference-count design for elements of lists/arrays protected by RCU.
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
====================================================================
|
||||
Reference-count design for elements of lists/arrays protected by RCU
|
||||
====================================================================
|
||||
|
||||
|
||||
Please note that the percpu-ref feature is likely your first
|
||||
@ -12,32 +16,33 @@ please read on.
|
||||
Reference counting on elements of lists which are protected by traditional
|
||||
reader/writer spinlocks or semaphores are straightforward:
|
||||
|
||||
CODE LISTING A:
|
||||
1. 2.
|
||||
add() search_and_reference()
|
||||
{ {
|
||||
alloc_object read_lock(&list_lock);
|
||||
... search_for_element
|
||||
atomic_set(&el->rc, 1); atomic_inc(&el->rc);
|
||||
write_lock(&list_lock); ...
|
||||
add_element read_unlock(&list_lock);
|
||||
... ...
|
||||
write_unlock(&list_lock); }
|
||||
}
|
||||
CODE LISTING A::
|
||||
|
||||
3. 4.
|
||||
release_referenced() delete()
|
||||
{ {
|
||||
... write_lock(&list_lock);
|
||||
if(atomic_dec_and_test(&el->rc)) ...
|
||||
kfree(el);
|
||||
... remove_element
|
||||
} write_unlock(&list_lock);
|
||||
...
|
||||
if (atomic_dec_and_test(&el->rc))
|
||||
kfree(el);
|
||||
...
|
||||
}
|
||||
1. 2.
|
||||
add() search_and_reference()
|
||||
{ {
|
||||
alloc_object read_lock(&list_lock);
|
||||
... search_for_element
|
||||
atomic_set(&el->rc, 1); atomic_inc(&el->rc);
|
||||
write_lock(&list_lock); ...
|
||||
add_element read_unlock(&list_lock);
|
||||
... ...
|
||||
write_unlock(&list_lock); }
|
||||
}
|
||||
|
||||
3. 4.
|
||||
release_referenced() delete()
|
||||
{ {
|
||||
... write_lock(&list_lock);
|
||||
if(atomic_dec_and_test(&el->rc)) ...
|
||||
kfree(el);
|
||||
... remove_element
|
||||
} write_unlock(&list_lock);
|
||||
...
|
||||
if (atomic_dec_and_test(&el->rc))
|
||||
kfree(el);
|
||||
...
|
||||
}
|
||||
|
||||
If this list/array is made lock free using RCU as in changing the
|
||||
write_lock() in add() and delete() to spin_lock() and changing read_lock()
|
||||
@ -46,34 +51,35 @@ search_and_reference() could potentially hold reference to an element which
|
||||
has already been deleted from the list/array. Use atomic_inc_not_zero()
|
||||
in this scenario as follows:
|
||||
|
||||
CODE LISTING B:
|
||||
1. 2.
|
||||
add() search_and_reference()
|
||||
{ {
|
||||
alloc_object rcu_read_lock();
|
||||
... search_for_element
|
||||
atomic_set(&el->rc, 1); if (!atomic_inc_not_zero(&el->rc)) {
|
||||
spin_lock(&list_lock); rcu_read_unlock();
|
||||
return FAIL;
|
||||
add_element }
|
||||
... ...
|
||||
spin_unlock(&list_lock); rcu_read_unlock();
|
||||
} }
|
||||
3. 4.
|
||||
release_referenced() delete()
|
||||
{ {
|
||||
... spin_lock(&list_lock);
|
||||
if (atomic_dec_and_test(&el->rc)) ...
|
||||
call_rcu(&el->head, el_free); remove_element
|
||||
... spin_unlock(&list_lock);
|
||||
} ...
|
||||
if (atomic_dec_and_test(&el->rc))
|
||||
call_rcu(&el->head, el_free);
|
||||
...
|
||||
}
|
||||
CODE LISTING B::
|
||||
|
||||
1. 2.
|
||||
add() search_and_reference()
|
||||
{ {
|
||||
alloc_object rcu_read_lock();
|
||||
... search_for_element
|
||||
atomic_set(&el->rc, 1); if (!atomic_inc_not_zero(&el->rc)) {
|
||||
spin_lock(&list_lock); rcu_read_unlock();
|
||||
return FAIL;
|
||||
add_element }
|
||||
... ...
|
||||
spin_unlock(&list_lock); rcu_read_unlock();
|
||||
} }
|
||||
3. 4.
|
||||
release_referenced() delete()
|
||||
{ {
|
||||
... spin_lock(&list_lock);
|
||||
if (atomic_dec_and_test(&el->rc)) ...
|
||||
call_rcu(&el->head, el_free); remove_element
|
||||
... spin_unlock(&list_lock);
|
||||
} ...
|
||||
if (atomic_dec_and_test(&el->rc))
|
||||
call_rcu(&el->head, el_free);
|
||||
...
|
||||
}
|
||||
|
||||
Sometimes, a reference to the element needs to be obtained in the
|
||||
update (write) stream. In such cases, atomic_inc_not_zero() might be
|
||||
update (write) stream. In such cases, atomic_inc_not_zero() might be
|
||||
overkill, since we hold the update-side spinlock. One might instead
|
||||
use atomic_inc() in such cases.
|
||||
|
||||
@ -82,39 +88,40 @@ search_and_reference() code path. In such cases, the
|
||||
atomic_dec_and_test() may be moved from delete() to el_free()
|
||||
as follows:
|
||||
|
||||
CODE LISTING C:
|
||||
1. 2.
|
||||
add() search_and_reference()
|
||||
{ {
|
||||
alloc_object rcu_read_lock();
|
||||
... search_for_element
|
||||
atomic_set(&el->rc, 1); atomic_inc(&el->rc);
|
||||
spin_lock(&list_lock); ...
|
||||
CODE LISTING C::
|
||||
|
||||
add_element rcu_read_unlock();
|
||||
... }
|
||||
spin_unlock(&list_lock); 4.
|
||||
} delete()
|
||||
3. {
|
||||
release_referenced() spin_lock(&list_lock);
|
||||
{ ...
|
||||
... remove_element
|
||||
if (atomic_dec_and_test(&el->rc)) spin_unlock(&list_lock);
|
||||
kfree(el); ...
|
||||
... call_rcu(&el->head, el_free);
|
||||
} ...
|
||||
5. }
|
||||
void el_free(struct rcu_head *rhp)
|
||||
{
|
||||
release_referenced();
|
||||
}
|
||||
1. 2.
|
||||
add() search_and_reference()
|
||||
{ {
|
||||
alloc_object rcu_read_lock();
|
||||
... search_for_element
|
||||
atomic_set(&el->rc, 1); atomic_inc(&el->rc);
|
||||
spin_lock(&list_lock); ...
|
||||
|
||||
add_element rcu_read_unlock();
|
||||
... }
|
||||
spin_unlock(&list_lock); 4.
|
||||
} delete()
|
||||
3. {
|
||||
release_referenced() spin_lock(&list_lock);
|
||||
{ ...
|
||||
... remove_element
|
||||
if (atomic_dec_and_test(&el->rc)) spin_unlock(&list_lock);
|
||||
kfree(el); ...
|
||||
... call_rcu(&el->head, el_free);
|
||||
} ...
|
||||
5. }
|
||||
void el_free(struct rcu_head *rhp)
|
||||
{
|
||||
release_referenced();
|
||||
}
|
||||
|
||||
The key point is that the initial reference added by add() is not removed
|
||||
until after a grace period has elapsed following removal. This means that
|
||||
search_and_reference() cannot find this element, which means that the value
|
||||
of el->rc cannot increase. Thus, once it reaches zero, there are no
|
||||
readers that can or ever will be able to reference the element. The
|
||||
element can therefore safely be freed. This in turn guarantees that if
|
||||
readers that can or ever will be able to reference the element. The
|
||||
element can therefore safely be freed. This in turn guarantees that if
|
||||
any reader finds the element, that reader may safely acquire a reference
|
||||
without checking the value of the reference counter.
|
||||
|
||||
@ -130,21 +137,21 @@ the eventual invocation of kfree(), which is usually not a problem on
|
||||
modern computer systems, even the small ones.
|
||||
|
||||
In cases where delete() can sleep, synchronize_rcu() can be called from
|
||||
delete(), so that el_free() can be subsumed into delete as follows:
|
||||
delete(), so that el_free() can be subsumed into delete as follows::
|
||||
|
||||
4.
|
||||
delete()
|
||||
{
|
||||
spin_lock(&list_lock);
|
||||
...
|
||||
remove_element
|
||||
spin_unlock(&list_lock);
|
||||
...
|
||||
synchronize_rcu();
|
||||
if (atomic_dec_and_test(&el->rc))
|
||||
kfree(el);
|
||||
...
|
||||
}
|
||||
4.
|
||||
delete()
|
||||
{
|
||||
spin_lock(&list_lock);
|
||||
...
|
||||
remove_element
|
||||
spin_unlock(&list_lock);
|
||||
...
|
||||
synchronize_rcu();
|
||||
if (atomic_dec_and_test(&el->rc))
|
||||
kfree(el);
|
||||
...
|
||||
}
|
||||
|
||||
As additional examples in the kernel, the pattern in listing C is used by
|
||||
reference counting of struct pid, while the pattern in listing B is used by
|
@ -1,4 +1,8 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==============================
|
||||
Using RCU's CPU Stall Detector
|
||||
==============================
|
||||
|
||||
This document first discusses what sorts of issues RCU's CPU stall
|
||||
detector can locate, and then discusses kernel parameters and Kconfig
|
||||
@ -7,39 +11,40 @@ this document explains the stall detector's "splat" format.
|
||||
|
||||
|
||||
What Causes RCU CPU Stall Warnings?
|
||||
===================================
|
||||
|
||||
So your kernel printed an RCU CPU stall warning. The next question is
|
||||
"What caused it?" The following problems can result in RCU CPU stall
|
||||
warnings:
|
||||
|
||||
o A CPU looping in an RCU read-side critical section.
|
||||
- A CPU looping in an RCU read-side critical section.
|
||||
|
||||
o A CPU looping with interrupts disabled.
|
||||
- A CPU looping with interrupts disabled.
|
||||
|
||||
o A CPU looping with preemption disabled.
|
||||
- A CPU looping with preemption disabled.
|
||||
|
||||
o A CPU looping with bottom halves disabled.
|
||||
- A CPU looping with bottom halves disabled.
|
||||
|
||||
o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
|
||||
- For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
|
||||
without invoking schedule(). If the looping in the kernel is
|
||||
really expected and desirable behavior, you might need to add
|
||||
some calls to cond_resched().
|
||||
|
||||
o Booting Linux using a console connection that is too slow to
|
||||
- Booting Linux using a console connection that is too slow to
|
||||
keep up with the boot-time console-message rate. For example,
|
||||
a 115Kbaud serial console can be -way- too slow to keep up
|
||||
with boot-time message rates, and will frequently result in
|
||||
RCU CPU stall warning messages. Especially if you have added
|
||||
debug printk()s.
|
||||
|
||||
o Anything that prevents RCU's grace-period kthreads from running.
|
||||
- Anything that prevents RCU's grace-period kthreads from running.
|
||||
This can result in the "All QSes seen" console-log message.
|
||||
This message will include information on when the kthread last
|
||||
ran and how often it should be expected to run. It can also
|
||||
result in the "rcu_.*kthread starved for" console-log message,
|
||||
result in the ``rcu_.*kthread starved for`` console-log message,
|
||||
which will include additional debugging information.
|
||||
|
||||
o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
|
||||
- A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
|
||||
happen to preempt a low-priority task in the middle of an RCU
|
||||
read-side critical section. This is especially damaging if
|
||||
that low-priority task is not permitted to run on any other CPU,
|
||||
@ -48,7 +53,7 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
|
||||
While the system is in the process of running itself out of
|
||||
memory, you might see stall-warning messages.
|
||||
|
||||
o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
|
||||
- A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
|
||||
is running at a higher priority than the RCU softirq threads.
|
||||
This will prevent RCU callbacks from ever being invoked,
|
||||
and in a CONFIG_PREEMPT_RCU kernel will further prevent
|
||||
@ -63,7 +68,7 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
|
||||
can increase your system's context-switch rate and thus degrade
|
||||
performance.
|
||||
|
||||
o A periodic interrupt whose handler takes longer than the time
|
||||
- A periodic interrupt whose handler takes longer than the time
|
||||
interval between successive pairs of interrupts. This can
|
||||
prevent RCU's kthreads and softirq handlers from running.
|
||||
Note that certain high-overhead debugging options, for example
|
||||
@ -71,20 +76,27 @@ o A periodic interrupt whose handler takes longer than the time
|
||||
considerably longer than normal, which can in turn result in
|
||||
RCU CPU stall warnings.
|
||||
|
||||
o Testing a workload on a fast system, tuning the stall-warning
|
||||
- Testing a workload on a fast system, tuning the stall-warning
|
||||
timeout down to just barely avoid RCU CPU stall warnings, and then
|
||||
running the same workload with the same stall-warning timeout on a
|
||||
slow system. Note that thermal throttling and on-demand governors
|
||||
can cause a single system to be sometimes fast and sometimes slow!
|
||||
|
||||
o A hardware or software issue shuts off the scheduler-clock
|
||||
- A hardware or software issue shuts off the scheduler-clock
|
||||
interrupt on a CPU that is not in dyntick-idle mode. This
|
||||
problem really has happened, and seems to be most likely to
|
||||
result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
|
||||
|
||||
o A bug in the RCU implementation.
|
||||
- A hardware or software issue that prevents time-based wakeups
|
||||
from occurring. These issues can range from misconfigured or
|
||||
buggy timer hardware through bugs in the interrupt or exception
|
||||
path (whether hardware, firmware, or software) through bugs
|
||||
in Linux's timer subsystem through bugs in the scheduler, and,
|
||||
yes, even including bugs in RCU itself.
|
||||
|
||||
o A hardware failure. This is quite unlikely, but has occurred
|
||||
- A bug in the RCU implementation.
|
||||
|
||||
- A hardware failure. This is quite unlikely, but has occurred
|
||||
at least once in real life. A CPU failed in a running system,
|
||||
becoming unresponsive, but not causing an immediate crash.
|
||||
This resulted in a series of RCU CPU stall warnings, eventually
|
||||
@ -109,6 +121,7 @@ see include/trace/events/rcu.h.
|
||||
|
||||
|
||||
Fine-Tuning the RCU CPU Stall Detector
|
||||
======================================
|
||||
|
||||
The rcuupdate.rcu_cpu_stall_suppress module parameter disables RCU's
|
||||
CPU stall detector, which detects conditions that unduly delay RCU grace
|
||||
@ -118,6 +131,7 @@ The stall detector's idea of what constitutes "unduly delayed" is
|
||||
controlled by a set of kernel configuration variables and cpp macros:
|
||||
|
||||
CONFIG_RCU_CPU_STALL_TIMEOUT
|
||||
----------------------------
|
||||
|
||||
This kernel configuration parameter defines the period of time
|
||||
that RCU will wait from the beginning of a grace period until it
|
||||
@ -137,6 +151,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT
|
||||
/sys/module/rcupdate/parameters/rcu_cpu_stall_suppress.
|
||||
|
||||
RCU_STALL_DELAY_DELTA
|
||||
---------------------
|
||||
|
||||
Although the lockdep facility is extremely useful, it does add
|
||||
some overhead. Therefore, under CONFIG_PROVE_RCU, the
|
||||
@ -145,6 +160,7 @@ RCU_STALL_DELAY_DELTA
|
||||
macro, not a kernel configuration parameter.)
|
||||
|
||||
RCU_STALL_RAT_DELAY
|
||||
-------------------
|
||||
|
||||
The CPU stall detector tries to make the offending CPU print its
|
||||
own warnings, as this often gives better-quality stack traces.
|
||||
@ -155,6 +171,7 @@ RCU_STALL_RAT_DELAY
|
||||
parameter.)
|
||||
|
||||
rcupdate.rcu_task_stall_timeout
|
||||
-------------------------------
|
||||
|
||||
This boot/sysfs parameter controls the RCU-tasks stall warning
|
||||
interval. A value of zero or less suppresses RCU-tasks stall
|
||||
@ -168,9 +185,10 @@ rcupdate.rcu_task_stall_timeout
|
||||
|
||||
|
||||
Interpreting RCU's CPU Stall-Detector "Splats"
|
||||
==============================================
|
||||
|
||||
For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling,
|
||||
it will print a message similar to the following:
|
||||
it will print a message similar to the following::
|
||||
|
||||
INFO: rcu_sched detected stalls on CPUs/tasks:
|
||||
2-...: (3 GPs behind) idle=06c/0/0 softirq=1453/1455 fqs=0
|
||||
@ -223,7 +241,7 @@ an estimate of the total number of RCU callbacks queued across all CPUs
|
||||
(625 in this case).
|
||||
|
||||
In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
|
||||
for each CPU:
|
||||
for each CPU::
|
||||
|
||||
0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
|
||||
|
||||
@ -235,7 +253,7 @@ processing is enabled.
|
||||
|
||||
If the grace period ends just as the stall warning starts printing,
|
||||
there will be a spurious stall-warning message, which will include
|
||||
the following:
|
||||
the following::
|
||||
|
||||
INFO: Stall ended before state dump start
|
||||
|
||||
@ -248,7 +266,7 @@ which is overkill for this sort of problem.
|
||||
|
||||
If all CPUs and tasks have passed through quiescent states, but the
|
||||
grace period has nevertheless failed to end, the stall-warning splat
|
||||
will include something like the following:
|
||||
will include something like the following::
|
||||
|
||||
All QSes seen, last rcu_preempt kthread activity 23807 (4297905177-4297881370), jiffies_till_next_fqs=3, root ->qsmask 0x0
|
||||
|
||||
@ -261,7 +279,7 @@ which is way less than 23807. Finally, the root rcu_node structure's
|
||||
|
||||
If the relevant grace-period kthread has been unable to run prior to
|
||||
the stall warning, as was the case in the "All QSes seen" line above,
|
||||
the following additional line is printed:
|
||||
the following additional line is printed::
|
||||
|
||||
kthread starved for 23807 jiffies! g7075 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1 ->cpu=5
|
||||
|
||||
@ -276,6 +294,7 @@ kthread last ran on CPU 5.
|
||||
|
||||
|
||||
Multiple Warnings From One Stall
|
||||
================================
|
||||
|
||||
If a stall lasts long enough, multiple stall-warning messages will be
|
||||
printed for it. The second and subsequent messages are printed at
|
||||
@ -285,9 +304,10 @@ of the stall and the first message.
|
||||
|
||||
|
||||
Stall Warnings for Expedited Grace Periods
|
||||
==========================================
|
||||
|
||||
If an expedited grace period detects a stall, it will place a message
|
||||
like the following in dmesg:
|
||||
like the following in dmesg::
|
||||
|
||||
INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 7-... } 21119 jiffies s: 73 root: 0x2/.
|
||||
|
@ -1,7 +1,12 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==========================
|
||||
RCU Torture Test Operation
|
||||
==========================
|
||||
|
||||
|
||||
CONFIG_RCU_TORTURE_TEST
|
||||
=======================
|
||||
|
||||
The CONFIG_RCU_TORTURE_TEST config option is available for all RCU
|
||||
implementations. It creates an rcutorture kernel module that can
|
||||
@ -13,9 +18,10 @@ when the module is loaded, and stops when the module is unloaded.
|
||||
Module parameters are prefixed by "rcutorture." in
|
||||
Documentation/admin-guide/kernel-parameters.txt.
|
||||
|
||||
OUTPUT
|
||||
Output
|
||||
======
|
||||
|
||||
The statistics output is as follows:
|
||||
The statistics output is as follows::
|
||||
|
||||
rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4
|
||||
rcu-torture: rtc: (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767
|
||||
@ -36,53 +42,53 @@ automatic determination as to whether RCU operated correctly.
|
||||
|
||||
The entries are as follows:
|
||||
|
||||
o "rtc": The hexadecimal address of the structure currently visible
|
||||
* "rtc": The hexadecimal address of the structure currently visible
|
||||
to readers.
|
||||
|
||||
o "ver": The number of times since boot that the RCU writer task
|
||||
* "ver": The number of times since boot that the RCU writer task
|
||||
has changed the structure visible to readers.
|
||||
|
||||
o "tfle": If non-zero, indicates that the "torture freelist"
|
||||
* "tfle": If non-zero, indicates that the "torture freelist"
|
||||
containing structures to be placed into the "rtc" area is empty.
|
||||
This condition is important, since it can fool you into thinking
|
||||
that RCU is working when it is not. :-/
|
||||
|
||||
o "rta": Number of structures allocated from the torture freelist.
|
||||
* "rta": Number of structures allocated from the torture freelist.
|
||||
|
||||
o "rtaf": Number of allocations from the torture freelist that have
|
||||
* "rtaf": Number of allocations from the torture freelist that have
|
||||
failed due to the list being empty. It is not unusual for this
|
||||
to be non-zero, but it is bad for it to be a large fraction of
|
||||
the value indicated by "rta".
|
||||
|
||||
o "rtf": Number of frees into the torture freelist.
|
||||
* "rtf": Number of frees into the torture freelist.
|
||||
|
||||
o "rtmbe": A non-zero value indicates that rcutorture believes that
|
||||
* "rtmbe": A non-zero value indicates that rcutorture believes that
|
||||
rcu_assign_pointer() and rcu_dereference() are not working
|
||||
correctly. This value should be zero.
|
||||
|
||||
o "rtbe": A non-zero value indicates that one of the rcu_barrier()
|
||||
* "rtbe": A non-zero value indicates that one of the rcu_barrier()
|
||||
family of functions is not working correctly.
|
||||
|
||||
o "rtbke": rcutorture was unable to create the real-time kthreads
|
||||
* "rtbke": rcutorture was unable to create the real-time kthreads
|
||||
used to force RCU priority inversion. This value should be zero.
|
||||
|
||||
o "rtbre": Although rcutorture successfully created the kthreads
|
||||
* "rtbre": Although rcutorture successfully created the kthreads
|
||||
used to force RCU priority inversion, it was unable to set them
|
||||
to the real-time priority level of 1. This value should be zero.
|
||||
|
||||
o "rtbf": The number of times that RCU priority boosting failed
|
||||
* "rtbf": The number of times that RCU priority boosting failed
|
||||
to resolve RCU priority inversion.
|
||||
|
||||
o "rtb": The number of times that rcutorture attempted to force
|
||||
* "rtb": The number of times that rcutorture attempted to force
|
||||
an RCU priority inversion condition. If you are testing RCU
|
||||
priority boosting via the "test_boost" module parameter, this
|
||||
value should be non-zero.
|
||||
|
||||
o "nt": The number of times rcutorture ran RCU read-side code from
|
||||
* "nt": The number of times rcutorture ran RCU read-side code from
|
||||
within a timer handler. This value should be non-zero only
|
||||
if you specified the "irqreader" module parameter.
|
||||
|
||||
o "Reader Pipe": Histogram of "ages" of structures seen by readers.
|
||||
* "Reader Pipe": Histogram of "ages" of structures seen by readers.
|
||||
If any entries past the first two are non-zero, RCU is broken.
|
||||
And rcutorture prints the error flag string "!!!" to make sure
|
||||
you notice. The age of a newly allocated structure is zero,
|
||||
@ -94,14 +100,14 @@ o "Reader Pipe": Histogram of "ages" of structures seen by readers.
|
||||
RCU. If you want to see what it looks like when broken, break
|
||||
it yourself. ;-)
|
||||
|
||||
o "Reader Batch": Another histogram of "ages" of structures seen
|
||||
* "Reader Batch": Another histogram of "ages" of structures seen
|
||||
by readers, but in terms of counter flips (or batches) rather
|
||||
than in terms of grace periods. The legal number of non-zero
|
||||
entries is again two. The reason for this separate view is that
|
||||
it is sometimes easier to get the third entry to show up in the
|
||||
"Reader Batch" list than in the "Reader Pipe" list.
|
||||
|
||||
o "Free-Block Circulation": Shows the number of torture structures
|
||||
* "Free-Block Circulation": Shows the number of torture structures
|
||||
that have reached a given point in the pipeline. The first element
|
||||
should closely correspond to the number of structures allocated,
|
||||
the second to the number that have been removed from reader view,
|
||||
@ -112,7 +118,7 @@ o "Free-Block Circulation": Shows the number of torture structures
|
||||
|
||||
Different implementations of RCU can provide implementation-specific
|
||||
additional information. For example, Tree SRCU provides the following
|
||||
additional line:
|
||||
additional line::
|
||||
|
||||
srcud-torture: Tree SRCU per-CPU(idx=0): 0(35,-21) 1(-4,24) 2(1,1) 3(-26,20) 4(28,-47) 5(-9,4) 6(-10,14) 7(-14,11) T(1,6)
|
||||
|
||||
@ -123,15 +129,15 @@ using a dynamically allocated srcu_struct (hence "srcud-" rather than
|
||||
"old" and "current" values to the underlying array, and is useful for
|
||||
debugging. The final "T" entry contains the totals of the counters.
|
||||
|
||||
|
||||
USAGE ON SPECIFIC KERNEL BUILDS
|
||||
Usage on Specific Kernel Builds
|
||||
===============================
|
||||
|
||||
It is sometimes desirable to torture RCU on a specific kernel build,
|
||||
for example, when preparing to put that kernel build into production.
|
||||
In that case, the kernel should be built with CONFIG_RCU_TORTURE_TEST=m
|
||||
so that the test can be started using modprobe and terminated using rmmod.
|
||||
|
||||
For example, the following script may be used to torture RCU:
|
||||
For example, the following script may be used to torture RCU::
|
||||
|
||||
#!/bin/sh
|
||||
|
||||
@ -148,7 +154,8 @@ two are self-explanatory, while the last indicates that while there
|
||||
were no RCU failures, CPU-hotplug problems were detected.
|
||||
|
||||
|
||||
USAGE ON MAINLINE KERNELS
|
||||
Usage on Mainline Kernels
|
||||
=========================
|
||||
|
||||
When using rcutorture to test changes to RCU itself, it is often
|
||||
necessary to build a number of kernels in order to test that change
|
||||
@ -180,16 +187,16 @@ to Tree SRCU might run only the SRCU-N and SRCU-P scenarios using the
|
||||
--configs argument to kvm.sh as follows: "--configs 'SRCU-N SRCU-P'".
|
||||
Large systems can run multiple copies of of the full set of scenarios,
|
||||
for example, a system with 448 hardware threads can run five instances
|
||||
of the full set concurrently. To make this happen:
|
||||
of the full set concurrently. To make this happen::
|
||||
|
||||
kvm.sh --cpus 448 --configs '5*CFLIST'
|
||||
|
||||
Alternatively, such a system can run 56 concurrent instances of a single
|
||||
eight-CPU scenario:
|
||||
eight-CPU scenario::
|
||||
|
||||
kvm.sh --cpus 448 --configs '56*TREE04'
|
||||
|
||||
Or 28 concurrent instances of each of two eight-CPU scenarios:
|
||||
Or 28 concurrent instances of each of two eight-CPU scenarios::
|
||||
|
||||
kvm.sh --cpus 448 --configs '28*TREE03 28*TREE04'
|
||||
|
||||
@ -199,14 +206,14 @@ values for memory may require disabling the callback-flooding tests
|
||||
using the --bootargs parameter discussed below.
|
||||
|
||||
Sometimes additional debugging is useful, and in such cases the --kconfig
|
||||
parameter to kvm.sh may be used, for example, "--kconfig 'CONFIG_KASAN=y'".
|
||||
parameter to kvm.sh may be used, for example, ``--kconfig 'CONFIG_KASAN=y'``.
|
||||
|
||||
Kernel boot arguments can also be supplied, for example, to control
|
||||
rcutorture's module parameters. For example, to test a change to RCU's
|
||||
CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'".
|
||||
This will of course result in the scripting reporting a failure, namely
|
||||
the resuling RCU CPU stall warning. As noted above, reducing memory may
|
||||
require disabling rcutorture's callback-flooding tests:
|
||||
require disabling rcutorture's callback-flooding tests::
|
||||
|
||||
kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \
|
||||
--bootargs 'rcutorture.fwd_progress=0'
|
||||
@ -225,7 +232,7 @@ is listed at the end of the kvm.sh output, which you really should redirect
|
||||
to a file. The build products and console output of each run is kept in
|
||||
tools/testing/selftests/rcutorture/res in timestamped directories. A
|
||||
given directory can be supplied to kvm-find-errors.sh in order to have
|
||||
it cycle you through summaries of errors and full error logs. For example:
|
||||
it cycle you through summaries of errors and full error logs. For example::
|
||||
|
||||
tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh \
|
||||
tools/testing/selftests/rcutorture/res/2020.01.20-15.54.23
|
||||
@ -245,38 +252,42 @@ that was tested and any uncommitted changes in diff format.
|
||||
|
||||
The most frequently used files in each per-scenario-run directory are:
|
||||
|
||||
.config: This file contains the Kconfig options.
|
||||
.config:
|
||||
This file contains the Kconfig options.
|
||||
|
||||
Make.out: This contains build output for a specific scenario.
|
||||
Make.out:
|
||||
This contains build output for a specific scenario.
|
||||
|
||||
console.log: This contains the console output for a specific scenario.
|
||||
console.log:
|
||||
This contains the console output for a specific scenario.
|
||||
This file may be examined once the kernel has booted, but
|
||||
it might not exist if the build failed.
|
||||
|
||||
vmlinux: This contains the kernel, which can be useful with tools like
|
||||
vmlinux:
|
||||
This contains the kernel, which can be useful with tools like
|
||||
objdump and gdb.
|
||||
|
||||
A number of additional files are available, but are less frequently used.
|
||||
Many are intended for debugging of rcutorture itself or of its scripting.
|
||||
|
||||
As of v5.4, a successful run with the default set of scenarios produces
|
||||
the following summary at the end of the run on a 12-CPU system:
|
||||
the following summary at the end of the run on a 12-CPU system::
|
||||
|
||||
SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ]
|
||||
SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ]
|
||||
SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ]
|
||||
SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ]
|
||||
TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ]
|
||||
TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ]
|
||||
TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ]
|
||||
TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198
|
||||
TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631
|
||||
TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ]
|
||||
TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844
|
||||
TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497
|
||||
CPU count limited from 16 to 12
|
||||
TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961
|
||||
TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997
|
||||
TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732
|
||||
CPU count limited from 16 to 12
|
||||
TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011
|
@ -93,6 +93,11 @@ It exists in the sparse memory mapping model, and it is also somewhat
|
||||
similar to the mem_map variable, both of them are used to translate an
|
||||
address.
|
||||
|
||||
MAX_PHYSMEM_BITS
|
||||
----------------
|
||||
|
||||
Defines the maximum supported physical address space.
|
||||
|
||||
page
|
||||
----
|
||||
|
||||
@ -399,6 +404,17 @@ KERNELPACMASK
|
||||
The mask to extract the Pointer Authentication Code from a kernel virtual
|
||||
address.
|
||||
|
||||
TCR_EL1.T1SZ
|
||||
------------
|
||||
|
||||
Indicates the size offset of the memory region addressed by TTBR1_EL1.
|
||||
The region size is 2^(64-T1SZ) bytes.
|
||||
|
||||
TTBR1_EL1 is the table base address register specified by ARMv8-A
|
||||
architecture which is used to lookup the page-tables for the Virtual
|
||||
addresses in the higher VA range (refer to ARMv8 ARM document for
|
||||
more details).
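
For orientation only, the arithmetic above can be sketched in C (the helper
name is hypothetical and is not exported by the kernel)::

    #include <stdint.h>

    /* Bytes addressable through TTBR1_EL1 for a valid, nonzero T1SZ value. */
    static inline uint64_t ttbr1_region_size(unsigned int t1sz)
    {
            return UINT64_C(1) << (64 - t1sz);  /* 2^(64 - T1SZ) */
    }

For example, T1SZ = 16 yields a 2^48-byte (256 TiB) region.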
|
||||
|
||||
arm
|
||||
===
|
||||
|
||||
|
@ -4038,6 +4038,14 @@
|
||||
latencies, which will choose a value aligned
|
||||
with the appropriate hardware boundaries.
|
||||
|
||||
rcutree.rcu_min_cached_objs= [KNL]
|
||||
Minimum number of objects which are cached and
maintained per CPU. Object size is equal to
PAGE_SIZE. The cache reduces pressure on the page
allocator and also helps the whole algorithm behave
better under low-memory conditions.
|
||||
|
||||
rcutree.jiffies_till_first_fqs= [KNL]
|
||||
Set delay from grace-period initialization to
|
||||
first attempt to force quiescent states.
|
||||
@ -4258,6 +4266,20 @@
|
||||
Set time (jiffies) between CPU-hotplug operations,
|
||||
or zero to disable CPU-hotplug testing.
|
||||
|
||||
rcutorture.read_exit= [KNL]
|
||||
Set the number of read-then-exit kthreads used
|
||||
to test the interaction of RCU updaters and
|
||||
task-exit processing.
|
||||
|
||||
rcutorture.read_exit_burst= [KNL]
|
||||
The number of times in a given read-then-exit
|
||||
episode that a set of read-then-exit kthreads
|
||||
is spawned.
|
||||
|
||||
rcutorture.read_exit_delay= [KNL]
|
||||
The delay, in seconds, between successive
|
||||
read-then-exit testing episodes.
|
||||
|
||||
rcutorture.shuffle_interval= [KNL]
|
||||
Set task-shuffle interval (s). Shuffling tasks
|
||||
allows some CPUs to go into dyntick-idle mode
|
||||
@ -4407,6 +4429,45 @@
|
||||
reboot_cpu is s[mp]#### with #### being the processor
|
||||
to be used for rebooting.
|
||||
|
||||
refscale.holdoff= [KNL]
|
||||
Set test-start holdoff period. The purpose of
|
||||
this parameter is to delay the start of the
|
||||
test until boot completes in order to avoid
|
||||
interference.
|
||||
|
||||
refscale.loops= [KNL]
|
||||
Set the number of loops over the synchronization
|
||||
primitive under test. Increasing this number
|
||||
reduces noise due to loop start/end overhead,
|
||||
but the default has already reduced the per-pass
|
||||
noise to a handful of picoseconds on ca. 2020
|
||||
x86 laptops.
|
||||
|
||||
refscale.nreaders= [KNL]
|
||||
Set number of readers. The default value of -1
|
||||
selects N, where N is roughly 75% of the number
|
||||
of CPUs. A value of zero is an interesting choice.
|
||||
|
||||
refscale.nruns= [KNL]
|
||||
Set number of runs, each of which is dumped onto
|
||||
the console log.
|
||||
|
||||
refscale.readdelay= [KNL]
|
||||
Set the read-side critical-section duration,
|
||||
measured in microseconds.
|
||||
|
||||
refscale.scale_type= [KNL]
|
||||
Specify the read-protection implementation to test.
|
||||
|
||||
refscale.shutdown= [KNL]
|
||||
Shut down the system at the end of the performance
test. This defaults to 1 (shut it down) when
refscale is built into the kernel and to 0 (leave
it running) when refscale is built as a module.
|
||||
|
||||
refscale.verbose= [KNL]
|
||||
Enable additional printk() statements.
|
||||
|
||||
relax_domain_level=
|
||||
[KNL, SMP] Set scheduler's default relax_domain_level.
|
||||
See Documentation/admin-guide/cgroup-v1/cpusets.rst.
|
||||
@ -5082,6 +5143,13 @@
|
||||
Prevent the CPU-hotplug component of torturing
|
||||
until after init has spawned.
|
||||
|
||||
torture.ftrace_dump_at_shutdown= [KNL]
|
||||
Dump the ftrace buffer at torture-test shutdown,
|
||||
even if there were no errors. This can be a
|
||||
very costly operation when many torture tests
|
||||
are running concurrently, especially on systems
|
||||
with rotating-rust storage.
|
||||
|
||||
tp720= [HW,PS2]
|
||||
|
||||
tpm_suspend_pcr=[HW,TPM]
|
||||
|
@ -85,21 +85,21 @@ smp_store_release() respectively. Therefore, if you find yourself only using
|
||||
the Non-RMW operations of atomic_t, you do not in fact need atomic_t at all
|
||||
and are doing it wrong.
|
||||
|
||||
A subtle detail of atomic_set{}() is that it should be observable to the RMW
|
||||
ops. That is:
|
||||
A note for the implementation of atomic_set{}() is that it must not break the
|
||||
atomicity of the RMW ops. That is:
|
||||
|
||||
C atomic-set
|
||||
C Atomic-RMW-ops-are-atomic-WRT-atomic_set
|
||||
|
||||
{
|
||||
atomic_set(v, 1);
|
||||
atomic_t v = ATOMIC_INIT(1);
|
||||
}
|
||||
|
||||
P0(atomic_t *v)
|
||||
{
|
||||
(void)atomic_add_unless(v, 1, 0);
|
||||
}
|
||||
|
||||
P1(atomic_t *v)
|
||||
{
|
||||
atomic_add_unless(v, 1, 0);
|
||||
}
|
||||
|
||||
P2(atomic_t *v)
|
||||
{
|
||||
atomic_set(v, 0);
|
||||
}
|
||||
@ -233,19 +233,19 @@ as well. Similarly, something like:
|
||||
is an ACQUIRE pattern (though very much not typical), but again the barrier is
|
||||
strictly stronger than ACQUIRE. As illustrated:
|
||||
|
||||
C strong-acquire
|
||||
C Atomic-RMW+mb__after_atomic-is-stronger-than-acquire
|
||||
|
||||
{
|
||||
}
|
||||
|
||||
P1(int *x, atomic_t *y)
|
||||
P0(int *x, atomic_t *y)
|
||||
{
|
||||
r0 = READ_ONCE(*x);
|
||||
smp_rmb();
|
||||
r1 = atomic_read(y);
|
||||
}
|
||||
|
||||
P2(int *x, atomic_t *y)
|
||||
P1(int *x, atomic_t *y)
|
||||
{
|
||||
atomic_inc(y);
|
||||
smp_mb__after_atomic();
|
||||
@ -253,14 +253,14 @@ strictly stronger than ACQUIRE. As illustrated:
|
||||
}
|
||||
|
||||
exists
|
||||
(r0=1 /\ r1=0)
|
||||
(0:r0=1 /\ 0:r1=0)
|
||||
|
||||
This should not happen; but a hypothetical atomic_inc_acquire() --
|
||||
(void)atomic_fetch_inc_acquire() for instance -- would allow the outcome,
|
||||
because it would not order the W part of the RMW against the following
|
||||
WRITE_ONCE. Thus:
|
||||
|
||||
P1 P2
|
||||
P0 P1
|
||||
|
||||
t = LL.acq *y (0)
|
||||
t++;
|
||||
|
@ -8,7 +8,8 @@ approach to detect races. KCSAN's primary purpose is to detect `data races`_.
|
||||
Usage
|
||||
-----
|
||||
|
||||
KCSAN requires Clang version 11 or later.
|
||||
KCSAN is supported by both GCC and Clang. With GCC we require version 11 or
|
||||
later, and with Clang we also require version 11 or later.
|
||||
|
||||
To enable KCSAN configure the kernel with::
|
||||
|
||||
|
@ -28,6 +28,16 @@ Documentation/devicetree/bindings/iommu/iommu.txt.
|
||||
For arm-smmu binding, see:
|
||||
Documentation/devicetree/bindings/iommu/arm,smmu.yaml.
|
||||
|
||||
The MSI writes are accompanied by sideband data which is derived from the ICID.
|
||||
The msi-map property is used to associate the devices with both the ITS
|
||||
controller and the sideband data which accompanies the writes.
|
||||
|
||||
For generic MSI bindings, see
|
||||
Documentation/devicetree/bindings/interrupt-controller/msi.txt.
|
||||
|
||||
For GICv3 and GIC ITS bindings, see:
|
||||
Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml.
|
||||
|
||||
Required properties:
|
||||
|
||||
- compatible
|
||||
@ -49,11 +59,6 @@ Required properties:
|
||||
region may not be present in some scenarios, such
|
||||
as in the device tree presented to a virtual machine.
|
||||
|
||||
- msi-parent
|
||||
Value type: <phandle>
|
||||
Definition: Must be present and point to the MSI controller node
|
||||
handling message interrupts for the MC.
|
||||
|
||||
- ranges
|
||||
Value type: <prop-encoded-array>
|
||||
Definition: A standard property. Defines the mapping between the child
|
||||
@ -119,6 +124,28 @@ Optional properties:
|
||||
associated with the listed IOMMU, with the iommu-specifier
|
||||
(i - icid-base + iommu-base).
|
||||
|
||||
- msi-map: Maps an ICID to a GIC ITS and associated msi-specifier
|
||||
data.
|
||||
|
||||
The property is an arbitrary number of tuples of
|
||||
(icid-base,gic-its,msi-base,length).
|
||||
|
||||
Any ICID in the interval [icid-base, icid-base + length) is
|
||||
associated with the listed GIC ITS, with the msi-specifier
|
||||
(i - icid-base + msi-base).
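
  For orientation, the lookup this describes can be sketched in C as
  follows (the function and parameter names are hypothetical, purely
  illustrative, and not part of the binding):

	/*
	 * Hypothetical sketch: compute the msi-specifier for an ICID covered
	 * by a single (icid-base, gic-its, msi-base, length) msi-map tuple.
	 * Returns 0 on success, -1 if the ICID is outside this tuple's range.
	 */
	static int icid_to_msi_spec(unsigned int icid, unsigned int icid_base,
				    unsigned int msi_base, unsigned int length,
				    unsigned int *msi_spec)
	{
		if (icid < icid_base || icid - icid_base >= length)
			return -1;
		*msi_spec = icid - icid_base + msi_base;
		return 0;
	}

  With the msi-map used in the example below (<23 &its 23 41>), ICID 24
  would map to msi-specifier 24.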
|
||||
|
||||
Deprecated properties:
|
||||
|
||||
- msi-parent
|
||||
Value type: <phandle>
|
||||
Definition: Describes the MSI controller node handling message
|
||||
interrupts for the MC. When there is no translation
|
||||
between the ICID and deviceID this property can be used
|
||||
to describe the MSI controller used by the devices on the
|
||||
mc-bus.
|
||||
The use of this property for mc-bus is deprecated. Please
|
||||
use msi-map.
|
||||
|
||||
Example:
|
||||
|
||||
smmu: iommu@5000000 {
|
||||
@ -128,13 +155,24 @@ Example:
|
||||
...
|
||||
};
|
||||
|
||||
gic: interrupt-controller@6000000 {
|
||||
compatible = "arm,gic-v3";
|
||||
...
|
||||
}
|
||||
its: gic-its@6020000 {
|
||||
compatible = "arm,gic-v3-its";
|
||||
msi-controller;
|
||||
...
|
||||
};
|
||||
|
||||
fsl_mc: fsl-mc@80c000000 {
|
||||
compatible = "fsl,qoriq-mc";
|
||||
reg = <0x00000008 0x0c000000 0 0x40>, /* MC portal base */
|
||||
<0x00000000 0x08340000 0 0x40000>; /* MC control reg */
|
||||
msi-parent = <&its>;
|
||||
/* define map for ICIDs 23-64 */
|
||||
iommu-map = <23 &smmu 23 41>;
|
||||
/* define msi map for ICIDs 23-64 */
|
||||
msi-map = <23 &its 23 41>;
|
||||
#address-cells = <3>;
|
||||
#size-cells = <1>;
|
||||
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | TODO |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | ok |
|
||||
| um: | ok |
|
||||
| unicore32: | ok |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -51,7 +51,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | .. |
|
||||
| sparc: | TODO |
|
||||
| um: | .. |
|
||||
| unicore32: | .. |
|
||||
| x86: | ok |
|
||||
| xtensa: | .. |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | ok |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | TODO |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | ok |
|
||||
| um: | ok |
|
||||
| unicore32: | ok |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | .. |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | ok |
|
||||
| um: | ok |
|
||||
| unicore32: | ok |
|
||||
| x86: | ok |
|
||||
| xtensa: | ok |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | .. |
|
||||
| sparc: | ok |
|
||||
| um: | .. |
|
||||
| unicore32: | .. |
|
||||
| x86: | ok |
|
||||
| xtensa: | .. |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | .. |
|
||||
| unicore32: | .. |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | TODO |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | TODO |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
@ -28,7 +28,6 @@
|
||||
| sh: | ok |
|
||||
| sparc: | ok |
|
||||
| um: | TODO |
|
||||
| unicore32: | TODO |
|
||||
| x86: | ok |
|
||||
| xtensa: | TODO |
|
||||
-----------------------
|
||||
|
35
Documentation/litmus-tests/README
Normal file
@ -0,0 +1,35 @@
|
||||
============
|
||||
LITMUS TESTS
|
||||
============
|
||||
|
||||
Each subdirectory contains litmus tests that describe the semantics of the
respective kernel APIs.
|
||||
For more information about how to "run" a litmus test or how to generate
|
||||
a kernel test module based on a litmus test, please see
|
||||
tools/memory-model/README.
|
||||
|
||||
|
||||
atomic (/atomic directory)
|
||||
--------------------------
|
||||
|
||||
Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus
|
||||
Test that an atomic RMW followed by a smp_mb__after_atomic() is
|
||||
stronger than a normal acquire: both the read and write parts of
|
||||
the RMW are ordered before the subsequent memory accesses.
|
||||
|
||||
Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
|
||||
Test that atomic_set() cannot break the atomicity of atomic RMWs.
|
||||
NOTE: Requires herd7 7.56 or later, which supports "(void)expr".
|
||||
|
||||
|
||||
RCU (/rcu directory)
|
||||
--------------------
|
||||
|
||||
MP+onceassign+derefonce.litmus (under tools/memory-model/litmus-tests/)
|
||||
Demonstrates the use of rcu_assign_pointer() and rcu_dereference() to
|
||||
ensure that an RCU reader will not see pre-initialization garbage.
|
||||
|
||||
RCU+sync+read.litmus
|
||||
RCU+sync+free.litmus
|
||||
Both the above litmus tests demonstrate the RCU grace period guarantee
|
||||
that an RCU read-side critical section can never span a grace period.
|
@ -0,0 +1,32 @@
|
||||
C Atomic-RMW+mb__after_atomic-is-stronger-than-acquire
|
||||
|
||||
(*
|
||||
* Result: Never
|
||||
*
|
||||
* Test that an atomic RMW followed by a smp_mb__after_atomic() is
|
||||
* stronger than a normal acquire: both the read and write parts of
|
||||
* the RMW are ordered before the subsequent memory accesses.
|
||||
*)
|
||||
|
||||
{
|
||||
}
|
||||
|
||||
P0(int *x, atomic_t *y)
|
||||
{
|
||||
int r0;
|
||||
int r1;
|
||||
|
||||
r0 = READ_ONCE(*x);
|
||||
smp_rmb();
|
||||
r1 = atomic_read(y);
|
||||
}
|
||||
|
||||
P1(int *x, atomic_t *y)
|
||||
{
|
||||
atomic_inc(y);
|
||||
smp_mb__after_atomic();
|
||||
WRITE_ONCE(*x, 1);
|
||||
}
|
||||
|
||||
exists
|
||||
(0:r0=1 /\ 0:r1=0)
|
@ -0,0 +1,25 @@
|
||||
C Atomic-RMW-ops-are-atomic-WRT-atomic_set
|
||||
|
||||
(*
|
||||
* Result: Never
|
||||
*
|
||||
* Test that atomic_set() cannot break the atomicity of atomic RMWs.
|
||||
* NOTE: This requires herd7 7.56 or later which supports "(void)expr".
|
||||
*)
|
||||
|
||||
{
|
||||
atomic_t v = ATOMIC_INIT(1);
|
||||
}
|
||||
|
||||
P0(atomic_t *v)
|
||||
{
|
||||
(void)atomic_add_unless(v, 1, 0);
|
||||
}
|
||||
|
||||
P1(atomic_t *v)
|
||||
{
|
||||
atomic_set(v, 0);
|
||||
}
|
||||
|
||||
exists
|
||||
(v=2)
|
42
Documentation/litmus-tests/rcu/RCU+sync+free.litmus
Normal file
@ -0,0 +1,42 @@
|
||||
C RCU+sync+free
|
||||
|
||||
(*
|
||||
* Result: Never
|
||||
*
|
||||
* This litmus test demonstrates that an RCU reader can never see a write that
|
||||
* follows a grace period, if it did not see writes that precede that grace
|
||||
* period.
|
||||
*
|
||||
* This is a typical pattern of RCU usage, where the write before the grace
|
||||
* period assigns a pointer, and the writes following the grace period destroy
|
||||
* the object that the pointer used to point to.
|
||||
*
|
||||
* This is one implication of the RCU grace-period guarantee, which says (among
|
||||
* other things) that an RCU read-side critical section cannot span a grace period.
|
||||
*)
|
||||
|
||||
{
|
||||
int x = 1;
|
||||
int *y = &x;
|
||||
int z = 1;
|
||||
}
|
||||
|
||||
P0(int *x, int *z, int **y)
|
||||
{
|
||||
int *r0;
|
||||
int r1;
|
||||
|
||||
rcu_read_lock();
|
||||
r0 = rcu_dereference(*y);
|
||||
r1 = READ_ONCE(*r0);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
P1(int *x, int *z, int **y)
|
||||
{
|
||||
rcu_assign_pointer(*y, z);
|
||||
synchronize_rcu();
|
||||
WRITE_ONCE(*x, 0);
|
||||
}
|
||||
|
||||
exists (0:r0=x /\ 0:r1=0)
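
For orientation only (this is not part of the litmus test), the kernel-code
shape being modeled is roughly the following, where gp, newp, oldp and
do_something_with() are hypothetical names:

	/* Updater: publish the new object, wait a grace period, free the old one. */
	rcu_assign_pointer(gp, newp);
	synchronize_rcu();
	kfree(oldp);

	/* Reader: anything obtained via rcu_dereference() inside the read-side
	 * critical section remains valid until rcu_read_unlock(). */
	rcu_read_lock();
	p = rcu_dereference(gp);
	if (p)
		do_something_with(p);
	rcu_read_unlock();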
|
37
Documentation/litmus-tests/rcu/RCU+sync+read.litmus
Normal file
@ -0,0 +1,37 @@
|
||||
C RCU+sync+read
|
||||
|
||||
(*
|
||||
* Result: Never
|
||||
*
|
||||
* This litmus test demonstrates that after a grace period, an RCU updater always
|
||||
* sees all stores done in prior RCU read-side critical sections. Such
|
||||
* read-side critical sections would have ended before the grace period ended.
|
||||
*
|
||||
* This is one implication of the RCU grace-period guarantee, which says (among
|
||||
* other things) that an RCU read-side critical section cannot span a grace period.
|
||||
*)
|
||||
|
||||
{
|
||||
int x = 0;
|
||||
int y = 0;
|
||||
}
|
||||
|
||||
P0(int *x, int *y)
|
||||
{
|
||||
rcu_read_lock();
|
||||
WRITE_ONCE(*x, 1);
|
||||
WRITE_ONCE(*y, 1);
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
P1(int *x, int *y)
|
||||
{
|
||||
int r0;
|
||||
int r1;
|
||||
|
||||
r0 = READ_ONCE(*x);
|
||||
synchronize_rcu();
|
||||
r1 = READ_ONCE(*y);
|
||||
}
|
||||
|
||||
exists (1:r0=1 /\ 1:r1=0)
|
@ -14,6 +14,7 @@ locking
|
||||
mutex-design
|
||||
rt-mutex-design
|
||||
rt-mutex
|
||||
seqlock
|
||||
spinlocks
|
||||
ww-mutex-design
|
||||
preempt-locking
|
||||
|
@ -166,4 +166,4 @@ checked for such errors. The "rmmod" command forces a "SUCCESS",
|
||||
two are self-explanatory, while the last indicates that while there
|
||||
were no locking failures, CPU-hotplug problems were detected.
|
||||
|
||||
Also see: Documentation/RCU/torture.txt
|
||||
Also see: Documentation/RCU/torture.rst
|
||||
|
@ -18,7 +18,7 @@ as an alternative to these. This new data structure provided a number
|
||||
of advantages, including simpler interfaces, and at that time smaller
|
||||
code (see Disadvantages).
|
||||
|
||||
[1] http://lwn.net/Articles/164802/
|
||||
[1] https://lwn.net/Articles/164802/
|
||||
|
||||
Implementation
|
||||
--------------
|
||||
|
170
Documentation/locking/seqlock.rst
Normal file
@ -0,0 +1,170 @@
|
||||
======================================
|
||||
Sequence counters and sequential locks
|
||||
======================================
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
Sequence counters are a reader-writer consistency mechanism with
|
||||
lockless readers (read-only retry loops), and no writer starvation. They
|
||||
are used for data that's rarely written to (e.g. system time), where the
|
||||
reader wants a consistent set of information and is willing to retry if
|
||||
that information changes.
|
||||
|
||||
A data set is consistent when the sequence count at the beginning of the
|
||||
read side critical section is even and the same sequence count value is
|
||||
read again at the end of the critical section. The data in the set must
|
||||
be copied out inside the read side critical section. If the sequence
|
||||
count has changed between the start and the end of the critical section,
|
||||
the reader must retry.
|
||||
|
||||
Writers increment the sequence count at the start and the end of their
|
||||
critical section. After starting the critical section the sequence count
|
||||
is odd and indicates to the readers that an update is in progress. At
|
||||
the end of the write side critical section the sequence count becomes
|
||||
even again which lets readers make progress.
|
||||
|
||||
A sequence counter write side critical section must never be preempted
|
||||
or interrupted by read side sections. Otherwise the reader will spin for
|
||||
the entire scheduler tick due to the odd sequence count value and the
|
||||
interrupted writer. If that reader belongs to a real-time scheduling
|
||||
class, it can spin forever and the kernel will livelock.
|
||||
|
||||
This mechanism cannot be used if the protected data contains pointers,
|
||||
as the writer can invalidate a pointer that the reader is following.
|
||||
|
||||
|
||||
.. _seqcount_t:
|
||||
|
||||
Sequence counters (``seqcount_t``)
|
||||
==================================
|
||||
|
||||
This is the raw counting mechanism, which does not protect against
|
||||
multiple writers. Write side critical sections must thus be serialized
|
||||
by an external lock.
|
||||
|
||||
If the write serialization primitive is not implicitly disabling
|
||||
preemption, preemption must be explicitly disabled before entering the
|
||||
write side section. If the read section can be invoked from hardirq or
|
||||
softirq contexts, interrupts or bottom halves must also be respectively
|
||||
disabled before entering the write section.
|
||||
|
||||
If it's desired to automatically handle the sequence counter
|
||||
requirements of writer serialization and non-preemptibility, use
|
||||
:ref:`seqlock_t` instead.
|
||||
|
||||
Initialization::
|
||||
|
||||
/* dynamic */
|
||||
seqcount_t foo_seqcount;
|
||||
seqcount_init(&foo_seqcount);
|
||||
|
||||
/* static */
|
||||
static seqcount_t foo_seqcount = SEQCNT_ZERO(foo_seqcount);
|
||||
|
||||
/* C99 struct init */
|
||||
struct {
|
||||
.seq = SEQCNT_ZERO(foo.seq),
|
||||
} foo;
|
||||
|
||||
Write path::
|
||||
|
||||
/* Serialized context with disabled preemption */
|
||||
|
||||
write_seqcount_begin(&foo_seqcount);
|
||||
|
||||
/* ... [[write-side critical section]] ... */
|
||||
|
||||
write_seqcount_end(&foo_seqcount);
|
||||
|
||||
Read path::
|
||||
|
||||
do {
|
||||
seq = read_seqcount_begin(&foo_seqcount);
|
||||
|
||||
/* ... [[read-side critical section]] ... */
|
||||
|
||||
} while (read_seqcount_retry(&foo_seqcount, seq));
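
A combined write-side sketch, using a spinlock to provide the writer
serialization and non-preemptibility discussed above (``foo_lock`` is a
hypothetical lock, not part of the seqcount API)::

	static DEFINE_SPINLOCK(foo_lock);

	/* Writer: foo_lock serializes writers and, being a spinlock,
	 * also disables preemption around the seqcount write section. */
	spin_lock(&foo_lock);
	write_seqcount_begin(&foo_seqcount);

	/* ... [[write-side critical section]] ... */

	write_seqcount_end(&foo_seqcount);
	spin_unlock(&foo_lock);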
|
||||
|
||||
|
||||
.. _seqlock_t:
|
||||
|
||||
Sequential locks (``seqlock_t``)
|
||||
================================
|
||||
|
||||
This contains the :ref:`seqcount_t` mechanism earlier discussed, plus an
|
||||
embedded spinlock for writer serialization and non-preemptibility.
|
||||
|
||||
If the read side section can be invoked from hardirq or softirq context,
|
||||
use the write side function variants which disable interrupts or bottom
|
||||
halves respectively.
|
||||
|
||||
Initialization::
|
||||
|
||||
/* dynamic */
|
||||
seqlock_t foo_seqlock;
|
||||
seqlock_init(&foo_seqlock);
|
||||
|
||||
/* static */
|
||||
static DEFINE_SEQLOCK(foo_seqlock);
|
||||
|
||||
/* C99 struct init */
|
||||
struct {
|
||||
.seql = __SEQLOCK_UNLOCKED(foo.seql)
|
||||
} foo;
|
||||
|
||||
Write path::
|
||||
|
||||
write_seqlock(&foo_seqlock);
|
||||
|
||||
/* ... [[write-side critical section]] ... */
|
||||
|
||||
write_sequnlock(&foo_seqlock);
|
||||
|
||||
Read path, three categories:
|
||||
|
||||
1. Normal sequence readers, which never block a writer but must
retry if a writer is in progress, detected by a change in the
sequence number. Writers do not wait for a sequence reader::
|
||||
|
||||
do {
|
||||
seq = read_seqbegin(&foo_seqlock);
|
||||
|
||||
/* ... [[read-side critical section]] ... */
|
||||
|
||||
} while (read_seqretry(&foo_seqlock, seq));
|
||||
|
||||
2. Locking readers which will wait if a writer or another locking reader
|
||||
is in progress. A locking reader in progress will also block a writer
|
||||
from entering its critical section. This read lock is
|
||||
exclusive. Unlike rwlock_t, only one locking reader can acquire it::
|
||||
|
||||
read_seqlock_excl(&foo_seqlock);
|
||||
|
||||
/* ... [[read-side critical section]] ... */
|
||||
|
||||
read_sequnlock_excl(&foo_seqlock);
|
||||
|
||||
3. Conditional lockless reader (as in 1), or locking reader (as in 2),
|
||||
according to a passed marker. This is used to avoid lockless readers
|
||||
starvation (too many retry loops) in case of a sharp spike in write
|
||||
activity. First, a lockless read is tried (even marker passed). If
|
||||
that trial fails (odd sequence counter is returned, which is used as
|
||||
the next iteration marker), the lockless read is transformed to a
|
||||
full locking read and no retry loop is necessary::
|
||||
|
||||
/* marker; even initialization */
|
||||
int seq = 0;
|
||||
do {
|
||||
read_seqbegin_or_lock(&foo_seqlock, &seq);
|
||||
|
||||
/* ... [[read-side critical section]] ... */
|
||||
|
||||
} while (need_seqretry(&foo_seqlock, seq));
|
||||
done_seqretry(&foo_seqlock, seq);
|
||||
|
||||
|
||||
API documentation
|
||||
=================
|
||||
|
||||
.. kernel-doc:: include/linux/seqlock.h
|
@ -553,12 +553,12 @@ There are certain things that the Linux kernel memory barriers do not guarantee:
|
||||
DATA DEPENDENCY BARRIERS (HISTORICAL)
|
||||
-------------------------------------
|
||||
|
||||
As of v4.15 of the Linux kernel, an smp_read_barrier_depends() was
|
||||
added to READ_ONCE(), which means that about the only people who
|
||||
need to pay attention to this section are those working on DEC Alpha
|
||||
architecture-specific code and those working on READ_ONCE() itself.
|
||||
For those who need it, and for those who are interested in the history,
|
||||
here is the story of data-dependency barriers.
|
||||
As of v4.15 of the Linux kernel, an smp_mb() was added to READ_ONCE() for
|
||||
DEC Alpha, which means that about the only people who need to pay attention
|
||||
to this section are those working on DEC Alpha architecture-specific code
|
||||
and those working on READ_ONCE() itself. For those who need it, and for
|
||||
those who are interested in the history, here is the story of
|
||||
data-dependency barriers.
|
||||
|
||||
The usage requirements of data dependency barriers are a little subtle, and
|
||||
it's not always obvious that they're needed. To illustrate, consider the
|
||||
@ -2708,144 +2708,6 @@ the properties of the memory window through which devices are accessed and/or
|
||||
the use of any special device communication instructions the CPU may have.
|
||||
|
||||
|
||||
CACHE COHERENCY
|
||||
---------------
|
||||
|
||||
Life isn't quite as simple as it may appear above, however: for while the
|
||||
caches are expected to be coherent, there's no guarantee that that coherency
|
||||
will be ordered. This means that while changes made on one CPU will
|
||||
eventually become visible on all CPUs, there's no guarantee that they will
|
||||
become apparent in the same order on those other CPUs.
|
||||
|
||||
|
||||
Consider dealing with a system that has a pair of CPUs (1 & 2), each of which
|
||||
has a pair of parallel data caches (CPU 1 has A/B, and CPU 2 has C/D):
|
||||
|
||||
:
|
||||
: +--------+
|
||||
: +---------+ | |
|
||||
+--------+ : +--->| Cache A |<------->| |
|
||||
| | : | +---------+ | |
|
||||
| CPU 1 |<---+ | |
|
||||
| | : | +---------+ | |
|
||||
+--------+ : +--->| Cache B |<------->| |
|
||||
: +---------+ | |
|
||||
: | Memory |
|
||||
: +---------+ | System |
|
||||
+--------+ : +--->| Cache C |<------->| |
|
||||
| | : | +---------+ | |
|
||||
| CPU 2 |<---+ | |
|
||||
| | : | +---------+ | |
|
||||
+--------+ : +--->| Cache D |<------->| |
|
||||
: +---------+ | |
|
||||
: +--------+
|
||||
:
|
||||
|
||||
Imagine the system has the following properties:
|
||||
|
||||
(*) an odd-numbered cache line may be in cache A, cache C or it may still be
|
||||
resident in memory;
|
||||
|
||||
(*) an even-numbered cache line may be in cache B, cache D or it may still be
|
||||
resident in memory;
|
||||
|
||||
(*) while the CPU core is interrogating one cache, the other cache may be
|
||||
making use of the bus to access the rest of the system - perhaps to
|
||||
displace a dirty cacheline or to do a speculative load;
|
||||
|
||||
(*) each cache has a queue of operations that need to be applied to that cache
|
||||
to maintain coherency with the rest of the system;
|
||||
|
||||
(*) the coherency queue is not flushed by normal loads to lines already
|
||||
present in the cache, even though the contents of the queue may
|
||||
potentially affect those loads.
|
||||
|
||||
Imagine, then, that two writes are made on the first CPU, with a write barrier
|
||||
between them to guarantee that they will appear to reach that CPU's caches in
|
||||
the requisite order:
|
||||
|
||||
CPU 1 CPU 2 COMMENT
|
||||
=============== =============== =======================================
|
||||
u == 0, v == 1 and p == &u, q == &u
|
||||
v = 2;
|
||||
smp_wmb(); Make sure change to v is visible before
|
||||
change to p
|
||||
<A:modify v=2> v is now in cache A exclusively
|
||||
p = &v;
|
||||
<B:modify p=&v> p is now in cache B exclusively
|
||||
|
||||
The write memory barrier forces the other CPUs in the system to perceive that
|
||||
the local CPU's caches have apparently been updated in the correct order. But
|
||||
now imagine that the second CPU wants to read those values:
|
||||
|
||||
CPU 1 CPU 2 COMMENT
|
||||
=============== =============== =======================================
|
||||
...
|
||||
q = p;
|
||||
x = *q;
|
||||
|
||||
The above pair of reads may then fail to happen in the expected order, as the
|
||||
cacheline holding p may get updated in one of the second CPU's caches while
|
||||
the update to the cacheline holding v is delayed in the other of the second
|
||||
CPU's caches by some other cache event:
|
||||
|
||||
CPU 1 CPU 2 COMMENT
|
||||
=============== =============== =======================================
|
||||
u == 0, v == 1 and p == &u, q == &u
|
||||
v = 2;
|
||||
smp_wmb();
|
||||
<A:modify v=2> <C:busy>
|
||||
<C:queue v=2>
|
||||
p = &v; q = p;
|
||||
<D:request p>
|
||||
<B:modify p=&v> <D:commit p=&v>
|
||||
<D:read p>
|
||||
x = *q;
|
||||
<C:read *q> Reads from v before v updated in cache
|
||||
<C:unbusy>
|
||||
<C:commit v=2>
|
||||
|
||||
Basically, while both cachelines will be updated on CPU 2 eventually, there's
|
||||
no guarantee that, without intervention, the order of update will be the same
|
||||
as that committed on CPU 1.
|
||||
|
||||
|
||||
To intervene, we need to interpolate a data dependency barrier or a read
|
||||
barrier between the loads (which as of v4.15 is supplied unconditionally
|
||||
by the READ_ONCE() macro). This will force the cache to commit its
|
||||
coherency queue before processing any further requests:
|
||||
|
||||
CPU 1 CPU 2 COMMENT
|
||||
=============== =============== =======================================
|
||||
u == 0, v == 1 and p == &u, q == &u
|
||||
v = 2;
|
||||
smp_wmb();
|
||||
<A:modify v=2> <C:busy>
|
||||
<C:queue v=2>
|
||||
p = &v; q = p;
|
||||
<D:request p>
|
||||
<B:modify p=&v> <D:commit p=&v>
|
||||
<D:read p>
|
||||
smp_read_barrier_depends()
|
||||
<C:unbusy>
|
||||
<C:commit v=2>
|
||||
x = *q;
|
||||
<C:read *q> Reads from v after v updated in cache
|
||||
|
||||
|
||||
This sort of problem can be encountered on DEC Alpha processors as they have a
|
||||
split cache that improves performance by making better use of the data bus.
|
||||
While most CPUs do imply a data dependency barrier on the read when a memory
|
||||
access depends on a read, not all do, so it may not be relied on.
|
||||
|
||||
Other CPUs may also have split caches, but must coordinate between the various
|
||||
cachelets for normal memory accesses. The semantics of the Alpha removes the
|
||||
need for hardware coordination in the absence of memory barriers, which
|
||||
permitted Alpha to sport higher CPU clock rates back in the day. However,
|
||||
please note that (again, as of v4.15) smp_read_barrier_depends() should not
|
||||
be used except in Alpha arch-specific code and within the READ_ONCE() macro.
|
||||
|
||||
|
||||
CACHE COHERENCY VS DMA
|
||||
----------------------
|
||||
|
||||
@ -3009,10 +2871,8 @@ caches with the memory coherence system, thus making it seem like pointer
|
||||
changes vs new data occur in the right order.
|
||||
|
||||
The Alpha defines the Linux kernel's memory model, although as of v4.15
|
||||
the Linux kernel's addition of smp_read_barrier_depends() to READ_ONCE()
|
||||
greatly reduced Alpha's impact on the memory model.
|
||||
|
||||
See the subsection on "Cache Coherency" above.
|
||||
the Linux kernel's addition of smp_mb() to READ_ONCE() on Alpha greatly
|
||||
reduced its impact on the memory model.
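
As a purely illustrative recap (with hypothetical variable names), the usual
pointer-publication pattern therefore needs nothing stronger than READ_ONCE()
on the reader side, even on Alpha:

	CPU 1			CPU 2
	===============		===============
	WRITE_ONCE(x, 1);
	smp_wmb();
	WRITE_ONCE(p, &x);
				q = READ_ONCE(p);
				d = READ_ONCE(*q);

Here READ_ONCE() supplies the ordering that smp_read_barrier_depends() used
to provide, so if q observes &x, then d is guaranteed to observe 1.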
|
||||
|
||||
|
||||
VIRTUAL MACHINE GUESTS
|
||||
|
@ -67,7 +67,7 @@ corresponding component. The debugfs normally should be mounted to
|
||||
The content of the directories are files which represent different views
|
||||
to the debug log. Each component can decide which views should be
|
||||
used through registering them with the function :c:func:`debug_register_view()`.
|
||||
Predefined views for hex/ascii, sprintf and raw binary data are provided.
|
||||
Predefined views for hex/ascii and sprintf data are provided.
|
||||
It is also possible to define other views. The content of
|
||||
a view can be inspected simply by reading the corresponding debugfs file.
|
||||
|
||||
@ -119,8 +119,6 @@ Predefined views:
|
||||
|
||||
extern struct debug_view debug_hex_ascii_view;
|
||||
|
||||
extern struct debug_view debug_raw_view;
|
||||
|
||||
extern struct debug_view debug_sprintf_view;
|
||||
|
||||
Examples
|
||||
@ -129,7 +127,7 @@ Examples
|
||||
.. code-block:: c
|
||||
|
||||
/*
|
||||
* hex_ascii- + raw-view Example
|
||||
* hex_ascii-view Example
|
||||
*/
|
||||
|
||||
#include <linux/init.h>
|
||||
@ -143,7 +141,6 @@ Examples
|
||||
|
||||
debug_info = debug_register("test", 1, 4, 4 );
|
||||
debug_register_view(debug_info, &debug_hex_ascii_view);
|
||||
debug_register_view(debug_info, &debug_raw_view);
|
||||
|
||||
debug_text_event(debug_info, 4 , "one ");
|
||||
debug_int_exception(debug_info, 4, 4711);
|
||||
@ -201,7 +198,7 @@ debugfs-files:
|
||||
Example::
|
||||
|
||||
> ls /sys/kernel/debug/s390dbf/dasd
|
||||
flush hex_ascii level pages raw
|
||||
flush hex_ascii level pages
|
||||
> cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s
|
||||
00 00974733272:680099 2 - 02 0006ad7e 07 ea 4a 90 | ....
|
||||
00 00974733272:682210 2 - 02 0006ade6 46 52 45 45 | FREE
|
||||
@ -298,10 +295,9 @@ order to see the debug entries well formatted.
|
||||
Predefined Views
|
||||
----------------
|
||||
|
||||
There are three predefined views: hex_ascii, raw and sprintf.
|
||||
There are two predefined views: hex_ascii and sprintf.
|
||||
The hex_ascii view shows the data field in hex and ascii representation
|
||||
(e.g. ``45 43 4b 44 | ECKD``).
|
||||
The raw view returns a bytestream as the debug areas are stored in memory.
|
||||
|
||||
The sprintf view formats the debug entries in the same way as the sprintf
|
||||
function would do. The sprintf event/exception functions write to the
|
||||
@ -334,11 +330,6 @@ The format of the hex_ascii and sprintf view is as follows:
|
||||
- Return Address to caller
|
||||
- data field
|
||||
|
||||
The format of the raw view is:
|
||||
|
||||
- Header as described in debug.h
|
||||
- datafield
|
||||
|
||||
A typical line of the hex_ascii view will look like the following (first line
|
||||
is only for explanation and will not be displayed when 'cating' the view)::
|
||||
|
||||
|
@ -577,7 +577,7 @@ ACQUIRE 는 해당 오퍼레이션의 로드 부분에만 적용되고 RELEASE
|
||||
데이터 의존성 배리어 (역사적)
|
||||
-----------------------------
|
||||
|
||||
리눅스 커널 v4.15 기준으로, smp_read_barrier_depends() 가 READ_ONCE() 에
|
||||
리눅스 커널 v4.15 기준으로, smp_mb() 가 DEC Alpha 용 READ_ONCE() 코드에
|
||||
추가되었는데, 이는 이 섹션에 주의를 기울여야 하는 사람들은 DEC Alpha 아키텍쳐
|
||||
전용 코드를 만드는 사람들과 READ_ONCE() 자체를 만드는 사람들 뿐임을 의미합니다.
|
||||
그런 분들을 위해, 그리고 역사에 관심 있는 분들을 위해, 여기 데이터 의존성
|
||||
@ -2664,144 +2664,6 @@ CPU 코어는 프로그램의 인과성이 유지된다고만 여겨진다면
|
||||
수도 있습니다.
|
||||
|
||||
|
||||
캐시 일관성
|
||||
-----------
|
||||
|
||||
하지만 삶은 앞에서 이야기한 것처럼 단순하지 않습니다: 캐시들은 일관적일 것으로
|
||||
기대되지만, 그 일관성이 순서에도 적용될 거라는 보장은 없습니다. 한 CPU 에서
|
||||
만들어진 변경 사항은 최종적으로는 시스템의 모든 CPU 에게 보여지게 되지만, 다른
|
||||
CPU 들에게도 같은 순서로 보이게 될 거라는 보장은 없다는 뜻입니다.
|
||||
|
||||
|
||||
두개의 CPU (1 & 2) 가 달려 있고, 각 CPU 에 두개의 데이터 캐시(CPU 1 은 A/B 를,
|
||||
CPU 2 는 C/D 를 갖습니다)가 병렬로 연결되어 있는 시스템을 다룬다고 생각해
|
||||
봅시다:
|
||||
|
||||
:
|
||||
: +--------+
|
||||
: +---------+ | |
|
||||
+--------+ : +--->| Cache A |<------->| |
|
||||
| | : | +---------+ | |
|
||||
| CPU 1 |<---+ | |
|
||||
| | : | +---------+ | |
|
||||
+--------+ : +--->| Cache B |<------->| |
|
||||
: +---------+ | |
|
||||
: | Memory |
|
||||
: +---------+ | System |
|
||||
+--------+ : +--->| Cache C |<------->| |
|
||||
| | : | +---------+ | |
|
||||
| CPU 2 |<---+ | |
|
||||
| | : | +---------+ | |
|
||||
+--------+ : +--->| Cache D |<------->| |
|
||||
: +---------+ | |
|
||||
: +--------+
|
||||
:
|
||||
|
||||
이 시스템이 다음과 같은 특성을 갖는다 생각해 봅시다:
|
||||
|
||||
(*) 홀수번 캐시라인은 캐시 A, 캐시 C 또는 메모리에 위치할 수 있음;
|
||||
|
||||
(*) 짝수번 캐시라인은 캐시 B, 캐시 D 또는 메모리에 위치할 수 있음;
|
||||
|
||||
(*) CPU 코어가 한개의 캐시에 접근하는 동안, 다른 캐시는 - 더티 캐시라인을
|
||||
메모리에 내리거나 추측성 로드를 하거나 하기 위해 - 시스템의 다른 부분에
|
||||
액세스 하기 위해 버스를 사용할 수 있음;
|
||||
|
||||
(*) 각 캐시는 시스템의 나머지 부분들과 일관성을 맞추기 위해 해당 캐시에
|
||||
적용되어야 할 오퍼레이션들의 큐를 가짐;
|
||||
|
||||
(*) 이 일관성 큐는 캐시에 이미 존재하는 라인에 가해지는 평범한 로드에 의해서는
|
||||
비워지지 않는데, 큐의 오퍼레이션들이 이 로드의 결과에 영향을 끼칠 수 있다
|
||||
할지라도 그러함.
|
||||
|
||||
이제, 첫번째 CPU 에서 두개의 쓰기 오퍼레이션을 만드는데, 해당 CPU 의 캐시에
|
||||
요청된 순서로 오퍼레이션이 도달됨을 보장하기 위해 두 오퍼레이션 사이에 쓰기
|
||||
배리어를 사용하는 상황을 상상해 봅시다:
|
||||
|
||||
CPU 1 CPU 2 COMMENT
|
||||
=============== =============== =======================================
|
||||
u == 0, v == 1 and p == &u, q == &u
|
||||
v = 2;
|
||||
smp_wmb(); v 의 변경이 p 의 변경 전에 보일 것을
|
||||
분명히 함
|
||||
<A:modify v=2> v 는 이제 캐시 A 에 독점적으로 존재함
|
||||
p = &v;
|
||||
<B:modify p=&v> p 는 이제 캐시 B 에 독점적으로 존재함
|
||||
|
||||
여기서의 쓰기 메모리 배리어는 CPU 1 의 캐시가 올바른 순서로 업데이트 된 것으로
|
||||
시스템의 다른 CPU 들이 인지하게 만듭니다. 하지만, 이제 두번째 CPU 가 그 값들을
|
||||
읽으려 하는 상황을 생각해 봅시다:
|
||||
|
||||
CPU 1 CPU 2 COMMENT
|
||||
=============== =============== =======================================
|
||||
...
|
||||
q = p;
|
||||
x = *q;
|
||||
|
||||
위의 두개의 읽기 오퍼레이션은 예상된 순서로 일어나지 못할 수 있는데, 두번째 CPU
|
||||
의 한 캐시에 다른 캐시 이벤트가 발생해 v 를 담고 있는 캐시라인의 해당 캐시에의
|
||||
업데이트가 지연되는 사이, p 를 담고 있는 캐시라인은 두번째 CPU 의 다른 캐시에
|
||||
업데이트 되어버렸을 수 있기 때문입니다.
|
||||
|
||||
CPU 1 CPU 2 COMMENT
|
||||
=============== =============== =======================================
|
||||
u == 0, v == 1 and p == &u, q == &u
|
||||
v = 2;
|
||||
smp_wmb();
|
||||
<A:modify v=2> <C:busy>
|
||||
<C:queue v=2>
|
||||
p = &v; q = p;
|
||||
<D:request p>
|
||||
<B:modify p=&v> <D:commit p=&v>
|
||||
<D:read p>
|
||||
x = *q;
|
||||
<C:read *q> 캐시에 업데이트 되기 전의 v 를 읽음
|
||||
<C:unbusy>
|
||||
<C:commit v=2>
|
||||
|
||||
기본적으로, 두개의 캐시라인 모두 CPU 2 에 최종적으로는 업데이트 될 것이지만,
|
||||
별도의 개입 없이는, 업데이트의 순서가 CPU 1 에서 만들어진 순서와 동일할
|
||||
것이라는 보장이 없습니다.
|
||||
|
||||
|
||||
여기에 개입하기 위해선, 데이터 의존성 배리어나 읽기 배리어를 로드 오퍼레이션들
|
||||
사이에 넣어야 합니다 (v4.15 부터는 READ_ONCE() 매크로에 의해 무조건적으로
|
||||
그렇게 됩니다). 이렇게 함으로써 캐시가 다음 요청을 처리하기 전에 일관성 큐를
|
||||
처리하도록 강제하게 됩니다.
|
||||
|
||||
CPU 1 CPU 2 COMMENT
|
||||
=============== =============== =======================================
|
||||
u == 0, v == 1 and p == &u, q == &u
|
||||
v = 2;
|
||||
smp_wmb();
|
||||
<A:modify v=2> <C:busy>
|
||||
<C:queue v=2>
|
||||
p = &v; q = p;
|
||||
<D:request p>
|
||||
<B:modify p=&v> <D:commit p=&v>
|
||||
<D:read p>
|
||||
smp_read_barrier_depends()
|
||||
<C:unbusy>
|
||||
<C:commit v=2>
|
||||
x = *q;
|
||||
<C:read *q> 캐시에 업데이트 된 v 를 읽음
|
||||
|
||||
|
||||
이런 부류의 문제는 DEC Alpha 계열 프로세서들에서 발견될 수 있는데, 이들은
|
||||
데이터 버스를 좀 더 잘 사용해 성능을 개선할 수 있는, 분할된 캐시를 가지고 있기
|
||||
때문입니다. 대부분의 CPU 는 하나의 읽기 오퍼레이션의 메모리 액세스가 다른 읽기
|
||||
오퍼레이션에 의존적이라면 데이터 의존성 배리어를 내포시킵니다만, 모두가 그런건
|
||||
아니기 때문에 이점에 의존해선 안됩니다.
|
||||
|
||||
다른 CPU 들도 분할된 캐시를 가지고 있을 수 있지만, 그런 CPU 들은 평범한 메모리
|
||||
액세스를 위해서도 이 분할된 캐시들 사이의 조정을 해야만 합니다. Alpha 는 가장
|
||||
약한 메모리 순서 시맨틱 (semantic) 을 선택함으로써 메모리 배리어가 명시적으로
|
||||
사용되지 않았을 때에는 그런 조정이 필요하지 않게 했으며, 이는 Alpha 가 당시에
|
||||
더 높은 CPU 클락 속도를 가질 수 있게 했습니다. 하지만, (다시 말하건대, v4.15
|
||||
이후부터는) Alpha 아키텍쳐 전용 코드와 READ_ONCE() 매크로 내부에서를 제외하고는
|
||||
smp_read_barrier_depends() 가 사용되지 않아야 함을 알아두시기 바랍니다.
|
||||
|
||||
|
||||
캐시 일관성 VS DMA
|
||||
------------------
|
||||
|
||||
@ -2962,10 +2824,8 @@ Alpha CPU 의 일부 버전은 분할된 데이터 캐시를 가지고 있어서
|
||||
데이터의 발견을 올바른 순서로 일어나게 하기 때문입니다.
|
||||
|
||||
리눅스 커널의 메모리 배리어 모델은 Alpha 에 기초해서 정의되었습니다만, v4.15
|
||||
부터는 리눅스 커널이 READ_ONCE() 내에 smp_read_barrier_depends() 를 추가해서
|
||||
Alpha 의 메모리 모델로의 영향력이 크게 줄어들긴 했습니다.
|
||||
|
||||
위의 "캐시 일관성" 서브섹션을 참고하세요.
|
||||
부터는 Alpha 용 READ_ONCE() 코드 내에 smp_mb() 가 추가되어서 메모리 모델로의
|
||||
Alpha 의 영향력이 크게 줄어들었습니다.
|
||||
|
||||
|
||||
가상 머신 게스트
|
||||
|
23
MAINTAINERS
@ -9988,6 +9988,7 @@ M: Luc Maranget <luc.maranget@inria.fr>
|
||||
M: "Paul E. McKenney" <paulmck@kernel.org>
|
||||
R: Akira Yokosawa <akiyks@gmail.com>
|
||||
R: Daniel Lustig <dlustig@nvidia.com>
|
||||
R: Joel Fernandes <joel@joelfernandes.org>
|
||||
L: linux-kernel@vger.kernel.org
|
||||
L: linux-arch@vger.kernel.org
|
||||
S: Supported
|
||||
@ -9996,6 +9997,7 @@ F: Documentation/atomic_bitops.txt
|
||||
F: Documentation/atomic_t.txt
|
||||
F: Documentation/core-api/atomic_ops.rst
|
||||
F: Documentation/core-api/refcount-vs-atomic.rst
|
||||
F: Documentation/litmus-tests/
|
||||
F: Documentation/memory-barriers.txt
|
||||
F: tools/memory-model/
|
||||
|
||||
@ -13614,16 +13616,6 @@ F: drivers/block/pktcdvd.c
|
||||
F: include/linux/pktcdvd.h
|
||||
F: include/uapi/linux/pktcdvd.h
|
||||
|
||||
PKUNITY SOC DRIVERS
|
||||
M: Guan Xuetao <gxt@pku.edu.cn>
|
||||
S: Maintained
|
||||
W: http://mprc.pku.edu.cn/~guanxuetao/linux
|
||||
T: git git://github.com/gxt/linux.git
|
||||
F: drivers/i2c/busses/i2c-puv3.c
|
||||
F: drivers/input/serio/i8042-unicore32io.h
|
||||
F: drivers/rtc/rtc-puv3.c
|
||||
F: drivers/video/fbdev/fb-puv3.c
|
||||
|
||||
PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER
|
||||
M: Tomasz Duszynski <tduszyns@gmail.com>
|
||||
S: Maintained
|
||||
@ -14472,7 +14464,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev
|
||||
F: Documentation/RCU/
|
||||
F: include/linux/rcu*
|
||||
F: kernel/rcu/
|
||||
X: Documentation/RCU/torture.txt
|
||||
X: Documentation/RCU/torture.rst
|
||||
X: include/linux/srcu*.h
|
||||
X: kernel/rcu/srcu*.c
|
||||
|
||||
@ -17324,7 +17316,7 @@ M: Josh Triplett <josh@joshtriplett.org>
|
||||
L: linux-kernel@vger.kernel.org
|
||||
S: Supported
|
||||
T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev
|
||||
F: Documentation/RCU/torture.txt
|
||||
F: Documentation/RCU/torture.rst
|
||||
F: kernel/locking/locktorture.c
|
||||
F: kernel/rcu/rcuperf.c
|
||||
F: kernel/rcu/rcutorture.c
|
||||
@ -17568,13 +17560,6 @@ L: linux-fsdevel@vger.kernel.org
|
||||
S: Supported
|
||||
F: fs/unicode/
|
||||
|
||||
UNICORE32 ARCHITECTURE
|
||||
M: Guan Xuetao <gxt@pku.edu.cn>
|
||||
S: Maintained
|
||||
W: http://mprc.pku.edu.cn/~guanxuetao/linux
|
||||
T: git git://github.com/gxt/linux.git
|
||||
F: arch/unicore32/
|
||||
|
||||
UNIFDEF
|
||||
M: Tony Finch <dot@dotat.at>
|
||||
S: Maintained
|
||||
|
@ -16,15 +16,14 @@
|
||||
|
||||
/*
|
||||
* To ensure dependency ordering is preserved for the _relaxed and
|
||||
* _release atomics, an smp_read_barrier_depends() is unconditionally
|
||||
* inserted into the _relaxed variants, which are used to build the
|
||||
* barriered versions. Avoid redundant back-to-back fences in the
|
||||
* _acquire and _fence versions.
|
||||
* _release atomics, an smp_mb() is unconditionally inserted into the
|
||||
* _relaxed variants, which are used to build the barriered versions.
|
||||
* Avoid redundant back-to-back fences in the _acquire and _fence
|
||||
* versions.
|
||||
*/
|
||||
#define __atomic_acquire_fence()
|
||||
#define __atomic_post_full_fence()
|
||||
|
||||
#define ATOMIC_INIT(i) { (i) }
|
||||
#define ATOMIC64_INIT(i) { (i) }
|
||||
|
||||
#define atomic_read(v) READ_ONCE((v)->counter)
|
||||
@ -70,7 +69,7 @@ static inline int atomic_##op##_return_relaxed(int i, atomic_t *v) \
|
||||
".previous" \
|
||||
:"=&r" (temp), "=m" (v->counter), "=&r" (result) \
|
||||
:"Ir" (i), "m" (v->counter) : "memory"); \
|
||||
smp_read_barrier_depends(); \
|
||||
smp_mb(); \
|
||||
return result; \
|
||||
}
|
||||
|
||||
@ -88,7 +87,7 @@ static inline int atomic_fetch_##op##_relaxed(int i, atomic_t *v) \
|
||||
".previous" \
|
||||
:"=&r" (temp), "=m" (v->counter), "=&r" (result) \
|
||||
:"Ir" (i), "m" (v->counter) : "memory"); \
|
||||
smp_read_barrier_depends(); \
|
||||
smp_mb(); \
|
||||
return result; \
|
||||
}
|
||||
|
||||
@ -123,7 +122,7 @@ static __inline__ s64 atomic64_##op##_return_relaxed(s64 i, atomic64_t * v) \
|
||||
".previous" \
|
||||
:"=&r" (temp), "=m" (v->counter), "=&r" (result) \
|
||||
:"Ir" (i), "m" (v->counter) : "memory"); \
|
||||
smp_read_barrier_depends(); \
|
||||
smp_mb(); \
|
||||
return result; \
|
||||
}
|
||||
|
||||
@ -141,7 +140,7 @@ static __inline__ s64 atomic64_fetch_##op##_relaxed(s64 i, atomic64_t * v) \
|
||||
".previous" \
|
||||
:"=&r" (temp), "=m" (v->counter), "=&r" (result) \
|
||||
:"Ir" (i), "m" (v->counter) : "memory"); \
|
||||
smp_read_barrier_depends(); \
|
||||
smp_mb(); \
|
||||
return result; \
|
||||
}
|
||||
|
||||
|
@ -2,64 +2,15 @@
|
||||
#ifndef __BARRIER_H
|
||||
#define __BARRIER_H
|
||||
|
||||
#include <asm/compiler.h>
|
||||
|
||||
#define mb() __asm__ __volatile__("mb": : :"memory")
|
||||
#define rmb() __asm__ __volatile__("mb": : :"memory")
|
||||
#define wmb() __asm__ __volatile__("wmb": : :"memory")
|
||||
|
||||
/**
|
||||
* read_barrier_depends - Flush all pending reads that subsequents reads
|
||||
* depend on.
|
||||
*
|
||||
* No data-dependent reads from memory-like regions are ever reordered
|
||||
* over this barrier. All reads preceding this primitive are guaranteed
|
||||
* to access memory (but not necessarily other CPUs' caches) before any
|
||||
* reads following this primitive that depend on the data return by
|
||||
* any of the preceding reads. This primitive is much lighter weight than
|
||||
* rmb() on most CPUs, and is never heavier weight than is
|
||||
* rmb().
|
||||
*
|
||||
* These ordering constraints are respected by both the local CPU
|
||||
* and the compiler.
|
||||
*
|
||||
* Ordering is not guaranteed by anything other than these primitives,
|
||||
* not even by data dependencies. See the documentation for
|
||||
* memory_barrier() for examples and URLs to more information.
|
||||
*
|
||||
* For example, the following code would force ordering (the initial
|
||||
* value of "a" is zero, "b" is one, and "p" is "&a"):
|
||||
*
|
||||
* <programlisting>
|
||||
* CPU 0 CPU 1
|
||||
*
|
||||
* b = 2;
|
||||
* memory_barrier();
|
||||
* p = &b; q = p;
|
||||
* read_barrier_depends();
|
||||
* d = *q;
|
||||
* </programlisting>
|
||||
*
|
||||
* because the read of "*q" depends on the read of "p" and these
|
||||
* two reads are separated by a read_barrier_depends(). However,
|
||||
* the following code, with the same initial values for "a" and "b":
|
||||
*
|
||||
* <programlisting>
|
||||
* CPU 0 CPU 1
|
||||
*
|
||||
* a = 2;
|
||||
* memory_barrier();
|
||||
* b = 3; y = b;
|
||||
* read_barrier_depends();
|
||||
* x = a;
|
||||
* </programlisting>
|
||||
*
|
||||
* does not enforce ordering, since there is no data dependency between
|
||||
* the read of "a" and the read of "b". Therefore, on some CPUs, such
|
||||
* as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
|
||||
* in cases like this where there are no data dependencies.
|
||||
*/
|
||||
#define read_barrier_depends() __asm__ __volatile__("mb": : :"memory")
|
||||
#define __smp_load_acquire(p) \
|
||||
({ \
|
||||
compiletime_assert_atomic_type(*p); \
|
||||
__READ_ONCE(*p); \
|
||||
})
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
#define __ASM_SMP_MB "\tmb\n"
|
||||
|
@@ -277,9 +277,9 @@ extern inline pte_t pte_mkdirty(pte_t pte)	{ pte_val(pte) |= __DIRTY_BITS; retur
extern inline pte_t pte_mkyoung(pte_t pte)	{ pte_val(pte) |= __ACCESS_BITS; return pte; }

/*
 * The smp_read_barrier_depends() in the following functions are required to
 * order the load of *dir (the pointer in the top level page table) with any
 * subsequent load of the returned pmd_t *ret (ret is data dependent on *dir).
 * The smp_rmb() in the following functions are required to order the load of
 * *dir (the pointer in the top level page table) with any subsequent load of
 * the returned pmd_t *ret (ret is data dependent on *dir).
 *
 * If this ordering is not enforced, the CPU might load an older value of
 * *ret, which may be uninitialized data. See mm/memory.c:__pte_alloc for
@@ -293,7 +293,7 @@ extern inline pte_t pte_mkyoung(pte_t pte)	{ pte_val(pte) |= __ACCESS_BITS; retu
extern inline pmd_t * pmd_offset(pud_t * dir, unsigned long address)
{
	pmd_t *ret = (pmd_t *) pud_page_vaddr(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1));
	smp_read_barrier_depends(); /* see above */
	smp_rmb(); /* see above */
	return ret;
}
#define pmd_offset pmd_offset
@@ -303,7 +303,7 @@ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address)
{
	pte_t *ret = (pte_t *) pmd_page_vaddr(*dir)
		+ ((address >> PAGE_SHIFT) & (PTRS_PER_PAGE - 1));
	smp_read_barrier_depends(); /* see above */
	smp_rmb(); /* see above */
	return ret;
}
#define pte_offset_kernel pte_offset_kernel
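The comment in the hunk above describes the dependent-load hazard that these barriers close: the pointer read from the upper-level table and the later read through that pointer must not be observed out of order. A stand-alone user-space sketch of the same publish/consume pattern, using C11 atomics in place of the kernel primitives (all names below are illustrative, not taken from the kernel):

/*
 * Sketch only: release/acquire stands in for the kernel's publication
 * barrier plus dependency-ordered load (smp_rmb()/READ_ONCE()).
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct page_like { int filled; };

static struct page_like backing;
static _Atomic(struct page_like *) slot;	/* plays the role of *dir */

static void *producer(void *arg)
{
	backing.filled = 42;				/* initialise the "page"... */
	atomic_store_explicit(&slot, &backing,
			      memory_order_release);	/* ...then publish the pointer */
	return NULL;
}

static void *consumer(void *arg)
{
	struct page_like *p;

	/* Wait for the pointer; acquire keeps the dependent load after it. */
	while (!(p = atomic_load_explicit(&slot, memory_order_acquire)))
		;
	printf("saw %d\n", p->filled);			/* always 42, never 0 */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, consumer, NULL);
	pthread_create(&b, NULL, producer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}

Built with cc -pthread, the reader always prints 42; dropping the ordering reintroduces exactly the window the comment warns about, though it is only observable on weakly ordered hardware such as Alpha.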
arch/alpha/include/asm/rwonce.h (new file, 35 lines)
@@ -0,0 +1,35 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2019 Google LLC.
 */
#ifndef __ASM_RWONCE_H
#define __ASM_RWONCE_H

#ifdef CONFIG_SMP

#include <asm/barrier.h>

/*
 * Alpha is apparently daft enough to reorder address-dependent loads
 * on some CPU implementations. Knock some common sense into it with
 * a memory barrier in READ_ONCE().
 *
 * For the curious, more information about this unusual reordering is
 * available in chapter 15 of the "perfbook":
 *
 *  https://kernel.org/pub/linux/kernel/people/paulmck/perfbook/perfbook.html
 *
 */
#define __READ_ONCE(x) \
({ \
	__unqual_scalar_typeof(x) __x = \
		(*(volatile typeof(__x) *)(&(x))); \
	mb(); \
	(typeof(x))__x; \
})

#endif /* CONFIG_SMP */

#include <asm-generic/rwonce.h>

#endif /* __ASM_RWONCE_H */
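A rough user-space illustration of what this header's __READ_ONCE() arranges: a volatile load of the pointer followed by a barrier, so a later load through that pointer cannot return stale data. The mock below substitutes a compiler barrier for the kernel's mb() and simplifies the typeof machinery; it is purely illustrative, not kernel code.

#include <stdio.h>

#define mock_mb()	__asm__ __volatile__("" ::: "memory")

#define MOCK_READ_ONCE(x) \
({ \
	__typeof__(x) __x = *(volatile __typeof__(x) *)&(x); \
	mock_mb();	/* on Alpha this is a real mb(): later dependent loads stay after it */ \
	__x; \
})

struct item { int val; };

static struct item it = { .val = 7 };
static struct item *gp = &it;

int main(void)
{
	struct item *p = MOCK_READ_ONCE(gp);	/* load the pointer... */
	printf("%d\n", p->val);			/* ...then the data it points to */
	return 0;
}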
@@ -14,8 +14,6 @@
#include <asm/barrier.h>
#include <asm/smp.h>

#define ATOMIC_INIT(i)	{ (i) }

#ifndef CONFIG_ARC_PLAT_EZNPS

#define atomic_read(v)	READ_ONCE((v)->counter)

@@ -15,8 +15,6 @@
#include <asm/barrier.h>
#include <asm/cmpxchg.h>

#define ATOMIC_INIT(i)	{ (i) }

#ifdef __KERNEL__

/*
@@ -5,7 +5,7 @@
#ifndef _ASM_ARM_PERCPU_H_
#define _ASM_ARM_PERCPU_H_

#include <asm/thread_info.h>
register unsigned long current_stack_pointer asm ("sp");

/*
 * Same as asm-generic/percpu.h, except that we store the per cpu offset
@@ -75,11 +75,6 @@ struct thread_info {
	.addr_limit	= KERNEL_DS, \
}

/*
 * how to get the current stack pointer in C
 */
register unsigned long current_stack_pointer asm ("sp");

/*
 * how to get the thread information struct from C
 */

@@ -7,6 +7,7 @@

#ifndef __ASSEMBLY__

#include <asm/barrier.h>
#include <asm/errno.h>
#include <asm/unistd.h>
#include <asm/vdso/cp15.h>
@@ -118,6 +118,7 @@ config ARM64
	select GENERIC_STRNLEN_USER
	select GENERIC_TIME_VSYSCALL
	select GENERIC_GETTIMEOFDAY
	select GENERIC_VDSO_TIME_NS
	select HANDLE_DOMAIN_IRQ
	select HARDIRQS_SW_RESEND
	select HAVE_PCI
@@ -1327,6 +1328,8 @@ config SWP_EMULATION
	  ARMv8 obsoletes the use of A32 SWP/SWPB instructions such that
	  they are always undefined. Say Y here to enable software
	  emulation of these instructions for userspace using LDXR/STXR.
	  This feature can be controlled at runtime with the abi.swp
	  sysctl which is disabled by default.

	  In some older versions of glibc [<=2.8] SWP is used during futex
	  trylock() operations with the assumption that the code will not
@@ -1353,7 +1356,8 @@ config CP15_BARRIER_EMULATION
	  Say Y here to enable software emulation of these
	  instructions for AArch32 userspace code. When this option is
	  enabled, CP15 barrier usage is traced which can help
	  identify software that needs updating.
	  identify software that needs updating. This feature can be
	  controlled at runtime with the abi.cp15_barrier sysctl.

	  If unsure, say Y

@@ -1364,7 +1368,8 @@ config SETEND_EMULATION
	  AArch32 EL0, and is deprecated in ARMv8.

	  Say Y here to enable software emulation of the instruction
	  for AArch32 userspace code.
	  for AArch32 userspace code. This feature can be controlled
	  at runtime with the abi.setend sysctl.

	  Note: All the cpus on the system must have mixed endian support at EL0
	  for this feature to be enabled. If a new CPU - which doesn't support mixed
@@ -1596,6 +1601,20 @@ config ARM64_AMU_EXTN
	  correctly reflect reality. Most commonly, the value read will be 0,
	  indicating that the counter is not enabled.

config AS_HAS_ARMV8_4
	def_bool $(cc-option,-Wa$(comma)-march=armv8.4-a)

config ARM64_TLB_RANGE
	bool "Enable support for tlbi range feature"
	default y
	depends on AS_HAS_ARMV8_4
	help
	  ARMv8.4-TLBI provides TLBI invalidation instructions that apply to a
	  range of input addresses.

	  The feature introduces new assembly instructions, and they are
	  supported when binutils >= 2.30.

endmenu

menu "ARMv8.5 architectural features"
@@ -82,11 +82,18 @@ endif
# compiler to generate them and consequently to break the single image contract
# we pass it only to the assembler. This option is utilized only in case of non
# integrated assemblers.
ifneq ($(CONFIG_AS_HAS_ARMV8_4), y)
branch-prot-flags-$(CONFIG_AS_HAS_PAC) += -Wa,-march=armv8.3-a
endif
endif

KBUILD_CFLAGS += $(branch-prot-flags-y)

ifeq ($(CONFIG_AS_HAS_ARMV8_4), y)
# make sure to pass the newest target architecture to -march.
KBUILD_CFLAGS	+= -Wa,-march=armv8.4-a
endif

ifeq ($(CONFIG_SHADOW_CALL_STACK), y)
KBUILD_CFLAGS	+= -ffixed-x18
endif

@@ -68,6 +68,7 @@ CONFIG_SCHED_SMT=y
CONFIG_NUMA=y
CONFIG_SECCOMP=y
CONFIG_KEXEC=y
CONFIG_KEXEC_FILE=y
CONFIG_CRASH_DUMP=y
CONFIG_XEN=y
CONFIG_COMPAT=y
@@ -47,20 +47,7 @@
pgprot_t __acpi_get_mem_attribute(phys_addr_t addr);

/* ACPI table mapping after acpi_permanent_mmap is set */
static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys,
					    acpi_size size)
{
	/* For normal memory we already have a cacheable mapping. */
	if (memblock_is_map_memory(phys))
		return (void __iomem *)__phys_to_virt(phys);

	/*
	 * We should still honor the memory's attribute here because
	 * crash dump kernel possibly excludes some ACPI (reclaim)
	 * regions from memblock list.
	 */
	return __ioremap(phys, size, __acpi_get_mem_attribute(phys));
}
void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size);
#define acpi_os_ioremap acpi_os_ioremap

typedef u64 phys_cpuid_t;
@@ -99,8 +99,6 @@ static inline long arch_atomic64_dec_if_positive(atomic64_t *v)
	return __lse_ll_sc_body(atomic64_dec_if_positive, v);
}

#define ATOMIC_INIT(i)	{ (i) }

#define arch_atomic_read(v)	__READ_ONCE((v)->counter)
#define arch_atomic_set(v, i)	__WRITE_ONCE(((v)->counter), (i))

@@ -62,7 +62,9 @@
#define ARM64_HAS_GENERIC_AUTH		52
#define ARM64_HAS_32BIT_EL1		53
#define ARM64_BTI			54
#define ARM64_HAS_ARMv8_4_TTL		55
#define ARM64_HAS_TLB_RANGE		56

#define ARM64_NCAPS			55
#define ARM64_NCAPS			57

#endif /* __ASM_CPUCAPS_H */
@@ -692,6 +692,12 @@ static inline bool system_supports_bti(void)
	return IS_ENABLED(CONFIG_ARM64_BTI) && cpus_have_const_cap(ARM64_BTI);
}

static inline bool system_supports_tlb_range(void)
{
	return IS_ENABLED(CONFIG_ARM64_TLB_RANGE) &&
		cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
}

#define ARM64_BP_HARDEN_UNKNOWN		-1
#define ARM64_BP_HARDEN_WA_NEEDED	0
#define ARM64_BP_HARDEN_NOT_REQUIRED	1
@@ -774,6 +780,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1)
}

u32 get_kvm_ipa_limit(void);
void dump_cpu_features(void);

#endif /* __ASSEMBLY__ */

@@ -49,6 +49,8 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte, unsigned long sz);
#define set_huge_swap_pte_at set_huge_swap_pte_at

void __init arm64_hugetlb_cma_reserve(void);

#include <asm-generic/hugetlb.h>

#endif /* __ASM_HUGETLB_H */
@@ -95,6 +95,7 @@
#define KERNEL_HWCAP_DGH	__khwcap2_feature(DGH)
#define KERNEL_HWCAP_RNG	__khwcap2_feature(RNG)
#define KERNEL_HWCAP_BTI	__khwcap2_feature(BTI)
/* reserved for KERNEL_HWCAP_MTE	__khwcap2_feature(MTE) */

/*
 * This yields a mask that user programs can use to figure out what

@@ -8,7 +8,7 @@
#ifndef __ASM_KERNEL_PGTABLE_H
#define __ASM_KERNEL_PGTABLE_H

#include <linux/pgtable.h>
#include <asm/pgtable-hwdef.h>
#include <asm/sparsemem.h>

/*
@@ -10,11 +10,8 @@
#ifndef __ASM_MEMORY_H
#define __ASM_MEMORY_H

#include <linux/compiler.h>
#include <linux/const.h>
#include <linux/sizes.h>
#include <linux/types.h>
#include <asm/bug.h>
#include <asm/page-def.h>

/*
@@ -157,11 +154,15 @@
#endif

#ifndef __ASSEMBLY__
extern u64			vabits_actual;
#define PAGE_END		(_PAGE_END(vabits_actual))

#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/mmdebug.h>
#include <linux/types.h>
#include <asm/bug.h>

extern u64			vabits_actual;
#define PAGE_END		(_PAGE_END(vabits_actual))

extern s64			physvirt_offset;
extern s64			memstart_addr;
@@ -322,6 +323,7 @@ static inline void *phys_to_virt(phys_addr_t x)
	__is_lm_address(__addr) && pfn_valid(virt_to_pfn(__addr)); \
})

void dump_mem_limit(void);
#endif /* !ASSEMBLY */

/*
@@ -175,7 +175,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
 * take CPU migration into account.
 */
#define destroy_context(mm)		do { } while(0)
void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
void check_and_switch_context(struct mm_struct *mm);

#define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.id, 0); 0; })

@@ -214,8 +214,6 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)

static inline void __switch_mm(struct mm_struct *next)
{
	unsigned int cpu = smp_processor_id();

	/*
	 * init_mm.pgd does not contain any user mappings and it is always
	 * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
@@ -225,7 +223,7 @@ static inline void __switch_mm(struct mm_struct *next)
		return;
	}

	check_and_switch_context(next, cpu);
	check_and_switch_context(next);
}

static inline void
@@ -72,6 +72,13 @@
#define ARMV8_PMUV3_PERFCTR_LL_CACHE_RD		0x36
#define ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD	0x37
#define ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD	0x38
#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD	0x39
#define ARMV8_PMUV3_PERFCTR_OP_RETIRED		0x3A
#define ARMV8_PMUV3_PERFCTR_OP_SPEC		0x3B
#define ARMV8_PMUV3_PERFCTR_STALL		0x3C
#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND	0x3D
#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND	0x3E
#define ARMV8_PMUV3_PERFCTR_STALL_SLOT		0x3F

/* Statistical profiling extension microarchitectural events */
#define ARMV8_SPE_PERFCTR_SAMPLE_POP		0x4000
@@ -79,6 +86,26 @@
#define ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE	0x4002
#define ARMV8_SPE_PERFCTR_SAMPLE_COLLISION	0x4003

/* AMUv1 architecture events */
#define ARMV8_AMU_PERFCTR_CNT_CYCLES		0x4004
#define ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM	0x4005

/* long-latency read miss events */
#define ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS	0x4006
#define ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD	0x4009
#define ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS	0x400A
#define ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD	0x400B

/* additional latency from alignment events */
#define ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT	0x4020
#define ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT	0x4021
#define ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT	0x4022

/* Armv8.5 Memory Tagging Extension events */
#define ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED	0x4024
#define ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD	0x4025
#define ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR	0x4026

/* ARMv8 recommended implementation defined event types */
#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD	0x40
#define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR	0x41
@@ -29,7 +29,7 @@
 * Size mapped by an entry at level n ( 0 <= n <= 3)
 * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits
 * in the final page. The maximum number of translation levels supported by
 * the architecture is 4. Hence, starting at at level n, we have further
 * the architecture is 4. Hence, starting at level n, we have further
 * ((4 - n) - 1) levels of translation excluding the offset within the page.
 * So, the total number of bits mapped by an entry at level n is :
 *
@@ -82,23 +82,23 @@
 * Contiguous page definitions.
 */
#ifdef CONFIG_ARM64_64K_PAGES
#define CONT_PTE_SHIFT		5
#define CONT_PMD_SHIFT		5
#define CONT_PTE_SHIFT		(5 + PAGE_SHIFT)
#define CONT_PMD_SHIFT		(5 + PMD_SHIFT)
#elif defined(CONFIG_ARM64_16K_PAGES)
#define CONT_PTE_SHIFT		7
#define CONT_PMD_SHIFT		5
#define CONT_PTE_SHIFT		(7 + PAGE_SHIFT)
#define CONT_PMD_SHIFT		(5 + PMD_SHIFT)
#else
#define CONT_PTE_SHIFT		4
#define CONT_PMD_SHIFT		4
#define CONT_PTE_SHIFT		(4 + PAGE_SHIFT)
#define CONT_PMD_SHIFT		(4 + PMD_SHIFT)
#endif

#define CONT_PTES		(1 << CONT_PTE_SHIFT)
#define CONT_PTES		(1 << (CONT_PTE_SHIFT - PAGE_SHIFT))
#define CONT_PTE_SIZE		(CONT_PTES * PAGE_SIZE)
#define CONT_PTE_MASK		(~(CONT_PTE_SIZE - 1))
#define CONT_PMDS		(1 << CONT_PMD_SHIFT)
#define CONT_PMDS		(1 << (CONT_PMD_SHIFT - PMD_SHIFT))
#define CONT_PMD_SIZE		(CONT_PMDS * PMD_SIZE)
#define CONT_PMD_MASK		(~(CONT_PMD_SIZE - 1))
/* the the numerical offset of the PTE within a range of CONT_PTES */
/* the numerical offset of the PTE within a range of CONT_PTES */
#define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1))

/*
@@ -178,10 +178,12 @@
#define PTE_S2_RDONLY	(_AT(pteval_t, 1) << 6)   /* HAP[2:1] */
#define PTE_S2_RDWR	(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
#define PTE_S2_XN	(_AT(pteval_t, 2) << 53)  /* XN[1:0] */
#define PTE_S2_SW_RESVD	(_AT(pteval_t, 15) << 55) /* Reserved for SW */

#define PMD_S2_RDONLY	(_AT(pmdval_t, 1) << 6)   /* HAP[2:1] */
#define PMD_S2_RDWR	(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
#define PMD_S2_XN	(_AT(pmdval_t, 2) << 53)  /* XN[1:0] */
#define PMD_S2_SW_RESVD	(_AT(pmdval_t, 15) << 55) /* Reserved for SW */

#define PUD_S2_RDONLY	(_AT(pudval_t, 1) << 6)   /* HAP[2:1] */
#define PUD_S2_RDWR	(_AT(pudval_t, 3) << 6)   /* HAP[2:1] */
@@ -216,6 +218,7 @@
#define TCR_TxSZ(x)		(TCR_T0SZ(x) | TCR_T1SZ(x))
#define TCR_TxSZ_WIDTH		6
#define TCR_T0SZ_MASK		(((UL(1) << TCR_TxSZ_WIDTH) - 1) << TCR_T0SZ_OFFSET)
#define TCR_T1SZ_MASK		(((UL(1) << TCR_TxSZ_WIDTH) - 1) << TCR_T1SZ_OFFSET)

#define TCR_EPD0_SHIFT		7
#define TCR_EPD0_MASK		(UL(1) << TCR_EPD0_SHIFT)
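The rewritten CONT_* definitions fold the base page/PMD shift into the contiguous shifts, so CONT_PTES and CONT_PMDS now subtract it back out. A quick user-space check of the resulting arithmetic for the 4K-page case (PAGE_SHIFT and PMD_SHIFT values are assumed here for a 4K, 4-level configuration; the 16K/64K cases follow the same pattern):

#include <stdio.h>

#define PAGE_SHIFT	12			/* 4K pages (assumed) */
#define PMD_SHIFT	21			/* 4K pages, 4-level (assumed) */
#define CONT_PTE_SHIFT	(4 + PAGE_SHIFT)	/* from the hunk above */
#define CONT_PMD_SHIFT	(4 + PMD_SHIFT)
#define CONT_PTES	(1 << (CONT_PTE_SHIFT - PAGE_SHIFT))
#define CONT_PMDS	(1 << (CONT_PMD_SHIFT - PMD_SHIFT))

int main(void)
{
	/* 16 contiguous 4K PTEs -> 64 KiB range; 16 contiguous 2M PMDs -> 32 MiB range */
	printf("CONT_PTES=%d (range %d KiB)\n", CONT_PTES,
	       (CONT_PTES << PAGE_SHIFT) >> 10);
	printf("CONT_PMDS=%d (range %d MiB)\n", CONT_PMDS,
	       (CONT_PMDS << PMD_SHIFT) >> 20);
	return 0;
}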
@@ -40,6 +40,16 @@ extern void __pmd_error(const char *file, int line, unsigned long val);
extern void __pud_error(const char *file, int line, unsigned long val);
extern void __pgd_error(const char *file, int line, unsigned long val);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE

/* Set stride and tlb_level in flush_*_tlb_range */
#define flush_pmd_tlb_range(vma, addr, end) \
	__flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2)
#define flush_pud_tlb_range(vma, addr, end) \
	__flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * ZERO_PAGE is a global shared page that is always zero: used
 * for zero-mapped memory areas etc..

@@ -27,7 +27,7 @@
 *
 * Some code sections either automatically switch back to PSR.I or explicitly
 * require to not use priority masking. If bit GIC_PRIO_PSR_I_SET is included
 * in the the priority mask, it indicates that PSR.I should be set and
 * in the priority mask, it indicates that PSR.I should be set and
 * interrupt disabling temporarily does not rely on IRQ priorities.
 */
#define GIC_PRIO_IRQON	0xe0
@@ -256,4 +256,13 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
	return (boundary - 1 < end - 1) ? boundary : end;
}

/*
 * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and
 * the architectural page-table level.
 */
#define S2_NO_LEVEL_HINT	0
#define S2_PUD_LEVEL		1
#define S2_PMD_LEVEL		2
#define S2_PTE_LEVEL		3

#endif /* __ARM64_S2_PGTABLE_H_ */
@@ -421,9 +421,9 @@
 */

#define SYS_AMEVCNTR0_EL0(n)	SYS_AM_EL0(4 + ((n) >> 3), (n) & 7)
#define SYS_AMEVTYPE0_EL0(n)	SYS_AM_EL0(6 + ((n) >> 3), (n) & 7)
#define SYS_AMEVTYPER0_EL0(n)	SYS_AM_EL0(6 + ((n) >> 3), (n) & 7)
#define SYS_AMEVCNTR1_EL0(n)	SYS_AM_EL0(12 + ((n) >> 3), (n) & 7)
#define SYS_AMEVTYPE1_EL0(n)	SYS_AM_EL0(14 + ((n) >> 3), (n) & 7)
#define SYS_AMEVTYPER1_EL0(n)	SYS_AM_EL0(14 + ((n) >> 3), (n) & 7)

/* AMU v1: Fixed (architecturally defined) activity monitors */
#define SYS_AMEVCNTR0_CORE_EL0	SYS_AMEVCNTR0_EL0(0)
@@ -617,6 +617,9 @@
#define ID_AA64ISAR0_SHA1_SHIFT		8
#define ID_AA64ISAR0_AES_SHIFT		4

#define ID_AA64ISAR0_TLB_RANGE_NI	0x0
#define ID_AA64ISAR0_TLB_RANGE		0x2

/* id_aa64isar1 */
#define ID_AA64ISAR1_I8MM_SHIFT		52
#define ID_AA64ISAR1_DGH_SHIFT		48
@@ -706,6 +709,9 @@
#define ID_AA64ZFR0_SVEVER_SVE2		0x1

/* id_aa64mmfr0 */
#define ID_AA64MMFR0_ECV_SHIFT		60
#define ID_AA64MMFR0_FGT_SHIFT		56
#define ID_AA64MMFR0_EXS_SHIFT		44
#define ID_AA64MMFR0_TGRAN4_2_SHIFT	40
#define ID_AA64MMFR0_TGRAN64_2_SHIFT	36
#define ID_AA64MMFR0_TGRAN16_2_SHIFT	32
@@ -734,6 +740,10 @@
#endif

/* id_aa64mmfr1 */
#define ID_AA64MMFR1_ETS_SHIFT		36
#define ID_AA64MMFR1_TWED_SHIFT		32
#define ID_AA64MMFR1_XNX_SHIFT		28
#define ID_AA64MMFR1_SPECSEI_SHIFT	24
#define ID_AA64MMFR1_PAN_SHIFT		20
#define ID_AA64MMFR1_LOR_SHIFT		16
#define ID_AA64MMFR1_HPD_SHIFT		12
@@ -746,8 +756,15 @@

/* id_aa64mmfr2 */
#define ID_AA64MMFR2_E0PD_SHIFT		60
#define ID_AA64MMFR2_EVT_SHIFT		56
#define ID_AA64MMFR2_BBM_SHIFT		52
#define ID_AA64MMFR2_TTL_SHIFT		48
#define ID_AA64MMFR2_FWB_SHIFT		40
#define ID_AA64MMFR2_IDS_SHIFT		36
#define ID_AA64MMFR2_AT_SHIFT		32
#define ID_AA64MMFR2_ST_SHIFT		28
#define ID_AA64MMFR2_NV_SHIFT		24
#define ID_AA64MMFR2_CCIDX_SHIFT	20
#define ID_AA64MMFR2_LVA_SHIFT		16
#define ID_AA64MMFR2_IESB_SHIFT		12
#define ID_AA64MMFR2_LSM_SHIFT		8
@@ -755,6 +772,7 @@
#define ID_AA64MMFR2_CNP_SHIFT		0

/* id_aa64dfr0 */
#define ID_AA64DFR0_DOUBLELOCK_SHIFT	36
#define ID_AA64DFR0_PMSVER_SHIFT	32
#define ID_AA64DFR0_CTX_CMPS_SHIFT	28
#define ID_AA64DFR0_WRPS_SHIFT		20
@@ -807,18 +825,40 @@
#define ID_ISAR6_DP_SHIFT		4
#define ID_ISAR6_JSCVT_SHIFT		0

#define ID_MMFR0_INNERSHR_SHIFT		28
#define ID_MMFR0_FCSE_SHIFT		24
#define ID_MMFR0_AUXREG_SHIFT		20
#define ID_MMFR0_TCM_SHIFT		16
#define ID_MMFR0_SHARELVL_SHIFT		12
#define ID_MMFR0_OUTERSHR_SHIFT		8
#define ID_MMFR0_PMSA_SHIFT		4
#define ID_MMFR0_VMSA_SHIFT		0

#define ID_MMFR4_EVT_SHIFT		28
#define ID_MMFR4_CCIDX_SHIFT		24
#define ID_MMFR4_LSM_SHIFT		20
#define ID_MMFR4_HPDS_SHIFT		16
#define ID_MMFR4_CNP_SHIFT		12
#define ID_MMFR4_XNX_SHIFT		8
#define ID_MMFR4_AC2_SHIFT		4
#define ID_MMFR4_SPECSEI_SHIFT		0

#define ID_MMFR5_ETS_SHIFT		0

#define ID_PFR0_DIT_SHIFT		24
#define ID_PFR0_CSV2_SHIFT		16
#define ID_PFR0_STATE3_SHIFT		12
#define ID_PFR0_STATE2_SHIFT		8
#define ID_PFR0_STATE1_SHIFT		4
#define ID_PFR0_STATE0_SHIFT		0

#define ID_DFR0_PERFMON_SHIFT		24
#define ID_DFR0_MPROFDBG_SHIFT		20
#define ID_DFR0_MMAPTRC_SHIFT		16
#define ID_DFR0_COPTRC_SHIFT		12
#define ID_DFR0_MMAPDBG_SHIFT		8
#define ID_DFR0_COPSDBG_SHIFT		4
#define ID_DFR0_COPDBG_SHIFT		0

#define ID_PFR2_SSBS_SHIFT		4
#define ID_PFR2_CSV3_SHIFT		0
@@ -861,6 +901,11 @@
#define ID_AA64MMFR0_TGRAN_SUPPORTED	ID_AA64MMFR0_TGRAN64_SUPPORTED
#endif

#define MVFR2_FPMISC_SHIFT		4
#define MVFR2_SIMDMISC_SHIFT		0

#define DCZID_DZP_SHIFT			4
#define DCZID_BS_SHIFT			0

/*
 * The ZCR_ELx_LEN_* definitions intentionally include bits [8:4] which
@@ -21,11 +21,37 @@ static void tlb_flush(struct mmu_gather *tlb);

#include <asm-generic/tlb.h>

/*
 * get the tlbi levels in arm64. Default value is 0 if more than one
 * of cleared_* is set or neither is set.
 * Arm64 doesn't support p4ds now.
 */
static inline int tlb_get_level(struct mmu_gather *tlb)
{
	if (tlb->cleared_ptes && !(tlb->cleared_pmds ||
				   tlb->cleared_puds ||
				   tlb->cleared_p4ds))
		return 3;

	if (tlb->cleared_pmds && !(tlb->cleared_ptes ||
				   tlb->cleared_puds ||
				   tlb->cleared_p4ds))
		return 2;

	if (tlb->cleared_puds && !(tlb->cleared_ptes ||
				   tlb->cleared_pmds ||
				   tlb->cleared_p4ds))
		return 1;

	return 0;
}

static inline void tlb_flush(struct mmu_gather *tlb)
{
	struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0);
	bool last_level = !tlb->freed_tables;
	unsigned long stride = tlb_get_unmap_size(tlb);
	int tlb_level = tlb_get_level(tlb);

	/*
	 * If we're tearing down the address space then we only care about
@@ -38,7 +64,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
		return;
	}

	__flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level);
	__flush_tlb_range(&vma, tlb->start, tlb->end, stride,
			  last_level, tlb_level);
}

static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
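tlb_get_level() only returns a translation-level hint when exactly one kind of page-table level was cleared; anything mixed (or nothing at all) falls back to 0, meaning "no hint". A small user-space sketch of the same decision table; the struct and function names below mirror the hunk but the harness itself is illustrative:

#include <stdio.h>
#include <stdbool.h>

/* Minimal stand-in for the cleared_* bookkeeping in struct mmu_gather. */
struct mock_gather {
	bool cleared_ptes, cleared_pmds, cleared_puds, cleared_p4ds;
};

/* Same decision table as tlb_get_level() above. */
static int mock_tlb_get_level(const struct mock_gather *tlb)
{
	if (tlb->cleared_ptes && !(tlb->cleared_pmds || tlb->cleared_puds || tlb->cleared_p4ds))
		return 3;
	if (tlb->cleared_pmds && !(tlb->cleared_ptes || tlb->cleared_puds || tlb->cleared_p4ds))
		return 2;
	if (tlb->cleared_puds && !(tlb->cleared_ptes || tlb->cleared_pmds || tlb->cleared_p4ds))
		return 1;
	return 0;	/* mixed or unknown: no TTL hint */
}

int main(void)
{
	struct mock_gather only_ptes = { .cleared_ptes = true };
	struct mock_gather mixed = { .cleared_ptes = true, .cleared_pmds = true };

	printf("only PTEs -> level %d\n", mock_tlb_get_level(&only_ptes));	/* 3 */
	printf("PTEs+PMDs -> level %d\n", mock_tlb_get_level(&mixed));		/* 0 */
	return 0;
}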
@@ -10,6 +10,7 @@

#ifndef __ASSEMBLY__

#include <linux/bitfield.h>
#include <linux/mm_types.h>
#include <linux/sched.h>
#include <asm/cputype.h>
@@ -59,6 +60,102 @@
	__ta; \
})

/*
 * Get translation granule of the system, which is decided by
 * PAGE_SIZE. Used by TTL.
 * - 4KB	: 1
 * - 16KB	: 2
 * - 64KB	: 3
 */
#define TLBI_TTL_TG_4K		1
#define TLBI_TTL_TG_16K		2
#define TLBI_TTL_TG_64K		3

static inline unsigned long get_trans_granule(void)
{
	switch (PAGE_SIZE) {
	case SZ_4K:
		return TLBI_TTL_TG_4K;
	case SZ_16K:
		return TLBI_TTL_TG_16K;
	case SZ_64K:
		return TLBI_TTL_TG_64K;
	default:
		return 0;
	}
}

/*
 * Level-based TLBI operations.
 *
 * When ARMv8.4-TTL exists, TLBI operations take an additional hint for
 * the level at which the invalidation must take place. If the level is
 * wrong, no invalidation may take place. In the case where the level
 * cannot be easily determined, a 0 value for the level parameter will
 * perform a non-hinted invalidation.
 *
 * For Stage-2 invalidation, use the level values provided to that effect
 * in asm/stage2_pgtable.h.
 */
#define TLBI_TTL_MASK		GENMASK_ULL(47, 44)

#define __tlbi_level(op, addr, level) do { \
	u64 arg = addr; \
 \
	if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) && \
	    level) { \
		u64 ttl = level & 3; \
		ttl |= get_trans_granule() << 2; \
		arg &= ~TLBI_TTL_MASK; \
		arg |= FIELD_PREP(TLBI_TTL_MASK, ttl); \
	} \
 \
	__tlbi(op, arg); \
} while(0)

#define __tlbi_user_level(op, arg, level) do { \
	if (arm64_kernel_unmapped_at_el0()) \
		__tlbi_level(op, (arg | USER_ASID_FLAG), level); \
} while (0)

/*
 * This macro creates a properly formatted VA operand for the TLB RANGE.
 * The value bit assignments are:
 *
 * +----------+------+-------+-------+-------+----------------------+
 * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
 * +-----------------+-------+-------+-------+----------------------+
 * |63      48|47  46|45   44|43   39|38   37|36                   0|
 *
 * The address range is determined by below formula:
 * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)
 *
 */
#define __TLBI_VADDR_RANGE(addr, asid, scale, num, ttl) \
({ \
	unsigned long __ta = (addr) >> PAGE_SHIFT; \
	__ta &= GENMASK_ULL(36, 0); \
	__ta |= (unsigned long)(ttl) << 37; \
	__ta |= (unsigned long)(num) << 39; \
	__ta |= (unsigned long)(scale) << 44; \
	__ta |= get_trans_granule() << 46; \
	__ta |= (unsigned long)(asid) << 48; \
	__ta; \
})

/* These macros are used by the TLBI RANGE feature. */
#define __TLBI_RANGE_PAGES(num, scale) \
	((unsigned long)((num) + 1) << (5 * (scale) + 1))
#define MAX_TLBI_RANGE_PAGES	__TLBI_RANGE_PAGES(31, 3)

/*
 * Generate 'num' values from -1 to 30 with -1 rejected by the
 * __flush_tlb_range() loop below.
 */
#define TLBI_RANGE_MASK		GENMASK_ULL(4, 0)
#define __TLBI_RANGE_NUM(pages, scale) \
	((((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1)

/*
 * TLB Invalidation
 * ================
@@ -179,34 +276,83 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,

static inline void __flush_tlb_range(struct vm_area_struct *vma,
				     unsigned long start, unsigned long end,
				     unsigned long stride, bool last_level)
				     unsigned long stride, bool last_level,
				     int tlb_level)
{
	int num = 0;
	int scale = 0;
	unsigned long asid = ASID(vma->vm_mm);
	unsigned long addr;
	unsigned long pages;

	start = round_down(start, stride);
	end = round_up(end, stride);
	pages = (end - start) >> PAGE_SHIFT;

	if ((end - start) >= (MAX_TLBI_OPS * stride)) {
	/*
	 * When not using TLB range ops, we can handle up to
	 * (MAX_TLBI_OPS - 1) pages;
	 * When using TLB range ops, we can handle up to
	 * (MAX_TLBI_RANGE_PAGES - 1) pages.
	 */
	if ((!system_supports_tlb_range() &&
	     (end - start) >= (MAX_TLBI_OPS * stride)) ||
	    pages >= MAX_TLBI_RANGE_PAGES) {
		flush_tlb_mm(vma->vm_mm);
		return;
	}

	/* Convert the stride into units of 4k */
	stride >>= 12;

	start = __TLBI_VADDR(start, asid);
	end = __TLBI_VADDR(end, asid);

	dsb(ishst);
	for (addr = start; addr < end; addr += stride) {
		if (last_level) {
			__tlbi(vale1is, addr);
			__tlbi_user(vale1is, addr);
		} else {
			__tlbi(vae1is, addr);
			__tlbi_user(vae1is, addr);

	/*
	 * When the CPU does not support TLB range operations, flush the TLB
	 * entries one by one at the granularity of 'stride'. If the TLB
	 * range ops are supported, then:
	 *
	 * 1. If 'pages' is odd, flush the first page through non-range
	 *    operations;
	 *
	 * 2. For remaining pages: the minimum range granularity is decided
	 *    by 'scale', so multiple range TLBI operations may be required.
	 *    Start from scale = 0, flush the corresponding number of pages
	 *    ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it
	 *    until no pages left.
	 *
	 * Note that certain ranges can be represented by either num = 31 and
	 * scale or num = 0 and scale + 1. The loop below favours the latter
	 * since num is limited to 30 by the __TLBI_RANGE_NUM() macro.
	 */
	while (pages > 0) {
		if (!system_supports_tlb_range() ||
		    pages % 2 == 1) {
			addr = __TLBI_VADDR(start, asid);
			if (last_level) {
				__tlbi_level(vale1is, addr, tlb_level);
				__tlbi_user_level(vale1is, addr, tlb_level);
			} else {
				__tlbi_level(vae1is, addr, tlb_level);
				__tlbi_user_level(vae1is, addr, tlb_level);
			}
			start += stride;
			pages -= stride >> PAGE_SHIFT;
			continue;
		}

		num = __TLBI_RANGE_NUM(pages, scale);
		if (num >= 0) {
			addr = __TLBI_VADDR_RANGE(start, asid, scale,
						  num, tlb_level);
			if (last_level) {
				__tlbi(rvale1is, addr);
				__tlbi_user(rvale1is, addr);
			} else {
				__tlbi(rvae1is, addr);
				__tlbi_user(rvae1is, addr);
			}
			start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
			pages -= __TLBI_RANGE_PAGES(num, scale);
		}
		scale++;
	}
	dsb(ish);
}
@@ -217,8 +363,9 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
	/*
	 * We cannot use leaf-only invalidation here, since we may be invalidating
	 * table entries as part of collapsing hugepages or moving page tables.
	 * Set the tlb_level to 0 because we can not get enough information here.
	 */
	__flush_tlb_range(vma, start, end, PAGE_SIZE, false);
	__flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0);
}

static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
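The loop above splits an arbitrary page count into at most one per-page TLBI (to make the count even) plus a series of range TLBIs whose coverage grows with 'scale'. A user-space rerun of that arithmetic: the range macros are copied from the hunk (with an explicit cast added so the -1 sentinel is well defined in plain C), while the driver loop is a simplified sketch that assumes range ops are supported, stride == PAGE_SIZE, and only tracks the page count.

#include <stdio.h>

#define TLBI_RANGE_MASK			0x1fUL
#define __TLBI_RANGE_NUM(pages, scale)	\
	((long)(((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1)
#define __TLBI_RANGE_PAGES(num, scale)	\
	((unsigned long)((num) + 1) << (5 * (scale) + 1))

int main(void)
{
	unsigned long pages = 513;	/* e.g. 2 MiB + 4 KiB with 4K pages */
	int scale = 0;

	while (pages > 0) {
		if (pages % 2 == 1) {
			/* odd count: one non-range TLBI, as in step 1 above */
			printf("single-page TLBI\n");
			pages -= 1;
			continue;
		}
		long num = __TLBI_RANGE_NUM(pages, scale);
		if (num >= 0) {
			printf("range TLBI: scale=%d num=%ld -> %lu pages\n",
			       scale, num, __TLBI_RANGE_PAGES(num, scale));
			pages -= __TLBI_RANGE_PAGES(num, scale);
		}
		scale++;	/* try the next, larger granularity */
	}
	return 0;
}

For 513 pages this prints one single-page invalidation followed by a single range invalidation with scale=1, num=7 covering the remaining 512 pages, matching the "(num+1)*2^(5*scale+1)" formula in the comment.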
@@ -19,6 +19,7 @@
#include <linux/string.h>

#include <asm/cpufeature.h>
#include <asm/mmu.h>
#include <asm/ptrace.h>
#include <asm/memory.h>
#include <asm/extable.h>

@@ -12,6 +12,8 @@
 */
#define VDSO_LBASE	0x0

#define __VVAR_PAGES	2

#ifndef __ASSEMBLY__

#include <generated/vdso-offsets.h>
Some files were not shown because too many files have changed in this diff.